Diffstat (limited to 'kernel')
104 files changed, 5964 insertions, 2670 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b3353a3c..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ | |||
| 9 | extable.o params.o \ | 9 | extable.o params.o \ | 
| 10 | kthread.o sys_ni.o nsproxy.o \ | 10 | kthread.o sys_ni.o nsproxy.o \ | 
| 11 | notifier.o ksysfs.o cred.o reboot.o \ | 11 | notifier.o ksysfs.o cred.o reboot.o \ | 
| 12 | async.o range.o groups.o smpboot.o | 12 | async.o range.o smpboot.o | 
| 13 | |||
| 14 | obj-$(CONFIG_MULTIUSER) += groups.o | ||
| 13 | 15 | ||
| 14 | ifdef CONFIG_FUNCTION_TRACER | 16 | ifdef CONFIG_FUNCTION_TRACER | 
| 15 | # Do not trace debug files and internal ftrace files | 17 | # Do not trace debug files and internal ftrace files | 
diff --git a/kernel/acct.c b/kernel/acct.c
index e6c10d1a4058..74963d192c5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -213,7 +213,7 @@ static int acct_on(struct filename *pathname) | |||
| 213 | return -EACCES; | 213 | return -EACCES; | 
| 214 | } | 214 | } | 
| 215 | 215 | ||
| 216 | if (!file->f_op->write) { | 216 | if (!(file->f_mode & FMODE_CAN_WRITE)) { | 
| 217 | kfree(acct); | 217 | kfree(acct); | 
| 218 | filp_close(file, NULL); | 218 | filp_close(file, NULL); | 
| 219 | return -EIO; | 219 | return -EIO; | 
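The acct_on() change above swaps a method-pointer test for a mode-flag test: a file may implement writing through either ->write or ->write_iter, and FMODE_CAN_WRITE is set by the VFS at open time when either is usable. A minimal sketch of the resulting check (illustrative helper name, not kernel code as-is):

        /* Accept a file for kernel-side accounting writes only when the VFS
         * marked it writable at open time; FMODE_CAN_WRITE covers both
         * ->write and ->write_iter, which the old f_op->write test did not.
         */
        static int check_acct_file(struct file *file)
        {
                if (!(file->f_mode & FMODE_CAN_WRITE))
                        return -EIO;
                return 0;
        }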
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a5ae60f0b0a2..e6983be12bd3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
| @@ -1,5 +1,2 @@ | |||
| 1 | obj-y := core.o | 1 | obj-y := core.o | 
| 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | 2 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o | 
| 3 | ifdef CONFIG_TEST_BPF | ||
| 4 | obj-$(CONFIG_BPF_SYSCALL) += test_stub.o | ||
| 5 | endif | ||
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 9eb4d8a7cd87..8a6616583f38 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
| @@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) | |||
| 134 | kvfree(array); | 134 | kvfree(array); | 
| 135 | } | 135 | } | 
| 136 | 136 | ||
| 137 | static struct bpf_map_ops array_ops = { | 137 | static const struct bpf_map_ops array_ops = { | 
| 138 | .map_alloc = array_map_alloc, | 138 | .map_alloc = array_map_alloc, | 
| 139 | .map_free = array_map_free, | 139 | .map_free = array_map_free, | 
| 140 | .map_get_next_key = array_map_get_next_key, | 140 | .map_get_next_key = array_map_get_next_key, | 
| @@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { | |||
| 143 | .map_delete_elem = array_map_delete_elem, | 143 | .map_delete_elem = array_map_delete_elem, | 
| 144 | }; | 144 | }; | 
| 145 | 145 | ||
| 146 | static struct bpf_map_type_list tl = { | 146 | static struct bpf_map_type_list array_type __read_mostly = { | 
| 147 | .ops = &array_ops, | 147 | .ops = &array_ops, | 
| 148 | .type = BPF_MAP_TYPE_ARRAY, | 148 | .type = BPF_MAP_TYPE_ARRAY, | 
| 149 | }; | 149 | }; | 
| 150 | 150 | ||
| 151 | static int __init register_array_map(void) | 151 | static int __init register_array_map(void) | 
| 152 | { | 152 | { | 
| 153 | bpf_register_map_type(&tl); | 153 | bpf_register_map_type(&array_type); | 
| 154 | return 0; | 154 | return 0; | 
| 155 | } | 155 | } | 
| 156 | late_initcall(register_array_map); | 156 | late_initcall(register_array_map); | 
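The arraymap.c hunks (and the matching hashtab.c ones below) only constify the ops table and give the registration object a unique name; the registration flow itself is unchanged. A hedged sketch of that pattern for a hypothetical additional map type, assuming it lived alongside the code above (the demo_* names and the reuse of array_ops are purely illustrative):

        /* Each map implementation supplies a const ops table and a uniquely
         * named bpf_map_type_list hooked in at late_initcall time.
         */
        static struct bpf_map_type_list demo_array_type __read_mostly = {
                .ops    = &array_ops,          /* const ops table from above */
                .type   = BPF_MAP_TYPE_ARRAY,  /* a new BPF_MAP_TYPE_* id in practice */
        };

        static int __init register_demo_array_map(void)
        {
                bpf_register_map_type(&demo_array_type);
                return 0;
        }
        late_initcall(register_demo_array_map);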
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index a64e7a207d2b..4139a0f8b558 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
| @@ -656,6 +656,14 @@ void bpf_prog_free(struct bpf_prog *fp) | |||
| 656 | } | 656 | } | 
| 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 657 | EXPORT_SYMBOL_GPL(bpf_prog_free); | 
| 658 | 658 | ||
| 659 | /* Weak definitions of helper functions in case we don't have bpf syscall. */ | ||
| 660 | const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; | ||
| 661 | const struct bpf_func_proto bpf_map_update_elem_proto __weak; | ||
| 662 | const struct bpf_func_proto bpf_map_delete_elem_proto __weak; | ||
| 663 | |||
| 664 | const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; | ||
| 665 | const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; | ||
| 666 | |||
| 659 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | 667 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | 
| 660 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | 668 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | 
| 661 | */ | 669 | */ | 
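The new __weak objects keep unconditionally built code linkable when CONFIG_BPF_SYSCALL=n, since helpers.c, which provides the strong definitions, is only compiled with the syscall enabled. A tiny illustration of the idiom with a made-up symbol name:

        /* Zero-initialized weak fallback: used only when no strong definition
         * of the same symbol is linked into the image.
         */
        const struct bpf_func_proto demo_helper_proto __weak;

        /* elsewhere, when the real implementation is built in, the strong
         * definition wins at link time:
         *   const struct bpf_func_proto demo_helper_proto = { .func = demo_helper };
         */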
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index b3ba43674310..83c209d9b17a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
| @@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) | |||
| 345 | kfree(htab); | 345 | kfree(htab); | 
| 346 | } | 346 | } | 
| 347 | 347 | ||
| 348 | static struct bpf_map_ops htab_ops = { | 348 | static const struct bpf_map_ops htab_ops = { | 
| 349 | .map_alloc = htab_map_alloc, | 349 | .map_alloc = htab_map_alloc, | 
| 350 | .map_free = htab_map_free, | 350 | .map_free = htab_map_free, | 
| 351 | .map_get_next_key = htab_map_get_next_key, | 351 | .map_get_next_key = htab_map_get_next_key, | 
| @@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { | |||
| 354 | .map_delete_elem = htab_map_delete_elem, | 354 | .map_delete_elem = htab_map_delete_elem, | 
| 355 | }; | 355 | }; | 
| 356 | 356 | ||
| 357 | static struct bpf_map_type_list tl = { | 357 | static struct bpf_map_type_list htab_type __read_mostly = { | 
| 358 | .ops = &htab_ops, | 358 | .ops = &htab_ops, | 
| 359 | .type = BPF_MAP_TYPE_HASH, | 359 | .type = BPF_MAP_TYPE_HASH, | 
| 360 | }; | 360 | }; | 
| 361 | 361 | ||
| 362 | static int __init register_htab_map(void) | 362 | static int __init register_htab_map(void) | 
| 363 | { | 363 | { | 
| 364 | bpf_register_map_type(&tl); | 364 | bpf_register_map_type(&htab_type); | 
| 365 | return 0; | 365 | return 0; | 
| 366 | } | 366 | } | 
| 367 | late_initcall(register_htab_map); | 367 | late_initcall(register_htab_map); | 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 9e3414d85459..bd7f5988ed9c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
| @@ -11,6 +11,8 @@ | |||
| 11 | */ | 11 | */ | 
| 12 | #include <linux/bpf.h> | 12 | #include <linux/bpf.h> | 
| 13 | #include <linux/rcupdate.h> | 13 | #include <linux/rcupdate.h> | 
| 14 | #include <linux/random.h> | ||
| 15 | #include <linux/smp.h> | ||
| 14 | 16 | ||
| 15 | /* If kernel subsystem is allowing eBPF programs to call this function, | 17 | /* If kernel subsystem is allowing eBPF programs to call this function, | 
| 16 | * inside its own verifier_ops->get_func_proto() callback it should return | 18 | * inside its own verifier_ops->get_func_proto() callback it should return | 
| @@ -41,7 +43,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 41 | return (unsigned long) value; | 43 | return (unsigned long) value; | 
| 42 | } | 44 | } | 
| 43 | 45 | ||
| 44 | struct bpf_func_proto bpf_map_lookup_elem_proto = { | 46 | const struct bpf_func_proto bpf_map_lookup_elem_proto = { | 
| 45 | .func = bpf_map_lookup_elem, | 47 | .func = bpf_map_lookup_elem, | 
| 46 | .gpl_only = false, | 48 | .gpl_only = false, | 
| 47 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 49 | .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, | 
| @@ -60,7 +62,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 60 | return map->ops->map_update_elem(map, key, value, r4); | 62 | return map->ops->map_update_elem(map, key, value, r4); | 
| 61 | } | 63 | } | 
| 62 | 64 | ||
| 63 | struct bpf_func_proto bpf_map_update_elem_proto = { | 65 | const struct bpf_func_proto bpf_map_update_elem_proto = { | 
| 64 | .func = bpf_map_update_elem, | 66 | .func = bpf_map_update_elem, | 
| 65 | .gpl_only = false, | 67 | .gpl_only = false, | 
| 66 | .ret_type = RET_INTEGER, | 68 | .ret_type = RET_INTEGER, | 
| @@ -80,10 +82,32 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | |||
| 80 | return map->ops->map_delete_elem(map, key); | 82 | return map->ops->map_delete_elem(map, key); | 
| 81 | } | 83 | } | 
| 82 | 84 | ||
| 83 | struct bpf_func_proto bpf_map_delete_elem_proto = { | 85 | const struct bpf_func_proto bpf_map_delete_elem_proto = { | 
| 84 | .func = bpf_map_delete_elem, | 86 | .func = bpf_map_delete_elem, | 
| 85 | .gpl_only = false, | 87 | .gpl_only = false, | 
| 86 | .ret_type = RET_INTEGER, | 88 | .ret_type = RET_INTEGER, | 
| 87 | .arg1_type = ARG_CONST_MAP_PTR, | 89 | .arg1_type = ARG_CONST_MAP_PTR, | 
| 88 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 90 | .arg2_type = ARG_PTR_TO_MAP_KEY, | 
| 89 | }; | 91 | }; | 
| 92 | |||
| 93 | static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 94 | { | ||
| 95 | return prandom_u32(); | ||
| 96 | } | ||
| 97 | |||
| 98 | const struct bpf_func_proto bpf_get_prandom_u32_proto = { | ||
| 99 | .func = bpf_get_prandom_u32, | ||
| 100 | .gpl_only = false, | ||
| 101 | .ret_type = RET_INTEGER, | ||
| 102 | }; | ||
| 103 | |||
| 104 | static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 105 | { | ||
| 106 | return raw_smp_processor_id(); | ||
| 107 | } | ||
| 108 | |||
| 109 | const struct bpf_func_proto bpf_get_smp_processor_id_proto = { | ||
| 110 | .func = bpf_get_smp_processor_id, | ||
| 111 | .gpl_only = false, | ||
| 112 | .ret_type = RET_INTEGER, | ||
| 113 | }; | ||
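The two new helpers are thin wrappers around prandom_u32() and raw_smp_processor_id(); programs reach them through their uapi BPF_FUNC_* ids. A hedged sketch of program-side usage following the samples/bpf function-pointer idiom of the time (SEC() is the usual section macro from the samples; demo_prog and the local binding names are illustrative):

        static unsigned long long (*get_prandom_u32)(void) =
                (void *) BPF_FUNC_get_prandom_u32;
        static unsigned long long (*get_smp_processor_id)(void) =
                (void *) BPF_FUNC_get_smp_processor_id;

        SEC("socket")
        int demo_prog(struct __sk_buff *skb)
        {
                /* randomly drop roughly half of the packets, tag the rest by CPU */
                if (get_prandom_u32() & 1)
                        return 0;
                return get_smp_processor_id() + 1;
        }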
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 536edc2be307..3bae6c591914 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/file.h> | 16 | #include <linux/file.h> | 
| 17 | #include <linux/license.h> | 17 | #include <linux/license.h> | 
| 18 | #include <linux/filter.h> | 18 | #include <linux/filter.h> | 
| 19 | #include <linux/version.h> | ||
| 19 | 20 | ||
| 20 | static LIST_HEAD(bpf_map_types); | 21 | static LIST_HEAD(bpf_map_types); | 
| 21 | 22 | ||
| @@ -354,10 +355,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) | |||
| 354 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | 355 | list_for_each_entry(tl, &bpf_prog_types, list_node) { | 
| 355 | if (tl->type == type) { | 356 | if (tl->type == type) { | 
| 356 | prog->aux->ops = tl->ops; | 357 | prog->aux->ops = tl->ops; | 
| 357 | prog->aux->prog_type = type; | 358 | prog->type = type; | 
| 358 | return 0; | 359 | return 0; | 
| 359 | } | 360 | } | 
| 360 | } | 361 | } | 
| 362 | |||
| 361 | return -EINVAL; | 363 | return -EINVAL; | 
| 362 | } | 364 | } | 
| 363 | 365 | ||
| @@ -418,6 +420,7 @@ void bpf_prog_put(struct bpf_prog *prog) | |||
| 418 | bpf_prog_free(prog); | 420 | bpf_prog_free(prog); | 
| 419 | } | 421 | } | 
| 420 | } | 422 | } | 
| 423 | EXPORT_SYMBOL_GPL(bpf_prog_put); | ||
| 421 | 424 | ||
| 422 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 425 | static int bpf_prog_release(struct inode *inode, struct file *filp) | 
| 423 | { | 426 | { | 
| @@ -465,9 +468,10 @@ struct bpf_prog *bpf_prog_get(u32 ufd) | |||
| 465 | fdput(f); | 468 | fdput(f); | 
| 466 | return prog; | 469 | return prog; | 
| 467 | } | 470 | } | 
| 471 | EXPORT_SYMBOL_GPL(bpf_prog_get); | ||
| 468 | 472 | ||
| 469 | /* last field in 'union bpf_attr' used by this command */ | 473 | /* last field in 'union bpf_attr' used by this command */ | 
| 470 | #define BPF_PROG_LOAD_LAST_FIELD log_buf | 474 | #define BPF_PROG_LOAD_LAST_FIELD kern_version | 
| 471 | 475 | ||
| 472 | static int bpf_prog_load(union bpf_attr *attr) | 476 | static int bpf_prog_load(union bpf_attr *attr) | 
| 473 | { | 477 | { | 
| @@ -492,6 +496,10 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 492 | if (attr->insn_cnt >= BPF_MAXINSNS) | 496 | if (attr->insn_cnt >= BPF_MAXINSNS) | 
| 493 | return -EINVAL; | 497 | return -EINVAL; | 
| 494 | 498 | ||
| 499 | if (type == BPF_PROG_TYPE_KPROBE && | ||
| 500 | attr->kern_version != LINUX_VERSION_CODE) | ||
| 501 | return -EINVAL; | ||
| 502 | |||
| 495 | /* plain bpf_prog allocation */ | 503 | /* plain bpf_prog allocation */ | 
| 496 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 504 | prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); | 
| 497 | if (!prog) | 505 | if (!prog) | 
| @@ -508,7 +516,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 508 | prog->jited = false; | 516 | prog->jited = false; | 
| 509 | 517 | ||
| 510 | atomic_set(&prog->aux->refcnt, 1); | 518 | atomic_set(&prog->aux->refcnt, 1); | 
| 511 | prog->aux->is_gpl_compatible = is_gpl; | 519 | prog->gpl_compatible = is_gpl; | 
| 512 | 520 | ||
| 513 | /* find program type: socket_filter vs tracing_filter */ | 521 | /* find program type: socket_filter vs tracing_filter */ | 
| 514 | err = find_prog_type(type, prog); | 522 | err = find_prog_type(type, prog); | 
| @@ -516,8 +524,7 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 516 | goto free_prog; | 524 | goto free_prog; | 
| 517 | 525 | ||
| 518 | /* run eBPF verifier */ | 526 | /* run eBPF verifier */ | 
| 519 | err = bpf_check(prog, attr); | 527 | err = bpf_check(&prog, attr); | 
| 520 | |||
| 521 | if (err < 0) | 528 | if (err < 0) | 
| 522 | goto free_used_maps; | 529 | goto free_used_maps; | 
| 523 | 530 | ||
| @@ -528,7 +535,6 @@ static int bpf_prog_load(union bpf_attr *attr) | |||
| 528 | bpf_prog_select_runtime(prog); | 535 | bpf_prog_select_runtime(prog); | 
| 529 | 536 | ||
| 530 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 537 | err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); | 
| 531 | |||
| 532 | if (err < 0) | 538 | if (err < 0) | 
| 533 | /* failed to allocate fd */ | 539 | /* failed to allocate fd */ | 
| 534 | goto free_used_maps; | 540 | goto free_used_maps; | 
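With kern_version now the last BPF_PROG_LOAD field, a kprobe program must declare which kernel it was built against, and the kernel rejects a mismatch with -EINVAL. A user-space sketch of the loading side, with field names as in the uapi of this series and error handling omitted (needs <linux/bpf.h>, <linux/version.h>, <sys/syscall.h>, <unistd.h>, <string.h>):

        static int load_kprobe_prog(const struct bpf_insn *insns, unsigned int cnt)
        {
                union bpf_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.prog_type    = BPF_PROG_TYPE_KPROBE;
                attr.insns        = (__u64)(unsigned long)insns;
                attr.insn_cnt     = cnt;
                attr.license      = (__u64)(unsigned long)"GPL";
                attr.kern_version = LINUX_VERSION_CODE; /* must match the running kernel */

                return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        }

Because LINUX_VERSION_CODE is baked in at build time, out-of-tree loaders typically read the running kernel's version at run time instead of relying on the build host's headers.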
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
deleted file mode 100644
index 0ceae1e6e8b5..000000000000
--- a/kernel/bpf/test_stub.c
+++ /dev/null
| @@ -1,78 +0,0 @@ | |||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | ||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/types.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/err.h> | ||
| 11 | #include <linux/bpf.h> | ||
| 12 | |||
| 13 | /* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC | ||
| 14 | * to be used by user space verifier testsuite | ||
| 15 | */ | ||
| 16 | struct bpf_context { | ||
| 17 | u64 arg1; | ||
| 18 | u64 arg2; | ||
| 19 | }; | ||
| 20 | |||
| 21 | static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) | ||
| 22 | { | ||
| 23 | switch (func_id) { | ||
| 24 | case BPF_FUNC_map_lookup_elem: | ||
| 25 | return &bpf_map_lookup_elem_proto; | ||
| 26 | case BPF_FUNC_map_update_elem: | ||
| 27 | return &bpf_map_update_elem_proto; | ||
| 28 | case BPF_FUNC_map_delete_elem: | ||
| 29 | return &bpf_map_delete_elem_proto; | ||
| 30 | default: | ||
| 31 | return NULL; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | |||
| 35 | static const struct bpf_context_access { | ||
| 36 | int size; | ||
| 37 | enum bpf_access_type type; | ||
| 38 | } test_ctx_access[] = { | ||
| 39 | [offsetof(struct bpf_context, arg1)] = { | ||
| 40 | FIELD_SIZEOF(struct bpf_context, arg1), | ||
| 41 | BPF_READ | ||
| 42 | }, | ||
| 43 | [offsetof(struct bpf_context, arg2)] = { | ||
| 44 | FIELD_SIZEOF(struct bpf_context, arg2), | ||
| 45 | BPF_READ | ||
| 46 | }, | ||
| 47 | }; | ||
| 48 | |||
| 49 | static bool test_is_valid_access(int off, int size, enum bpf_access_type type) | ||
| 50 | { | ||
| 51 | const struct bpf_context_access *access; | ||
| 52 | |||
| 53 | if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) | ||
| 54 | return false; | ||
| 55 | |||
| 56 | access = &test_ctx_access[off]; | ||
| 57 | if (access->size == size && (access->type & type)) | ||
| 58 | return true; | ||
| 59 | |||
| 60 | return false; | ||
| 61 | } | ||
| 62 | |||
| 63 | static struct bpf_verifier_ops test_ops = { | ||
| 64 | .get_func_proto = test_func_proto, | ||
| 65 | .is_valid_access = test_is_valid_access, | ||
| 66 | }; | ||
| 67 | |||
| 68 | static struct bpf_prog_type_list tl_prog = { | ||
| 69 | .ops = &test_ops, | ||
| 70 | .type = BPF_PROG_TYPE_UNSPEC, | ||
| 71 | }; | ||
| 72 | |||
| 73 | static int __init register_test_ops(void) | ||
| 74 | { | ||
| 75 | bpf_register_prog_type(&tl_prog); | ||
| 76 | return 0; | ||
| 77 | } | ||
| 78 | late_initcall(register_test_ops); | ||
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a28e09c7825d..630a7bac1e51 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
| @@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 755 | enum bpf_reg_type expected_type; | 755 | enum bpf_reg_type expected_type; | 
| 756 | int err = 0; | 756 | int err = 0; | 
| 757 | 757 | ||
| 758 | if (arg_type == ARG_ANYTHING) | 758 | if (arg_type == ARG_DONTCARE) | 
| 759 | return 0; | 759 | return 0; | 
| 760 | 760 | ||
| 761 | if (reg->type == NOT_INIT) { | 761 | if (reg->type == NOT_INIT) { | 
| @@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 763 | return -EACCES; | 763 | return -EACCES; | 
| 764 | } | 764 | } | 
| 765 | 765 | ||
| 766 | if (arg_type == ARG_ANYTHING) | ||
| 767 | return 0; | ||
| 768 | |||
| 766 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | 769 | if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || | 
| 767 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 770 | arg_type == ARG_PTR_TO_MAP_VALUE) { | 
| 768 | expected_type = PTR_TO_STACK; | 771 | expected_type = PTR_TO_STACK; | 
| @@ -770,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, | |||
| 770 | expected_type = CONST_IMM; | 773 | expected_type = CONST_IMM; | 
| 771 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 774 | } else if (arg_type == ARG_CONST_MAP_PTR) { | 
| 772 | expected_type = CONST_PTR_TO_MAP; | 775 | expected_type = CONST_PTR_TO_MAP; | 
| 776 | } else if (arg_type == ARG_PTR_TO_CTX) { | ||
| 777 | expected_type = PTR_TO_CTX; | ||
| 773 | } else { | 778 | } else { | 
| 774 | verbose("unsupported arg_type %d\n", arg_type); | 779 | verbose("unsupported arg_type %d\n", arg_type); | 
| 775 | return -EFAULT; | 780 | return -EFAULT; | 
| @@ -852,7 +857,7 @@ static int check_call(struct verifier_env *env, int func_id) | |||
| 852 | } | 857 | } | 
| 853 | 858 | ||
| 854 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | 859 | /* eBPF programs must be GPL compatible to use GPL-ed functions */ | 
| 855 | if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { | 860 | if (!env->prog->gpl_compatible && fn->gpl_only) { | 
| 856 | verbose("cannot call GPL only function from proprietary program\n"); | 861 | verbose("cannot call GPL only function from proprietary program\n"); | 
| 857 | return -EINVAL; | 862 | return -EINVAL; | 
| 858 | } | 863 | } | 
| @@ -1172,6 +1177,18 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1172 | return 0; | 1177 | return 0; | 
| 1173 | } | 1178 | } | 
| 1174 | 1179 | ||
| 1180 | static bool may_access_skb(enum bpf_prog_type type) | ||
| 1181 | { | ||
| 1182 | switch (type) { | ||
| 1183 | case BPF_PROG_TYPE_SOCKET_FILTER: | ||
| 1184 | case BPF_PROG_TYPE_SCHED_CLS: | ||
| 1185 | case BPF_PROG_TYPE_SCHED_ACT: | ||
| 1186 | return true; | ||
| 1187 | default: | ||
| 1188 | return false; | ||
| 1189 | } | ||
| 1190 | } | ||
| 1191 | |||
| 1175 | /* verify safety of LD_ABS|LD_IND instructions: | 1192 | /* verify safety of LD_ABS|LD_IND instructions: | 
| 1176 | * - they can only appear in the programs where ctx == skb | 1193 | * - they can only appear in the programs where ctx == skb | 
| 1177 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | 1194 | * - since they are wrappers of function calls, they scratch R1-R5 registers, | 
| @@ -1194,8 +1211,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) | |||
| 1194 | struct reg_state *reg; | 1211 | struct reg_state *reg; | 
| 1195 | int i, err; | 1212 | int i, err; | 
| 1196 | 1213 | ||
| 1197 | if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { | 1214 | if (!may_access_skb(env->prog->type)) { | 
| 1198 | verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); | 1215 | verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); | 
| 1199 | return -EINVAL; | 1216 | return -EINVAL; | 
| 1200 | } | 1217 | } | 
| 1201 | 1218 | ||
| @@ -1606,11 +1623,10 @@ static int do_check(struct verifier_env *env) | |||
| 1606 | return err; | 1623 | return err; | 
| 1607 | 1624 | ||
| 1608 | } else if (class == BPF_LDX) { | 1625 | } else if (class == BPF_LDX) { | 
| 1609 | if (BPF_MODE(insn->code) != BPF_MEM || | 1626 | enum bpf_reg_type src_reg_type; | 
| 1610 | insn->imm != 0) { | 1627 | |
| 1611 | verbose("BPF_LDX uses reserved fields\n"); | 1628 | /* check for reserved fields is already done */ | 
| 1612 | return -EINVAL; | 1629 | |
| 1613 | } | ||
| 1614 | /* check src operand */ | 1630 | /* check src operand */ | 
| 1615 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | 1631 | err = check_reg_arg(regs, insn->src_reg, SRC_OP); | 
| 1616 | if (err) | 1632 | if (err) | 
| @@ -1629,6 +1645,29 @@ static int do_check(struct verifier_env *env) | |||
| 1629 | if (err) | 1645 | if (err) | 
| 1630 | return err; | 1646 | return err; | 
| 1631 | 1647 | ||
| 1648 | src_reg_type = regs[insn->src_reg].type; | ||
| 1649 | |||
| 1650 | if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { | ||
| 1651 | /* saw a valid insn | ||
| 1652 | * dst_reg = *(u32 *)(src_reg + off) | ||
| 1653 | * use reserved 'imm' field to mark this insn | ||
| 1654 | */ | ||
| 1655 | insn->imm = src_reg_type; | ||
| 1656 | |||
| 1657 | } else if (src_reg_type != insn->imm && | ||
| 1658 | (src_reg_type == PTR_TO_CTX || | ||
| 1659 | insn->imm == PTR_TO_CTX)) { | ||
| 1660 | * Abuser program is trying to use the same insn | ||
| 1661 | * dst_reg = *(u32*) (src_reg + off) | ||
| 1662 | * with different pointer types: | ||
| 1663 | * src_reg == ctx in one branch and | ||
| 1664 | * src_reg == stack|map in some other branch. | ||
| 1665 | * Reject it. | ||
| 1666 | */ | ||
| 1667 | verbose("same insn cannot be used with different pointers\n"); | ||
| 1668 | return -EINVAL; | ||
| 1669 | } | ||
| 1670 | |||
| 1632 | } else if (class == BPF_STX) { | 1671 | } else if (class == BPF_STX) { | 
| 1633 | if (BPF_MODE(insn->code) == BPF_XADD) { | 1672 | if (BPF_MODE(insn->code) == BPF_XADD) { | 
| 1634 | err = check_xadd(env, insn); | 1673 | err = check_xadd(env, insn); | 
| @@ -1776,6 +1815,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) | |||
| 1776 | int i, j; | 1815 | int i, j; | 
| 1777 | 1816 | ||
| 1778 | for (i = 0; i < insn_cnt; i++, insn++) { | 1817 | for (i = 0; i < insn_cnt; i++, insn++) { | 
| 1818 | if (BPF_CLASS(insn->code) == BPF_LDX && | ||
| 1819 | (BPF_MODE(insn->code) != BPF_MEM || | ||
| 1820 | insn->imm != 0)) { | ||
| 1821 | verbose("BPF_LDX uses reserved fields\n"); | ||
| 1822 | return -EINVAL; | ||
| 1823 | } | ||
| 1824 | |||
| 1779 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | 1825 | if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { | 
| 1780 | struct bpf_map *map; | 1826 | struct bpf_map *map; | 
| 1781 | struct fd f; | 1827 | struct fd f; | 
| @@ -1867,6 +1913,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) | |||
| 1867 | insn->src_reg = 0; | 1913 | insn->src_reg = 0; | 
| 1868 | } | 1914 | } | 
| 1869 | 1915 | ||
| 1916 | static void adjust_branches(struct bpf_prog *prog, int pos, int delta) | ||
| 1917 | { | ||
| 1918 | struct bpf_insn *insn = prog->insnsi; | ||
| 1919 | int insn_cnt = prog->len; | ||
| 1920 | int i; | ||
| 1921 | |||
| 1922 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
| 1923 | if (BPF_CLASS(insn->code) != BPF_JMP || | ||
| 1924 | BPF_OP(insn->code) == BPF_CALL || | ||
| 1925 | BPF_OP(insn->code) == BPF_EXIT) | ||
| 1926 | continue; | ||
| 1927 | |||
| 1928 | /* adjust offset of jmps if necessary */ | ||
| 1929 | if (i < pos && i + insn->off + 1 > pos) | ||
| 1930 | insn->off += delta; | ||
| 1931 | else if (i > pos && i + insn->off + 1 < pos) | ||
| 1932 | insn->off -= delta; | ||
| 1933 | } | ||
| 1934 | } | ||
| 1935 | |||
| 1936 | /* convert load instructions that access fields of 'struct __sk_buff' | ||
| 1937 | * into sequence of instructions that access fields of 'struct sk_buff' | ||
| 1938 | */ | ||
| 1939 | static int convert_ctx_accesses(struct verifier_env *env) | ||
| 1940 | { | ||
| 1941 | struct bpf_insn *insn = env->prog->insnsi; | ||
| 1942 | int insn_cnt = env->prog->len; | ||
| 1943 | struct bpf_insn insn_buf[16]; | ||
| 1944 | struct bpf_prog *new_prog; | ||
| 1945 | u32 cnt; | ||
| 1946 | int i; | ||
| 1947 | |||
| 1948 | if (!env->prog->aux->ops->convert_ctx_access) | ||
| 1949 | return 0; | ||
| 1950 | |||
| 1951 | for (i = 0; i < insn_cnt; i++, insn++) { | ||
| 1952 | if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) | ||
| 1953 | continue; | ||
| 1954 | |||
| 1955 | if (insn->imm != PTR_TO_CTX) { | ||
| 1956 | /* clear internal mark */ | ||
| 1957 | insn->imm = 0; | ||
| 1958 | continue; | ||
| 1959 | } | ||
| 1960 | |||
| 1961 | cnt = env->prog->aux->ops-> | ||
| 1962 | convert_ctx_access(insn->dst_reg, insn->src_reg, | ||
| 1963 | insn->off, insn_buf); | ||
| 1964 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | ||
| 1965 | verbose("bpf verifier is misconfigured\n"); | ||
| 1966 | return -EINVAL; | ||
| 1967 | } | ||
| 1968 | |||
| 1969 | if (cnt == 1) { | ||
| 1970 | memcpy(insn, insn_buf, sizeof(*insn)); | ||
| 1971 | continue; | ||
| 1972 | } | ||
| 1973 | |||
| 1974 | /* several new insns need to be inserted. Make room for them */ | ||
| 1975 | insn_cnt += cnt - 1; | ||
| 1976 | new_prog = bpf_prog_realloc(env->prog, | ||
| 1977 | bpf_prog_size(insn_cnt), | ||
| 1978 | GFP_USER); | ||
| 1979 | if (!new_prog) | ||
| 1980 | return -ENOMEM; | ||
| 1981 | |||
| 1982 | new_prog->len = insn_cnt; | ||
| 1983 | |||
| 1984 | memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, | ||
| 1985 | sizeof(*insn) * (insn_cnt - i - cnt)); | ||
| 1986 | |||
| 1987 | /* copy substitute insns in place of load instruction */ | ||
| 1988 | memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); | ||
| 1989 | |||
| 1990 | /* adjust branches in the whole program */ | ||
| 1991 | adjust_branches(new_prog, i, cnt - 1); | ||
| 1992 | |||
| 1993 | /* keep walking new program and skip insns we just inserted */ | ||
| 1994 | env->prog = new_prog; | ||
| 1995 | insn = new_prog->insnsi + i + cnt - 1; | ||
| 1996 | i += cnt - 1; | ||
| 1997 | } | ||
| 1998 | |||
| 1999 | return 0; | ||
| 2000 | } | ||
| 2001 | |||
| 1870 | static void free_states(struct verifier_env *env) | 2002 | static void free_states(struct verifier_env *env) | 
| 1871 | { | 2003 | { | 
| 1872 | struct verifier_state_list *sl, *sln; | 2004 | struct verifier_state_list *sl, *sln; | 
| @@ -1889,13 +2021,13 @@ static void free_states(struct verifier_env *env) | |||
| 1889 | kfree(env->explored_states); | 2021 | kfree(env->explored_states); | 
| 1890 | } | 2022 | } | 
| 1891 | 2023 | ||
| 1892 | int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | 2024 | int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) | 
| 1893 | { | 2025 | { | 
| 1894 | char __user *log_ubuf = NULL; | 2026 | char __user *log_ubuf = NULL; | 
| 1895 | struct verifier_env *env; | 2027 | struct verifier_env *env; | 
| 1896 | int ret = -EINVAL; | 2028 | int ret = -EINVAL; | 
| 1897 | 2029 | ||
| 1898 | if (prog->len <= 0 || prog->len > BPF_MAXINSNS) | 2030 | if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) | 
| 1899 | return -E2BIG; | 2031 | return -E2BIG; | 
| 1900 | 2032 | ||
| 1901 | /* 'struct verifier_env' can be global, but since it's not small, | 2033 | /* 'struct verifier_env' can be global, but since it's not small, | 
| @@ -1905,7 +2037,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | |||
| 1905 | if (!env) | 2037 | if (!env) | 
| 1906 | return -ENOMEM; | 2038 | return -ENOMEM; | 
| 1907 | 2039 | ||
| 1908 | env->prog = prog; | 2040 | env->prog = *prog; | 
| 1909 | 2041 | ||
| 1910 | /* grab the mutex to protect few globals used by verifier */ | 2042 | /* grab the mutex to protect few globals used by verifier */ | 
| 1911 | mutex_lock(&bpf_verifier_lock); | 2043 | mutex_lock(&bpf_verifier_lock); | 
| @@ -1937,7 +2069,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) | |||
| 1937 | if (ret < 0) | 2069 | if (ret < 0) | 
| 1938 | goto skip_full_check; | 2070 | goto skip_full_check; | 
| 1939 | 2071 | ||
| 1940 | env->explored_states = kcalloc(prog->len, | 2072 | env->explored_states = kcalloc(env->prog->len, | 
| 1941 | sizeof(struct verifier_state_list *), | 2073 | sizeof(struct verifier_state_list *), | 
| 1942 | GFP_USER); | 2074 | GFP_USER); | 
| 1943 | ret = -ENOMEM; | 2075 | ret = -ENOMEM; | 
| @@ -1954,6 +2086,10 @@ skip_full_check: | |||
| 1954 | while (pop_stack(env, NULL) >= 0); | 2086 | while (pop_stack(env, NULL) >= 0); | 
| 1955 | free_states(env); | 2087 | free_states(env); | 
| 1956 | 2088 | ||
| 2089 | if (ret == 0) | ||
| 2090 | /* program is valid, convert *(u32*)(ctx + off) accesses */ | ||
| 2091 | ret = convert_ctx_accesses(env); | ||
| 2092 | |||
| 1957 | if (log_level && log_len >= log_size - 1) { | 2093 | if (log_level && log_len >= log_size - 1) { | 
| 1958 | BUG_ON(log_len >= log_size); | 2094 | BUG_ON(log_len >= log_size); | 
| 1959 | /* verifier log exceeded user supplied buffer */ | 2095 | /* verifier log exceeded user supplied buffer */ | 
| @@ -1969,18 +2105,18 @@ skip_full_check: | |||
| 1969 | 2105 | ||
| 1970 | if (ret == 0 && env->used_map_cnt) { | 2106 | if (ret == 0 && env->used_map_cnt) { | 
| 1971 | /* if program passed verifier, update used_maps in bpf_prog_info */ | 2107 | /* if program passed verifier, update used_maps in bpf_prog_info */ | 
| 1972 | prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | 2108 | env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, | 
| 1973 | sizeof(env->used_maps[0]), | 2109 | sizeof(env->used_maps[0]), | 
| 1974 | GFP_KERNEL); | 2110 | GFP_KERNEL); | 
| 1975 | 2111 | ||
| 1976 | if (!prog->aux->used_maps) { | 2112 | if (!env->prog->aux->used_maps) { | 
| 1977 | ret = -ENOMEM; | 2113 | ret = -ENOMEM; | 
| 1978 | goto free_log_buf; | 2114 | goto free_log_buf; | 
| 1979 | } | 2115 | } | 
| 1980 | 2116 | ||
| 1981 | memcpy(prog->aux->used_maps, env->used_maps, | 2117 | memcpy(env->prog->aux->used_maps, env->used_maps, | 
| 1982 | sizeof(env->used_maps[0]) * env->used_map_cnt); | 2118 | sizeof(env->used_maps[0]) * env->used_map_cnt); | 
| 1983 | prog->aux->used_map_cnt = env->used_map_cnt; | 2119 | env->prog->aux->used_map_cnt = env->used_map_cnt; | 
| 1984 | 2120 | ||
| 1985 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | 2121 | /* program is valid. Convert pseudo bpf_ld_imm64 into generic | 
| 1986 | * bpf_ld_imm64 instructions | 2122 | * bpf_ld_imm64 instructions | 
| @@ -1992,11 +2128,12 @@ free_log_buf: | |||
| 1992 | if (log_level) | 2128 | if (log_level) | 
| 1993 | vfree(log_buf); | 2129 | vfree(log_buf); | 
| 1994 | free_env: | 2130 | free_env: | 
| 1995 | if (!prog->aux->used_maps) | 2131 | if (!env->prog->aux->used_maps) | 
| 1996 | /* if we didn't copy map pointers into bpf_prog_info, release | 2132 | /* if we didn't copy map pointers into bpf_prog_info, release | 
| 1997 | * them now. Otherwise free_bpf_prog_info() will release them. | 2133 | * them now. Otherwise free_bpf_prog_info() will release them. | 
| 1998 | */ | 2134 | */ | 
| 1999 | release_maps(env); | 2135 | release_maps(env); | 
| 2136 | *prog = env->prog; | ||
| 2000 | kfree(env); | 2137 | kfree(env); | 
| 2001 | mutex_unlock(&bpf_verifier_lock); | 2138 | mutex_unlock(&bpf_verifier_lock); | 
| 2002 | return ret; | 2139 | return ret; | 
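The verifier now stashes the pointer type of each 32-bit context load in the otherwise reserved imm field, and convert_ctx_accesses() later asks the program type's convert_ctx_access callback to rewrite loads from the user-visible context into loads from the real kernel object, fixing up branch offsets when the rewrite grows the program. A hedged sketch of what such a callback looks like for a single __sk_buff field, loosely modeled on the networking side of the same series (the demo_* name and the fallback behaviour are illustrative assumptions):

        /* Rewrite a load of __sk_buff->len into a load from the corresponding
         * offset in struct sk_buff; return how many insns were emitted.
         */
        static u32 demo_convert_ctx_access(int dst_reg, int src_reg, int ctx_off,
                                           struct bpf_insn *insn_buf)
        {
                struct bpf_insn *insn = insn_buf;

                switch (ctx_off) {
                case offsetof(struct __sk_buff, len):
                        *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                              offsetof(struct sk_buff, len));
                        break;
                default:
                        /* unknown field: load 0 so the program stays valid */
                        *insn++ = BPF_MOV64_IMM(dst_reg, 0);
                        break;
                }

                return insn - insn_buf;
        }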
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
| @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) | |||
| 35 | } | 35 | } | 
| 36 | __setup("no_file_caps", file_caps_disable); | 36 | __setup("no_file_caps", file_caps_disable); | 
| 37 | 37 | ||
| 38 | #ifdef CONFIG_MULTIUSER | ||
| 38 | /* | 39 | /* | 
| 39 | * More recent versions of libcap are available from: | 40 | * More recent versions of libcap are available from: | 
| 40 | * | 41 | * | 
| @@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) | |||
| 386 | } | 387 | } | 
| 387 | EXPORT_SYMBOL(ns_capable); | 388 | EXPORT_SYMBOL(ns_capable); | 
| 388 | 389 | ||
| 390 | |||
| 391 | /** | ||
| 392 | * capable - Determine if the current task has a superior capability in effect | ||
| 393 | * @cap: The capability to be tested for | ||
| 394 | * | ||
| 395 | * Return true if the current task has the given superior capability currently | ||
| 396 | * available for use, false if not. | ||
| 397 | * | ||
| 398 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
| 399 | * assumption that it's about to be used. | ||
| 400 | */ | ||
| 401 | bool capable(int cap) | ||
| 402 | { | ||
| 403 | return ns_capable(&init_user_ns, cap); | ||
| 404 | } | ||
| 405 | EXPORT_SYMBOL(capable); | ||
| 406 | #endif /* CONFIG_MULTIUSER */ | ||
| 407 | |||
| 389 | /** | 408 | /** | 
| 390 | * file_ns_capable - Determine if the file's opener had a capability in effect | 409 | * file_ns_capable - Determine if the file's opener had a capability in effect | 
| 391 | * @file: The file we want to check | 410 | * @file: The file we want to check | 
| @@ -412,22 +431,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, | |||
| 412 | EXPORT_SYMBOL(file_ns_capable); | 431 | EXPORT_SYMBOL(file_ns_capable); | 
| 413 | 432 | ||
| 414 | /** | 433 | /** | 
| 415 | * capable - Determine if the current task has a superior capability in effect | ||
| 416 | * @cap: The capability to be tested for | ||
| 417 | * | ||
| 418 | * Return true if the current task has the given superior capability currently | ||
| 419 | * available for use, false if not. | ||
| 420 | * | ||
| 421 | * This sets PF_SUPERPRIV on the task if the capability is available on the | ||
| 422 | * assumption that it's about to be used. | ||
| 423 | */ | ||
| 424 | bool capable(int cap) | ||
| 425 | { | ||
| 426 | return ns_capable(&init_user_ns, cap); | ||
| 427 | } | ||
| 428 | EXPORT_SYMBOL(capable); | ||
| 429 | |||
| 430 | /** | ||
| 431 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped | 434 | * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped | 
| 432 | * @inode: The inode in question | 435 | * @inode: The inode in question | 
| 433 | * @cap: The capability in question | 436 | * @cap: The capability in question | 
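Moving capable() inside the new #ifdef only works because the CONFIG_MULTIUSER=n configuration is expected to provide trivial header-side stubs, so existing callers keep compiling. A hedged sketch of the fallback this relies on (the real stub lives in the capability headers; treat the exact form and location here as an assumption):

        #ifndef CONFIG_MULTIUSER
        /* with only the root user present, capability checks can simply succeed */
        static inline bool capable(int cap)
        {
                return true;
        }
        #endif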
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 29a7b2cc593e..469dd547770c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) | |||
| 3806 | 3806 | ||
| 3807 | static void pidlist_free(void *p) | 3807 | static void pidlist_free(void *p) | 
| 3808 | { | 3808 | { | 
| 3809 | if (is_vmalloc_addr(p)) | 3809 | kvfree(p); | 
| 3810 | vfree(p); | ||
| 3811 | else | ||
| 3812 | kfree(p); | ||
| 3813 | } | 3810 | } | 
| 3814 | 3811 | ||
| 3815 | /* | 3812 | /* | 
| @@ -4199,7 +4196,9 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) | |||
| 4199 | 4196 | ||
| 4200 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | 4197 | static int cgroup_pidlist_show(struct seq_file *s, void *v) | 
| 4201 | { | 4198 | { | 
| 4202 | return seq_printf(s, "%d\n", *(int *)v); | 4199 | seq_printf(s, "%d\n", *(int *)v); | 
| 4200 | |||
| 4201 | return 0; | ||
| 4203 | } | 4202 | } | 
| 4204 | 4203 | ||
| 4205 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 4204 | static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, | 
| @@ -5040,6 +5039,9 @@ int __init cgroup_init(void) | |||
| 5040 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); | 5039 | WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); | 
| 5041 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); | 5040 | WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); | 
| 5042 | } | 5041 | } | 
| 5042 | |||
| 5043 | if (ss->bind) | ||
| 5044 | ss->bind(init_css_set.subsys[ssid]); | ||
| 5043 | } | 5045 | } | 
| 5044 | 5046 | ||
| 5045 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 5047 | cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); | 
| @@ -5451,7 +5453,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | |||
| 5451 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 5453 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) | 
| 5452 | { | 5454 | { | 
| 5453 | WARN_ON_ONCE(!rcu_read_lock_held()); | 5455 | WARN_ON_ONCE(!rcu_read_lock_held()); | 
| 5454 | return idr_find(&ss->css_idr, id); | 5456 | return id > 0 ? idr_find(&ss->css_idr, id) : NULL; | 
| 5455 | } | 5457 | } | 
| 5456 | 5458 | ||
| 5457 | #ifdef CONFIG_CGROUP_DEBUG | 5459 | #ifdef CONFIG_CGROUP_DEBUG | 
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdfdf258..72d59a1a6eb6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
| @@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) | |||
| 39 | } | 39 | } | 
| 40 | 40 | ||
| 41 | /** | 41 | /** | 
| 42 | * context_tracking_user_enter - Inform the context tracking that the CPU is going to | 42 | * context_tracking_enter - Inform the context tracking that the CPU is going | 
| 43 | * enter userspace mode. | 43 | * enter user or guest space mode. | 
| 44 | * | 44 | * | 
| 45 | * This function must be called right before we switch from the kernel | 45 | * This function must be called right before we switch from the kernel | 
| 46 | * to userspace, when it's guaranteed the remaining kernel instructions | 46 | * to user or guest space, when it's guaranteed the remaining kernel | 
| 47 | * to execute won't use any RCU read side critical section because this | 47 | * instructions to execute won't use any RCU read side critical section | 
| 48 | * function sets RCU in extended quiescent state. | 48 | * because this function sets RCU in extended quiescent state. | 
| 49 | */ | 49 | */ | 
| 50 | void context_tracking_user_enter(void) | 50 | void context_tracking_enter(enum ctx_state state) | 
| 51 | { | 51 | { | 
| 52 | unsigned long flags; | 52 | unsigned long flags; | 
| 53 | 53 | ||
| @@ -75,9 +75,8 @@ void context_tracking_user_enter(void) | |||
| 75 | WARN_ON_ONCE(!current->mm); | 75 | WARN_ON_ONCE(!current->mm); | 
| 76 | 76 | ||
| 77 | local_irq_save(flags); | 77 | local_irq_save(flags); | 
| 78 | if ( __this_cpu_read(context_tracking.state) != IN_USER) { | 78 | if ( __this_cpu_read(context_tracking.state) != state) { | 
| 79 | if (__this_cpu_read(context_tracking.active)) { | 79 | if (__this_cpu_read(context_tracking.active)) { | 
| 80 | trace_user_enter(0); | ||
| 81 | /* | 80 | /* | 
| 82 | * At this stage, only low level arch entry code remains and | 81 | * At this stage, only low level arch entry code remains and | 
| 83 | * then we'll run in userspace. We can assume there won't be | 82 | * then we'll run in userspace. We can assume there won't be | 
| @@ -85,7 +84,10 @@ void context_tracking_user_enter(void) | |||
| 85 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 84 | * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency | 
| 86 | * on the tick. | 85 | * on the tick. | 
| 87 | */ | 86 | */ | 
| 88 | vtime_user_enter(current); | 87 | if (state == CONTEXT_USER) { | 
| 88 | trace_user_enter(0); | ||
| 89 | vtime_user_enter(current); | ||
| 90 | } | ||
| 89 | rcu_user_enter(); | 91 | rcu_user_enter(); | 
| 90 | } | 92 | } | 
| 91 | /* | 93 | /* | 
| @@ -101,24 +103,32 @@ void context_tracking_user_enter(void) | |||
| 101 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | 103 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active | 
| 102 | * is false because we know that CPU is not tickless. | 104 | * is false because we know that CPU is not tickless. | 
| 103 | */ | 105 | */ | 
| 104 | __this_cpu_write(context_tracking.state, IN_USER); | 106 | __this_cpu_write(context_tracking.state, state); | 
| 105 | } | 107 | } | 
| 106 | local_irq_restore(flags); | 108 | local_irq_restore(flags); | 
| 107 | } | 109 | } | 
| 110 | NOKPROBE_SYMBOL(context_tracking_enter); | ||
| 111 | EXPORT_SYMBOL_GPL(context_tracking_enter); | ||
| 112 | |||
| 113 | void context_tracking_user_enter(void) | ||
| 114 | { | ||
| 115 | context_tracking_enter(CONTEXT_USER); | ||
| 116 | } | ||
| 108 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 117 | NOKPROBE_SYMBOL(context_tracking_user_enter); | 
| 109 | 118 | ||
| 110 | /** | 119 | /** | 
| 111 | * context_tracking_user_exit - Inform the context tracking that the CPU is | 120 | * context_tracking_exit - Inform the context tracking that the CPU is | 
| 112 | * exiting userspace mode and entering the kernel. | 121 | * exiting user or guest mode and entering the kernel. | 
| 113 | * | 122 | * | 
| 114 | * This function must be called after we entered the kernel from userspace | 123 | * This function must be called after we entered the kernel from user or | 
| 115 | * before any use of RCU read side critical section. This potentially include | 124 | * guest space before any use of RCU read side critical section. This | 
| 116 | * any high level kernel code like syscalls, exceptions, signal handling, etc... | 125 | * potentially include any high level kernel code like syscalls, exceptions, | 
| 126 | * signal handling, etc... | ||
| 117 | * | 127 | * | 
| 118 | * This call supports re-entrancy. This way it can be called from any exception | 128 | * This call supports re-entrancy. This way it can be called from any exception | 
| 119 | * handler without needing to know if we came from userspace or not. | 129 | * handler without needing to know if we came from userspace or not. | 
| 120 | */ | 130 | */ | 
| 121 | void context_tracking_user_exit(void) | 131 | void context_tracking_exit(enum ctx_state state) | 
| 122 | { | 132 | { | 
| 123 | unsigned long flags; | 133 | unsigned long flags; | 
| 124 | 134 | ||
| @@ -129,20 +139,29 @@ void context_tracking_user_exit(void) | |||
| 129 | return; | 139 | return; | 
| 130 | 140 | ||
| 131 | local_irq_save(flags); | 141 | local_irq_save(flags); | 
| 132 | if (__this_cpu_read(context_tracking.state) == IN_USER) { | 142 | if (__this_cpu_read(context_tracking.state) == state) { | 
| 133 | if (__this_cpu_read(context_tracking.active)) { | 143 | if (__this_cpu_read(context_tracking.active)) { | 
| 134 | /* | 144 | /* | 
| 135 | * We are going to run code that may use RCU. Inform | 145 | * We are going to run code that may use RCU. Inform | 
| 136 | * RCU core about that (ie: we may need the tick again). | 146 | * RCU core about that (ie: we may need the tick again). | 
| 137 | */ | 147 | */ | 
| 138 | rcu_user_exit(); | 148 | rcu_user_exit(); | 
| 139 | vtime_user_exit(current); | 149 | if (state == CONTEXT_USER) { | 
| 140 | trace_user_exit(0); | 150 | vtime_user_exit(current); | 
| 151 | trace_user_exit(0); | ||
| 152 | } | ||
| 141 | } | 153 | } | 
| 142 | __this_cpu_write(context_tracking.state, IN_KERNEL); | 154 | __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); | 
| 143 | } | 155 | } | 
| 144 | local_irq_restore(flags); | 156 | local_irq_restore(flags); | 
| 145 | } | 157 | } | 
| 158 | NOKPROBE_SYMBOL(context_tracking_exit); | ||
| 159 | EXPORT_SYMBOL_GPL(context_tracking_exit); | ||
| 160 | |||
| 161 | void context_tracking_user_exit(void) | ||
| 162 | { | ||
| 163 | context_tracking_exit(CONTEXT_USER); | ||
| 164 | } | ||
| 146 | NOKPROBE_SYMBOL(context_tracking_user_exit); | 165 | NOKPROBE_SYMBOL(context_tracking_user_exit); | 
| 147 | 166 | ||
| 148 | /** | 167 | /** | 
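context_tracking_user_enter()/_exit() become thin wrappers around the state-parameterised context_tracking_enter()/_exit(), and exporting the latter lets the guest-entry path reuse the same RCU plumbing while keeping the vtime/trace hooks specific to CONTEXT_USER. An illustrative pair of wrappers showing the intended guest usage, assuming the CONTEXT_GUEST state introduced by the same series (the demo_ names are not kernel symbols):

        static inline void demo_guest_enter(void)
        {
                context_tracking_enter(CONTEXT_GUEST);
        }

        static inline void demo_guest_exit(void)
        {
                context_tracking_exit(CONTEXT_GUEST);
        }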
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/gfp.h> | 20 | #include <linux/gfp.h> | 
| 21 | #include <linux/suspend.h> | 21 | #include <linux/suspend.h> | 
| 22 | #include <linux/lockdep.h> | 22 | #include <linux/lockdep.h> | 
| 23 | #include <linux/tick.h> | ||
| 23 | #include <trace/events/power.h> | 24 | #include <trace/events/power.h> | 
| 24 | 25 | ||
| 25 | #include "smpboot.h" | 26 | #include "smpboot.h" | 
| @@ -338,6 +339,8 @@ static int __ref take_cpu_down(void *_param) | |||
| 338 | return err; | 339 | return err; | 
| 339 | 340 | ||
| 340 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 341 | cpu_notify(CPU_DYING | param->mod, param->hcpu); | 
| 342 | /* Give up timekeeping duties */ | ||
| 343 | tick_handover_do_timer(); | ||
| 341 | /* Park the stopper thread */ | 344 | /* Park the stopper thread */ | 
| 342 | kthread_park(current); | 345 | kthread_park(current); | 
| 343 | return 0; | 346 | return 0; | 
| @@ -408,13 +411,17 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 408 | * | 411 | * | 
| 409 | * Wait for the stop thread to go away. | 412 | * Wait for the stop thread to go away. | 
| 410 | */ | 413 | */ | 
| 411 | while (!idle_cpu(cpu)) | 414 | while (!per_cpu(cpu_dead_idle, cpu)) | 
| 412 | cpu_relax(); | 415 | cpu_relax(); | 
| 416 | smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ | ||
| 417 | per_cpu(cpu_dead_idle, cpu) = false; | ||
| 413 | 418 | ||
| 419 | hotplug_cpu__broadcast_tick_pull(cpu); | ||
| 414 | /* This actually kills the CPU. */ | 420 | /* This actually kills the CPU. */ | 
| 415 | __cpu_die(cpu); | 421 | __cpu_die(cpu); | 
| 416 | 422 | ||
| 417 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 423 | /* CPU is completely dead: tell everyone. Too late to complain. */ | 
| 424 | tick_cleanup_dead_cpu(cpu); | ||
| 418 | cpu_notify_nofail(CPU_DEAD | mod, hcpu); | 425 | cpu_notify_nofail(CPU_DEAD | mod, hcpu); | 
| 419 | 426 | ||
| 420 | check_for_tasks(cpu); | 427 | check_for_tasks(cpu); | 
| @@ -446,6 +453,37 @@ out: | |||
| 446 | EXPORT_SYMBOL(cpu_down); | 453 | EXPORT_SYMBOL(cpu_down); | 
| 447 | #endif /*CONFIG_HOTPLUG_CPU*/ | 454 | #endif /*CONFIG_HOTPLUG_CPU*/ | 
| 448 | 455 | ||
| 456 | /* | ||
| 457 | * Unpark per-CPU smpboot kthreads at CPU-online time. | ||
| 458 | */ | ||
| 459 | static int smpboot_thread_call(struct notifier_block *nfb, | ||
| 460 | unsigned long action, void *hcpu) | ||
| 461 | { | ||
| 462 | int cpu = (long)hcpu; | ||
| 463 | |||
| 464 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 465 | |||
| 466 | case CPU_ONLINE: | ||
| 467 | smpboot_unpark_threads(cpu); | ||
| 468 | break; | ||
| 469 | |||
| 470 | default: | ||
| 471 | break; | ||
| 472 | } | ||
| 473 | |||
| 474 | return NOTIFY_OK; | ||
| 475 | } | ||
| 476 | |||
| 477 | static struct notifier_block smpboot_thread_notifier = { | ||
| 478 | .notifier_call = smpboot_thread_call, | ||
| 479 | .priority = CPU_PRI_SMPBOOT, | ||
| 480 | }; | ||
| 481 | |||
| 482 | void __cpuinit smpboot_thread_init(void) | ||
| 483 | { | ||
| 484 | register_cpu_notifier(&smpboot_thread_notifier); | ||
| 485 | } | ||
| 486 | |||
| 449 | /* Requires cpu_add_remove_lock to be held */ | 487 | /* Requires cpu_add_remove_lock to be held */ | 
| 450 | static int _cpu_up(unsigned int cpu, int tasks_frozen) | 488 | static int _cpu_up(unsigned int cpu, int tasks_frozen) | 
| 451 | { | 489 | { | 
| @@ -485,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 485 | goto out_notify; | 523 | goto out_notify; | 
| 486 | BUG_ON(!cpu_online(cpu)); | 524 | BUG_ON(!cpu_online(cpu)); | 
| 487 | 525 | ||
| 488 | /* Wake the per cpu threads */ | ||
| 489 | smpboot_unpark_threads(cpu); | ||
| 490 | |||
| 491 | /* Now call notifier in preparation. */ | 526 | /* Now call notifier in preparation. */ | 
| 492 | cpu_notify(CPU_ONLINE | mod, hcpu); | 527 | cpu_notify(CPU_ONLINE | mod, hcpu); | 
| 493 | 528 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fc7f4748d34a..ee14e3a35a29 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 622 | int csn; /* how many cpuset ptrs in csa so far */ | 622 | int csn; /* how many cpuset ptrs in csa so far */ | 
| 623 | int i, j, k; /* indices for partition finding loops */ | 623 | int i, j, k; /* indices for partition finding loops */ | 
| 624 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ | 624 | cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ | 
| 625 | cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ | ||
| 625 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 626 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 
| 626 | int ndoms = 0; /* number of sched domains in result */ | 627 | int ndoms = 0; /* number of sched domains in result */ | 
| 627 | int nslot; /* next empty doms[] struct cpumask slot */ | 628 | int nslot; /* next empty doms[] struct cpumask slot */ | 
| @@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 631 | dattr = NULL; | 632 | dattr = NULL; | 
| 632 | csa = NULL; | 633 | csa = NULL; | 
| 633 | 634 | ||
| 635 | if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) | ||
| 636 | goto done; | ||
| 637 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | ||
| 638 | |||
| 634 | /* Special case for the 99% of systems with one, full, sched domain */ | 639 | /* Special case for the 99% of systems with one, full, sched domain */ | 
| 635 | if (is_sched_load_balance(&top_cpuset)) { | 640 | if (is_sched_load_balance(&top_cpuset)) { | 
| 636 | ndoms = 1; | 641 | ndoms = 1; | 
| @@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 643 | *dattr = SD_ATTR_INIT; | 648 | *dattr = SD_ATTR_INIT; | 
| 644 | update_domain_attr_tree(dattr, &top_cpuset); | 649 | update_domain_attr_tree(dattr, &top_cpuset); | 
| 645 | } | 650 | } | 
| 646 | cpumask_copy(doms[0], top_cpuset.effective_cpus); | 651 | cpumask_and(doms[0], top_cpuset.effective_cpus, | 
| 652 | non_isolated_cpus); | ||
| 647 | 653 | ||
| 648 | goto done; | 654 | goto done; | 
| 649 | } | 655 | } | 
| @@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
| 666 | * the corresponding sched domain. | 672 | * the corresponding sched domain. | 
| 667 | */ | 673 | */ | 
| 668 | if (!cpumask_empty(cp->cpus_allowed) && | 674 | if (!cpumask_empty(cp->cpus_allowed) && | 
| 669 | !is_sched_load_balance(cp)) | 675 | !(is_sched_load_balance(cp) && | 
| 676 | cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) | ||
| 670 | continue; | 677 | continue; | 
| 671 | 678 | ||
| 672 | if (is_sched_load_balance(cp)) | 679 | if (is_sched_load_balance(cp)) | 
| @@ -748,6 +755,7 @@ restart: | |||
| 748 | 755 | ||
| 749 | if (apn == b->pn) { | 756 | if (apn == b->pn) { | 
| 750 | cpumask_or(dp, dp, b->effective_cpus); | 757 | cpumask_or(dp, dp, b->effective_cpus); | 
| 758 | cpumask_and(dp, dp, non_isolated_cpus); | ||
| 751 | if (dattr) | 759 | if (dattr) | 
| 752 | update_domain_attr_tree(dattr + nslot, b); | 760 | update_domain_attr_tree(dattr + nslot, b); | 
| 753 | 761 | ||
| @@ -760,6 +768,7 @@ restart: | |||
| 760 | BUG_ON(nslot != ndoms); | 768 | BUG_ON(nslot != ndoms); | 
| 761 | 769 | ||
| 762 | done: | 770 | done: | 
| 771 | free_cpumask_var(non_isolated_cpus); | ||
| 763 | kfree(csa); | 772 | kfree(csa); | 
| 764 | 773 | ||
| 765 | /* | 774 | /* | 
| @@ -2444,20 +2453,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) | |||
| 2444 | * @node: is this an allowed node? | 2453 | * @node: is this an allowed node? | 
| 2445 | * @gfp_mask: memory allocation flags | 2454 | * @gfp_mask: memory allocation flags | 
| 2446 | * | 2455 | * | 
| 2447 | * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is | 2456 | * If we're in interrupt, yes, we can always allocate. If @node is set in | 
| 2448 | * set, yes, we can always allocate. If node is in our task's mems_allowed, | 2457 | * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this | 
| 2449 | * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest | 2458 | * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, | 
| 2450 | * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been | 2459 | * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. | 
| 2451 | * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE | ||
| 2452 | * flag, yes. | ||
| 2453 | * Otherwise, no. | 2460 | * Otherwise, no. | 
| 2454 | * | 2461 | * | 
| 2455 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2456 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2457 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2458 | * any node on the zonelist except the first. By the time any such | ||
| 2459 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2460 | * | ||
| 2461 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2462 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 
| 2462 | * and do not allow allocations outside the current tasks cpuset | 2463 | * and do not allow allocations outside the current tasks cpuset | 
| 2463 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 2464 | * unless the task has been OOM killed as is marked TIF_MEMDIE. | 
| @@ -2493,7 +2494,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) | |||
| 2493 | int allowed; /* is allocation in zone z allowed? */ | 2494 | int allowed; /* is allocation in zone z allowed? */ | 
| 2494 | unsigned long flags; | 2495 | unsigned long flags; | 
| 2495 | 2496 | ||
| 2496 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | 2497 | if (in_interrupt()) | 
| 2497 | return 1; | 2498 | return 1; | 
| 2498 | if (node_isset(node, current->mems_allowed)) | 2499 | if (node_isset(node, current->mems_allowed)) | 
| 2499 | return 1; | 2500 | return 1; | 
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
| @@ -29,6 +29,9 @@ | |||
| 29 | 29 | ||
| 30 | static struct kmem_cache *cred_jar; | 30 | static struct kmem_cache *cred_jar; | 
| 31 | 31 | ||
| 32 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
| 33 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
| 34 | |||
| 32 | /* | 35 | /* | 
| 33 | * The initial credentials for the initial task | 36 | * The initial credentials for the initial task | 
| 34 | */ | 37 | */ | 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f04daabfd1cf..81aa3a4ece9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
| @@ -34,14 +34,16 @@ | |||
| 34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> | 
| 35 | #include <linux/anon_inodes.h> | 35 | #include <linux/anon_inodes.h> | 
| 36 | #include <linux/kernel_stat.h> | 36 | #include <linux/kernel_stat.h> | 
| 37 | #include <linux/cgroup.h> | ||
| 37 | #include <linux/perf_event.h> | 38 | #include <linux/perf_event.h> | 
| 38 | #include <linux/ftrace_event.h> | 39 | #include <linux/ftrace_event.h> | 
| 39 | #include <linux/hw_breakpoint.h> | 40 | #include <linux/hw_breakpoint.h> | 
| 40 | #include <linux/mm_types.h> | 41 | #include <linux/mm_types.h> | 
| 41 | #include <linux/cgroup.h> | ||
| 42 | #include <linux/module.h> | 42 | #include <linux/module.h> | 
| 43 | #include <linux/mman.h> | 43 | #include <linux/mman.h> | 
| 44 | #include <linux/compat.h> | 44 | #include <linux/compat.h> | 
| 45 | #include <linux/bpf.h> | ||
| 46 | #include <linux/filter.h> | ||
| 45 | 47 | ||
| 46 | #include "internal.h" | 48 | #include "internal.h" | 
| 47 | 49 | ||
| @@ -153,7 +155,7 @@ enum event_type_t { | |||
| 153 | */ | 155 | */ | 
| 154 | struct static_key_deferred perf_sched_events __read_mostly; | 156 | struct static_key_deferred perf_sched_events __read_mostly; | 
| 155 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 157 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 
| 156 | static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); | 158 | static DEFINE_PER_CPU(int, perf_sched_cb_usages); | 
| 157 | 159 | ||
| 158 | static atomic_t nr_mmap_events __read_mostly; | 160 | static atomic_t nr_mmap_events __read_mostly; | 
| 159 | static atomic_t nr_comm_events __read_mostly; | 161 | static atomic_t nr_comm_events __read_mostly; | 
| @@ -327,6 +329,11 @@ static inline u64 perf_clock(void) | |||
| 327 | return local_clock(); | 329 | return local_clock(); | 
| 328 | } | 330 | } | 
| 329 | 331 | ||
| 332 | static inline u64 perf_event_clock(struct perf_event *event) | ||
| 333 | { | ||
| 334 | return event->clock(); | ||
| 335 | } | ||
| 336 | |||
| 330 | static inline struct perf_cpu_context * | 337 | static inline struct perf_cpu_context * | 
| 331 | __get_cpu_context(struct perf_event_context *ctx) | 338 | __get_cpu_context(struct perf_event_context *ctx) | 
| 332 | { | 339 | { | 
| @@ -351,32 +358,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | |||
| 351 | 358 | ||
| 352 | #ifdef CONFIG_CGROUP_PERF | 359 | #ifdef CONFIG_CGROUP_PERF | 
| 353 | 360 | ||
| 354 | /* | ||
| 355 | * perf_cgroup_info keeps track of time_enabled for a cgroup. | ||
| 356 | * This is a per-cpu dynamically allocated data structure. | ||
| 357 | */ | ||
| 358 | struct perf_cgroup_info { | ||
| 359 | u64 time; | ||
| 360 | u64 timestamp; | ||
| 361 | }; | ||
| 362 | |||
| 363 | struct perf_cgroup { | ||
| 364 | struct cgroup_subsys_state css; | ||
| 365 | struct perf_cgroup_info __percpu *info; | ||
| 366 | }; | ||
| 367 | |||
| 368 | /* | ||
| 369 | * Must ensure cgroup is pinned (css_get) before calling | ||
| 370 | * this function. In other words, we cannot call this function | ||
| 371 | * if there is no cgroup event for the current CPU context. | ||
| 372 | */ | ||
| 373 | static inline struct perf_cgroup * | ||
| 374 | perf_cgroup_from_task(struct task_struct *task) | ||
| 375 | { | ||
| 376 | return container_of(task_css(task, perf_event_cgrp_id), | ||
| 377 | struct perf_cgroup, css); | ||
| 378 | } | ||
| 379 | |||
| 380 | static inline bool | 361 | static inline bool | 
| 381 | perf_cgroup_match(struct perf_event *event) | 362 | perf_cgroup_match(struct perf_event *event) | 
| 382 | { | 363 | { | 
| @@ -905,6 +886,15 @@ static void get_ctx(struct perf_event_context *ctx) | |||
| 905 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 886 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 
| 906 | } | 887 | } | 
| 907 | 888 | ||
| 889 | static void free_ctx(struct rcu_head *head) | ||
| 890 | { | ||
| 891 | struct perf_event_context *ctx; | ||
| 892 | |||
| 893 | ctx = container_of(head, struct perf_event_context, rcu_head); | ||
| 894 | kfree(ctx->task_ctx_data); | ||
| 895 | kfree(ctx); | ||
| 896 | } | ||
| 897 | |||
| 908 | static void put_ctx(struct perf_event_context *ctx) | 898 | static void put_ctx(struct perf_event_context *ctx) | 
| 909 | { | 899 | { | 
| 910 | if (atomic_dec_and_test(&ctx->refcount)) { | 900 | if (atomic_dec_and_test(&ctx->refcount)) { | 
| @@ -912,7 +902,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 912 | put_ctx(ctx->parent_ctx); | 902 | put_ctx(ctx->parent_ctx); | 
| 913 | if (ctx->task) | 903 | if (ctx->task) | 
| 914 | put_task_struct(ctx->task); | 904 | put_task_struct(ctx->task); | 
| 915 | kfree_rcu(ctx, rcu_head); | 905 | call_rcu(&ctx->rcu_head, free_ctx); | 
| 916 | } | 906 | } | 
| 917 | } | 907 | } | 
| 918 | 908 | ||
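The switch from kfree_rcu() to call_rcu() is needed because kfree_rcu() can only free the object that embeds the rcu_head; the context now also owns a separately allocated task_ctx_data buffer, which the new free_ctx() callback above releases together with the context itself. The general pattern, with illustrative names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        void *extra;             /* separately allocated member */
        struct rcu_head rcu;
};

static void foo_free_rcu(struct rcu_head *head)
{
        struct foo *f = container_of(head, struct foo, rcu);

        kfree(f->extra);         /* this is what kfree_rcu() could not do */
        kfree(f);
}

static void foo_release(struct foo *f)
{
        call_rcu(&f->rcu, foo_free_rcu);
}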
| @@ -1239,9 +1229,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1239 | if (is_cgroup_event(event)) | 1229 | if (is_cgroup_event(event)) | 
| 1240 | ctx->nr_cgroups++; | 1230 | ctx->nr_cgroups++; | 
| 1241 | 1231 | ||
| 1242 | if (has_branch_stack(event)) | ||
| 1243 | ctx->nr_branch_stack++; | ||
| 1244 | |||
| 1245 | list_add_rcu(&event->event_entry, &ctx->event_list); | 1232 | list_add_rcu(&event->event_entry, &ctx->event_list); | 
| 1246 | ctx->nr_events++; | 1233 | ctx->nr_events++; | 
| 1247 | if (event->attr.inherit_stat) | 1234 | if (event->attr.inherit_stat) | 
| @@ -1408,9 +1395,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
| 1408 | cpuctx->cgrp = NULL; | 1395 | cpuctx->cgrp = NULL; | 
| 1409 | } | 1396 | } | 
| 1410 | 1397 | ||
| 1411 | if (has_branch_stack(event)) | ||
| 1412 | ctx->nr_branch_stack--; | ||
| 1413 | |||
| 1414 | ctx->nr_events--; | 1398 | ctx->nr_events--; | 
| 1415 | if (event->attr.inherit_stat) | 1399 | if (event->attr.inherit_stat) | 
| 1416 | ctx->nr_stat--; | 1400 | ctx->nr_stat--; | 
| @@ -1847,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, | |||
| 1847 | #define MAX_INTERRUPTS (~0ULL) | 1831 | #define MAX_INTERRUPTS (~0ULL) | 
| 1848 | 1832 | ||
| 1849 | static void perf_log_throttle(struct perf_event *event, int enable); | 1833 | static void perf_log_throttle(struct perf_event *event, int enable); | 
| 1834 | static void perf_log_itrace_start(struct perf_event *event); | ||
| 1850 | 1835 | ||
| 1851 | static int | 1836 | static int | 
| 1852 | event_sched_in(struct perf_event *event, | 1837 | event_sched_in(struct perf_event *event, | 
| @@ -1881,6 +1866,12 @@ event_sched_in(struct perf_event *event, | |||
| 1881 | 1866 | ||
| 1882 | perf_pmu_disable(event->pmu); | 1867 | perf_pmu_disable(event->pmu); | 
| 1883 | 1868 | ||
| 1869 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
| 1870 | |||
| 1871 | perf_set_shadow_time(event, ctx, tstamp); | ||
| 1872 | |||
| 1873 | perf_log_itrace_start(event); | ||
| 1874 | |||
| 1884 | if (event->pmu->add(event, PERF_EF_START)) { | 1875 | if (event->pmu->add(event, PERF_EF_START)) { | 
| 1885 | event->state = PERF_EVENT_STATE_INACTIVE; | 1876 | event->state = PERF_EVENT_STATE_INACTIVE; | 
| 1886 | event->oncpu = -1; | 1877 | event->oncpu = -1; | 
| @@ -1888,10 +1879,6 @@ event_sched_in(struct perf_event *event, | |||
| 1888 | goto out; | 1879 | goto out; | 
| 1889 | } | 1880 | } | 
| 1890 | 1881 | ||
| 1891 | event->tstamp_running += tstamp - event->tstamp_stopped; | ||
| 1892 | |||
| 1893 | perf_set_shadow_time(event, ctx, tstamp); | ||
| 1894 | |||
| 1895 | if (!is_software_event(event)) | 1882 | if (!is_software_event(event)) | 
| 1896 | cpuctx->active_oncpu++; | 1883 | cpuctx->active_oncpu++; | 
| 1897 | if (!ctx->nr_active++) | 1884 | if (!ctx->nr_active++) | 
| @@ -2559,6 +2546,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 2559 | next->perf_event_ctxp[ctxn] = ctx; | 2546 | next->perf_event_ctxp[ctxn] = ctx; | 
| 2560 | ctx->task = next; | 2547 | ctx->task = next; | 
| 2561 | next_ctx->task = task; | 2548 | next_ctx->task = task; | 
| 2549 | |||
| 2550 | swap(ctx->task_ctx_data, next_ctx->task_ctx_data); | ||
| 2551 | |||
| 2562 | do_switch = 0; | 2552 | do_switch = 0; | 
| 2563 | 2553 | ||
| 2564 | perf_event_sync_stat(ctx, next_ctx); | 2554 | perf_event_sync_stat(ctx, next_ctx); | 
| @@ -2577,6 +2567,56 @@ unlock: | |||
| 2577 | } | 2567 | } | 
| 2578 | } | 2568 | } | 
| 2579 | 2569 | ||
| 2570 | void perf_sched_cb_dec(struct pmu *pmu) | ||
| 2571 | { | ||
| 2572 | this_cpu_dec(perf_sched_cb_usages); | ||
| 2573 | } | ||
| 2574 | |||
| 2575 | void perf_sched_cb_inc(struct pmu *pmu) | ||
| 2576 | { | ||
| 2577 | this_cpu_inc(perf_sched_cb_usages); | ||
| 2578 | } | ||
| 2579 | |||
| 2580 | /* | ||
| 2581 | * This function provides the context switch callback to the lower code | ||
| 2582 | * layer. It is invoked ONLY when the context switch callback is enabled. | ||
| 2583 | */ | ||
| 2584 | static void perf_pmu_sched_task(struct task_struct *prev, | ||
| 2585 | struct task_struct *next, | ||
| 2586 | bool sched_in) | ||
| 2587 | { | ||
| 2588 | struct perf_cpu_context *cpuctx; | ||
| 2589 | struct pmu *pmu; | ||
| 2590 | unsigned long flags; | ||
| 2591 | |||
| 2592 | if (prev == next) | ||
| 2593 | return; | ||
| 2594 | |||
| 2595 | local_irq_save(flags); | ||
| 2596 | |||
| 2597 | rcu_read_lock(); | ||
| 2598 | |||
| 2599 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 2600 | if (pmu->sched_task) { | ||
| 2601 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2602 | |||
| 2603 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
| 2604 | |||
| 2605 | perf_pmu_disable(pmu); | ||
| 2606 | |||
| 2607 | pmu->sched_task(cpuctx->task_ctx, sched_in); | ||
| 2608 | |||
| 2609 | perf_pmu_enable(pmu); | ||
| 2610 | |||
| 2611 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
| 2612 | } | ||
| 2613 | } | ||
| 2614 | |||
| 2615 | rcu_read_unlock(); | ||
| 2616 | |||
| 2617 | local_irq_restore(flags); | ||
| 2618 | } | ||
| 2619 | |||
| 2580 | #define for_each_task_context_nr(ctxn) \ | 2620 | #define for_each_task_context_nr(ctxn) \ | 
| 2581 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | 2621 | for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) | 
| 2582 | 2622 | ||
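perf_sched_cb_inc()/perf_sched_cb_dec() replace the branch-stack-specific plumbing with a generic per-cpu interest counter: perf_pmu_sched_task() only walks the PMU list when at least one caller on this CPU asked for the context-switch hook. A rough sketch of how a driver might use it; the my_pmu_* names and the choice of bumping the counter in add/del are assumptions, not taken from this patch.

static void my_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
        /* save, restore or flush per-task hardware state here;
         * ctx may be NULL if this CPU has no task context */
}

static int my_pmu_add(struct perf_event *event, int flags)
{
        /* ... program the counter ... */

        /* register interest in context switches while this event is on */
        perf_sched_cb_inc(event->ctx->pmu);
        return 0;
}

static void my_pmu_del(struct perf_event *event, int flags)
{
        perf_sched_cb_dec(event->ctx->pmu);
        /* ... stop the counter ... */
}

static struct pmu my_pmu = {
        /* other callbacks omitted */
        .add            = my_pmu_add,
        .del            = my_pmu_del,
        .sched_task     = my_pmu_sched_task,
};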
| @@ -2596,6 +2636,9 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 2596 | { | 2636 | { | 
| 2597 | int ctxn; | 2637 | int ctxn; | 
| 2598 | 2638 | ||
| 2639 | if (__this_cpu_read(perf_sched_cb_usages)) | ||
| 2640 | perf_pmu_sched_task(task, next, false); | ||
| 2641 | |||
| 2599 | for_each_task_context_nr(ctxn) | 2642 | for_each_task_context_nr(ctxn) | 
| 2600 | perf_event_context_sched_out(task, ctxn, next); | 2643 | perf_event_context_sched_out(task, ctxn, next); | 
| 2601 | 2644 | ||
| @@ -2755,64 +2798,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 2755 | } | 2798 | } | 
| 2756 | 2799 | ||
| 2757 | /* | 2800 | /* | 
| 2758 | * When sampling the branck stack in system-wide, it may be necessary | ||
| 2759 | * to flush the stack on context switch. This happens when the branch | ||
| 2760 | * stack does not tag its entries with the pid of the current task. | ||
| 2761 | * Otherwise it becomes impossible to associate a branch entry with a | ||
| 2762 | * task. This ambiguity is more likely to appear when the branch stack | ||
| 2763 | * supports priv level filtering and the user sets it to monitor only | ||
| 2764 | * at the user level (which could be a useful measurement in system-wide | ||
| 2765 | * mode). In that case, the risk is high of having a branch stack with | ||
| 2766 | * branch from multiple tasks. Flushing may mean dropping the existing | ||
| 2767 | * entries or stashing them somewhere in the PMU specific code layer. | ||
| 2768 | * | ||
| 2769 | * This function provides the context switch callback to the lower code | ||
| 2770 | * layer. It is invoked ONLY when there is at least one system-wide context | ||
| 2771 | * with at least one active event using taken branch sampling. | ||
| 2772 | */ | ||
| 2773 | static void perf_branch_stack_sched_in(struct task_struct *prev, | ||
| 2774 | struct task_struct *task) | ||
| 2775 | { | ||
| 2776 | struct perf_cpu_context *cpuctx; | ||
| 2777 | struct pmu *pmu; | ||
| 2778 | unsigned long flags; | ||
| 2779 | |||
| 2780 | /* no need to flush branch stack if not changing task */ | ||
| 2781 | if (prev == task) | ||
| 2782 | return; | ||
| 2783 | |||
| 2784 | local_irq_save(flags); | ||
| 2785 | |||
| 2786 | rcu_read_lock(); | ||
| 2787 | |||
| 2788 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
| 2789 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
| 2790 | |||
| 2791 | /* | ||
| 2792 | * check if the context has at least one | ||
| 2793 | * event using PERF_SAMPLE_BRANCH_STACK | ||
| 2794 | */ | ||
| 2795 | if (cpuctx->ctx.nr_branch_stack > 0 | ||
| 2796 | && pmu->flush_branch_stack) { | ||
| 2797 | |||
| 2798 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
| 2799 | |||
| 2800 | perf_pmu_disable(pmu); | ||
| 2801 | |||
| 2802 | pmu->flush_branch_stack(); | ||
| 2803 | |||
| 2804 | perf_pmu_enable(pmu); | ||
| 2805 | |||
| 2806 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
| 2807 | } | ||
| 2808 | } | ||
| 2809 | |||
| 2810 | rcu_read_unlock(); | ||
| 2811 | |||
| 2812 | local_irq_restore(flags); | ||
| 2813 | } | ||
| 2814 | |||
| 2815 | /* | ||
| 2816 | * Called from scheduler to add the events of the current task | 2801 | * Called from scheduler to add the events of the current task | 
| 2817 | * with interrupts disabled. | 2802 | * with interrupts disabled. | 
| 2818 | * | 2803 | * | 
| @@ -2844,9 +2829,8 @@ void __perf_event_task_sched_in(struct task_struct *prev, | |||
| 2844 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | 2829 | if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) | 
| 2845 | perf_cgroup_sched_in(prev, task); | 2830 | perf_cgroup_sched_in(prev, task); | 
| 2846 | 2831 | ||
| 2847 | /* check for system-wide branch_stack events */ | 2832 | if (__this_cpu_read(perf_sched_cb_usages)) | 
| 2848 | if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) | 2833 | perf_pmu_sched_task(prev, task, true); | 
| 2849 | perf_branch_stack_sched_in(prev, task); | ||
| 2850 | } | 2834 | } | 
| 2851 | 2835 | ||
| 2852 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2836 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 
| @@ -3220,7 +3204,10 @@ static void __perf_event_read(void *info) | |||
| 3220 | 3204 | ||
| 3221 | static inline u64 perf_event_count(struct perf_event *event) | 3205 | static inline u64 perf_event_count(struct perf_event *event) | 
| 3222 | { | 3206 | { | 
| 3223 | return local64_read(&event->count) + atomic64_read(&event->child_count); | 3207 | if (event->pmu->count) | 
| 3208 | return event->pmu->count(event); | ||
| 3209 | |||
| 3210 | return __perf_event_count(event); | ||
| 3224 | } | 3211 | } | 
| 3225 | 3212 | ||
| 3226 | static u64 perf_event_read(struct perf_event *event) | 3213 | static u64 perf_event_read(struct perf_event *event) | 
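perf_event_count() now defers to an optional pmu::count() callback, so a PMU whose authoritative value lives outside event->count (for example in a hardware register it has not folded in yet) can report an up-to-date number. A hedged sketch; my_pmu_read_pending() is a hypothetical helper.

/* hypothetical: delta accumulated in hardware since the last update of
 * event->count */
static u64 my_pmu_read_pending(struct perf_event *event);

static u64 my_pmu_count(struct perf_event *event)
{
        return local64_read(&event->count) + my_pmu_read_pending(event);
}

/* wired up with .count = my_pmu_count in the struct pmu definition */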
| @@ -3321,12 +3308,15 @@ errout: | |||
| 3321 | * Returns a matching context with refcount and pincount. | 3308 | * Returns a matching context with refcount and pincount. | 
| 3322 | */ | 3309 | */ | 
| 3323 | static struct perf_event_context * | 3310 | static struct perf_event_context * | 
| 3324 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 3311 | find_get_context(struct pmu *pmu, struct task_struct *task, | 
| 3312 | struct perf_event *event) | ||
| 3325 | { | 3313 | { | 
| 3326 | struct perf_event_context *ctx, *clone_ctx = NULL; | 3314 | struct perf_event_context *ctx, *clone_ctx = NULL; | 
| 3327 | struct perf_cpu_context *cpuctx; | 3315 | struct perf_cpu_context *cpuctx; | 
| 3316 | void *task_ctx_data = NULL; | ||
| 3328 | unsigned long flags; | 3317 | unsigned long flags; | 
| 3329 | int ctxn, err; | 3318 | int ctxn, err; | 
| 3319 | int cpu = event->cpu; | ||
| 3330 | 3320 | ||
| 3331 | if (!task) { | 3321 | if (!task) { | 
| 3332 | /* Must be root to operate on a CPU event: */ | 3322 | /* Must be root to operate on a CPU event: */ | 
| @@ -3354,11 +3344,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
| 3354 | if (ctxn < 0) | 3344 | if (ctxn < 0) | 
| 3355 | goto errout; | 3345 | goto errout; | 
| 3356 | 3346 | ||
| 3347 | if (event->attach_state & PERF_ATTACH_TASK_DATA) { | ||
| 3348 | task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); | ||
| 3349 | if (!task_ctx_data) { | ||
| 3350 | err = -ENOMEM; | ||
| 3351 | goto errout; | ||
| 3352 | } | ||
| 3353 | } | ||
| 3354 | |||
| 3357 | retry: | 3355 | retry: | 
| 3358 | ctx = perf_lock_task_context(task, ctxn, &flags); | 3356 | ctx = perf_lock_task_context(task, ctxn, &flags); | 
| 3359 | if (ctx) { | 3357 | if (ctx) { | 
| 3360 | clone_ctx = unclone_ctx(ctx); | 3358 | clone_ctx = unclone_ctx(ctx); | 
| 3361 | ++ctx->pin_count; | 3359 | ++ctx->pin_count; | 
| 3360 | |||
| 3361 | if (task_ctx_data && !ctx->task_ctx_data) { | ||
| 3362 | ctx->task_ctx_data = task_ctx_data; | ||
| 3363 | task_ctx_data = NULL; | ||
| 3364 | } | ||
| 3362 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 3365 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 
| 3363 | 3366 | ||
| 3364 | if (clone_ctx) | 3367 | if (clone_ctx) | 
| @@ -3369,6 +3372,11 @@ retry: | |||
| 3369 | if (!ctx) | 3372 | if (!ctx) | 
| 3370 | goto errout; | 3373 | goto errout; | 
| 3371 | 3374 | ||
| 3375 | if (task_ctx_data) { | ||
| 3376 | ctx->task_ctx_data = task_ctx_data; | ||
| 3377 | task_ctx_data = NULL; | ||
| 3378 | } | ||
| 3379 | |||
| 3372 | err = 0; | 3380 | err = 0; | 
| 3373 | mutex_lock(&task->perf_event_mutex); | 3381 | mutex_lock(&task->perf_event_mutex); | 
| 3374 | /* | 3382 | /* | 
| @@ -3395,13 +3403,16 @@ retry: | |||
| 3395 | } | 3403 | } | 
| 3396 | } | 3404 | } | 
| 3397 | 3405 | ||
| 3406 | kfree(task_ctx_data); | ||
| 3398 | return ctx; | 3407 | return ctx; | 
| 3399 | 3408 | ||
| 3400 | errout: | 3409 | errout: | 
| 3410 | kfree(task_ctx_data); | ||
| 3401 | return ERR_PTR(err); | 3411 | return ERR_PTR(err); | 
| 3402 | } | 3412 | } | 
| 3403 | 3413 | ||
| 3404 | static void perf_event_free_filter(struct perf_event *event); | 3414 | static void perf_event_free_filter(struct perf_event *event); | 
| 3415 | static void perf_event_free_bpf_prog(struct perf_event *event); | ||
| 3405 | 3416 | ||
| 3406 | static void free_event_rcu(struct rcu_head *head) | 3417 | static void free_event_rcu(struct rcu_head *head) | 
| 3407 | { | 3418 | { | 
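The task_ctx_data plumbing gives a PMU per-task storage managed by the core: the driver opts in with PERF_ATTACH_TASK_DATA and sets pmu::task_ctx_size, the buffer is allocated lazily here in find_get_context(), swapped between contexts in the optimized sched-out path above, and freed from free_ctx(). A hedged sketch of the driver side; struct my_task_state and the callbacks are illustrative.

struct my_task_state {
        u64 saved_ctl;          /* hypothetical per-task hardware state */
};

static int my_pmu_event_init(struct perf_event *event)
{
        /* ... validate event->attr ... */

        /* ask the core to allocate and manage per-task storage */
        event->attach_state |= PERF_ATTACH_TASK_DATA;
        return 0;
}

static void my_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
        struct my_task_state *st = ctx ? ctx->task_ctx_data : NULL;

        if (!st)
                return;
        /* save to or restore from st->saved_ctl depending on sched_in */
}

static struct pmu my_pmu = {
        .task_ctx_size  = sizeof(struct my_task_state),
        .event_init     = my_pmu_event_init,
        .sched_task     = my_pmu_sched_task,
};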
| @@ -3411,10 +3422,10 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 3411 | if (event->ns) | 3422 | if (event->ns) | 
| 3412 | put_pid_ns(event->ns); | 3423 | put_pid_ns(event->ns); | 
| 3413 | perf_event_free_filter(event); | 3424 | perf_event_free_filter(event); | 
| 3425 | perf_event_free_bpf_prog(event); | ||
| 3414 | kfree(event); | 3426 | kfree(event); | 
| 3415 | } | 3427 | } | 
| 3416 | 3428 | ||
| 3417 | static void ring_buffer_put(struct ring_buffer *rb); | ||
| 3418 | static void ring_buffer_attach(struct perf_event *event, | 3429 | static void ring_buffer_attach(struct perf_event *event, | 
| 3419 | struct ring_buffer *rb); | 3430 | struct ring_buffer *rb); | 
| 3420 | 3431 | ||
| @@ -3423,10 +3434,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) | |||
| 3423 | if (event->parent) | 3434 | if (event->parent) | 
| 3424 | return; | 3435 | return; | 
| 3425 | 3436 | ||
| 3426 | if (has_branch_stack(event)) { | ||
| 3427 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 3428 | atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); | ||
| 3429 | } | ||
| 3430 | if (is_cgroup_event(event)) | 3437 | if (is_cgroup_event(event)) | 
| 3431 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | 3438 | atomic_dec(&per_cpu(perf_cgroup_events, cpu)); | 
| 3432 | } | 3439 | } | 
| @@ -3454,6 +3461,91 @@ static void unaccount_event(struct perf_event *event) | |||
| 3454 | unaccount_event_cpu(event, event->cpu); | 3461 | unaccount_event_cpu(event, event->cpu); | 
| 3455 | } | 3462 | } | 
| 3456 | 3463 | ||
| 3464 | /* | ||
| 3465 | * The following implement mutual exclusion of events on "exclusive" pmus | ||
| 3466 | * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled | ||
| 3467 | * at a time, so we disallow creating events that might conflict, namely: | ||
| 3468 | * | ||
| 3469 | * 1) cpu-wide events in the presence of per-task events, | ||
| 3470 | * 2) per-task events in the presence of cpu-wide events, | ||
| 3471 | * 3) two matching events on the same context. | ||
| 3472 | * | ||
| 3473 | * The former two cases are handled in the allocation path (perf_event_alloc(), | ||
| 3474 | * __free_event()), the latter -- before the first perf_install_in_context(). | ||
| 3475 | */ | ||
| 3476 | static int exclusive_event_init(struct perf_event *event) | ||
| 3477 | { | ||
| 3478 | struct pmu *pmu = event->pmu; | ||
| 3479 | |||
| 3480 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
| 3481 | return 0; | ||
| 3482 | |||
| 3483 | /* | ||
| 3484 | * Prevent co-existence of per-task and cpu-wide events on the | ||
| 3485 | * same exclusive pmu. | ||
| 3486 | * | ||
| 3487 | * Negative pmu::exclusive_cnt means there are cpu-wide | ||
| 3488 | * events on this "exclusive" pmu, positive means there are | ||
| 3489 | * per-task events. | ||
| 3490 | * | ||
| 3491 | * Since this is called in perf_event_alloc() path, event::ctx | ||
| 3492 | * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK | ||
| 3493 | * to mean "per-task event", because unlike other attach states it | ||
| 3494 | * never gets cleared. | ||
| 3495 | */ | ||
| 3496 | if (event->attach_state & PERF_ATTACH_TASK) { | ||
| 3497 | if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) | ||
| 3498 | return -EBUSY; | ||
| 3499 | } else { | ||
| 3500 | if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) | ||
| 3501 | return -EBUSY; | ||
| 3502 | } | ||
| 3503 | |||
| 3504 | return 0; | ||
| 3505 | } | ||
| 3506 | |||
| 3507 | static void exclusive_event_destroy(struct perf_event *event) | ||
| 3508 | { | ||
| 3509 | struct pmu *pmu = event->pmu; | ||
| 3510 | |||
| 3511 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
| 3512 | return; | ||
| 3513 | |||
| 3514 | /* see comment in exclusive_event_init() */ | ||
| 3515 | if (event->attach_state & PERF_ATTACH_TASK) | ||
| 3516 | atomic_dec(&pmu->exclusive_cnt); | ||
| 3517 | else | ||
| 3518 | atomic_inc(&pmu->exclusive_cnt); | ||
| 3519 | } | ||
| 3520 | |||
| 3521 | static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) | ||
| 3522 | { | ||
| 3523 | if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && | ||
| 3524 | (e1->cpu == e2->cpu || | ||
| 3525 | e1->cpu == -1 || | ||
| 3526 | e2->cpu == -1)) | ||
| 3527 | return true; | ||
| 3528 | return false; | ||
| 3529 | } | ||
| 3530 | |||
| 3531 | /* Called under the same ctx::mutex as perf_install_in_context() */ | ||
| 3532 | static bool exclusive_event_installable(struct perf_event *event, | ||
| 3533 | struct perf_event_context *ctx) | ||
| 3534 | { | ||
| 3535 | struct perf_event *iter_event; | ||
| 3536 | struct pmu *pmu = event->pmu; | ||
| 3537 | |||
| 3538 | if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) | ||
| 3539 | return true; | ||
| 3540 | |||
| 3541 | list_for_each_entry(iter_event, &ctx->event_list, event_entry) { | ||
| 3542 | if (exclusive_event_match(iter_event, event)) | ||
| 3543 | return false; | ||
| 3544 | } | ||
| 3545 | |||
| 3546 | return true; | ||
| 3547 | } | ||
| 3548 | |||
| 3457 | static void __free_event(struct perf_event *event) | 3549 | static void __free_event(struct perf_event *event) | 
| 3458 | { | 3550 | { | 
| 3459 | if (!event->parent) { | 3551 | if (!event->parent) { | 
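The exclusive-PMU accounting above packs two modes into one signed counter: per-task events drive pmu->exclusive_cnt positive, cpu-wide events drive it negative, and atomic_inc_unless_negative()/atomic_dec_unless_positive() guarantee the two directions never coexist. Reduced to just the counter discipline, the scheme looks like this (names are illustrative):

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t excl_cnt = ATOMIC_INIT(0);

static bool get_per_task_slot(void)     /* fails while cpu-wide users exist */
{
        return atomic_inc_unless_negative(&excl_cnt);
}

static bool get_cpu_wide_slot(void)     /* fails while per-task users exist */
{
        return atomic_dec_unless_positive(&excl_cnt);
}

static void put_per_task_slot(void)
{
        atomic_dec(&excl_cnt);
}

static void put_cpu_wide_slot(void)
{
        atomic_inc(&excl_cnt);
}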
| @@ -3467,8 +3559,10 @@ static void __free_event(struct perf_event *event) | |||
| 3467 | if (event->ctx) | 3559 | if (event->ctx) | 
| 3468 | put_ctx(event->ctx); | 3560 | put_ctx(event->ctx); | 
| 3469 | 3561 | ||
| 3470 | if (event->pmu) | 3562 | if (event->pmu) { | 
| 3563 | exclusive_event_destroy(event); | ||
| 3471 | module_put(event->pmu->module); | 3564 | module_put(event->pmu->module); | 
| 3565 | } | ||
| 3472 | 3566 | ||
| 3473 | call_rcu(&event->rcu_head, free_event_rcu); | 3567 | call_rcu(&event->rcu_head, free_event_rcu); | 
| 3474 | } | 3568 | } | 
| @@ -3591,7 +3685,7 @@ static void put_event(struct perf_event *event) | |||
| 3591 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); | 3685 | ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); | 
| 3592 | WARN_ON_ONCE(ctx->parent_ctx); | 3686 | WARN_ON_ONCE(ctx->parent_ctx); | 
| 3593 | perf_remove_from_context(event, true); | 3687 | perf_remove_from_context(event, true); | 
| 3594 | mutex_unlock(&ctx->mutex); | 3688 | perf_event_ctx_unlock(event, ctx); | 
| 3595 | 3689 | ||
| 3596 | _free_event(event); | 3690 | _free_event(event); | 
| 3597 | } | 3691 | } | 
| @@ -3927,6 +4021,7 @@ static inline int perf_fget_light(int fd, struct fd *p) | |||
| 3927 | static int perf_event_set_output(struct perf_event *event, | 4021 | static int perf_event_set_output(struct perf_event *event, | 
| 3928 | struct perf_event *output_event); | 4022 | struct perf_event *output_event); | 
| 3929 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 4023 | static int perf_event_set_filter(struct perf_event *event, void __user *arg); | 
| 4024 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); | ||
| 3930 | 4025 | ||
| 3931 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 4026 | static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) | 
| 3932 | { | 4027 | { | 
| @@ -3980,6 +4075,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon | |||
| 3980 | case PERF_EVENT_IOC_SET_FILTER: | 4075 | case PERF_EVENT_IOC_SET_FILTER: | 
| 3981 | return perf_event_set_filter(event, (void __user *)arg); | 4076 | return perf_event_set_filter(event, (void __user *)arg); | 
| 3982 | 4077 | ||
| 4078 | case PERF_EVENT_IOC_SET_BPF: | ||
| 4079 | return perf_event_set_bpf_prog(event, arg); | ||
| 4080 | |||
| 3983 | default: | 4081 | default: | 
| 3984 | return -ENOTTY; | 4082 | return -ENOTTY; | 
| 3985 | } | 4083 | } | 
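From userspace, the new ioctl is a one-liner once a BPF program fd is in hand: open a kprobe-backed perf event, load a BPF_PROG_TYPE_KPROBE program via the bpf() syscall, then attach it. A minimal sketch of the attach step only; obtaining perf_fd and bpf_prog_fd is assumed. As the hunk further down shows, the kernel rejects events that are not kprobe tracepoints and programs that are not of the kprobe type.

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* perf_fd: kprobe-based tracepoint event; bpf_prog_fd: BPF_PROG_TYPE_KPROBE
 * program already loaded with bpf(BPF_PROG_LOAD, ...) */
static int attach_bpf_to_kprobe_event(int perf_fd, int bpf_prog_fd)
{
        return ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd);
}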
| @@ -4096,6 +4194,8 @@ static void perf_event_init_userpage(struct perf_event *event) | |||
| 4096 | /* Allow new userspace to detect that bit 0 is deprecated */ | 4194 | /* Allow new userspace to detect that bit 0 is deprecated */ | 
| 4097 | userpg->cap_bit0_is_deprecated = 1; | 4195 | userpg->cap_bit0_is_deprecated = 1; | 
| 4098 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | 4196 | userpg->size = offsetof(struct perf_event_mmap_page, __reserved); | 
| 4197 | userpg->data_offset = PAGE_SIZE; | ||
| 4198 | userpg->data_size = perf_data_size(rb); | ||
| 4099 | 4199 | ||
| 4100 | unlock: | 4200 | unlock: | 
| 4101 | rcu_read_unlock(); | 4201 | rcu_read_unlock(); | 
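With data_offset/data_size published in the control page, a reader no longer has to hard-code "the data area starts one page in". A small userspace sketch (the mapping itself is assumed to exist already):

#include <linux/perf_event.h>
#include <stdint.h>

/* base: start of the perf mmap, i.e. the struct perf_event_mmap_page */
static void *perf_data_area(void *base, uint64_t *size)
{
        struct perf_event_mmap_page *up = base;

        *size = up->data_size;                  /* size of the data area */
        return (char *)base + up->data_offset;  /* currently PAGE_SIZE */
}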
| @@ -4263,7 +4363,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) | |||
| 4263 | rb_free(rb); | 4363 | rb_free(rb); | 
| 4264 | } | 4364 | } | 
| 4265 | 4365 | ||
| 4266 | static struct ring_buffer *ring_buffer_get(struct perf_event *event) | 4366 | struct ring_buffer *ring_buffer_get(struct perf_event *event) | 
| 4267 | { | 4367 | { | 
| 4268 | struct ring_buffer *rb; | 4368 | struct ring_buffer *rb; | 
| 4269 | 4369 | ||
| @@ -4278,7 +4378,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
| 4278 | return rb; | 4378 | return rb; | 
| 4279 | } | 4379 | } | 
| 4280 | 4380 | ||
| 4281 | static void ring_buffer_put(struct ring_buffer *rb) | 4381 | void ring_buffer_put(struct ring_buffer *rb) | 
| 4282 | { | 4382 | { | 
| 4283 | if (!atomic_dec_and_test(&rb->refcount)) | 4383 | if (!atomic_dec_and_test(&rb->refcount)) | 
| 4284 | return; | 4384 | return; | 
| @@ -4295,6 +4395,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
| 4295 | atomic_inc(&event->mmap_count); | 4395 | atomic_inc(&event->mmap_count); | 
| 4296 | atomic_inc(&event->rb->mmap_count); | 4396 | atomic_inc(&event->rb->mmap_count); | 
| 4297 | 4397 | ||
| 4398 | if (vma->vm_pgoff) | ||
| 4399 | atomic_inc(&event->rb->aux_mmap_count); | ||
| 4400 | |||
| 4298 | if (event->pmu->event_mapped) | 4401 | if (event->pmu->event_mapped) | 
| 4299 | event->pmu->event_mapped(event); | 4402 | event->pmu->event_mapped(event); | 
| 4300 | } | 4403 | } | 
| @@ -4319,6 +4422,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 4319 | if (event->pmu->event_unmapped) | 4422 | if (event->pmu->event_unmapped) | 
| 4320 | event->pmu->event_unmapped(event); | 4423 | event->pmu->event_unmapped(event); | 
| 4321 | 4424 | ||
| 4425 | /* | ||
| 4426 | * rb->aux_mmap_count will always drop before rb->mmap_count and | ||
| 4427 | * event->mmap_count, so it is ok to use event->mmap_mutex to | ||
| 4428 | * serialize with perf_mmap here. | ||
| 4429 | */ | ||
| 4430 | if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && | ||
| 4431 | atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { | ||
| 4432 | atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); | ||
| 4433 | vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; | ||
| 4434 | |||
| 4435 | rb_free_aux(rb); | ||
| 4436 | mutex_unlock(&event->mmap_mutex); | ||
| 4437 | } | ||
| 4438 | |||
| 4322 | atomic_dec(&rb->mmap_count); | 4439 | atomic_dec(&rb->mmap_count); | 
| 4323 | 4440 | ||
| 4324 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 4441 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | 
| @@ -4392,7 +4509,7 @@ out_put: | |||
| 4392 | 4509 | ||
| 4393 | static const struct vm_operations_struct perf_mmap_vmops = { | 4510 | static const struct vm_operations_struct perf_mmap_vmops = { | 
| 4394 | .open = perf_mmap_open, | 4511 | .open = perf_mmap_open, | 
| 4395 | .close = perf_mmap_close, | 4512 | .close = perf_mmap_close, /* non mergable */ | 
| 4396 | .fault = perf_mmap_fault, | 4513 | .fault = perf_mmap_fault, | 
| 4397 | .page_mkwrite = perf_mmap_fault, | 4514 | .page_mkwrite = perf_mmap_fault, | 
| 4398 | }; | 4515 | }; | 
| @@ -4403,10 +4520,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 4403 | unsigned long user_locked, user_lock_limit; | 4520 | unsigned long user_locked, user_lock_limit; | 
| 4404 | struct user_struct *user = current_user(); | 4521 | struct user_struct *user = current_user(); | 
| 4405 | unsigned long locked, lock_limit; | 4522 | unsigned long locked, lock_limit; | 
| 4406 | struct ring_buffer *rb; | 4523 | struct ring_buffer *rb = NULL; | 
| 4407 | unsigned long vma_size; | 4524 | unsigned long vma_size; | 
| 4408 | unsigned long nr_pages; | 4525 | unsigned long nr_pages; | 
| 4409 | long user_extra, extra; | 4526 | long user_extra = 0, extra = 0; | 
| 4410 | int ret = 0, flags = 0; | 4527 | int ret = 0, flags = 0; | 
| 4411 | 4528 | ||
| 4412 | /* | 4529 | /* | 
| @@ -4421,7 +4538,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 4421 | return -EINVAL; | 4538 | return -EINVAL; | 
| 4422 | 4539 | ||
| 4423 | vma_size = vma->vm_end - vma->vm_start; | 4540 | vma_size = vma->vm_end - vma->vm_start; | 
| 4424 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 4541 | |
| 4542 | if (vma->vm_pgoff == 0) { | ||
| 4543 | nr_pages = (vma_size / PAGE_SIZE) - 1; | ||
| 4544 | } else { | ||
| 4545 | /* | ||
| 4546 | * AUX area mapping: if rb->aux_nr_pages != 0, it's already | ||
| 4547 | * mapped, all subsequent mappings should have the same size | ||
| 4548 | * and offset. Must be above the normal perf buffer. | ||
| 4549 | */ | ||
| 4550 | u64 aux_offset, aux_size; | ||
| 4551 | |||
| 4552 | if (!event->rb) | ||
| 4553 | return -EINVAL; | ||
| 4554 | |||
| 4555 | nr_pages = vma_size / PAGE_SIZE; | ||
| 4556 | |||
| 4557 | mutex_lock(&event->mmap_mutex); | ||
| 4558 | ret = -EINVAL; | ||
| 4559 | |||
| 4560 | rb = event->rb; | ||
| 4561 | if (!rb) | ||
| 4562 | goto aux_unlock; | ||
| 4563 | |||
| 4564 | aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); | ||
| 4565 | aux_size = ACCESS_ONCE(rb->user_page->aux_size); | ||
| 4566 | |||
| 4567 | if (aux_offset < perf_data_size(rb) + PAGE_SIZE) | ||
| 4568 | goto aux_unlock; | ||
| 4569 | |||
| 4570 | if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) | ||
| 4571 | goto aux_unlock; | ||
| 4572 | |||
| 4573 | /* already mapped with a different offset */ | ||
| 4574 | if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) | ||
| 4575 | goto aux_unlock; | ||
| 4576 | |||
| 4577 | if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) | ||
| 4578 | goto aux_unlock; | ||
| 4579 | |||
| 4580 | /* already mapped with a different size */ | ||
| 4581 | if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) | ||
| 4582 | goto aux_unlock; | ||
| 4583 | |||
| 4584 | if (!is_power_of_2(nr_pages)) | ||
| 4585 | goto aux_unlock; | ||
| 4586 | |||
| 4587 | if (!atomic_inc_not_zero(&rb->mmap_count)) | ||
| 4588 | goto aux_unlock; | ||
| 4589 | |||
| 4590 | if (rb_has_aux(rb)) { | ||
| 4591 | atomic_inc(&rb->aux_mmap_count); | ||
| 4592 | ret = 0; | ||
| 4593 | goto unlock; | ||
| 4594 | } | ||
| 4595 | |||
| 4596 | atomic_set(&rb->aux_mmap_count, 1); | ||
| 4597 | user_extra = nr_pages; | ||
| 4598 | |||
| 4599 | goto accounting; | ||
| 4600 | } | ||
| 4425 | 4601 | ||
| 4426 | /* | 4602 | /* | 
| 4427 | * If we have rb pages ensure they're a power-of-two number, so we | 4603 | * If we have rb pages ensure they're a power-of-two number, so we | 
| @@ -4433,9 +4609,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 4433 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) | 4609 | if (vma_size != PAGE_SIZE * (1 + nr_pages)) | 
| 4434 | return -EINVAL; | 4610 | return -EINVAL; | 
| 4435 | 4611 | ||
| 4436 | if (vma->vm_pgoff != 0) | ||
| 4437 | return -EINVAL; | ||
| 4438 | |||
| 4439 | WARN_ON_ONCE(event->ctx->parent_ctx); | 4612 | WARN_ON_ONCE(event->ctx->parent_ctx); | 
| 4440 | again: | 4613 | again: | 
| 4441 | mutex_lock(&event->mmap_mutex); | 4614 | mutex_lock(&event->mmap_mutex); | 
| @@ -4459,6 +4632,8 @@ again: | |||
| 4459 | } | 4632 | } | 
| 4460 | 4633 | ||
| 4461 | user_extra = nr_pages + 1; | 4634 | user_extra = nr_pages + 1; | 
| 4635 | |||
| 4636 | accounting: | ||
| 4462 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); | 4637 | user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); | 
| 4463 | 4638 | ||
| 4464 | /* | 4639 | /* | 
| @@ -4468,7 +4643,6 @@ again: | |||
| 4468 | 4643 | ||
| 4469 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; | 4644 | user_locked = atomic_long_read(&user->locked_vm) + user_extra; | 
| 4470 | 4645 | ||
| 4471 | extra = 0; | ||
| 4472 | if (user_locked > user_lock_limit) | 4646 | if (user_locked > user_lock_limit) | 
| 4473 | extra = user_locked - user_lock_limit; | 4647 | extra = user_locked - user_lock_limit; | 
| 4474 | 4648 | ||
| @@ -4482,35 +4656,46 @@ again: | |||
| 4482 | goto unlock; | 4656 | goto unlock; | 
| 4483 | } | 4657 | } | 
| 4484 | 4658 | ||
| 4485 | WARN_ON(event->rb); | 4659 | WARN_ON(!rb && event->rb); | 
| 4486 | 4660 | ||
| 4487 | if (vma->vm_flags & VM_WRITE) | 4661 | if (vma->vm_flags & VM_WRITE) | 
| 4488 | flags |= RING_BUFFER_WRITABLE; | 4662 | flags |= RING_BUFFER_WRITABLE; | 
| 4489 | 4663 | ||
| 4490 | rb = rb_alloc(nr_pages, | ||
| 4491 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | ||
| 4492 | event->cpu, flags); | ||
| 4493 | |||
| 4494 | if (!rb) { | 4664 | if (!rb) { | 
| 4495 | ret = -ENOMEM; | 4665 | rb = rb_alloc(nr_pages, | 
| 4496 | goto unlock; | 4666 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | 
| 4497 | } | 4667 | event->cpu, flags); | 
| 4498 | 4668 | ||
| 4499 | atomic_set(&rb->mmap_count, 1); | 4669 | if (!rb) { | 
| 4500 | rb->mmap_locked = extra; | 4670 | ret = -ENOMEM; | 
| 4501 | rb->mmap_user = get_current_user(); | 4671 | goto unlock; | 
| 4672 | } | ||
| 4502 | 4673 | ||
| 4503 | atomic_long_add(user_extra, &user->locked_vm); | 4674 | atomic_set(&rb->mmap_count, 1); | 
| 4504 | vma->vm_mm->pinned_vm += extra; | 4675 | rb->mmap_user = get_current_user(); | 
| 4676 | rb->mmap_locked = extra; | ||
| 4505 | 4677 | ||
| 4506 | ring_buffer_attach(event, rb); | 4678 | ring_buffer_attach(event, rb); | 
| 4507 | 4679 | ||
| 4508 | perf_event_init_userpage(event); | 4680 | perf_event_init_userpage(event); | 
| 4509 | perf_event_update_userpage(event); | 4681 | perf_event_update_userpage(event); | 
| 4682 | } else { | ||
| 4683 | ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, | ||
| 4684 | event->attr.aux_watermark, flags); | ||
| 4685 | if (!ret) | ||
| 4686 | rb->aux_mmap_locked = extra; | ||
| 4687 | } | ||
| 4510 | 4688 | ||
| 4511 | unlock: | 4689 | unlock: | 
| 4512 | if (!ret) | 4690 | if (!ret) { | 
| 4691 | atomic_long_add(user_extra, &user->locked_vm); | ||
| 4692 | vma->vm_mm->pinned_vm += extra; | ||
| 4693 | |||
| 4513 | atomic_inc(&event->mmap_count); | 4694 | atomic_inc(&event->mmap_count); | 
| 4695 | } else if (rb) { | ||
| 4696 | atomic_dec(&rb->mmap_count); | ||
| 4697 | } | ||
| 4698 | aux_unlock: | ||
| 4514 | mutex_unlock(&event->mmap_mutex); | 4699 | mutex_unlock(&event->mmap_mutex); | 
| 4515 | 4700 | ||
| 4516 | /* | 4701 | /* | 
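Taken together, the perf_mmap() changes let userspace establish the AUX area with a second mmap() on the same event fd: map the regular buffer first, publish the desired aux_offset/aux_size through the control page, then map exactly that range (a power-of-two number of pages, placed above the data area, as the checks above require). A hedged sketch:

#include <linux/perf_event.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stddef.h>

/* fd: the perf event; up: its already mmap'ed control page, with the data
 * area in place; aux_pages must be a power of two. */
static void *map_aux_area(int fd, struct perf_event_mmap_page *up,
                          size_t aux_pages)
{
        size_t len = aux_pages * sysconf(_SC_PAGESIZE);

        /* must lie above the control page + data area */
        up->aux_offset = up->data_offset + up->data_size;
        up->aux_size   = len;

        return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fd, up->aux_offset);
}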
| @@ -4574,6 +4759,13 @@ static void perf_pending_event(struct irq_work *entry) | |||
| 4574 | { | 4759 | { | 
| 4575 | struct perf_event *event = container_of(entry, | 4760 | struct perf_event *event = container_of(entry, | 
| 4576 | struct perf_event, pending); | 4761 | struct perf_event, pending); | 
| 4762 | int rctx; | ||
| 4763 | |||
| 4764 | rctx = perf_swevent_get_recursion_context(); | ||
| 4765 | /* | ||
| 4766 | * If we 'fail' here, that's OK, it means recursion is already disabled | ||
| 4767 | * and we won't recurse 'further'. | ||
| 4768 | */ | ||
| 4577 | 4769 | ||
| 4578 | if (event->pending_disable) { | 4770 | if (event->pending_disable) { | 
| 4579 | event->pending_disable = 0; | 4771 | event->pending_disable = 0; | 
| @@ -4584,6 +4776,9 @@ static void perf_pending_event(struct irq_work *entry) | |||
| 4584 | event->pending_wakeup = 0; | 4776 | event->pending_wakeup = 0; | 
| 4585 | perf_event_wakeup(event); | 4777 | perf_event_wakeup(event); | 
| 4586 | } | 4778 | } | 
| 4779 | |||
| 4780 | if (rctx >= 0) | ||
| 4781 | perf_swevent_put_recursion_context(rctx); | ||
| 4587 | } | 4782 | } | 
| 4588 | 4783 | ||
| 4589 | /* | 4784 | /* | 
| @@ -4756,7 +4951,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
| 4756 | } | 4951 | } | 
| 4757 | 4952 | ||
| 4758 | if (sample_type & PERF_SAMPLE_TIME) | 4953 | if (sample_type & PERF_SAMPLE_TIME) | 
| 4759 | data->time = perf_clock(); | 4954 | data->time = perf_event_clock(event); | 
| 4760 | 4955 | ||
| 4761 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) | 4956 | if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) | 
| 4762 | data->id = primary_event_id(event); | 4957 | data->id = primary_event_id(event); | 
| @@ -5334,6 +5529,8 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 5334 | task_event->event_id.tid = perf_event_tid(event, task); | 5529 | task_event->event_id.tid = perf_event_tid(event, task); | 
| 5335 | task_event->event_id.ptid = perf_event_tid(event, current); | 5530 | task_event->event_id.ptid = perf_event_tid(event, current); | 
| 5336 | 5531 | ||
| 5532 | task_event->event_id.time = perf_event_clock(event); | ||
| 5533 | |||
| 5337 | perf_output_put(&handle, task_event->event_id); | 5534 | perf_output_put(&handle, task_event->event_id); | 
| 5338 | 5535 | ||
| 5339 | perf_event__output_id_sample(event, &handle, &sample); | 5536 | perf_event__output_id_sample(event, &handle, &sample); | 
| @@ -5367,7 +5564,7 @@ static void perf_event_task(struct task_struct *task, | |||
| 5367 | /* .ppid */ | 5564 | /* .ppid */ | 
| 5368 | /* .tid */ | 5565 | /* .tid */ | 
| 5369 | /* .ptid */ | 5566 | /* .ptid */ | 
| 5370 | .time = perf_clock(), | 5567 | /* .time */ | 
| 5371 | }, | 5568 | }, | 
| 5372 | }; | 5569 | }; | 
| 5373 | 5570 | ||
| @@ -5722,6 +5919,40 @@ void perf_event_mmap(struct vm_area_struct *vma) | |||
| 5722 | perf_event_mmap_event(&mmap_event); | 5919 | perf_event_mmap_event(&mmap_event); | 
| 5723 | } | 5920 | } | 
| 5724 | 5921 | ||
| 5922 | void perf_event_aux_event(struct perf_event *event, unsigned long head, | ||
| 5923 | unsigned long size, u64 flags) | ||
| 5924 | { | ||
| 5925 | struct perf_output_handle handle; | ||
| 5926 | struct perf_sample_data sample; | ||
| 5927 | struct perf_aux_event { | ||
| 5928 | struct perf_event_header header; | ||
| 5929 | u64 offset; | ||
| 5930 | u64 size; | ||
| 5931 | u64 flags; | ||
| 5932 | } rec = { | ||
| 5933 | .header = { | ||
| 5934 | .type = PERF_RECORD_AUX, | ||
| 5935 | .misc = 0, | ||
| 5936 | .size = sizeof(rec), | ||
| 5937 | }, | ||
| 5938 | .offset = head, | ||
| 5939 | .size = size, | ||
| 5940 | .flags = flags, | ||
| 5941 | }; | ||
| 5942 | int ret; | ||
| 5943 | |||
| 5944 | perf_event_header__init_id(&rec.header, &sample, event); | ||
| 5945 | ret = perf_output_begin(&handle, event, rec.header.size); | ||
| 5946 | |||
| 5947 | if (ret) | ||
| 5948 | return; | ||
| 5949 | |||
| 5950 | perf_output_put(&handle, rec); | ||
| 5951 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 5952 | |||
| 5953 | perf_output_end(&handle); | ||
| 5954 | } | ||
| 5955 | |||
| 5725 | /* | 5956 | /* | 
| 5726 | * IRQ throttle logging | 5957 | * IRQ throttle logging | 
| 5727 | */ | 5958 | */ | 
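perf_event_aux_event() emits the new PERF_RECORD_AUX record so consumers know which part of the AUX area was just filled. The layout matches the on-stack struct above: offset into the AUX buffer, size, and flags, followed by the usual sample_id fields when enabled. A hedged sketch of the reader side (sample_id parsing omitted):

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>

struct aux_record {                      /* mirrors the record emitted above */
        struct perf_event_header header; /* header.type == PERF_RECORD_AUX */
        uint64_t offset;                 /* where in the AUX area the data is */
        uint64_t size;                   /* how much of it is new */
        uint64_t flags;
};

static void handle_aux_record(const struct perf_event_header *hdr)
{
        const struct aux_record *rec = (const void *)hdr;

        if (hdr->type != PERF_RECORD_AUX)
                return;

        printf("aux: %llu bytes at offset %llu, flags %#llx\n",
               (unsigned long long)rec->size,
               (unsigned long long)rec->offset,
               (unsigned long long)rec->flags);
}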
| @@ -5743,7 +5974,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 5743 | .misc = 0, | 5974 | .misc = 0, | 
| 5744 | .size = sizeof(throttle_event), | 5975 | .size = sizeof(throttle_event), | 
| 5745 | }, | 5976 | }, | 
| 5746 | .time = perf_clock(), | 5977 | .time = perf_event_clock(event), | 
| 5747 | .id = primary_event_id(event), | 5978 | .id = primary_event_id(event), | 
| 5748 | .stream_id = event->id, | 5979 | .stream_id = event->id, | 
| 5749 | }; | 5980 | }; | 
| @@ -5763,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 5763 | perf_output_end(&handle); | 5994 | perf_output_end(&handle); | 
| 5764 | } | 5995 | } | 
| 5765 | 5996 | ||
| 5997 | static void perf_log_itrace_start(struct perf_event *event) | ||
| 5998 | { | ||
| 5999 | struct perf_output_handle handle; | ||
| 6000 | struct perf_sample_data sample; | ||
| 6001 | struct perf_aux_event { | ||
| 6002 | struct perf_event_header header; | ||
| 6003 | u32 pid; | ||
| 6004 | u32 tid; | ||
| 6005 | } rec; | ||
| 6006 | int ret; | ||
| 6007 | |||
| 6008 | if (event->parent) | ||
| 6009 | event = event->parent; | ||
| 6010 | |||
| 6011 | if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || | ||
| 6012 | event->hw.itrace_started) | ||
| 6013 | return; | ||
| 6014 | |||
| 6015 | event->hw.itrace_started = 1; | ||
| 6016 | |||
| 6017 | rec.header.type = PERF_RECORD_ITRACE_START; | ||
| 6018 | rec.header.misc = 0; | ||
| 6019 | rec.header.size = sizeof(rec); | ||
| 6020 | rec.pid = perf_event_pid(event, current); | ||
| 6021 | rec.tid = perf_event_tid(event, current); | ||
| 6022 | |||
| 6023 | perf_event_header__init_id(&rec.header, &sample, event); | ||
| 6024 | ret = perf_output_begin(&handle, event, rec.header.size); | ||
| 6025 | |||
| 6026 | if (ret) | ||
| 6027 | return; | ||
| 6028 | |||
| 6029 | perf_output_put(&handle, rec); | ||
| 6030 | perf_event__output_id_sample(event, &handle, &sample); | ||
| 6031 | |||
| 6032 | perf_output_end(&handle); | ||
| 6033 | } | ||
| 6034 | |||
| 5766 | /* | 6035 | /* | 
| 5767 | * Generic event overflow handling, sampling. | 6036 | * Generic event overflow handling, sampling. | 
| 5768 | */ | 6037 | */ | 
| @@ -6123,6 +6392,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) | |||
| 6123 | } | 6392 | } | 
| 6124 | 6393 | ||
| 6125 | hlist_add_head_rcu(&event->hlist_entry, head); | 6394 | hlist_add_head_rcu(&event->hlist_entry, head); | 
| 6395 | perf_event_update_userpage(event); | ||
| 6126 | 6396 | ||
| 6127 | return 0; | 6397 | return 0; | 
| 6128 | } | 6398 | } | 
| @@ -6286,6 +6556,8 @@ static int perf_swevent_init(struct perf_event *event) | |||
| 6286 | static struct pmu perf_swevent = { | 6556 | static struct pmu perf_swevent = { | 
| 6287 | .task_ctx_nr = perf_sw_context, | 6557 | .task_ctx_nr = perf_sw_context, | 
| 6288 | 6558 | ||
| 6559 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
| 6560 | |||
| 6289 | .event_init = perf_swevent_init, | 6561 | .event_init = perf_swevent_init, | 
| 6290 | .add = perf_swevent_add, | 6562 | .add = perf_swevent_add, | 
| 6291 | .del = perf_swevent_del, | 6563 | .del = perf_swevent_del, | 
| @@ -6439,6 +6711,49 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 6439 | ftrace_profile_free_filter(event); | 6711 | ftrace_profile_free_filter(event); | 
| 6440 | } | 6712 | } | 
| 6441 | 6713 | ||
| 6714 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | ||
| 6715 | { | ||
| 6716 | struct bpf_prog *prog; | ||
| 6717 | |||
| 6718 | if (event->attr.type != PERF_TYPE_TRACEPOINT) | ||
| 6719 | return -EINVAL; | ||
| 6720 | |||
| 6721 | if (event->tp_event->prog) | ||
| 6722 | return -EEXIST; | ||
| 6723 | |||
| 6724 | if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) | ||
| 6725 | /* bpf programs can only be attached to kprobes */ | ||
| 6726 | return -EINVAL; | ||
| 6727 | |||
| 6728 | prog = bpf_prog_get(prog_fd); | ||
| 6729 | if (IS_ERR(prog)) | ||
| 6730 | return PTR_ERR(prog); | ||
| 6731 | |||
| 6732 | if (prog->type != BPF_PROG_TYPE_KPROBE) { | ||
| 6733 | /* valid fd, but invalid bpf program type */ | ||
| 6734 | bpf_prog_put(prog); | ||
| 6735 | return -EINVAL; | ||
| 6736 | } | ||
| 6737 | |||
| 6738 | event->tp_event->prog = prog; | ||
| 6739 | |||
| 6740 | return 0; | ||
| 6741 | } | ||
| 6742 | |||
| 6743 | static void perf_event_free_bpf_prog(struct perf_event *event) | ||
| 6744 | { | ||
| 6745 | struct bpf_prog *prog; | ||
| 6746 | |||
| 6747 | if (!event->tp_event) | ||
| 6748 | return; | ||
| 6749 | |||
| 6750 | prog = event->tp_event->prog; | ||
| 6751 | if (prog) { | ||
| 6752 | event->tp_event->prog = NULL; | ||
| 6753 | bpf_prog_put(prog); | ||
| 6754 | } | ||
| 6755 | } | ||
| 6756 | |||
| 6442 | #else | 6757 | #else | 
| 6443 | 6758 | ||
| 6444 | static inline void perf_tp_register(void) | 6759 | static inline void perf_tp_register(void) | 
| @@ -6454,6 +6769,14 @@ static void perf_event_free_filter(struct perf_event *event) | |||
| 6454 | { | 6769 | { | 
| 6455 | } | 6770 | } | 
| 6456 | 6771 | ||
| 6772 | static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) | ||
| 6773 | { | ||
| 6774 | return -ENOENT; | ||
| 6775 | } | ||
| 6776 | |||
| 6777 | static void perf_event_free_bpf_prog(struct perf_event *event) | ||
| 6778 | { | ||
| 6779 | } | ||
| 6457 | #endif /* CONFIG_EVENT_TRACING */ | 6780 | #endif /* CONFIG_EVENT_TRACING */ | 
| 6458 | 6781 | ||
| 6459 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 6782 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | 
| @@ -6592,6 +6915,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) | |||
| 6592 | { | 6915 | { | 
| 6593 | if (flags & PERF_EF_START) | 6916 | if (flags & PERF_EF_START) | 
| 6594 | cpu_clock_event_start(event, flags); | 6917 | cpu_clock_event_start(event, flags); | 
| 6918 | perf_event_update_userpage(event); | ||
| 6595 | 6919 | ||
| 6596 | return 0; | 6920 | return 0; | 
| 6597 | } | 6921 | } | 
| @@ -6628,6 +6952,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
| 6628 | static struct pmu perf_cpu_clock = { | 6952 | static struct pmu perf_cpu_clock = { | 
| 6629 | .task_ctx_nr = perf_sw_context, | 6953 | .task_ctx_nr = perf_sw_context, | 
| 6630 | 6954 | ||
| 6955 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
| 6956 | |||
| 6631 | .event_init = cpu_clock_event_init, | 6957 | .event_init = cpu_clock_event_init, | 
| 6632 | .add = cpu_clock_event_add, | 6958 | .add = cpu_clock_event_add, | 
| 6633 | .del = cpu_clock_event_del, | 6959 | .del = cpu_clock_event_del, | 
| @@ -6666,6 +6992,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) | |||
| 6666 | { | 6992 | { | 
| 6667 | if (flags & PERF_EF_START) | 6993 | if (flags & PERF_EF_START) | 
| 6668 | task_clock_event_start(event, flags); | 6994 | task_clock_event_start(event, flags); | 
| 6995 | perf_event_update_userpage(event); | ||
| 6669 | 6996 | ||
| 6670 | return 0; | 6997 | return 0; | 
| 6671 | } | 6998 | } | 
| @@ -6706,6 +7033,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
| 6706 | static struct pmu perf_task_clock = { | 7033 | static struct pmu perf_task_clock = { | 
| 6707 | .task_ctx_nr = perf_sw_context, | 7034 | .task_ctx_nr = perf_sw_context, | 
| 6708 | 7035 | ||
| 7036 | .capabilities = PERF_PMU_CAP_NO_NMI, | ||
| 7037 | |||
| 6709 | .event_init = task_clock_event_init, | 7038 | .event_init = task_clock_event_init, | 
| 6710 | .add = task_clock_event_add, | 7039 | .add = task_clock_event_add, | 
| 6711 | .del = task_clock_event_del, | 7040 | .del = task_clock_event_del, | 
| @@ -6983,6 +7312,7 @@ got_cpu_context: | |||
| 6983 | pmu->event_idx = perf_event_idx_default; | 7312 | pmu->event_idx = perf_event_idx_default; | 
| 6984 | 7313 | ||
| 6985 | list_add_rcu(&pmu->entry, &pmus); | 7314 | list_add_rcu(&pmu->entry, &pmus); | 
| 7315 | atomic_set(&pmu->exclusive_cnt, 0); | ||
| 6986 | ret = 0; | 7316 | ret = 0; | 
| 6987 | unlock: | 7317 | unlock: | 
| 6988 | mutex_unlock(&pmus_lock); | 7318 | mutex_unlock(&pmus_lock); | 
| @@ -7027,12 +7357,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); | |||
| 7027 | 7357 | ||
| 7028 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 7358 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 
| 7029 | { | 7359 | { | 
| 7360 | struct perf_event_context *ctx = NULL; | ||
| 7030 | int ret; | 7361 | int ret; | 
| 7031 | 7362 | ||
| 7032 | if (!try_module_get(pmu->module)) | 7363 | if (!try_module_get(pmu->module)) | 
| 7033 | return -ENODEV; | 7364 | return -ENODEV; | 
| 7365 | |||
| 7366 | if (event->group_leader != event) { | ||
| 7367 | ctx = perf_event_ctx_lock(event->group_leader); | ||
| 7368 | BUG_ON(!ctx); | ||
| 7369 | } | ||
| 7370 | |||
| 7034 | event->pmu = pmu; | 7371 | event->pmu = pmu; | 
| 7035 | ret = pmu->event_init(event); | 7372 | ret = pmu->event_init(event); | 
| 7373 | |||
| 7374 | if (ctx) | ||
| 7375 | perf_event_ctx_unlock(event->group_leader, ctx); | ||
| 7376 | |||
| 7036 | if (ret) | 7377 | if (ret) | 
| 7037 | module_put(pmu->module); | 7378 | module_put(pmu->module); | 
| 7038 | 7379 | ||
| @@ -7079,10 +7420,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) | |||
| 7079 | if (event->parent) | 7420 | if (event->parent) | 
| 7080 | return; | 7421 | return; | 
| 7081 | 7422 | ||
| 7082 | if (has_branch_stack(event)) { | ||
| 7083 | if (!(event->attach_state & PERF_ATTACH_TASK)) | ||
| 7084 | atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); | ||
| 7085 | } | ||
| 7086 | if (is_cgroup_event(event)) | 7423 | if (is_cgroup_event(event)) | 
| 7087 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | 7424 | atomic_inc(&per_cpu(perf_cgroup_events, cpu)); | 
| 7088 | } | 7425 | } | 
| @@ -7121,7 +7458,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7121 | struct perf_event *group_leader, | 7458 | struct perf_event *group_leader, | 
| 7122 | struct perf_event *parent_event, | 7459 | struct perf_event *parent_event, | 
| 7123 | perf_overflow_handler_t overflow_handler, | 7460 | perf_overflow_handler_t overflow_handler, | 
| 7124 | void *context) | 7461 | void *context, int cgroup_fd) | 
| 7125 | { | 7462 | { | 
| 7126 | struct pmu *pmu; | 7463 | struct pmu *pmu; | 
| 7127 | struct perf_event *event; | 7464 | struct perf_event *event; | 
| @@ -7176,18 +7513,18 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7176 | 7513 | ||
| 7177 | if (task) { | 7514 | if (task) { | 
| 7178 | event->attach_state = PERF_ATTACH_TASK; | 7515 | event->attach_state = PERF_ATTACH_TASK; | 
| 7179 | |||
| 7180 | if (attr->type == PERF_TYPE_TRACEPOINT) | ||
| 7181 | event->hw.tp_target = task; | ||
| 7182 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
| 7183 | /* | 7516 | /* | 
| 7184 | * hw_breakpoint is a bit difficult here.. | 7517 | * XXX pmu::event_init needs to know what task to account to | 
| 7518 | * and we cannot use the ctx information because we need the | ||
| 7519 | * pmu before we get a ctx. | ||
| 7185 | */ | 7520 | */ | 
| 7186 | else if (attr->type == PERF_TYPE_BREAKPOINT) | 7521 | event->hw.target = task; | 
| 7187 | event->hw.bp_target = task; | ||
| 7188 | #endif | ||
| 7189 | } | 7522 | } | 
| 7190 | 7523 | ||
| 7524 | event->clock = &local_clock; | ||
| 7525 | if (parent_event) | ||
| 7526 | event->clock = parent_event->clock; | ||
| 7527 | |||
| 7191 | if (!overflow_handler && parent_event) { | 7528 | if (!overflow_handler && parent_event) { | 
| 7192 | overflow_handler = parent_event->overflow_handler; | 7529 | overflow_handler = parent_event->overflow_handler; | 
| 7193 | context = parent_event->overflow_handler_context; | 7530 | context = parent_event->overflow_handler_context; | 
| @@ -7214,6 +7551,15 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7214 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 7551 | if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) | 
| 7215 | goto err_ns; | 7552 | goto err_ns; | 
| 7216 | 7553 | ||
| 7554 | if (!has_branch_stack(event)) | ||
| 7555 | event->attr.branch_sample_type = 0; | ||
| 7556 | |||
| 7557 | if (cgroup_fd != -1) { | ||
| 7558 | err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); | ||
| 7559 | if (err) | ||
| 7560 | goto err_ns; | ||
| 7561 | } | ||
| 7562 | |||
| 7217 | pmu = perf_init_event(event); | 7563 | pmu = perf_init_event(event); | 
| 7218 | if (!pmu) | 7564 | if (!pmu) | 
| 7219 | goto err_ns; | 7565 | goto err_ns; | 
| @@ -7222,21 +7568,30 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 7222 | goto err_ns; | 7568 | goto err_ns; | 
| 7223 | } | 7569 | } | 
| 7224 | 7570 | ||
| 7571 | err = exclusive_event_init(event); | ||
| 7572 | if (err) | ||
| 7573 | goto err_pmu; | ||
| 7574 | |||
| 7225 | if (!event->parent) { | 7575 | if (!event->parent) { | 
| 7226 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 7576 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { | 
| 7227 | err = get_callchain_buffers(); | 7577 | err = get_callchain_buffers(); | 
| 7228 | if (err) | 7578 | if (err) | 
| 7229 | goto err_pmu; | 7579 | goto err_per_task; | 
| 7230 | } | 7580 | } | 
| 7231 | } | 7581 | } | 
| 7232 | 7582 | ||
| 7233 | return event; | 7583 | return event; | 
| 7234 | 7584 | ||
| 7585 | err_per_task: | ||
| 7586 | exclusive_event_destroy(event); | ||
| 7587 | |||
| 7235 | err_pmu: | 7588 | err_pmu: | 
| 7236 | if (event->destroy) | 7589 | if (event->destroy) | 
| 7237 | event->destroy(event); | 7590 | event->destroy(event); | 
| 7238 | module_put(pmu->module); | 7591 | module_put(pmu->module); | 
| 7239 | err_ns: | 7592 | err_ns: | 
| 7593 | if (is_cgroup_event(event)) | ||
| 7594 | perf_detach_cgroup(event); | ||
| 7240 | if (event->ns) | 7595 | if (event->ns) | 
| 7241 | put_pid_ns(event->ns); | 7596 | put_pid_ns(event->ns); | 
| 7242 | kfree(event); | 7597 | kfree(event); | 
| @@ -7399,6 +7754,19 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | |||
| 7399 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 7754 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 
| 7400 | goto out; | 7755 | goto out; | 
| 7401 | 7756 | ||
| 7757 | /* | ||
| 7758 | * Mixing clocks in the same buffer is trouble you don't need. | ||
| 7759 | */ | ||
| 7760 | if (output_event->clock != event->clock) | ||
| 7761 | goto out; | ||
| 7762 | |||
| 7763 | /* | ||
| 7764 | * If both events generate aux data, they must be on the same PMU | ||
| 7765 | */ | ||
| 7766 | if (has_aux(event) && has_aux(output_event) && | ||
| 7767 | event->pmu != output_event->pmu) | ||
| 7768 | goto out; | ||
| 7769 | |||
| 7402 | set: | 7770 | set: | 
| 7403 | mutex_lock(&event->mmap_mutex); | 7771 | mutex_lock(&event->mmap_mutex); | 
| 7404 | /* Can't redirect output if we've got an active mmap() */ | 7772 | /* Can't redirect output if we've got an active mmap() */ | 
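The two new bail-outs tighten perf_event_set_output(), which backs the PERF_EVENT_IOC_SET_OUTPUT ioctl: events may only share a ring buffer if their clocks match, and AUX-generating events must additionally come from the same PMU. The userspace side is unchanged; for reference, redirecting one event into another's buffer is just:

#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Route ev_fd's records into the buffer already mmap'ed for target_fd.
 * With this patch both events must also use the same clock. */
static int share_ring_buffer(int ev_fd, int target_fd)
{
        return ioctl(ev_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd);
}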
| @@ -7431,6 +7799,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) | |||
| 7431 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | 7799 | mutex_lock_nested(b, SINGLE_DEPTH_NESTING); | 
| 7432 | } | 7800 | } | 
| 7433 | 7801 | ||
| 7802 | static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) | ||
| 7803 | { | ||
| 7804 | bool nmi_safe = false; | ||
| 7805 | |||
| 7806 | switch (clk_id) { | ||
| 7807 | case CLOCK_MONOTONIC: | ||
| 7808 | event->clock = &ktime_get_mono_fast_ns; | ||
| 7809 | nmi_safe = true; | ||
| 7810 | break; | ||
| 7811 | |||
| 7812 | case CLOCK_MONOTONIC_RAW: | ||
| 7813 | event->clock = &ktime_get_raw_fast_ns; | ||
| 7814 | nmi_safe = true; | ||
| 7815 | break; | ||
| 7816 | |||
| 7817 | case CLOCK_REALTIME: | ||
| 7818 | event->clock = &ktime_get_real_ns; | ||
| 7819 | break; | ||
| 7820 | |||
| 7821 | case CLOCK_BOOTTIME: | ||
| 7822 | event->clock = &ktime_get_boot_ns; | ||
| 7823 | break; | ||
| 7824 | |||
| 7825 | case CLOCK_TAI: | ||
| 7826 | event->clock = &ktime_get_tai_ns; | ||
| 7827 | break; | ||
| 7828 | |||
| 7829 | default: | ||
| 7830 | return -EINVAL; | ||
| 7831 | } | ||
| 7832 | |||
| 7833 | if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) | ||
| 7834 | return -EINVAL; | ||
| 7835 | |||
| 7836 | return 0; | ||
| 7837 | } | ||
| 7838 | |||
| 7434 | /** | 7839 | /** | 
| 7435 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 7840 | * sys_perf_event_open - open a performance event, associate it to a task/cpu | 
| 7436 | * | 7841 | * | 
| @@ -7455,6 +7860,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7455 | int move_group = 0; | 7860 | int move_group = 0; | 
| 7456 | int err; | 7861 | int err; | 
| 7457 | int f_flags = O_RDWR; | 7862 | int f_flags = O_RDWR; | 
| 7863 | int cgroup_fd = -1; | ||
| 7458 | 7864 | ||
| 7459 | /* for future expandability... */ | 7865 | /* for future expandability... */ | 
| 7460 | if (flags & ~PERF_FLAG_ALL) | 7866 | if (flags & ~PERF_FLAG_ALL) | 
| @@ -7520,21 +7926,16 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7520 | 7926 | ||
| 7521 | get_online_cpus(); | 7927 | get_online_cpus(); | 
| 7522 | 7928 | ||
| 7929 | if (flags & PERF_FLAG_PID_CGROUP) | ||
| 7930 | cgroup_fd = pid; | ||
| 7931 | |||
| 7523 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 7932 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, | 
| 7524 | NULL, NULL); | 7933 | NULL, NULL, cgroup_fd); | 
| 7525 | if (IS_ERR(event)) { | 7934 | if (IS_ERR(event)) { | 
| 7526 | err = PTR_ERR(event); | 7935 | err = PTR_ERR(event); | 
| 7527 | goto err_cpus; | 7936 | goto err_cpus; | 
| 7528 | } | 7937 | } | 
| 7529 | 7938 | ||
| 7530 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
| 7531 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
| 7532 | if (err) { | ||
| 7533 | __free_event(event); | ||
| 7534 | goto err_cpus; | ||
| 7535 | } | ||
| 7536 | } | ||
| 7537 | |||
| 7538 | if (is_sampling_event(event)) { | 7939 | if (is_sampling_event(event)) { | 
| 7539 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 7940 | if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { | 
| 7540 | err = -ENOTSUPP; | 7941 | err = -ENOTSUPP; | 
| @@ -7550,6 +7951,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7550 | */ | 7951 | */ | 
| 7551 | pmu = event->pmu; | 7952 | pmu = event->pmu; | 
| 7552 | 7953 | ||
| 7954 | if (attr.use_clockid) { | ||
| 7955 | err = perf_event_set_clock(event, attr.clockid); | ||
| 7956 | if (err) | ||
| 7957 | goto err_alloc; | ||
| 7958 | } | ||
| 7959 | |||
| 7553 | if (group_leader && | 7960 | if (group_leader && | 
| 7554 | (is_software_event(event) != is_software_event(group_leader))) { | 7961 | (is_software_event(event) != is_software_event(group_leader))) { | 
| 7555 | if (is_software_event(event)) { | 7962 | if (is_software_event(event)) { | 
| @@ -7576,12 +7983,17 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7576 | /* | 7983 | /* | 
| 7577 | * Get the target context (task or percpu): | 7984 | * Get the target context (task or percpu): | 
| 7578 | */ | 7985 | */ | 
| 7579 | ctx = find_get_context(pmu, task, event->cpu); | 7986 | ctx = find_get_context(pmu, task, event); | 
| 7580 | if (IS_ERR(ctx)) { | 7987 | if (IS_ERR(ctx)) { | 
| 7581 | err = PTR_ERR(ctx); | 7988 | err = PTR_ERR(ctx); | 
| 7582 | goto err_alloc; | 7989 | goto err_alloc; | 
| 7583 | } | 7990 | } | 
| 7584 | 7991 | ||
| 7992 | if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { | ||
| 7993 | err = -EBUSY; | ||
| 7994 | goto err_context; | ||
| 7995 | } | ||
| 7996 | |||
| 7585 | if (task) { | 7997 | if (task) { | 
| 7586 | put_task_struct(task); | 7998 | put_task_struct(task); | 
| 7587 | task = NULL; | 7999 | task = NULL; | 
| @@ -7599,6 +8011,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7599 | */ | 8011 | */ | 
| 7600 | if (group_leader->group_leader != group_leader) | 8012 | if (group_leader->group_leader != group_leader) | 
| 7601 | goto err_context; | 8013 | goto err_context; | 
| 8014 | |||
| 8015 | /* All events in a group should have the same clock */ | ||
| 8016 | if (group_leader->clock != event->clock) | ||
| 8017 | goto err_context; | ||
| 8018 | |||
| 7602 | /* | 8019 | /* | 
| 7603 | * Do not allow to attach to a group in a different | 8020 | * Do not allow to attach to a group in a different | 
| 7604 | * task or CPU context: | 8021 | * task or CPU context: | 
| @@ -7699,6 +8116,13 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 7699 | get_ctx(ctx); | 8116 | get_ctx(ctx); | 
| 7700 | } | 8117 | } | 
| 7701 | 8118 | ||
| 8119 | if (!exclusive_event_installable(event, ctx)) { | ||
| 8120 | err = -EBUSY; | ||
| 8121 | mutex_unlock(&ctx->mutex); | ||
| 8122 | fput(event_file); | ||
| 8123 | goto err_context; | ||
| 8124 | } | ||
| 8125 | |||
| 7702 | perf_install_in_context(ctx, event, event->cpu); | 8126 | perf_install_in_context(ctx, event, event->cpu); | 
| 7703 | perf_unpin_context(ctx); | 8127 | perf_unpin_context(ctx); | 
| 7704 | 8128 | ||
| @@ -7771,7 +8195,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7771 | */ | 8195 | */ | 
| 7772 | 8196 | ||
| 7773 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, | 8197 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, | 
| 7774 | overflow_handler, context); | 8198 | overflow_handler, context, -1); | 
| 7775 | if (IS_ERR(event)) { | 8199 | if (IS_ERR(event)) { | 
| 7776 | err = PTR_ERR(event); | 8200 | err = PTR_ERR(event); | 
| 7777 | goto err; | 8201 | goto err; | 
| @@ -7782,7 +8206,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7782 | 8206 | ||
| 7783 | account_event(event); | 8207 | account_event(event); | 
| 7784 | 8208 | ||
| 7785 | ctx = find_get_context(event->pmu, task, cpu); | 8209 | ctx = find_get_context(event->pmu, task, event); | 
| 7786 | if (IS_ERR(ctx)) { | 8210 | if (IS_ERR(ctx)) { | 
| 7787 | err = PTR_ERR(ctx); | 8211 | err = PTR_ERR(ctx); | 
| 7788 | goto err_free; | 8212 | goto err_free; | 
| @@ -7790,6 +8214,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 7790 | 8214 | ||
| 7791 | WARN_ON_ONCE(ctx->parent_ctx); | 8215 | WARN_ON_ONCE(ctx->parent_ctx); | 
| 7792 | mutex_lock(&ctx->mutex); | 8216 | mutex_lock(&ctx->mutex); | 
| 8217 | if (!exclusive_event_installable(event, ctx)) { | ||
| 8218 | mutex_unlock(&ctx->mutex); | ||
| 8219 | perf_unpin_context(ctx); | ||
| 8220 | put_ctx(ctx); | ||
| 8221 | err = -EBUSY; | ||
| 8222 | goto err_free; | ||
| 8223 | } | ||
| 8224 | |||
| 7793 | perf_install_in_context(ctx, event, cpu); | 8225 | perf_install_in_context(ctx, event, cpu); | 
| 7794 | perf_unpin_context(ctx); | 8226 | perf_unpin_context(ctx); | 
| 7795 | mutex_unlock(&ctx->mutex); | 8227 | mutex_unlock(&ctx->mutex); | 
| @@ -8132,7 +8564,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 8132 | parent_event->cpu, | 8564 | parent_event->cpu, | 
| 8133 | child, | 8565 | child, | 
| 8134 | group_leader, parent_event, | 8566 | group_leader, parent_event, | 
| 8135 | NULL, NULL); | 8567 | NULL, NULL, -1); | 
| 8136 | if (IS_ERR(child_event)) | 8568 | if (IS_ERR(child_event)) | 
| 8137 | return child_event; | 8569 | return child_event; | 
| 8138 | 8570 | ||
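
perf_event_set_clock() above, together with the group-leader clock check and the output-redirect check, is driven by two perf_event_attr fields (use_clockid and clockid) that user space fills in before calling the syscall. A minimal user-space sketch; the event type and sample flags are chosen purely for illustration and are not taken from this patch:

	/* Illustrative only: request CLOCK_MONOTONIC_RAW timestamps on a
	 * CPU-cycles event for the current task. */
	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	static int open_raw_clock_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_type = PERF_SAMPLE_TIME;
		attr.use_clockid = 1;
		attr.clockid = CLOCK_MONOTONIC_RAW;

		/* pid = 0 (this task), cpu = -1 (any CPU), no group, no flags */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}

Clocks that are not NMI-safe (CLOCK_REALTIME, CLOCK_BOOTTIME, CLOCK_TAI) are only accepted when the PMU advertises PERF_PMU_CAP_NO_NMI; otherwise the open fails with EINVAL, per the check at the end of perf_event_set_clock().
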
| diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) | |||
| 116 | */ | 116 | */ | 
| 117 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | 117 | static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) | 
| 118 | { | 118 | { | 
| 119 | struct task_struct *tsk = bp->hw.bp_target; | 119 | struct task_struct *tsk = bp->hw.target; | 
| 120 | struct perf_event *iter; | 120 | struct perf_event *iter; | 
| 121 | int count = 0; | 121 | int count = 0; | 
| 122 | 122 | ||
| 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 123 | list_for_each_entry(iter, &bp_task_head, hw.bp_list) { | 
| 124 | if (iter->hw.bp_target == tsk && | 124 | if (iter->hw.target == tsk && | 
| 125 | find_slot_idx(iter) == type && | 125 | find_slot_idx(iter) == type && | 
| 126 | (iter->cpu < 0 || cpu == iter->cpu)) | 126 | (iter->cpu < 0 || cpu == iter->cpu)) | 
| 127 | count += hw_breakpoint_weight(iter); | 127 | count += hw_breakpoint_weight(iter); | 
| @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, | |||
| 153 | int nr; | 153 | int nr; | 
| 154 | 154 | ||
| 155 | nr = info->cpu_pinned; | 155 | nr = info->cpu_pinned; | 
| 156 | if (!bp->hw.bp_target) | 156 | if (!bp->hw.target) | 
| 157 | nr += max_task_bp_pinned(cpu, type); | 157 | nr += max_task_bp_pinned(cpu, type); | 
| 158 | else | 158 | else | 
| 159 | nr += task_bp_pinned(cpu, bp, type); | 159 | nr += task_bp_pinned(cpu, bp, type); | 
| @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, | |||
| 210 | weight = -weight; | 210 | weight = -weight; | 
| 211 | 211 | ||
| 212 | /* Pinned counter cpu profiling */ | 212 | /* Pinned counter cpu profiling */ | 
| 213 | if (!bp->hw.bp_target) { | 213 | if (!bp->hw.target) { | 
| 214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; | 214 | get_bp_info(bp->cpu, type)->cpu_pinned += weight; | 
| 215 | return; | 215 | return; | 
| 216 | } | 216 | } | 
| diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
| @@ -27,6 +27,7 @@ struct ring_buffer { | |||
| 27 | local_t lost; /* nr records lost */ | 27 | local_t lost; /* nr records lost */ | 
| 28 | 28 | ||
| 29 | long watermark; /* wakeup watermark */ | 29 | long watermark; /* wakeup watermark */ | 
| 30 | long aux_watermark; | ||
| 30 | /* poll crap */ | 31 | /* poll crap */ | 
| 31 | spinlock_t event_lock; | 32 | spinlock_t event_lock; | 
| 32 | struct list_head event_list; | 33 | struct list_head event_list; | 
| @@ -35,6 +36,20 @@ struct ring_buffer { | |||
| 35 | unsigned long mmap_locked; | 36 | unsigned long mmap_locked; | 
| 36 | struct user_struct *mmap_user; | 37 | struct user_struct *mmap_user; | 
| 37 | 38 | ||
| 39 | /* AUX area */ | ||
| 40 | local_t aux_head; | ||
| 41 | local_t aux_nest; | ||
| 42 | local_t aux_wakeup; | ||
| 43 | unsigned long aux_pgoff; | ||
| 44 | int aux_nr_pages; | ||
| 45 | int aux_overwrite; | ||
| 46 | atomic_t aux_mmap_count; | ||
| 47 | unsigned long aux_mmap_locked; | ||
| 48 | void (*free_aux)(void *); | ||
| 49 | atomic_t aux_refcount; | ||
| 50 | void **aux_pages; | ||
| 51 | void *aux_priv; | ||
| 52 | |||
| 38 | struct perf_event_mmap_page *user_page; | 53 | struct perf_event_mmap_page *user_page; | 
| 39 | void *data_pages[0]; | 54 | void *data_pages[0]; | 
| 40 | }; | 55 | }; | 
| @@ -43,6 +58,19 @@ extern void rb_free(struct ring_buffer *rb); | |||
| 43 | extern struct ring_buffer * | 58 | extern struct ring_buffer * | 
| 44 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | 59 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | 
| 45 | extern void perf_event_wakeup(struct perf_event *event); | 60 | extern void perf_event_wakeup(struct perf_event *event); | 
| 61 | extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | ||
| 62 | pgoff_t pgoff, int nr_pages, long watermark, int flags); | ||
| 63 | extern void rb_free_aux(struct ring_buffer *rb); | ||
| 64 | extern struct ring_buffer *ring_buffer_get(struct perf_event *event); | ||
| 65 | extern void ring_buffer_put(struct ring_buffer *rb); | ||
| 66 | |||
| 67 | static inline bool rb_has_aux(struct ring_buffer *rb) | ||
| 68 | { | ||
| 69 | return !!rb->aux_nr_pages; | ||
| 70 | } | ||
| 71 | |||
| 72 | void perf_event_aux_event(struct perf_event *event, unsigned long head, | ||
| 73 | unsigned long size, u64 flags); | ||
| 46 | 74 | ||
| 47 | extern void | 75 | extern void | 
| 48 | perf_event_header__init_id(struct perf_event_header *header, | 76 | perf_event_header__init_id(struct perf_event_header *header, | 
| @@ -81,6 +109,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
| 81 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 109 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 
| 82 | } | 110 | } | 
| 83 | 111 | ||
| 112 | static inline unsigned long perf_aux_size(struct ring_buffer *rb) | ||
| 113 | { | ||
| 114 | return rb->aux_nr_pages << PAGE_SHIFT; | ||
| 115 | } | ||
| 116 | |||
| 84 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 117 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ | 
| 85 | static inline unsigned long \ | 118 | static inline unsigned long \ | 
| 86 | func_name(struct perf_output_handle *handle, \ | 119 | func_name(struct perf_output_handle *handle, \ | 
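
The aux_pgoff/aux_nr_pages bookkeeping added to struct ring_buffer backs a second mmap() region. User space describes that region through the aux_offset/aux_size fields this series adds to the perf_event_mmap_page; the kernel-side handshake lives in perf_mmap() and is not part of this hunk. A rough user-space sketch, with page counts picked arbitrarily:

	/* Sketch: map the normal data buffer first, then describe and map
	 * the AUX area through the user page. */
	#include <linux/perf_event.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void *map_aux_area(int fd, int nr_data_pages, int nr_aux_pages)
	{
		size_t page_size = sysconf(_SC_PAGESIZE);
		size_t data_len = (nr_data_pages + 1) * page_size; /* +1: user page */
		struct perf_event_mmap_page *pc;

		pc = mmap(NULL, data_len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (pc == MAP_FAILED)
			return NULL;

		/* Tell the kernel where the AUX area starts and how big it is. */
		pc->aux_offset = data_len;
		pc->aux_size = (size_t)nr_aux_pages * page_size;

		/* Mapping this range PROT_READ-only would request the overwrite
		 * ("snapshot") mode that rb_alloc_aux() derives from the absence
		 * of RING_BUFFER_WRITABLE. */
		return mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
			    fd, pc->aux_offset);
	}
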
| diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -243,14 +243,317 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | |||
| 243 | spin_lock_init(&rb->event_lock); | 243 | spin_lock_init(&rb->event_lock); | 
| 244 | } | 244 | } | 
| 245 | 245 | ||
| 246 | /* | ||
| 247 | * This is called before hardware starts writing to the AUX area to | ||
| 248 | * obtain an output handle and make sure there's room in the buffer. | ||
| 249 | * When the capture completes, call perf_aux_output_end() to commit | ||
| 250 | * the recorded data to the buffer. | ||
| 251 | * | ||
| 252 | * The ordering is similar to that of perf_output_{begin,end}, with | ||
| 253 | * the exception of (B), which should be taken care of by the pmu | ||
| 254 | * driver, since ordering rules will differ depending on hardware. | ||
| 255 | */ | ||
| 256 | void *perf_aux_output_begin(struct perf_output_handle *handle, | ||
| 257 | struct perf_event *event) | ||
| 258 | { | ||
| 259 | struct perf_event *output_event = event; | ||
| 260 | unsigned long aux_head, aux_tail; | ||
| 261 | struct ring_buffer *rb; | ||
| 262 | |||
| 263 | if (output_event->parent) | ||
| 264 | output_event = output_event->parent; | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Since this will typically be open across pmu::add/pmu::del, we | ||
| 268 | * grab ring_buffer's refcount instead of holding rcu read lock | ||
| 269 | * to make sure it doesn't disappear under us. | ||
| 270 | */ | ||
| 271 | rb = ring_buffer_get(output_event); | ||
| 272 | if (!rb) | ||
| 273 | return NULL; | ||
| 274 | |||
| 275 | if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) | ||
| 276 | goto err; | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Nesting is not supported for the AUX area; make sure nested | ||
| 280 | * writers are caught early | ||
| 281 | */ | ||
| 282 | if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) | ||
| 283 | goto err_put; | ||
| 284 | |||
| 285 | aux_head = local_read(&rb->aux_head); | ||
| 286 | |||
| 287 | handle->rb = rb; | ||
| 288 | handle->event = event; | ||
| 289 | handle->head = aux_head; | ||
| 290 | handle->size = 0; | ||
| 291 | |||
| 292 | /* | ||
| 293 | * In overwrite mode, AUX data stores do not depend on aux_tail, | ||
| 294 | * therefore (A) control dependency barrier does not exist. The | ||
| 295 | * (B) <-> (C) ordering is still observed by the pmu driver. | ||
| 296 | */ | ||
| 297 | if (!rb->aux_overwrite) { | ||
| 298 | aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); | ||
| 299 | handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; | ||
| 300 | if (aux_head - aux_tail < perf_aux_size(rb)) | ||
| 301 | handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); | ||
| 302 | |||
| 303 | /* | ||
| 304 | * handle->size computation depends on aux_tail load; this forms a | ||
| 305 | * control dependency barrier separating aux_tail load from aux data | ||
| 306 | * store that will be enabled on successful return | ||
| 307 | */ | ||
| 308 | if (!handle->size) { /* A, matches D */ | ||
| 309 | event->pending_disable = 1; | ||
| 310 | perf_output_wakeup(handle); | ||
| 311 | local_set(&rb->aux_nest, 0); | ||
| 312 | goto err_put; | ||
| 313 | } | ||
| 314 | } | ||
| 315 | |||
| 316 | return handle->rb->aux_priv; | ||
| 317 | |||
| 318 | err_put: | ||
| 319 | rb_free_aux(rb); | ||
| 320 | |||
| 321 | err: | ||
| 322 | ring_buffer_put(rb); | ||
| 323 | handle->event = NULL; | ||
| 324 | |||
| 325 | return NULL; | ||
| 326 | } | ||
| 327 | |||
| 328 | /* | ||
| 329 | * Commit the data written by hardware into the ring buffer by adjusting | ||
| 330 | * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the | ||
| 331 | * pmu driver's responsibility to observe ordering rules of the hardware, | ||
| 332 | * so that all the data is externally visible before this is called. | ||
| 333 | */ | ||
| 334 | void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, | ||
| 335 | bool truncated) | ||
| 336 | { | ||
| 337 | struct ring_buffer *rb = handle->rb; | ||
| 338 | unsigned long aux_head; | ||
| 339 | u64 flags = 0; | ||
| 340 | |||
| 341 | if (truncated) | ||
| 342 | flags |= PERF_AUX_FLAG_TRUNCATED; | ||
| 343 | |||
| 344 | /* in overwrite mode, driver provides aux_head via handle */ | ||
| 345 | if (rb->aux_overwrite) { | ||
| 346 | flags |= PERF_AUX_FLAG_OVERWRITE; | ||
| 347 | |||
| 348 | aux_head = handle->head; | ||
| 349 | local_set(&rb->aux_head, aux_head); | ||
| 350 | } else { | ||
| 351 | aux_head = local_read(&rb->aux_head); | ||
| 352 | local_add(size, &rb->aux_head); | ||
| 353 | } | ||
| 354 | |||
| 355 | if (size || flags) { | ||
| 356 | /* | ||
| 357 | * Only send RECORD_AUX if we have something useful to communicate | ||
| 358 | */ | ||
| 359 | |||
| 360 | perf_event_aux_event(handle->event, aux_head, size, flags); | ||
| 361 | } | ||
| 362 | |||
| 363 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | ||
| 364 | |||
| 365 | if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { | ||
| 366 | perf_output_wakeup(handle); | ||
| 367 | local_add(rb->aux_watermark, &rb->aux_wakeup); | ||
| 368 | } | ||
| 369 | handle->event = NULL; | ||
| 370 | |||
| 371 | local_set(&rb->aux_nest, 0); | ||
| 372 | rb_free_aux(rb); | ||
| 373 | ring_buffer_put(rb); | ||
| 374 | } | ||
| 375 | |||
| 376 | /* | ||
| 377 | * Skip over a given number of bytes in the AUX buffer, due to, for example, | ||
| 378 | * hardware's alignment constraints. | ||
| 379 | */ | ||
| 380 | int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) | ||
| 381 | { | ||
| 382 | struct ring_buffer *rb = handle->rb; | ||
| 383 | unsigned long aux_head; | ||
| 384 | |||
| 385 | if (size > handle->size) | ||
| 386 | return -ENOSPC; | ||
| 387 | |||
| 388 | local_add(size, &rb->aux_head); | ||
| 389 | |||
| 390 | aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); | ||
| 391 | if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { | ||
| 392 | perf_output_wakeup(handle); | ||
| 393 | local_add(rb->aux_watermark, &rb->aux_wakeup); | ||
| 394 | handle->wakeup = local_read(&rb->aux_wakeup) + | ||
| 395 | rb->aux_watermark; | ||
| 396 | } | ||
| 397 | |||
| 398 | handle->head = aux_head; | ||
| 399 | handle->size -= size; | ||
| 400 | |||
| 401 | return 0; | ||
| 402 | } | ||
| 403 | |||
| 404 | void *perf_get_aux(struct perf_output_handle *handle) | ||
| 405 | { | ||
| 406 | /* this is only valid between perf_aux_output_begin and *_end */ | ||
| 407 | if (!handle->event) | ||
| 408 | return NULL; | ||
| 409 | |||
| 410 | return handle->rb->aux_priv; | ||
| 411 | } | ||
| 412 | |||
| 413 | #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) | ||
| 414 | |||
| 415 | static struct page *rb_alloc_aux_page(int node, int order) | ||
| 416 | { | ||
| 417 | struct page *page; | ||
| 418 | |||
| 419 | if (order > MAX_ORDER) | ||
| 420 | order = MAX_ORDER; | ||
| 421 | |||
| 422 | do { | ||
| 423 | page = alloc_pages_node(node, PERF_AUX_GFP, order); | ||
| 424 | } while (!page && order--); | ||
| 425 | |||
| 426 | if (page && order) { | ||
| 427 | /* | ||
| 428 | * Communicate the allocation size to the driver | ||
| 429 | */ | ||
| 430 | split_page(page, order); | ||
| 431 | SetPagePrivate(page); | ||
| 432 | set_page_private(page, order); | ||
| 433 | } | ||
| 434 | |||
| 435 | return page; | ||
| 436 | } | ||
| 437 | |||
| 438 | static void rb_free_aux_page(struct ring_buffer *rb, int idx) | ||
| 439 | { | ||
| 440 | struct page *page = virt_to_page(rb->aux_pages[idx]); | ||
| 441 | |||
| 442 | ClearPagePrivate(page); | ||
| 443 | page->mapping = NULL; | ||
| 444 | __free_page(page); | ||
| 445 | } | ||
| 446 | |||
| 447 | int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, | ||
| 448 | pgoff_t pgoff, int nr_pages, long watermark, int flags) | ||
| 449 | { | ||
| 450 | bool overwrite = !(flags & RING_BUFFER_WRITABLE); | ||
| 451 | int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); | ||
| 452 | int ret = -ENOMEM, max_order = 0; | ||
| 453 | |||
| 454 | if (!has_aux(event)) | ||
| 455 | return -ENOTSUPP; | ||
| 456 | |||
| 457 | if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { | ||
| 458 | /* | ||
| 459 | * We need to start with the max_order that fits in nr_pages, | ||
| 460 | * not the other way around, hence ilog2() and not get_order. | ||
| 461 | */ | ||
| 462 | max_order = ilog2(nr_pages); | ||
| 463 | |||
| 464 | /* | ||
| 465 | * PMU requests more than one contiguous chunk of memory | ||
| 466 | * for SW double buffering | ||
| 467 | */ | ||
| 468 | if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && | ||
| 469 | !overwrite) { | ||
| 470 | if (!max_order) | ||
| 471 | return -EINVAL; | ||
| 472 | |||
| 473 | max_order--; | ||
| 474 | } | ||
| 475 | } | ||
| 476 | |||
| 477 | rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); | ||
| 478 | if (!rb->aux_pages) | ||
| 479 | return -ENOMEM; | ||
| 480 | |||
| 481 | rb->free_aux = event->pmu->free_aux; | ||
| 482 | for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { | ||
| 483 | struct page *page; | ||
| 484 | int last, order; | ||
| 485 | |||
| 486 | order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); | ||
| 487 | page = rb_alloc_aux_page(node, order); | ||
| 488 | if (!page) | ||
| 489 | goto out; | ||
| 490 | |||
| 491 | for (last = rb->aux_nr_pages + (1 << page_private(page)); | ||
| 492 | last > rb->aux_nr_pages; rb->aux_nr_pages++) | ||
| 493 | rb->aux_pages[rb->aux_nr_pages] = page_address(page++); | ||
| 494 | } | ||
| 495 | |||
| 496 | rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, | ||
| 497 | overwrite); | ||
| 498 | if (!rb->aux_priv) | ||
| 499 | goto out; | ||
| 500 | |||
| 501 | ret = 0; | ||
| 502 | |||
| 503 | /* | ||
| 504 | * aux_pages (and pmu driver's private data, aux_priv) will be | ||
| 505 | * referenced in both producer's and consumer's contexts, thus | ||
| 506 | * we keep a refcount here to make sure either of the two can | ||
| 507 | * reference them safely. | ||
| 508 | */ | ||
| 509 | atomic_set(&rb->aux_refcount, 1); | ||
| 510 | |||
| 511 | rb->aux_overwrite = overwrite; | ||
| 512 | rb->aux_watermark = watermark; | ||
| 513 | |||
| 514 | if (!rb->aux_watermark && !rb->aux_overwrite) | ||
| 515 | rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); | ||
| 516 | |||
| 517 | out: | ||
| 518 | if (!ret) | ||
| 519 | rb->aux_pgoff = pgoff; | ||
| 520 | else | ||
| 521 | rb_free_aux(rb); | ||
| 522 | |||
| 523 | return ret; | ||
| 524 | } | ||
| 525 | |||
| 526 | static void __rb_free_aux(struct ring_buffer *rb) | ||
| 527 | { | ||
| 528 | int pg; | ||
| 529 | |||
| 530 | if (rb->aux_priv) { | ||
| 531 | rb->free_aux(rb->aux_priv); | ||
| 532 | rb->free_aux = NULL; | ||
| 533 | rb->aux_priv = NULL; | ||
| 534 | } | ||
| 535 | |||
| 536 | for (pg = 0; pg < rb->aux_nr_pages; pg++) | ||
| 537 | rb_free_aux_page(rb, pg); | ||
| 538 | |||
| 539 | kfree(rb->aux_pages); | ||
| 540 | rb->aux_nr_pages = 0; | ||
| 541 | } | ||
| 542 | |||
| 543 | void rb_free_aux(struct ring_buffer *rb) | ||
| 544 | { | ||
| 545 | if (atomic_dec_and_test(&rb->aux_refcount)) | ||
| 546 | __rb_free_aux(rb); | ||
| 547 | } | ||
| 548 | |||
| 246 | #ifndef CONFIG_PERF_USE_VMALLOC | 549 | #ifndef CONFIG_PERF_USE_VMALLOC | 
| 247 | 550 | ||
| 248 | /* | 551 | /* | 
| 249 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | 552 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | 
| 250 | */ | 553 | */ | 
| 251 | 554 | ||
| 252 | struct page * | 555 | static struct page * | 
| 253 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 556 | __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 
| 254 | { | 557 | { | 
| 255 | if (pgoff > rb->nr_pages) | 558 | if (pgoff > rb->nr_pages) | 
| 256 | return NULL; | 559 | return NULL; | 
| @@ -340,8 +643,8 @@ static int data_page_nr(struct ring_buffer *rb) | |||
| 340 | return rb->nr_pages << page_order(rb); | 643 | return rb->nr_pages << page_order(rb); | 
| 341 | } | 644 | } | 
| 342 | 645 | ||
| 343 | struct page * | 646 | static struct page * | 
| 344 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 647 | __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | 
| 345 | { | 648 | { | 
| 346 | /* The '>' counts in the user page. */ | 649 | /* The '>' counts in the user page. */ | 
| 347 | if (pgoff > data_page_nr(rb)) | 650 | if (pgoff > data_page_nr(rb)) | 
| @@ -416,3 +719,19 @@ fail: | |||
| 416 | } | 719 | } | 
| 417 | 720 | ||
| 418 | #endif | 721 | #endif | 
| 722 | |||
| 723 | struct page * | ||
| 724 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
| 725 | { | ||
| 726 | if (rb->aux_nr_pages) { | ||
| 727 | /* above AUX space */ | ||
| 728 | if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) | ||
| 729 | return NULL; | ||
| 730 | |||
| 731 | /* AUX space */ | ||
| 732 | if (pgoff >= rb->aux_pgoff) | ||
| 733 | return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); | ||
| 734 | } | ||
| 735 | |||
| 736 | return __perf_mmap_to_page(rb, pgoff); | ||
| 737 | } | ||
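
perf_aux_output_begin() and perf_aux_output_end() are the only points where a PMU driver touches the AUX machinery above. A hedged sketch of how an imaginary driver might bracket a capture; struct my_trace_buf and my_hw_drain() are assumptions for illustration, not kernel APIs:

	#include <linux/perf_event.h>

	struct my_trace_buf;					/* invented driver type */
	unsigned long my_hw_drain(struct my_trace_buf *tbuf,
				  unsigned long room);		/* invented helper */

	static void my_pmu_flush_aux(struct perf_event *event)
	{
		struct perf_output_handle handle;
		struct my_trace_buf *tbuf;
		unsigned long bytes;

		/* Claims the AUX buffer, takes rb->aux_refcount and returns the
		 * aux_priv cookie that pmu->setup_aux() produced. */
		tbuf = perf_aux_output_begin(&handle, event);
		if (!tbuf)
			return;	/* no AUX mapping, nested writer, or no space left */

		/* In non-overwrite mode at most handle.size bytes may be produced. */
		bytes = my_hw_drain(tbuf, handle.size);

		/* Advances aux_head, emits PERF_RECORD_AUX, drops the references. */
		perf_aux_output_end(&handle, bytes, false);
	}

Passing true as the final argument would set PERF_AUX_FLAG_TRUNCATED on the emitted PERF_RECORD_AUX, which is how a driver reports that the hardware ran out of room.
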
| diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 83d4382f5699..6873bb3e6b7e 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c | |||
| @@ -20,145 +20,10 @@ | |||
| 20 | #include <linux/types.h> | 20 | #include <linux/types.h> | 
| 21 | #include <linux/fs_struct.h> | 21 | #include <linux/fs_struct.h> | 
| 22 | 22 | ||
| 23 | |||
| 24 | static void default_handler(int, struct pt_regs *); | ||
| 25 | |||
| 26 | static struct exec_domain *exec_domains = &default_exec_domain; | ||
| 27 | static DEFINE_RWLOCK(exec_domains_lock); | ||
| 28 | |||
| 29 | |||
| 30 | static unsigned long ident_map[32] = { | ||
| 31 | 0, 1, 2, 3, 4, 5, 6, 7, | ||
| 32 | 8, 9, 10, 11, 12, 13, 14, 15, | ||
| 33 | 16, 17, 18, 19, 20, 21, 22, 23, | ||
| 34 | 24, 25, 26, 27, 28, 29, 30, 31 | ||
| 35 | }; | ||
| 36 | |||
| 37 | struct exec_domain default_exec_domain = { | ||
| 38 | .name = "Linux", /* name */ | ||
| 39 | .handler = default_handler, /* lcall7 causes a seg fault. */ | ||
| 40 | .pers_low = 0, /* PER_LINUX personality. */ | ||
| 41 | .pers_high = 0, /* PER_LINUX personality. */ | ||
| 42 | .signal_map = ident_map, /* Identity map signals. */ | ||
| 43 | .signal_invmap = ident_map, /* - both ways. */ | ||
| 44 | }; | ||
| 45 | |||
| 46 | |||
| 47 | static void | ||
| 48 | default_handler(int segment, struct pt_regs *regp) | ||
| 49 | { | ||
| 50 | set_personality(0); | ||
| 51 | |||
| 52 | if (current_thread_info()->exec_domain->handler != default_handler) | ||
| 53 | current_thread_info()->exec_domain->handler(segment, regp); | ||
| 54 | else | ||
| 55 | send_sig(SIGSEGV, current, 1); | ||
| 56 | } | ||
| 57 | |||
| 58 | static struct exec_domain * | ||
| 59 | lookup_exec_domain(unsigned int personality) | ||
| 60 | { | ||
| 61 | unsigned int pers = personality(personality); | ||
| 62 | struct exec_domain *ep; | ||
| 63 | |||
| 64 | read_lock(&exec_domains_lock); | ||
| 65 | for (ep = exec_domains; ep; ep = ep->next) { | ||
| 66 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
| 67 | if (try_module_get(ep->module)) | ||
| 68 | goto out; | ||
| 69 | } | ||
| 70 | |||
| 71 | #ifdef CONFIG_MODULES | ||
| 72 | read_unlock(&exec_domains_lock); | ||
| 73 | request_module("personality-%d", pers); | ||
| 74 | read_lock(&exec_domains_lock); | ||
| 75 | |||
| 76 | for (ep = exec_domains; ep; ep = ep->next) { | ||
| 77 | if (pers >= ep->pers_low && pers <= ep->pers_high) | ||
| 78 | if (try_module_get(ep->module)) | ||
| 79 | goto out; | ||
| 80 | } | ||
| 81 | #endif | ||
| 82 | |||
| 83 | ep = &default_exec_domain; | ||
| 84 | out: | ||
| 85 | read_unlock(&exec_domains_lock); | ||
| 86 | return ep; | ||
| 87 | } | ||
| 88 | |||
| 89 | int | ||
| 90 | register_exec_domain(struct exec_domain *ep) | ||
| 91 | { | ||
| 92 | struct exec_domain *tmp; | ||
| 93 | int err = -EBUSY; | ||
| 94 | |||
| 95 | if (ep == NULL) | ||
| 96 | return -EINVAL; | ||
| 97 | |||
| 98 | if (ep->next != NULL) | ||
| 99 | return -EBUSY; | ||
| 100 | |||
| 101 | write_lock(&exec_domains_lock); | ||
| 102 | for (tmp = exec_domains; tmp; tmp = tmp->next) { | ||
| 103 | if (tmp == ep) | ||
| 104 | goto out; | ||
| 105 | } | ||
| 106 | |||
| 107 | ep->next = exec_domains; | ||
| 108 | exec_domains = ep; | ||
| 109 | err = 0; | ||
| 110 | |||
| 111 | out: | ||
| 112 | write_unlock(&exec_domains_lock); | ||
| 113 | return err; | ||
| 114 | } | ||
| 115 | EXPORT_SYMBOL(register_exec_domain); | ||
| 116 | |||
| 117 | int | ||
| 118 | unregister_exec_domain(struct exec_domain *ep) | ||
| 119 | { | ||
| 120 | struct exec_domain **epp; | ||
| 121 | |||
| 122 | epp = &exec_domains; | ||
| 123 | write_lock(&exec_domains_lock); | ||
| 124 | for (epp = &exec_domains; *epp; epp = &(*epp)->next) { | ||
| 125 | if (ep == *epp) | ||
| 126 | goto unregister; | ||
| 127 | } | ||
| 128 | write_unlock(&exec_domains_lock); | ||
| 129 | return -EINVAL; | ||
| 130 | |||
| 131 | unregister: | ||
| 132 | *epp = ep->next; | ||
| 133 | ep->next = NULL; | ||
| 134 | write_unlock(&exec_domains_lock); | ||
| 135 | return 0; | ||
| 136 | } | ||
| 137 | EXPORT_SYMBOL(unregister_exec_domain); | ||
| 138 | |||
| 139 | int __set_personality(unsigned int personality) | ||
| 140 | { | ||
| 141 | struct exec_domain *oep = current_thread_info()->exec_domain; | ||
| 142 | |||
| 143 | current_thread_info()->exec_domain = lookup_exec_domain(personality); | ||
| 144 | current->personality = personality; | ||
| 145 | module_put(oep->module); | ||
| 146 | |||
| 147 | return 0; | ||
| 148 | } | ||
| 149 | EXPORT_SYMBOL(__set_personality); | ||
| 150 | |||
| 151 | #ifdef CONFIG_PROC_FS | 23 | #ifdef CONFIG_PROC_FS | 
| 152 | static int execdomains_proc_show(struct seq_file *m, void *v) | 24 | static int execdomains_proc_show(struct seq_file *m, void *v) | 
| 153 | { | 25 | { | 
| 154 | struct exec_domain *ep; | 26 | seq_puts(m, "0-0\tLinux \t[kernel]\n"); | 
| 155 | |||
| 156 | read_lock(&exec_domains_lock); | ||
| 157 | for (ep = exec_domains; ep; ep = ep->next) | ||
| 158 | seq_printf(m, "%d-%d\t%-16s\t[%s]\n", | ||
| 159 | ep->pers_low, ep->pers_high, ep->name, | ||
| 160 | module_name(ep->module)); | ||
| 161 | read_unlock(&exec_domains_lock); | ||
| 162 | return 0; | 27 | return 0; | 
| 163 | } | 28 | } | 
| 164 | 29 | ||
| diff --git a/kernel/exit.c b/kernel/exit.c index feff10bbb307..22fcc05dec40 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -756,8 +756,6 @@ void do_exit(long code) | |||
| 756 | 756 | ||
| 757 | cgroup_exit(tsk); | 757 | cgroup_exit(tsk); | 
| 758 | 758 | ||
| 759 | module_put(task_thread_info(tsk)->exec_domain->module); | ||
| 760 | |||
| 761 | /* | 759 | /* | 
| 762 | * FIXME: do that only when needed, using sched_exit tracepoint | 760 | * FIXME: do that only when needed, using sched_exit tracepoint | 
| 763 | */ | 761 | */ | 
| diff --git a/kernel/fork.c b/kernel/fork.c index cf65139615a0..f2c1e7352298 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1279,9 +1279,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1279 | if (nr_threads >= max_threads) | 1279 | if (nr_threads >= max_threads) | 
| 1280 | goto bad_fork_cleanup_count; | 1280 | goto bad_fork_cleanup_count; | 
| 1281 | 1281 | ||
| 1282 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) | ||
| 1283 | goto bad_fork_cleanup_count; | ||
| 1284 | |||
| 1285 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | 1282 | delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ | 
| 1286 | p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); | 1283 | p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); | 
| 1287 | p->flags |= PF_FORKNOEXEC; | 1284 | p->flags |= PF_FORKNOEXEC; | 
| @@ -1590,7 +1587,6 @@ bad_fork_cleanup_threadgroup_lock: | |||
| 1590 | if (clone_flags & CLONE_THREAD) | 1587 | if (clone_flags & CLONE_THREAD) | 
| 1591 | threadgroup_change_end(current); | 1588 | threadgroup_change_end(current); | 
| 1592 | delayacct_tsk_free(p); | 1589 | delayacct_tsk_free(p); | 
| 1593 | module_put(task_thread_info(p)->exec_domain->module); | ||
| 1594 | bad_fork_cleanup_count: | 1590 | bad_fork_cleanup_count: | 
| 1595 | atomic_dec(&p->cred->user->processes); | 1591 | atomic_dec(&p->cred->user->processes); | 
| 1596 | exit_creds(p); | 1592 | exit_creds(p); | 
| diff --git a/kernel/futex.c b/kernel/futex.c index 2a5e3830e953..2579e407ff67 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, | |||
| 900 | if (!p) | 900 | if (!p) | 
| 901 | return -ESRCH; | 901 | return -ESRCH; | 
| 902 | 902 | ||
| 903 | if (!p->mm) { | 903 | if (unlikely(p->flags & PF_KTHREAD)) { | 
| 904 | put_task_struct(p); | 904 | put_task_struct(p); | 
| 905 | return -EPERM; | 905 | return -EPERM; | 
| 906 | } | 906 | } | 
| diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c | |||
| @@ -9,9 +9,6 @@ | |||
| 9 | #include <linux/user_namespace.h> | 9 | #include <linux/user_namespace.h> | 
| 10 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> | 
| 11 | 11 | ||
| 12 | /* init to 2 - one for init_task, one to ensure it is never freed */ | ||
| 13 | struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; | ||
| 14 | |||
| 15 | struct group_info *groups_alloc(int gidsetsize) | 12 | struct group_info *groups_alloc(int gidsetsize) | 
| 16 | { | 13 | { | 
| 17 | struct group_info *group_info; | 14 | struct group_info *group_info; | 
| diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06db12434d72..e0f90c2b57aa 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -169,7 +169,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
| 169 | return; | 169 | return; | 
| 170 | 170 | ||
| 171 | rcu_read_lock(); | 171 | rcu_read_lock(); | 
| 172 | do_each_thread(g, t) { | 172 | for_each_process_thread(g, t) { | 
| 173 | if (!max_count--) | 173 | if (!max_count--) | 
| 174 | goto unlock; | 174 | goto unlock; | 
| 175 | if (!--batch_count) { | 175 | if (!--batch_count) { | 
| @@ -180,7 +180,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) | |||
| 180 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | 180 | /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ | 
| 181 | if (t->state == TASK_UNINTERRUPTIBLE) | 181 | if (t->state == TASK_UNINTERRUPTIBLE) | 
| 182 | check_hung_task(t, timeout); | 182 | check_hung_task(t, timeout); | 
| 183 | } while_each_thread(g, t); | 183 | } | 
| 184 | unlock: | 184 | unlock: | 
| 185 | rcu_read_unlock(); | 185 | rcu_read_unlock(); | 
| 186 | } | 186 | } | 
| diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6f1c7a566b95..eb9a4ea394ab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) | |||
| 948 | 948 | ||
| 949 | return -ENOSYS; | 949 | return -ENOSYS; | 
| 950 | } | 950 | } | 
| 951 | |||
| 952 | /** | ||
| 953 | * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt | ||
| 954 | * @data: Pointer to interrupt specific data | ||
| 955 | * @on: Whether to set or reset the wake-up capability of this irq | ||
| 956 | * | ||
| 957 | * Conditional, as the underlying parent chip might not implement it. | ||
| 958 | */ | ||
| 959 | int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) | ||
| 960 | { | ||
| 961 | data = data->parent_data; | ||
| 962 | if (data->chip->irq_set_wake) | ||
| 963 | return data->chip->irq_set_wake(data, on); | ||
| 964 | |||
| 965 | return -ENOSYS; | ||
| 966 | } | ||
| 951 | #endif | 967 | #endif | 
| 952 | 968 | ||
| 953 | /** | 969 | /** | 
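
irq_chip_set_wake_parent() is meant to be wired directly into a hierarchical irqchip's callbacks, the same way the existing *_parent helpers are. A minimal sketch; the chip itself is hypothetical:

	#include <linux/irq.h>

	/* Child chip that simply forwards everything to its parent domain. */
	static struct irq_chip my_child_chip = {
		.name		= "my-child",
		.irq_mask	= irq_chip_mask_parent,
		.irq_unmask	= irq_chip_unmask_parent,
		.irq_eoi	= irq_chip_eoi_parent,
		.irq_set_wake	= irq_chip_set_wake_parent,	/* helper added above */
	};

Like the other *_parent helpers, it returns -ENOSYS when the parent chip does not implement the callback.
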
| diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 886d09e691d5..e68932bb308e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) | |||
| 68 | * Do not use this for shutdown scenarios where you must be sure | 68 | * Do not use this for shutdown scenarios where you must be sure | 
| 69 | * that all parts (hardirq and threaded handler) have completed. | 69 | * that all parts (hardirq and threaded handler) have completed. | 
| 70 | * | 70 | * | 
| 71 | * Returns: false if a threaded handler is active. | ||
| 72 | * | ||
| 71 | * This function may be called - with care - from IRQ context. | 73 | * This function may be called - with care - from IRQ context. | 
| 72 | */ | 74 | */ | 
| 73 | void synchronize_hardirq(unsigned int irq) | 75 | bool synchronize_hardirq(unsigned int irq) | 
| 74 | { | 76 | { | 
| 75 | struct irq_desc *desc = irq_to_desc(irq); | 77 | struct irq_desc *desc = irq_to_desc(irq); | 
| 76 | 78 | ||
| 77 | if (desc) | 79 | if (desc) { | 
| 78 | __synchronize_hardirq(desc); | 80 | __synchronize_hardirq(desc); | 
| 81 | return !atomic_read(&desc->threads_active); | ||
| 82 | } | ||
| 83 | |||
| 84 | return true; | ||
| 79 | } | 85 | } | 
| 80 | EXPORT_SYMBOL(synchronize_hardirq); | 86 | EXPORT_SYMBOL(synchronize_hardirq); | 
| 81 | 87 | ||
| @@ -440,6 +446,32 @@ void disable_irq(unsigned int irq) | |||
| 440 | } | 446 | } | 
| 441 | EXPORT_SYMBOL(disable_irq); | 447 | EXPORT_SYMBOL(disable_irq); | 
| 442 | 448 | ||
| 449 | /** | ||
| 450 | * disable_hardirq - disables an irq and waits for hardirq completion | ||
| 451 | * @irq: Interrupt to disable | ||
| 452 | * | ||
| 453 | * Disable the selected interrupt line. Enables and Disables are | ||
| 454 | * nested. | ||
| 455 | * This function waits for any pending hard IRQ handlers for this | ||
| 456 | * interrupt to complete before returning. If you use this function while | ||
| 457 | * holding a resource the hard IRQ handler may need you will deadlock. | ||
| 458 | * | ||
| 459 | * When used to optimistically disable an interrupt from atomic context | ||
| 460 | * the return value must be checked. | ||
| 461 | * | ||
| 462 | * Returns: false if a threaded handler is active. | ||
| 463 | * | ||
| 464 | * This function may be called - with care - from IRQ context. | ||
| 465 | */ | ||
| 466 | bool disable_hardirq(unsigned int irq) | ||
| 467 | { | ||
| 468 | if (!__disable_irq_nosync(irq)) | ||
| 469 | return synchronize_hardirq(irq); | ||
| 470 | |||
| 471 | return false; | ||
| 472 | } | ||
| 473 | EXPORT_SYMBOL_GPL(disable_hardirq); | ||
| 474 | |||
| 443 | void __enable_irq(struct irq_desc *desc, unsigned int irq) | 475 | void __enable_irq(struct irq_desc *desc, unsigned int irq) | 
| 444 | { | 476 | { | 
| 445 | switch (desc->depth) { | 477 | switch (desc->depth) { | 
| @@ -1766,3 +1798,94 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, | |||
| 1766 | 1798 | ||
| 1767 | return retval; | 1799 | return retval; | 
| 1768 | } | 1800 | } | 
| 1801 | |||
| 1802 | /** | ||
| 1803 | * irq_get_irqchip_state - returns the irqchip state of an interrupt. | ||
| 1804 | * @irq: Interrupt line that is forwarded to a VM | ||
| 1805 | * @which: One of IRQCHIP_STATE_* the caller wants to know about | ||
| 1806 | * @state: a pointer to a boolean where the state is to be stored | ||
| 1807 | * | ||
| 1808 | * This call snapshots the internal irqchip state of an | ||
| 1809 | * interrupt, returning into @state the bit corresponding to | ||
| 1810 | * state @which | ||
| 1811 | * | ||
| 1812 | * This function should be called with preemption disabled if the | ||
| 1813 | * interrupt controller has per-cpu registers. | ||
| 1814 | */ | ||
| 1815 | int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | ||
| 1816 | bool *state) | ||
| 1817 | { | ||
| 1818 | struct irq_desc *desc; | ||
| 1819 | struct irq_data *data; | ||
| 1820 | struct irq_chip *chip; | ||
| 1821 | unsigned long flags; | ||
| 1822 | int err = -EINVAL; | ||
| 1823 | |||
| 1824 | desc = irq_get_desc_buslock(irq, &flags, 0); | ||
| 1825 | if (!desc) | ||
| 1826 | return err; | ||
| 1827 | |||
| 1828 | data = irq_desc_get_irq_data(desc); | ||
| 1829 | |||
| 1830 | do { | ||
| 1831 | chip = irq_data_get_irq_chip(data); | ||
| 1832 | if (chip->irq_get_irqchip_state) | ||
| 1833 | break; | ||
| 1834 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 1835 | data = data->parent_data; | ||
| 1836 | #else | ||
| 1837 | data = NULL; | ||
| 1838 | #endif | ||
| 1839 | } while (data); | ||
| 1840 | |||
| 1841 | if (data) | ||
| 1842 | err = chip->irq_get_irqchip_state(data, which, state); | ||
| 1843 | |||
| 1844 | irq_put_desc_busunlock(desc, flags); | ||
| 1845 | return err; | ||
| 1846 | } | ||
| 1847 | |||
| 1848 | /** | ||
| 1849 | * irq_set_irqchip_state - set the state of a forwarded interrupt. | ||
| 1850 | * @irq: Interrupt line that is forwarded to a VM | ||
| 1851 | * @which: State to be restored (one of IRQCHIP_STATE_*) | ||
| 1852 | * @val: Value corresponding to @which | ||
| 1853 | * | ||
| 1854 | * This call sets the internal irqchip state of an interrupt, | ||
| 1855 | * depending on the value of @which. | ||
| 1856 | * | ||
| 1857 | * This function should be called with preemption disabled if the | ||
| 1858 | * interrupt controller has per-cpu registers. | ||
| 1859 | */ | ||
| 1860 | int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | ||
| 1861 | bool val) | ||
| 1862 | { | ||
| 1863 | struct irq_desc *desc; | ||
| 1864 | struct irq_data *data; | ||
| 1865 | struct irq_chip *chip; | ||
| 1866 | unsigned long flags; | ||
| 1867 | int err = -EINVAL; | ||
| 1868 | |||
| 1869 | desc = irq_get_desc_buslock(irq, &flags, 0); | ||
| 1870 | if (!desc) | ||
| 1871 | return err; | ||
| 1872 | |||
| 1873 | data = irq_desc_get_irq_data(desc); | ||
| 1874 | |||
| 1875 | do { | ||
| 1876 | chip = irq_data_get_irq_chip(data); | ||
| 1877 | if (chip->irq_set_irqchip_state) | ||
| 1878 | break; | ||
| 1879 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 1880 | data = data->parent_data; | ||
| 1881 | #else | ||
| 1882 | data = NULL; | ||
| 1883 | #endif | ||
| 1884 | } while (data); | ||
| 1885 | |||
| 1886 | if (data) | ||
| 1887 | err = chip->irq_set_irqchip_state(data, which, val); | ||
| 1888 | |||
| 1889 | irq_put_desc_busunlock(desc, flags); | ||
| 1890 | return err; | ||
| 1891 | } | ||
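
disable_hardirq() and the two irqchip-state accessors added above are aimed at callers that need to reason about what the hardware handler and the irqchip are doing right now; the kerneldoc names lines forwarded to a VM as the target for the latter. A hedged sketch; struct my_dev and my_device_quiesce() are invented for illustration:

	#include <linux/interrupt.h>
	#include <linux/irq.h>
	#include <linux/workqueue.h>

	struct my_dev {
		struct work_struct quiesce_work;
	};

	void my_device_quiesce(struct my_dev *dev);	/* invented helper */

	static void my_quiesce_irq(struct my_dev *dev, unsigned int irq)
	{
		bool pending;

		/* Hard-IRQ completion is waited for; a false return means a
		 * threaded handler is still running, so defer to a path that
		 * can sleep. */
		if (disable_hardirq(irq)) {
			my_device_quiesce(dev);
			enable_irq(irq);
		} else {
			schedule_work(&dev->quiesce_work);
		}

		/* Snapshot (and clear) a pending latch at the irqchip, e.g.
		 * before handing the line over to a guest. */
		if (!irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending) &&
		    pending)
			irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false);
	}

Both accessors walk data->parent_data until they find a chip that implements the matching callback, so the same call works unchanged for stacked irq domains.
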
| diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3e18163f336f..474de5cb394d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c | |||
| @@ -310,8 +310,15 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) | |||
| 310 | struct msi_desc *desc; | 310 | struct msi_desc *desc; | 
| 311 | 311 | ||
| 312 | for_each_msi_entry(desc, dev) { | 312 | for_each_msi_entry(desc, dev) { | 
| 313 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | 313 | /* | 
| 314 | desc->irq = 0; | 314 | * We might have failed to allocate an MSI early | 
| 315 | * enough that there is no IRQ associated with this | ||
| 316 | * entry. If that's the case, don't do anything. | ||
| 317 | */ | ||
| 318 | if (desc->irq) { | ||
| 319 | irq_domain_free_irqs(desc->irq, desc->nvec_used); | ||
| 320 | desc->irq = 0; | ||
| 321 | } | ||
| 315 | } | 322 | } | 
| 316 | } | 323 | } | 
| 317 | 324 | ||
| diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 01ca08804f51..284e2691e380 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
| @@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj) | |||
| 89 | /* sets obj->mod if object is not vmlinux and module is found */ | 89 | /* sets obj->mod if object is not vmlinux and module is found */ | 
| 90 | static void klp_find_object_module(struct klp_object *obj) | 90 | static void klp_find_object_module(struct klp_object *obj) | 
| 91 | { | 91 | { | 
| 92 | struct module *mod; | ||
| 93 | |||
| 92 | if (!klp_is_module(obj)) | 94 | if (!klp_is_module(obj)) | 
| 93 | return; | 95 | return; | 
| 94 | 96 | ||
| 95 | mutex_lock(&module_mutex); | 97 | mutex_lock(&module_mutex); | 
| 96 | /* | 98 | /* | 
| 97 | * We don't need to take a reference on the module here because we have | 99 | * We do not want to block removal of patched modules and therefore | 
| 98 | * the klp_mutex, which is also taken by the module notifier. This | 100 | * we do not take a reference here. The patches are removed by | 
| 99 | * prevents any module from unloading until we release the klp_mutex. | 101 | * a going module handler instead. | 
| 102 | */ | ||
| 103 | mod = find_module(obj->name); | ||
| 104 | /* | ||
| 105 | * Do not mess with the work of the module coming and going notifiers. | ||
| 106 | * Note that the patch might still be needed before the going handler | ||
| 107 | * is called. Module functions can be called even in the GOING state | ||
| 108 | * until mod->exit() finishes. This is especially important for | ||
| 109 | * patches that modify the semantics of the functions. | ||
| 100 | */ | 110 | */ | 
| 101 | obj->mod = find_module(obj->name); | 111 | if (mod && mod->klp_alive) | 
| 112 | obj->mod = mod; | ||
| 113 | |||
| 102 | mutex_unlock(&module_mutex); | 114 | mutex_unlock(&module_mutex); | 
| 103 | } | 115 | } | 
| 104 | 116 | ||
| @@ -323,32 +335,20 @@ unlock: | |||
| 323 | rcu_read_unlock(); | 335 | rcu_read_unlock(); | 
| 324 | } | 336 | } | 
| 325 | 337 | ||
| 326 | static int klp_disable_func(struct klp_func *func) | 338 | static void klp_disable_func(struct klp_func *func) | 
| 327 | { | 339 | { | 
| 328 | struct klp_ops *ops; | 340 | struct klp_ops *ops; | 
| 329 | int ret; | ||
| 330 | 341 | ||
| 331 | if (WARN_ON(func->state != KLP_ENABLED)) | 342 | WARN_ON(func->state != KLP_ENABLED); | 
| 332 | return -EINVAL; | 343 | WARN_ON(!func->old_addr); | 
| 333 | |||
| 334 | if (WARN_ON(!func->old_addr)) | ||
| 335 | return -EINVAL; | ||
| 336 | 344 | ||
| 337 | ops = klp_find_ops(func->old_addr); | 345 | ops = klp_find_ops(func->old_addr); | 
| 338 | if (WARN_ON(!ops)) | 346 | if (WARN_ON(!ops)) | 
| 339 | return -EINVAL; | 347 | return; | 
| 340 | 348 | ||
| 341 | if (list_is_singular(&ops->func_stack)) { | 349 | if (list_is_singular(&ops->func_stack)) { | 
| 342 | ret = unregister_ftrace_function(&ops->fops); | 350 | WARN_ON(unregister_ftrace_function(&ops->fops)); | 
| 343 | if (ret) { | 351 | WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); | 
| 344 | pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", | ||
| 345 | func->old_name, ret); | ||
| 346 | return ret; | ||
| 347 | } | ||
| 348 | |||
| 349 | ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); | ||
| 350 | if (ret) | ||
| 351 | pr_warn("function unregister succeeded but failed to clear the filter\n"); | ||
| 352 | 352 | ||
| 353 | list_del_rcu(&func->stack_node); | 353 | list_del_rcu(&func->stack_node); | 
| 354 | list_del(&ops->node); | 354 | list_del(&ops->node); | 
| @@ -358,8 +358,6 @@ static int klp_disable_func(struct klp_func *func) | |||
| 358 | } | 358 | } | 
| 359 | 359 | ||
| 360 | func->state = KLP_DISABLED; | 360 | func->state = KLP_DISABLED; | 
| 361 | |||
| 362 | return 0; | ||
| 363 | } | 361 | } | 
| 364 | 362 | ||
| 365 | static int klp_enable_func(struct klp_func *func) | 363 | static int klp_enable_func(struct klp_func *func) | 
| @@ -420,23 +418,15 @@ err: | |||
| 420 | return ret; | 418 | return ret; | 
| 421 | } | 419 | } | 
| 422 | 420 | ||
| 423 | static int klp_disable_object(struct klp_object *obj) | 421 | static void klp_disable_object(struct klp_object *obj) | 
| 424 | { | 422 | { | 
| 425 | struct klp_func *func; | 423 | struct klp_func *func; | 
| 426 | int ret; | ||
| 427 | |||
| 428 | for (func = obj->funcs; func->old_name; func++) { | ||
| 429 | if (func->state != KLP_ENABLED) | ||
| 430 | continue; | ||
| 431 | 424 | ||
| 432 | ret = klp_disable_func(func); | 425 | for (func = obj->funcs; func->old_name; func++) | 
| 433 | if (ret) | 426 | if (func->state == KLP_ENABLED) | 
| 434 | return ret; | 427 | klp_disable_func(func); | 
| 435 | } | ||
| 436 | 428 | ||
| 437 | obj->state = KLP_DISABLED; | 429 | obj->state = KLP_DISABLED; | 
| 438 | |||
| 439 | return 0; | ||
| 440 | } | 430 | } | 
| 441 | 431 | ||
| 442 | static int klp_enable_object(struct klp_object *obj) | 432 | static int klp_enable_object(struct klp_object *obj) | 
| @@ -452,22 +442,19 @@ static int klp_enable_object(struct klp_object *obj) | |||
| 452 | 442 | ||
| 453 | for (func = obj->funcs; func->old_name; func++) { | 443 | for (func = obj->funcs; func->old_name; func++) { | 
| 454 | ret = klp_enable_func(func); | 444 | ret = klp_enable_func(func); | 
| 455 | if (ret) | 445 | if (ret) { | 
| 456 | goto unregister; | 446 | klp_disable_object(obj); | 
| 447 | return ret; | ||
| 448 | } | ||
| 457 | } | 449 | } | 
| 458 | obj->state = KLP_ENABLED; | 450 | obj->state = KLP_ENABLED; | 
| 459 | 451 | ||
| 460 | return 0; | 452 | return 0; | 
| 461 | |||
| 462 | unregister: | ||
| 463 | WARN_ON(klp_disable_object(obj)); | ||
| 464 | return ret; | ||
| 465 | } | 453 | } | 
| 466 | 454 | ||
| 467 | static int __klp_disable_patch(struct klp_patch *patch) | 455 | static int __klp_disable_patch(struct klp_patch *patch) | 
| 468 | { | 456 | { | 
| 469 | struct klp_object *obj; | 457 | struct klp_object *obj; | 
| 470 | int ret; | ||
| 471 | 458 | ||
| 472 | /* enforce stacking: only the last enabled patch can be disabled */ | 459 | /* enforce stacking: only the last enabled patch can be disabled */ | 
| 473 | if (!list_is_last(&patch->list, &klp_patches) && | 460 | if (!list_is_last(&patch->list, &klp_patches) && | 
| @@ -477,12 +464,8 @@ static int __klp_disable_patch(struct klp_patch *patch) | |||
| 477 | pr_notice("disabling patch '%s'\n", patch->mod->name); | 464 | pr_notice("disabling patch '%s'\n", patch->mod->name); | 
| 478 | 465 | ||
| 479 | for (obj = patch->objs; obj->funcs; obj++) { | 466 | for (obj = patch->objs; obj->funcs; obj++) { | 
| 480 | if (obj->state != KLP_ENABLED) | 467 | if (obj->state == KLP_ENABLED) | 
| 481 | continue; | 468 | klp_disable_object(obj); | 
| 482 | |||
| 483 | ret = klp_disable_object(obj); | ||
| 484 | if (ret) | ||
| 485 | return ret; | ||
| 486 | } | 469 | } | 
| 487 | 470 | ||
| 488 | patch->state = KLP_DISABLED; | 471 | patch->state = KLP_DISABLED; | 
| @@ -541,8 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch) | |||
| 541 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 524 | pr_notice("enabling patch '%s'\n", patch->mod->name); | 
| 542 | 525 | ||
| 543 | for (obj = patch->objs; obj->funcs; obj++) { | 526 | for (obj = patch->objs; obj->funcs; obj++) { | 
| 544 | klp_find_object_module(obj); | ||
| 545 | |||
| 546 | if (!klp_is_object_loaded(obj)) | 527 | if (!klp_is_object_loaded(obj)) | 
| 547 | continue; | 528 | continue; | 
| 548 | 529 | ||
| @@ -767,6 +748,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) | |||
| 767 | return -EINVAL; | 748 | return -EINVAL; | 
| 768 | 749 | ||
| 769 | obj->state = KLP_DISABLED; | 750 | obj->state = KLP_DISABLED; | 
| 751 | obj->mod = NULL; | ||
| 770 | 752 | ||
| 771 | klp_find_object_module(obj); | 753 | klp_find_object_module(obj); | 
| 772 | 754 | ||
| @@ -932,7 +914,6 @@ static void klp_module_notify_going(struct klp_patch *patch, | |||
| 932 | { | 914 | { | 
| 933 | struct module *pmod = patch->mod; | 915 | struct module *pmod = patch->mod; | 
| 934 | struct module *mod = obj->mod; | 916 | struct module *mod = obj->mod; | 
| 935 | int ret; | ||
| 936 | 917 | ||
| 937 | if (patch->state == KLP_DISABLED) | 918 | if (patch->state == KLP_DISABLED) | 
| 938 | goto disabled; | 919 | goto disabled; | 
| @@ -940,10 +921,7 @@ static void klp_module_notify_going(struct klp_patch *patch, | |||
| 940 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | 921 | pr_notice("reverting patch '%s' on unloading module '%s'\n", | 
| 941 | pmod->name, mod->name); | 922 | pmod->name, mod->name); | 
| 942 | 923 | ||
| 943 | ret = klp_disable_object(obj); | 924 | klp_disable_object(obj); | 
| 944 | if (ret) | ||
| 945 | pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", | ||
| 946 | pmod->name, mod->name, ret); | ||
| 947 | 925 | ||
| 948 | disabled: | 926 | disabled: | 
| 949 | klp_free_object_loaded(obj); | 927 | klp_free_object_loaded(obj); | 
| @@ -961,6 +939,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, | |||
| 961 | 939 | ||
| 962 | mutex_lock(&klp_mutex); | 940 | mutex_lock(&klp_mutex); | 
| 963 | 941 | ||
| 942 | /* | ||
| 943 | * Each module has to know that the notifier has been called. | ||
| 944 | * We never know what module will get patched by a new patch. | ||
| 945 | */ | ||
| 946 | if (action == MODULE_STATE_COMING) | ||
| 947 | mod->klp_alive = true; | ||
| 948 | else /* MODULE_STATE_GOING */ | ||
| 949 | mod->klp_alive = false; | ||
| 950 | |||
| 964 | list_for_each_entry(patch, &klp_patches, list) { | 951 | list_for_each_entry(patch, &klp_patches, list) { | 
| 965 | for (obj = patch->objs; obj->funcs; obj++) { | 952 | for (obj = patch->objs; obj->funcs; obj++) { | 
| 966 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | 953 | if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) | 
| diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..ba77ab5f64dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class) | |||
| 633 | if (!new_class->name) | 633 | if (!new_class->name) | 
| 634 | return 0; | 634 | return 0; | 
| 635 | 635 | ||
| 636 | list_for_each_entry(class, &all_lock_classes, lock_entry) { | 636 | list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { | 
| 637 | if (new_class->key - new_class->subclass == class->key) | 637 | if (new_class->key - new_class->subclass == class->key) | 
| 638 | return class->name_version; | 638 | return class->name_version; | 
| 639 | if (class->name && !strcmp(class->name, new_class->name)) | 639 | if (class->name && !strcmp(class->name, new_class->name)) | 
| @@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) | |||
| 700 | hash_head = classhashentry(key); | 700 | hash_head = classhashentry(key); | 
| 701 | 701 | ||
| 702 | /* | 702 | /* | 
| 703 | * We can walk the hash lockfree, because the hash only | 703 | * We do an RCU walk of the hash, see lockdep_free_key_range(). | 
| 704 | * grows, and we are careful when adding entries to the end: | ||
| 705 | */ | 704 | */ | 
| 706 | list_for_each_entry(class, hash_head, hash_entry) { | 705 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 
| 706 | return NULL; | ||
| 707 | |||
| 708 | list_for_each_entry_rcu(class, hash_head, hash_entry) { | ||
| 707 | if (class->key == key) { | 709 | if (class->key == key) { | 
| 708 | /* | 710 | /* | 
| 709 | * Huh! same key, different name? Did someone trample | 711 | * Huh! same key, different name? Did someone trample | 
| @@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 728 | struct lockdep_subclass_key *key; | 730 | struct lockdep_subclass_key *key; | 
| 729 | struct list_head *hash_head; | 731 | struct list_head *hash_head; | 
| 730 | struct lock_class *class; | 732 | struct lock_class *class; | 
| 731 | unsigned long flags; | 733 | |
| 734 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | ||
| 732 | 735 | ||
| 733 | class = look_up_lock_class(lock, subclass); | 736 | class = look_up_lock_class(lock, subclass); | 
| 734 | if (likely(class)) | 737 | if (likely(class)) | 
| @@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 750 | key = lock->key->subkeys + subclass; | 753 | key = lock->key->subkeys + subclass; | 
| 751 | hash_head = classhashentry(key); | 754 | hash_head = classhashentry(key); | 
| 752 | 755 | ||
| 753 | raw_local_irq_save(flags); | ||
| 754 | if (!graph_lock()) { | 756 | if (!graph_lock()) { | 
| 755 | raw_local_irq_restore(flags); | ||
| 756 | return NULL; | 757 | return NULL; | 
| 757 | } | 758 | } | 
| 758 | /* | 759 | /* | 
| 759 | * We have to do the hash-walk again, to avoid races | 760 | * We have to do the hash-walk again, to avoid races | 
| 760 | * with another CPU: | 761 | * with another CPU: | 
| 761 | */ | 762 | */ | 
| 762 | list_for_each_entry(class, hash_head, hash_entry) | 763 | list_for_each_entry_rcu(class, hash_head, hash_entry) { | 
| 763 | if (class->key == key) | 764 | if (class->key == key) | 
| 764 | goto out_unlock_set; | 765 | goto out_unlock_set; | 
| 766 | } | ||
| 767 | |||
| 765 | /* | 768 | /* | 
| 766 | * Allocate a new key from the static array, and add it to | 769 | * Allocate a new key from the static array, and add it to | 
| 767 | * the hash: | 770 | * the hash: | 
| 768 | */ | 771 | */ | 
| 769 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 772 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 
| 770 | if (!debug_locks_off_graph_unlock()) { | 773 | if (!debug_locks_off_graph_unlock()) { | 
| 771 | raw_local_irq_restore(flags); | ||
| 772 | return NULL; | 774 | return NULL; | 
| 773 | } | 775 | } | 
| 774 | raw_local_irq_restore(flags); | ||
| 775 | 776 | ||
| 776 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); | 777 | print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); | 
| 777 | dump_stack(); | 778 | dump_stack(); | 
| @@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 798 | 799 | ||
| 799 | if (verbose(class)) { | 800 | if (verbose(class)) { | 
| 800 | graph_unlock(); | 801 | graph_unlock(); | 
| 801 | raw_local_irq_restore(flags); | ||
| 802 | 802 | ||
| 803 | printk("\nnew class %p: %s", class->key, class->name); | 803 | printk("\nnew class %p: %s", class->key, class->name); | 
| 804 | if (class->name_version > 1) | 804 | if (class->name_version > 1) | 
| @@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 806 | printk("\n"); | 806 | printk("\n"); | 
| 807 | dump_stack(); | 807 | dump_stack(); | 
| 808 | 808 | ||
| 809 | raw_local_irq_save(flags); | ||
| 810 | if (!graph_lock()) { | 809 | if (!graph_lock()) { | 
| 811 | raw_local_irq_restore(flags); | ||
| 812 | return NULL; | 810 | return NULL; | 
| 813 | } | 811 | } | 
| 814 | } | 812 | } | 
| 815 | out_unlock_set: | 813 | out_unlock_set: | 
| 816 | graph_unlock(); | 814 | graph_unlock(); | 
| 817 | raw_local_irq_restore(flags); | ||
| 818 | 815 | ||
| 819 | out_set_class_cache: | 816 | out_set_class_cache: | 
| 820 | if (!subclass || force) | 817 | if (!subclass || force) | 
| @@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | |||
| 870 | entry->distance = distance; | 867 | entry->distance = distance; | 
| 871 | entry->trace = *trace; | 868 | entry->trace = *trace; | 
| 872 | /* | 869 | /* | 
| 873 | * Since we never remove from the dependency list, the list can | 870 | * Both allocation and removal are done under the graph lock; but | 
| 874 | * be walked lockless by other CPUs, it's only allocation | 871 | * iteration is under RCU-sched; see look_up_lock_class() and | 
| 875 | * that must be protected by the spinlock. But this also means | 872 | * lockdep_free_key_range(). | 
| 876 | * we must make new entries visible only once writes to the | ||
| 877 | * entry become visible - hence the RCU op: | ||
| 878 | */ | 873 | */ | 
| 879 | list_add_tail_rcu(&entry->entry, head); | 874 | list_add_tail_rcu(&entry->entry, head); | 
| 880 | 875 | ||
| @@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry, | |||
| 1025 | else | 1020 | else | 
| 1026 | head = &lock->class->locks_before; | 1021 | head = &lock->class->locks_before; | 
| 1027 | 1022 | ||
| 1028 | list_for_each_entry(entry, head, entry) { | 1023 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | 
| 1024 | |||
| 1025 | list_for_each_entry_rcu(entry, head, entry) { | ||
| 1029 | if (!lock_accessed(entry)) { | 1026 | if (!lock_accessed(entry)) { | 
| 1030 | unsigned int cq_depth; | 1027 | unsigned int cq_depth; | 
| 1031 | mark_lock_accessed(entry, lock); | 1028 | mark_lock_accessed(entry, lock); | 
| @@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
| 2022 | * We can walk it lock-free, because entries only get added | 2019 | * We can walk it lock-free, because entries only get added | 
| 2023 | * to the hash: | 2020 | * to the hash: | 
| 2024 | */ | 2021 | */ | 
| 2025 | list_for_each_entry(chain, hash_head, entry) { | 2022 | list_for_each_entry_rcu(chain, hash_head, entry) { | 
| 2026 | if (chain->chain_key == chain_key) { | 2023 | if (chain->chain_key == chain_key) { | 
| 2027 | cache_hit: | 2024 | cache_hit: | 
| 2028 | debug_atomic_inc(chain_lookup_hits); | 2025 | debug_atomic_inc(chain_lookup_hits); | 
| @@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, | |||
| 2996 | if (unlikely(!debug_locks)) | 2993 | if (unlikely(!debug_locks)) | 
| 2997 | return; | 2994 | return; | 
| 2998 | 2995 | ||
| 2999 | if (subclass) | 2996 | if (subclass) { | 
| 2997 | unsigned long flags; | ||
| 2998 | |||
| 2999 | if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) | ||
| 3000 | return; | ||
| 3001 | |||
| 3002 | raw_local_irq_save(flags); | ||
| 3003 | current->lockdep_recursion = 1; | ||
| 3000 | register_lock_class(lock, subclass, 1); | 3004 | register_lock_class(lock, subclass, 1); | 
| 3005 | current->lockdep_recursion = 0; | ||
| 3006 | raw_local_irq_restore(flags); | ||
| 3007 | } | ||
| 3001 | } | 3008 | } | 
| 3002 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 3009 | EXPORT_SYMBOL_GPL(lockdep_init_map); | 
| 3003 | 3010 | ||
| @@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size) | |||
| 3887 | return addr >= start && addr < start + size; | 3894 | return addr >= start && addr < start + size; | 
| 3888 | } | 3895 | } | 
| 3889 | 3896 | ||
| 3897 | /* | ||
| 3898 | * Used in module.c to remove lock classes from memory that is going to be | ||
| 3899 | * freed; and possibly re-used by other modules. | ||
| 3900 | * | ||
| 3901 | * We will have had one sync_sched() before getting here, so we're guaranteed | ||
| 3902 | * nobody will look up these exact classes -- they're properly dead but still | ||
| 3903 | * allocated. | ||
| 3904 | */ | ||
| 3890 | void lockdep_free_key_range(void *start, unsigned long size) | 3905 | void lockdep_free_key_range(void *start, unsigned long size) | 
| 3891 | { | 3906 | { | 
| 3892 | struct lock_class *class, *next; | 3907 | struct lock_class *class; | 
| 3893 | struct list_head *head; | 3908 | struct list_head *head; | 
| 3894 | unsigned long flags; | 3909 | unsigned long flags; | 
| 3895 | int i; | 3910 | int i; | 
| @@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 3905 | head = classhash_table + i; | 3920 | head = classhash_table + i; | 
| 3906 | if (list_empty(head)) | 3921 | if (list_empty(head)) | 
| 3907 | continue; | 3922 | continue; | 
| 3908 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3923 | list_for_each_entry_rcu(class, head, hash_entry) { | 
| 3909 | if (within(class->key, start, size)) | 3924 | if (within(class->key, start, size)) | 
| 3910 | zap_class(class); | 3925 | zap_class(class); | 
| 3911 | else if (within(class->name, start, size)) | 3926 | else if (within(class->name, start, size)) | 
| @@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 3916 | if (locked) | 3931 | if (locked) | 
| 3917 | graph_unlock(); | 3932 | graph_unlock(); | 
| 3918 | raw_local_irq_restore(flags); | 3933 | raw_local_irq_restore(flags); | 
| 3934 | |||
| 3935 | /* | ||
| 3936 | * Wait for any possible iterators from look_up_lock_class() to pass | ||
| 3937 | * before continuing to free the memory they refer to. | ||
| 3938 | * | ||
| 3939 | * sync_sched() is sufficient because the read-side has IRQs disabled. | ||
| 3940 | */ | ||
| 3941 | synchronize_sched(); | ||
| 3942 | |||
| 3943 | /* | ||
| 3944 | * XXX at this point we could return the resources to the pool; | ||
| 3945 | * instead we leak them. We would need to change to bitmap allocators | ||
| 3946 | * instead of the linear allocators we have now. | ||
| 3947 | */ | ||
| 3919 | } | 3948 | } | 
| 3920 | 3949 | ||
| 3921 | void lockdep_reset_lock(struct lockdep_map *lock) | 3950 | void lockdep_reset_lock(struct lockdep_map *lock) | 
| 3922 | { | 3951 | { | 
| 3923 | struct lock_class *class, *next; | 3952 | struct lock_class *class; | 
| 3924 | struct list_head *head; | 3953 | struct list_head *head; | 
| 3925 | unsigned long flags; | 3954 | unsigned long flags; | 
| 3926 | int i, j; | 3955 | int i, j; | 
| @@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
| 3948 | head = classhash_table + i; | 3977 | head = classhash_table + i; | 
| 3949 | if (list_empty(head)) | 3978 | if (list_empty(head)) | 
| 3950 | continue; | 3979 | continue; | 
| 3951 | list_for_each_entry_safe(class, next, head, hash_entry) { | 3980 | list_for_each_entry_rcu(class, head, hash_entry) { | 
| 3952 | int match = 0; | 3981 | int match = 0; | 
| 3953 | 3982 | ||
| 3954 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | 3983 | for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) | 
| diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index d1fe2ba5bac9..75e114bdf3f2 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h | |||
| @@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 78 | */ | 78 | */ | 
| 79 | return; | 79 | return; | 
| 80 | } | 80 | } | 
| 81 | ACCESS_ONCE(prev->next) = node; | 81 | WRITE_ONCE(prev->next, node); | 
| 82 | 82 | ||
| 83 | /* Wait until the lock holder passes the lock down. */ | 83 | /* Wait until the lock holder passes the lock down. */ | 
| 84 | arch_mcs_spin_lock_contended(&node->locked); | 84 | arch_mcs_spin_lock_contended(&node->locked); | 
| @@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 91 | static inline | 91 | static inline | 
| 92 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 92 | void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | 
| 93 | { | 93 | { | 
| 94 | struct mcs_spinlock *next = ACCESS_ONCE(node->next); | 94 | struct mcs_spinlock *next = READ_ONCE(node->next); | 
| 95 | 95 | ||
| 96 | if (likely(!next)) { | 96 | if (likely(!next)) { | 
| 97 | /* | 97 | /* | 
| @@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) | |||
| 100 | if (likely(cmpxchg(lock, node, NULL) == node)) | 100 | if (likely(cmpxchg(lock, node, NULL) == node)) | 
| 101 | return; | 101 | return; | 
| 102 | /* Wait until the next pointer is set */ | 102 | /* Wait until the next pointer is set */ | 
| 103 | while (!(next = ACCESS_ONCE(node->next))) | 103 | while (!(next = READ_ONCE(node->next))) | 
| 104 | cpu_relax_lowlatency(); | 104 | cpu_relax_lowlatency(); | 
| 105 | } | 105 | } | 
| 106 | 106 | ||
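The ACCESS_ONCE() conversions here (and in mutex.c, osq_lock.c and rwsem-xadd.c below) switch to the READ_ONCE()/WRITE_ONCE() helpers from <linux/compiler.h>, which make the direction of the access explicit and do not depend on a volatile cast that compilers may mishandle for non-scalar types. The mechanical translation, shown on the mcs_spinlock pointers used above:

/* Before: one cast-based macro for both loads and stores. */
struct mcs_spinlock *next = ACCESS_ONCE(node->next);
ACCESS_ONCE(prev->next) = node;

/* After: explicit one-way helpers. */
struct mcs_spinlock *next = READ_ONCE(node->next);
WRITE_ONCE(prev->next, node);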
| diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 94674e5919cb..4cccea6b8934 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/spinlock.h> | 25 | #include <linux/spinlock.h> | 
| 26 | #include <linux/interrupt.h> | 26 | #include <linux/interrupt.h> | 
| 27 | #include <linux/debug_locks.h> | 27 | #include <linux/debug_locks.h> | 
| 28 | #include "mcs_spinlock.h" | 28 | #include <linux/osq_lock.h> | 
| 29 | 29 | ||
| 30 | /* | 30 | /* | 
| 31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 31 | * In the DEBUG case we are using the "NULL fastpath" for mutexes, | 
| @@ -217,44 +217,35 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, | |||
| 217 | } | 217 | } | 
| 218 | 218 | ||
| 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 219 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 
| 220 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
| 221 | { | ||
| 222 | if (lock->owner != owner) | ||
| 223 | return false; | ||
| 224 | |||
| 225 | /* | ||
| 226 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
| 227 | * lock->owner still matches owner, if that fails, owner might | ||
| 228 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
| 229 | * ensures the memory stays valid. | ||
| 230 | */ | ||
| 231 | barrier(); | ||
| 232 | |||
| 233 | return owner->on_cpu; | ||
| 234 | } | ||
| 235 | |||
| 236 | /* | 220 | /* | 
| 237 | * Look out! "owner" is an entirely speculative pointer | 221 | * Look out! "owner" is an entirely speculative pointer | 
| 238 | * access and not reliable. | 222 | * access and not reliable. | 
| 239 | */ | 223 | */ | 
| 240 | static noinline | 224 | static noinline | 
| 241 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | 225 | bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | 
| 242 | { | 226 | { | 
| 227 | bool ret = true; | ||
| 228 | |||
| 243 | rcu_read_lock(); | 229 | rcu_read_lock(); | 
| 244 | while (owner_running(lock, owner)) { | 230 | while (lock->owner == owner) { | 
| 245 | if (need_resched()) | 231 | /* | 
| 232 | * Ensure we emit the owner->on_cpu, dereference _after_ | ||
| 233 | * checking lock->owner still matches owner. If that fails, | ||
| 234 | * owner might point to freed memory. If it still matches, | ||
| 235 | * the rcu_read_lock() ensures the memory stays valid. | ||
| 236 | */ | ||
| 237 | barrier(); | ||
| 238 | |||
| 239 | if (!owner->on_cpu || need_resched()) { | ||
| 240 | ret = false; | ||
| 246 | break; | 241 | break; | 
| 242 | } | ||
| 247 | 243 | ||
| 248 | cpu_relax_lowlatency(); | 244 | cpu_relax_lowlatency(); | 
| 249 | } | 245 | } | 
| 250 | rcu_read_unlock(); | 246 | rcu_read_unlock(); | 
| 251 | 247 | ||
| 252 | /* | 248 | return ret; | 
| 253 | * We break out the loop above on need_resched() and when the | ||
| 254 | * owner changed, which is a sign for heavy contention. Return | ||
| 255 | * success only when lock->owner is NULL. | ||
| 256 | */ | ||
| 257 | return lock->owner == NULL; | ||
| 258 | } | 249 | } | 
| 259 | 250 | ||
| 260 | /* | 251 | /* | 
| @@ -269,7 +260,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) | |||
| 269 | return 0; | 260 | return 0; | 
| 270 | 261 | ||
| 271 | rcu_read_lock(); | 262 | rcu_read_lock(); | 
| 272 | owner = ACCESS_ONCE(lock->owner); | 263 | owner = READ_ONCE(lock->owner); | 
| 273 | if (owner) | 264 | if (owner) | 
| 274 | retval = owner->on_cpu; | 265 | retval = owner->on_cpu; | 
| 275 | rcu_read_unlock(); | 266 | rcu_read_unlock(); | 
| @@ -343,7 +334,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
| 343 | * As such, when deadlock detection needs to be | 334 | * As such, when deadlock detection needs to be | 
| 344 | * performed the optimistic spinning cannot be done. | 335 | * performed the optimistic spinning cannot be done. | 
| 345 | */ | 336 | */ | 
| 346 | if (ACCESS_ONCE(ww->ctx)) | 337 | if (READ_ONCE(ww->ctx)) | 
| 347 | break; | 338 | break; | 
| 348 | } | 339 | } | 
| 349 | 340 | ||
| @@ -351,7 +342,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, | |||
| 351 | * If there's an owner, wait for it to either | 342 | * If there's an owner, wait for it to either | 
| 352 | * release the lock or go to sleep. | 343 | * release the lock or go to sleep. | 
| 353 | */ | 344 | */ | 
| 354 | owner = ACCESS_ONCE(lock->owner); | 345 | owner = READ_ONCE(lock->owner); | 
| 355 | if (owner && !mutex_spin_on_owner(lock, owner)) | 346 | if (owner && !mutex_spin_on_owner(lock, owner)) | 
| 356 | break; | 347 | break; | 
| 357 | 348 | ||
| @@ -490,7 +481,7 @@ static inline int __sched | |||
| 490 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 481 | __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) | 
| 491 | { | 482 | { | 
| 492 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 483 | struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); | 
| 493 | struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | 484 | struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); | 
| 494 | 485 | ||
| 495 | if (!hold_ctx) | 486 | if (!hold_ctx) | 
| 496 | return 0; | 487 | return 0; | 
| diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index c112d00341b0..dc85ee23a26f 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c | |||
| @@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
| 98 | 98 | ||
| 99 | prev = decode_cpu(old); | 99 | prev = decode_cpu(old); | 
| 100 | node->prev = prev; | 100 | node->prev = prev; | 
| 101 | ACCESS_ONCE(prev->next) = node; | 101 | WRITE_ONCE(prev->next, node); | 
| 102 | 102 | ||
| 103 | /* | 103 | /* | 
| 104 | * Normally @prev is untouchable after the above store; because at that | 104 | * Normally @prev is untouchable after the above store; because at that | 
| @@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) | |||
| 109 | * cmpxchg in an attempt to undo our queueing. | 109 | * cmpxchg in an attempt to undo our queueing. | 
| 110 | */ | 110 | */ | 
| 111 | 111 | ||
| 112 | while (!ACCESS_ONCE(node->locked)) { | 112 | while (!READ_ONCE(node->locked)) { | 
| 113 | /* | 113 | /* | 
| 114 | * If we need to reschedule bail... so we can block. | 114 | * If we need to reschedule bail... so we can block. | 
| 115 | */ | 115 | */ | 
| @@ -148,7 +148,7 @@ unqueue: | |||
| 148 | * Or we race against a concurrent unqueue()'s step-B, in which | 148 | * Or we race against a concurrent unqueue()'s step-B, in which | 
| 149 | * case its step-C will write us a new @node->prev pointer. | 149 | * case its step-C will write us a new @node->prev pointer. | 
| 150 | */ | 150 | */ | 
| 151 | prev = ACCESS_ONCE(node->prev); | 151 | prev = READ_ONCE(node->prev); | 
| 152 | } | 152 | } | 
| 153 | 153 | ||
| 154 | /* | 154 | /* | 
| @@ -170,8 +170,8 @@ unqueue: | |||
| 170 | * it will wait in Step-A. | 170 | * it will wait in Step-A. | 
| 171 | */ | 171 | */ | 
| 172 | 172 | ||
| 173 | ACCESS_ONCE(next->prev) = prev; | 173 | WRITE_ONCE(next->prev, prev); | 
| 174 | ACCESS_ONCE(prev->next) = next; | 174 | WRITE_ONCE(prev->next, next); | 
| 175 | 175 | ||
| 176 | return false; | 176 | return false; | 
| 177 | } | 177 | } | 
| @@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock) | |||
| 193 | node = this_cpu_ptr(&osq_node); | 193 | node = this_cpu_ptr(&osq_node); | 
| 194 | next = xchg(&node->next, NULL); | 194 | next = xchg(&node->next, NULL); | 
| 195 | if (next) { | 195 | if (next) { | 
| 196 | ACCESS_ONCE(next->locked) = 1; | 196 | WRITE_ONCE(next->locked, 1); | 
| 197 | return; | 197 | return; | 
| 198 | } | 198 | } | 
| 199 | 199 | ||
| 200 | next = osq_wait_next(lock, node, NULL); | 200 | next = osq_wait_next(lock, node, NULL); | 
| 201 | if (next) | 201 | if (next) | 
| 202 | ACCESS_ONCE(next->locked) = 1; | 202 | WRITE_ONCE(next->locked, 1); | 
| 203 | } | 203 | } | 
| diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6357265a31ad..b73279367087 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) | |||
| 349 | * | 349 | * | 
| 350 | * @task: the task owning the mutex (owner) for which a chain walk is | 350 | * @task: the task owning the mutex (owner) for which a chain walk is | 
| 351 | * probably needed | 351 | * probably needed | 
| 352 | * @deadlock_detect: do we have to carry out deadlock detection? | 352 | * @chwalk: do we have to carry out deadlock detection? | 
| 353 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | 353 | * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck | 
| 354 | * things for a task that has just got its priority adjusted, and | 354 | * things for a task that has just got its priority adjusted, and | 
| 355 | * is waiting on a mutex) | 355 | * is waiting on a mutex) | 
| diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2555ae15ec14..3a5048572065 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c | |||
| @@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) | |||
| 85 | 85 | ||
| 86 | list_del(&waiter->list); | 86 | list_del(&waiter->list); | 
| 87 | tsk = waiter->task; | 87 | tsk = waiter->task; | 
| 88 | /* | ||
| 89 | * Make sure we do not wakeup the next reader before | ||
| 90 | * setting the nil condition to grant the next reader; | ||
| 91 | * otherwise we could miss the wakeup on the other | ||
| 92 | * side and end up sleeping again. See the pairing | ||
| 93 | * in rwsem_down_read_failed(). | ||
| 94 | */ | ||
| 88 | smp_mb(); | 95 | smp_mb(); | 
| 89 | waiter->task = NULL; | 96 | waiter->task = NULL; | 
| 90 | wake_up_process(tsk); | 97 | wake_up_process(tsk); | 
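Both copies of this new comment (here and in rwsem-xadd.c below) point at the sleeper in rwsem_down_read_failed(), which does not appear in this diff. Paraphrased, not the exact upstream body, the wait loop has roughly the shape below: the waker's smp_mb() before the waiter->task = NULL store guarantees the lock grant is visible before the sleeper can observe NULL, break out, and proceed without waiting for wake_up_process().

/*
 * Waiter side, paraphrased from rwsem_down_read_failed(): tsk is current,
 * waiter is the on-stack rwsem_waiter queued on sem->wait_list.
 */
while (true) {
	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
	if (!waiter.task)	/* cleared by the waker only after the grant */
		break;
	schedule();
}
__set_task_state(tsk, TASK_RUNNING);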
| diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2f7cc4076f50..3417d0172a5d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c | |||
| @@ -14,8 +14,9 @@ | |||
| 14 | #include <linux/init.h> | 14 | #include <linux/init.h> | 
| 15 | #include <linux/export.h> | 15 | #include <linux/export.h> | 
| 16 | #include <linux/sched/rt.h> | 16 | #include <linux/sched/rt.h> | 
| 17 | #include <linux/osq_lock.h> | ||
| 17 | 18 | ||
| 18 | #include "mcs_spinlock.h" | 19 | #include "rwsem.h" | 
| 19 | 20 | ||
| 20 | /* | 21 | /* | 
| 21 | * Guide to the rw_semaphore's count field for common values. | 22 | * Guide to the rw_semaphore's count field for common values. | 
| @@ -186,6 +187,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) | |||
| 186 | waiter = list_entry(next, struct rwsem_waiter, list); | 187 | waiter = list_entry(next, struct rwsem_waiter, list); | 
| 187 | next = waiter->list.next; | 188 | next = waiter->list.next; | 
| 188 | tsk = waiter->task; | 189 | tsk = waiter->task; | 
| 190 | /* | ||
| 191 | * Make sure we do not wakeup the next reader before | ||
| 192 | * setting the nil condition to grant the next reader; | ||
| 193 | * otherwise we could miss the wakeup on the other | ||
| 194 | * side and end up sleeping again. See the pairing | ||
| 195 | * in rwsem_down_read_failed(). | ||
| 196 | */ | ||
| 189 | smp_mb(); | 197 | smp_mb(); | 
| 190 | waiter->task = NULL; | 198 | waiter->task = NULL; | 
| 191 | wake_up_process(tsk); | 199 | wake_up_process(tsk); | 
| @@ -258,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
| 258 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 266 | RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { | 
| 259 | if (!list_is_singular(&sem->wait_list)) | 267 | if (!list_is_singular(&sem->wait_list)) | 
| 260 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 268 | rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); | 
| 269 | rwsem_set_owner(sem); | ||
| 261 | return true; | 270 | return true; | 
| 262 | } | 271 | } | 
| 263 | 272 | ||
| @@ -270,15 +279,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | |||
| 270 | */ | 279 | */ | 
| 271 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 280 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | 
| 272 | { | 281 | { | 
| 273 | long old, count = ACCESS_ONCE(sem->count); | 282 | long old, count = READ_ONCE(sem->count); | 
| 274 | 283 | ||
| 275 | while (true) { | 284 | while (true) { | 
| 276 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 285 | if (!(count == 0 || count == RWSEM_WAITING_BIAS)) | 
| 277 | return false; | 286 | return false; | 
| 278 | 287 | ||
| 279 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | 288 | old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); | 
| 280 | if (old == count) | 289 | if (old == count) { | 
| 290 | rwsem_set_owner(sem); | ||
| 281 | return true; | 291 | return true; | 
| 292 | } | ||
| 282 | 293 | ||
| 283 | count = old; | 294 | count = old; | 
| 284 | } | 295 | } | 
| @@ -287,60 +298,67 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | |||
| 287 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 298 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | 
| 288 | { | 299 | { | 
| 289 | struct task_struct *owner; | 300 | struct task_struct *owner; | 
| 290 | bool on_cpu = false; | 301 | bool ret = true; | 
| 291 | 302 | ||
| 292 | if (need_resched()) | 303 | if (need_resched()) | 
| 293 | return false; | 304 | return false; | 
| 294 | 305 | ||
| 295 | rcu_read_lock(); | 306 | rcu_read_lock(); | 
| 296 | owner = ACCESS_ONCE(sem->owner); | 307 | owner = READ_ONCE(sem->owner); | 
| 297 | if (owner) | 308 | if (!owner) { | 
| 298 | on_cpu = owner->on_cpu; | 309 | long count = READ_ONCE(sem->count); | 
| 299 | rcu_read_unlock(); | 310 | /* | 
| 300 | 311 | * If sem->owner is not set, yet we have just recently entered the | |
| 301 | /* | 312 | * slowpath with the lock being active, then there is a possibility | 
| 302 | * If sem->owner is not set, yet we have just recently entered the | 313 | * reader(s) may have the lock. To be safe, bail spinning in these | 
| 303 | * slowpath, then there is a possibility reader(s) may have the lock. | 314 | * situations. | 
| 304 | * To be safe, avoid spinning in these situations. | 315 | */ | 
| 305 | */ | 316 | if (count & RWSEM_ACTIVE_MASK) | 
| 306 | return on_cpu; | 317 | ret = false; | 
| 307 | } | 318 | goto done; | 
| 308 | 319 | } | |
| 309 | static inline bool owner_running(struct rw_semaphore *sem, | ||
| 310 | struct task_struct *owner) | ||
| 311 | { | ||
| 312 | if (sem->owner != owner) | ||
| 313 | return false; | ||
| 314 | |||
| 315 | /* | ||
| 316 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
| 317 | * sem->owner still matches owner, if that fails, owner might | ||
| 318 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
| 319 | * ensures the memory stays valid. | ||
| 320 | */ | ||
| 321 | barrier(); | ||
| 322 | 320 | ||
| 323 | return owner->on_cpu; | 321 | ret = owner->on_cpu; | 
| 322 | done: | ||
| 323 | rcu_read_unlock(); | ||
| 324 | return ret; | ||
| 324 | } | 325 | } | 
| 325 | 326 | ||
| 326 | static noinline | 327 | static noinline | 
| 327 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 328 | bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) | 
| 328 | { | 329 | { | 
| 330 | long count; | ||
| 331 | |||
| 329 | rcu_read_lock(); | 332 | rcu_read_lock(); | 
| 330 | while (owner_running(sem, owner)) { | 333 | while (sem->owner == owner) { | 
| 331 | if (need_resched()) | 334 | /* | 
| 332 | break; | 335 | * Ensure we emit the owner->on_cpu, dereference _after_ | 
| 336 | * checking sem->owner still matches owner, if that fails, | ||
| 337 | * owner might point to free()d memory, if it still matches, | ||
| 338 | * the rcu_read_lock() ensures the memory stays valid. | ||
| 339 | */ | ||
| 340 | barrier(); | ||
| 341 | |||
| 342 | /* abort spinning when need_resched or owner is not running */ | ||
| 343 | if (!owner->on_cpu || need_resched()) { | ||
| 344 | rcu_read_unlock(); | ||
| 345 | return false; | ||
| 346 | } | ||
| 333 | 347 | ||
| 334 | cpu_relax_lowlatency(); | 348 | cpu_relax_lowlatency(); | 
| 335 | } | 349 | } | 
| 336 | rcu_read_unlock(); | 350 | rcu_read_unlock(); | 
| 337 | 351 | ||
| 352 | if (READ_ONCE(sem->owner)) | ||
| 353 | return true; /* new owner, continue spinning */ | ||
| 354 | |||
| 338 | /* | 355 | /* | 
| 339 | * We break out the loop above on need_resched() or when the | 356 | * When the owner is not set, the lock could be free or | 
| 340 | * owner changed, which is a sign for heavy contention. Return | 357 | * held by readers. Check the counter to verify the | 
| 341 | * success only when sem->owner is NULL. | 358 | * state. | 
| 342 | */ | 359 | */ | 
| 343 | return sem->owner == NULL; | 360 | count = READ_ONCE(sem->count); | 
| 361 | return (count == 0 || count == RWSEM_WAITING_BIAS); | ||
| 344 | } | 362 | } | 
| 345 | 363 | ||
| 346 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 364 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | 
| @@ -358,7 +376,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | |||
| 358 | goto done; | 376 | goto done; | 
| 359 | 377 | ||
| 360 | while (true) { | 378 | while (true) { | 
| 361 | owner = ACCESS_ONCE(sem->owner); | 379 | owner = READ_ONCE(sem->owner); | 
| 362 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 380 | if (owner && !rwsem_spin_on_owner(sem, owner)) | 
| 363 | break; | 381 | break; | 
| 364 | 382 | ||
| @@ -432,7 +450,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) | |||
| 432 | 450 | ||
| 433 | /* we're now waiting on the lock, but no longer actively locking */ | 451 | /* we're now waiting on the lock, but no longer actively locking */ | 
| 434 | if (waiting) { | 452 | if (waiting) { | 
| 435 | count = ACCESS_ONCE(sem->count); | 453 | count = READ_ONCE(sem->count); | 
| 436 | 454 | ||
| 437 | /* | 455 | /* | 
| 438 | * If there were already threads queued before us and there are | 456 | * If there were already threads queued before us and there are | 
| diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e2d3bc7f03b4..205be0ce34de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
| @@ -9,29 +9,9 @@ | |||
| 9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> | 
| 10 | #include <linux/export.h> | 10 | #include <linux/export.h> | 
| 11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> | 
| 12 | |||
| 13 | #include <linux/atomic.h> | 12 | #include <linux/atomic.h> | 
| 14 | 13 | ||
| 15 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | 14 | #include "rwsem.h" | 
| 16 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 17 | { | ||
| 18 | sem->owner = current; | ||
| 19 | } | ||
| 20 | |||
| 21 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 22 | { | ||
| 23 | sem->owner = NULL; | ||
| 24 | } | ||
| 25 | |||
| 26 | #else | ||
| 27 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 28 | { | ||
| 29 | } | ||
| 30 | |||
| 31 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 32 | { | ||
| 33 | } | ||
| 34 | #endif | ||
| 35 | 15 | ||
| 36 | /* | 16 | /* | 
| 37 | * lock for reading | 17 | * lock for reading | 
| diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h new file mode 100644 index 000000000000..870ed9a5b426 --- /dev/null +++ b/kernel/locking/rwsem.h | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 2 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 3 | { | ||
| 4 | sem->owner = current; | ||
| 5 | } | ||
| 6 | |||
| 7 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 8 | { | ||
| 9 | sem->owner = NULL; | ||
| 10 | } | ||
| 11 | |||
| 12 | #else | ||
| 13 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 14 | { | ||
| 15 | } | ||
| 16 | |||
| 17 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 18 | { | ||
| 19 | } | ||
| 20 | #endif | ||
| diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..650b038ae520 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) | |||
| 1865 | kfree(mod->args); | 1865 | kfree(mod->args); | 
| 1866 | percpu_modfree(mod); | 1866 | percpu_modfree(mod); | 
| 1867 | 1867 | ||
| 1868 | /* Free lock-classes: */ | 1868 | /* Free lock-classes; relies on the preceding sync_rcu(). */ | 
| 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1869 | lockdep_free_key_range(mod->module_core, mod->core_size); | 
| 1870 | 1870 | ||
| 1871 | /* Finally, free the core (containing the module structure) */ | 1871 | /* Finally, free the core (containing the module structure) */ | 
| @@ -2479,6 +2479,23 @@ static int elf_header_check(struct load_info *info) | |||
| 2479 | return 0; | 2479 | return 0; | 
| 2480 | } | 2480 | } | 
| 2481 | 2481 | ||
| 2482 | #define COPY_CHUNK_SIZE (16*PAGE_SIZE) | ||
| 2483 | |||
| 2484 | static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned long len) | ||
| 2485 | { | ||
| 2486 | do { | ||
| 2487 | unsigned long n = min(len, COPY_CHUNK_SIZE); | ||
| 2488 | |||
| 2489 | if (copy_from_user(dst, usrc, n) != 0) | ||
| 2490 | return -EFAULT; | ||
| 2491 | cond_resched(); | ||
| 2492 | dst += n; | ||
| 2493 | usrc += n; | ||
| 2494 | len -= n; | ||
| 2495 | } while (len); | ||
| 2496 | return 0; | ||
| 2497 | } | ||
| 2498 | |||
| 2482 | /* Sets info->hdr and info->len. */ | 2499 | /* Sets info->hdr and info->len. */ | 
| 2483 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 2500 | static int copy_module_from_user(const void __user *umod, unsigned long len, | 
| 2484 | struct load_info *info) | 2501 | struct load_info *info) | 
| @@ -2498,7 +2515,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, | |||
| 2498 | if (!info->hdr) | 2515 | if (!info->hdr) | 
| 2499 | return -ENOMEM; | 2516 | return -ENOMEM; | 
| 2500 | 2517 | ||
| 2501 | if (copy_from_user(info->hdr, umod, info->len) != 0) { | 2518 | if (copy_chunked_from_user(info->hdr, umod, info->len) != 0) { | 
| 2502 | vfree(info->hdr); | 2519 | vfree(info->hdr); | 
| 2503 | return -EFAULT; | 2520 | return -EFAULT; | 
| 2504 | } | 2521 | } | 
| @@ -2753,6 +2770,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 2753 | mod->trace_events = section_objs(info, "_ftrace_events", | 2770 | mod->trace_events = section_objs(info, "_ftrace_events", | 
| 2754 | sizeof(*mod->trace_events), | 2771 | sizeof(*mod->trace_events), | 
| 2755 | &mod->num_trace_events); | 2772 | &mod->num_trace_events); | 
| 2773 | mod->trace_enums = section_objs(info, "_ftrace_enum_map", | ||
| 2774 | sizeof(*mod->trace_enums), | ||
| 2775 | &mod->num_trace_enums); | ||
| 2756 | #endif | 2776 | #endif | 
| 2757 | #ifdef CONFIG_TRACING | 2777 | #ifdef CONFIG_TRACING | 
| 2758 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 2778 | mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", | 
| @@ -3349,9 +3369,6 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3349 | module_bug_cleanup(mod); | 3369 | module_bug_cleanup(mod); | 
| 3350 | mutex_unlock(&module_mutex); | 3370 | mutex_unlock(&module_mutex); | 
| 3351 | 3371 | ||
| 3352 | /* Free lock-classes: */ | ||
| 3353 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
| 3354 | |||
| 3355 | /* we can't deallocate the module until we clear memory protection */ | 3372 | /* we can't deallocate the module until we clear memory protection */ | 
| 3356 | unset_module_init_ro_nx(mod); | 3373 | unset_module_init_ro_nx(mod); | 
| 3357 | unset_module_core_ro_nx(mod); | 3374 | unset_module_core_ro_nx(mod); | 
| @@ -3375,6 +3392,9 @@ static int load_module(struct load_info *info, const char __user *uargs, | |||
| 3375 | synchronize_rcu(); | 3392 | synchronize_rcu(); | 
| 3376 | mutex_unlock(&module_mutex); | 3393 | mutex_unlock(&module_mutex); | 
| 3377 | free_module: | 3394 | free_module: | 
| 3395 | /* Free lock-classes; relies on the preceding sync_rcu() */ | ||
| 3396 | lockdep_free_key_range(mod->module_core, mod->core_size); | ||
| 3397 | |||
| 3378 | module_deallocate(mod, info); | 3398 | module_deallocate(mod, info); | 
| 3379 | free_copy: | 3399 | free_copy: | 
| 3380 | free_copy(info); | 3400 | free_copy(info); | 
| diff --git a/kernel/power/main.c b/kernel/power/main.c index 9a59d042ea84..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -11,7 +11,7 @@ | |||
| 11 | #include <linux/export.h> | 11 | #include <linux/export.h> | 
| 12 | #include <linux/kobject.h> | 12 | #include <linux/kobject.h> | 
| 13 | #include <linux/string.h> | 13 | #include <linux/string.h> | 
| 14 | #include <linux/resume-trace.h> | 14 | #include <linux/pm-trace.h> | 
| 15 | #include <linux/workqueue.h> | 15 | #include <linux/workqueue.h> | 
| 16 | #include <linux/debugfs.h> | 16 | #include <linux/debugfs.h> | 
| 17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> | 
| diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c24d5a23bf93..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
| 955 | } | 955 | } | 
| 956 | } | 956 | } | 
| 957 | 957 | ||
| 958 | static bool is_nosave_page(unsigned long pfn) | ||
| 959 | { | ||
| 960 | struct nosave_region *region; | ||
| 961 | |||
| 962 | list_for_each_entry(region, &nosave_regions, list) { | ||
| 963 | if (pfn >= region->start_pfn && pfn < region->end_pfn) { | ||
| 964 | pr_err("PM: %#010llx in e820 nosave region: " | ||
| 965 | "[mem %#010llx-%#010llx]\n", | ||
| 966 | (unsigned long long) pfn << PAGE_SHIFT, | ||
| 967 | (unsigned long long) region->start_pfn << PAGE_SHIFT, | ||
| 968 | ((unsigned long long) region->end_pfn << PAGE_SHIFT) | ||
| 969 | - 1); | ||
| 970 | return true; | ||
| 971 | } | ||
| 972 | } | ||
| 973 | |||
| 974 | return false; | ||
| 975 | } | ||
| 976 | |||
| 977 | /** | 958 | /** | 
| 978 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 959 | * create_basic_memory_bitmaps - create bitmaps needed for marking page | 
| 979 | * frames that should not be saved and free page frames. The pointers | 960 | * frames that should not be saved and free page frames. The pointers | 
| @@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
| 2042 | do { | 2023 | do { | 
| 2043 | pfn = memory_bm_next_pfn(bm); | 2024 | pfn = memory_bm_next_pfn(bm); | 
| 2044 | if (likely(pfn != BM_END_OF_MAP)) { | 2025 | if (likely(pfn != BM_END_OF_MAP)) { | 
| 2045 | if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) | 2026 | if (likely(pfn_valid(pfn))) | 
| 2046 | swsusp_set_page_free(pfn_to_page(pfn)); | 2027 | swsusp_set_page_free(pfn_to_page(pfn)); | 
| 2047 | else | 2028 | else | 
| 2048 | return -EFAULT; | 2029 | return -EFAULT; | 
| diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b7d6b3a721b1..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/ftrace.h> | 28 | #include <linux/ftrace.h> | 
| 29 | #include <trace/events/power.h> | 29 | #include <trace/events/power.h> | 
| 30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> | 
| 31 | #include <linux/moduleparam.h> | ||
| 31 | 32 | ||
| 32 | #include "power.h" | 33 | #include "power.h" | 
| 33 | 34 | ||
| @@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state) | |||
| 233 | suspend_ops->suspend_again() : false; | 234 | suspend_ops->suspend_again() : false; | 
| 234 | } | 235 | } | 
| 235 | 236 | ||
| 237 | #ifdef CONFIG_PM_DEBUG | ||
| 238 | static unsigned int pm_test_delay = 5; | ||
| 239 | module_param(pm_test_delay, uint, 0644); | ||
| 240 | MODULE_PARM_DESC(pm_test_delay, | ||
| 241 | "Number of seconds to wait before resuming from suspend test"); | ||
| 242 | #endif | ||
| 243 | |||
| 236 | static int suspend_test(int level) | 244 | static int suspend_test(int level) | 
| 237 | { | 245 | { | 
| 238 | #ifdef CONFIG_PM_DEBUG | 246 | #ifdef CONFIG_PM_DEBUG | 
| 239 | if (pm_test_level == level) { | 247 | if (pm_test_level == level) { | 
| 240 | printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); | 248 | printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", | 
| 241 | mdelay(5000); | 249 | pm_test_delay); | 
| 250 | mdelay(pm_test_delay * 1000); | ||
| 242 | return 1; | 251 | return 1; | 
| 243 | } | 252 | } | 
| 244 | #endif /* !CONFIG_PM_DEBUG */ | 253 | #endif /* !CONFIG_PM_DEBUG */ | 
| diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bb0635bd74f2..879edfc5ee52 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -32,7 +32,6 @@ | |||
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> | 
| 33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> | 
| 34 | #include <linux/memblock.h> | 34 | #include <linux/memblock.h> | 
| 35 | #include <linux/aio.h> | ||
| 36 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> | 
| 37 | #include <linux/kexec.h> | 36 | #include <linux/kexec.h> | 
| 38 | #include <linux/kdb.h> | 37 | #include <linux/kdb.h> | 
| @@ -46,6 +45,7 @@ | |||
| 46 | #include <linux/irq_work.h> | 45 | #include <linux/irq_work.h> | 
| 47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> | 
| 48 | #include <linux/ctype.h> | 47 | #include <linux/ctype.h> | 
| 48 | #include <linux/uio.h> | ||
| 49 | 49 | ||
| 50 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> | 
| 51 | 51 | ||
| @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) | |||
| 521 | int i; | 521 | int i; | 
| 522 | int level = default_message_loglevel; | 522 | int level = default_message_loglevel; | 
| 523 | int facility = 1; /* LOG_USER */ | 523 | int facility = 1; /* LOG_USER */ | 
| 524 | size_t len = iocb->ki_nbytes; | 524 | size_t len = iov_iter_count(from); | 
| 525 | ssize_t ret = len; | 525 | ssize_t ret = len; | 
| 526 | 526 | ||
| 527 | if (len > LOG_LINE_MAX) | 527 | if (len > LOG_LINE_MAX) | 
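devkmsg_write() now takes its length from the iov_iter instead of the removed iocb->ki_nbytes field, matching the header switch from <linux/aio.h> to <linux/uio.h>. A generic sketch of consuming a write through an iov_iter is shown below (illustrative only, not the actual devkmsg_write() body; needs <linux/uio.h> and <linux/slab.h>):

static ssize_t example_write(struct kiocb *iocb, struct iov_iter *from)
{
	size_t len = iov_iter_count(from);
	char *buf;

	buf = kmalloc(len + 1, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (copy_from_iter(buf, len, from) != len) {
		kfree(buf);
		return -EFAULT;
	}
	buf[len] = '\0';

	/* ... parse and consume buf ... */
	kfree(buf);
	return len;
}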
| diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30d42aa55d83..8dbe27611ec3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg) | |||
| 853 | static int | 853 | static int | 
| 854 | rcu_torture_writer(void *arg) | 854 | rcu_torture_writer(void *arg) | 
| 855 | { | 855 | { | 
| 856 | bool can_expedite = !rcu_gp_is_expedited(); | ||
| 857 | int expediting = 0; | ||
| 856 | unsigned long gp_snap; | 858 | unsigned long gp_snap; | 
| 857 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | 859 | bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; | 
| 858 | bool gp_sync1 = gp_sync; | 860 | bool gp_sync1 = gp_sync; | 
| @@ -865,9 +867,15 @@ rcu_torture_writer(void *arg) | |||
| 865 | int nsynctypes = 0; | 867 | int nsynctypes = 0; | 
| 866 | 868 | ||
| 867 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 869 | VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); | 
| 870 | pr_alert("%s" TORTURE_FLAG | ||
| 871 | " Grace periods expedited from boot/sysfs for %s,\n", | ||
| 872 | torture_type, cur_ops->name); | ||
| 873 | pr_alert("%s" TORTURE_FLAG | ||
| 874 | " Testing of dynamic grace-period expediting disabled.\n", | ||
| 875 | torture_type); | ||
| 868 | 876 | ||
| 869 | /* Initialize synctype[] array. If none set, take default. */ | 877 | /* Initialize synctype[] array. If none set, take default. */ | 
| 870 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) | 878 | if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) | 
| 871 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 879 | gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; | 
| 872 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 880 | if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) | 
| 873 | synctype[nsynctypes++] = RTWS_COND_GET; | 881 | synctype[nsynctypes++] = RTWS_COND_GET; | 
| @@ -949,9 +957,26 @@ rcu_torture_writer(void *arg) | |||
| 949 | } | 957 | } | 
| 950 | } | 958 | } | 
| 951 | rcutorture_record_progress(++rcu_torture_current_version); | 959 | rcutorture_record_progress(++rcu_torture_current_version); | 
| 960 | /* Cycle through nesting levels of rcu_expedite_gp() calls. */ | ||
| 961 | if (can_expedite && | ||
| 962 | !(torture_random(&rand) & 0xff & (!!expediting - 1))) { | ||
| 963 | WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited()); | ||
| 964 | if (expediting >= 0) | ||
| 965 | rcu_expedite_gp(); | ||
| 966 | else | ||
| 967 | rcu_unexpedite_gp(); | ||
| 968 | if (++expediting > 3) | ||
| 969 | expediting = -expediting; | ||
| 970 | } | ||
| 952 | rcu_torture_writer_state = RTWS_STUTTER; | 971 | rcu_torture_writer_state = RTWS_STUTTER; | 
| 953 | stutter_wait("rcu_torture_writer"); | 972 | stutter_wait("rcu_torture_writer"); | 
| 954 | } while (!torture_must_stop()); | 973 | } while (!torture_must_stop()); | 
| 974 | /* Reset expediting back to unexpedited. */ | ||
| 975 | if (expediting > 0) | ||
| 976 | expediting = -expediting; | ||
| 977 | while (can_expedite && expediting++ < 0) | ||
| 978 | rcu_unexpedite_gp(); | ||
| 979 | WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); | ||
| 955 | rcu_torture_writer_state = RTWS_STOPPING; | 980 | rcu_torture_writer_state = RTWS_STOPPING; | 
| 956 | torture_kthread_stopping("rcu_torture_writer"); | 981 | torture_kthread_stopping("rcu_torture_writer"); | 
| 957 | return 0; | 982 | return 0; | 
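The cycling test !(torture_random(&rand) & 0xff & (!!expediting - 1)) added above is compact; a logically equivalent form is sketched below (the original also advances the RNG on every pass, which this form does not):

/*
 * While no expediting cycle is in progress, start one with probability
 * 1/256 per pass; once a cycle is running (expediting != 0), step it on
 * every pass until it wraps back to zero.
 */
bool step;

if (expediting == 0)
	step = (torture_random(&rand) & 0xff) == 0;
else
	step = true;

if (can_expedite && step) {
	/* ... the rcu_expedite_gp()/rcu_unexpedite_gp() stepping above ... */
}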
| diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..cad76e76b4e7 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c | |||
| @@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | |||
| 402 | } | 402 | } | 
| 403 | EXPORT_SYMBOL_GPL(call_srcu); | 403 | EXPORT_SYMBOL_GPL(call_srcu); | 
| 404 | 404 | ||
| 405 | struct rcu_synchronize { | ||
| 406 | struct rcu_head head; | ||
| 407 | struct completion completion; | ||
| 408 | }; | ||
| 409 | |||
| 410 | /* | ||
| 411 | * Awaken the corresponding synchronize_srcu() instance now that a | ||
| 412 | * grace period has elapsed. | ||
| 413 | */ | ||
| 414 | static void wakeme_after_rcu(struct rcu_head *head) | ||
| 415 | { | ||
| 416 | struct rcu_synchronize *rcu; | ||
| 417 | |||
| 418 | rcu = container_of(head, struct rcu_synchronize, head); | ||
| 419 | complete(&rcu->completion); | ||
| 420 | } | ||
| 421 | |||
| 422 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | 
| 423 | static void srcu_reschedule(struct srcu_struct *sp); | 406 | static void srcu_reschedule(struct srcu_struct *sp); | 
| 424 | 407 | ||
| @@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | |||
| 507 | */ | 490 | */ | 
| 508 | void synchronize_srcu(struct srcu_struct *sp) | 491 | void synchronize_srcu(struct srcu_struct *sp) | 
| 509 | { | 492 | { | 
| 510 | __synchronize_srcu(sp, rcu_expedited | 493 | __synchronize_srcu(sp, rcu_gp_is_expedited() | 
| 511 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | 494 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | 
| 512 | : SYNCHRONIZE_SRCU_TRYCOUNT); | 495 | : SYNCHRONIZE_SRCU_TRYCOUNT); | 
| 513 | } | 496 | } | 
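The rcu_synchronize/wakeme_after_rcu pair removed here is the generic wait-for-grace-period-via-completion helper; the removal presumably consolidates it with the identical copy in the common RCU update code rather than changing behaviour. The pattern that __synchronize_srcu() builds on looks roughly like the sketch below, reusing the removed names (wait_for_srcu_gp is a hypothetical wrapper, not an upstream function):

struct rcu_synchronize {
	struct rcu_head head;
	struct completion completion;
};

/* Grace-period callback: wake whoever is waiting on the completion. */
static void wakeme_after_rcu(struct rcu_head *head)
{
	struct rcu_synchronize *rcu =
		container_of(head, struct rcu_synchronize, head);

	complete(&rcu->completion);
}

/* Sketch of waiting for an SRCU grace period via call_srcu(). */
static void wait_for_srcu_gp(struct srcu_struct *sp)
{
	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);
	init_completion(&rcu.completion);
	call_srcu(sp, &rcu.head, wakeme_after_rcu);
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);
}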
| diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index cc9ceca7bde1..069742d61c68 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c | |||
| @@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching); | |||
| 103 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 103 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 
| 104 | { | 104 | { | 
| 105 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 105 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | 
| 106 | if (rcp->rcucblist != NULL && | 106 | if (rcp->donetail != rcp->curtail) { | 
| 107 | rcp->donetail != rcp->curtail) { | ||
| 108 | rcp->donetail = rcp->curtail; | 107 | rcp->donetail = rcp->curtail; | 
| 109 | return 1; | 108 | return 1; | 
| 110 | } | 109 | } | 
| @@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 169 | unsigned long flags; | 168 | unsigned long flags; | 
| 170 | RCU_TRACE(int cb_count = 0); | 169 | RCU_TRACE(int cb_count = 0); | 
| 171 | 170 | ||
| 172 | /* If no RCU callbacks ready to invoke, just return. */ | ||
| 173 | if (&rcp->rcucblist == rcp->donetail) { | ||
| 174 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | ||
| 175 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | ||
| 176 | !!ACCESS_ONCE(rcp->rcucblist), | ||
| 177 | need_resched(), | ||
| 178 | is_idle_task(current), | ||
| 179 | false)); | ||
| 180 | return; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* Move the ready-to-invoke callbacks to a local list. */ | 171 | /* Move the ready-to-invoke callbacks to a local list. */ | 
| 184 | local_irq_save(flags); | 172 | local_irq_save(flags); | 
| 185 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 173 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | 
| diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..233165da782f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var | |||
| 91 | 91 | ||
| 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 92 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | 
| 93 | DEFINE_RCU_TPS(sname) \ | 93 | DEFINE_RCU_TPS(sname) \ | 
| 94 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ | ||
| 94 | struct rcu_state sname##_state = { \ | 95 | struct rcu_state sname##_state = { \ | 
| 95 | .level = { &sname##_state.node[0] }, \ | 96 | .level = { &sname##_state.node[0] }, \ | 
| 97 | .rda = &sname##_data, \ | ||
| 96 | .call = cr, \ | 98 | .call = cr, \ | 
| 97 | .fqs_state = RCU_GP_IDLE, \ | 99 | .fqs_state = RCU_GP_IDLE, \ | 
| 98 | .gpnum = 0UL - 300UL, \ | 100 | .gpnum = 0UL - 300UL, \ | 
| @@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \ | |||
| 101 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 103 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 
| 102 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 104 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 
| 103 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 105 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 
| 104 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | ||
| 105 | .name = RCU_STATE_NAME(sname), \ | 106 | .name = RCU_STATE_NAME(sname), \ | 
| 106 | .abbr = sabbr, \ | 107 | .abbr = sabbr, \ | 
| 107 | }; \ | 108 | } | 
| 108 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) | ||
| 109 | 109 | ||
| 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 110 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | 
| 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 111 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | 
| @@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |||
| 152 | */ | 152 | */ | 
| 153 | static int rcu_scheduler_fully_active __read_mostly; | 153 | static int rcu_scheduler_fully_active __read_mostly; | 
| 154 | 154 | ||
| 155 | static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); | ||
| 156 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); | ||
| 155 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 157 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 
| 156 | static void invoke_rcu_core(void); | 158 | static void invoke_rcu_core(void); | 
| 157 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 159 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 
| @@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
| 160 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | 162 | static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; | 
| 161 | module_param(kthread_prio, int, 0644); | 163 | module_param(kthread_prio, int, 0644); | 
| 162 | 164 | ||
| 165 | /* Delay in jiffies for grace-period initialization delays. */ | ||
| 166 | static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) | ||
| 167 | ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY | ||
| 168 | : 0; | ||
| 169 | module_param(gp_init_delay, int, 0644); | ||
| 170 | |||
| 163 | /* | 171 | /* | 
| 164 | * Track the rcutorture test sequence number and the update version | 172 | * Track the rcutorture test sequence number and the update version | 
| 165 | * number within a given test. The rcutorture_testseq is incremented | 173 | * number within a given test. The rcutorture_testseq is incremented | 
| @@ -173,6 +181,17 @@ unsigned long rcutorture_testseq; | |||
| 173 | unsigned long rcutorture_vernum; | 181 | unsigned long rcutorture_vernum; | 
| 174 | 182 | ||
| 175 | /* | 183 | /* | 
| 184 | * Compute the mask of online CPUs for the specified rcu_node structure. | ||
| 185 | * This will not be stable unless the rcu_node structure's ->lock is | ||
| 186 | * held, but the bit corresponding to the current CPU will be stable | ||
| 187 | * in most contexts. | ||
| 188 | */ | ||
| 189 | unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) | ||
| 190 | { | ||
| 191 | return ACCESS_ONCE(rnp->qsmaskinitnext); | ||
| 192 | } | ||
| 193 | |||
| 194 | /* | ||
| 176 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 195 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 
| 177 | * permit this function to be invoked without holding the root rcu_node | 196 | * permit this function to be invoked without holding the root rcu_node | 
| 178 | * structure's ->lock, but of course results can be subject to change. | 197 | * structure's ->lock, but of course results can be subject to change. | 
| @@ -292,10 +311,10 @@ void rcu_note_context_switch(void) | |||
| 292 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 311 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 
| 293 | 312 | ||
| 294 | /* | 313 | /* | 
| 295 | * Register a quiesecent state for all RCU flavors. If there is an | 314 | * Register a quiescent state for all RCU flavors. If there is an | 
| 296 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | 315 | * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight | 
| 297 | * dyntick-idle quiescent state visible to other CPUs (but only for those | 316 | * dyntick-idle quiescent state visible to other CPUs (but only for those | 
| 298 | * RCU flavors in desparate need of a quiescent state, which will normally | 317 | * RCU flavors in desperate need of a quiescent state, which will normally | 
| 299 | * be none of them). Either way, do a lightweight quiescent state for | 318 | * be none of them). Either way, do a lightweight quiescent state for | 
| 300 | * all RCU flavors. | 319 | * all RCU flavors. | 
| 301 | */ | 320 | */ | 
| @@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void) | |||
| 410 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 429 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 
| 411 | 430 | ||
| 412 | /* | 431 | /* | 
| 432 | * Force a quiescent state for RCU-sched. | ||
| 433 | */ | ||
| 434 | void rcu_sched_force_quiescent_state(void) | ||
| 435 | { | ||
| 436 | force_quiescent_state(&rcu_sched_state); | ||
| 437 | } | ||
| 438 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
| 439 | |||
| 440 | /* | ||
| 413 | * Show the state of the grace-period kthreads. | 441 | * Show the state of the grace-period kthreads. | 
| 414 | */ | 442 | */ | 
| 415 | void show_rcu_gp_kthreads(void) | 443 | void show_rcu_gp_kthreads(void) | 
| @@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum) | |||
| 483 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 511 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | 
| 484 | 512 | ||
| 485 | /* | 513 | /* | 
| 486 | * Force a quiescent state for RCU-sched. | ||
| 487 | */ | ||
| 488 | void rcu_sched_force_quiescent_state(void) | ||
| 489 | { | ||
| 490 | force_quiescent_state(&rcu_sched_state); | ||
| 491 | } | ||
| 492 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
| 493 | |||
| 494 | /* | ||
| 495 | * Does the CPU have callbacks ready to be invoked? | 514 | * Does the CPU have callbacks ready to be invoked? | 
| 496 | */ | 515 | */ | 
| 497 | static int | 516 | static int | 
| @@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) | |||
| 954 | preempt_disable(); | 973 | preempt_disable(); | 
| 955 | rdp = this_cpu_ptr(&rcu_sched_data); | 974 | rdp = this_cpu_ptr(&rcu_sched_data); | 
| 956 | rnp = rdp->mynode; | 975 | rnp = rdp->mynode; | 
| 957 | ret = (rdp->grpmask & rnp->qsmaskinit) || | 976 | ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || | 
| 958 | !rcu_scheduler_fully_active; | 977 | !rcu_scheduler_fully_active; | 
| 959 | preempt_enable(); | 978 | preempt_enable(); | 
| 960 | return ret; | 979 | return ret; | 
| @@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) | |||
| 1196 | } else { | 1215 | } else { | 
| 1197 | j = jiffies; | 1216 | j = jiffies; | 
| 1198 | gpa = ACCESS_ONCE(rsp->gp_activity); | 1217 | gpa = ACCESS_ONCE(rsp->gp_activity); | 
| 1199 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", | 1218 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", | 
| 1200 | rsp->name, j - gpa, j, gpa, | 1219 | rsp->name, j - gpa, j, gpa, | 
| 1201 | jiffies_till_next_fqs); | 1220 | jiffies_till_next_fqs, | 
| 1221 | rcu_get_root(rsp)->qsmask); | ||
| 1202 | /* In this case, the current CPU might be at fault. */ | 1222 | /* In this case, the current CPU might be at fault. */ | 
| 1203 | sched_show_task(current); | 1223 | sched_show_task(current); | 
| 1204 | } | 1224 | } | 
| @@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void) | |||
| 1328 | } | 1348 | } | 
| 1329 | 1349 | ||
| 1330 | /* | 1350 | /* | 
| 1331 | * Initialize the specified rcu_data structure's callback list to empty. | 1351 | * Initialize the specified rcu_data structure's default callback list | 
| 1352 | * to empty. The default callback list is the one that is not used by | ||
| 1353 | * no-callbacks CPUs. | ||
| 1332 | */ | 1354 | */ | 
| 1333 | static void init_callback_list(struct rcu_data *rdp) | 1355 | static void init_default_callback_list(struct rcu_data *rdp) | 
| 1334 | { | 1356 | { | 
| 1335 | int i; | 1357 | int i; | 
| 1336 | 1358 | ||
| 1337 | if (init_nocb_callback_list(rdp)) | ||
| 1338 | return; | ||
| 1339 | rdp->nxtlist = NULL; | 1359 | rdp->nxtlist = NULL; | 
| 1340 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1360 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 
| 1341 | rdp->nxttail[i] = &rdp->nxtlist; | 1361 | rdp->nxttail[i] = &rdp->nxtlist; | 
| 1342 | } | 1362 | } | 
| 1343 | 1363 | ||
| 1344 | /* | 1364 | /* | 
| 1365 | * Initialize the specified rcu_data structure's callback list to empty. | ||
| 1366 | */ | ||
| 1367 | static void init_callback_list(struct rcu_data *rdp) | ||
| 1368 | { | ||
| 1369 | if (init_nocb_callback_list(rdp)) | ||
| 1370 | return; | ||
| 1371 | init_default_callback_list(rdp); | ||
| 1372 | } | ||
| 1373 | |||
| 1374 | /* | ||
| 1345 | * Determine the value that ->completed will have at the end of the | 1375 | * Determine the value that ->completed will have at the end of the | 
| 1346 | * next subsequent grace period. This is used to tag callbacks so that | 1376 | * next subsequent grace period. This is used to tag callbacks so that | 
| 1347 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 1377 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | 
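The hunk above splits the callback-list setup so that init_default_callback_list() only resets the segmented list, while init_callback_list() keeps the no-CBs check. The underlying data structure is one singly linked list with an array of tail pointers marking segment ends; the sketch below is a user-space model of that idiom. The names (cb, seglist, NSEG) are illustrative, not kernel identifiers; only the "when empty, every tail points at the head pointer" invariant is taken from the diff, and the bookkeeping that keeps later tails in step is omitted.

    /* Stand-alone model of the rcu_data callback-list layout. */
    #include <stdio.h>

    #define NSEG 4                      /* stands in for RCU_NEXT_SIZE */

    struct cb {
        struct cb *next;
        int id;
    };

    struct seglist {
        struct cb *head;                /* plays the role of ->nxtlist   */
        struct cb **tail[NSEG];         /* plays the role of ->nxttail[] */
    };

    static void seglist_init(struct seglist *sl)   /* ~init_default_callback_list() */
    {
        int i;

        sl->head = NULL;
        for (i = 0; i < NSEG; i++)
            sl->tail[i] = &sl->head;    /* empty: all tails are the head pointer */
    }

    static void seglist_enqueue(struct seglist *sl, struct cb *c)
    {
        c->next = NULL;
        *sl->tail[NSEG - 1] = c;        /* append to the final segment */
        sl->tail[NSEG - 1] = &c->next;  /* (real code also advances aliased tails) */
    }

    int main(void)
    {
        struct seglist sl;
        struct cb a = { .id = 1 }, b = { .id = 2 };
        struct cb *p;

        seglist_init(&sl);
        seglist_enqueue(&sl, &a);
        seglist_enqueue(&sl, &b);
        for (p = sl.head; p; p = p->next)
            printf("callback %d\n", p->id);
        return 0;
    }

Pointing every tail at &head is what makes "empty" and "just reinitialized" indistinguishable, which is why the split helper can be reused from the early-boot path later in this diff.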
| @@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1703 | */ | 1733 | */ | 
| 1704 | static int rcu_gp_init(struct rcu_state *rsp) | 1734 | static int rcu_gp_init(struct rcu_state *rsp) | 
| 1705 | { | 1735 | { | 
| 1736 | unsigned long oldmask; | ||
| 1706 | struct rcu_data *rdp; | 1737 | struct rcu_data *rdp; | 
| 1707 | struct rcu_node *rnp = rcu_get_root(rsp); | 1738 | struct rcu_node *rnp = rcu_get_root(rsp); | 
| 1708 | 1739 | ||
| 1709 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 1740 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 
| 1710 | rcu_bind_gp_kthread(); | ||
| 1711 | raw_spin_lock_irq(&rnp->lock); | 1741 | raw_spin_lock_irq(&rnp->lock); | 
| 1712 | smp_mb__after_unlock_lock(); | 1742 | smp_mb__after_unlock_lock(); | 
| 1713 | if (!ACCESS_ONCE(rsp->gp_flags)) { | 1743 | if (!ACCESS_ONCE(rsp->gp_flags)) { | 
| @@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1733 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 1763 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | 
| 1734 | raw_spin_unlock_irq(&rnp->lock); | 1764 | raw_spin_unlock_irq(&rnp->lock); | 
| 1735 | 1765 | ||
| 1736 | /* Exclude any concurrent CPU-hotplug operations. */ | 1766 | /* | 
| 1737 | mutex_lock(&rsp->onoff_mutex); | 1767 | * Apply per-leaf buffered online and offline operations to the | 
| 1738 | smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ | 1768 | * rcu_node tree. Note that this new grace period need not wait | 
| 1769 | * for subsequent online CPUs, and that quiescent-state forcing | ||
| 1770 | * will handle subsequent offline CPUs. | ||
| 1771 | */ | ||
| 1772 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 1773 | raw_spin_lock_irq(&rnp->lock); | ||
| 1774 | smp_mb__after_unlock_lock(); | ||
| 1775 | if (rnp->qsmaskinit == rnp->qsmaskinitnext && | ||
| 1776 | !rnp->wait_blkd_tasks) { | ||
| 1777 | /* Nothing to do on this leaf rcu_node structure. */ | ||
| 1778 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1779 | continue; | ||
| 1780 | } | ||
| 1781 | |||
| 1782 | /* Record old state, apply changes to ->qsmaskinit field. */ | ||
| 1783 | oldmask = rnp->qsmaskinit; | ||
| 1784 | rnp->qsmaskinit = rnp->qsmaskinitnext; | ||
| 1785 | |||
| 1786 | /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ | ||
| 1787 | if (!oldmask != !rnp->qsmaskinit) { | ||
| 1788 | if (!oldmask) /* First online CPU for this rcu_node. */ | ||
| 1789 | rcu_init_new_rnp(rnp); | ||
| 1790 | else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ | ||
| 1791 | rnp->wait_blkd_tasks = true; | ||
| 1792 | else /* Last offline CPU and can propagate. */ | ||
| 1793 | rcu_cleanup_dead_rnp(rnp); | ||
| 1794 | } | ||
| 1795 | |||
| 1796 | /* | ||
| 1797 | * If all waited-on tasks from prior grace period are | ||
| 1798 | * done, and if all this rcu_node structure's CPUs are | ||
| 1799 | * still offline, propagate up the rcu_node tree and | ||
| 1800 | * clear ->wait_blkd_tasks. Otherwise, if one of this | ||
| 1801 | * rcu_node structure's CPUs has since come back online, | ||
| 1802 | * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() | ||
| 1803 | * checks for this, so just call it unconditionally). | ||
| 1804 | */ | ||
| 1805 | if (rnp->wait_blkd_tasks && | ||
| 1806 | (!rcu_preempt_has_tasks(rnp) || | ||
| 1807 | rnp->qsmaskinit)) { | ||
| 1808 | rnp->wait_blkd_tasks = false; | ||
| 1809 | rcu_cleanup_dead_rnp(rnp); | ||
| 1810 | } | ||
| 1811 | |||
| 1812 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1813 | } | ||
| 1739 | 1814 | ||
| 1740 | /* | 1815 | /* | 
| 1741 | * Set the quiescent-state-needed bits in all the rcu_node | 1816 | * Set the quiescent-state-needed bits in all the rcu_node | 
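The new loop in rcu_gp_init() above folds the buffered ->qsmaskinitnext into ->qsmaskinit once per grace period, and only propagates up the tree when the leaf's mask changes between zero and nonzero. The model below isolates just that decision; first_online/last_offline handling is reduced to printf placeholders for rcu_init_new_rnp(), the ->wait_blkd_tasks case, and rcu_cleanup_dead_rnp(), none of which are reproduced here.

    /* Minimal model of the "did zero-ness change?" test in rcu_gp_init(). */
    #include <stdbool.h>
    #include <stdio.h>

    static void apply_buffered_hotplug(unsigned long *qsmaskinit,
                                       unsigned long qsmaskinitnext,
                                       bool blocked_tasks)
    {
        unsigned long oldmask = *qsmaskinit;

        *qsmaskinit = qsmaskinitnext;       /* fold in buffered online/offline */

        if (!oldmask == !*qsmaskinit)
            return;                         /* zero-ness unchanged: nothing to propagate */

        if (!oldmask)
            printf("first CPU online: propagate setup up the tree\n");
        else if (blocked_tasks)
            printf("last CPU offline but readers blocked: set wait_blkd_tasks\n");
        else
            printf("last CPU offline: propagate cleanup up the tree\n");
    }

    int main(void)
    {
        unsigned long init = 0x0;

        apply_buffered_hotplug(&init, 0x3, false);  /* leaf goes from empty to CPUs 0-1   */
        apply_buffered_hotplug(&init, 0x0, true);   /* all CPUs gone, blocked readers left */
        return 0;
    }

Because only zero-to-nonzero and nonzero-to-zero transitions matter, a leaf whose set of online CPUs merely shrinks or grows needs no tree walk at all at grace-period start.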
| @@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1757 | rcu_preempt_check_blocked_tasks(rnp); | 1832 | rcu_preempt_check_blocked_tasks(rnp); | 
| 1758 | rnp->qsmask = rnp->qsmaskinit; | 1833 | rnp->qsmask = rnp->qsmaskinit; | 
| 1759 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; | 1834 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; | 
| 1760 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1835 | if (WARN_ON_ONCE(rnp->completed != rsp->completed)) | 
| 1761 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 1836 | ACCESS_ONCE(rnp->completed) = rsp->completed; | 
| 1762 | if (rnp == rdp->mynode) | 1837 | if (rnp == rdp->mynode) | 
| 1763 | (void)__note_gp_changes(rsp, rnp, rdp); | 1838 | (void)__note_gp_changes(rsp, rnp, rdp); | 
| 1764 | rcu_preempt_boost_start_gp(rnp); | 1839 | rcu_preempt_boost_start_gp(rnp); | 
| @@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
| 1768 | raw_spin_unlock_irq(&rnp->lock); | 1843 | raw_spin_unlock_irq(&rnp->lock); | 
| 1769 | cond_resched_rcu_qs(); | 1844 | cond_resched_rcu_qs(); | 
| 1770 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 1845 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | 
| 1846 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && | ||
| 1847 | gp_init_delay > 0 && | ||
| 1848 | !(rsp->gpnum % (rcu_num_nodes * 10))) | ||
| 1849 | schedule_timeout_uninterruptible(gp_init_delay); | ||
| 1771 | } | 1850 | } | 
| 1772 | 1851 | ||
| 1773 | mutex_unlock(&rsp->onoff_mutex); | ||
| 1774 | return 1; | 1852 | return 1; | 
| 1775 | } | 1853 | } | 
| 1776 | 1854 | ||
| @@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
| 1798 | fqs_state = RCU_FORCE_QS; | 1876 | fqs_state = RCU_FORCE_QS; | 
| 1799 | } else { | 1877 | } else { | 
| 1800 | /* Handle dyntick-idle and offline CPUs. */ | 1878 | /* Handle dyntick-idle and offline CPUs. */ | 
| 1801 | isidle = false; | 1879 | isidle = true; | 
| 1802 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 1880 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | 
| 1803 | } | 1881 | } | 
| 1804 | /* Clear flag to prevent immediate re-entry. */ | 1882 | /* Clear flag to prevent immediate re-entry. */ | 
| @@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
| 1852 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1930 | rcu_for_each_node_breadth_first(rsp, rnp) { | 
| 1853 | raw_spin_lock_irq(&rnp->lock); | 1931 | raw_spin_lock_irq(&rnp->lock); | 
| 1854 | smp_mb__after_unlock_lock(); | 1932 | smp_mb__after_unlock_lock(); | 
| 1933 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | ||
| 1934 | WARN_ON_ONCE(rnp->qsmask); | ||
| 1855 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 1935 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | 
| 1856 | rdp = this_cpu_ptr(rsp->rda); | 1936 | rdp = this_cpu_ptr(rsp->rda); | 
| 1857 | if (rnp == rdp->mynode) | 1937 | if (rnp == rdp->mynode) | 
| @@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
| 1895 | struct rcu_state *rsp = arg; | 1975 | struct rcu_state *rsp = arg; | 
| 1896 | struct rcu_node *rnp = rcu_get_root(rsp); | 1976 | struct rcu_node *rnp = rcu_get_root(rsp); | 
| 1897 | 1977 | ||
| 1978 | rcu_bind_gp_kthread(); | ||
| 1898 | for (;;) { | 1979 | for (;;) { | 
| 1899 | 1980 | ||
| 1900 | /* Handle grace-period start. */ | 1981 | /* Handle grace-period start. */ | 
| @@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 2062 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. | 2143 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. | 
| 2063 | * Allows quiescent states for a group of CPUs to be reported at one go | 2144 | * Allows quiescent states for a group of CPUs to be reported at one go | 
| 2064 | * to the specified rcu_node structure, though all the CPUs in the group | 2145 | * to the specified rcu_node structure, though all the CPUs in the group | 
| 2065 | * must be represented by the same rcu_node structure (which need not be | 2146 | * must be represented by the same rcu_node structure (which need not be a | 
| 2066 | * a leaf rcu_node structure, though it often will be). That structure's | 2147 | * leaf rcu_node structure, though it often will be). The gps parameter | 
| 2067 | * lock must be held upon entry, and it is released before return. | 2148 | * is the grace-period snapshot, which means that the quiescent states | 
| 2149 | * are valid only if rnp->gpnum is equal to gps. That structure's lock | ||
| 2150 | * must be held upon entry, and it is released before return. | ||
| 2068 | */ | 2151 | */ | 
| 2069 | static void | 2152 | static void | 
| 2070 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | 2153 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | 
| 2071 | struct rcu_node *rnp, unsigned long flags) | 2154 | struct rcu_node *rnp, unsigned long gps, unsigned long flags) | 
| 2072 | __releases(rnp->lock) | 2155 | __releases(rnp->lock) | 
| 2073 | { | 2156 | { | 
| 2157 | unsigned long oldmask = 0; | ||
| 2074 | struct rcu_node *rnp_c; | 2158 | struct rcu_node *rnp_c; | 
| 2075 | 2159 | ||
| 2076 | /* Walk up the rcu_node hierarchy. */ | 2160 | /* Walk up the rcu_node hierarchy. */ | 
| 2077 | for (;;) { | 2161 | for (;;) { | 
| 2078 | if (!(rnp->qsmask & mask)) { | 2162 | if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { | 
| 2079 | 2163 | ||
| 2080 | /* Our bit has already been cleared, so done. */ | 2164 | /* | 
| 2165 | * Our bit has already been cleared, or the | ||
| 2166 | * relevant grace period is already over, so done. | ||
| 2167 | */ | ||
| 2081 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2168 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 2082 | return; | 2169 | return; | 
| 2083 | } | 2170 | } | 
| 2171 | WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ | ||
| 2084 | rnp->qsmask &= ~mask; | 2172 | rnp->qsmask &= ~mask; | 
| 2085 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | 2173 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | 
| 2086 | mask, rnp->qsmask, rnp->level, | 2174 | mask, rnp->qsmask, rnp->level, | 
| @@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2104 | rnp = rnp->parent; | 2192 | rnp = rnp->parent; | 
| 2105 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2193 | raw_spin_lock_irqsave(&rnp->lock, flags); | 
| 2106 | smp_mb__after_unlock_lock(); | 2194 | smp_mb__after_unlock_lock(); | 
| 2107 | WARN_ON_ONCE(rnp_c->qsmask); | 2195 | oldmask = rnp_c->qsmask; | 
| 2108 | } | 2196 | } | 
| 2109 | 2197 | ||
| 2110 | /* | 2198 | /* | 
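rcu_report_qs_rnp() now carries a grace-period snapshot (gps) and bails out if the rcu_node has moved on to a different grace period, which is what lets callers drop the leaf lock before the upward walk. The toy below mirrors only that control flow; struct node and its initial values are invented for the demo.

    /* Toy model of the upward quiescent-state walk with a GP snapshot. */
    #include <stdio.h>

    struct node {
        struct node *parent;
        unsigned long qsmask;   /* CPUs/children still owing a QS    */
        unsigned long grpmask;  /* this node's bit in parent->qsmask */
        unsigned long gpnum;    /* current grace-period number       */
    };

    static void report_qs(struct node *np, unsigned long mask, unsigned long gps)
    {
        for (;;) {
            if (!(np->qsmask & mask) || np->gpnum != gps)
                return;             /* already cleared, or stale grace period */
            np->qsmask &= ~mask;
            if (np->qsmask)
                return;             /* others still owe a quiescent state */
            if (!np->parent)
                break;              /* root is clear: grace period can end */
            mask = np->grpmask;
            np = np->parent;
        }
        printf("grace period %lu: all quiescent states seen\n", gps);
    }

    int main(void)
    {
        struct node root = { .qsmask = 0x1, .gpnum = 42 };
        struct node leaf = { .parent = &root, .qsmask = 0x3,
                             .grpmask = 0x1, .gpnum = 42 };

        report_qs(&leaf, 0x1, 42);  /* CPU 0 reports; leaf still waits on CPU 1 */
        report_qs(&leaf, 0x2, 42);  /* CPU 1 reports; clears leaf, then root    */
        return 0;
    }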
| @@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 2116 | } | 2204 | } | 
| 2117 | 2205 | ||
| 2118 | /* | 2206 | /* | 
| 2207 | * Record a quiescent state for all tasks that were previously queued | ||
| 2208 | * on the specified rcu_node structure and that were blocking the current | ||
| 2209 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
| 2210 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
| 2211 | * disabled. | ||
| 2212 | */ | ||
| 2213 | static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, | ||
| 2214 | struct rcu_node *rnp, unsigned long flags) | ||
| 2215 | __releases(rnp->lock) | ||
| 2216 | { | ||
| 2217 | unsigned long gps; | ||
| 2218 | unsigned long mask; | ||
| 2219 | struct rcu_node *rnp_p; | ||
| 2220 | |||
| 2221 | if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || | ||
| 2222 | rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 2223 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2224 | return; /* Still need more quiescent states! */ | ||
| 2225 | } | ||
| 2226 | |||
| 2227 | rnp_p = rnp->parent; | ||
| 2228 | if (rnp_p == NULL) { | ||
| 2229 | /* | ||
| 2230 | * Only one rcu_node structure in the tree, so don't | ||
| 2231 | * try to report up to its nonexistent parent! | ||
| 2232 | */ | ||
| 2233 | rcu_report_qs_rsp(rsp, flags); | ||
| 2234 | return; | ||
| 2235 | } | ||
| 2236 | |||
| 2237 | /* Report up the rest of the hierarchy, tracking current ->gpnum. */ | ||
| 2238 | gps = rnp->gpnum; | ||
| 2239 | mask = rnp->grpmask; | ||
| 2240 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2241 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
| 2242 | smp_mb__after_unlock_lock(); | ||
| 2243 | rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); | ||
| 2244 | } | ||
| 2245 | |||
| 2246 | /* | ||
| 2119 | * Record a quiescent state for the specified CPU to that CPU's rcu_data | 2247 | * Record a quiescent state for the specified CPU to that CPU's rcu_data | 
| 2120 | * structure. This must be either called from the specified CPU, or | 2248 | * structure. This must be either called from the specified CPU, or | 
| 2121 | * called when the specified CPU is known to be offline (and when it is | 2249 | * called when the specified CPU is known to be offline (and when it is | 
| @@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2163 | */ | 2291 | */ | 
| 2164 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); | 2292 | needwake = rcu_accelerate_cbs(rsp, rnp, rdp); | 
| 2165 | 2293 | ||
| 2166 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | 2294 | rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); | 
| 2295 | /* ^^^ Released rnp->lock */ | ||
| 2167 | if (needwake) | 2296 | if (needwake) | 
| 2168 | rcu_gp_kthread_wake(rsp); | 2297 | rcu_gp_kthread_wake(rsp); | 
| 2169 | } | 2298 | } | 
| @@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
| 2256 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | 2385 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | 
| 2257 | } | 2386 | } | 
| 2258 | 2387 | ||
| 2259 | /* Finally, initialize the rcu_data structure's list to empty. */ | 2388 | /* | 
| 2389 | * Finally, initialize the rcu_data structure's list to empty and | ||
| 2390 | * disallow further callbacks on this CPU. | ||
| 2391 | */ | ||
| 2260 | init_callback_list(rdp); | 2392 | init_callback_list(rdp); | 
| 2393 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 2261 | } | 2394 | } | 
| 2262 | 2395 | ||
| 2263 | /* | 2396 | /* | 
| @@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2355 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 2488 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 
| 2356 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | 2489 | smp_mb__after_unlock_lock(); /* GP memory ordering. */ | 
| 2357 | rnp->qsmaskinit &= ~mask; | 2490 | rnp->qsmaskinit &= ~mask; | 
| 2491 | rnp->qsmask &= ~mask; | ||
| 2358 | if (rnp->qsmaskinit) { | 2492 | if (rnp->qsmaskinit) { | 
| 2359 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2493 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 
| 2360 | return; | 2494 | return; | 
| @@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2364 | } | 2498 | } | 
| 2365 | 2499 | ||
| 2366 | /* | 2500 | /* | 
| 2501 | * The CPU is exiting the idle loop into the arch_cpu_idle_dead() | ||
| 2502 | * function. We now remove it from the rcu_node tree's ->qsmaskinit | ||
| 2503 | * bit masks. | ||
| 2504 | */ | ||
| 2505 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | ||
| 2506 | { | ||
| 2507 | unsigned long flags; | ||
| 2508 | unsigned long mask; | ||
| 2509 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2510 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | ||
| 2511 | |||
| 2512 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | ||
| 2513 | mask = rdp->grpmask; | ||
| 2514 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2515 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
| 2516 | rnp->qsmaskinitnext &= ~mask; | ||
| 2517 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2518 | } | ||
| 2519 | |||
| 2520 | /* | ||
| 2367 | * The CPU has been completely removed, and some other CPU is reporting | 2521 | * The CPU has been completely removed, and some other CPU is reporting | 
| 2368 | * this fact from process context. Do the remainder of the cleanup, | 2522 | * this fact from process context. Do the remainder of the cleanup, | 
| 2369 | * including orphaning the outgoing CPU's RCU callbacks, and also | 2523 | * including orphaning the outgoing CPU's RCU callbacks, and also | 
| @@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 2379 | /* Adjust any no-longer-needed kthreads. */ | 2533 | /* Adjust any no-longer-needed kthreads. */ | 
| 2380 | rcu_boost_kthread_setaffinity(rnp, -1); | 2534 | rcu_boost_kthread_setaffinity(rnp, -1); | 
| 2381 | 2535 | ||
| 2382 | /* Exclude any attempts to start a new grace period. */ | ||
| 2383 | mutex_lock(&rsp->onoff_mutex); | ||
| 2384 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
| 2385 | |||
| 2386 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 2536 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 
| 2537 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
| 2387 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 2538 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | 
| 2388 | rcu_adopt_orphan_cbs(rsp, flags); | 2539 | rcu_adopt_orphan_cbs(rsp, flags); | 
| 2389 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 2540 | raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); | 
| 2390 | 2541 | ||
| 2391 | /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ | ||
| 2392 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2393 | smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ | ||
| 2394 | rnp->qsmaskinit &= ~rdp->grpmask; | ||
| 2395 | if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) | ||
| 2396 | rcu_cleanup_dead_rnp(rnp); | ||
| 2397 | rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */ | ||
| 2398 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 2542 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 
| 2399 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 2543 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 
| 2400 | cpu, rdp->qlen, rdp->nxtlist); | 2544 | cpu, rdp->qlen, rdp->nxtlist); | 
| 2401 | init_callback_list(rdp); | ||
| 2402 | /* Disallow further callbacks on this CPU. */ | ||
| 2403 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 2404 | mutex_unlock(&rsp->onoff_mutex); | ||
| 2405 | } | 2545 | } | 
| 2406 | 2546 | ||
| 2407 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 2547 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 
| @@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) | |||
| 2414 | { | 2554 | { | 
| 2415 | } | 2555 | } | 
| 2416 | 2556 | ||
| 2557 | static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) | ||
| 2558 | { | ||
| 2559 | } | ||
| 2560 | |||
| 2417 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 2561 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 
| 2418 | { | 2562 | { | 
| 2419 | } | 2563 | } | 
| @@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp, | |||
| 2589 | return; | 2733 | return; | 
| 2590 | } | 2734 | } | 
| 2591 | if (rnp->qsmask == 0) { | 2735 | if (rnp->qsmask == 0) { | 
| 2592 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 2736 | if (rcu_state_p == &rcu_sched_state || | 
| 2593 | continue; | 2737 | rsp != rcu_state_p || | 
| 2738 | rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 2739 | /* | ||
| 2740 | * No point in scanning bits because they | ||
| 2741 | * are all zero. But we might need to | ||
| 2742 | * priority-boost blocked readers. | ||
| 2743 | */ | ||
| 2744 | rcu_initiate_boost(rnp, flags); | ||
| 2745 | /* rcu_initiate_boost() releases rnp->lock */ | ||
| 2746 | continue; | ||
| 2747 | } | ||
| 2748 | if (rnp->parent && | ||
| 2749 | (rnp->parent->qsmask & rnp->grpmask)) { | ||
| 2750 | /* | ||
| 2751 | * Race between grace-period | ||
| 2752 | * initialization and task exiting RCU | ||
| 2753 | * read-side critical section: Report. | ||
| 2754 | */ | ||
| 2755 | rcu_report_unblock_qs_rnp(rsp, rnp, flags); | ||
| 2756 | /* rcu_report_unblock_qs_rnp() rlses ->lock */ | ||
| 2757 | continue; | ||
| 2758 | } | ||
| 2594 | } | 2759 | } | 
| 2595 | cpu = rnp->grplo; | 2760 | cpu = rnp->grplo; | 
| 2596 | bit = 1; | 2761 | bit = 1; | 
| 2597 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 2762 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | 
| 2598 | if ((rnp->qsmask & bit) != 0) { | 2763 | if ((rnp->qsmask & bit) != 0) { | 
| 2599 | if ((rnp->qsmaskinit & bit) != 0) | 2764 | if ((rnp->qsmaskinit & bit) == 0) | 
| 2600 | *isidle = false; | 2765 | *isidle = false; /* Pending hotplug. */ | 
| 2601 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 2766 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | 
| 2602 | mask |= bit; | 2767 | mask |= bit; | 
| 2603 | } | 2768 | } | 
| 2604 | } | 2769 | } | 
| 2605 | if (mask != 0) { | 2770 | if (mask != 0) { | 
| 2606 | 2771 | /* Idle/offline CPUs, report (releases rnp->lock). */ | |
| 2607 | /* rcu_report_qs_rnp() releases rnp->lock. */ | 2772 | rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); | 
| 2608 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | 2773 | } else { | 
| 2609 | continue; | 2774 | /* Nothing to do here, so just drop the lock. */ | 
| 2775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2610 | } | 2776 | } | 
| 2611 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2612 | } | 2777 | } | 
| 2613 | } | 2778 | } | 
| 2614 | 2779 | ||
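The reworked force_qs_rnp() loop above scans each leaf's still-pending CPUs, accumulates a mask of the ones found idle or offline, and reports that whole mask at once (or simply drops the lock when nothing was found). The sketch below models only the per-leaf scan; cpu_is_quiescent() is a stand-in predicate, not a kernel helper, and the chosen masks are arbitrary.

    /* Compact model of the per-leaf scan in force_qs_rnp(). */
    #include <stdbool.h>
    #include <stdio.h>

    static bool cpu_is_quiescent(int cpu)
    {
        return cpu == 2 || cpu == 5;    /* pretend CPUs 2 and 5 are idle */
    }

    int main(void)
    {
        unsigned long qsmask = 0x3c;    /* leaf still waiting on CPUs 2-5 */
        unsigned long mask = 0;
        unsigned long bit = 1;
        int grplo = 0, grphi = 7, cpu;

        for (cpu = grplo; cpu <= grphi; cpu++, bit <<= 1)
            if ((qsmask & bit) && cpu_is_quiescent(cpu))
                mask |= bit;

        if (mask)
            printf("reporting quiescent states for mask %#lx\n", mask);
        else
            printf("nothing to report for this leaf\n");
        return 0;
    }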
| @@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 2741 | * If called from an extended quiescent state, invoke the RCU | 2906 | * If called from an extended quiescent state, invoke the RCU | 
| 2742 | * core in order to force a re-evaluation of RCU's idleness. | 2907 | * core in order to force a re-evaluation of RCU's idleness. | 
| 2743 | */ | 2908 | */ | 
| 2744 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) | 2909 | if (!rcu_is_watching()) | 
| 2745 | invoke_rcu_core(); | 2910 | invoke_rcu_core(); | 
| 2746 | 2911 | ||
| 2747 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 2912 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | 
| @@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 2827 | 2992 | ||
| 2828 | if (cpu != -1) | 2993 | if (cpu != -1) | 
| 2829 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2994 | rdp = per_cpu_ptr(rsp->rda, cpu); | 
| 2830 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); | 2995 | if (likely(rdp->mynode)) { | 
| 2831 | WARN_ON_ONCE(offline); | 2996 | /* Post-boot, so this should be for a no-CBs CPU. */ | 
| 2832 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | 2997 | offline = !__call_rcu_nocb(rdp, head, lazy, flags); | 
| 2833 | local_irq_restore(flags); | 2998 | WARN_ON_ONCE(offline); | 
| 2834 | return; | 2999 | /* Offline CPU, _call_rcu() illegal, leak callback. */ | 
| 3000 | local_irq_restore(flags); | ||
| 3001 | return; | ||
| 3002 | } | ||
| 3003 | /* | ||
| 3004 | * Very early boot, before rcu_init(). Initialize if needed | ||
| 3005 | * and then drop through to queue the callback. | ||
| 3006 | */ | ||
| 3007 | BUG_ON(cpu != -1); | ||
| 3008 | WARN_ON_ONCE(!rcu_is_watching()); | ||
| 3009 | if (!likely(rdp->nxtlist)) | ||
| 3010 | init_default_callback_list(rdp); | ||
| 2835 | } | 3011 | } | 
| 2836 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; | 3012 | ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; | 
| 2837 | if (lazy) | 3013 | if (lazy) | 
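The new branch in __call_rcu() above recognizes the "before rcu_init()" case by rdp->mynode still being NULL and initializes the default callback list on first use before queueing. The fragment below is a generic illustration of that initialize-on-first-enqueue pattern; the queue type and helpers are demo-only and not taken from the kernel.

    /* Sketch of lazy initialization on the enqueue path. */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct queue {
        int items[8];
        size_t len;
        bool ready;
    };

    static void queue_init(struct queue *q)     /* ~init_default_callback_list() */
    {
        q->len = 0;
        q->ready = true;
    }

    static void enqueue(struct queue *q, int item)
    {
        if (!q->ready)          /* very early caller: set the queue up first */
            queue_init(q);
        if (q->len < 8)
            q->items[q->len++] = item;
    }

    int main(void)
    {
        static struct queue early_q;    /* zero-initialized, like early per-CPU data */

        enqueue(&early_q, 1);           /* works even before any explicit init call */
        printf("queued %zu item(s)\n", early_q.len);
        return 0;
    }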
| @@ -2954,7 +3130,7 @@ void synchronize_sched(void) | |||
| 2954 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 3130 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | 
| 2955 | if (rcu_blocking_is_gp()) | 3131 | if (rcu_blocking_is_gp()) | 
| 2956 | return; | 3132 | return; | 
| 2957 | if (rcu_expedited) | 3133 | if (rcu_gp_is_expedited()) | 
| 2958 | synchronize_sched_expedited(); | 3134 | synchronize_sched_expedited(); | 
| 2959 | else | 3135 | else | 
| 2960 | wait_rcu_gp(call_rcu_sched); | 3136 | wait_rcu_gp(call_rcu_sched); | 
| @@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void) | |||
| 2981 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 3157 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | 
| 2982 | if (rcu_blocking_is_gp()) | 3158 | if (rcu_blocking_is_gp()) | 
| 2983 | return; | 3159 | return; | 
| 2984 | if (rcu_expedited) | 3160 | if (rcu_gp_is_expedited()) | 
| 2985 | synchronize_rcu_bh_expedited(); | 3161 | synchronize_rcu_bh_expedited(); | 
| 2986 | else | 3162 | else | 
| 2987 | wait_rcu_gp(call_rcu_bh); | 3163 | wait_rcu_gp(call_rcu_bh); | 
| @@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void) | |||
| 3518 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 3694 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | 
| 3519 | 3695 | ||
| 3520 | /* | 3696 | /* | 
| 3697 | * Propagate ->qsmaskinit bits up the rcu_node tree to account for the | ||
| 3698 | * first CPU in a given leaf rcu_node structure coming online. The caller | ||
| 3699 | * must hold the corresponding leaf rcu_node ->lock with interrupts | ||
| 3700 | * disabled. | ||
| 3701 | */ | ||
| 3702 | static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) | ||
| 3703 | { | ||
| 3704 | long mask; | ||
| 3705 | struct rcu_node *rnp = rnp_leaf; | ||
| 3706 | |||
| 3707 | for (;;) { | ||
| 3708 | mask = rnp->grpmask; | ||
| 3709 | rnp = rnp->parent; | ||
| 3710 | if (rnp == NULL) | ||
| 3711 | return; | ||
| 3712 | raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ | ||
| 3713 | rnp->qsmaskinit |= mask; | ||
| 3714 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | ||
| 3715 | } | ||
| 3716 | } | ||
| 3717 | |||
| 3718 | /* | ||
| 3521 | * Do boot-time initialization of a CPU's per-CPU RCU data. | 3719 | * Do boot-time initialization of a CPU's per-CPU RCU data. | 
| 3522 | */ | 3720 | */ | 
| 3523 | static void __init | 3721 | static void __init | 
| @@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 3553 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 3751 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 
| 3554 | struct rcu_node *rnp = rcu_get_root(rsp); | 3752 | struct rcu_node *rnp = rcu_get_root(rsp); | 
| 3555 | 3753 | ||
| 3556 | /* Exclude new grace periods. */ | ||
| 3557 | mutex_lock(&rsp->onoff_mutex); | ||
| 3558 | |||
| 3559 | /* Set up local state, ensuring consistent view of global state. */ | 3754 | /* Set up local state, ensuring consistent view of global state. */ | 
| 3560 | raw_spin_lock_irqsave(&rnp->lock, flags); | 3755 | raw_spin_lock_irqsave(&rnp->lock, flags); | 
| 3561 | rdp->beenonline = 1; /* We have now been online. */ | 3756 | rdp->beenonline = 1; /* We have now been online. */ | 
| 3562 | rdp->qlen_last_fqs_check = 0; | 3757 | rdp->qlen_last_fqs_check = 0; | 
| 3563 | rdp->n_force_qs_snap = rsp->n_force_qs; | 3758 | rdp->n_force_qs_snap = rsp->n_force_qs; | 
| 3564 | rdp->blimit = blimit; | 3759 | rdp->blimit = blimit; | 
| 3565 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | 3760 | if (!rdp->nxtlist) | 
| 3761 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | ||
| 3566 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3762 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 
| 3567 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 3763 | rcu_sysidle_init_percpu_data(rdp->dynticks); | 
| 3568 | atomic_set(&rdp->dynticks->dynticks, | 3764 | atomic_set(&rdp->dynticks->dynticks, | 
| 3569 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 3765 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 
| 3570 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 3766 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 
| 3571 | 3767 | ||
| 3572 | /* Add CPU to rcu_node bitmasks. */ | 3768 | /* | 
| 3769 | * Add CPU to leaf rcu_node pending-online bitmask. Any needed | ||
| 3770 | * propagation up the rcu_node tree will happen at the beginning | ||
| 3771 | * of the next grace period. | ||
| 3772 | */ | ||
| 3573 | rnp = rdp->mynode; | 3773 | rnp = rdp->mynode; | 
| 3574 | mask = rdp->grpmask; | 3774 | mask = rdp->grpmask; | 
| 3575 | do { | 3775 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 
| 3576 | /* Exclude any attempts to start a new GP on small systems. */ | 3776 | smp_mb__after_unlock_lock(); | 
| 3577 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 3777 | rnp->qsmaskinitnext |= mask; | 
| 3578 | rnp->qsmaskinit |= mask; | 3778 | rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ | 
| 3579 | mask = rnp->grpmask; | 3779 | rdp->completed = rnp->completed; | 
| 3580 | if (rnp == rdp->mynode) { | 3780 | rdp->passed_quiesce = false; | 
| 3581 | /* | 3781 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | 
| 3582 | * If there is a grace period in progress, we will | 3782 | rdp->qs_pending = false; | 
| 3583 | * set up to wait for it next time we run the | 3783 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | 
| 3584 | * RCU core code. | 3784 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 3585 | */ | ||
| 3586 | rdp->gpnum = rnp->completed; | ||
| 3587 | rdp->completed = rnp->completed; | ||
| 3588 | rdp->passed_quiesce = 0; | ||
| 3589 | rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); | ||
| 3590 | rdp->qs_pending = 0; | ||
| 3591 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | ||
| 3592 | } | ||
| 3593 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | ||
| 3594 | rnp = rnp->parent; | ||
| 3595 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | ||
| 3596 | local_irq_restore(flags); | ||
| 3597 | |||
| 3598 | mutex_unlock(&rsp->onoff_mutex); | ||
| 3599 | } | 3785 | } | 
| 3600 | 3786 | ||
| 3601 | static void rcu_prepare_cpu(int cpu) | 3787 | static void rcu_prepare_cpu(int cpu) | 
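After this hunk, onlining a CPU only sets its bit in the leaf's ->qsmaskinitnext (and rcu_cleanup_dying_idle_cpu(), added earlier in the diff, only clears it); reconciling the rest of the tree is deferred to the next grace period, which is what allows onoff_mutex to disappear. The bit-level model below assumes the grpmask is one bit per CPU relative to the leaf's lowest CPU number; that derivation and the struct are demo assumptions, only the set-now/reconcile-later idea comes from the diff.

    /* Bit-level model of the online/offline buffering. */
    #include <stdio.h>

    struct leaf {
        int grplo;                      /* lowest CPU handled by this leaf */
        unsigned long qsmaskinit;       /* what the current GP believes    */
        unsigned long qsmaskinitnext;   /* what the next GP will see       */
    };

    static unsigned long grpmask(const struct leaf *l, int cpu)
    {
        return 1UL << (cpu - l->grplo);
    }

    int main(void)
    {
        struct leaf l = { .grplo = 8, .qsmaskinit = 0x1, .qsmaskinitnext = 0x1 };

        l.qsmaskinitnext |= grpmask(&l, 9);     /* CPU 9 comes online       */
        l.qsmaskinitnext &= ~grpmask(&l, 8);    /* CPU 8 goes offline later */

        /* rcu_gp_init() folds this in under the leaf lock at the next GP: */
        l.qsmaskinit = l.qsmaskinitnext;

        printf("next GP waits on mask %#lx\n", l.qsmaskinit);
        return 0;
    }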
| @@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu) | |||
| 3609 | /* | 3795 | /* | 
| 3610 | * Handle CPU online/offline notification events. | 3796 | * Handle CPU online/offline notification events. | 
| 3611 | */ | 3797 | */ | 
| 3612 | static int rcu_cpu_notify(struct notifier_block *self, | 3798 | int rcu_cpu_notify(struct notifier_block *self, | 
| 3613 | unsigned long action, void *hcpu) | 3799 | unsigned long action, void *hcpu) | 
| 3614 | { | 3800 | { | 
| 3615 | long cpu = (long)hcpu; | 3801 | long cpu = (long)hcpu; | 
| 3616 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 3802 | struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); | 
| 3617 | struct rcu_node *rnp = rdp->mynode; | 3803 | struct rcu_node *rnp = rdp->mynode; | 
| 3618 | struct rcu_state *rsp; | 3804 | struct rcu_state *rsp; | 
| 3619 | 3805 | ||
| 3620 | trace_rcu_utilization(TPS("Start CPU hotplug")); | ||
| 3621 | switch (action) { | 3806 | switch (action) { | 
| 3622 | case CPU_UP_PREPARE: | 3807 | case CPU_UP_PREPARE: | 
| 3623 | case CPU_UP_PREPARE_FROZEN: | 3808 | case CPU_UP_PREPARE_FROZEN: | 
| @@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3637 | for_each_rcu_flavor(rsp) | 3822 | for_each_rcu_flavor(rsp) | 
| 3638 | rcu_cleanup_dying_cpu(rsp); | 3823 | rcu_cleanup_dying_cpu(rsp); | 
| 3639 | break; | 3824 | break; | 
| 3825 | case CPU_DYING_IDLE: | ||
| 3826 | for_each_rcu_flavor(rsp) { | ||
| 3827 | rcu_cleanup_dying_idle_cpu(cpu, rsp); | ||
| 3828 | } | ||
| 3829 | break; | ||
| 3640 | case CPU_DEAD: | 3830 | case CPU_DEAD: | 
| 3641 | case CPU_DEAD_FROZEN: | 3831 | case CPU_DEAD_FROZEN: | 
| 3642 | case CPU_UP_CANCELED: | 3832 | case CPU_UP_CANCELED: | 
| @@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
| 3649 | default: | 3839 | default: | 
| 3650 | break; | 3840 | break; | 
| 3651 | } | 3841 | } | 
| 3652 | trace_rcu_utilization(TPS("End CPU hotplug")); | ||
| 3653 | return NOTIFY_OK; | 3842 | return NOTIFY_OK; | 
| 3654 | } | 3843 | } | 
| 3655 | 3844 | ||
| @@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self, | |||
| 3660 | case PM_HIBERNATION_PREPARE: | 3849 | case PM_HIBERNATION_PREPARE: | 
| 3661 | case PM_SUSPEND_PREPARE: | 3850 | case PM_SUSPEND_PREPARE: | 
| 3662 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 3851 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 
| 3663 | rcu_expedited = 1; | 3852 | rcu_expedite_gp(); | 
| 3664 | break; | 3853 | break; | 
| 3665 | case PM_POST_HIBERNATION: | 3854 | case PM_POST_HIBERNATION: | 
| 3666 | case PM_POST_SUSPEND: | 3855 | case PM_POST_SUSPEND: | 
| 3667 | rcu_expedited = 0; | 3856 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | 
| 3857 | rcu_unexpedite_gp(); | ||
| 3668 | break; | 3858 | break; | 
| 3669 | default: | 3859 | default: | 
| 3670 | break; | 3860 | break; | 
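The PM notifier above now calls rcu_expedite_gp()/rcu_unexpedite_gp() instead of writing the rcu_expedited flag directly; a nesting counter lets independent callers stack their requests without clobbering each other. The sketch below is a user-space model of that counter pattern under the assumption that the boot-time knob and the nesting count are simply OR-ed together; the real kernel implementation may differ in detail.

    /* Hedged sketch of a nesting counter behind an "expedite GPs" switch. */
    #include <stdbool.h>
    #include <stdio.h>

    static int rcu_expedited_boot;      /* stands in for the rcu_expedited knob */
    static int expedited_nesting;

    static void expedite_gp(void)   { expedited_nesting++; }
    static void unexpedite_gp(void) { expedited_nesting--; }

    static bool gp_is_expedited(void)
    {
        return rcu_expedited_boot || expedited_nesting;
    }

    int main(void)
    {
        expedite_gp();                  /* e.g. the suspend path requests expediting */
        expedite_gp();                  /* a second, independent caller does too     */
        unexpedite_gp();                /* first caller is done...                   */
        printf("expedited? %s\n", gp_is_expedited() ? "yes" : "no");  /* still yes */
        unexpedite_gp();
        printf("expedited? %s\n", gp_is_expedited() ? "yes" : "no");  /* now no    */
        return 0;
    }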
| @@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void) | |||
| 3734 | * Compute the per-level fanout, either using the exact fanout specified | 3924 | * Compute the per-level fanout, either using the exact fanout specified | 
| 3735 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. | 3925 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. | 
| 3736 | */ | 3926 | */ | 
| 3737 | #ifdef CONFIG_RCU_FANOUT_EXACT | ||
| 3738 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | ||
| 3739 | { | ||
| 3740 | int i; | ||
| 3741 | |||
| 3742 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | ||
| 3743 | for (i = rcu_num_lvls - 2; i >= 0; i--) | ||
| 3744 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | ||
| 3745 | } | ||
| 3746 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
| 3747 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 3927 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | 
| 3748 | { | 3928 | { | 
| 3749 | int ccur; | ||
| 3750 | int cprv; | ||
| 3751 | int i; | 3929 | int i; | 
| 3752 | 3930 | ||
| 3753 | cprv = nr_cpu_ids; | 3931 | if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { | 
| 3754 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 3932 | rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; | 
| 3755 | ccur = rsp->levelcnt[i]; | 3933 | for (i = rcu_num_lvls - 2; i >= 0; i--) | 
| 3756 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 3934 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | 
| 3757 | cprv = ccur; | 3935 | } else { | 
| 3936 | int ccur; | ||
| 3937 | int cprv; | ||
| 3938 | |||
| 3939 | cprv = nr_cpu_ids; | ||
| 3940 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
| 3941 | ccur = rsp->levelcnt[i]; | ||
| 3942 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | ||
| 3943 | cprv = ccur; | ||
| 3944 | } | ||
| 3758 | } | 3945 | } | 
| 3759 | } | 3946 | } | 
| 3760 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
| 3761 | 3947 | ||
| 3762 | /* | 3948 | /* | 
| 3763 | * Helper function for rcu_init() that initializes one rcu_state structure. | 3949 | * Helper function for rcu_init() that initializes one rcu_state structure. | 
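In the balanced (non-exact) branch above, levelspread[i] = (cprv + ccur - 1) / ccur is simply a ceiling division of "things at the level below" by "nodes at this level". The worked example below uses a made-up two-level tree (1 root, 5 leaves, 72 CPUs) to show the arithmetic.

    /* Worked example of the balanced-fanout ceiling division. */
    #include <stdio.h>

    int main(void)
    {
        int levelcnt[2] = { 1, 5 };     /* rcu_node structures per level */
        int levelspread[2];
        int nr_cpu_ids = 72;
        int cprv = nr_cpu_ids;
        int i;

        for (i = 1; i >= 0; i--) {
            int ccur = levelcnt[i];

            levelspread[i] = (cprv + ccur - 1) / ccur;  /* ceil(cprv / ccur) */
            cprv = ccur;
        }
        /* leaves fan out to ceil(72/5) = 15 CPUs each, the root to 5 leaves */
        printf("levelspread = { %d, %d }\n", levelspread[0], levelspread[1]);
        return 0;
    }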
| @@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 3833 | } | 4019 | } | 
| 3834 | } | 4020 | } | 
| 3835 | 4021 | ||
| 3836 | rsp->rda = rda; | ||
| 3837 | init_waitqueue_head(&rsp->gp_wq); | 4022 | init_waitqueue_head(&rsp->gp_wq); | 
| 3838 | rnp = rsp->level[rcu_num_lvls - 1]; | 4023 | rnp = rsp->level[rcu_num_lvls - 1]; | 
| 3839 | for_each_possible_cpu(i) { | 4024 | for_each_possible_cpu(i) { | 
| @@ -3926,6 +4111,8 @@ void __init rcu_init(void) | |||
| 3926 | { | 4111 | { | 
| 3927 | int cpu; | 4112 | int cpu; | 
| 3928 | 4113 | ||
| 4114 | rcu_early_boot_tests(); | ||
| 4115 | |||
| 3929 | rcu_bootup_announce(); | 4116 | rcu_bootup_announce(); | 
| 3930 | rcu_init_geometry(); | 4117 | rcu_init_geometry(); | 
| 3931 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 4118 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 
| @@ -3942,8 +4129,6 @@ void __init rcu_init(void) | |||
| 3942 | pm_notifier(rcu_pm_notify, 0); | 4129 | pm_notifier(rcu_pm_notify, 0); | 
| 3943 | for_each_online_cpu(cpu) | 4130 | for_each_online_cpu(cpu) | 
| 3944 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 4131 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | 
| 3945 | |||
| 3946 | rcu_early_boot_tests(); | ||
| 3947 | } | 4132 | } | 
| 3948 | 4133 | ||
| 3949 | #include "tree_plugin.h" | 4134 | #include "tree_plugin.h" | 
| diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 119de399eb2f..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -141,12 +141,20 @@ struct rcu_node { | |||
| 141 | /* complete (only for PREEMPT_RCU). */ | 141 | /* complete (only for PREEMPT_RCU). */ | 
| 142 | unsigned long qsmaskinit; | 142 | unsigned long qsmaskinit; | 
| 143 | /* Per-GP initial value for qsmask & expmask. */ | 143 | /* Per-GP initial value for qsmask & expmask. */ | 
| 144 | /* Initialized from ->qsmaskinitnext at the */ | ||
| 145 | /* beginning of each grace period. */ | ||
| 146 | unsigned long qsmaskinitnext; | ||
| 147 | /* Online CPUs for next grace period. */ | ||
| 144 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 148 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 
| 145 | /* Only one bit will be set in this mask. */ | 149 | /* Only one bit will be set in this mask. */ | 
| 146 | int grplo; /* lowest-numbered CPU or group here. */ | 150 | int grplo; /* lowest-numbered CPU or group here. */ | 
| 147 | int grphi; /* highest-numbered CPU or group here. */ | 151 | int grphi; /* highest-numbered CPU or group here. */ | 
| 148 | u8 grpnum; /* CPU/group number for next level up. */ | 152 | u8 grpnum; /* CPU/group number for next level up. */ | 
| 149 | u8 level; /* root is at level 0. */ | 153 | u8 level; /* root is at level 0. */ | 
| 154 | bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ | ||
| 155 | /* exit RCU read-side critical sections */ | ||
| 156 | /* before propagating offline up the */ | ||
| 157 | /* rcu_node tree? */ | ||
| 150 | struct rcu_node *parent; | 158 | struct rcu_node *parent; | 
| 151 | struct list_head blkd_tasks; | 159 | struct list_head blkd_tasks; | 
| 152 | /* Tasks blocked in RCU read-side critical */ | 160 | /* Tasks blocked in RCU read-side critical */ | 
| @@ -448,8 +456,6 @@ struct rcu_state { | |||
| 448 | long qlen; /* Total number of callbacks. */ | 456 | long qlen; /* Total number of callbacks. */ | 
| 449 | /* End of fields guarded by orphan_lock. */ | 457 | /* End of fields guarded by orphan_lock. */ | 
| 450 | 458 | ||
| 451 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | ||
| 452 | |||
| 453 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 459 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 
| 454 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 460 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 
| 455 | struct completion barrier_completion; /* Wake at barrier end. */ | 461 | struct completion barrier_completion; /* Wake at barrier end. */ | 
| @@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu); | |||
| 559 | static void rcu_cleanup_after_idle(void); | 565 | static void rcu_cleanup_after_idle(void); | 
| 560 | static void rcu_prepare_for_idle(void); | 566 | static void rcu_prepare_for_idle(void); | 
| 561 | static void rcu_idle_count_callbacks_posted(void); | 567 | static void rcu_idle_count_callbacks_posted(void); | 
| 568 | static bool rcu_preempt_has_tasks(struct rcu_node *rnp); | ||
| 562 | static void print_cpu_stall_info_begin(void); | 569 | static void print_cpu_stall_info_begin(void); | 
| 563 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 570 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | 
| 564 | static void print_cpu_stall_info_end(void); | 571 | static void print_cpu_stall_info_end(void); | 
| diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..8c0ec0f5a027 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ | |||
| 58 | */ | 58 | */ | 
| 59 | static void __init rcu_bootup_announce_oddness(void) | 59 | static void __init rcu_bootup_announce_oddness(void) | 
| 60 | { | 60 | { | 
| 61 | #ifdef CONFIG_RCU_TRACE | 61 | if (IS_ENABLED(CONFIG_RCU_TRACE)) | 
| 62 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | 62 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | 
| 63 | #endif | 63 | if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || | 
| 64 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) | 64 | (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) | 
| 65 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 65 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | 
| 66 | CONFIG_RCU_FANOUT); | 66 | CONFIG_RCU_FANOUT); | 
| 67 | #endif | 67 | if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) | 
| 68 | #ifdef CONFIG_RCU_FANOUT_EXACT | 68 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | 
| 69 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | 69 | if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) | 
| 70 | #endif | 70 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 
| 71 | #ifdef CONFIG_RCU_FAST_NO_HZ | 71 | if (IS_ENABLED(CONFIG_PROVE_RCU)) | 
| 72 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | 72 | pr_info("\tRCU lockdep checking is enabled.\n"); | 
| 73 | #endif | 73 | if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) | 
| 74 | #ifdef CONFIG_PROVE_RCU | 74 | pr_info("\tRCU torture testing starts during boot.\n"); | 
| 75 | pr_info("\tRCU lockdep checking is enabled.\n"); | 75 | if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) | 
| 76 | #endif | 76 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 
| 77 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 77 | if (NUM_RCU_LVL_4 != 0) | 
| 78 | pr_info("\tRCU torture testing starts during boot.\n"); | 78 | pr_info("\tFour-level hierarchy is enabled.\n"); | 
| 79 | #endif | 79 | if (CONFIG_RCU_FANOUT_LEAF != 16) | 
| 80 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | 80 | pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", | 
| 81 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | 81 | CONFIG_RCU_FANOUT_LEAF); | 
| 82 | #endif | ||
| 83 | #if NUM_RCU_LVL_4 != 0 | ||
| 84 | pr_info("\tFour-level hierarchy is enabled.\n"); | ||
| 85 | #endif | ||
| 86 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | 82 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | 
| 87 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 83 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | 
| 88 | if (nr_cpu_ids != NR_CPUS) | 84 | if (nr_cpu_ids != NR_CPUS) | 
| 89 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 85 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 
| 90 | #ifdef CONFIG_RCU_BOOST | 86 | if (IS_ENABLED(CONFIG_RCU_BOOST)) | 
| 91 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | 87 | pr_info("\tRCU kthread priority: %d.\n", kthread_prio); | 
| 92 | #endif | ||
| 93 | } | 88 | } | 
| 94 | 89 | ||
| 95 | #ifdef CONFIG_PREEMPT_RCU | 90 | #ifdef CONFIG_PREEMPT_RCU | 
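The rewritten rcu_bootup_announce_oddness() above trades #ifdef blocks for if (IS_ENABLED(CONFIG_...)), so the disabled branches are still parsed and type-checked and then discarded by the optimizer. The fragment below illustrates that point with a deliberately simplified stand-in macro; the real IS_ENABLED() in <linux/kconfig.h> also copes with =m and unset symbols and is not reproduced here.

    /* Minimal illustration of the #ifdef -> IS_ENABLED() conversion. */
    #include <stdio.h>

    #define CONFIG_DEMO_TRACE 1             /* pretend Kconfig set this to y */
    #define IS_ENABLED(option) (option)     /* simplified stand-in           */

    int main(void)
    {
        if (IS_ENABLED(CONFIG_DEMO_TRACE))
            printf("\tDemo tracing is enabled.\n");
        /* With #ifdef, a typo inside a disabled block would go unnoticed
         * until someone built with that option turned on. */
        return 0;
    }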
| @@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void) | |||
| 180 | * But first, note that the current CPU must still be | 175 | * But first, note that the current CPU must still be | 
| 181 | * on line! | 176 | * on line! | 
| 182 | */ | 177 | */ | 
| 183 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 178 | WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); | 
| 184 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 179 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 
| 185 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | 180 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | 
| 186 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | 181 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | 
| @@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
| 233 | } | 228 | } | 
| 234 | 229 | ||
| 235 | /* | 230 | /* | 
| 236 | * Record a quiescent state for all tasks that were previously queued | ||
| 237 | * on the specified rcu_node structure and that were blocking the current | ||
| 238 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
| 239 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
| 240 | * disabled. | ||
| 241 | */ | ||
| 242 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
| 243 | __releases(rnp->lock) | ||
| 244 | { | ||
| 245 | unsigned long mask; | ||
| 246 | struct rcu_node *rnp_p; | ||
| 247 | |||
| 248 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 249 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 250 | return; /* Still need more quiescent states! */ | ||
| 251 | } | ||
| 252 | |||
| 253 | rnp_p = rnp->parent; | ||
| 254 | if (rnp_p == NULL) { | ||
| 255 | /* | ||
| 256 | * Either there is only one rcu_node in the tree, | ||
| 257 | * or tasks were kicked up to root rcu_node due to | ||
| 258 | * CPUs going offline. | ||
| 259 | */ | ||
| 260 | rcu_report_qs_rsp(&rcu_preempt_state, flags); | ||
| 261 | return; | ||
| 262 | } | ||
| 263 | |||
| 264 | /* Report up the rest of the hierarchy. */ | ||
| 265 | mask = rnp->grpmask; | ||
| 266 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 267 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
| 268 | smp_mb__after_unlock_lock(); | ||
| 269 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | ||
| 270 | } | ||
| 271 | |||
| 272 | /* | ||
| 273 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | 231 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | 
| 274 | * returning NULL if at the end of the list. | 232 | * returning NULL if at the end of the list. | 
| 275 | */ | 233 | */ | 
| @@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | |||
| 300 | */ | 258 | */ | 
| 301 | void rcu_read_unlock_special(struct task_struct *t) | 259 | void rcu_read_unlock_special(struct task_struct *t) | 
| 302 | { | 260 | { | 
| 303 | bool empty; | ||
| 304 | bool empty_exp; | 261 | bool empty_exp; | 
| 305 | bool empty_norm; | 262 | bool empty_norm; | 
| 306 | bool empty_exp_now; | 263 | bool empty_exp_now; | 
| @@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 334 | } | 291 | } | 
| 335 | 292 | ||
| 336 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | 293 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | 
| 337 | if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { | 294 | if (in_irq() || in_serving_softirq()) { | 
| 295 | lockdep_rcu_suspicious(__FILE__, __LINE__, | ||
| 296 | "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); | ||
| 297 | pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", | ||
| 298 | t->rcu_read_unlock_special.s, | ||
| 299 | t->rcu_read_unlock_special.b.blocked, | ||
| 300 | t->rcu_read_unlock_special.b.need_qs); | ||
| 338 | local_irq_restore(flags); | 301 | local_irq_restore(flags); | 
| 339 | return; | 302 | return; | 
| 340 | } | 303 | } | 
| @@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 356 | break; | 319 | break; | 
| 357 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 320 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 
| 358 | } | 321 | } | 
| 359 | empty = !rcu_preempt_has_tasks(rnp); | ||
| 360 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 322 | empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); | 
| 361 | empty_exp = !rcu_preempted_readers_exp(rnp); | 323 | empty_exp = !rcu_preempted_readers_exp(rnp); | 
| 362 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 324 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 
| @@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 377 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 339 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 
| 378 | 340 | ||
| 379 | /* | 341 | /* | 
| 380 | * If this was the last task on the list, go see if we | ||
| 381 | * need to propagate ->qsmaskinit bit clearing up the | ||
| 382 | * rcu_node tree. | ||
| 383 | */ | ||
| 384 | if (!empty && !rcu_preempt_has_tasks(rnp)) | ||
| 385 | rcu_cleanup_dead_rnp(rnp); | ||
| 386 | |||
| 387 | /* | ||
| 388 | * If this was the last task on the current list, and if | 342 | * If this was the last task on the current list, and if | 
| 389 | * we aren't waiting on any CPUs, report the quiescent state. | 343 | * we aren't waiting on any CPUs, report the quiescent state. | 
| 390 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 344 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | 
| @@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t) | |||
| 399 | rnp->grplo, | 353 | rnp->grplo, | 
| 400 | rnp->grphi, | 354 | rnp->grphi, | 
| 401 | !!rnp->gp_tasks); | 355 | !!rnp->gp_tasks); | 
| 402 | rcu_report_unblock_qs_rnp(rnp, flags); | 356 | rcu_report_unblock_qs_rnp(&rcu_preempt_state, | 
| 357 | rnp, flags); | ||
| 403 | } else { | 358 | } else { | 
| 404 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 359 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 405 | } | 360 | } | 
| @@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
| 520 | WARN_ON_ONCE(rnp->qsmask); | 475 | WARN_ON_ONCE(rnp->qsmask); | 
| 521 | } | 476 | } | 
| 522 | 477 | ||
| 523 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 524 | |||
| 525 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 526 | |||
| 527 | /* | 478 | /* | 
| 528 | * Check for a quiescent state from the current CPU. When a task blocks, | 479 | * Check for a quiescent state from the current CPU. When a task blocks, | 
| 529 | * the task is recorded in the corresponding CPU's rcu_node structure, | 480 | * the task is recorded in the corresponding CPU's rcu_node structure, | 
| @@ -585,7 +536,7 @@ void synchronize_rcu(void) | |||
| 585 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 536 | "Illegal synchronize_rcu() in RCU read-side critical section"); | 
| 586 | if (!rcu_scheduler_active) | 537 | if (!rcu_scheduler_active) | 
| 587 | return; | 538 | return; | 
| 588 | if (rcu_expedited) | 539 | if (rcu_gp_is_expedited()) | 
| 589 | synchronize_rcu_expedited(); | 540 | synchronize_rcu_expedited(); | 
| 590 | else | 541 | else | 
| 591 | wait_rcu_gp(call_rcu); | 542 | wait_rcu_gp(call_rcu); | 
| @@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | |||
| 630 | * recursively up the tree. (Calm down, calm down, we do the recursion | 581 | * recursively up the tree. (Calm down, calm down, we do the recursion | 
| 631 | * iteratively!) | 582 | * iteratively!) | 
| 632 | * | 583 | * | 
| 633 | * Most callers will set the "wake" flag, but the task initiating the | ||
| 634 | * expedited grace period need not wake itself. | ||
| 635 | * | ||
| 636 | * Caller must hold sync_rcu_preempt_exp_mutex. | 584 | * Caller must hold sync_rcu_preempt_exp_mutex. | 
| 637 | */ | 585 | */ | 
| 638 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 586 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | 
| @@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | |||
| 667 | 615 | ||
| 668 | /* | 616 | /* | 
| 669 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | 617 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | 
| 670 | * grace period for the specified rcu_node structure. If there are no such | 618 | * grace period for the specified rcu_node structure, phase 1. If there | 
| 671 | * tasks, report it up the rcu_node hierarchy. | 619 | * are such tasks, set the ->expmask bits up the rcu_node tree and also | 
| 620 | * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 | ||
| 621 | * that work is needed here. | ||
| 672 | * | 622 | * | 
| 673 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude | 623 | * Caller must hold sync_rcu_preempt_exp_mutex. | 
| 674 | * CPU hotplug operations. | ||
| 675 | */ | 624 | */ | 
| 676 | static void | 625 | static void | 
| 677 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 626 | sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) | 
| 678 | { | 627 | { | 
| 679 | unsigned long flags; | 628 | unsigned long flags; | 
| 680 | int must_wait = 0; | 629 | unsigned long mask; | 
| 630 | struct rcu_node *rnp_up; | ||
| 681 | 631 | ||
| 682 | raw_spin_lock_irqsave(&rnp->lock, flags); | 632 | raw_spin_lock_irqsave(&rnp->lock, flags); | 
| 683 | smp_mb__after_unlock_lock(); | 633 | smp_mb__after_unlock_lock(); | 
| 634 | WARN_ON_ONCE(rnp->expmask); | ||
| 635 | WARN_ON_ONCE(rnp->exp_tasks); | ||
| 684 | if (!rcu_preempt_has_tasks(rnp)) { | 636 | if (!rcu_preempt_has_tasks(rnp)) { | 
| 637 | /* No blocked tasks, nothing to do. */ | ||
| 685 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 638 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 
| 686 | } else { | 639 | return; | 
| 640 | } | ||
| 641 | /* Call for Phase 2 and propagate ->expmask bits up the tree. */ | ||
| 642 | rnp->expmask = 1; | ||
| 643 | rnp_up = rnp; | ||
| 644 | while (rnp_up->parent) { | ||
| 645 | mask = rnp_up->grpmask; | ||
| 646 | rnp_up = rnp_up->parent; | ||
| 647 | if (rnp_up->expmask & mask) | ||
| 648 | break; | ||
| 649 | raw_spin_lock(&rnp_up->lock); /* irqs already off */ | ||
| 650 | smp_mb__after_unlock_lock(); | ||
| 651 | rnp_up->expmask |= mask; | ||
| 652 | raw_spin_unlock(&rnp_up->lock); /* irqs still off */ | ||
| 653 | } | ||
| 654 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 655 | } | ||
| 656 | |||
| 657 | /* | ||
| 658 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | ||
| 659 | * grace period for the specified rcu_node structure, phase 2. If the | ||
| 660 | * leaf rcu_node structure has its ->expmask field set, check for tasks. | ||
| 661 | * If there are some, clear ->expmask and set ->exp_tasks accordingly, | ||
| 662 | * then initiate RCU priority boosting. Otherwise, clear ->expmask and | ||
| 663 | * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, | ||
| 664 | * enabling rcu_read_unlock_special() to do the bit-clearing. | ||
| 665 | * | ||
| 666 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
| 667 | */ | ||
| 668 | static void | ||
| 669 | sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) | ||
| 670 | { | ||
| 671 | unsigned long flags; | ||
| 672 | |||
| 673 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 674 | smp_mb__after_unlock_lock(); | ||
| 675 | if (!rnp->expmask) { | ||
| 676 | /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ | ||
| 677 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 678 | return; | ||
| 679 | } | ||
| 680 | |||
| 681 | /* Phase 1 is over. */ | ||
| 682 | rnp->expmask = 0; | ||
| 683 | |||
| 684 | /* | ||
| 685 | * If there are still blocked tasks, set up ->exp_tasks so that | ||
| 686 | * rcu_read_unlock_special() will wake us and then boost them. | ||
| 687 | */ | ||
| 688 | if (rcu_preempt_has_tasks(rnp)) { | ||
| 687 | rnp->exp_tasks = rnp->blkd_tasks.next; | 689 | rnp->exp_tasks = rnp->blkd_tasks.next; | 
| 688 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 690 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | 
| 689 | must_wait = 1; | 691 | return; | 
| 690 | } | 692 | } | 
| 691 | if (!must_wait) | 693 | |
| 692 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ | 694 | /* No longer any blocked tasks, so undo bit setting. */ | 
| 695 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 696 | rcu_report_exp_rnp(rsp, rnp, false); | ||
| 693 | } | 697 | } | 
| 694 | 698 | ||
| 695 | /** | 699 | /** | 
| @@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 706 | */ | 710 | */ | 
| 707 | void synchronize_rcu_expedited(void) | 711 | void synchronize_rcu_expedited(void) | 
| 708 | { | 712 | { | 
| 709 | unsigned long flags; | ||
| 710 | struct rcu_node *rnp; | 713 | struct rcu_node *rnp; | 
| 711 | struct rcu_state *rsp = &rcu_preempt_state; | 714 | struct rcu_state *rsp = &rcu_preempt_state; | 
| 712 | unsigned long snap; | 715 | unsigned long snap; | 
| @@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void) | |||
| 757 | /* force all RCU readers onto ->blkd_tasks lists. */ | 760 | /* force all RCU readers onto ->blkd_tasks lists. */ | 
| 758 | synchronize_sched_expedited(); | 761 | synchronize_sched_expedited(); | 
| 759 | 762 | ||
| 760 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 763 | /* | 
| 761 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 764 | * Snapshot current state of ->blkd_tasks lists into ->expmask. | 
| 762 | raw_spin_lock_irqsave(&rnp->lock, flags); | 765 | * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() | 
| 763 | smp_mb__after_unlock_lock(); | 766 | * to start clearing them. Doing this in one phase leads to | 
| 764 | rnp->expmask = rnp->qsmaskinit; | 767 | * strange races between setting and clearing bits, so just say "no"! | 
| 765 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 768 | */ | 
| 766 | } | 769 | rcu_for_each_leaf_node(rsp, rnp) | 
| 767 | 770 | sync_rcu_preempt_exp_init1(rsp, rnp); | |
| 768 | /* Snapshot current state of ->blkd_tasks lists. */ | ||
| 769 | rcu_for_each_leaf_node(rsp, rnp) | 771 | rcu_for_each_leaf_node(rsp, rnp) | 
| 770 | sync_rcu_preempt_exp_init(rsp, rnp); | 772 | sync_rcu_preempt_exp_init2(rsp, rnp); | 
| 771 | if (NUM_RCU_NODES > 1) | ||
| 772 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | ||
| 773 | 773 | ||
| 774 | put_online_cpus(); | 774 | put_online_cpus(); | 
| 775 | 775 | ||
| @@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | |||
| 859 | return 0; | 859 | return 0; | 
| 860 | } | 860 | } | 
| 861 | 861 | ||
| 862 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 863 | |||
| 864 | /* | 862 | /* | 
| 865 | * Because there is no preemptible RCU, there can be no readers blocked. | 863 | * Because there is no preemptible RCU, there can be no readers blocked. | 
| 866 | */ | 864 | */ | 
| @@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) | |||
| 869 | return false; | 867 | return false; | 
| 870 | } | 868 | } | 
| 871 | 869 | ||
| 872 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 873 | |||
| 874 | /* | 870 | /* | 
| 875 | * Because preemptible RCU does not exist, we never have to check for | 871 | * Because preemptible RCU does not exist, we never have to check for | 
| 876 | * tasks blocked within RCU read-side critical sections. | 872 | * tasks blocked within RCU read-side critical sections. | 
| @@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
| 1170 | * Returns zero if all is well, a negated errno otherwise. | 1166 | * Returns zero if all is well, a negated errno otherwise. | 
| 1171 | */ | 1167 | */ | 
| 1172 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 1168 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 
| 1173 | struct rcu_node *rnp) | 1169 | struct rcu_node *rnp) | 
| 1174 | { | 1170 | { | 
| 1175 | int rnp_index = rnp - &rsp->node[0]; | 1171 | int rnp_index = rnp - &rsp->node[0]; | 
| 1176 | unsigned long flags; | 1172 | unsigned long flags; | 
| @@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1180 | if (&rcu_preempt_state != rsp) | 1176 | if (&rcu_preempt_state != rsp) | 
| 1181 | return 0; | 1177 | return 0; | 
| 1182 | 1178 | ||
| 1183 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | 1179 | if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) | 
| 1184 | return 0; | 1180 | return 0; | 
| 1185 | 1181 | ||
| 1186 | rsp->boost = 1; | 1182 | rsp->boost = 1; | 
| @@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu) | |||
| 1273 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 1269 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 
| 1274 | { | 1270 | { | 
| 1275 | struct task_struct *t = rnp->boost_kthread_task; | 1271 | struct task_struct *t = rnp->boost_kthread_task; | 
| 1276 | unsigned long mask = rnp->qsmaskinit; | 1272 | unsigned long mask = rcu_rnp_online_cpus(rnp); | 
| 1277 | cpumask_var_t cm; | 1273 | cpumask_var_t cm; | 
| 1278 | int cpu; | 1274 | int cpu; | 
| 1279 | 1275 | ||
| @@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) | |||
| 1945 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | 1941 | rhp = ACCESS_ONCE(rdp->nocb_follower_head); | 
| 1946 | 1942 | ||
| 1947 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | 1943 | /* Having no rcuo kthread but CBs after scheduler starts is bad! */ | 
| 1948 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { | 1944 | if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && | 
| 1945 | rcu_scheduler_fully_active) { | ||
| 1949 | /* RCU callback enqueued before CPU first came online??? */ | 1946 | /* RCU callback enqueued before CPU first came online??? */ | 
| 1950 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | 1947 | pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", | 
| 1951 | cpu, rhp->func); | 1948 | cpu, rhp->func); | 
| @@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void) | |||
| 2392 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 2389 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | 
| 2393 | 2390 | ||
| 2394 | for_each_rcu_flavor(rsp) { | 2391 | for_each_rcu_flavor(rsp) { | 
| 2395 | for_each_cpu(cpu, rcu_nocb_mask) { | 2392 | for_each_cpu(cpu, rcu_nocb_mask) | 
| 2396 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2393 | init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); | 
| 2397 | |||
| 2398 | /* | ||
| 2399 | * If there are early callbacks, they will need | ||
| 2400 | * to be moved to the nocb lists. | ||
| 2401 | */ | ||
| 2402 | WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != | ||
| 2403 | &rdp->nxtlist && | ||
| 2404 | rdp->nxttail[RCU_NEXT_TAIL] != NULL); | ||
| 2405 | init_nocb_callback_list(rdp); | ||
| 2406 | } | ||
| 2407 | rcu_organize_nocb_kthreads(rsp); | 2394 | rcu_organize_nocb_kthreads(rsp); | 
| 2408 | } | 2395 | } | 
| 2409 | } | 2396 | } | 
| @@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
| 2540 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 2527 | if (!rcu_is_nocb_cpu(rdp->cpu)) | 
| 2541 | return false; | 2528 | return false; | 
| 2542 | 2529 | ||
| 2530 | /* If there are early-boot callbacks, move them to nocb lists. */ | ||
| 2531 | if (rdp->nxtlist) { | ||
| 2532 | rdp->nocb_head = rdp->nxtlist; | ||
| 2533 | rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 2534 | atomic_long_set(&rdp->nocb_q_count, rdp->qlen); | ||
| 2535 | atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); | ||
| 2536 | rdp->nxtlist = NULL; | ||
| 2537 | rdp->qlen = 0; | ||
| 2538 | rdp->qlen_lazy = 0; | ||
| 2539 | } | ||
| 2543 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2540 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 
| 2544 | return true; | 2541 | return true; | 
| 2545 | } | 2542 | } | 
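The early-boot handoff added to init_nocb_callback_list() above is a constant-time list splice: the nocb queue takes over the head and tail pointers of whatever was already queued on ->nxtlist and absorbs the length counters. A generic sketch of that tail-pointer splice idiom (names are illustrative, not the kernel's rcu_data fields):

    /*
     * Constant-time splice of a singly-linked callback list that keeps a
     * pointer to the last ->next field as its tail. Generic sketch.
     */
    #include <stdio.h>

    struct cb {
        struct cb *next;
        int id;
    };

    struct queue {
        struct cb *head;
        struct cb **tail;       /* address of the last ->next pointer */
        long count;
    };

    static void queue_init(struct queue *q)
    {
        q->head = NULL;
        q->tail = &q->head;
        q->count = 0;
    }

    static void enqueue(struct queue *q, struct cb *cb)
    {
        cb->next = NULL;
        *q->tail = cb;
        q->tail = &cb->next;
        q->count++;
    }

    /* Move everything queued on @src onto @dst and empty @src. */
    static void splice(struct queue *dst, struct queue *src)
    {
        if (!src->head)
            return;
        *dst->tail = src->head;
        dst->tail = src->tail;
        dst->count += src->count;
        queue_init(src);
    }

    int main(void)
    {
        struct queue early, nocb;
        struct cb a = { .id = 1 }, b = { .id = 2 };

        queue_init(&early);
        queue_init(&nocb);
        enqueue(&early, &a);            /* callbacks posted before the */
        enqueue(&early, &b);            /* nocb kthreads exist */
        splice(&nocb, &early);          /* roughly the handoff done above */
        printf("nocb: %ld callbacks, early: %ld\n", nocb.count, early.count);
        return 0;
    }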
| @@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq) | |||
| 2763 | 2760 | ||
| 2764 | /* | 2761 | /* | 
| 2765 | * Check to see if the current CPU is idle. Note that usermode execution | 2762 | * Check to see if the current CPU is idle. Note that usermode execution | 
| 2766 | * does not count as idle. The caller must have disabled interrupts. | 2763 | * does not count as idle. The caller must have disabled interrupts, | 
| 2764 | * and must be running on tick_do_timer_cpu. | ||
| 2767 | */ | 2765 | */ | 
| 2768 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 2766 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | 
| 2769 | unsigned long *maxj) | 2767 | unsigned long *maxj) | 
| @@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | |||
| 2784 | if (!*isidle || rdp->rsp != rcu_state_p || | 2782 | if (!*isidle || rdp->rsp != rcu_state_p || | 
| 2785 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 2783 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | 
| 2786 | return; | 2784 | return; | 
| 2787 | if (rcu_gp_in_progress(rdp->rsp)) | 2785 | /* Verify affinity of current kthread. */ | 
| 2788 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | 2786 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | 
| 2789 | 2787 | ||
| 2790 | /* Pick up current idle and NMI-nesting counter and check. */ | 2788 | /* Pick up current idle and NMI-nesting counter and check. */ | 
| 2791 | cur = atomic_read(&rdtp->dynticks_idle); | 2789 | cur = atomic_read(&rdtp->dynticks_idle); | 
| @@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void) | |||
| 3068 | return; | 3066 | return; | 
| 3069 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 3067 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | 
| 3070 | cpu = tick_do_timer_cpu; | 3068 | cpu = tick_do_timer_cpu; | 
| 3071 | if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) | 3069 | if (cpu >= 0 && cpu < nr_cpu_ids) | 
| 3072 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | 3070 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | 
| 3073 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3071 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 
| 3074 | if (!is_housekeeping_cpu(raw_smp_processor_id())) | 3072 | housekeeping_affine(current); | 
| 3075 | housekeeping_affine(current); | ||
| 3076 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 3073 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | 
| 3077 | } | 3074 | } | 
| 3078 | 3075 | ||
| diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index fbb6240509ea..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c | |||
| @@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); | 
| 284 | level = rnp->level; | 284 | level = rnp->level; | 
| 285 | } | 285 | } | 
| 286 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", | 286 | seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", | 
| 287 | rnp->qsmask, rnp->qsmaskinit, | 287 | rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, | 
| 288 | ".G"[rnp->gp_tasks != NULL], | 288 | ".G"[rnp->gp_tasks != NULL], | 
| 289 | ".E"[rnp->exp_tasks != NULL], | 289 | ".E"[rnp->exp_tasks != NULL], | 
| 290 | ".T"[!list_empty(&rnp->blkd_tasks)], | 290 | ".T"[!list_empty(&rnp->blkd_tasks)], | 
| diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..1f133350da01 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate"); | |||
| 62 | 62 | ||
| 63 | module_param(rcu_expedited, int, 0); | 63 | module_param(rcu_expedited, int, 0); | 
| 64 | 64 | ||
| 65 | #ifndef CONFIG_TINY_RCU | ||
| 66 | |||
| 67 | static atomic_t rcu_expedited_nesting = | ||
| 68 | ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Should normal grace-period primitives be expedited? Intended for | ||
| 72 | * use within RCU. Note that this function takes the rcu_expedited | ||
| 73 | * sysfs/boot variable into account as well as the rcu_expedite_gp() | ||
| 74 | * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() | ||
| 75 | * returns false is a -really- bad idea. | ||
| 76 | */ | ||
| 77 | bool rcu_gp_is_expedited(void) | ||
| 78 | { | ||
| 79 | return rcu_expedited || atomic_read(&rcu_expedited_nesting); | ||
| 80 | } | ||
| 81 | EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); | ||
| 82 | |||
| 83 | /** | ||
| 84 | * rcu_expedite_gp - Expedite future RCU grace periods | ||
| 85 | * | ||
| 86 | * After a call to this function, future calls to synchronize_rcu() and | ||
| 87 | * friends act as if the corresponding synchronize_rcu_expedited() function |||
| 88 | * had been called instead. |||
| 89 | */ | ||
| 90 | void rcu_expedite_gp(void) | ||
| 91 | { | ||
| 92 | atomic_inc(&rcu_expedited_nesting); | ||
| 93 | } | ||
| 94 | EXPORT_SYMBOL_GPL(rcu_expedite_gp); | ||
| 95 | |||
| 96 | /** | ||
| 97 | * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation | ||
| 98 | * | ||
| 99 | * Undo a prior call to rcu_expedite_gp(). If all prior calls to | ||
| 100 | * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(), | ||
| 101 | * and if the rcu_expedited sysfs/boot parameter is not set, then all | ||
| 102 | * subsequent calls to synchronize_rcu() and friends will return to | ||
| 103 | * their normal non-expedited behavior. | ||
| 104 | */ | ||
| 105 | void rcu_unexpedite_gp(void) | ||
| 106 | { | ||
| 107 | atomic_dec(&rcu_expedited_nesting); | ||
| 108 | } | ||
| 109 | EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); | ||
| 110 | |||
| 111 | #endif /* #ifndef CONFIG_TINY_RCU */ | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Inform RCU of the end of the in-kernel boot sequence. | ||
| 115 | */ | ||
| 116 | void rcu_end_inkernel_boot(void) | ||
| 117 | { | ||
| 118 | if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) | ||
| 119 | rcu_unexpedite_gp(); | ||
| 120 | } | ||
| 121 | |||
| 65 | #ifdef CONFIG_PREEMPT_RCU | 122 | #ifdef CONFIG_PREEMPT_RCU | 
| 66 | 123 | ||
| 67 | /* | 124 | /* | 
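The new rcu_expedite_gp()/rcu_unexpedite_gp() pair keeps a nesting count, and rcu_gp_is_expedited() reports true while either that count or the rcu_expedited sysfs/boot flag is nonzero; with CONFIG_RCU_EXPEDITE_BOOT the count starts at 1 and rcu_end_inkernel_boot() drops that initial reference. A standalone model of the behavior, using C11 atomics in place of the kernel's atomic_t (a sketch, not the kernel implementation):

    /*
     * Model of the expedite-GP nesting counter: grace periods are treated
     * as expedited while either the sysfs/boot flag is set or at least one
     * caller holds an rcu_expedite_gp() reference.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static int rcu_expedited;       /* models the sysfs/boot parameter */
    static atomic_int rcu_expedited_nesting = ATOMIC_VAR_INIT(1);
                                    /* as if CONFIG_RCU_EXPEDITE_BOOT=y */

    static bool rcu_gp_is_expedited(void)
    {
        return rcu_expedited || atomic_load(&rcu_expedited_nesting);
    }

    static void rcu_expedite_gp(void)
    {
        atomic_fetch_add(&rcu_expedited_nesting, 1);
    }

    static void rcu_unexpedite_gp(void)
    {
        atomic_fetch_sub(&rcu_expedited_nesting, 1);
    }

    static void rcu_end_inkernel_boot(void)
    {
        rcu_unexpedite_gp();        /* drop the boot-time reference */
    }

    int main(void)
    {
        printf("during boot: %d\n", rcu_gp_is_expedited());    /* 1 */
        rcu_end_inkernel_boot();
        printf("after boot:  %d\n", rcu_gp_is_expedited());    /* 0 */
        rcu_expedite_gp();
        printf("user nested: %d\n", rcu_gp_is_expedited());    /* 1 */
        rcu_unexpedite_gp();
        printf("undone:      %d\n", rcu_gp_is_expedited());    /* 0 */
        return 0;
    }

As the comment in the hunk warns, the flag and the nesting count are independent inputs, so spinning on rcu_unexpedite_gp() until rcu_gp_is_expedited() goes false can loop forever if the boot/sysfs flag is set.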
| @@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | |||
| 199 | 256 | ||
| 200 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 257 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 
| 201 | 258 | ||
| 202 | struct rcu_synchronize { | 259 | /** | 
| 203 | struct rcu_head head; | 260 | * wakeme_after_rcu() - Callback function to awaken a task after grace period | 
| 204 | struct completion completion; | 261 | * @head: Pointer to rcu_head member within rcu_synchronize structure | 
| 205 | }; | 262 | * | 
| 206 | 263 | * Awaken the corresponding task now that a grace period has elapsed. | |
| 207 | /* | ||
| 208 | * Awaken the corresponding synchronize_rcu() instance now that a | ||
| 209 | * grace period has elapsed. | ||
| 210 | */ | 264 | */ | 
| 211 | static void wakeme_after_rcu(struct rcu_head *head) | 265 | void wakeme_after_rcu(struct rcu_head *head) | 
| 212 | { | 266 | { | 
| 213 | struct rcu_synchronize *rcu; | 267 | struct rcu_synchronize *rcu; | 
| 214 | 268 | ||
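With struct rcu_synchronize moving out of this file and wakeme_after_rcu() losing its static marker, the callback half of the wait-for-grace-period pattern becomes reusable: the waiter embeds a completion next to an rcu_head, queues the callback, and sleeps until the callback fires. A rough userspace model of that handshake, with pthreads standing in for RCU and struct completion (compile with -pthread; all names are stand-ins):

    /*
     * Userspace model of the handshake served by wakeme_after_rcu(): the
     * waiter queues a callback and sleeps on a completion; the callback
     * sets the flag and wakes it.
     */
    #include <pthread.h>
    #include <stdio.h>

    struct rcu_head_model {
        void (*func)(struct rcu_head_model *);
    };

    struct rcu_synchronize_model {
        struct rcu_head_model head;     /* must stay the first member */
        pthread_mutex_t lock;
        pthread_cond_t cond;
        int done;
    };

    static void wakeme_after_gp(struct rcu_head_model *head)
    {
        /* Equivalent of container_of(): head is the first member. */
        struct rcu_synchronize_model *rcu =
            (struct rcu_synchronize_model *)head;

        pthread_mutex_lock(&rcu->lock);
        rcu->done = 1;
        pthread_cond_signal(&rcu->cond);
        pthread_mutex_unlock(&rcu->lock);
    }

    /* "Grace period" thread: invokes the queued callback. */
    static void *gp_thread(void *arg)
    {
        struct rcu_head_model *head = arg;

        head->func(head);
        return NULL;
    }

    int main(void)
    {
        static struct rcu_synchronize_model rcu = {
            .head = { .func = wakeme_after_gp },
            .lock = PTHREAD_MUTEX_INITIALIZER,
            .cond = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, gp_thread, &rcu.head);
        pthread_mutex_lock(&rcu.lock);
        while (!rcu.done)               /* the wait_for_completion() step */
            pthread_cond_wait(&rcu.cond, &rcu.lock);
        pthread_mutex_unlock(&rcu.lock);
        pthread_join(t, NULL);
        printf("grace period elapsed, waiter woken\n");
        return 0;
    }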
| diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -387,8 +387,9 @@ void ctrl_alt_del(void) | |||
| 387 | } | 387 | } | 
| 388 | 388 | ||
| 389 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 389 | char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; | 
| 390 | static const char reboot_cmd[] = "/sbin/reboot"; | ||
| 390 | 391 | ||
| 391 | static int __orderly_poweroff(bool force) | 392 | static int run_cmd(const char *cmd) | 
| 392 | { | 393 | { | 
| 393 | char **argv; | 394 | char **argv; | 
| 394 | static char *envp[] = { | 395 | static char *envp[] = { | 
| @@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) | |||
| 397 | NULL | 398 | NULL | 
| 398 | }; | 399 | }; | 
| 399 | int ret; | 400 | int ret; | 
| 400 | 401 | argv = argv_split(GFP_KERNEL, cmd, NULL); | |
| 401 | argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); | ||
| 402 | if (argv) { | 402 | if (argv) { | 
| 403 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | 403 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | 
| 404 | argv_free(argv); | 404 | argv_free(argv); | 
| @@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) | |||
| 406 | ret = -ENOMEM; | 406 | ret = -ENOMEM; | 
| 407 | } | 407 | } | 
| 408 | 408 | ||
| 409 | return ret; | ||
| 410 | } | ||
| 411 | |||
| 412 | static int __orderly_reboot(void) | ||
| 413 | { | ||
| 414 | int ret; | ||
| 415 | |||
| 416 | ret = run_cmd(reboot_cmd); | ||
| 417 | |||
| 418 | if (ret) { | ||
| 419 | pr_warn("Failed to start orderly reboot: forcing the issue\n"); | ||
| 420 | emergency_sync(); | ||
| 421 | kernel_restart(NULL); | ||
| 422 | } | ||
| 423 | |||
| 424 | return ret; | ||
| 425 | } | ||
| 426 | |||
| 427 | static int __orderly_poweroff(bool force) | ||
| 428 | { | ||
| 429 | int ret; | ||
| 430 | |||
| 431 | ret = run_cmd(poweroff_cmd); | ||
| 432 | |||
| 409 | if (ret && force) { | 433 | if (ret && force) { | 
| 410 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); | 434 | pr_warn("Failed to start orderly shutdown: forcing the issue\n"); | 
| 435 | |||
| 411 | /* | 436 | /* | 
| 412 | * I guess this should try to kick off some daemon to sync and | 437 | * I guess this should try to kick off some daemon to sync and | 
| 413 | * poweroff asap. Or not even bother syncing if we're doing an | 438 | * poweroff asap. Or not even bother syncing if we're doing an | 
| @@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); | |||
| 436 | * This may be called from any context to trigger a system shutdown. | 461 | * This may be called from any context to trigger a system shutdown. | 
| 437 | * If the orderly shutdown fails, it will force an immediate shutdown. | 462 | * If the orderly shutdown fails, it will force an immediate shutdown. | 
| 438 | */ | 463 | */ | 
| 439 | int orderly_poweroff(bool force) | 464 | void orderly_poweroff(bool force) | 
| 440 | { | 465 | { | 
| 441 | if (force) /* do not override the pending "true" */ | 466 | if (force) /* do not override the pending "true" */ | 
| 442 | poweroff_force = true; | 467 | poweroff_force = true; | 
| 443 | schedule_work(&poweroff_work); | 468 | schedule_work(&poweroff_work); | 
| 444 | return 0; | ||
| 445 | } | 469 | } | 
| 446 | EXPORT_SYMBOL_GPL(orderly_poweroff); | 470 | EXPORT_SYMBOL_GPL(orderly_poweroff); | 
| 447 | 471 | ||
| 472 | static void reboot_work_func(struct work_struct *work) | ||
| 473 | { | ||
| 474 | __orderly_reboot(); | ||
| 475 | } | ||
| 476 | |||
| 477 | static DECLARE_WORK(reboot_work, reboot_work_func); | ||
| 478 | |||
| 479 | /** | ||
| 480 | * orderly_reboot - Trigger an orderly system reboot | ||
| 481 | * | ||
| 482 | * This may be called from any context to trigger a system reboot. | ||
| 483 | * If the orderly reboot fails, it will force an immediate reboot. | ||
| 484 | */ | ||
| 485 | void orderly_reboot(void) | ||
| 486 | { | ||
| 487 | schedule_work(&reboot_work); | ||
| 488 | } | ||
| 489 | EXPORT_SYMBOL_GPL(orderly_reboot); | ||
| 490 | |||
| 448 | static int __init reboot_setup(char *str) | 491 | static int __init reboot_setup(char *str) | 
| 449 | { | 492 | { | 
| 450 | for (;;) { | 493 | for (;;) { | 
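The reboot.c changes above factor the argv-splitting and usermode-helper call into run_cmd(), which orderly_reboot() then reuses with a fixed "/sbin/reboot". The same split-then-exec shape looks roughly like this in userspace, with fork()/execv() standing in for call_usermodehelper() (a sketch; the paths and limits are arbitrary):

    /*
     * Userspace analogue of run_cmd(): split a command line into argv[]
     * and run it, returning non-zero on failure.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static int run_cmd(const char *cmd)
    {
        char *copy = strdup(cmd);
        char *argv[16];
        int argc = 0;
        int status = -1;
        pid_t pid;

        if (!copy)
            return -1;
        for (char *tok = strtok(copy, " "); tok && argc < 15;
             tok = strtok(NULL, " "))
            argv[argc++] = tok;
        argv[argc] = NULL;

        pid = fork();
        if (pid == 0) {
            execv(argv[0], argv);
            _exit(127);                 /* exec failed in the child */
        }
        if (pid > 0)
            waitpid(pid, &status, 0);
        free(copy);
        return (pid > 0 && WIFEXITED(status) &&
                WEXITSTATUS(status) == 0) ? 0 : -1;
    }

    int main(void)
    {
        /* /bin/echo stands in for the reboot/poweroff helpers. */
        if (run_cmd("/bin/echo orderly reboot requested"))
            fprintf(stderr, "falling back to forced restart\n");
        return 0;
    }

As in __orderly_reboot() above, the caller treats a non-zero return as "the helper never ran" and falls back to the forced path.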
| diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) | |||
| 1034 | * | 1034 | * | 
| 1035 | * request_region creates a new busy region. | 1035 | * request_region creates a new busy region. | 
| 1036 | * | 1036 | * | 
| 1037 | * check_region returns non-zero if the area is already busy. | ||
| 1038 | * | ||
| 1039 | * release_region releases a matching busy region. | 1037 | * release_region releases a matching busy region. | 
| 1040 | */ | 1038 | */ | 
| 1041 | 1039 | ||
| @@ -1098,36 +1096,6 @@ struct resource * __request_region(struct resource *parent, | |||
| 1098 | EXPORT_SYMBOL(__request_region); | 1096 | EXPORT_SYMBOL(__request_region); | 
| 1099 | 1097 | ||
| 1100 | /** | 1098 | /** | 
| 1101 | * __check_region - check if a resource region is busy or free | ||
| 1102 | * @parent: parent resource descriptor | ||
| 1103 | * @start: resource start address | ||
| 1104 | * @n: resource region size | ||
| 1105 | * | ||
| 1106 | * Returns 0 if the region is free at the moment it is checked, | ||
| 1107 | * returns %-EBUSY if the region is busy. | ||
| 1108 | * | ||
| 1109 | * NOTE: | ||
| 1110 | * This function is deprecated because its use is racy. | ||
| 1111 | * Even if it returns 0, a subsequent call to request_region() | ||
| 1112 | * may fail because another driver etc. just allocated the region. | ||
| 1113 | * Do NOT use it. It will be removed from the kernel. | ||
| 1114 | */ | ||
| 1115 | int __check_region(struct resource *parent, resource_size_t start, | ||
| 1116 | resource_size_t n) | ||
| 1117 | { | ||
| 1118 | struct resource * res; | ||
| 1119 | |||
| 1120 | res = __request_region(parent, start, n, "check-region", 0); | ||
| 1121 | if (!res) | ||
| 1122 | return -EBUSY; | ||
| 1123 | |||
| 1124 | release_resource(res); | ||
| 1125 | free_resource(res); | ||
| 1126 | return 0; | ||
| 1127 | } | ||
| 1128 | EXPORT_SYMBOL(__check_region); | ||
| 1129 | |||
| 1130 | /** | ||
| 1131 | * __release_region - release a previously reserved resource region | 1099 | * __release_region - release a previously reserved resource region | 
| 1132 | * @parent: parent resource descriptor | 1100 | * @parent: parent resource descriptor | 
| 1133 | * @start: resource start address | 1101 | * @start: resource start address | 
| diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..f9123a82cbb6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; | |||
| 306 | */ | 306 | */ | 
| 307 | int sysctl_sched_rt_runtime = 950000; | 307 | int sysctl_sched_rt_runtime = 950000; | 
| 308 | 308 | ||
| 309 | /* cpus with isolated domains */ | ||
| 310 | cpumask_var_t cpu_isolated_map; | ||
| 311 | |||
| 309 | /* | 312 | /* | 
| 310 | * this_rq_lock - lock this runqueue and disable interrupts. | 313 | * this_rq_lock - lock this runqueue and disable interrupts. | 
| 311 | */ | 314 | */ | 
| @@ -690,6 +693,23 @@ static inline bool got_nohz_idle_kick(void) | |||
| 690 | bool sched_can_stop_tick(void) | 693 | bool sched_can_stop_tick(void) | 
| 691 | { | 694 | { | 
| 692 | /* | 695 | /* | 
| 696 | * FIFO realtime policy runs the highest priority task. Other runnable | ||
| 697 | * tasks are of a lower priority. The scheduler tick does nothing. | ||
| 698 | */ | ||
| 699 | if (current->policy == SCHED_FIFO) | ||
| 700 | return true; | ||
| 701 | |||
| 702 | /* | ||
| 703 | * Round-robin realtime tasks time slice with other tasks at the same | ||
| 704 | * realtime priority. Is this task the only one at this priority? | ||
| 705 | */ | ||
| 706 | if (current->policy == SCHED_RR) { | ||
| 707 | struct sched_rt_entity *rt_se = ¤t->rt; | ||
| 708 | |||
| 709 | return rt_se->run_list.prev == rt_se->run_list.next; | ||
| 710 | } | ||
| 711 | |||
| 712 | /* | ||
| 693 | * More than one running task need preemption. | 713 | * More than one running task need preemption. | 
| 694 | * nr_running update is assumed to be visible | 714 | * nr_running update is assumed to be visible | 
| 695 | * after IPI is sent from wakers. | 715 | * after IPI is sent from wakers. | 
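The SCHED_RR test added to sched_can_stop_tick() above asks whether the current entity is the only one on its priority run list; for a circular doubly-linked list that is exactly the case when the node's prev and next both point at the list head, i.e. prev == next. A minimal list_head-style demonstration (generic code, not the kernel's <linux/list.h>):

    /*
     * A node in a circular doubly-linked list is the sole element exactly
     * when its prev and next pointers are equal (both point at the head).
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct list_head {
        struct list_head *prev, *next;
    };

    static void list_init(struct list_head *h)
    {
        h->prev = h->next = h;
    }

    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev;
        n->next = h;
        h->prev->next = n;
        h->prev = n;
    }

    static bool only_entry(const struct list_head *n)
    {
        return n->prev == n->next;
    }

    int main(void)
    {
        struct list_head queue, a, b;

        list_init(&queue);
        list_add_tail(&a, &queue);
        printf("alone: %d\n", only_entry(&a));  /* 1: tick can stop */
        list_add_tail(&b, &queue);
        printf("alone: %d\n", only_entry(&a));  /* 0: round-robin needs tick */
        return 0;
    }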
| @@ -996,6 +1016,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 996 | rq_clock_skip_update(rq, true); | 1016 | rq_clock_skip_update(rq, true); | 
| 997 | } | 1017 | } | 
| 998 | 1018 | ||
| 1019 | static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); | ||
| 1020 | |||
| 1021 | void register_task_migration_notifier(struct notifier_block *n) | ||
| 1022 | { | ||
| 1023 | atomic_notifier_chain_register(&task_migration_notifier, n); | ||
| 1024 | } | ||
| 1025 | |||
| 999 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP | 
| 1000 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 1027 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 
| 1001 | { | 1028 | { | 
| @@ -1026,10 +1053,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 1026 | trace_sched_migrate_task(p, new_cpu); | 1053 | trace_sched_migrate_task(p, new_cpu); | 
| 1027 | 1054 | ||
| 1028 | if (task_cpu(p) != new_cpu) { | 1055 | if (task_cpu(p) != new_cpu) { | 
| 1056 | struct task_migration_notifier tmn; | ||
| 1057 | |||
| 1029 | if (p->sched_class->migrate_task_rq) | 1058 | if (p->sched_class->migrate_task_rq) | 
| 1030 | p->sched_class->migrate_task_rq(p, new_cpu); | 1059 | p->sched_class->migrate_task_rq(p, new_cpu); | 
| 1031 | p->se.nr_migrations++; | 1060 | p->se.nr_migrations++; | 
| 1032 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 1061 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); | 
| 1062 | |||
| 1063 | tmn.task = p; | ||
| 1064 | tmn.from_cpu = task_cpu(p); | ||
| 1065 | tmn.to_cpu = new_cpu; | ||
| 1066 | |||
| 1067 | atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); | ||
| 1033 | } | 1068 | } | 
| 1034 | 1069 | ||
| 1035 | __set_task_cpu(p, new_cpu); | 1070 | __set_task_cpu(p, new_cpu); | 
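set_task_cpu() now fires an atomic notifier chain carrying the task and its source and destination CPUs whenever a task actually migrates. A stripped-down model of the register-then-call-chain pattern (array-based and not thread-safe, purely for illustration):

    /*
     * Minimal notifier-chain model: callbacks register once and are
     * invoked in order with an event payload.
     */
    #include <stdio.h>

    struct migration_event {
        int from_cpu, to_cpu;
    };

    typedef void (*notifier_fn)(const struct migration_event *);

    #define MAX_NOTIFIERS 8
    static notifier_fn chain[MAX_NOTIFIERS];
    static int nr_notifiers;

    static void register_notifier(notifier_fn fn)
    {
        if (nr_notifiers < MAX_NOTIFIERS)
            chain[nr_notifiers++] = fn;
    }

    static void call_chain(const struct migration_event *ev)
    {
        for (int i = 0; i < nr_notifiers; i++)
            chain[i](ev);
    }

    static void log_migration(const struct migration_event *ev)
    {
        printf("task moved: cpu %d -> cpu %d\n", ev->from_cpu, ev->to_cpu);
    }

    int main(void)
    {
        struct migration_event ev = { .from_cpu = 0, .to_cpu = 3 };

        register_notifier(log_migration);
        call_chain(&ev);    /* fired from the migration path on a real move */
        return 0;
    }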
| @@ -2818,7 +2853,7 @@ asmlinkage __visible void __sched schedule_user(void) | |||
| 2818 | * we find a better solution. | 2853 | * we find a better solution. | 
| 2819 | * | 2854 | * | 
| 2820 | * NB: There are buggy callers of this function. Ideally we | 2855 | * NB: There are buggy callers of this function. Ideally we | 
| 2821 | * should warn if prev_state != IN_USER, but that will trigger | 2856 | * should warn if prev_state != CONTEXT_USER, but that will trigger | 
| 2822 | * too frequently to make sense yet. | 2857 | * too frequently to make sense yet. | 
| 2823 | */ | 2858 | */ | 
| 2824 | enum ctx_state prev_state = exception_enter(); | 2859 | enum ctx_state prev_state = exception_enter(); | 
| @@ -3034,6 +3069,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 3034 | } else { | 3069 | } else { | 
| 3035 | if (dl_prio(oldprio)) | 3070 | if (dl_prio(oldprio)) | 
| 3036 | p->dl.dl_boosted = 0; | 3071 | p->dl.dl_boosted = 0; | 
| 3072 | if (rt_prio(oldprio)) | ||
| 3073 | p->rt.timeout = 0; | ||
| 3037 | p->sched_class = &fair_sched_class; | 3074 | p->sched_class = &fair_sched_class; | 
| 3038 | } | 3075 | } | 
| 3039 | 3076 | ||
| @@ -5318,36 +5355,13 @@ static int sched_cpu_active(struct notifier_block *nfb, | |||
| 5318 | static int sched_cpu_inactive(struct notifier_block *nfb, | 5355 | static int sched_cpu_inactive(struct notifier_block *nfb, | 
| 5319 | unsigned long action, void *hcpu) | 5356 | unsigned long action, void *hcpu) | 
| 5320 | { | 5357 | { | 
| 5321 | unsigned long flags; | ||
| 5322 | long cpu = (long)hcpu; | ||
| 5323 | struct dl_bw *dl_b; | ||
| 5324 | |||
| 5325 | switch (action & ~CPU_TASKS_FROZEN) { | 5358 | switch (action & ~CPU_TASKS_FROZEN) { | 
| 5326 | case CPU_DOWN_PREPARE: | 5359 | case CPU_DOWN_PREPARE: | 
| 5327 | set_cpu_active(cpu, false); | 5360 | set_cpu_active((long)hcpu, false); | 
| 5328 | |||
| 5329 | /* explicitly allow suspend */ | ||
| 5330 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 5331 | bool overflow; | ||
| 5332 | int cpus; | ||
| 5333 | |||
| 5334 | rcu_read_lock_sched(); | ||
| 5335 | dl_b = dl_bw_of(cpu); | ||
| 5336 | |||
| 5337 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 5338 | cpus = dl_bw_cpus(cpu); | ||
| 5339 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 5340 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 5341 | |||
| 5342 | rcu_read_unlock_sched(); | ||
| 5343 | |||
| 5344 | if (overflow) | ||
| 5345 | return notifier_from_errno(-EBUSY); | ||
| 5346 | } | ||
| 5347 | return NOTIFY_OK; | 5361 | return NOTIFY_OK; | 
| 5362 | default: | ||
| 5363 | return NOTIFY_DONE; | ||
| 5348 | } | 5364 | } | 
| 5349 | |||
| 5350 | return NOTIFY_DONE; | ||
| 5351 | } | 5365 | } | 
| 5352 | 5366 | ||
| 5353 | static int __init migration_init(void) | 5367 | static int __init migration_init(void) | 
| @@ -5428,17 +5442,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5428 | break; | 5442 | break; | 
| 5429 | } | 5443 | } | 
| 5430 | 5444 | ||
| 5431 | /* | ||
| 5432 | * Even though we initialize ->capacity to something semi-sane, | ||
| 5433 | * we leave capacity_orig unset. This allows us to detect if | ||
| 5434 | * domain iteration is still funny without causing /0 traps. | ||
| 5435 | */ | ||
| 5436 | if (!group->sgc->capacity_orig) { | ||
| 5437 | printk(KERN_CONT "\n"); | ||
| 5438 | printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); | ||
| 5439 | break; | ||
| 5440 | } | ||
| 5441 | |||
| 5442 | if (!cpumask_weight(sched_group_cpus(group))) { | 5445 | if (!cpumask_weight(sched_group_cpus(group))) { | 
| 5443 | printk(KERN_CONT "\n"); | 5446 | printk(KERN_CONT "\n"); | 
| 5444 | printk(KERN_ERR "ERROR: empty group\n"); | 5447 | printk(KERN_ERR "ERROR: empty group\n"); | 
| @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 5811 | update_top_cache_domain(cpu); | 5814 | update_top_cache_domain(cpu); | 
| 5812 | } | 5815 | } | 
| 5813 | 5816 | ||
| 5814 | /* cpus with isolated domains */ | ||
| 5815 | static cpumask_var_t cpu_isolated_map; | ||
| 5816 | |||
| 5817 | /* Setup the mask of cpus configured for isolated domains */ | 5817 | /* Setup the mask of cpus configured for isolated domains */ | 
| 5818 | static int __init isolated_cpu_setup(char *str) | 5818 | static int __init isolated_cpu_setup(char *str) | 
| 5819 | { | 5819 | { | 
| @@ -5922,7 +5922,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5922 | * die on a /0 trap. | 5922 | * die on a /0 trap. | 
| 5923 | */ | 5923 | */ | 
| 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 5924 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 
| 5925 | sg->sgc->capacity_orig = sg->sgc->capacity; | ||
| 5926 | 5925 | ||
| 5927 | /* | 5926 | /* | 
| 5928 | * Make sure the first group of this domain contains the | 5927 | * Make sure the first group of this domain contains the | 
| @@ -6233,6 +6232,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6233 | */ | 6232 | */ | 
| 6234 | 6233 | ||
| 6235 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 6234 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | 
| 6235 | sd->flags |= SD_PREFER_SIBLING; | ||
| 6236 | sd->imbalance_pct = 110; | 6236 | sd->imbalance_pct = 110; | 
| 6237 | sd->smt_gain = 1178; /* ~15% */ | 6237 | sd->smt_gain = 1178; /* ~15% */ | 
| 6238 | 6238 | ||
| @@ -6998,7 +6998,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 6998 | */ | 6998 | */ | 
| 6999 | 6999 | ||
| 7000 | case CPU_ONLINE: | 7000 | case CPU_ONLINE: | 
| 7001 | case CPU_DOWN_FAILED: | ||
| 7002 | cpuset_update_active_cpus(true); | 7001 | cpuset_update_active_cpus(true); | 
| 7003 | break; | 7002 | break; | 
| 7004 | default: | 7003 | default: | 
| @@ -7010,8 +7009,30 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, | |||
| 7010 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 7009 | static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, | 
| 7011 | void *hcpu) | 7010 | void *hcpu) | 
| 7012 | { | 7011 | { | 
| 7013 | switch (action) { | 7012 | unsigned long flags; | 
| 7013 | long cpu = (long)hcpu; | ||
| 7014 | struct dl_bw *dl_b; | ||
| 7015 | |||
| 7016 | switch (action & ~CPU_TASKS_FROZEN) { | ||
| 7014 | case CPU_DOWN_PREPARE: | 7017 | case CPU_DOWN_PREPARE: | 
| 7018 | /* explicitly allow suspend */ | ||
| 7019 | if (!(action & CPU_TASKS_FROZEN)) { | ||
| 7020 | bool overflow; | ||
| 7021 | int cpus; | ||
| 7022 | |||
| 7023 | rcu_read_lock_sched(); | ||
| 7024 | dl_b = dl_bw_of(cpu); | ||
| 7025 | |||
| 7026 | raw_spin_lock_irqsave(&dl_b->lock, flags); | ||
| 7027 | cpus = dl_bw_cpus(cpu); | ||
| 7028 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | ||
| 7029 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | ||
| 7030 | |||
| 7031 | rcu_read_unlock_sched(); | ||
| 7032 | |||
| 7033 | if (overflow) | ||
| 7034 | return notifier_from_errno(-EBUSY); | ||
| 7035 | } | ||
| 7015 | cpuset_update_active_cpus(false); | 7036 | cpuset_update_active_cpus(false); | 
| 7016 | break; | 7037 | break; | 
| 7017 | case CPU_DOWN_PREPARE_FROZEN: | 7038 | case CPU_DOWN_PREPARE_FROZEN: | 
| @@ -7156,8 +7177,8 @@ void __init sched_init(void) | |||
| 7156 | rq->calc_load_active = 0; | 7177 | rq->calc_load_active = 0; | 
| 7157 | rq->calc_load_update = jiffies + LOAD_FREQ; | 7178 | rq->calc_load_update = jiffies + LOAD_FREQ; | 
| 7158 | init_cfs_rq(&rq->cfs); | 7179 | init_cfs_rq(&rq->cfs); | 
| 7159 | init_rt_rq(&rq->rt, rq); | 7180 | init_rt_rq(&rq->rt); | 
| 7160 | init_dl_rq(&rq->dl, rq); | 7181 | init_dl_rq(&rq->dl); | 
| 7161 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7182 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| 7162 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7183 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 
| 7163 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7184 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 
| @@ -7197,7 +7218,7 @@ void __init sched_init(void) | |||
| 7197 | #ifdef CONFIG_SMP | 7218 | #ifdef CONFIG_SMP | 
| 7198 | rq->sd = NULL; | 7219 | rq->sd = NULL; | 
| 7199 | rq->rd = NULL; | 7220 | rq->rd = NULL; | 
| 7200 | rq->cpu_capacity = SCHED_CAPACITY_SCALE; | 7221 | rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; | 
| 7201 | rq->post_schedule = 0; | 7222 | rq->post_schedule = 0; | 
| 7202 | rq->active_balance = 0; | 7223 | rq->active_balance = 0; | 
| 7203 | rq->next_balance = jiffies; | 7224 | rq->next_balance = jiffies; | 
| @@ -7796,7 +7817,7 @@ static int sched_rt_global_constraints(void) | |||
| 7796 | } | 7817 | } | 
| 7797 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7818 | #endif /* CONFIG_RT_GROUP_SCHED */ | 
| 7798 | 7819 | ||
| 7799 | static int sched_dl_global_constraints(void) | 7820 | static int sched_dl_global_validate(void) | 
| 7800 | { | 7821 | { | 
| 7801 | u64 runtime = global_rt_runtime(); | 7822 | u64 runtime = global_rt_runtime(); | 
| 7802 | u64 period = global_rt_period(); | 7823 | u64 period = global_rt_period(); | 
| @@ -7897,11 +7918,11 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
| 7897 | if (ret) | 7918 | if (ret) | 
| 7898 | goto undo; | 7919 | goto undo; | 
| 7899 | 7920 | ||
| 7900 | ret = sched_rt_global_constraints(); | 7921 | ret = sched_dl_global_validate(); | 
| 7901 | if (ret) | 7922 | if (ret) | 
| 7902 | goto undo; | 7923 | goto undo; | 
| 7903 | 7924 | ||
| 7904 | ret = sched_dl_global_constraints(); | 7925 | ret = sched_rt_global_constraints(); | 
| 7905 | if (ret) | 7926 | if (ret) | 
| 7906 | goto undo; | 7927 | goto undo; | 
| 7907 | 7928 | ||
| diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) | |||
| 69 | dl_b->total_bw = 0; | 69 | dl_b->total_bw = 0; | 
| 70 | } | 70 | } | 
| 71 | 71 | ||
| 72 | void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) | 72 | void init_dl_rq(struct dl_rq *dl_rq) | 
| 73 | { | 73 | { | 
| 74 | dl_rq->rb_root = RB_ROOT; | 74 | dl_rq->rb_root = RB_ROOT; | 
| 75 | 75 | ||
| @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) | |||
| 218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 218 | rq->post_schedule = has_pushable_dl_tasks(rq); | 
| 219 | } | 219 | } | 
| 220 | 220 | ||
| 221 | static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); | ||
| 222 | |||
| 223 | static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) | ||
| 224 | { | ||
| 225 | struct rq *later_rq = NULL; | ||
| 226 | bool fallback = false; | ||
| 227 | |||
| 228 | later_rq = find_lock_later_rq(p, rq); | ||
| 229 | |||
| 230 | if (!later_rq) { | ||
| 231 | int cpu; | ||
| 232 | |||
| 233 | /* | ||
| 234 | * If we cannot preempt any rq, fall back to pick any | ||
| 235 | * online cpu. | ||
| 236 | */ | ||
| 237 | fallback = true; | ||
| 238 | cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); | ||
| 239 | if (cpu >= nr_cpu_ids) { | ||
| 240 | /* | ||
| 241 | * Failed to find any suitable cpu. |||
| 242 | * The task will never come back! | ||
| 243 | */ | ||
| 244 | BUG_ON(dl_bandwidth_enabled()); | ||
| 245 | |||
| 246 | /* | ||
| 247 | * If admission control is disabled we | ||
| 248 | * try a little harder to let the task | ||
| 249 | * run. | ||
| 250 | */ | ||
| 251 | cpu = cpumask_any(cpu_active_mask); | ||
| 252 | } | ||
| 253 | later_rq = cpu_rq(cpu); | ||
| 254 | double_lock_balance(rq, later_rq); | ||
| 255 | } | ||
| 256 | |||
| 257 | deactivate_task(rq, p, 0); | ||
| 258 | set_task_cpu(p, later_rq->cpu); | ||
| 259 | activate_task(later_rq, p, ENQUEUE_REPLENISH); | ||
| 260 | |||
| 261 | if (!fallback) | ||
| 262 | resched_curr(later_rq); | ||
| 263 | |||
| 264 | double_unlock_balance(rq, later_rq); | ||
| 265 | } | ||
| 266 | |||
| 221 | #else | 267 | #else | 
| 222 | 268 | ||
| 223 | static inline | 269 | static inline | 
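dl_task_offline_migration() above picks a new runqueue for a deadline task whose timer fired on a runqueue that has since gone offline: it prefers a runqueue it can preempt, and failing that ignores affinity and takes any active CPU (which is only legitimate when admission control is disabled, hence the BUG_ON). The fallback selection reduces to a two-step mask scan, roughly as below (bitmask toy, illustrative only):

    /*
     * Prefer a CPU that is both active and allowed for the task; only if
     * none exists, ignore affinity and take any active CPU.
     */
    #include <stdio.h>

    #define NR_CPUS 8

    static int pick_cpu(unsigned long active, unsigned long allowed)
    {
        unsigned long both = active & allowed;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            if (both & (1UL << cpu))
                return cpu;
        /* "Try a little harder": any active CPU will do. */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            if (active & (1UL << cpu))
                return cpu;
        return -1;
    }

    int main(void)
    {
        /* CPUs 0-1 are offline; the task is only allowed on 0-1. */
        printf("picked cpu %d\n", pick_cpu(0xfc, 0x03));    /* prints 2 */
        return 0;
    }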
| @@ -514,7 +560,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 514 | unsigned long flags; | 560 | unsigned long flags; | 
| 515 | struct rq *rq; | 561 | struct rq *rq; | 
| 516 | 562 | ||
| 517 | rq = task_rq_lock(current, &flags); | 563 | rq = task_rq_lock(p, &flags); | 
| 518 | 564 | ||
| 519 | /* | 565 | /* | 
| 520 | * We need to take care of several possible races here: | 566 | * We need to take care of several possible races here: | 
| @@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 536 | sched_clock_tick(); | 582 | sched_clock_tick(); | 
| 537 | update_rq_clock(rq); | 583 | update_rq_clock(rq); | 
| 538 | 584 | ||
| 585 | #ifdef CONFIG_SMP | ||
| 586 | /* | ||
| 587 | * If we find that the rq the task was on is no longer | ||
| 588 | * available, we need to select a new rq. | ||
| 589 | */ | ||
| 590 | if (unlikely(!rq->online)) { | ||
| 591 | dl_task_offline_migration(rq, p); | ||
| 592 | goto unlock; | ||
| 593 | } | ||
| 594 | #endif | ||
| 595 | |||
| 539 | /* | 596 | /* | 
| 540 | * If the throttle happened during sched-out; like: | 597 | * If the throttle happened during sched-out; like: | 
| 541 | * | 598 | * | 
| @@ -569,7 +626,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
| 569 | push_dl_task(rq); | 626 | push_dl_task(rq); | 
| 570 | #endif | 627 | #endif | 
| 571 | unlock: | 628 | unlock: | 
| 572 | task_rq_unlock(rq, current, &flags); | 629 | task_rq_unlock(rq, p, &flags); | 
| 573 | 630 | ||
| 574 | return HRTIMER_NORESTART; | 631 | return HRTIMER_NORESTART; | 
| 575 | } | 632 | } | 
| @@ -914,6 +971,12 @@ static void yield_task_dl(struct rq *rq) | |||
| 914 | } | 971 | } | 
| 915 | update_rq_clock(rq); | 972 | update_rq_clock(rq); | 
| 916 | update_curr_dl(rq); | 973 | update_curr_dl(rq); | 
| 974 | /* | ||
| 975 | * Tell update_rq_clock() that we've just updated, | ||
| 976 | * so we don't do microscopic update in schedule() | ||
| 977 | * and double the fastpath cost. | ||
| 978 | */ | ||
| 979 | rq_clock_skip_update(rq, true); | ||
| 917 | } | 980 | } | 
| 918 | 981 | ||
| 919 | #ifdef CONFIG_SMP | 982 | #ifdef CONFIG_SMP | 
| @@ -1659,14 +1722,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
| 1659 | { | 1722 | { | 
| 1660 | int check_resched = 1; | 1723 | int check_resched = 1; | 
| 1661 | 1724 | ||
| 1662 | /* | ||
| 1663 | * If p is throttled, don't consider the possibility | ||
| 1664 | * of preempting rq->curr, the check will be done right | ||
| 1665 | * after its runtime will get replenished. | ||
| 1666 | */ | ||
| 1667 | if (unlikely(p->dl.dl_throttled)) | ||
| 1668 | return; | ||
| 1669 | |||
| 1670 | if (task_on_rq_queued(p) && rq->curr != p) { | 1725 | if (task_on_rq_queued(p) && rq->curr != p) { | 
| 1671 | #ifdef CONFIG_SMP | 1726 | #ifdef CONFIG_SMP | 
| 1672 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 1727 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && | 
| diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf858d25c..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 71 | if (!se) { | 71 | if (!se) { | 
| 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 72 | struct sched_avg *avg = &cpu_rq(cpu)->avg; | 
| 73 | P(avg->runnable_avg_sum); | 73 | P(avg->runnable_avg_sum); | 
| 74 | P(avg->runnable_avg_period); | 74 | P(avg->avg_period); | 
| 75 | return; | 75 | return; | 
| 76 | } | 76 | } | 
| 77 | 77 | ||
| @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
| 94 | P(se->load.weight); | 94 | P(se->load.weight); | 
| 95 | #ifdef CONFIG_SMP | 95 | #ifdef CONFIG_SMP | 
| 96 | P(se->avg.runnable_avg_sum); | 96 | P(se->avg.runnable_avg_sum); | 
| 97 | P(se->avg.runnable_avg_period); | 97 | P(se->avg.running_avg_sum); | 
| 98 | P(se->avg.avg_period); | ||
| 98 | P(se->avg.load_avg_contrib); | 99 | P(se->avg.load_avg_contrib); | 
| 100 | P(se->avg.utilization_avg_contrib); | ||
| 99 | P(se->avg.decay_count); | 101 | P(se->avg.decay_count); | 
| 100 | #endif | 102 | #endif | 
| 101 | #undef PN | 103 | #undef PN | 
| @@ -214,6 +216,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
| 214 | cfs_rq->runnable_load_avg); | 216 | cfs_rq->runnable_load_avg); | 
| 215 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 217 | SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", | 
| 216 | cfs_rq->blocked_load_avg); | 218 | cfs_rq->blocked_load_avg); | 
| 219 | SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", | ||
| 220 | cfs_rq->utilization_load_avg); | ||
| 217 | #ifdef CONFIG_FAIR_GROUP_SCHED | 221 | #ifdef CONFIG_FAIR_GROUP_SCHED | 
| 218 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 222 | SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", | 
| 219 | cfs_rq->tg_load_contrib); | 223 | cfs_rq->tg_load_contrib); | 
| @@ -636,8 +640,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 636 | P(se.load.weight); | 640 | P(se.load.weight); | 
| 637 | #ifdef CONFIG_SMP | 641 | #ifdef CONFIG_SMP | 
| 638 | P(se.avg.runnable_avg_sum); | 642 | P(se.avg.runnable_avg_sum); | 
| 639 | P(se.avg.runnable_avg_period); | 643 | P(se.avg.running_avg_sum); | 
| 644 | P(se.avg.avg_period); | ||
| 640 | P(se.avg.load_avg_contrib); | 645 | P(se.avg.load_avg_contrib); | 
| 646 | P(se.avg.utilization_avg_contrib); | ||
| 641 | P(se.avg.decay_count); | 647 | P(se.avg.decay_count); | 
| 642 | #endif | 648 | #endif | 
| 643 | P(policy); | 649 | P(policy); | 
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..ffeaa4105e48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); | |||
| 670 | static unsigned long task_h_load(struct task_struct *p); | 670 | static unsigned long task_h_load(struct task_struct *p); | 
| 671 | 671 | ||
| 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 672 | static inline void __update_task_entity_contrib(struct sched_entity *se); | 
| 673 | static inline void __update_task_entity_utilization(struct sched_entity *se); | ||
| 673 | 674 | ||
| 674 | /* Give new task start runnable values to heavy its load in infant time */ | 675 | /* Give new task start runnable values to heavy its load in infant time */ | 
| 675 | void init_task_runnable_average(struct task_struct *p) | 676 | void init_task_runnable_average(struct task_struct *p) | 
| @@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) | |||
| 677 | u32 slice; | 678 | u32 slice; | 
| 678 | 679 | ||
| 679 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 680 | slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; | 
| 680 | p->se.avg.runnable_avg_sum = slice; | 681 | p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; | 
| 681 | p->se.avg.runnable_avg_period = slice; | 682 | p->se.avg.avg_period = slice; | 
| 682 | __update_task_entity_contrib(&p->se); | 683 | __update_task_entity_contrib(&p->se); | 
| 684 | __update_task_entity_utilization(&p->se); | ||
| 683 | } | 685 | } | 
| 684 | #else | 686 | #else | 
| 685 | void init_task_runnable_average(struct task_struct *p) | 687 | void init_task_runnable_average(struct task_struct *p) | 
| @@ -1196,9 +1198,11 @@ static void task_numa_assign(struct task_numa_env *env, | |||
| 1196 | static bool load_too_imbalanced(long src_load, long dst_load, | 1198 | static bool load_too_imbalanced(long src_load, long dst_load, | 
| 1197 | struct task_numa_env *env) | 1199 | struct task_numa_env *env) | 
| 1198 | { | 1200 | { | 
| 1199 | long imb, old_imb; | ||
| 1200 | long orig_src_load, orig_dst_load; | ||
| 1201 | long src_capacity, dst_capacity; | 1201 | long src_capacity, dst_capacity; | 
| 1202 | long orig_src_load; | ||
| 1203 | long load_a, load_b; | ||
| 1204 | long moved_load; | ||
| 1205 | long imb; | ||
| 1202 | 1206 | ||
| 1203 | /* | 1207 | /* | 
| 1204 | * The load is corrected for the CPU capacity available on each node. | 1208 | * The load is corrected for the CPU capacity available on each node. | 
| @@ -1211,30 +1215,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, | |||
| 1211 | dst_capacity = env->dst_stats.compute_capacity; | 1215 | dst_capacity = env->dst_stats.compute_capacity; | 
| 1212 | 1216 | ||
| 1213 | /* We care about the slope of the imbalance, not the direction. */ | 1217 | /* We care about the slope of the imbalance, not the direction. */ | 
| 1214 | if (dst_load < src_load) | 1218 | load_a = dst_load; | 
| 1215 | swap(dst_load, src_load); | 1219 | load_b = src_load; | 
| 1220 | if (load_a < load_b) | ||
| 1221 | swap(load_a, load_b); | ||
| 1216 | 1222 | ||
| 1217 | /* Is the difference below the threshold? */ | 1223 | /* Is the difference below the threshold? */ | 
| 1218 | imb = dst_load * src_capacity * 100 - | 1224 | imb = load_a * src_capacity * 100 - | 
| 1219 | src_load * dst_capacity * env->imbalance_pct; | 1225 | load_b * dst_capacity * env->imbalance_pct; | 
| 1220 | if (imb <= 0) | 1226 | if (imb <= 0) | 
| 1221 | return false; | 1227 | return false; | 
| 1222 | 1228 | ||
| 1223 | /* | 1229 | /* | 
| 1224 | * The imbalance is above the allowed threshold. | 1230 | * The imbalance is above the allowed threshold. | 
| 1225 | * Compare it with the old imbalance. | 1231 | * Allow a move that brings us closer to a balanced situation, | 
| 1232 | * without moving things past the point of balance. | ||
| 1226 | */ | 1233 | */ | 
| 1227 | orig_src_load = env->src_stats.load; | 1234 | orig_src_load = env->src_stats.load; | 
| 1228 | orig_dst_load = env->dst_stats.load; | ||
| 1229 | 1235 | ||
| 1230 | if (orig_dst_load < orig_src_load) | 1236 | /* | 
| 1231 | swap(orig_dst_load, orig_src_load); | 1237 | * In a task swap, there will be one load moving from src to dst, | 
| 1232 | 1238 | * and another moving back. This is the net sum of both moves. | |
| 1233 | old_imb = orig_dst_load * src_capacity * 100 - | 1239 | * A simple task move will always have a positive value. | 
| 1234 | orig_src_load * dst_capacity * env->imbalance_pct; | 1240 | * Allow the move if it brings the system closer to a balanced | 
| 1241 | * situation, without crossing over the balance point. | ||
| 1242 | */ | ||
| 1243 | moved_load = orig_src_load - src_load; | ||
| 1235 | 1244 | ||
| 1236 | /* Would this change make things worse? */ | 1245 | if (moved_load > 0) | 
| 1237 | return (imb > old_imb); | 1246 | /* Moving src -> dst. Did we overshoot balance? */ | 
| 1247 | return src_load * dst_capacity < dst_load * src_capacity; | ||
| 1248 | else | ||
| 1249 | /* Moving dst -> src. Did we overshoot balance? */ | ||
| 1250 | return dst_load * src_capacity < src_load * dst_capacity; | ||
| 1238 | } | 1251 | } | 
| 1239 | 1252 | ||
| 1240 | /* | 1253 | /* | 
| @@ -1609,9 +1622,11 @@ static void update_task_scan_period(struct task_struct *p, | |||
| 1609 | /* | 1622 | /* | 
| 1610 | * If there were no record hinting faults then either the task is | 1623 | * If there were no record hinting faults then either the task is | 
| 1611 | * completely idle or all activity is areas that are not of interest | 1624 | * completely idle or all activity is areas that are not of interest | 
| 1612 | * to automatic numa balancing. Scan slower | 1625 | * to automatic numa balancing. Related to that, if there were failed | 
| 1626 | * migration then it implies we are migrating too quickly or the local | ||
| 1627 | * node is overloaded. In either case, scan slower | ||
| 1613 | */ | 1628 | */ | 
| 1614 | if (local + shared == 0) { | 1629 | if (local + shared == 0 || p->numa_faults_locality[2]) { | 
| 1615 | p->numa_scan_period = min(p->numa_scan_period_max, | 1630 | p->numa_scan_period = min(p->numa_scan_period_max, | 
| 1616 | p->numa_scan_period << 1); | 1631 | p->numa_scan_period << 1); | 
| 1617 | 1632 | ||
| @@ -1673,7 +1688,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) | |||
| 1673 | *period = now - p->last_task_numa_placement; | 1688 | *period = now - p->last_task_numa_placement; | 
| 1674 | } else { | 1689 | } else { | 
| 1675 | delta = p->se.avg.runnable_avg_sum; | 1690 | delta = p->se.avg.runnable_avg_sum; | 
| 1676 | *period = p->se.avg.runnable_avg_period; | 1691 | *period = p->se.avg.avg_period; | 
| 1677 | } | 1692 | } | 
| 1678 | 1693 | ||
| 1679 | p->last_sum_exec_runtime = runtime; | 1694 | p->last_sum_exec_runtime = runtime; | 
| @@ -1763,6 +1778,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
| 1763 | } | 1778 | } | 
| 1764 | } | 1779 | } | 
| 1765 | /* Next round, evaluate the nodes within max_group. */ | 1780 | /* Next round, evaluate the nodes within max_group. */ | 
| 1781 | if (!max_faults) | ||
| 1782 | break; | ||
| 1766 | nodes = max_group; | 1783 | nodes = max_group; | 
| 1767 | } | 1784 | } | 
| 1768 | return nid; | 1785 | return nid; | 
| @@ -2080,6 +2097,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) | |||
| 2080 | 2097 | ||
| 2081 | if (migrated) | 2098 | if (migrated) | 
| 2082 | p->numa_pages_migrated += pages; | 2099 | p->numa_pages_migrated += pages; | 
| 2100 | if (flags & TNF_MIGRATE_FAIL) | ||
| 2101 | p->numa_faults_locality[2] += pages; | ||
| 2083 | 2102 | ||
| 2084 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; | 2103 | p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; | 
| 2085 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; | 2104 | p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; | 
| @@ -2161,8 +2180,10 @@ void task_numa_work(struct callback_head *work) | |||
| 2161 | vma = mm->mmap; | 2180 | vma = mm->mmap; | 
| 2162 | } | 2181 | } | 
| 2163 | for (; vma; vma = vma->vm_next) { | 2182 | for (; vma; vma = vma->vm_next) { | 
| 2164 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) | 2183 | if (!vma_migratable(vma) || !vma_policy_mof(vma) || | 
| 2184 | is_vm_hugetlb_page(vma)) { | ||
| 2165 | continue; | 2185 | continue; | 
| 2186 | } | ||
| 2166 | 2187 | ||
| 2167 | /* | 2188 | /* | 
| 2168 | * Shared library pages mapped by multiple processes are not | 2189 | * Shared library pages mapped by multiple processes are not | 
| @@ -2497,13 +2518,15 @@ static u32 __compute_runnable_contrib(u64 n) | |||
| 2497 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 2518 | * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) | 
| 2498 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 2519 | * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] | 
| 2499 | */ | 2520 | */ | 
| 2500 | static __always_inline int __update_entity_runnable_avg(u64 now, | 2521 | static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, | 
| 2501 | struct sched_avg *sa, | 2522 | struct sched_avg *sa, | 
| 2502 | int runnable) | 2523 | int runnable, | 
| 2524 | int running) | ||
| 2503 | { | 2525 | { | 
| 2504 | u64 delta, periods; | 2526 | u64 delta, periods; | 
| 2505 | u32 runnable_contrib; | 2527 | u32 runnable_contrib; | 
| 2506 | int delta_w, decayed = 0; | 2528 | int delta_w, decayed = 0; | 
| 2529 | unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); | ||
| 2507 | 2530 | ||
| 2508 | delta = now - sa->last_runnable_update; | 2531 | delta = now - sa->last_runnable_update; | 
| 2509 | /* | 2532 | /* | 
| @@ -2525,7 +2548,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2525 | sa->last_runnable_update = now; | 2548 | sa->last_runnable_update = now; | 
| 2526 | 2549 | ||
| 2527 | /* delta_w is the amount already accumulated against our next period */ | 2550 | /* delta_w is the amount already accumulated against our next period */ | 
| 2528 | delta_w = sa->runnable_avg_period % 1024; | 2551 | delta_w = sa->avg_period % 1024; | 
| 2529 | if (delta + delta_w >= 1024) { | 2552 | if (delta + delta_w >= 1024) { | 
| 2530 | /* period roll-over */ | 2553 | /* period roll-over */ | 
| 2531 | decayed = 1; | 2554 | decayed = 1; | 
| @@ -2538,7 +2561,10 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2538 | delta_w = 1024 - delta_w; | 2561 | delta_w = 1024 - delta_w; | 
| 2539 | if (runnable) | 2562 | if (runnable) | 
| 2540 | sa->runnable_avg_sum += delta_w; | 2563 | sa->runnable_avg_sum += delta_w; | 
| 2541 | sa->runnable_avg_period += delta_w; | 2564 | if (running) | 
| 2565 | sa->running_avg_sum += delta_w * scale_freq | ||
| 2566 | >> SCHED_CAPACITY_SHIFT; | ||
| 2567 | sa->avg_period += delta_w; | ||
| 2542 | 2568 | ||
| 2543 | delta -= delta_w; | 2569 | delta -= delta_w; | 
| 2544 | 2570 | ||
| @@ -2548,20 +2574,28 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
| 2548 | 2574 | ||
| 2549 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 2575 | sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, | 
| 2550 | periods + 1); | 2576 | periods + 1); | 
| 2551 | sa->runnable_avg_period = decay_load(sa->runnable_avg_period, | 2577 | sa->running_avg_sum = decay_load(sa->running_avg_sum, | 
| 2578 | periods + 1); | ||
| 2579 | sa->avg_period = decay_load(sa->avg_period, | ||
| 2552 | periods + 1); | 2580 | periods + 1); | 
| 2553 | 2581 | ||
| 2554 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 2582 | /* Efficiently calculate \sum (1..n_period) 1024*y^i */ | 
| 2555 | runnable_contrib = __compute_runnable_contrib(periods); | 2583 | runnable_contrib = __compute_runnable_contrib(periods); | 
| 2556 | if (runnable) | 2584 | if (runnable) | 
| 2557 | sa->runnable_avg_sum += runnable_contrib; | 2585 | sa->runnable_avg_sum += runnable_contrib; | 
| 2558 | sa->runnable_avg_period += runnable_contrib; | 2586 | if (running) | 
| 2587 | sa->running_avg_sum += runnable_contrib * scale_freq | ||
| 2588 | >> SCHED_CAPACITY_SHIFT; | ||
| 2589 | sa->avg_period += runnable_contrib; | ||
| 2559 | } | 2590 | } | 
| 2560 | 2591 | ||
| 2561 | /* Remainder of delta accrued against u_0` */ | 2592 | /* Remainder of delta accrued against u_0` */ | 
| 2562 | if (runnable) | 2593 | if (runnable) | 
| 2563 | sa->runnable_avg_sum += delta; | 2594 | sa->runnable_avg_sum += delta; | 
| 2564 | sa->runnable_avg_period += delta; | 2595 | if (running) | 
| 2596 | sa->running_avg_sum += delta * scale_freq | ||
| 2597 | >> SCHED_CAPACITY_SHIFT; | ||
| 2598 | sa->avg_period += delta; | ||
| 2565 | 2599 | ||
| 2566 | return decayed; | 2600 | return decayed; | 
| 2567 | } | 2601 | } | 
| @@ -2578,6 +2612,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) | |||
| 2578 | return 0; | 2612 | return 0; | 
| 2579 | 2613 | ||
| 2580 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 2614 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | 
| 2615 | se->avg.utilization_avg_contrib = | ||
| 2616 | decay_load(se->avg.utilization_avg_contrib, decays); | ||
| 2581 | 2617 | ||
| 2582 | return decays; | 2618 | return decays; | 
| 2583 | } | 2619 | } | 
| @@ -2613,7 +2649,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, | |||
| 2613 | 2649 | ||
| 2614 | /* The fraction of a cpu used by this cfs_rq */ | 2650 | /* The fraction of a cpu used by this cfs_rq */ | 
| 2615 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 2651 | contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, | 
| 2616 | sa->runnable_avg_period + 1); | 2652 | sa->avg_period + 1); | 
| 2617 | contrib -= cfs_rq->tg_runnable_contrib; | 2653 | contrib -= cfs_rq->tg_runnable_contrib; | 
| 2618 | 2654 | ||
| 2619 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 2655 | if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { | 
| @@ -2666,7 +2702,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) | |||
| 2666 | 2702 | ||
| 2667 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 2703 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 
| 2668 | { | 2704 | { | 
| 2669 | __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); | 2705 | __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, | 
| 2706 | runnable, runnable); | ||
| 2670 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 2707 | __update_tg_runnable_avg(&rq->avg, &rq->cfs); | 
| 2671 | } | 2708 | } | 
| 2672 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2709 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 
| @@ -2684,7 +2721,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) | |||
| 2684 | 2721 | ||
| 2685 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 2722 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | 
| 2686 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 2723 | contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); | 
| 2687 | contrib /= (se->avg.runnable_avg_period + 1); | 2724 | contrib /= (se->avg.avg_period + 1); | 
| 2688 | se->avg.load_avg_contrib = scale_load(contrib); | 2725 | se->avg.load_avg_contrib = scale_load(contrib); | 
| 2689 | } | 2726 | } | 
| 2690 | 2727 | ||
| @@ -2703,6 +2740,30 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
| 2703 | return se->avg.load_avg_contrib - old_contrib; | 2740 | return se->avg.load_avg_contrib - old_contrib; | 
| 2704 | } | 2741 | } | 
| 2705 | 2742 | ||
| 2743 | |||
| 2744 | static inline void __update_task_entity_utilization(struct sched_entity *se) | ||
| 2745 | { | ||
| 2746 | u32 contrib; | ||
| 2747 | |||
| 2748 | /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ | ||
| 2749 | contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); | ||
| 2750 | contrib /= (se->avg.avg_period + 1); | ||
| 2751 | se->avg.utilization_avg_contrib = scale_load(contrib); | ||
| 2752 | } | ||
| 2753 | |||
| 2754 | static long __update_entity_utilization_avg_contrib(struct sched_entity *se) | ||
| 2755 | { | ||
| 2756 | long old_contrib = se->avg.utilization_avg_contrib; | ||
| 2757 | |||
| 2758 | if (entity_is_task(se)) | ||
| 2759 | __update_task_entity_utilization(se); | ||
| 2760 | else | ||
| 2761 | se->avg.utilization_avg_contrib = | ||
| 2762 | group_cfs_rq(se)->utilization_load_avg; | ||
| 2763 | |||
| 2764 | return se->avg.utilization_avg_contrib - old_contrib; | ||
| 2765 | } | ||
| 2766 | |||
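The new helpers above turn the frequency-scaled running sum into a utilization contribution by dividing it by the accumulated period. A rough stand-alone model of that ratio, with LOAD_SCALE standing in for SCHED_LOAD_SCALE (an assumption of this sketch, not a value taken from the diff):

#include <stdio.h>

#define LOAD_SCALE 1024

/* ratio of time spent running, expressed on a 0..LOAD_SCALE scale,
 * mirroring the shape of __update_task_entity_utilization() */
static unsigned int utilization_contrib(unsigned int running_sum,
                                        unsigned int avg_period)
{
    return (unsigned int)((unsigned long long)running_sum * LOAD_SCALE /
                          (avg_period + 1));
}

int main(void)
{
    /* a task that ran 256 of the last 1024 accounted units */
    printf("contrib = %u\n", utilization_contrib(256, 1024));
    return 0;
}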
| 2706 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 2767 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | 
| 2707 | long load_contrib) | 2768 | long load_contrib) | 
| 2708 | { | 2769 | { | 
| @@ -2719,7 +2780,8 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2719 | int update_cfs_rq) | 2780 | int update_cfs_rq) | 
| 2720 | { | 2781 | { | 
| 2721 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 2782 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 
| 2722 | long contrib_delta; | 2783 | long contrib_delta, utilization_delta; | 
| 2784 | int cpu = cpu_of(rq_of(cfs_rq)); | ||
| 2723 | u64 now; | 2785 | u64 now; | 
| 2724 | 2786 | ||
| 2725 | /* | 2787 | /* | 
| @@ -2731,18 +2793,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, | |||
| 2731 | else | 2793 | else | 
| 2732 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 2794 | now = cfs_rq_clock_task(group_cfs_rq(se)); | 
| 2733 | 2795 | ||
| 2734 | if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) | 2796 | if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, | 
| 2797 | cfs_rq->curr == se)) | ||
| 2735 | return; | 2798 | return; | 
| 2736 | 2799 | ||
| 2737 | contrib_delta = __update_entity_load_avg_contrib(se); | 2800 | contrib_delta = __update_entity_load_avg_contrib(se); | 
| 2801 | utilization_delta = __update_entity_utilization_avg_contrib(se); | ||
| 2738 | 2802 | ||
| 2739 | if (!update_cfs_rq) | 2803 | if (!update_cfs_rq) | 
| 2740 | return; | 2804 | return; | 
| 2741 | 2805 | ||
| 2742 | if (se->on_rq) | 2806 | if (se->on_rq) { | 
| 2743 | cfs_rq->runnable_load_avg += contrib_delta; | 2807 | cfs_rq->runnable_load_avg += contrib_delta; | 
| 2744 | else | 2808 | cfs_rq->utilization_load_avg += utilization_delta; | 
| 2809 | } else { | ||
| 2745 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 2810 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | 
| 2811 | } | ||
| 2746 | } | 2812 | } | 
| 2747 | 2813 | ||
| 2748 | /* | 2814 | /* | 
| @@ -2817,6 +2883,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2817 | } | 2883 | } | 
| 2818 | 2884 | ||
| 2819 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 2885 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 
| 2886 | cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; | ||
| 2820 | /* we force update consideration on load-balancer moves */ | 2887 | /* we force update consideration on load-balancer moves */ | 
| 2821 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 2888 | update_cfs_rq_blocked_load(cfs_rq, !wakeup); | 
| 2822 | } | 2889 | } | 
| @@ -2835,6 +2902,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
| 2835 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 2902 | update_cfs_rq_blocked_load(cfs_rq, !sleep); | 
| 2836 | 2903 | ||
| 2837 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 2904 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 
| 2905 | cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; | ||
| 2838 | if (sleep) { | 2906 | if (sleep) { | 
| 2839 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 2907 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | 
| 2840 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 2908 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 
| @@ -3172,6 +3240,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 3172 | */ | 3240 | */ | 
| 3173 | update_stats_wait_end(cfs_rq, se); | 3241 | update_stats_wait_end(cfs_rq, se); | 
| 3174 | __dequeue_entity(cfs_rq, se); | 3242 | __dequeue_entity(cfs_rq, se); | 
| 3243 | update_entity_load_avg(se, 1); | ||
| 3175 | } | 3244 | } | 
| 3176 | 3245 | ||
| 3177 | update_stats_curr_start(cfs_rq, se); | 3246 | update_stats_curr_start(cfs_rq, se); | 
| @@ -4298,6 +4367,11 @@ static unsigned long capacity_of(int cpu) | |||
| 4298 | return cpu_rq(cpu)->cpu_capacity; | 4367 | return cpu_rq(cpu)->cpu_capacity; | 
| 4299 | } | 4368 | } | 
| 4300 | 4369 | ||
| 4370 | static unsigned long capacity_orig_of(int cpu) | ||
| 4371 | { | ||
| 4372 | return cpu_rq(cpu)->cpu_capacity_orig; | ||
| 4373 | } | ||
| 4374 | |||
| 4301 | static unsigned long cpu_avg_load_per_task(int cpu) | 4375 | static unsigned long cpu_avg_load_per_task(int cpu) | 
| 4302 | { | 4376 | { | 
| 4303 | struct rq *rq = cpu_rq(cpu); | 4377 | struct rq *rq = cpu_rq(cpu); | 
| @@ -4711,6 +4785,33 @@ next: | |||
| 4711 | done: | 4785 | done: | 
| 4712 | return target; | 4786 | return target; | 
| 4713 | } | 4787 | } | 
| 4788 | /* | ||
| 4789 | * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS | ||
| 4790 | * tasks. The unit of the return value must be the same as the one of capacity | ||
| 4791 | * so we can compare the usage with the capacity of the CPU that is available | ||
| 4792 | * for CFS tasks (i.e. cpu_capacity). | ||
| 4793 | * cfs.utilization_load_avg is the sum of running time of runnable tasks on a | ||
| 4794 | * CPU. It represents the amount of utilization of a CPU in the range | ||
| 4795 | * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full | ||
| 4796 | * capacity of the CPU because it's about the running time on this CPU. | ||
| 4797 | * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE | ||
| 4798 | * because of unfortunate rounding in avg_period and running_load_avg or just | ||
| 4799 | * after migrating tasks until the average stabilizes with the new running | ||
| 4800 | * time. So we need to check that the usage stays within the range | ||
| 4801 | * [0..cpu_capacity_orig] and cap if necessary. | ||
| 4802 | * Without capping the usage, a group could be seen as overloaded (CPU0 usage | ||
| 4803 | * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity | ||
| 4804 | */ | ||
| 4805 | static int get_cpu_usage(int cpu) | ||
| 4806 | { | ||
| 4807 | unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; | ||
| 4808 | unsigned long capacity = capacity_orig_of(cpu); | ||
| 4809 | |||
| 4810 | if (usage >= SCHED_LOAD_SCALE) | ||
| 4811 | return capacity; | ||
| 4812 | |||
| 4813 | return (usage * capacity) >> SCHED_LOAD_SHIFT; | ||
| 4814 | } | ||
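get_cpu_usage() rescales the utilization into capacity units and caps it at the CPU's original capacity. The same arithmetic in a self-contained sketch; the constants mirror SCHED_LOAD_SCALE/SCHED_LOAD_SHIFT but are assumptions of this example:

#include <stdio.h>

#define LOAD_SCALE  1024
#define LOAD_SHIFT  10

/* mirror of get_cpu_usage(): scale usage to capacity units and cap it */
static unsigned long cpu_usage(unsigned long util, unsigned long cap_orig)
{
    if (util >= LOAD_SCALE)          /* rounding/migration overshoot */
        return cap_orig;
    return (util * cap_orig) >> LOAD_SHIFT;
}

int main(void)
{
    printf("%lu\n", cpu_usage(512, 800));   /* half busy on an 800 CPU -> 400 */
    printf("%lu\n", cpu_usage(1100, 800));  /* overshoot is capped     -> 800 */
    return 0;
}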
| 4714 | 4815 | ||
| 4715 | /* | 4816 | /* | 
| 4716 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 4817 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 
| @@ -5837,12 +5938,12 @@ struct sg_lb_stats { | |||
| 5837 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 5938 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | 
| 5838 | unsigned long load_per_task; | 5939 | unsigned long load_per_task; | 
| 5839 | unsigned long group_capacity; | 5940 | unsigned long group_capacity; | 
| 5941 | unsigned long group_usage; /* Total usage of the group */ | ||
| 5840 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 5942 | unsigned int sum_nr_running; /* Nr tasks running in the group */ | 
| 5841 | unsigned int group_capacity_factor; | ||
| 5842 | unsigned int idle_cpus; | 5943 | unsigned int idle_cpus; | 
| 5843 | unsigned int group_weight; | 5944 | unsigned int group_weight; | 
| 5844 | enum group_type group_type; | 5945 | enum group_type group_type; | 
| 5845 | int group_has_free_capacity; | 5946 | int group_no_capacity; | 
| 5846 | #ifdef CONFIG_NUMA_BALANCING | 5947 | #ifdef CONFIG_NUMA_BALANCING | 
| 5847 | unsigned int nr_numa_running; | 5948 | unsigned int nr_numa_running; | 
| 5848 | unsigned int nr_preferred_running; | 5949 | unsigned int nr_preferred_running; | 
| @@ -5913,16 +6014,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, | |||
| 5913 | return load_idx; | 6014 | return load_idx; | 
| 5914 | } | 6015 | } | 
| 5915 | 6016 | ||
| 5916 | static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) | ||
| 5917 | { | ||
| 5918 | return SCHED_CAPACITY_SCALE; | ||
| 5919 | } | ||
| 5920 | |||
| 5921 | unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 5922 | { | ||
| 5923 | return default_scale_capacity(sd, cpu); | ||
| 5924 | } | ||
| 5925 | |||
| 5926 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 6017 | static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) | 
| 5927 | { | 6018 | { | 
| 5928 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 6019 | if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) | 
| @@ -5939,7 +6030,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5939 | static unsigned long scale_rt_capacity(int cpu) | 6030 | static unsigned long scale_rt_capacity(int cpu) | 
| 5940 | { | 6031 | { | 
| 5941 | struct rq *rq = cpu_rq(cpu); | 6032 | struct rq *rq = cpu_rq(cpu); | 
| 5942 | u64 total, available, age_stamp, avg; | 6033 | u64 total, used, age_stamp, avg; | 
| 5943 | s64 delta; | 6034 | s64 delta; | 
| 5944 | 6035 | ||
| 5945 | /* | 6036 | /* | 
| @@ -5955,19 +6046,12 @@ static unsigned long scale_rt_capacity(int cpu) | |||
| 5955 | 6046 | ||
| 5956 | total = sched_avg_period() + delta; | 6047 | total = sched_avg_period() + delta; | 
| 5957 | 6048 | ||
| 5958 | if (unlikely(total < avg)) { | 6049 | used = div_u64(avg, total); | 
| 5959 | /* Ensures that capacity won't end up being negative */ | ||
| 5960 | available = 0; | ||
| 5961 | } else { | ||
| 5962 | available = total - avg; | ||
| 5963 | } | ||
| 5964 | |||
| 5965 | if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) | ||
| 5966 | total = SCHED_CAPACITY_SCALE; | ||
| 5967 | 6050 | ||
| 5968 | total >>= SCHED_CAPACITY_SHIFT; | 6051 | if (likely(used < SCHED_CAPACITY_SCALE)) | 
| 6052 | return SCHED_CAPACITY_SCALE - used; | ||
| 5969 | 6053 | ||
| 5970 | return div_u64(available, total); | 6054 | return 1; | 
| 5971 | } | 6055 | } | 
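The reworked scale_rt_capacity() no longer needs the explicit frequency scaling or the negative-capacity guard: it divides the (already scaled) rt average by the period and returns what is left for CFS, never reporting less than 1. A simplified user-space model of that computation:

#include <stdio.h>
#include <stdint.h>

#define CAPACITY_SCALE 1024UL

/* fraction of capacity left for CFS once the rt average is divided
 * by the averaging period, mirroring the new scale_rt_capacity() */
static unsigned long cfs_fraction(uint64_t rt_avg, uint64_t total_period)
{
    uint64_t used = rt_avg / total_period;

    if (used < CAPACITY_SCALE)
        return CAPACITY_SCALE - used;
    return 1;   /* never report zero capacity */
}

int main(void)
{
    printf("%lu\n", cfs_fraction(256 * 1000, 1000));  /* 25% eaten by RT -> 768 */
    printf("%lu\n", cfs_fraction(2000 * 1000, 1000)); /* saturated       -> 1   */
    return 0;
}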
| 5972 | 6056 | ||
| 5973 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 6057 | static void update_cpu_capacity(struct sched_domain *sd, int cpu) | 
| @@ -5982,14 +6066,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 5982 | 6066 | ||
| 5983 | capacity >>= SCHED_CAPACITY_SHIFT; | 6067 | capacity >>= SCHED_CAPACITY_SHIFT; | 
| 5984 | 6068 | ||
| 5985 | sdg->sgc->capacity_orig = capacity; | 6069 | cpu_rq(cpu)->cpu_capacity_orig = capacity; | 
| 5986 | |||
| 5987 | if (sched_feat(ARCH_CAPACITY)) | ||
| 5988 | capacity *= arch_scale_freq_capacity(sd, cpu); | ||
| 5989 | else | ||
| 5990 | capacity *= default_scale_capacity(sd, cpu); | ||
| 5991 | |||
| 5992 | capacity >>= SCHED_CAPACITY_SHIFT; | ||
| 5993 | 6070 | ||
| 5994 | capacity *= scale_rt_capacity(cpu); | 6071 | capacity *= scale_rt_capacity(cpu); | 
| 5995 | capacity >>= SCHED_CAPACITY_SHIFT; | 6072 | capacity >>= SCHED_CAPACITY_SHIFT; | 
| @@ -6005,7 +6082,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6005 | { | 6082 | { | 
| 6006 | struct sched_domain *child = sd->child; | 6083 | struct sched_domain *child = sd->child; | 
| 6007 | struct sched_group *group, *sdg = sd->groups; | 6084 | struct sched_group *group, *sdg = sd->groups; | 
| 6008 | unsigned long capacity, capacity_orig; | 6085 | unsigned long capacity; | 
| 6009 | unsigned long interval; | 6086 | unsigned long interval; | 
| 6010 | 6087 | ||
| 6011 | interval = msecs_to_jiffies(sd->balance_interval); | 6088 | interval = msecs_to_jiffies(sd->balance_interval); | 
| @@ -6017,7 +6094,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6017 | return; | 6094 | return; | 
| 6018 | } | 6095 | } | 
| 6019 | 6096 | ||
| 6020 | capacity_orig = capacity = 0; | 6097 | capacity = 0; | 
| 6021 | 6098 | ||
| 6022 | if (child->flags & SD_OVERLAP) { | 6099 | if (child->flags & SD_OVERLAP) { | 
| 6023 | /* | 6100 | /* | 
| @@ -6037,19 +6114,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6037 | * Use capacity_of(), which is set irrespective of domains | 6114 | * Use capacity_of(), which is set irrespective of domains | 
| 6038 | * in update_cpu_capacity(). | 6115 | * in update_cpu_capacity(). | 
| 6039 | * | 6116 | * | 
| 6040 | * This avoids capacity/capacity_orig from being 0 and | 6117 | * This avoids capacity from being 0 and | 
| 6041 | * causing divide-by-zero issues on boot. | 6118 | * causing divide-by-zero issues on boot. | 
| 6042 | * | ||
| 6043 | * Runtime updates will correct capacity_orig. | ||
| 6044 | */ | 6119 | */ | 
| 6045 | if (unlikely(!rq->sd)) { | 6120 | if (unlikely(!rq->sd)) { | 
| 6046 | capacity_orig += capacity_of(cpu); | ||
| 6047 | capacity += capacity_of(cpu); | 6121 | capacity += capacity_of(cpu); | 
| 6048 | continue; | 6122 | continue; | 
| 6049 | } | 6123 | } | 
| 6050 | 6124 | ||
| 6051 | sgc = rq->sd->groups->sgc; | 6125 | sgc = rq->sd->groups->sgc; | 
| 6052 | capacity_orig += sgc->capacity_orig; | ||
| 6053 | capacity += sgc->capacity; | 6126 | capacity += sgc->capacity; | 
| 6054 | } | 6127 | } | 
| 6055 | } else { | 6128 | } else { | 
| @@ -6060,39 +6133,24 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
| 6060 | 6133 | ||
| 6061 | group = child->groups; | 6134 | group = child->groups; | 
| 6062 | do { | 6135 | do { | 
| 6063 | capacity_orig += group->sgc->capacity_orig; | ||
| 6064 | capacity += group->sgc->capacity; | 6136 | capacity += group->sgc->capacity; | 
| 6065 | group = group->next; | 6137 | group = group->next; | 
| 6066 | } while (group != child->groups); | 6138 | } while (group != child->groups); | 
| 6067 | } | 6139 | } | 
| 6068 | 6140 | ||
| 6069 | sdg->sgc->capacity_orig = capacity_orig; | ||
| 6070 | sdg->sgc->capacity = capacity; | 6141 | sdg->sgc->capacity = capacity; | 
| 6071 | } | 6142 | } | 
| 6072 | 6143 | ||
| 6073 | /* | 6144 | /* | 
| 6074 | * Try and fix up capacity for tiny siblings, this is needed when | 6145 | * Check whether the capacity of the rq has been noticeably reduced by side | 
| 6075 | * things like SD_ASYM_PACKING need f_b_g to select another sibling | 6146 | * activity. The imbalance_pct is used for the threshold. | 
| 6076 | * which on its own isn't powerful enough. | 6147 | * Return true if the capacity is reduced. | 
| 6077 | * | ||
| 6078 | * See update_sd_pick_busiest() and check_asym_packing(). | ||
| 6079 | */ | 6148 | */ | 
| 6080 | static inline int | 6149 | static inline int | 
| 6081 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 6150 | check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | 
| 6082 | { | 6151 | { | 
| 6083 | /* | 6152 | return ((rq->cpu_capacity * sd->imbalance_pct) < | 
| 6084 | * Only siblings can have significantly less than SCHED_CAPACITY_SCALE | 6153 | (rq->cpu_capacity_orig * 100)); | 
| 6085 | */ | ||
| 6086 | if (!(sd->flags & SD_SHARE_CPUCAPACITY)) | ||
| 6087 | return 0; | ||
| 6088 | |||
| 6089 | /* | ||
| 6090 | * If ~90% of the cpu_capacity is still there, we're good. | ||
| 6091 | */ | ||
| 6092 | if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) | ||
| 6093 | return 1; | ||
| 6094 | |||
| 6095 | return 0; | ||
| 6096 | } | 6154 | } | 
| 6097 | 6155 | ||
| 6098 | /* | 6156 | /* | 
| @@ -6130,37 +6188,56 @@ static inline int sg_imbalanced(struct sched_group *group) | |||
| 6130 | } | 6188 | } | 
| 6131 | 6189 | ||
| 6132 | /* | 6190 | /* | 
| 6133 | * Compute the group capacity factor. | 6191 | * group_has_capacity returns true if the group has spare capacity that could | 
| 6134 | * | 6192 | * be used by some tasks. | 
| 6135 | * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by | 6193 | * We consider that a group has spare capacity if the number of tasks is | 
| 6136 | * first dividing out the smt factor and computing the actual number of cores | 6194 | * smaller than the number of CPUs or if the usage is lower than the available | 
| 6137 | * and limit unit capacity with that. | 6195 | * capacity for CFS tasks. | 
| 6196 | * For the latter, we use a threshold to stabilize the state, to take into | ||
| 6197 | * account the variance of the tasks' load and to return true if the available | ||
| 6198 | * capacity is meaningful for the load balancer. | ||
| 6199 | * As an example, an available capacity of 1% can appear but it doesn't bring | ||
| 6200 | * any benefit to the load balancer. | ||
| 6138 | */ | 6201 | */ | 
| 6139 | static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) | 6202 | static inline bool | 
| 6203 | group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6140 | { | 6204 | { | 
| 6141 | unsigned int capacity_factor, smt, cpus; | 6205 | if (sgs->sum_nr_running < sgs->group_weight) | 
| 6142 | unsigned int capacity, capacity_orig; | 6206 | return true; | 
| 6143 | 6207 | ||
| 6144 | capacity = group->sgc->capacity; | 6208 | if ((sgs->group_capacity * 100) > | 
| 6145 | capacity_orig = group->sgc->capacity_orig; | 6209 | (sgs->group_usage * env->sd->imbalance_pct)) | 
| 6146 | cpus = group->group_weight; | 6210 | return true; | 
| 6211 | |||
| 6212 | return false; | ||
| 6213 | } | ||
| 6147 | 6214 | ||
| 6148 | /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ | 6215 | /* | 
| 6149 | smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); | 6216 | * group_is_overloaded returns true if the group has more tasks than it can | 
| 6150 | capacity_factor = cpus / smt; /* cores */ | 6217 | * handle. | 
| 6218 | * group_is_overloaded is not equals to !group_has_capacity because a group | ||
| 6219 | * with the exact right number of tasks, has no more spare capacity but is not | ||
| 6220 | * overloaded so both group_has_capacity and group_is_overloaded return | ||
| 6221 | * false. | ||
| 6222 | */ | ||
| 6223 | static inline bool | ||
| 6224 | group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | ||
| 6225 | { | ||
| 6226 | if (sgs->sum_nr_running <= sgs->group_weight) | ||
| 6227 | return false; | ||
| 6151 | 6228 | ||
| 6152 | capacity_factor = min_t(unsigned, | 6229 | if ((sgs->group_capacity * 100) < | 
| 6153 | capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); | 6230 | (sgs->group_usage * env->sd->imbalance_pct)) | 
| 6154 | if (!capacity_factor) | 6231 | return true; | 
| 6155 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6156 | 6232 | ||
| 6157 | return capacity_factor; | 6233 | return false; | 
| 6158 | } | 6234 | } | 
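group_has_capacity() and group_is_overloaded() replace the old capacity_factor logic with task-count and usage-versus-capacity tests. The sketch below reproduces both predicates on a toy stats structure and exercises the in-between state described in the comment, where neither returns true; the struct and field names are simplified stand-ins for sg_lb_stats:

#include <stdio.h>
#include <stdbool.h>

struct toy_sgs {
    unsigned int nr_running;
    unsigned int weight;         /* number of CPUs in the group */
    unsigned long capacity;
    unsigned long usage;
};

static bool has_capacity(const struct toy_sgs *s, unsigned int imb_pct)
{
    if (s->nr_running < s->weight)
        return true;
    return s->capacity * 100 > s->usage * imb_pct;
}

static bool is_overloaded(const struct toy_sgs *s, unsigned int imb_pct)
{
    if (s->nr_running <= s->weight)
        return false;
    return s->capacity * 100 < s->usage * imb_pct;
}

int main(void)
{
    /* exactly as many tasks as CPUs and fully used:
     * no spare capacity, yet not overloaded either */
    struct toy_sgs s = { .nr_running = 4, .weight = 4,
                         .capacity = 4096, .usage = 4096 };

    printf("has=%d over=%d\n", has_capacity(&s, 125), is_overloaded(&s, 125));
    return 0;
}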
| 6159 | 6235 | ||
| 6160 | static enum group_type | 6236 | static enum group_type group_classify(struct lb_env *env, | 
| 6161 | group_classify(struct sched_group *group, struct sg_lb_stats *sgs) | 6237 | struct sched_group *group, | 
| 6238 | struct sg_lb_stats *sgs) | ||
| 6162 | { | 6239 | { | 
| 6163 | if (sgs->sum_nr_running > sgs->group_capacity_factor) | 6240 | if (sgs->group_no_capacity) | 
| 6164 | return group_overloaded; | 6241 | return group_overloaded; | 
| 6165 | 6242 | ||
| 6166 | if (sg_imbalanced(group)) | 6243 | if (sg_imbalanced(group)) | 
| @@ -6198,6 +6275,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6198 | load = source_load(i, load_idx); | 6275 | load = source_load(i, load_idx); | 
| 6199 | 6276 | ||
| 6200 | sgs->group_load += load; | 6277 | sgs->group_load += load; | 
| 6278 | sgs->group_usage += get_cpu_usage(i); | ||
| 6201 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 6279 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 
| 6202 | 6280 | ||
| 6203 | if (rq->nr_running > 1) | 6281 | if (rq->nr_running > 1) | 
| @@ -6220,11 +6298,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 6220 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 6298 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 
| 6221 | 6299 | ||
| 6222 | sgs->group_weight = group->group_weight; | 6300 | sgs->group_weight = group->group_weight; | 
| 6223 | sgs->group_capacity_factor = sg_capacity_factor(env, group); | ||
| 6224 | sgs->group_type = group_classify(group, sgs); | ||
| 6225 | 6301 | ||
| 6226 | if (sgs->group_capacity_factor > sgs->sum_nr_running) | 6302 | sgs->group_no_capacity = group_is_overloaded(env, sgs); | 
| 6227 | sgs->group_has_free_capacity = 1; | 6303 | sgs->group_type = group_classify(env, group, sgs); | 
| 6228 | } | 6304 | } | 
| 6229 | 6305 | ||
| 6230 | /** | 6306 | /** | 
| @@ -6346,18 +6422,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd | |||
| 6346 | 6422 | ||
| 6347 | /* | 6423 | /* | 
| 6348 | * In case the child domain prefers tasks go to siblings | 6424 | * In case the child domain prefers tasks go to siblings | 
| 6349 | * first, lower the sg capacity factor to one so that we'll try | 6425 | * first, lower the sg capacity so that we'll try | 
| 6350 | * and move all the excess tasks away. We lower the capacity | 6426 | * and move all the excess tasks away. We lower the capacity | 
| 6351 | * of a group only if the local group has the capacity to fit | 6427 | * of a group only if the local group has the capacity to fit | 
| 6352 | * these excess tasks, i.e. nr_running < group_capacity_factor. The | 6428 | * these excess tasks. The extra check prevents the case where | 
| 6353 | * extra check prevents the case where you always pull from the | 6429 | * you always pull from the heaviest group when it is already | 
| 6354 | * heaviest group when it is already under-utilized (possible | 6430 | * under-utilized (possible with a large weight task outweighs | 
| 6355 | * with a large weight task outweighs the tasks on the system). | 6431 | * the tasks on the system). | 
| 6356 | */ | 6432 | */ | 
| 6357 | if (prefer_sibling && sds->local && | 6433 | if (prefer_sibling && sds->local && | 
| 6358 | sds->local_stat.group_has_free_capacity) { | 6434 | group_has_capacity(env, &sds->local_stat) && | 
| 6359 | sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); | 6435 | (sgs->sum_nr_running > 1)) { | 
| 6360 | sgs->group_type = group_classify(sg, sgs); | 6436 | sgs->group_no_capacity = 1; | 
| 6437 | sgs->group_type = group_overloaded; | ||
| 6361 | } | 6438 | } | 
| 6362 | 6439 | ||
| 6363 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 6440 | if (update_sd_pick_busiest(env, sds, sg, sgs)) { | 
| @@ -6537,11 +6614,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 6537 | */ | 6614 | */ | 
| 6538 | if (busiest->group_type == group_overloaded && | 6615 | if (busiest->group_type == group_overloaded && | 
| 6539 | local->group_type == group_overloaded) { | 6616 | local->group_type == group_overloaded) { | 
| 6540 | load_above_capacity = | 6617 | load_above_capacity = busiest->sum_nr_running * | 
| 6541 | (busiest->sum_nr_running - busiest->group_capacity_factor); | 6618 | SCHED_LOAD_SCALE; | 
| 6542 | 6619 | if (load_above_capacity > busiest->group_capacity) | |
| 6543 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); | 6620 | load_above_capacity -= busiest->group_capacity; | 
| 6544 | load_above_capacity /= busiest->group_capacity; | 6621 | else | 
| 6622 | load_above_capacity = ~0UL; | ||
| 6545 | } | 6623 | } | 
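The new imbalance path expresses the excess load directly as nr_running * SCHED_LOAD_SCALE minus the group capacity, with ~0UL as a sentinel when there is no excess so this term stops limiting the pull. Approximately, as a stand-alone sketch:

#include <stdio.h>

#define LOAD_SCALE 1024

static unsigned long load_above_capacity(unsigned int nr_running,
                                         unsigned long group_capacity)
{
    unsigned long load = (unsigned long)nr_running * LOAD_SCALE;

    if (load > group_capacity)
        return load - group_capacity;
    return ~0UL;   /* no excess: the sentinel keeps this term from capping */
}

int main(void)
{
    printf("%lu\n", load_above_capacity(6, 4096));  /* 6144 - 4096 = 2048 */
    printf("%lu\n", load_above_capacity(3, 4096));  /* sentinel ~0UL      */
    return 0;
}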
| 6546 | 6624 | ||
| 6547 | /* | 6625 | /* | 
| @@ -6604,6 +6682,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6604 | local = &sds.local_stat; | 6682 | local = &sds.local_stat; | 
| 6605 | busiest = &sds.busiest_stat; | 6683 | busiest = &sds.busiest_stat; | 
| 6606 | 6684 | ||
| 6685 | /* ASYM feature bypasses nice load balance check */ | ||
| 6607 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 6686 | if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && | 
| 6608 | check_asym_packing(env, &sds)) | 6687 | check_asym_packing(env, &sds)) | 
| 6609 | return sds.busiest; | 6688 | return sds.busiest; | 
| @@ -6624,8 +6703,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 6624 | goto force_balance; | 6703 | goto force_balance; | 
| 6625 | 6704 | ||
| 6626 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 6705 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 
| 6627 | if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && | 6706 | if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && | 
| 6628 | !busiest->group_has_free_capacity) | 6707 | busiest->group_no_capacity) | 
| 6629 | goto force_balance; | 6708 | goto force_balance; | 
| 6630 | 6709 | ||
| 6631 | /* | 6710 | /* | 
| @@ -6684,7 +6763,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6684 | int i; | 6763 | int i; | 
| 6685 | 6764 | ||
| 6686 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 6765 | for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { | 
| 6687 | unsigned long capacity, capacity_factor, wl; | 6766 | unsigned long capacity, wl; | 
| 6688 | enum fbq_type rt; | 6767 | enum fbq_type rt; | 
| 6689 | 6768 | ||
| 6690 | rq = cpu_rq(i); | 6769 | rq = cpu_rq(i); | 
| @@ -6713,9 +6792,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6713 | continue; | 6792 | continue; | 
| 6714 | 6793 | ||
| 6715 | capacity = capacity_of(i); | 6794 | capacity = capacity_of(i); | 
| 6716 | capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); | ||
| 6717 | if (!capacity_factor) | ||
| 6718 | capacity_factor = fix_small_capacity(env->sd, group); | ||
| 6719 | 6795 | ||
| 6720 | wl = weighted_cpuload(i); | 6796 | wl = weighted_cpuload(i); | 
| 6721 | 6797 | ||
| @@ -6723,7 +6799,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 6723 | * When comparing with imbalance, use weighted_cpuload() | 6799 | * When comparing with imbalance, use weighted_cpuload() | 
| 6724 | * which is not scaled with the cpu capacity. | 6800 | * which is not scaled with the cpu capacity. | 
| 6725 | */ | 6801 | */ | 
| 6726 | if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) | 6802 | |
| 6803 | if (rq->nr_running == 1 && wl > env->imbalance && | ||
| 6804 | !check_cpu_capacity(rq, env->sd)) | ||
| 6727 | continue; | 6805 | continue; | 
| 6728 | 6806 | ||
| 6729 | /* | 6807 | /* | 
| @@ -6771,6 +6849,19 @@ static int need_active_balance(struct lb_env *env) | |||
| 6771 | return 1; | 6849 | return 1; | 
| 6772 | } | 6850 | } | 
| 6773 | 6851 | ||
| 6852 | /* | ||
| 6853 | * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. | ||
| 6854 | * It's worth migrating the task if the src_cpu's capacity is reduced | ||
| 6855 | * because of other sched_class or IRQs if more capacity stays | ||
| 6856 | * available on dst_cpu. | ||
| 6857 | */ | ||
| 6858 | if ((env->idle != CPU_NOT_IDLE) && | ||
| 6859 | (env->src_rq->cfs.h_nr_running == 1)) { | ||
| 6860 | if ((check_cpu_capacity(env->src_rq, sd)) && | ||
| 6861 | (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) | ||
| 6862 | return 1; | ||
| 6863 | } | ||
| 6864 | |||
| 6774 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 6865 | return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); | 
| 6775 | } | 6866 | } | 
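The added need_active_balance() clause migrates a lone CFS task away from a CPU whose capacity has been eaten by RT/IRQ activity, provided the idle destination keeps more capacity. A condensed model of that condition; the parameter names are illustrative, not kernel identifiers:

#include <stdio.h>
#include <stdbool.h>

static bool worth_active_migration(unsigned long src_cap,
                                   unsigned long src_cap_orig,
                                   unsigned long dst_cap,
                                   unsigned int imb_pct,
                                   unsigned int src_cfs_tasks,
                                   bool dst_idle)
{
    if (!dst_idle || src_cfs_tasks != 1)
        return false;
    if (src_cap * imb_pct >= src_cap_orig * 100)   /* not reduced enough */
        return false;
    return src_cap * imb_pct < dst_cap * 100;      /* destination has headroom */
}

int main(void)
{
    /* source CPU hammered by IRQs (600/1024), idle destination at 1024 */
    printf("%d\n", worth_active_migration(600, 1024, 1024, 125, 1, true));
    return 0;
}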
| 6776 | 6867 | ||
| @@ -6870,6 +6961,9 @@ redo: | |||
| 6870 | 6961 | ||
| 6871 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 6962 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 
| 6872 | 6963 | ||
| 6964 | env.src_cpu = busiest->cpu; | ||
| 6965 | env.src_rq = busiest; | ||
| 6966 | |||
| 6873 | ld_moved = 0; | 6967 | ld_moved = 0; | 
| 6874 | if (busiest->nr_running > 1) { | 6968 | if (busiest->nr_running > 1) { | 
| 6875 | /* | 6969 | /* | 
| @@ -6879,8 +6973,6 @@ redo: | |||
| 6879 | * correctly treated as an imbalance. | 6973 | * correctly treated as an imbalance. | 
| 6880 | */ | 6974 | */ | 
| 6881 | env.flags |= LBF_ALL_PINNED; | 6975 | env.flags |= LBF_ALL_PINNED; | 
| 6882 | env.src_cpu = busiest->cpu; | ||
| 6883 | env.src_rq = busiest; | ||
| 6884 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 6976 | env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); | 
| 6885 | 6977 | ||
| 6886 | more_balance: | 6978 | more_balance: | 
| @@ -7580,22 +7672,25 @@ end: | |||
| 7580 | 7672 | ||
| 7581 | /* | 7673 | /* | 
| 7582 | * Current heuristic for kicking the idle load balancer in the presence | 7674 | * Current heuristic for kicking the idle load balancer in the presence | 
| 7583 | * of an idle cpu is the system. | 7675 | * of an idle cpu in the system. | 
| 7584 | * - This rq has more than one task. | 7676 | * - This rq has more than one task. | 
| 7585 | * - At any scheduler domain level, this cpu's scheduler group has multiple | 7677 | * - This rq has at least one CFS task and the capacity of the CPU is | 
| 7586 | * busy cpu's exceeding the group's capacity. | 7678 | * significantly reduced because of RT tasks or IRQs. | 
| 7679 | * - At the parent of the LLC scheduler domain level, this cpu's scheduler | ||
| 7680 | * group has multiple busy cpus. | ||
| 7587 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 7681 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 
| 7588 | * domain span are idle. | 7682 | * domain span are idle. | 
| 7589 | */ | 7683 | */ | 
| 7590 | static inline int nohz_kick_needed(struct rq *rq) | 7684 | static inline bool nohz_kick_needed(struct rq *rq) | 
| 7591 | { | 7685 | { | 
| 7592 | unsigned long now = jiffies; | 7686 | unsigned long now = jiffies; | 
| 7593 | struct sched_domain *sd; | 7687 | struct sched_domain *sd; | 
| 7594 | struct sched_group_capacity *sgc; | 7688 | struct sched_group_capacity *sgc; | 
| 7595 | int nr_busy, cpu = rq->cpu; | 7689 | int nr_busy, cpu = rq->cpu; | 
| 7690 | bool kick = false; | ||
| 7596 | 7691 | ||
| 7597 | if (unlikely(rq->idle_balance)) | 7692 | if (unlikely(rq->idle_balance)) | 
| 7598 | return 0; | 7693 | return false; | 
| 7599 | 7694 | ||
| 7600 | /* | 7695 | /* | 
| 7601 | * We may be recently in ticked or tickless idle mode. At the first | 7696 | * We may be recently in ticked or tickless idle mode. At the first | 
| @@ -7609,38 +7704,46 @@ static inline int nohz_kick_needed(struct rq *rq) | |||
| 7609 | * balancing. | 7704 | * balancing. | 
| 7610 | */ | 7705 | */ | 
| 7611 | if (likely(!atomic_read(&nohz.nr_cpus))) | 7706 | if (likely(!atomic_read(&nohz.nr_cpus))) | 
| 7612 | return 0; | 7707 | return false; | 
| 7613 | 7708 | ||
| 7614 | if (time_before(now, nohz.next_balance)) | 7709 | if (time_before(now, nohz.next_balance)) | 
| 7615 | return 0; | 7710 | return false; | 
| 7616 | 7711 | ||
| 7617 | if (rq->nr_running >= 2) | 7712 | if (rq->nr_running >= 2) | 
| 7618 | goto need_kick; | 7713 | return true; | 
| 7619 | 7714 | ||
| 7620 | rcu_read_lock(); | 7715 | rcu_read_lock(); | 
| 7621 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 7716 | sd = rcu_dereference(per_cpu(sd_busy, cpu)); | 
| 7622 | |||
| 7623 | if (sd) { | 7717 | if (sd) { | 
| 7624 | sgc = sd->groups->sgc; | 7718 | sgc = sd->groups->sgc; | 
| 7625 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 7719 | nr_busy = atomic_read(&sgc->nr_busy_cpus); | 
| 7626 | 7720 | ||
| 7627 | if (nr_busy > 1) | 7721 | if (nr_busy > 1) { | 
| 7628 | goto need_kick_unlock; | 7722 | kick = true; | 
| 7723 | goto unlock; | ||
| 7724 | } | ||
| 7725 | |||
| 7629 | } | 7726 | } | 
| 7630 | 7727 | ||
| 7631 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 7728 | sd = rcu_dereference(rq->sd); | 
| 7729 | if (sd) { | ||
| 7730 | if ((rq->cfs.h_nr_running >= 1) && | ||
| 7731 | check_cpu_capacity(rq, sd)) { | ||
| 7732 | kick = true; | ||
| 7733 | goto unlock; | ||
| 7734 | } | ||
| 7735 | } | ||
| 7632 | 7736 | ||
| 7737 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | ||
| 7633 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 7738 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 
| 7634 | sched_domain_span(sd)) < cpu)) | 7739 | sched_domain_span(sd)) < cpu)) { | 
| 7635 | goto need_kick_unlock; | 7740 | kick = true; | 
| 7636 | 7741 | goto unlock; | |
| 7637 | rcu_read_unlock(); | 7742 | } | 
| 7638 | return 0; | ||
| 7639 | 7743 | ||
| 7640 | need_kick_unlock: | 7744 | unlock: | 
| 7641 | rcu_read_unlock(); | 7745 | rcu_read_unlock(); | 
| 7642 | need_kick: | 7746 | return kick; | 
| 7643 | return 1; | ||
| 7644 | } | 7747 | } | 
| 7645 | #else | 7748 | #else | 
| 7646 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 7749 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } | 
| @@ -7656,14 +7759,16 @@ static void run_rebalance_domains(struct softirq_action *h) | |||
| 7656 | enum cpu_idle_type idle = this_rq->idle_balance ? | 7759 | enum cpu_idle_type idle = this_rq->idle_balance ? | 
| 7657 | CPU_IDLE : CPU_NOT_IDLE; | 7760 | CPU_IDLE : CPU_NOT_IDLE; | 
| 7658 | 7761 | ||
| 7659 | rebalance_domains(this_rq, idle); | ||
| 7660 | |||
| 7661 | /* | 7762 | /* | 
| 7662 | * If this cpu has a pending nohz_balance_kick, then do the | 7763 | * If this cpu has a pending nohz_balance_kick, then do the | 
| 7663 | * balancing on behalf of the other idle cpus whose ticks are | 7764 | * balancing on behalf of the other idle cpus whose ticks are | 
| 7664 | * stopped. | 7765 | * stopped. Do nohz_idle_balance *before* rebalance_domains to | 
| 7766 | * give the idle cpus a chance to load balance. Else we may | ||
| 7767 | * load balance only within the local sched_domain hierarchy | ||
| 7768 | * and abort nohz_idle_balance altogether if we pull some load. | ||
| 7665 | */ | 7769 | */ | 
| 7666 | nohz_idle_balance(this_rq, idle); | 7770 | nohz_idle_balance(this_rq, idle); | 
| 7771 | rebalance_domains(this_rq, idle); | ||
| 7667 | } | 7772 | } | 
| 7668 | 7773 | ||
| 7669 | /* | 7774 | /* | 
| diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) | |||
| 56 | */ | 56 | */ | 
| 57 | SCHED_FEAT(TTWU_QUEUE, true) | 57 | SCHED_FEAT(TTWU_QUEUE, true) | 
| 58 | 58 | ||
| 59 | #ifdef HAVE_RT_PUSH_IPI | ||
| 60 | /* | ||
| 61 | * When many CPUs lower their priorities at the same time while a | ||
| 62 | * single CPU has an RT task that can migrate and is waiting to run, | ||
| 63 | * all of those CPUs would try to take that CPU's rq lock and could | ||
| 64 | * create a lot of contention, in a thundering herd. In that case, | ||
| 65 | * sending an IPI to the CPU with the waiting RT task and letting | ||
| 66 | * that CPU push the task to where it should go may be the better | ||
| 67 | * approach. | ||
| 68 | */ | ||
| 69 | SCHED_FEAT(RT_PUSH_IPI, true) | ||
| 70 | #endif | ||
| 71 | |||
| 59 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 72 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 
| 60 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 73 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 
| 61 | SCHED_FEAT(LB_MIN, false) | 74 | SCHED_FEAT(LB_MIN, false) | 
| diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80014a178342..deef1caa94c6 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) | |||
| 158 | * is used from another cpu as a broadcast timer, this call may | 158 | * is used from another cpu as a broadcast timer, this call may | 
| 159 | * fail if it is not available | 159 | * fail if it is not available | 
| 160 | */ | 160 | */ | 
| 161 | if (broadcast && | 161 | if (broadcast && tick_broadcast_enter()) | 
| 162 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) | ||
| 163 | goto use_default; | 162 | goto use_default; | 
| 164 | 163 | ||
| 165 | /* Take note of the planned idle state. */ | 164 | /* Take note of the planned idle state. */ | 
| @@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) | |||
| 176 | idle_set_state(this_rq(), NULL); | 175 | idle_set_state(this_rq(), NULL); | 
| 177 | 176 | ||
| 178 | if (broadcast) | 177 | if (broadcast) | 
| 179 | clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); | 178 | tick_broadcast_exit(); | 
| 180 | 179 | ||
| 181 | /* | 180 | /* | 
| 182 | * Give the governor an opportunity to reflect on the outcome | 181 | * Give the governor an opportunity to reflect on the outcome | 
| @@ -210,6 +209,8 @@ use_default: | |||
| 210 | goto exit_idle; | 209 | goto exit_idle; | 
| 211 | } | 210 | } | 
| 212 | 211 | ||
| 212 | DEFINE_PER_CPU(bool, cpu_dead_idle); | ||
| 213 | |||
| 213 | /* | 214 | /* | 
| 214 | * Generic idle loop implementation | 215 | * Generic idle loop implementation | 
| 215 | * | 216 | * | 
| @@ -234,8 +235,13 @@ static void cpu_idle_loop(void) | |||
| 234 | check_pgt_cache(); | 235 | check_pgt_cache(); | 
| 235 | rmb(); | 236 | rmb(); | 
| 236 | 237 | ||
| 237 | if (cpu_is_offline(smp_processor_id())) | 238 | if (cpu_is_offline(smp_processor_id())) { | 
| 239 | rcu_cpu_notify(NULL, CPU_DYING_IDLE, | ||
| 240 | (void *)(long)smp_processor_id()); | ||
| 241 | smp_mb(); /* all activity before dead. */ | ||
| 242 | this_cpu_write(cpu_dead_idle, true); | ||
| 238 | arch_cpu_idle_dead(); | 243 | arch_cpu_idle_dead(); | 
| 244 | } | ||
| 239 | 245 | ||
| 240 | local_irq_disable(); | 246 | local_irq_disable(); | 
| 241 | arch_cpu_idle_enter(); | 247 | arch_cpu_idle_enter(); | 
| diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include "sched.h" | 6 | #include "sched.h" | 
| 7 | 7 | ||
| 8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> | 
| 9 | #include <linux/irq_work.h> | ||
| 9 | 10 | ||
| 10 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; | 
| 11 | 12 | ||
| @@ -59,7 +60,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 59 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 60 | raw_spin_unlock(&rt_b->rt_runtime_lock); | 
| 60 | } | 61 | } | 
| 61 | 62 | ||
| 62 | void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 63 | #ifdef CONFIG_SMP | 
| 64 | static void push_irq_work_func(struct irq_work *work); | ||
| 65 | #endif | ||
| 66 | |||
| 67 | void init_rt_rq(struct rt_rq *rt_rq) | ||
| 63 | { | 68 | { | 
| 64 | struct rt_prio_array *array; | 69 | struct rt_prio_array *array; | 
| 65 | int i; | 70 | int i; | 
| @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 78 | rt_rq->rt_nr_migratory = 0; | 83 | rt_rq->rt_nr_migratory = 0; | 
| 79 | rt_rq->overloaded = 0; | 84 | rt_rq->overloaded = 0; | 
| 80 | plist_head_init(&rt_rq->pushable_tasks); | 85 | plist_head_init(&rt_rq->pushable_tasks); | 
| 86 | |||
| 87 | #ifdef HAVE_RT_PUSH_IPI | ||
| 88 | rt_rq->push_flags = 0; | ||
| 89 | rt_rq->push_cpu = nr_cpu_ids; | ||
| 90 | raw_spin_lock_init(&rt_rq->push_lock); | ||
| 91 | init_irq_work(&rt_rq->push_work, push_irq_work_func); | ||
| 81 | #endif | 92 | #endif | 
| 93 | #endif /* CONFIG_SMP */ | ||
| 82 | /* We start is dequeued state, because no RT tasks are queued */ | 94 | /* We start is dequeued state, because no RT tasks are queued */ | 
| 83 | rt_rq->rt_queued = 0; | 95 | rt_rq->rt_queued = 0; | 
| 84 | 96 | ||
| @@ -193,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 193 | if (!rt_se) | 205 | if (!rt_se) | 
| 194 | goto err_free_rq; | 206 | goto err_free_rq; | 
| 195 | 207 | ||
| 196 | init_rt_rq(rt_rq, cpu_rq(i)); | 208 | init_rt_rq(rt_rq); | 
| 197 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 209 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 
| 198 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 210 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 
| 199 | } | 211 | } | 
| @@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) | |||
| 1778 | ; | 1790 | ; | 
| 1779 | } | 1791 | } | 
| 1780 | 1792 | ||
| 1793 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1794 | /* | ||
| 1795 | * The search for the next cpu always starts at rq->cpu and ends | ||
| 1796 | * when we reach rq->cpu again. It will never return rq->cpu. | ||
| 1797 | * This returns the next cpu to check, or nr_cpu_ids if the loop | ||
| 1798 | * is complete. | ||
| 1799 | * | ||
| 1800 | * rq->rt.push_cpu holds the last cpu returned by this function, | ||
| 1801 | * or if this is the first instance, it must hold rq->cpu. | ||
| 1802 | */ | ||
| 1803 | static int rto_next_cpu(struct rq *rq) | ||
| 1804 | { | ||
| 1805 | int prev_cpu = rq->rt.push_cpu; | ||
| 1806 | int cpu; | ||
| 1807 | |||
| 1808 | cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * If the previous cpu is less than the rq's CPU, then it already | ||
| 1812 | * passed the end of the mask, and has started from the beginning. | ||
| 1813 | * We end if the next CPU is greater or equal to rq's CPU. | ||
| 1814 | */ | ||
| 1815 | if (prev_cpu < rq->cpu) { | ||
| 1816 | if (cpu >= rq->cpu) | ||
| 1817 | return nr_cpu_ids; | ||
| 1818 | |||
| 1819 | } else if (cpu >= nr_cpu_ids) { | ||
| 1820 | /* | ||
| 1821 | * We passed the end of the mask, start at the beginning. | ||
| 1822 | * If the result is greater or equal to the rq's CPU, then | ||
| 1823 | * the loop is finished. | ||
| 1824 | */ | ||
| 1825 | cpu = cpumask_first(rq->rd->rto_mask); | ||
| 1826 | if (cpu >= rq->cpu) | ||
| 1827 | return nr_cpu_ids; | ||
| 1828 | } | ||
| 1829 | rq->rt.push_cpu = cpu; | ||
| 1830 | |||
| 1831 | /* Return cpu to let the caller know if the loop is finished or not */ | ||
| 1832 | return cpu; | ||
| 1833 | } | ||
| 1834 | |||
| 1835 | static int find_next_push_cpu(struct rq *rq) | ||
| 1836 | { | ||
| 1837 | struct rq *next_rq; | ||
| 1838 | int cpu; | ||
| 1839 | |||
| 1840 | while (1) { | ||
| 1841 | cpu = rto_next_cpu(rq); | ||
| 1842 | if (cpu >= nr_cpu_ids) | ||
| 1843 | break; | ||
| 1844 | next_rq = cpu_rq(cpu); | ||
| 1845 | |||
| 1846 | /* Make sure the next rq can push to this rq */ | ||
| 1847 | if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) | ||
| 1848 | break; | ||
| 1849 | } | ||
| 1850 | |||
| 1851 | return cpu; | ||
| 1852 | } | ||
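rto_next_cpu()/find_next_push_cpu() walk the root domain's rto_mask circularly, starting after the requesting CPU and stopping before coming back around to it. A toy user-space version of that circular walk; the bitmap handling is simplified to an int array and the priority check is dropped:

#include <stdio.h>

#define NR_CPUS 8

/* return the next set CPU after 'prev', wrapping around, or -1 once the
 * walk would reach 'self' again (self itself is never returned) */
static int next_rto_cpu(const int mask[NR_CPUS], int self, int prev)
{
    int cpu;

    for (cpu = prev + 1; cpu != prev + 1 + NR_CPUS; cpu++) {
        int c = cpu % NR_CPUS;

        if (c == self)
            return -1;              /* full loop done */
        if (mask[c])
            return c;
    }
    return -1;
}

int main(void)
{
    int rto_mask[NR_CPUS] = { 0, 0, 1, 0, 1, 0, 0, 1 };
    int self = 4, cpu = self;

    while ((cpu = next_rto_cpu(rto_mask, self, cpu)) >= 0)
        printf("push attempt on cpu %d\n", cpu);   /* visits 7, then 2 */
    return 0;
}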
| 1853 | |||
| 1854 | #define RT_PUSH_IPI_EXECUTING 1 | ||
| 1855 | #define RT_PUSH_IPI_RESTART 2 | ||
| 1856 | |||
| 1857 | static void tell_cpu_to_push(struct rq *rq) | ||
| 1858 | { | ||
| 1859 | int cpu; | ||
| 1860 | |||
| 1861 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1862 | raw_spin_lock(&rq->rt.push_lock); | ||
| 1863 | /* Make sure it's still executing */ | ||
| 1864 | if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { | ||
| 1865 | /* | ||
| 1866 | * Tell the IPI to restart the loop as things have | ||
| 1867 | * changed since it started. | ||
| 1868 | */ | ||
| 1869 | rq->rt.push_flags |= RT_PUSH_IPI_RESTART; | ||
| 1870 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1871 | return; | ||
| 1872 | } | ||
| 1873 | raw_spin_unlock(&rq->rt.push_lock); | ||
| 1874 | } | ||
| 1875 | |||
| 1876 | /* When here, there's no IPI going around */ | ||
| 1877 | |||
| 1878 | rq->rt.push_cpu = rq->cpu; | ||
| 1879 | cpu = find_next_push_cpu(rq); | ||
| 1880 | if (cpu >= nr_cpu_ids) | ||
| 1881 | return; | ||
| 1882 | |||
| 1883 | rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; | ||
| 1884 | |||
| 1885 | irq_work_queue_on(&rq->rt.push_work, cpu); | ||
| 1886 | } | ||
| 1887 | |||
| 1888 | /* Called from hardirq context */ | ||
| 1889 | static void try_to_push_tasks(void *arg) | ||
| 1890 | { | ||
| 1891 | struct rt_rq *rt_rq = arg; | ||
| 1892 | struct rq *rq, *src_rq; | ||
| 1893 | int this_cpu; | ||
| 1894 | int cpu; | ||
| 1895 | |||
| 1896 | this_cpu = rt_rq->push_cpu; | ||
| 1897 | |||
| 1898 | /* Paranoid check */ | ||
| 1899 | BUG_ON(this_cpu != smp_processor_id()); | ||
| 1900 | |||
| 1901 | rq = cpu_rq(this_cpu); | ||
| 1902 | src_rq = rq_of_rt_rq(rt_rq); | ||
| 1903 | |||
| 1904 | again: | ||
| 1905 | if (has_pushable_tasks(rq)) { | ||
| 1906 | raw_spin_lock(&rq->lock); | ||
| 1907 | push_rt_task(rq); | ||
| 1908 | raw_spin_unlock(&rq->lock); | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | /* Pass the IPI to the next rt overloaded queue */ | ||
| 1912 | raw_spin_lock(&rt_rq->push_lock); | ||
| 1913 | /* | ||
| 1914 | * If the source queue changed since the IPI went out, | ||
| 1915 | * we need to restart the search from that CPU again. | ||
| 1916 | */ | ||
| 1917 | if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { | ||
| 1918 | rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; | ||
| 1919 | rt_rq->push_cpu = src_rq->cpu; | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | cpu = find_next_push_cpu(src_rq); | ||
| 1923 | |||
| 1924 | if (cpu >= nr_cpu_ids) | ||
| 1925 | rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; | ||
| 1926 | raw_spin_unlock(&rt_rq->push_lock); | ||
| 1927 | |||
| 1928 | if (cpu >= nr_cpu_ids) | ||
| 1929 | return; | ||
| 1930 | |||
| 1931 | /* | ||
| 1932 | * It is possible that a restart caused this CPU to be | ||
| 1933 | * chosen again. Don't bother with an IPI, just see if we | ||
| 1934 | * have more to push. | ||
| 1935 | */ | ||
| 1936 | if (unlikely(cpu == rq->cpu)) | ||
| 1937 | goto again; | ||
| 1938 | |||
| 1939 | /* Try the next RT overloaded CPU */ | ||
| 1940 | irq_work_queue_on(&rt_rq->push_work, cpu); | ||
| 1941 | } | ||
| 1942 | |||
| 1943 | static void push_irq_work_func(struct irq_work *work) | ||
| 1944 | { | ||
| 1945 | struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); | ||
| 1946 | |||
| 1947 | try_to_push_tasks(rt_rq); | ||
| 1948 | } | ||
| 1949 | #endif /* HAVE_RT_PUSH_IPI */ | ||
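The push_flags handshake lets tell_cpu_to_push() either start an IPI chain or, if one is already running, merely flag it to restart. The single-threaded sketch below models that handshake; the real code serializes these transitions with rt_rq->push_lock and irq_work, both omitted here:

#include <stdio.h>

#define PUSH_EXECUTING 1
#define PUSH_RESTART   2

static int push_flags;   /* stands in for rt_rq->push_flags */

static void tell_cpu_to_push(void)
{
    if (push_flags & PUSH_EXECUTING) {
        /* an IPI chain is already running: just ask it to restart */
        push_flags |= PUSH_RESTART;
        return;
    }
    push_flags = PUSH_EXECUTING;
    printf("IPI sent to first candidate CPU\n");
}

static void ipi_handler(int more_candidates)
{
    if (push_flags & PUSH_RESTART) {
        push_flags &= ~PUSH_RESTART;
        printf("source changed, restarting the scan\n");
        return;
    }
    if (!more_candidates) {
        push_flags &= ~PUSH_EXECUTING;
        printf("chain finished\n");
        return;
    }
    printf("passing the IPI to the next overloaded CPU\n");
}

int main(void)
{
    tell_cpu_to_push();   /* starts a chain */
    tell_cpu_to_push();   /* a second request only sets RESTART */
    ipi_handler(1);       /* sees RESTART, rescans from the source */
    ipi_handler(0);       /* nothing left, clears EXECUTING */
    return 0;
}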
| 1950 | |||
| 1781 | static int pull_rt_task(struct rq *this_rq) | 1951 | static int pull_rt_task(struct rq *this_rq) | 
| 1782 | { | 1952 | { | 
| 1783 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 1953 | int this_cpu = this_rq->cpu, ret = 0, cpu; | 
| @@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1793 | */ | 1963 | */ | 
| 1794 | smp_rmb(); | 1964 | smp_rmb(); | 
| 1795 | 1965 | ||
| 1966 | #ifdef HAVE_RT_PUSH_IPI | ||
| 1967 | if (sched_feat(RT_PUSH_IPI)) { | ||
| 1968 | tell_cpu_to_push(this_rq); | ||
| 1969 | return 0; | ||
| 1970 | } | ||
| 1971 | #endif | ||
| 1972 | |||
| 1796 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 1973 | for_each_cpu(cpu, this_rq->rd->rto_mask) { | 
| 1797 | if (this_cpu == cpu) | 1974 | if (this_cpu == cpu) | 
| 1798 | continue; | 1975 | continue; | 
| diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> | 
| 7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> | 
| 8 | #include <linux/stop_machine.h> | 8 | #include <linux/stop_machine.h> | 
| 9 | #include <linux/irq_work.h> | ||
| 9 | #include <linux/tick.h> | 10 | #include <linux/tick.h> | 
| 10 | #include <linux/slab.h> | 11 | #include <linux/slab.h> | 
| 11 | 12 | ||
| @@ -362,8 +363,14 @@ struct cfs_rq { | |||
| 362 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 363 | * Under CFS, load is tracked on a per-entity basis and aggregated up. | 
| 363 | * This allows for the description of both thread and group usage (in | 364 | * This allows for the description of both thread and group usage (in | 
| 364 | * the FAIR_GROUP_SCHED case). | 365 | * the FAIR_GROUP_SCHED case). | 
| 366 | * runnable_load_avg is the sum of the load_avg_contrib of the | ||
| 367 | * sched_entities on the rq. | ||
| 368 | * blocked_load_avg is similar to runnable_load_avg except that its | ||
| 369 | * the blocked sched_entities on the rq. | ||
| 370 | * utilization_load_avg is the sum of the average running time of the | ||
| 371 | * sched_entities on the rq. | ||
| 365 | */ | 372 | */ | 
| 366 | unsigned long runnable_load_avg, blocked_load_avg; | 373 | unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; | 
| 367 | atomic64_t decay_counter; | 374 | atomic64_t decay_counter; | 
| 368 | u64 last_decay; | 375 | u64 last_decay; | 
| 369 | atomic_long_t removed_load; | 376 | atomic_long_t removed_load; | 
| @@ -418,6 +425,11 @@ static inline int rt_bandwidth_enabled(void) | |||
| 418 | return sysctl_sched_rt_runtime >= 0; | 425 | return sysctl_sched_rt_runtime >= 0; | 
| 419 | } | 426 | } | 
| 420 | 427 | ||
| 428 | /* RT IPI pull logic requires IRQ_WORK */ | ||
| 429 | #ifdef CONFIG_IRQ_WORK | ||
| 430 | # define HAVE_RT_PUSH_IPI | ||
| 431 | #endif | ||
| 432 | |||
| 421 | /* Real-Time classes' related field in a runqueue: */ | 433 | /* Real-Time classes' related field in a runqueue: */ | 
| 422 | struct rt_rq { | 434 | struct rt_rq { | 
| 423 | struct rt_prio_array active; | 435 | struct rt_prio_array active; | 
| @@ -435,7 +447,13 @@ struct rt_rq { | |||
| 435 | unsigned long rt_nr_total; | 447 | unsigned long rt_nr_total; | 
| 436 | int overloaded; | 448 | int overloaded; | 
| 437 | struct plist_head pushable_tasks; | 449 | struct plist_head pushable_tasks; | 
| 450 | #ifdef HAVE_RT_PUSH_IPI | ||
| 451 | int push_flags; | ||
| 452 | int push_cpu; | ||
| 453 | struct irq_work push_work; | ||
| 454 | raw_spinlock_t push_lock; | ||
| 438 | #endif | 455 | #endif | 
| 456 | #endif /* CONFIG_SMP */ | ||
| 439 | int rt_queued; | 457 | int rt_queued; | 
| 440 | 458 | ||
| 441 | int rt_throttled; | 459 | int rt_throttled; | 
| @@ -597,6 +615,7 @@ struct rq { | |||
| 597 | struct sched_domain *sd; | 615 | struct sched_domain *sd; | 
| 598 | 616 | ||
| 599 | unsigned long cpu_capacity; | 617 | unsigned long cpu_capacity; | 
| 618 | unsigned long cpu_capacity_orig; | ||
| 600 | 619 | ||
| 601 | unsigned char idle_balance; | 620 | unsigned char idle_balance; | 
| 602 | /* For active balancing */ | 621 | /* For active balancing */ | 
| @@ -807,7 +826,7 @@ struct sched_group_capacity { | |||
| 807 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 826 | * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity | 
| 808 | * for a single CPU. | 827 | * for a single CPU. | 
| 809 | */ | 828 | */ | 
| 810 | unsigned int capacity, capacity_orig; | 829 | unsigned int capacity; | 
| 811 | unsigned long next_update; | 830 | unsigned long next_update; | 
| 812 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 831 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 
| 813 | /* | 832 | /* | 
| @@ -1368,9 +1387,18 @@ static inline int hrtick_enabled(struct rq *rq) | |||
| 1368 | 1387 | ||
| 1369 | #ifdef CONFIG_SMP | 1388 | #ifdef CONFIG_SMP | 
| 1370 | extern void sched_avg_update(struct rq *rq); | 1389 | extern void sched_avg_update(struct rq *rq); | 
| 1390 | |||
| 1391 | #ifndef arch_scale_freq_capacity | ||
| 1392 | static __always_inline | ||
| 1393 | unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) | ||
| 1394 | { | ||
| 1395 | return SCHED_CAPACITY_SCALE; | ||
| 1396 | } | ||
| 1397 | #endif | ||
| 1398 | |||
| 1371 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 1399 | static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | 
| 1372 | { | 1400 | { | 
| 1373 | rq->rt_avg += rt_delta; | 1401 | rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); | 
| 1374 | sched_avg_update(rq); | 1402 | sched_avg_update(rq); | 
| 1375 | } | 1403 | } | 
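The #ifndef block above is the pattern that lets an architecture override arch_scale_freq_capacity() while everyone else gets the full-capacity default, and sched_rt_avg_update() now accumulates rt time in frequency-scaled units. A user-space illustration of the same override pattern; where the normalization happens is simplified relative to the kernel:

#include <stdio.h>

#define CAPACITY_SCALE 1024UL

/* an architecture can #define arch_scale_freq_capacity to its own
 * implementation before this point; otherwise the generic default applies */
#ifndef arch_scale_freq_capacity
static inline unsigned long arch_scale_freq_capacity(void *sd, int cpu)
{
    (void)sd; (void)cpu;
    return CAPACITY_SCALE;          /* generic: assume full frequency */
}
#endif

int main(void)
{
    unsigned long rt_avg = 0, rt_delta = 100;

    /* rt time is accumulated in frequency-scaled units and normalized
     * later, when the remaining CFS capacity is computed */
    rt_avg += rt_delta * arch_scale_freq_capacity(NULL, 0);
    printf("rt_avg = %lu\n", rt_avg);    /* 100 * 1024 at full speed */
    return 0;
}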
| 1376 | #else | 1404 | #else | 
| @@ -1643,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); | |||
| 1643 | extern void print_dl_stats(struct seq_file *m, int cpu); | 1671 | extern void print_dl_stats(struct seq_file *m, int cpu); | 
| 1644 | 1672 | ||
| 1645 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 1673 | extern void init_cfs_rq(struct cfs_rq *cfs_rq); | 
| 1646 | extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | 1674 | extern void init_rt_rq(struct rt_rq *rt_rq); | 
| 1647 | extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); | 1675 | extern void init_dl_rq(struct dl_rq *dl_rq); | 
| 1648 | 1676 | ||
| 1649 | extern void cfs_bandwidth_usage_inc(void); | 1677 | extern void cfs_bandwidth_usage_inc(void); | 
| 1650 | extern void cfs_bandwidth_usage_dec(void); | 1678 | extern void cfs_bandwidth_usage_dec(void); | 
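
The sched.h hunks above give arch_scale_freq_capacity() a default that returns SCHED_CAPACITY_SCALE and fold it into sched_rt_avg_update(), so RT runtime is accumulated in frequency-scaled units. A rough standalone sketch of that weighting; SCALE, the delta and the capacity values below are made-up demo numbers, not kernel data:

#include <stdio.h>

#define SCALE 1024	/* stands in for SCHED_CAPACITY_SCALE */

/* Default hook: report full capacity, as the fallback above does. */
static unsigned long scale_freq_capacity(int cpu)
{
	(void)cpu;
	return SCALE;
}

int main(void)
{
	unsigned long long rt_avg = 0;
	unsigned long long delta = 1000000;	/* 1 ms of RT execution per sample */
	unsigned long caps[] = { scale_freq_capacity(0), SCALE / 2, SCALE / 2 };

	for (int i = 0; i < 3; i++)
		rt_avg += delta * caps[i];	/* mirrors rt_delta * capacity */

	/* Samples taken at half speed contribute half as much. */
	printf("scaled rt_avg: %llu, flat sum: %llu\n", rt_avg / SCALE, 3 * delta);
	return 0;
}
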
| diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 40190f28db35..c697f73d82d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | #include <linux/cpu.h> | 4 | #include <linux/cpu.h> | 
| 5 | #include <linux/err.h> | 5 | #include <linux/err.h> | 
| 6 | #include <linux/smp.h> | 6 | #include <linux/smp.h> | 
| 7 | #include <linux/delay.h> | ||
| 7 | #include <linux/init.h> | 8 | #include <linux/init.h> | 
| 8 | #include <linux/list.h> | 9 | #include <linux/list.h> | 
| 9 | #include <linux/slab.h> | 10 | #include <linux/slab.h> | 
| @@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | |||
| 314 | put_online_cpus(); | 315 | put_online_cpus(); | 
| 315 | } | 316 | } | 
| 316 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 317 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | 
| 318 | |||
| 319 | static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); | ||
| 320 | |||
| 321 | /* | ||
| 322 | * Called to poll specified CPU's state, for example, when waiting for | ||
| 323 | * a CPU to come online. | ||
| 324 | */ | ||
| 325 | int cpu_report_state(int cpu) | ||
| 326 | { | ||
| 327 | return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
| 328 | } | ||
| 329 | |||
| 330 | /* | ||
| 331 | * If the CPU has died properly, set its state to CPU_UP_PREPARE and | ||
| 332 | * return success. Otherwise, return -EBUSY if the CPU died after | ||
| 333 | * cpu_wait_death() timed out. Return -EAGAIN instead | ||
| 334 | * if cpu_wait_death() timed out and the CPU still hasn't gotten around | ||
| 335 | * to dying. In the latter two cases, the CPU might not be set up | ||
| 336 | * properly, but it is up to the arch-specific code to decide. | ||
| 337 | * Finally, -EIO indicates an unanticipated problem. | ||
| 338 | * | ||
| 339 | * Note that it is permissible to omit this call entirely, as is | ||
| 340 | * done in architectures that do no CPU-hotplug error checking. | ||
| 341 | */ | ||
| 342 | int cpu_check_up_prepare(int cpu) | ||
| 343 | { | ||
| 344 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { | ||
| 345 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); | ||
| 346 | return 0; | ||
| 347 | } | ||
| 348 | |||
| 349 | switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { | ||
| 350 | |||
| 351 | case CPU_POST_DEAD: | ||
| 352 | |||
| 353 | /* The CPU died properly, so just start it up again. */ | ||
| 354 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); | ||
| 355 | return 0; | ||
| 356 | |||
| 357 | case CPU_DEAD_FROZEN: | ||
| 358 | |||
| 359 | /* | ||
| 360 | * Timeout during CPU death, so let caller know. | ||
| 361 | * The outgoing CPU completed its processing, but after | ||
| 362 | * cpu_wait_death() timed out and reported the error. The | ||
| 363 | * caller is free to proceed, in which case the state | ||
| 364 | * will be reset properly by cpu_set_state_online(). | ||
| 365 | * Proceeding despite this -EBUSY return makes sense | ||
| 366 | * for systems where the outgoing CPUs take themselves | ||
| 367 | * offline, with no post-death manipulation required from | ||
| 368 | * a surviving CPU. | ||
| 369 | */ | ||
| 370 | return -EBUSY; | ||
| 371 | |||
| 372 | case CPU_BROKEN: | ||
| 373 | |||
| 374 | /* | ||
| 375 | * The most likely reason we got here is that there was | ||
| 376 | * a timeout during CPU death, and the outgoing CPU never | ||
| 377 | * did complete its processing. This could happen on | ||
| 378 | * a virtualized system if the outgoing VCPU gets preempted | ||
| 379 | * for more than five seconds, and the user attempts to | ||
| 380 | * immediately online that same CPU. Trying again later | ||
| 381 | * might return -EBUSY above, hence -EAGAIN. | ||
| 382 | */ | ||
| 383 | return -EAGAIN; | ||
| 384 | |||
| 385 | default: | ||
| 386 | |||
| 387 | /* Should not happen. Famous last words. */ | ||
| 388 | return -EIO; | ||
| 389 | } | ||
| 390 | } | ||
| 391 | |||
| 392 | /* | ||
| 393 | * Mark the specified CPU online. | ||
| 394 | * | ||
| 395 | * Note that it is permissible to omit this call entirely, as is | ||
| 396 | * done in architectures that do no CPU-hotplug error checking. | ||
| 397 | */ | ||
| 398 | void cpu_set_state_online(int cpu) | ||
| 399 | { | ||
| 400 | (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); | ||
| 401 | } | ||
| 402 | |||
| 403 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 404 | |||
| 405 | /* | ||
| 406 | * Wait for the specified CPU to exit the idle loop and die. | ||
| 407 | */ | ||
| 408 | bool cpu_wait_death(unsigned int cpu, int seconds) | ||
| 409 | { | ||
| 410 | int jf_left = seconds * HZ; | ||
| 411 | int oldstate; | ||
| 412 | bool ret = true; | ||
| 413 | int sleep_jf = 1; | ||
| 414 | |||
| 415 | might_sleep(); | ||
| 416 | |||
| 417 | /* The outgoing CPU will normally get done quite quickly. */ | ||
| 418 | if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) | ||
| 419 | goto update_state; | ||
| 420 | udelay(5); | ||
| 421 | |||
| 422 | /* But if the outgoing CPU dawdles, wait increasingly long times. */ | ||
| 423 | while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { | ||
| 424 | schedule_timeout_uninterruptible(sleep_jf); | ||
| 425 | jf_left -= sleep_jf; | ||
| 426 | if (jf_left <= 0) | ||
| 427 | break; | ||
| 428 | sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); | ||
| 429 | } | ||
| 430 | update_state: | ||
| 431 | oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
| 432 | if (oldstate == CPU_DEAD) { | ||
| 433 | /* Outgoing CPU died normally, update state. */ | ||
| 434 | smp_mb(); /* atomic_read() before update. */ | ||
| 435 | atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); | ||
| 436 | } else { | ||
| 437 | /* Outgoing CPU still hasn't died, set state accordingly. */ | ||
| 438 | if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), | ||
| 439 | oldstate, CPU_BROKEN) != oldstate) | ||
| 440 | goto update_state; | ||
| 441 | ret = false; | ||
| 442 | } | ||
| 443 | return ret; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * Called by the outgoing CPU to report its successful death. Return | ||
| 448 | * false if this report follows the surviving CPU's timing out. | ||
| 449 | * | ||
| 450 | * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU | ||
| 451 | * timed out. This approach allows architectures to omit calls to | ||
| 452 | * cpu_check_up_prepare() and cpu_set_state_online() without defeating | ||
| 453 | * the next cpu_wait_death()'s polling loop. | ||
| 454 | */ | ||
| 455 | bool cpu_report_death(void) | ||
| 456 | { | ||
| 457 | int oldstate; | ||
| 458 | int newstate; | ||
| 459 | int cpu = smp_processor_id(); | ||
| 460 | |||
| 461 | do { | ||
| 462 | oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); | ||
| 463 | if (oldstate != CPU_BROKEN) | ||
| 464 | newstate = CPU_DEAD; | ||
| 465 | else | ||
| 466 | newstate = CPU_DEAD_FROZEN; | ||
| 467 | } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), | ||
| 468 | oldstate, newstate) != oldstate); | ||
| 469 | return newstate == CPU_DEAD; | ||
| 470 | } | ||
| 471 | |||
| 472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
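
cpu_wait_death() above polls the dying CPU with an interval that grows by roughly 10% per round until its jiffies budget is spent. A standalone sketch of just that backoff arithmetic; HZ and the 5-second budget are assumed values for the printout, not taken from the patch:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int hz = 1000;			/* assumed HZ */
	int jf_left = 5 * hz;		/* assumed 5-second budget */
	int sleep_jf = 1, polls = 0;

	while (jf_left > 0) {
		jf_left -= sleep_jf;
		polls++;
		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);	/* grow ~10% per round */
	}
	printf("budget exhausted after %d polls\n", polls);
	return 0;
}
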
| diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..3be344902316 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -325,6 +325,7 @@ out_unlock: | |||
| 325 | * SMP: There are not races, the GIDs are checked only by filesystem | 325 | * SMP: There are not races, the GIDs are checked only by filesystem | 
| 326 | * operations (as far as semantic preservation is concerned). | 326 | * operations (as far as semantic preservation is concerned). | 
| 327 | */ | 327 | */ | 
| 328 | #ifdef CONFIG_MULTIUSER | ||
| 328 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 329 | SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) | 
| 329 | { | 330 | { | 
| 330 | struct user_namespace *ns = current_user_ns(); | 331 | struct user_namespace *ns = current_user_ns(); | 
| @@ -815,6 +816,7 @@ change_okay: | |||
| 815 | commit_creds(new); | 816 | commit_creds(new); | 
| 816 | return old_fsgid; | 817 | return old_fsgid; | 
| 817 | } | 818 | } | 
| 819 | #endif /* CONFIG_MULTIUSER */ | ||
| 818 | 820 | ||
| 819 | /** | 821 | /** | 
| 820 | * sys_getpid - return the thread group id of the current process | 822 | * sys_getpid - return the thread group id of the current process | 
| diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -159,6 +159,20 @@ cond_syscall(sys_uselib); | |||
| 159 | cond_syscall(sys_fadvise64); | 159 | cond_syscall(sys_fadvise64); | 
| 160 | cond_syscall(sys_fadvise64_64); | 160 | cond_syscall(sys_fadvise64_64); | 
| 161 | cond_syscall(sys_madvise); | 161 | cond_syscall(sys_madvise); | 
| 162 | cond_syscall(sys_setuid); | ||
| 163 | cond_syscall(sys_setregid); | ||
| 164 | cond_syscall(sys_setgid); | ||
| 165 | cond_syscall(sys_setreuid); | ||
| 166 | cond_syscall(sys_setresuid); | ||
| 167 | cond_syscall(sys_getresuid); | ||
| 168 | cond_syscall(sys_setresgid); | ||
| 169 | cond_syscall(sys_getresgid); | ||
| 170 | cond_syscall(sys_setgroups); | ||
| 171 | cond_syscall(sys_getgroups); | ||
| 172 | cond_syscall(sys_setfsuid); | ||
| 173 | cond_syscall(sys_setfsgid); | ||
| 174 | cond_syscall(sys_capget); | ||
| 175 | cond_syscall(sys_capset); | ||
| 162 | 176 | ||
| 163 | /* arch-specific weak syscall entries */ | 177 | /* arch-specific weak syscall entries */ | 
| 164 | cond_syscall(sys_pciconfig_read); | 178 | cond_syscall(sys_pciconfig_read); | 
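
The cond_syscall() entries added above give a CONFIG_MULTIUSER=n build ENOSYS stubs for the uid/gid syscalls it no longer compiles. The mechanism is, roughly, a weak alias to the not-implemented stub; a userspace-flavoured sketch of that linker trick, with illustrative names rather than the kernel's macro:

#include <stdio.h>
#include <errno.h>

long ni_syscall(void)
{
	return -ENOSYS;		/* the "not implemented" stub */
}

/* Weak alias: a strong sys_setgroups_demo() elsewhere would override it. */
long sys_setgroups_demo(void) __attribute__((weak, alias("ni_syscall")));

int main(void)
{
	printf("sys_setgroups_demo() -> %ld\n", sys_setgroups_demo());
	return 0;
}
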
| diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..42b7fc2860c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | */ | 19 | */ | 
| 20 | 20 | ||
| 21 | #include <linux/module.h> | 21 | #include <linux/module.h> | 
| 22 | #include <linux/aio.h> | ||
| 22 | #include <linux/mm.h> | 23 | #include <linux/mm.h> | 
| 23 | #include <linux/swap.h> | 24 | #include <linux/swap.h> | 
| 24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> | 
| @@ -846,7 +847,7 @@ static struct ctl_table kern_table[] = { | |||
| 846 | .data = &watchdog_user_enabled, | 847 | .data = &watchdog_user_enabled, | 
| 847 | .maxlen = sizeof (int), | 848 | .maxlen = sizeof (int), | 
| 848 | .mode = 0644, | 849 | .mode = 0644, | 
| 849 | .proc_handler = proc_dowatchdog, | 850 | .proc_handler = proc_watchdog, | 
| 850 | .extra1 = &zero, | 851 | .extra1 = &zero, | 
| 851 | .extra2 = &one, | 852 | .extra2 = &one, | 
| 852 | }, | 853 | }, | 
| @@ -855,11 +856,33 @@ static struct ctl_table kern_table[] = { | |||
| 855 | .data = &watchdog_thresh, | 856 | .data = &watchdog_thresh, | 
| 856 | .maxlen = sizeof(int), | 857 | .maxlen = sizeof(int), | 
| 857 | .mode = 0644, | 858 | .mode = 0644, | 
| 858 | .proc_handler = proc_dowatchdog, | 859 | .proc_handler = proc_watchdog_thresh, | 
| 859 | .extra1 = &zero, | 860 | .extra1 = &zero, | 
| 860 | .extra2 = &sixty, | 861 | .extra2 = &sixty, | 
| 861 | }, | 862 | }, | 
| 862 | { | 863 | { | 
| 864 | .procname = "nmi_watchdog", | ||
| 865 | .data = &nmi_watchdog_enabled, | ||
| 866 | .maxlen = sizeof (int), | ||
| 867 | .mode = 0644, | ||
| 868 | .proc_handler = proc_nmi_watchdog, | ||
| 869 | .extra1 = &zero, | ||
| 870 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) | ||
| 871 | .extra2 = &one, | ||
| 872 | #else | ||
| 873 | .extra2 = &zero, | ||
| 874 | #endif | ||
| 875 | }, | ||
| 876 | { | ||
| 877 | .procname = "soft_watchdog", | ||
| 878 | .data = &soft_watchdog_enabled, | ||
| 879 | .maxlen = sizeof (int), | ||
| 880 | .mode = 0644, | ||
| 881 | .proc_handler = proc_soft_watchdog, | ||
| 882 | .extra1 = &zero, | ||
| 883 | .extra2 = &one, | ||
| 884 | }, | ||
| 885 | { | ||
| 863 | .procname = "softlockup_panic", | 886 | .procname = "softlockup_panic", | 
| 864 | .data = &softlockup_panic, | 887 | .data = &softlockup_panic, | 
| 865 | .maxlen = sizeof(int), | 888 | .maxlen = sizeof(int), | 
| @@ -879,15 +902,6 @@ static struct ctl_table kern_table[] = { | |||
| 879 | .extra2 = &one, | 902 | .extra2 = &one, | 
| 880 | }, | 903 | }, | 
| 881 | #endif /* CONFIG_SMP */ | 904 | #endif /* CONFIG_SMP */ | 
| 882 | { | ||
| 883 | .procname = "nmi_watchdog", | ||
| 884 | .data = &watchdog_user_enabled, | ||
| 885 | .maxlen = sizeof (int), | ||
| 886 | .mode = 0644, | ||
| 887 | .proc_handler = proc_dowatchdog, | ||
| 888 | .extra1 = &zero, | ||
| 889 | .extra2 = &one, | ||
| 890 | }, | ||
| 891 | #endif | 905 | #endif | 
| 892 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 906 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 
| 893 | { | 907 | { | 
| @@ -1228,6 +1242,14 @@ static struct ctl_table vm_table[] = { | |||
| 1228 | .extra1 = &zero, | 1242 | .extra1 = &zero, | 
| 1229 | }, | 1243 | }, | 
| 1230 | { | 1244 | { | 
| 1245 | .procname = "dirtytime_expire_seconds", | ||
| 1246 | .data = &dirtytime_expire_interval, | ||
| 1247 | .maxlen = sizeof(dirty_expire_interval), | ||
| 1248 | .mode = 0644, | ||
| 1249 | .proc_handler = dirtytime_interval_handler, | ||
| 1250 | .extra1 = &zero, | ||
| 1251 | }, | ||
| 1252 | { | ||
| 1231 | .procname = "nr_pdflush_threads", | 1253 | .procname = "nr_pdflush_threads", | 
| 1232 | .mode = 0444 /* read-only */, | 1254 | .mode = 0444 /* read-only */, | 
| 1233 | .proc_handler = pdflush_proc_obsolete, | 1255 | .proc_handler = pdflush_proc_obsolete, | 
| @@ -1313,6 +1335,15 @@ static struct ctl_table vm_table[] = { | |||
| 1313 | .extra1 = &min_extfrag_threshold, | 1335 | .extra1 = &min_extfrag_threshold, | 
| 1314 | .extra2 = &max_extfrag_threshold, | 1336 | .extra2 = &max_extfrag_threshold, | 
| 1315 | }, | 1337 | }, | 
| 1338 | { | ||
| 1339 | .procname = "compact_unevictable_allowed", | ||
| 1340 | .data = &sysctl_compact_unevictable_allowed, | ||
| 1341 | .maxlen = sizeof(int), | ||
| 1342 | .mode = 0644, | ||
| 1343 | .proc_handler = proc_dointvec, | ||
| 1344 | .extra1 = &zero, | ||
| 1345 | .extra2 = &one, | ||
| 1346 | }, | ||
| 1316 | 1347 | ||
| 1317 | #endif /* CONFIG_COMPACTION */ | 1348 | #endif /* CONFIG_COMPACTION */ | 
| 1318 | { | 1349 | { | 
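
The new table entries surface as regular files under /proc/sys (kernel/nmi_watchdog, kernel/soft_watchdog, vm/dirtytime_expire_seconds, vm/compact_unevictable_allowed). A small sketch that reads a few of them back, assuming the usual procfs mapping of the names above:

#include <stdio.h>

static int read_knob(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	printf("soft_watchdog            = %d\n",
	       read_knob("/proc/sys/kernel/soft_watchdog"));
	printf("nmi_watchdog             = %d\n",
	       read_knob("/proc/sys/kernel/nmi_watchdog"));
	printf("dirtytime_expire_seconds = %d\n",
	       read_knob("/proc/sys/vm/dirtytime_expire_seconds"));
	return 0;
}
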
| diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET | |||
| 33 | config GENERIC_CLOCKEVENTS | 33 | config GENERIC_CLOCKEVENTS | 
| 34 | bool | 34 | bool | 
| 35 | 35 | ||
| 36 | # Migration helper. Builds, but does not invoke | ||
| 37 | config GENERIC_CLOCKEVENTS_BUILD | ||
| 38 | bool | ||
| 39 | default y | ||
| 40 | depends on GENERIC_CLOCKEVENTS | ||
| 41 | |||
| 42 | # Architecture can handle broadcast in a driver-agnostic way | 36 | # Architecture can handle broadcast in a driver-agnostic way | 
| 43 | config ARCH_HAS_TICK_BROADCAST | 37 | config ARCH_HAS_TICK_BROADCAST | 
| 44 | bool | 38 | bool | 
| diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c09c07817d7a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o | |||
| 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 2 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o | 
| 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 3 | obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o | 
| 4 | 4 | ||
| 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o | 
| 6 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | ||
| 7 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 6 | ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | 
| 8 | obj-y += tick-broadcast.o | 7 | obj-y += tick-broadcast.o | 
| 9 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 8 | obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o | 
| 10 | endif | 9 | endif | 
| 11 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 10 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 
| 12 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o | 11 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o | 
| 13 | obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o | ||
| 14 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 12 | obj-$(CONFIG_TIMER_STATS) += timer_stats.o | 
| 15 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 13 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 
| 16 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 14 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 
| diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..25d942d1da27 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -94,25 +94,76 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) | |||
| 94 | } | 94 | } | 
| 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 95 | EXPORT_SYMBOL_GPL(clockevent_delta2ns); | 
| 96 | 96 | ||
| 97 | static int __clockevents_set_state(struct clock_event_device *dev, | ||
| 98 | enum clock_event_state state) | ||
| 99 | { | ||
| 100 | /* Transition with legacy set_mode() callback */ | ||
| 101 | if (dev->set_mode) { | ||
| 102 | /* Legacy callback doesn't support new modes */ | ||
| 103 | if (state > CLOCK_EVT_STATE_ONESHOT) | ||
| 104 | return -ENOSYS; | ||
| 105 | /* | ||
| 106 | * 'clock_event_state' and 'clock_event_mode' have 1-to-1 | ||
| 107 | * mapping until *_ONESHOT, and so a simple cast will work. | ||
| 108 | */ | ||
| 109 | dev->set_mode((enum clock_event_mode)state, dev); | ||
| 110 | dev->mode = (enum clock_event_mode)state; | ||
| 111 | return 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
| 115 | return 0; | ||
| 116 | |||
| 117 | /* Transition with new state-specific callbacks */ | ||
| 118 | switch (state) { | ||
| 119 | case CLOCK_EVT_STATE_DETACHED: | ||
| 120 | /* | ||
| 121 | * This is an internal state, which is guaranteed to go from | ||
| 122 | * SHUTDOWN to DETACHED. No driver interaction required. | ||
| 123 | */ | ||
| 124 | return 0; | ||
| 125 | |||
| 126 | case CLOCK_EVT_STATE_SHUTDOWN: | ||
| 127 | return dev->set_state_shutdown(dev); | ||
| 128 | |||
| 129 | case CLOCK_EVT_STATE_PERIODIC: | ||
| 130 | /* Core internal bug */ | ||
| 131 | if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) | ||
| 132 | return -ENOSYS; | ||
| 133 | return dev->set_state_periodic(dev); | ||
| 134 | |||
| 135 | case CLOCK_EVT_STATE_ONESHOT: | ||
| 136 | /* Core internal bug */ | ||
| 137 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
| 138 | return -ENOSYS; | ||
| 139 | return dev->set_state_oneshot(dev); | ||
| 140 | |||
| 141 | default: | ||
| 142 | return -ENOSYS; | ||
| 143 | } | ||
| 144 | } | ||
| 145 | |||
| 97 | /** | 146 | /** | 
| 98 | * clockevents_set_mode - set the operating mode of a clock event device | 147 | * clockevents_set_state - set the operating state of a clock event device | 
| 99 | * @dev: device to modify | 148 | * @dev: device to modify | 
| 100 | * @mode: new mode | 149 | * @state: new state | 
| 101 | * | 150 | * | 
| 102 | * Must be called with interrupts disabled ! | 151 | * Must be called with interrupts disabled ! | 
| 103 | */ | 152 | */ | 
| 104 | void clockevents_set_mode(struct clock_event_device *dev, | 153 | void clockevents_set_state(struct clock_event_device *dev, | 
| 105 | enum clock_event_mode mode) | 154 | enum clock_event_state state) | 
| 106 | { | 155 | { | 
| 107 | if (dev->mode != mode) { | 156 | if (dev->state != state) { | 
| 108 | dev->set_mode(mode, dev); | 157 | if (__clockevents_set_state(dev, state)) | 
| 109 | dev->mode = mode; | 158 | return; | 
| 159 | |||
| 160 | dev->state = state; | ||
| 110 | 161 | ||
| 111 | /* | 162 | /* | 
| 112 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 163 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | 
| 113 | * on it, so fix it up and emit a warning: | 164 | * on it, so fix it up and emit a warning: | 
| 114 | */ | 165 | */ | 
| 115 | if (mode == CLOCK_EVT_MODE_ONESHOT) { | 166 | if (state == CLOCK_EVT_STATE_ONESHOT) { | 
| 116 | if (unlikely(!dev->mult)) { | 167 | if (unlikely(!dev->mult)) { | 
| 117 | dev->mult = 1; | 168 | dev->mult = 1; | 
| 118 | WARN_ON(1); | 169 | WARN_ON(1); | 
| @@ -127,10 +178,28 @@ void clockevents_set_mode(struct clock_event_device *dev, | |||
| 127 | */ | 178 | */ | 
| 128 | void clockevents_shutdown(struct clock_event_device *dev) | 179 | void clockevents_shutdown(struct clock_event_device *dev) | 
| 129 | { | 180 | { | 
| 130 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 181 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); | 
| 131 | dev->next_event.tv64 = KTIME_MAX; | 182 | dev->next_event.tv64 = KTIME_MAX; | 
| 132 | } | 183 | } | 
| 133 | 184 | ||
| 185 | /** | ||
| 186 | * clockevents_tick_resume - Resume the tick device before using it again | ||
| 187 | * @dev: device to resume | ||
| 188 | */ | ||
| 189 | int clockevents_tick_resume(struct clock_event_device *dev) | ||
| 190 | { | ||
| 191 | int ret = 0; | ||
| 192 | |||
| 193 | if (dev->set_mode) { | ||
| 194 | dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); | ||
| 195 | dev->mode = CLOCK_EVT_MODE_RESUME; | ||
| 196 | } else if (dev->tick_resume) { | ||
| 197 | ret = dev->tick_resume(dev); | ||
| 198 | } | ||
| 199 | |||
| 200 | return ret; | ||
| 201 | } | ||
| 202 | |||
| 134 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 203 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST | 
| 135 | 204 | ||
| 136 | /* Limit min_delta to a jiffie */ | 205 | /* Limit min_delta to a jiffie */ | 
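
__clockevents_set_state() above either funnels the request through the legacy set_mode() callback or dispatches to one per-state callback. A reduced, standalone model of that dispatch; the struct and callbacks here only mimic the kernel's clock_event_device, they are not it:

#include <stdio.h>
#include <errno.h>

enum state { ST_DETACHED, ST_SHUTDOWN, ST_PERIODIC, ST_ONESHOT };

struct evtdev {
	enum state state;
	int (*set_state_shutdown)(struct evtdev *);
	int (*set_state_periodic)(struct evtdev *);
	int (*set_state_oneshot)(struct evtdev *);
};

static int set_state(struct evtdev *d, enum state s)
{
	switch (s) {
	case ST_DETACHED:	/* internal transition, no driver call */
		return 0;
	case ST_SHUTDOWN:
		return d->set_state_shutdown(d);
	case ST_PERIODIC:	/* only if the device supports the feature */
		return d->set_state_periodic ? d->set_state_periodic(d) : -ENOSYS;
	case ST_ONESHOT:
		return d->set_state_oneshot ? d->set_state_oneshot(d) : -ENOSYS;
	}
	return -ENOSYS;
}

static int shutdown_cb(struct evtdev *d) { (void)d; puts("-> shutdown"); return 0; }
static int oneshot_cb(struct evtdev *d)  { (void)d; puts("-> oneshot");  return 0; }

int main(void)
{
	struct evtdev d = {
		.state = ST_DETACHED,
		.set_state_shutdown = shutdown_cb,
		.set_state_oneshot  = oneshot_cb,	/* no periodic support */
	};

	if (!set_state(&d, ST_ONESHOT))
		d.state = ST_ONESHOT;
	printf("periodic request: %d\n", set_state(&d, ST_PERIODIC));
	if (!set_state(&d, ST_SHUTDOWN))
		d.state = ST_SHUTDOWN;
	return 0;
}

Unlike the old void set_mode(), each per-state callback can return an error, which is what lets clockevents_set_state() above bail out without updating dev->state.
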
| @@ -183,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
| 183 | delta = dev->min_delta_ns; | 252 | delta = dev->min_delta_ns; | 
| 184 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 253 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 
| 185 | 254 | ||
| 186 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 255 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) | 
| 187 | return 0; | 256 | return 0; | 
| 188 | 257 | ||
| 189 | dev->retries++; | 258 | dev->retries++; | 
| @@ -220,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) | |||
| 220 | delta = dev->min_delta_ns; | 289 | delta = dev->min_delta_ns; | 
| 221 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 290 | dev->next_event = ktime_add_ns(ktime_get(), delta); | 
| 222 | 291 | ||
| 223 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 292 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) | 
| 224 | return 0; | 293 | return 0; | 
| 225 | 294 | ||
| 226 | dev->retries++; | 295 | dev->retries++; | 
| @@ -252,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
| 252 | 321 | ||
| 253 | dev->next_event = expires; | 322 | dev->next_event = expires; | 
| 254 | 323 | ||
| 255 | if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) | 324 | if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) | 
| 256 | return 0; | 325 | return 0; | 
| 257 | 326 | ||
| 258 | /* Shortcut for clockevent devices that can deal with ktime. */ | 327 | /* Shortcut for clockevent devices that can deal with ktime. */ | 
| @@ -297,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
| 297 | struct clock_event_device *dev, *newdev = NULL; | 366 | struct clock_event_device *dev, *newdev = NULL; | 
| 298 | 367 | ||
| 299 | list_for_each_entry(dev, &clockevent_devices, list) { | 368 | list_for_each_entry(dev, &clockevent_devices, list) { | 
| 300 | if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) | 369 | if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) | 
| 301 | continue; | 370 | continue; | 
| 302 | 371 | ||
| 303 | if (!tick_check_replacement(newdev, dev)) | 372 | if (!tick_check_replacement(newdev, dev)) | 
| @@ -323,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) | |||
| 323 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 392 | static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) | 
| 324 | { | 393 | { | 
| 325 | /* Fast track. Device is unused */ | 394 | /* Fast track. Device is unused */ | 
| 326 | if (ced->mode == CLOCK_EVT_MODE_UNUSED) { | 395 | if (ced->state == CLOCK_EVT_STATE_DETACHED) { | 
| 327 | list_del_init(&ced->list); | 396 | list_del_init(&ced->list); | 
| 328 | return 0; | 397 | return 0; | 
| 329 | } | 398 | } | 
| @@ -373,6 +442,37 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) | |||
| 373 | } | 442 | } | 
| 374 | EXPORT_SYMBOL_GPL(clockevents_unbind); | 443 | EXPORT_SYMBOL_GPL(clockevents_unbind); | 
| 375 | 444 | ||
| 445 | /* Sanity check of state transition callbacks */ | ||
| 446 | static int clockevents_sanity_check(struct clock_event_device *dev) | ||
| 447 | { | ||
| 448 | /* Legacy set_mode() callback */ | ||
| 449 | if (dev->set_mode) { | ||
| 450 | /* We shouldn't be supporting new modes now */ | ||
| 451 | WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || | ||
| 452 | dev->set_state_shutdown || dev->tick_resume); | ||
| 453 | |||
| 454 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
| 455 | return 0; | ||
| 456 | } | ||
| 457 | |||
| 458 | if (dev->features & CLOCK_EVT_FEAT_DUMMY) | ||
| 459 | return 0; | ||
| 460 | |||
| 461 | /* New state-specific callbacks */ | ||
| 462 | if (!dev->set_state_shutdown) | ||
| 463 | return -EINVAL; | ||
| 464 | |||
| 465 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | ||
| 466 | !dev->set_state_periodic) | ||
| 467 | return -EINVAL; | ||
| 468 | |||
| 469 | if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && | ||
| 470 | !dev->set_state_oneshot) | ||
| 471 | return -EINVAL; | ||
| 472 | |||
| 473 | return 0; | ||
| 474 | } | ||
| 475 | |||
| 376 | /** | 476 | /** | 
| 377 | * clockevents_register_device - register a clock event device | 477 | * clockevents_register_device - register a clock event device | 
| 378 | * @dev: device to register | 478 | * @dev: device to register | 
| @@ -381,7 +481,11 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
| 381 | { | 481 | { | 
| 382 | unsigned long flags; | 482 | unsigned long flags; | 
| 383 | 483 | ||
| 384 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 484 | BUG_ON(clockevents_sanity_check(dev)); | 
| 485 | |||
| 486 | /* Initialize state to DETACHED */ | ||
| 487 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
| 488 | |||
| 385 | if (!dev->cpumask) { | 489 | if (!dev->cpumask) { | 
| 386 | WARN_ON(num_possible_cpus() > 1); | 490 | WARN_ON(num_possible_cpus() > 1); | 
| 387 | dev->cpumask = cpumask_of(smp_processor_id()); | 491 | dev->cpumask = cpumask_of(smp_processor_id()); | 
| @@ -445,11 +549,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) | |||
| 445 | { | 549 | { | 
| 446 | clockevents_config(dev, freq); | 550 | clockevents_config(dev, freq); | 
| 447 | 551 | ||
| 448 | if (dev->mode == CLOCK_EVT_MODE_ONESHOT) | 552 | if (dev->state == CLOCK_EVT_STATE_ONESHOT) | 
| 449 | return clockevents_program_event(dev, dev->next_event, false); | 553 | return clockevents_program_event(dev, dev->next_event, false); | 
| 450 | 554 | ||
| 451 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 555 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) | 
| 452 | dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); | 556 | return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); | 
| 453 | 557 | ||
| 454 | return 0; | 558 | return 0; | 
| 455 | } | 559 | } | 
| @@ -491,30 +595,27 @@ void clockevents_handle_noop(struct clock_event_device *dev) | |||
| 491 | * @old: device to release (can be NULL) | 595 | * @old: device to release (can be NULL) | 
| 492 | * @new: device to request (can be NULL) | 596 | * @new: device to request (can be NULL) | 
| 493 | * | 597 | * | 
| 494 | * Called from the notifier chain. clockevents_lock is held already | 598 | * Called from various tick functions with clockevents_lock held and | 
| 599 | * interrupts disabled. | ||
| 495 | */ | 600 | */ | 
| 496 | void clockevents_exchange_device(struct clock_event_device *old, | 601 | void clockevents_exchange_device(struct clock_event_device *old, | 
| 497 | struct clock_event_device *new) | 602 | struct clock_event_device *new) | 
| 498 | { | 603 | { | 
| 499 | unsigned long flags; | ||
| 500 | |||
| 501 | local_irq_save(flags); | ||
| 502 | /* | 604 | /* | 
| 503 | * Caller releases a clock event device. We queue it into the | 605 | * Caller releases a clock event device. We queue it into the | 
| 504 | * released list and do a notify add later. | 606 | * released list and do a notify add later. | 
| 505 | */ | 607 | */ | 
| 506 | if (old) { | 608 | if (old) { | 
| 507 | module_put(old->owner); | 609 | module_put(old->owner); | 
| 508 | clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); | 610 | clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); | 
| 509 | list_del(&old->list); | 611 | list_del(&old->list); | 
| 510 | list_add(&old->list, &clockevents_released); | 612 | list_add(&old->list, &clockevents_released); | 
| 511 | } | 613 | } | 
| 512 | 614 | ||
| 513 | if (new) { | 615 | if (new) { | 
| 514 | BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); | 616 | BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); | 
| 515 | clockevents_shutdown(new); | 617 | clockevents_shutdown(new); | 
| 516 | } | 618 | } | 
| 517 | local_irq_restore(flags); | ||
| 518 | } | 619 | } | 
| 519 | 620 | ||
| 520 | /** | 621 | /** | 
| @@ -541,74 +642,40 @@ void clockevents_resume(void) | |||
| 541 | dev->resume(dev); | 642 | dev->resume(dev); | 
| 542 | } | 643 | } | 
| 543 | 644 | ||
| 544 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 645 | #ifdef CONFIG_HOTPLUG_CPU | 
| 545 | /** | 646 | /** | 
| 546 | * clockevents_notify - notification about relevant events | 647 | * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu | 
| 547 | * Returns 0 on success, any other value on error | ||
| 548 | */ | 648 | */ | 
| 549 | int clockevents_notify(unsigned long reason, void *arg) | 649 | void tick_cleanup_dead_cpu(int cpu) | 
| 550 | { | 650 | { | 
| 551 | struct clock_event_device *dev, *tmp; | 651 | struct clock_event_device *dev, *tmp; | 
| 552 | unsigned long flags; | 652 | unsigned long flags; | 
| 553 | int cpu, ret = 0; | ||
| 554 | 653 | ||
| 555 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 654 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 
| 556 | 655 | ||
| 557 | switch (reason) { | 656 | tick_shutdown_broadcast_oneshot(cpu); | 
| 558 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 657 | tick_shutdown_broadcast(cpu); | 
| 559 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 658 | tick_shutdown(cpu); | 
| 560 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 659 | /* | 
| 561 | tick_broadcast_on_off(reason, arg); | 660 | * Unregister the clock event devices which were | 
| 562 | break; | 661 | * released from the users in the notify chain. | 
| 563 | 662 | */ | |
| 564 | case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: | 663 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | 
| 565 | case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: | 664 | list_del(&dev->list); | 
| 566 | ret = tick_broadcast_oneshot_control(reason); | 665 | /* | 
| 567 | break; | 666 | * Now check whether the CPU has left unused per cpu devices | 
| 568 | 667 | */ | |
| 569 | case CLOCK_EVT_NOTIFY_CPU_DYING: | 668 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | 
| 570 | tick_handover_do_timer(arg); | 669 | if (cpumask_test_cpu(cpu, dev->cpumask) && | 
| 571 | break; | 670 | cpumask_weight(dev->cpumask) == 1 && | 
| 572 | 671 | !tick_is_broadcast_device(dev)) { | |
| 573 | case CLOCK_EVT_NOTIFY_SUSPEND: | 672 | BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); | 
| 574 | tick_suspend(); | ||
| 575 | tick_suspend_broadcast(); | ||
| 576 | break; | ||
| 577 | |||
| 578 | case CLOCK_EVT_NOTIFY_RESUME: | ||
| 579 | tick_resume(); | ||
| 580 | break; | ||
| 581 | |||
| 582 | case CLOCK_EVT_NOTIFY_CPU_DEAD: | ||
| 583 | tick_shutdown_broadcast_oneshot(arg); | ||
| 584 | tick_shutdown_broadcast(arg); | ||
| 585 | tick_shutdown(arg); | ||
| 586 | /* | ||
| 587 | * Unregister the clock event devices which were | ||
| 588 | * released from the users in the notify chain. | ||
| 589 | */ | ||
| 590 | list_for_each_entry_safe(dev, tmp, &clockevents_released, list) | ||
| 591 | list_del(&dev->list); | 673 | list_del(&dev->list); | 
| 592 | /* | ||
| 593 | * Now check whether the CPU has left unused per cpu devices | ||
| 594 | */ | ||
| 595 | cpu = *((int *)arg); | ||
| 596 | list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { | ||
| 597 | if (cpumask_test_cpu(cpu, dev->cpumask) && | ||
| 598 | cpumask_weight(dev->cpumask) == 1 && | ||
| 599 | !tick_is_broadcast_device(dev)) { | ||
| 600 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | ||
| 601 | list_del(&dev->list); | ||
| 602 | } | ||
| 603 | } | 674 | } | 
| 604 | break; | ||
| 605 | default: | ||
| 606 | break; | ||
| 607 | } | 675 | } | 
| 608 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 676 | raw_spin_unlock_irqrestore(&clockevents_lock, flags); | 
| 609 | return ret; | ||
| 610 | } | 677 | } | 
| 611 | EXPORT_SYMBOL_GPL(clockevents_notify); | 678 | #endif | 
| 612 | 679 | ||
| 613 | #ifdef CONFIG_SYSFS | 680 | #ifdef CONFIG_SYSFS | 
| 614 | struct bus_type clockevents_subsys = { | 681 | struct bus_type clockevents_subsys = { | 
| @@ -727,5 +794,3 @@ static int __init clockevents_init_sysfs(void) | |||
| 727 | } | 794 | } | 
| 728 | device_initcall(clockevents_init_sysfs); | 795 | device_initcall(clockevents_init_sysfs); | 
| 729 | #endif /* SYSFS */ | 796 | #endif /* SYSFS */ | 
| 730 | |||
| 731 | #endif /* GENERIC_CLOCK_EVENTS */ | ||
| diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) | |||
| 142 | schedule_work(&watchdog_work); | 142 | schedule_work(&watchdog_work); | 
| 143 | } | 143 | } | 
| 144 | 144 | ||
| 145 | static void clocksource_unstable(struct clocksource *cs, int64_t delta) | ||
| 146 | { | ||
| 147 | printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", | ||
| 148 | cs->name, delta); | ||
| 149 | __clocksource_unstable(cs); | ||
| 150 | } | ||
| 151 | |||
| 152 | /** | 145 | /** | 
| 153 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 146 | * clocksource_mark_unstable - mark clocksource unstable via watchdog | 
| 154 | * @cs: clocksource to be marked unstable | 147 | * @cs: clocksource to be marked unstable | 
| @@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) | |||
| 174 | static void clocksource_watchdog(unsigned long data) | 167 | static void clocksource_watchdog(unsigned long data) | 
| 175 | { | 168 | { | 
| 176 | struct clocksource *cs; | 169 | struct clocksource *cs; | 
| 177 | cycle_t csnow, wdnow, delta; | 170 | cycle_t csnow, wdnow, cslast, wdlast, delta; | 
| 178 | int64_t wd_nsec, cs_nsec; | 171 | int64_t wd_nsec, cs_nsec; | 
| 179 | int next_cpu, reset_pending; | 172 | int next_cpu, reset_pending; | 
| 180 | 173 | ||
| @@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) | |||
| 213 | 206 | ||
| 214 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 207 | delta = clocksource_delta(csnow, cs->cs_last, cs->mask); | 
| 215 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 208 | cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); | 
| 209 | wdlast = cs->wd_last; /* save these in case we print them */ | ||
| 210 | cslast = cs->cs_last; | ||
| 216 | cs->cs_last = csnow; | 211 | cs->cs_last = csnow; | 
| 217 | cs->wd_last = wdnow; | 212 | cs->wd_last = wdnow; | 
| 218 | 213 | ||
| @@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) | |||
| 221 | 216 | ||
| 222 | /* Check the deviation from the watchdog clocksource. */ | 217 | /* Check the deviation from the watchdog clocksource. */ | 
| 223 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 218 | if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { | 
| 224 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 219 | pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); | 
| 220 | pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", | ||
| 221 | watchdog->name, wdnow, wdlast, watchdog->mask); | ||
| 222 | pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", | ||
| 223 | cs->name, csnow, cslast, cs->mask); | ||
| 224 | __clocksource_unstable(cs); | ||
| 225 | continue; | 225 | continue; | 
| 226 | } | 226 | } | 
| 227 | 227 | ||
| @@ -469,26 +469,25 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) | |||
| 469 | * @shift: cycle to nanosecond divisor (power of two) | 469 | * @shift: cycle to nanosecond divisor (power of two) | 
| 470 | * @maxadj: maximum adjustment value to mult (~11%) | 470 | * @maxadj: maximum adjustment value to mult (~11%) | 
| 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 471 | * @mask: bitmask for two's complement subtraction of non 64 bit counters | 
| 472 | * @max_cyc: maximum cycle value before potential overflow (does not include | ||
| 473 | * any safety margin) | ||
| 474 | * | ||
| 475 | * NOTE: This function includes a safety margin of 50%, in other words, we | ||
| 476 | * return half the number of nanoseconds the hardware counter can technically | ||
| 477 | * cover. This is done so that we can potentially detect problems caused by | ||
| 478 | * delayed timers or bad hardware, which might result in time intervals that | ||
| 479 | * are larger than what the math used can handle without overflows. | ||
| 472 | */ | 480 | */ | 
| 473 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | 481 | u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) | 
| 474 | { | 482 | { | 
| 475 | u64 max_nsecs, max_cycles; | 483 | u64 max_nsecs, max_cycles; | 
| 476 | 484 | ||
| 477 | /* | 485 | /* | 
| 478 | * Calculate the maximum number of cycles that we can pass to the | 486 | * Calculate the maximum number of cycles that we can pass to the | 
| 479 | * cyc2ns function without overflowing a 64-bit signed result. The | 487 | * cyc2ns() function without overflowing a 64-bit result. | 
| 480 | * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) | ||
| 481 | * which is equivalent to the below. | ||
| 482 | * max_cycles < (2^63)/(mult + maxadj) | ||
| 483 | * max_cycles < 2^(log2((2^63)/(mult + maxadj))) | ||
| 484 | * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) | ||
| 485 | * max_cycles < 2^(63 - log2(mult + maxadj)) | ||
| 486 | * max_cycles < 1 << (63 - log2(mult + maxadj)) | ||
| 487 | * Please note that we add 1 to the result of the log2 to account for | ||
| 488 | * any rounding errors, ensure the above inequality is satisfied and | ||
| 489 | * no overflow will occur. | ||
| 490 | */ | 488 | */ | 
| 491 | max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); | 489 | max_cycles = ULLONG_MAX; | 
| 490 | do_div(max_cycles, mult+maxadj); | ||
| 492 | 491 | ||
| 493 | /* | 492 | /* | 
| 494 | * The actual maximum number of cycles we can defer the clocksource is | 493 | * The actual maximum number of cycles we can defer the clocksource is | 
| @@ -499,27 +498,26 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) | |||
| 499 | max_cycles = min(max_cycles, mask); | 498 | max_cycles = min(max_cycles, mask); | 
| 500 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 499 | max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); | 
| 501 | 500 | ||
| 501 | /* return the max_cycles value as well if requested */ | ||
| 502 | if (max_cyc) | ||
| 503 | *max_cyc = max_cycles; | ||
| 504 | |||
| 505 | /* Return 50% of the actual maximum, so we can detect bad values */ | ||
| 506 | max_nsecs >>= 1; | ||
| 507 | |||
| 502 | return max_nsecs; | 508 | return max_nsecs; | 
| 503 | } | 509 | } | 
| 504 | 510 | ||
| 505 | /** | 511 | /** | 
| 506 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 512 | * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles | 
| 507 | * @cs: Pointer to clocksource | 513 | * @cs: Pointer to clocksource to be updated | 
| 508 | * | 514 | * | 
| 509 | */ | 515 | */ | 
| 510 | static u64 clocksource_max_deferment(struct clocksource *cs) | 516 | static inline void clocksource_update_max_deferment(struct clocksource *cs) | 
| 511 | { | 517 | { | 
| 512 | u64 max_nsecs; | 518 | cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, | 
| 513 | 519 | cs->maxadj, cs->mask, | |
| 514 | max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, | 520 | &cs->max_cycles); | 
| 515 | cs->mask); | ||
| 516 | /* | ||
| 517 | * To ensure that the clocksource does not wrap whilst we are idle, | ||
| 518 | * limit the time the clocksource can be deferred by 12.5%. Please | ||
| 519 | * note a margin of 12.5% is used because this can be computed with | ||
| 520 | * a shift, versus say 10% which would require division. | ||
| 521 | */ | ||
| 522 | return max_nsecs - (max_nsecs >> 3); | ||
| 523 | } | 521 | } | 
| 524 | 522 | ||
| 525 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 523 | #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET | 
| @@ -648,7 +646,7 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 648 | } | 646 | } | 
| 649 | 647 | ||
| 650 | /** | 648 | /** | 
| 651 | * __clocksource_updatefreq_scale - Used update clocksource with new freq | 649 | * __clocksource_update_freq_scale - Used update clocksource with new freq | 
| 652 | * @cs: clocksource to be registered | 650 | * @cs: clocksource to be registered | 
| 653 | * @scale: Scale factor multiplied against freq to get clocksource hz | 651 | * @scale: Scale factor multiplied against freq to get clocksource hz | 
| 654 | * @freq: clocksource frequency (cycles per second) divided by scale | 652 | * @freq: clocksource frequency (cycles per second) divided by scale | 
| @@ -656,48 +654,64 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 656 | * This should only be called from the clocksource->enable() method. | 654 | * This should only be called from the clocksource->enable() method. | 
| 657 | * | 655 | * | 
| 658 | * This *SHOULD NOT* be called directly! Please use the | 656 | * This *SHOULD NOT* be called directly! Please use the | 
| 659 | * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. | 657 | * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper | 
| 658 | * functions. | ||
| 660 | */ | 659 | */ | 
| 661 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 660 | void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) | 
| 662 | { | 661 | { | 
| 663 | u64 sec; | 662 | u64 sec; | 
| 663 | |||
| 664 | /* | 664 | /* | 
| 665 | * Calc the maximum number of seconds which we can run before | 665 | * Default clocksources are *special* and self-define their mult/shift. | 
| 666 | * wrapping around. For clocksources which have a mask > 32bit | 666 | * But, you're not special, so you should specify a freq value. | 
| 667 | * we need to limit the max sleep time to have a good | ||
| 668 | * conversion precision. 10 minutes is still a reasonable | ||
| 669 | * amount. That results in a shift value of 24 for a | ||
| 670 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
| 671 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
| 672 | * margin as we do in clocksource_max_deferment() | ||
| 673 | */ | 667 | */ | 
| 674 | sec = (cs->mask - (cs->mask >> 3)); | 668 | if (freq) { | 
| 675 | do_div(sec, freq); | 669 | /* | 
| 676 | do_div(sec, scale); | 670 | * Calc the maximum number of seconds which we can run before | 
| 677 | if (!sec) | 671 | * wrapping around. For clocksources which have a mask > 32-bit | 
| 678 | sec = 1; | 672 | * we need to limit the max sleep time to have a good | 
| 679 | else if (sec > 600 && cs->mask > UINT_MAX) | 673 | * conversion precision. 10 minutes is still a reasonable | 
| 680 | sec = 600; | 674 | * amount. That results in a shift value of 24 for a | 
| 681 | 675 | * clocksource with mask >= 40-bit and f >= 4GHz. That maps to | |
| 682 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 676 | * ~ 0.06ppm granularity for NTP. | 
| 683 | NSEC_PER_SEC / scale, sec * scale); | 677 | */ | 
| 684 | 678 | sec = cs->mask; | |
| 679 | do_div(sec, freq); | ||
| 680 | do_div(sec, scale); | ||
| 681 | if (!sec) | ||
| 682 | sec = 1; | ||
| 683 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
| 684 | sec = 600; | ||
| 685 | |||
| 686 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | ||
| 687 | NSEC_PER_SEC / scale, sec * scale); | ||
| 688 | } | ||
| 685 | /* | 689 | /* | 
| 686 | * for clocksources that have large mults, to avoid overflow. | 690 | * Ensure clocksources that have large 'mult' values don't overflow | 
| 687 | * Since mult may be adjusted by ntp, add an safety extra margin | 691 | * when adjusted. | 
| 688 | * | ||
| 689 | */ | 692 | */ | 
| 690 | cs->maxadj = clocksource_max_adjustment(cs); | 693 | cs->maxadj = clocksource_max_adjustment(cs); | 
| 691 | while ((cs->mult + cs->maxadj < cs->mult) | 694 | while (freq && ((cs->mult + cs->maxadj < cs->mult) | 
| 692 | || (cs->mult - cs->maxadj > cs->mult)) { | 695 | || (cs->mult - cs->maxadj > cs->mult))) { | 
| 693 | cs->mult >>= 1; | 696 | cs->mult >>= 1; | 
| 694 | cs->shift--; | 697 | cs->shift--; | 
| 695 | cs->maxadj = clocksource_max_adjustment(cs); | 698 | cs->maxadj = clocksource_max_adjustment(cs); | 
| 696 | } | 699 | } | 
| 697 | 700 | ||
| 698 | cs->max_idle_ns = clocksource_max_deferment(cs); | 701 | /* | 
| 702 | * Only warn for *special* clocksources that self-define | ||
| 703 | * their mult/shift values and don't specify a freq. | ||
| 704 | */ | ||
| 705 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
| 706 | "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", | ||
| 707 | cs->name); | ||
| 708 | |||
| 709 | clocksource_update_max_deferment(cs); | ||
| 710 | |||
| 711 | pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", | ||
| 712 | cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); | ||
| 699 | } | 713 | } | 
| 700 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 714 | EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); | 
| 701 | 715 | ||
| 702 | /** | 716 | /** | 
| 703 | * __clocksource_register_scale - Used to install new clocksources | 717 | * __clocksource_register_scale - Used to install new clocksources | 
| @@ -714,7 +728,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 714 | { | 728 | { | 
| 715 | 729 | ||
| 716 | /* Initialize mult/shift and max_idle_ns */ | 730 | /* Initialize mult/shift and max_idle_ns */ | 
| 717 | __clocksource_updatefreq_scale(cs, scale, freq); | 731 | __clocksource_update_freq_scale(cs, scale, freq); | 
| 718 | 732 | ||
| 719 | /* Add clocksource to the clocksource list */ | 733 | /* Add clocksource to the clocksource list */ | 
| 720 | mutex_lock(&clocksource_mutex); | 734 | mutex_lock(&clocksource_mutex); | 
| @@ -726,33 +740,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 726 | } | 740 | } | 
| 727 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 741 | EXPORT_SYMBOL_GPL(__clocksource_register_scale); | 
| 728 | 742 | ||
| 729 | |||
| 730 | /** | ||
| 731 | * clocksource_register - Used to install new clocksources | ||
| 732 | * @cs: clocksource to be registered | ||
| 733 | * | ||
| 734 | * Returns -EBUSY if registration fails, zero otherwise. | ||
| 735 | */ | ||
| 736 | int clocksource_register(struct clocksource *cs) | ||
| 737 | { | ||
| 738 | /* calculate max adjustment for given mult/shift */ | ||
| 739 | cs->maxadj = clocksource_max_adjustment(cs); | ||
| 740 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
| 741 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
| 742 | cs->name); | ||
| 743 | |||
| 744 | /* calculate max idle time permitted for this clocksource */ | ||
| 745 | cs->max_idle_ns = clocksource_max_deferment(cs); | ||
| 746 | |||
| 747 | mutex_lock(&clocksource_mutex); | ||
| 748 | clocksource_enqueue(cs); | ||
| 749 | clocksource_enqueue_watchdog(cs); | ||
| 750 | clocksource_select(); | ||
| 751 | mutex_unlock(&clocksource_mutex); | ||
| 752 | return 0; | ||
| 753 | } | ||
| 754 | EXPORT_SYMBOL(clocksource_register); | ||
| 755 | |||
| 756 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 743 | static void __clocksource_change_rating(struct clocksource *cs, int rating) | 
| 757 | { | 744 | { | 
| 758 | list_del(&cs->list); | 745 | list_del(&cs->list); | 
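
The reworked clocks_calc_max_nsecs() above replaces the conservative power-of-two bound with an exact division, reports max_cycles back to the caller, and keeps a 50% safety margin. Its arithmetic, reduced to a standalone sketch with made-up mult/shift/mask values:

#include <stdio.h>
#include <stdint.h>

static uint64_t cyc2ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

int main(void)
{
	/* Example values: roughly a 1 GHz, 32-bit counter. */
	uint32_t mult = 4194304, shift = 22;
	uint32_t maxadj = mult / 9;			/* ~11% adjustment headroom */
	uint64_t mask = 0xffffffffULL;

	uint64_t max_cycles = UINT64_MAX / (mult + maxadj);	/* exact bound */
	if (max_cycles > mask)
		max_cycles = mask;		/* counter width limits us first */

	uint64_t max_nsecs = cyc2ns(max_cycles, mult - maxadj, shift);
	printf("max_cycles: %llu, max_idle_ns with 50%% margin: %llu\n",
	       (unsigned long long)max_cycles,
	       (unsigned long long)(max_nsecs >> 1));
	return 0;
}
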
| diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bee0c1f78091..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -54,7 +54,7 @@ | |||
| 54 | 54 | ||
| 55 | #include <trace/events/timer.h> | 55 | #include <trace/events/timer.h> | 
| 56 | 56 | ||
| 57 | #include "timekeeping.h" | 57 | #include "tick-internal.h" | 
| 58 | 58 | ||
| 59 | /* | 59 | /* | 
| 60 | * The timer bases: | 60 | * The timer bases: | 
| @@ -1707,17 +1707,10 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
| 1707 | break; | 1707 | break; | 
| 1708 | 1708 | ||
| 1709 | #ifdef CONFIG_HOTPLUG_CPU | 1709 | #ifdef CONFIG_HOTPLUG_CPU | 
| 1710 | case CPU_DYING: | ||
| 1711 | case CPU_DYING_FROZEN: | ||
| 1712 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); | ||
| 1713 | break; | ||
| 1714 | case CPU_DEAD: | 1710 | case CPU_DEAD: | 
| 1715 | case CPU_DEAD_FROZEN: | 1711 | case CPU_DEAD_FROZEN: | 
| 1716 | { | ||
| 1717 | clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); | ||
| 1718 | migrate_hrtimers(scpu); | 1712 | migrate_hrtimers(scpu); | 
| 1719 | break; | 1713 | break; | 
| 1720 | } | ||
| 1721 | #endif | 1714 | #endif | 
| 1722 | 1715 | ||
| 1723 | default: | 1716 | default: | 
| diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/module.h> | 25 | #include <linux/module.h> | 
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> | 
| 27 | 27 | ||
| 28 | #include "tick-internal.h" | 28 | #include "timekeeping.h" | 
| 29 | 29 | ||
| 30 | /* The Jiffies based clocksource is the lowest common | 30 | /* The Jiffies based clocksource is the lowest common | 
| 31 | * denominator clock source which should function on | 31 | * denominator clock source which should function on | 
| @@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { | |||
| 71 | .mask = 0xffffffff, /*32bits*/ | 71 | .mask = 0xffffffff, /*32bits*/ | 
| 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 
| 73 | .shift = JIFFIES_SHIFT, | 73 | .shift = JIFFIES_SHIFT, | 
| 74 | .max_cycles = 10, | ||
| 74 | }; | 75 | }; | 
| 75 | 76 | ||
| 76 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 77 | __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | 
| @@ -94,7 +95,7 @@ EXPORT_SYMBOL(jiffies); | |||
| 94 | 95 | ||
| 95 | static int __init init_jiffies_clocksource(void) | 96 | static int __init init_jiffies_clocksource(void) | 
| 96 | { | 97 | { | 
| 97 | return clocksource_register(&clocksource_jiffies); | 98 | return __clocksource_register(&clocksource_jiffies); | 
| 98 | } | 99 | } | 
| 99 | 100 | ||
| 100 | core_initcall(init_jiffies_clocksource); | 101 | core_initcall(init_jiffies_clocksource); | 
| @@ -130,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) | |||
| 130 | 131 | ||
| 131 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 132 | refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; | 
| 132 | 133 | ||
| 133 | clocksource_register(&refined_jiffies); | 134 | __clocksource_register(&refined_jiffies); | 
| 134 | return 0; | 135 | return 0; | 
| 135 | } | 136 | } | 
| diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 0f60b08a4f07..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -17,7 +17,6 @@ | |||
| 17 | #include <linux/module.h> | 17 | #include <linux/module.h> | 
| 18 | #include <linux/rtc.h> | 18 | #include <linux/rtc.h> | 
| 19 | 19 | ||
| 20 | #include "tick-internal.h" | ||
| 21 | #include "ntp_internal.h" | 20 | #include "ntp_internal.h" | 
| 22 | 21 | ||
| 23 | /* | 22 | /* | 
| @@ -459,6 +458,16 @@ out: | |||
| 459 | return leap; | 458 | return leap; | 
| 460 | } | 459 | } | 
| 461 | 460 | ||
| 461 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | ||
| 462 | int __weak update_persistent_clock64(struct timespec64 now64) | ||
| 463 | { | ||
| 464 | struct timespec now; | ||
| 465 | |||
| 466 | now = timespec64_to_timespec(now64); | ||
| 467 | return update_persistent_clock(now); | ||
| 468 | } | ||
| 469 | #endif | ||
| 470 | |||
| 462 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 471 | #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) | 
| 463 | static void sync_cmos_clock(struct work_struct *work); | 472 | static void sync_cmos_clock(struct work_struct *work); | 
| 464 | 473 | ||
| @@ -494,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work) | |||
| 494 | if (persistent_clock_is_local) | 503 | if (persistent_clock_is_local) | 
| 495 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 504 | adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); | 
| 496 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 505 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 
| 497 | fail = update_persistent_clock(timespec64_to_timespec(adjust)); | 506 | fail = update_persistent_clock64(adjust); | 
| 498 | #endif | 507 | #endif | 
| 508 | |||
| 499 | #ifdef CONFIG_RTC_SYSTOHC | 509 | #ifdef CONFIG_RTC_SYSTOHC | 
| 500 | if (fail == -ENODEV) | 510 | if (fail == -ENODEV) | 
| 501 | fail = rtc_set_ntp_time(adjust); | 511 | fail = rtc_set_ntp_time(adjust); | 
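
The weak update_persistent_clock64() above bridges the new 64-bit path to architectures that still only provide update_persistent_clock(). The pattern, in a self-contained sketch with stand-in types rather than the kernel's timespec/timespec64:

#include <stdio.h>
#include <stdint.h>

struct ts32 { int32_t tv_sec; long tv_nsec; };
struct ts64 { int64_t tv_sec; long tv_nsec; };

static int update_persistent_clock(struct ts32 now)
{
	printf("legacy RTC hook: sec=%ld\n", (long)now.tv_sec);
	return 0;
}

/* Weak default: an arch with a native 64-bit RTC path supplies a strong
 * definition and this narrowing shim is never used. */
__attribute__((weak)) int update_persistent_clock64(struct ts64 now64)
{
	struct ts32 now = { (int32_t)now64.tv_sec, now64.tv_nsec };
	return update_persistent_clock(now);
}

int main(void)
{
	struct ts64 t = { 1430000000LL, 0 };
	return update_persistent_clock64(t);
}
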
| diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | /* | 1 | /* | 
| 2 | * sched_clock.c: support for extending counters to full 64-bit ns counter | 2 | * sched_clock.c: Generic sched_clock() support, to extend low level | 
| 3 | * hardware time counters to full 64-bit ns values. | ||
| 3 | * | 4 | * | 
| 4 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify | 
| 5 | * it under the terms of the GNU General Public License version 2 as | 6 | * it under the terms of the GNU General Public License version 2 as | 
| @@ -18,15 +19,53 @@ | |||
| 18 | #include <linux/seqlock.h> | 19 | #include <linux/seqlock.h> | 
| 19 | #include <linux/bitops.h> | 20 | #include <linux/bitops.h> | 
| 20 | 21 | ||
| 21 | struct clock_data { | 22 | /** | 
| 22 | ktime_t wrap_kt; | 23 | * struct clock_read_data - data required to read from sched_clock() | 
| 24 | * | ||
| 25 | * @epoch_ns: sched_clock() value at last update | ||
| 26 | * @epoch_cyc: Clock cycle value at last update. | ||
| 27 | * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit | ||
| 28 | * clocks. | ||
| 29 | * @read_sched_clock: Current clock source (or dummy source when suspended). | ||
| 30 | * @mult: Multiplier for scaled math conversion. | ||
| 31 | * @shift: Shift value for scaled math conversion. | ||
| 32 | * | ||
| 33 | * Care must be taken when updating this structure; it is read by | ||
| 34 | * some very hot code paths. It occupies <=40 bytes and, when combined | ||
| 35 | * with the seqcount used to synchronize access, comfortably fits into | ||
| 36 | * a 64 byte cache line. | ||
| 37 | */ | ||
| 38 | struct clock_read_data { | ||
| 23 | u64 epoch_ns; | 39 | u64 epoch_ns; | 
| 24 | u64 epoch_cyc; | 40 | u64 epoch_cyc; | 
| 25 | seqcount_t seq; | 41 | u64 sched_clock_mask; | 
| 26 | unsigned long rate; | 42 | u64 (*read_sched_clock)(void); | 
| 27 | u32 mult; | 43 | u32 mult; | 
| 28 | u32 shift; | 44 | u32 shift; | 
| 29 | bool suspended; | 45 | }; | 
| 46 | |||
| 47 | /** | ||
| 48 | * struct clock_data - all data needed for sched_clock() (including | ||
| 49 | * registration of a new clock source) | ||
| 50 | * | ||
| 51 | * @seq: Sequence counter for protecting updates. The lowest | ||
| 52 | * bit is the index for @read_data. | ||
| 53 | * @read_data: Data required to read from sched_clock. | ||
| 54 | * @wrap_kt: Duration for which clock can run before wrapping. | ||
| 55 | * @rate: Tick rate of the registered clock. | ||
| 56 | * @actual_read_sched_clock: Registered hardware level clock read function. | ||
| 57 | * | ||
| 58 | * The ordering of this structure has been chosen to optimize cache | ||
| 59 | * performance. In particular 'seq' and 'read_data[0]' (combined) should fit | ||
| 60 | * into a single 64-byte cache line. | ||
| 61 | */ | ||
| 62 | struct clock_data { | ||
| 63 | seqcount_t seq; | ||
| 64 | struct clock_read_data read_data[2]; | ||
| 65 | ktime_t wrap_kt; | ||
| 66 | unsigned long rate; | ||
| 67 | |||
| 68 | u64 (*actual_read_sched_clock)(void); | ||
| 30 | }; | 69 | }; | 
| 31 | 70 | ||
| 32 | static struct hrtimer sched_clock_timer; | 71 | static struct hrtimer sched_clock_timer; | 
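The comments above budget clock_read_data at 40 bytes or less so that, together with the sequence counter, the hot-path data stays inside one 64-byte cache line. A compile-time check of that budget on a simplified userspace mirror of the layout (kernel types replaced by fixed-width equivalents, seqcount_t approximated by an unsigned int) could look like:

#include <stdint.h>

struct clock_read_data_model {
    uint64_t epoch_ns;
    uint64_t epoch_cyc;
    uint64_t sched_clock_mask;
    uint64_t (*read_sched_clock)(void);
    uint32_t mult;
    uint32_t shift;
};

/* 3 x u64 + one pointer + 2 x u32 = 40 bytes on LP64 targets */
_Static_assert(sizeof(struct clock_read_data_model) <= 40,
               "read data exceeds the documented 40-byte budget");
_Static_assert(sizeof(unsigned int) + sizeof(struct clock_read_data_model) <= 64,
               "seq plus read_data[0] no longer fit one cache line");

int main(void) { return 0; }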
| @@ -34,12 +73,6 @@ static int irqtime = -1; | |||
| 34 | 73 | ||
| 35 | core_param(irqtime, irqtime, int, 0400); | 74 | core_param(irqtime, irqtime, int, 0400); | 
| 36 | 75 | ||
| 37 | static struct clock_data cd = { | ||
| 38 | .mult = NSEC_PER_SEC / HZ, | ||
| 39 | }; | ||
| 40 | |||
| 41 | static u64 __read_mostly sched_clock_mask; | ||
| 42 | |||
| 43 | static u64 notrace jiffy_sched_clock_read(void) | 76 | static u64 notrace jiffy_sched_clock_read(void) | 
| 44 | { | 77 | { | 
| 45 | /* | 78 | /* | 
| @@ -49,7 +82,11 @@ static u64 notrace jiffy_sched_clock_read(void) | |||
| 49 | return (u64)(jiffies - INITIAL_JIFFIES); | 82 | return (u64)(jiffies - INITIAL_JIFFIES); | 
| 50 | } | 83 | } | 
| 51 | 84 | ||
| 52 | static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; | 85 | static struct clock_data cd ____cacheline_aligned = { | 
| 86 | .read_data[0] = { .mult = NSEC_PER_SEC / HZ, | ||
| 87 | .read_sched_clock = jiffy_sched_clock_read, }, | ||
| 88 | .actual_read_sched_clock = jiffy_sched_clock_read, | ||
| 89 | }; | ||
| 53 | 90 | ||
| 54 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 91 | static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | 
| 55 | { | 92 | { | 
| @@ -58,111 +95,136 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) | |||
| 58 | 95 | ||
| 59 | unsigned long long notrace sched_clock(void) | 96 | unsigned long long notrace sched_clock(void) | 
| 60 | { | 97 | { | 
| 61 | u64 epoch_ns; | 98 | u64 cyc, res; | 
| 62 | u64 epoch_cyc; | ||
| 63 | u64 cyc; | ||
| 64 | unsigned long seq; | 99 | unsigned long seq; | 
| 65 | 100 | struct clock_read_data *rd; | |
| 66 | if (cd.suspended) | ||
| 67 | return cd.epoch_ns; | ||
| 68 | 101 | ||
| 69 | do { | 102 | do { | 
| 70 | seq = raw_read_seqcount_begin(&cd.seq); | 103 | seq = raw_read_seqcount(&cd.seq); | 
| 71 | epoch_cyc = cd.epoch_cyc; | 104 | rd = cd.read_data + (seq & 1); | 
| 72 | epoch_ns = cd.epoch_ns; | 105 | |
| 106 | cyc = (rd->read_sched_clock() - rd->epoch_cyc) & | ||
| 107 | rd->sched_clock_mask; | ||
| 108 | res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); | ||
| 73 | } while (read_seqcount_retry(&cd.seq, seq)); | 109 | } while (read_seqcount_retry(&cd.seq, seq)); | 
| 74 | 110 | ||
| 75 | cyc = read_sched_clock(); | 111 | return res; | 
| 76 | cyc = (cyc - epoch_cyc) & sched_clock_mask; | 112 | } | 
| 77 | return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); | 113 | |
| 114 | /* | ||
| 115 | * Updating the data required to read the clock. | ||
| 116 | * | ||
| 117 | * sched_clock() will never observe mis-matched data even if called from | ||
| 118 | * an NMI. We do this by maintaining an odd/even copy of the data and | ||
| 119 | * steering sched_clock() to one or the other using a sequence counter. | ||
| 120 | * In order to preserve the data cache profile of sched_clock() as much | ||
| 121 | * as possible the system reverts back to the even copy when the update | ||
| 122 | * completes; the odd copy is used *only* during an update. | ||
| 123 | */ | ||
| 124 | static void update_clock_read_data(struct clock_read_data *rd) | ||
| 125 | { | ||
| 126 | /* update the backup (odd) copy with the new data */ | ||
| 127 | cd.read_data[1] = *rd; | ||
| 128 | |||
| 129 | /* steer readers towards the odd copy */ | ||
| 130 | raw_write_seqcount_latch(&cd.seq); | ||
| 131 | |||
| 132 | /* now it's safe for us to update the normal (even) copy */ | ||
| 133 | cd.read_data[0] = *rd; | ||
| 134 | |||
| 135 | /* switch readers back to the even copy */ | ||
| 136 | raw_write_seqcount_latch(&cd.seq); | ||
| 78 | } | 137 | } | 
| 79 | 138 | ||
| 80 | /* | 139 | /* | 
| 81 | * Atomically update the sched_clock epoch. | 140 | * Atomically update the sched_clock() epoch. | 
| 82 | */ | 141 | */ | 
| 83 | static void notrace update_sched_clock(void) | 142 | static void update_sched_clock(void) | 
| 84 | { | 143 | { | 
| 85 | unsigned long flags; | ||
| 86 | u64 cyc; | 144 | u64 cyc; | 
| 87 | u64 ns; | 145 | u64 ns; | 
| 146 | struct clock_read_data rd; | ||
| 147 | |||
| 148 | rd = cd.read_data[0]; | ||
| 149 | |||
| 150 | cyc = cd.actual_read_sched_clock(); | ||
| 151 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | ||
| 152 | |||
| 153 | rd.epoch_ns = ns; | ||
| 154 | rd.epoch_cyc = cyc; | ||
| 88 | 155 | ||
| 89 | cyc = read_sched_clock(); | 156 | update_clock_read_data(&rd); | 
| 90 | ns = cd.epoch_ns + | ||
| 91 | cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | ||
| 92 | cd.mult, cd.shift); | ||
| 93 | |||
| 94 | raw_local_irq_save(flags); | ||
| 95 | raw_write_seqcount_begin(&cd.seq); | ||
| 96 | cd.epoch_ns = ns; | ||
| 97 | cd.epoch_cyc = cyc; | ||
| 98 | raw_write_seqcount_end(&cd.seq); | ||
| 99 | raw_local_irq_restore(flags); | ||
| 100 | } | 157 | } | 
| 101 | 158 | ||
| 102 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 159 | static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) | 
| 103 | { | 160 | { | 
| 104 | update_sched_clock(); | 161 | update_sched_clock(); | 
| 105 | hrtimer_forward_now(hrt, cd.wrap_kt); | 162 | hrtimer_forward_now(hrt, cd.wrap_kt); | 
| 163 | |||
| 106 | return HRTIMER_RESTART; | 164 | return HRTIMER_RESTART; | 
| 107 | } | 165 | } | 
| 108 | 166 | ||
| 109 | void __init sched_clock_register(u64 (*read)(void), int bits, | 167 | void __init | 
| 110 | unsigned long rate) | 168 | sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) | 
| 111 | { | 169 | { | 
| 112 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 170 | u64 res, wrap, new_mask, new_epoch, cyc, ns; | 
| 113 | u32 new_mult, new_shift; | 171 | u32 new_mult, new_shift; | 
| 114 | ktime_t new_wrap_kt; | ||
| 115 | unsigned long r; | 172 | unsigned long r; | 
| 116 | char r_unit; | 173 | char r_unit; | 
| 174 | struct clock_read_data rd; | ||
| 117 | 175 | ||
| 118 | if (cd.rate > rate) | 176 | if (cd.rate > rate) | 
| 119 | return; | 177 | return; | 
| 120 | 178 | ||
| 121 | WARN_ON(!irqs_disabled()); | 179 | WARN_ON(!irqs_disabled()); | 
| 122 | 180 | ||
| 123 | /* calculate the mult/shift to convert counter ticks to ns. */ | 181 | /* Calculate the mult/shift to convert counter ticks to ns. */ | 
| 124 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 182 | clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); | 
| 125 | 183 | ||
| 126 | new_mask = CLOCKSOURCE_MASK(bits); | 184 | new_mask = CLOCKSOURCE_MASK(bits); | 
| 185 | cd.rate = rate; | ||
| 186 | |||
| 187 | /* Calculate how many nanosecs until we risk wrapping */ | ||
| 188 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); | ||
| 189 | cd.wrap_kt = ns_to_ktime(wrap); | ||
| 127 | 190 | ||
| 128 | /* calculate how many ns until we wrap */ | 191 | rd = cd.read_data[0]; | 
| 129 | wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); | ||
| 130 | new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); | ||
| 131 | 192 | ||
| 132 | /* update epoch for new counter and update epoch_ns from old counter*/ | 193 | /* Update epoch for new counter and update 'epoch_ns' from old counter*/ | 
| 133 | new_epoch = read(); | 194 | new_epoch = read(); | 
| 134 | cyc = read_sched_clock(); | 195 | cyc = cd.actual_read_sched_clock(); | 
| 135 | ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, | 196 | ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); | 
| 136 | cd.mult, cd.shift); | 197 | cd.actual_read_sched_clock = read; | 
| 137 | 198 | ||
| 138 | raw_write_seqcount_begin(&cd.seq); | 199 | rd.read_sched_clock = read; | 
| 139 | read_sched_clock = read; | 200 | rd.sched_clock_mask = new_mask; | 
| 140 | sched_clock_mask = new_mask; | 201 | rd.mult = new_mult; | 
| 141 | cd.rate = rate; | 202 | rd.shift = new_shift; | 
| 142 | cd.wrap_kt = new_wrap_kt; | 203 | rd.epoch_cyc = new_epoch; | 
| 143 | cd.mult = new_mult; | 204 | rd.epoch_ns = ns; | 
| 144 | cd.shift = new_shift; | 205 | |
| 145 | cd.epoch_cyc = new_epoch; | 206 | update_clock_read_data(&rd); | 
| 146 | cd.epoch_ns = ns; | ||
| 147 | raw_write_seqcount_end(&cd.seq); | ||
| 148 | 207 | ||
| 149 | r = rate; | 208 | r = rate; | 
| 150 | if (r >= 4000000) { | 209 | if (r >= 4000000) { | 
| 151 | r /= 1000000; | 210 | r /= 1000000; | 
| 152 | r_unit = 'M'; | 211 | r_unit = 'M'; | 
| 153 | } else if (r >= 1000) { | 212 | } else { | 
| 154 | r /= 1000; | 213 | if (r >= 1000) { | 
| 155 | r_unit = 'k'; | 214 | r /= 1000; | 
| 156 | } else | 215 | r_unit = 'k'; | 
| 157 | r_unit = ' '; | 216 | } else { | 
| 158 | 217 | r_unit = ' '; | |
| 159 | /* calculate the ns resolution of this counter */ | 218 | } | 
| 219 | } | ||
| 220 | |||
| 221 | /* Calculate the ns resolution of this counter */ | ||
| 160 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 222 | res = cyc_to_ns(1ULL, new_mult, new_shift); | 
| 161 | 223 | ||
| 162 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 224 | pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", | 
| 163 | bits, r, r_unit, res, wrap); | 225 | bits, r, r_unit, res, wrap); | 
| 164 | 226 | ||
| 165 | /* Enable IRQ time accounting if we have a fast enough sched_clock */ | 227 | /* Enable IRQ time accounting if we have a fast enough sched_clock() */ | 
| 166 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 228 | if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) | 
| 167 | enable_sched_clock_irqtime(); | 229 | enable_sched_clock_irqtime(); | 
| 168 | 230 | ||
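update_clock_read_data() above is the writer side of a seqcount "latch": readers index read_data[seq & 1], the writer refreshes the copy readers are not currently steered to, bumps the counter, then repeats for the other copy. The single-threaded sketch below models only that index-steering logic with a plain counter; it deliberately leaves out the memory barriers supplied by the kernel's seqcount primitives, which are what make the scheme safe against concurrent (even NMI) readers.

#include <stdint.h>
#include <stdio.h>

struct read_data { uint64_t epoch_ns, epoch_cyc; };

static unsigned int seq;              /* lowest bit selects the copy */
static struct read_data data[2];

static void writer_update(struct read_data new_rd)
{
    data[1] = new_rd;   /* refresh the odd copy (readers are on even) */
    seq++;              /* steer readers to the odd copy              */
    data[0] = new_rd;   /* refresh the even copy                      */
    seq++;              /* steer readers back to the even copy        */
}

static struct read_data reader_snapshot(void)
{
    /* a real reader re-checks seq afterwards and retries on change */
    return data[seq & 1];
}

int main(void)
{
    struct read_data rd = { .epoch_ns = 123456789, .epoch_cyc = 42 };

    writer_update(rd);
    rd = reader_snapshot();
    printf("epoch_ns=%llu epoch_cyc=%llu (reading copy %u)\n",
           (unsigned long long)rd.epoch_ns,
           (unsigned long long)rd.epoch_cyc, seq & 1);
    return 0;
}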
| @@ -172,10 +234,10 @@ void __init sched_clock_register(u64 (*read)(void), int bits, | |||
| 172 | void __init sched_clock_postinit(void) | 234 | void __init sched_clock_postinit(void) | 
| 173 | { | 235 | { | 
| 174 | /* | 236 | /* | 
| 175 | * If no sched_clock function has been provided at that point, | 237 | * If no sched_clock() function has been provided at that point, | 
| 176 | * make it the final one. | 238 | * make it the final one. | 
| 177 | */ | 239 | */ | 
| 178 | if (read_sched_clock == jiffy_sched_clock_read) | 240 | if (cd.actual_read_sched_clock == jiffy_sched_clock_read) | 
| 179 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 241 | sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); | 
| 180 | 242 | ||
| 181 | update_sched_clock(); | 243 | update_sched_clock(); | 
| @@ -189,29 +251,53 @@ void __init sched_clock_postinit(void) | |||
| 189 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 251 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 
| 190 | } | 252 | } | 
| 191 | 253 | ||
| 254 | /* | ||
| 255 | * Clock read function for use when the clock is suspended. | ||
| 256 | * | ||
| 257 | * This function makes it appear to sched_clock() as if the clock | ||
| 258 | * stopped counting at its last update. | ||
| 259 | * | ||
| 260 | * This function must only be called from the critical | ||
| 261 | * section in sched_clock(). It relies on the read_seqcount_retry() | ||
| 262 | * at the end of the critical section to be sure we observe the | ||
| 263 | * correct copy of 'epoch_cyc'. | ||
| 264 | */ | ||
| 265 | static u64 notrace suspended_sched_clock_read(void) | ||
| 266 | { | ||
| 267 | unsigned long seq = raw_read_seqcount(&cd.seq); | ||
| 268 | |||
| 269 | return cd.read_data[seq & 1].epoch_cyc; | ||
| 270 | } | ||
| 271 | |||
| 192 | static int sched_clock_suspend(void) | 272 | static int sched_clock_suspend(void) | 
| 193 | { | 273 | { | 
| 274 | struct clock_read_data *rd = &cd.read_data[0]; | ||
| 275 | |||
| 194 | update_sched_clock(); | 276 | update_sched_clock(); | 
| 195 | hrtimer_cancel(&sched_clock_timer); | 277 | hrtimer_cancel(&sched_clock_timer); | 
| 196 | cd.suspended = true; | 278 | rd->read_sched_clock = suspended_sched_clock_read; | 
| 279 | |||
| 197 | return 0; | 280 | return 0; | 
| 198 | } | 281 | } | 
| 199 | 282 | ||
| 200 | static void sched_clock_resume(void) | 283 | static void sched_clock_resume(void) | 
| 201 | { | 284 | { | 
| 202 | cd.epoch_cyc = read_sched_clock(); | 285 | struct clock_read_data *rd = &cd.read_data[0]; | 
| 286 | |||
| 287 | rd->epoch_cyc = cd.actual_read_sched_clock(); | ||
| 203 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 288 | hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); | 
| 204 | cd.suspended = false; | 289 | rd->read_sched_clock = cd.actual_read_sched_clock; | 
| 205 | } | 290 | } | 
| 206 | 291 | ||
| 207 | static struct syscore_ops sched_clock_ops = { | 292 | static struct syscore_ops sched_clock_ops = { | 
| 208 | .suspend = sched_clock_suspend, | 293 | .suspend = sched_clock_suspend, | 
| 209 | .resume = sched_clock_resume, | 294 | .resume = sched_clock_resume, | 
| 210 | }; | 295 | }; | 
| 211 | 296 | ||
| 212 | static int __init sched_clock_syscore_init(void) | 297 | static int __init sched_clock_syscore_init(void) | 
| 213 | { | 298 | { | 
| 214 | register_syscore_ops(&sched_clock_ops); | 299 | register_syscore_ops(&sched_clock_ops); | 
| 300 | |||
| 215 | return 0; | 301 | return 0; | 
| 216 | } | 302 | } | 
| 217 | device_initcall(sched_clock_syscore_init); | 303 | device_initcall(sched_clock_syscore_init); | 
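sched_clock_register() above converts counter ticks to nanoseconds with the usual mult/shift fixed-point scheme (ns = (cyc * mult) >> shift) and derives the wrap interval from how far the multiplication can go before overflowing 64 bits. The standalone sketch below reproduces that arithmetic for an assumed 24 MHz, 56-bit counter; the mult value is a simple rounded quotient rather than the search clocks_calc_mult_shift() performs, and the extra margin handling of clocks_calc_max_nsecs() is omitted.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
    return (cyc * mult) >> shift;
}

int main(void)
{
    uint64_t rate = 24000000;                 /* assumed 24 MHz counter */
    uint32_t shift = 24;                      /* assumed scale factor   */
    uint32_t mult = (uint32_t)(((NSEC_PER_SEC << shift) + rate / 2) / rate);
    uint64_t mask = (1ULL << 56) - 1;         /* 56-bit counter         */

    /* largest cycle delta before (cyc * mult) overflows 64 bits */
    uint64_t max_cycles = UINT64_MAX / mult;
    if (max_cycles > mask)
        max_cycles = mask;

    printf("resolution: %llu ns/tick\n",
           (unsigned long long)cyc_to_ns(1, mult, shift));
    printf("wraps after about %llu seconds\n",
           (unsigned long long)(cyc_to_ns(max_cycles, mult, shift) / NSEC_PER_SEC));
    return 0;
}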
| diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c | |||
| @@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, | |||
| 49 | */ | 49 | */ | 
| 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 50 | static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | 
| 51 | { | 51 | { | 
| 52 | int bc_moved; | ||
| 52 | /* | 53 | /* | 
| 53 | * We try to cancel the timer first. If the callback is on | 54 | * We try to cancel the timer first. If the callback is on | 
| 54 | * flight on some other cpu then we let it handle it. If we | 55 | * flight on some other cpu then we let it handle it. If we | 
| @@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) | |||
| 60 | * restart the timer because we are in the callback, but we | 61 | * restart the timer because we are in the callback, but we | 
| 61 | * can set the expiry time and let the callback return | 62 | * can set the expiry time and let the callback return | 
| 62 | * HRTIMER_RESTART. | 63 | * HRTIMER_RESTART. | 
| 64 | * | ||
| 65 | * Since we are in the idle loop at this point and because | ||
| 66 | * hrtimer_{start/cancel} functions call into tracing, | ||
| 67 | * calls to these functions must be bound within RCU_NONIDLE. | ||
| 63 | */ | 68 | */ | 
| 64 | if (hrtimer_try_to_cancel(&bctimer) >= 0) { | 69 | RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? | 
| 65 | hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); | 70 | !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : | 
| 71 | 0); | ||
| 72 | if (bc_moved) { | ||
| 66 | /* Bind the "device" to the cpu */ | 73 | /* Bind the "device" to the cpu */ | 
| 67 | bc->bound_on = smp_processor_id(); | 74 | bc->bound_on = smp_processor_id(); | 
| 68 | } else if (bc->bound_on == smp_processor_id()) { | 75 | } else if (bc->bound_on == smp_processor_id()) { | 
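In bc_set_next() above, the hrtimer_try_to_cancel()/hrtimer_start() calls and the decision derived from them are folded into one expression so the whole thing can sit inside RCU_NONIDLE() while the outcome is still captured in bc_moved. A toy userspace illustration of that shape, with a made-up WITH_SECTION() wrapper standing in for RCU_NONIDLE() and trivial stub functions:

#include <stdio.h>

static int section_depth;

#define WITH_SECTION(expr)                          \
    do {                                            \
        section_depth++;   /* "leave idle" */       \
        expr;                                       \
        section_depth--;   /* "re-enter idle" */    \
    } while (0)

static int try_to_cancel(void) { return 1; }   /* pretend cancel succeeded */
static int start_timer(void)   { return 0; }   /* 0 == started OK          */

int main(void)
{
    int moved;

    WITH_SECTION(moved = (try_to_cancel() >= 0) ? !start_timer() : 0);
    printf("timer moved to this CPU: %d (depth back to %d)\n",
           moved, section_depth);
    return 0;
}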
| diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -33,12 +33,14 @@ static cpumask_var_t tick_broadcast_mask; | |||
| 33 | static cpumask_var_t tick_broadcast_on; | 33 | static cpumask_var_t tick_broadcast_on; | 
| 34 | static cpumask_var_t tmpmask; | 34 | static cpumask_var_t tmpmask; | 
| 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 35 | static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); | 
| 36 | static int tick_broadcast_force; | 36 | static int tick_broadcast_forced; | 
| 37 | 37 | ||
| 38 | #ifdef CONFIG_TICK_ONESHOT | 38 | #ifdef CONFIG_TICK_ONESHOT | 
| 39 | static void tick_broadcast_clear_oneshot(int cpu); | 39 | static void tick_broadcast_clear_oneshot(int cpu); | 
| 40 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
| 40 | #else | 41 | #else | 
| 41 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 42 | static inline void tick_broadcast_clear_oneshot(int cpu) { } | 
| 43 | static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } | ||
| 42 | #endif | 44 | #endif | 
| 43 | 45 | ||
| 44 | /* | 46 | /* | 
| @@ -303,7 +305,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) | |||
| 303 | /* | 305 | /* | 
| 304 | * The device is in periodic mode. No reprogramming necessary: | 306 | * The device is in periodic mode. No reprogramming necessary: | 
| 305 | */ | 307 | */ | 
| 306 | if (dev->mode == CLOCK_EVT_MODE_PERIODIC) | 308 | if (dev->state == CLOCK_EVT_STATE_PERIODIC) | 
| 307 | goto unlock; | 309 | goto unlock; | 
| 308 | 310 | ||
| 309 | /* | 311 | /* | 
| @@ -324,49 +326,54 @@ unlock: | |||
| 324 | raw_spin_unlock(&tick_broadcast_lock); | 326 | raw_spin_unlock(&tick_broadcast_lock); | 
| 325 | } | 327 | } | 
| 326 | 328 | ||
| 327 | /* | 329 | /** | 
| 328 | * Powerstate information: The system enters/leaves a state, where | 330 | * tick_broadcast_control - Enable/disable or force broadcast mode | 
| 329 | * affected devices might stop | 331 | * @mode: The selected broadcast mode | 
| 332 | * | ||
| 333 | * Called when the system enters a state where affected tick devices | ||
| 334 | * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. | ||
| 335 | * | ||
| 336 | * Called with interrupts disabled, so clockevents_lock is not | ||
| 337 | * required here because the local clock event device cannot go away | ||
| 338 | * under us. | ||
| 330 | */ | 339 | */ | 
| 331 | static void tick_do_broadcast_on_off(unsigned long *reason) | 340 | void tick_broadcast_control(enum tick_broadcast_mode mode) | 
| 332 | { | 341 | { | 
| 333 | struct clock_event_device *bc, *dev; | 342 | struct clock_event_device *bc, *dev; | 
| 334 | struct tick_device *td; | 343 | struct tick_device *td; | 
| 335 | unsigned long flags; | ||
| 336 | int cpu, bc_stopped; | 344 | int cpu, bc_stopped; | 
| 337 | 345 | ||
| 338 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 346 | td = this_cpu_ptr(&tick_cpu_device); | 
| 339 | |||
| 340 | cpu = smp_processor_id(); | ||
| 341 | td = &per_cpu(tick_cpu_device, cpu); | ||
| 342 | dev = td->evtdev; | 347 | dev = td->evtdev; | 
| 343 | bc = tick_broadcast_device.evtdev; | ||
| 344 | 348 | ||
| 345 | /* | 349 | /* | 
| 346 | * Is the device not affected by the powerstate ? | 350 | * Is the device not affected by the powerstate ? | 
| 347 | */ | 351 | */ | 
| 348 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 352 | if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 
| 349 | goto out; | 353 | return; | 
| 350 | 354 | ||
| 351 | if (!tick_device_is_functional(dev)) | 355 | if (!tick_device_is_functional(dev)) | 
| 352 | goto out; | 356 | return; | 
| 353 | 357 | ||
| 358 | raw_spin_lock(&tick_broadcast_lock); | ||
| 359 | cpu = smp_processor_id(); | ||
| 360 | bc = tick_broadcast_device.evtdev; | ||
| 354 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 361 | bc_stopped = cpumask_empty(tick_broadcast_mask); | 
| 355 | 362 | ||
| 356 | switch (*reason) { | 363 | switch (mode) { | 
| 357 | case CLOCK_EVT_NOTIFY_BROADCAST_ON: | 364 | case TICK_BROADCAST_FORCE: | 
| 358 | case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: | 365 | tick_broadcast_forced = 1; | 
| 366 | case TICK_BROADCAST_ON: | ||
| 359 | cpumask_set_cpu(cpu, tick_broadcast_on); | 367 | cpumask_set_cpu(cpu, tick_broadcast_on); | 
| 360 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 368 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { | 
| 361 | if (tick_broadcast_device.mode == | 369 | if (tick_broadcast_device.mode == | 
| 362 | TICKDEV_MODE_PERIODIC) | 370 | TICKDEV_MODE_PERIODIC) | 
| 363 | clockevents_shutdown(dev); | 371 | clockevents_shutdown(dev); | 
| 364 | } | 372 | } | 
| 365 | if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) | ||
| 366 | tick_broadcast_force = 1; | ||
| 367 | break; | 373 | break; | 
| 368 | case CLOCK_EVT_NOTIFY_BROADCAST_OFF: | 374 | |
| 369 | if (tick_broadcast_force) | 375 | case TICK_BROADCAST_OFF: | 
| 376 | if (tick_broadcast_forced) | ||
| 370 | break; | 377 | break; | 
| 371 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 378 | cpumask_clear_cpu(cpu, tick_broadcast_on); | 
| 372 | if (!tick_device_is_functional(dev)) | 379 | if (!tick_device_is_functional(dev)) | 
| @@ -388,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) | |||
| 388 | else | 395 | else | 
| 389 | tick_broadcast_setup_oneshot(bc); | 396 | tick_broadcast_setup_oneshot(bc); | 
| 390 | } | 397 | } | 
| 391 | out: | 398 | raw_spin_unlock(&tick_broadcast_lock); | 
| 392 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
| 393 | } | ||
| 394 | |||
| 395 | /* | ||
| 396 | * Powerstate information: The system enters/leaves a state, where | ||
| 397 | * affected devices might stop. | ||
| 398 | */ | ||
| 399 | void tick_broadcast_on_off(unsigned long reason, int *oncpu) | ||
| 400 | { | ||
| 401 | if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) | ||
| 402 | printk(KERN_ERR "tick-broadcast: ignoring broadcast for " | ||
| 403 | "offline CPU #%d\n", *oncpu); | ||
| 404 | else | ||
| 405 | tick_do_broadcast_on_off(&reason); | ||
| 406 | } | 399 | } | 
| 400 | EXPORT_SYMBOL_GPL(tick_broadcast_control); | ||
| 407 | 401 | ||
| 408 | /* | 402 | /* | 
| 409 | * Set the periodic handler depending on broadcast on/off | 403 | * Set the periodic handler depending on broadcast on/off | 
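tick_broadcast_control() above replaces the old reason codes with an enum, and TICK_BROADCAST_FORCE deliberately falls through into TICK_BROADCAST_ON after latching the forced flag, which is also why TICK_BROADCAST_OFF refuses to undo it. A compact model of that control flow, with a 64-bit word standing in for the cpumask and the clockevents calls reduced to prints:

#include <stdint.h>
#include <stdio.h>

enum tick_broadcast_mode { TICK_BROADCAST_OFF, TICK_BROADCAST_ON, TICK_BROADCAST_FORCE };

static uint64_t broadcast_mask;        /* stand-in for tick_broadcast_mask */
static int broadcast_forced;

static void broadcast_control(enum tick_broadcast_mode mode, int cpu)
{
    switch (mode) {
    case TICK_BROADCAST_FORCE:
        broadcast_forced = 1;
        /* fall through: force implies on */
    case TICK_BROADCAST_ON:
        if (!(broadcast_mask & (1ULL << cpu))) {
            broadcast_mask |= 1ULL << cpu;
            printf("cpu%d: local tick shut down, broadcast takes over\n", cpu);
        }
        break;
    case TICK_BROADCAST_OFF:
        if (broadcast_forced)
            break;                      /* FORCE cannot be undone */
        broadcast_mask &= ~(1ULL << cpu);
        printf("cpu%d: local tick restored\n", cpu);
        break;
    }
}

int main(void)
{
    broadcast_control(TICK_BROADCAST_FORCE, 0);
    broadcast_control(TICK_BROADCAST_OFF, 0);   /* ignored: forced */
    printf("mask=%#llx forced=%d\n",
           (unsigned long long)broadcast_mask, broadcast_forced);
    return 0;
}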
| @@ -416,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | |||
| 416 | dev->event_handler = tick_handle_periodic_broadcast; | 410 | dev->event_handler = tick_handle_periodic_broadcast; | 
| 417 | } | 411 | } | 
| 418 | 412 | ||
| 413 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 419 | /* | 414 | /* | 
| 420 | * Remove a CPU from broadcasting | 415 | * Remove a CPU from broadcasting | 
| 421 | */ | 416 | */ | 
| 422 | void tick_shutdown_broadcast(unsigned int *cpup) | 417 | void tick_shutdown_broadcast(unsigned int cpu) | 
| 423 | { | 418 | { | 
| 424 | struct clock_event_device *bc; | 419 | struct clock_event_device *bc; | 
| 425 | unsigned long flags; | 420 | unsigned long flags; | 
| 426 | unsigned int cpu = *cpup; | ||
| 427 | 421 | ||
| 428 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 422 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 
| 429 | 423 | ||
| @@ -438,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) | |||
| 438 | 432 | ||
| 439 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 433 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 440 | } | 434 | } | 
| 435 | #endif | ||
| 441 | 436 | ||
| 442 | void tick_suspend_broadcast(void) | 437 | void tick_suspend_broadcast(void) | 
| 443 | { | 438 | { | 
| @@ -453,38 +448,48 @@ void tick_suspend_broadcast(void) | |||
| 453 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 448 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 454 | } | 449 | } | 
| 455 | 450 | ||
| 456 | int tick_resume_broadcast(void) | 451 | /* | 
| 452 | * This is called from tick_resume_local() on a resuming CPU. That's | ||
| 453 | * called from the core resume function, tick_unfreeze() and the magic XEN | ||
| 454 | * resume hackery. | ||
| 455 | * | ||
| 456 | * In none of these cases the broadcast device mode can change and the | ||
| 457 | * bit of the resuming CPU in the broadcast mask is safe as well. | ||
| 458 | */ | ||
| 459 | bool tick_resume_check_broadcast(void) | ||
| 460 | { | ||
| 461 | if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) | ||
| 462 | return false; | ||
| 463 | else | ||
| 464 | return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); | ||
| 465 | } | ||
| 466 | |||
| 467 | void tick_resume_broadcast(void) | ||
| 457 | { | 468 | { | 
| 458 | struct clock_event_device *bc; | 469 | struct clock_event_device *bc; | 
| 459 | unsigned long flags; | 470 | unsigned long flags; | 
| 460 | int broadcast = 0; | ||
| 461 | 471 | ||
| 462 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 472 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 
| 463 | 473 | ||
| 464 | bc = tick_broadcast_device.evtdev; | 474 | bc = tick_broadcast_device.evtdev; | 
| 465 | 475 | ||
| 466 | if (bc) { | 476 | if (bc) { | 
| 467 | clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); | 477 | clockevents_tick_resume(bc); | 
| 468 | 478 | ||
| 469 | switch (tick_broadcast_device.mode) { | 479 | switch (tick_broadcast_device.mode) { | 
| 470 | case TICKDEV_MODE_PERIODIC: | 480 | case TICKDEV_MODE_PERIODIC: | 
| 471 | if (!cpumask_empty(tick_broadcast_mask)) | 481 | if (!cpumask_empty(tick_broadcast_mask)) | 
| 472 | tick_broadcast_start_periodic(bc); | 482 | tick_broadcast_start_periodic(bc); | 
| 473 | broadcast = cpumask_test_cpu(smp_processor_id(), | ||
| 474 | tick_broadcast_mask); | ||
| 475 | break; | 483 | break; | 
| 476 | case TICKDEV_MODE_ONESHOT: | 484 | case TICKDEV_MODE_ONESHOT: | 
| 477 | if (!cpumask_empty(tick_broadcast_mask)) | 485 | if (!cpumask_empty(tick_broadcast_mask)) | 
| 478 | broadcast = tick_resume_broadcast_oneshot(bc); | 486 | tick_resume_broadcast_oneshot(bc); | 
| 479 | break; | 487 | break; | 
| 480 | } | 488 | } | 
| 481 | } | 489 | } | 
| 482 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 490 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 483 | |||
| 484 | return broadcast; | ||
| 485 | } | 491 | } | 
| 486 | 492 | ||
| 487 | |||
| 488 | #ifdef CONFIG_TICK_ONESHOT | 493 | #ifdef CONFIG_TICK_ONESHOT | 
| 489 | 494 | ||
| 490 | static cpumask_var_t tick_broadcast_oneshot_mask; | 495 | static cpumask_var_t tick_broadcast_oneshot_mask; | 
| @@ -532,8 +537,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
| 532 | { | 537 | { | 
| 533 | int ret; | 538 | int ret; | 
| 534 | 539 | ||
| 535 | if (bc->mode != CLOCK_EVT_MODE_ONESHOT) | 540 | if (bc->state != CLOCK_EVT_STATE_ONESHOT) | 
| 536 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 541 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); | 
| 537 | 542 | ||
| 538 | ret = clockevents_program_event(bc, expires, force); | 543 | ret = clockevents_program_event(bc, expires, force); | 
| 539 | if (!ret) | 544 | if (!ret) | 
| @@ -541,10 +546,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, | |||
| 541 | return ret; | 546 | return ret; | 
| 542 | } | 547 | } | 
| 543 | 548 | ||
| 544 | int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 549 | static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) | 
| 545 | { | 550 | { | 
| 546 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 551 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); | 
| 547 | return 0; | ||
| 548 | } | 552 | } | 
| 549 | 553 | ||
| 550 | /* | 554 | /* | 
| @@ -562,8 +566,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) | |||
| 562 | * switched over, leave the device alone. | 566 | * switched over, leave the device alone. | 
| 563 | */ | 567 | */ | 
| 564 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 568 | if (td->mode == TICKDEV_MODE_ONESHOT) { | 
| 565 | clockevents_set_mode(td->evtdev, | 569 | clockevents_set_state(td->evtdev, | 
| 566 | CLOCK_EVT_MODE_ONESHOT); | 570 | CLOCK_EVT_STATE_ONESHOT); | 
| 567 | } | 571 | } | 
| 568 | } | 572 | } | 
| 569 | } | 573 | } | 
| @@ -666,31 +670,26 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, | |||
| 666 | if (dev->next_event.tv64 < bc->next_event.tv64) | 670 | if (dev->next_event.tv64 < bc->next_event.tv64) | 
| 667 | return; | 671 | return; | 
| 668 | } | 672 | } | 
| 669 | clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); | 673 | clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); | 
| 670 | } | 674 | } | 
| 671 | 675 | ||
| 672 | static void broadcast_move_bc(int deadcpu) | 676 | /** | 
| 673 | { | 677 | * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode | 
| 674 | struct clock_event_device *bc = tick_broadcast_device.evtdev; | 678 | * @state: The target state (enter/exit) | 
| 675 | 679 | * | |
| 676 | if (!bc || !broadcast_needs_cpu(bc, deadcpu)) | 680 | * The system enters/leaves a state, where affected devices might stop | 
| 677 | return; | ||
| 678 | /* This moves the broadcast assignment to this cpu */ | ||
| 679 | clockevents_program_event(bc, bc->next_event, 1); | ||
| 680 | } | ||
| 681 | |||
| 682 | /* | ||
| 683 | * Powerstate information: The system enters/leaves a state, where | ||
| 684 | * affected devices might stop | ||
| 685 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 681 | * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. | 
| 682 | * | ||
| 683 | * Called with interrupts disabled, so clockevents_lock is not | ||
| 684 | * required here because the local clock event device cannot go away | ||
| 685 | * under us. | ||
| 686 | */ | 686 | */ | 
| 687 | int tick_broadcast_oneshot_control(unsigned long reason) | 687 | int tick_broadcast_oneshot_control(enum tick_broadcast_state state) | 
| 688 | { | 688 | { | 
| 689 | struct clock_event_device *bc, *dev; | 689 | struct clock_event_device *bc, *dev; | 
| 690 | struct tick_device *td; | 690 | struct tick_device *td; | 
| 691 | unsigned long flags; | ||
| 692 | ktime_t now; | ||
| 693 | int cpu, ret = 0; | 691 | int cpu, ret = 0; | 
| 692 | ktime_t now; | ||
| 694 | 693 | ||
| 695 | /* | 694 | /* | 
| 696 | * Periodic mode does not care about the enter/exit of power | 695 | * Periodic mode does not care about the enter/exit of power | 
| @@ -703,17 +702,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
| 703 | * We are called with preemption disabled from the depth of the | 702 | * We are called with preemption disabled from the depth of the | 
| 704 | * idle code, so we can't be moved away. | 703 | * idle code, so we can't be moved away. | 
| 705 | */ | 704 | */ | 
| 706 | cpu = smp_processor_id(); | 705 | td = this_cpu_ptr(&tick_cpu_device); | 
| 707 | td = &per_cpu(tick_cpu_device, cpu); | ||
| 708 | dev = td->evtdev; | 706 | dev = td->evtdev; | 
| 709 | 707 | ||
| 710 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 708 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 
| 711 | return 0; | 709 | return 0; | 
| 712 | 710 | ||
| 711 | raw_spin_lock(&tick_broadcast_lock); | ||
| 713 | bc = tick_broadcast_device.evtdev; | 712 | bc = tick_broadcast_device.evtdev; | 
| 713 | cpu = smp_processor_id(); | ||
| 714 | 714 | ||
| 715 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 715 | if (state == TICK_BROADCAST_ENTER) { | 
| 716 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | ||
| 717 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 716 | if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { | 
| 718 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 717 | WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); | 
| 719 | broadcast_shutdown_local(bc, dev); | 718 | broadcast_shutdown_local(bc, dev); | 
| @@ -741,7 +740,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
| 741 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 740 | cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); | 
| 742 | } else { | 741 | } else { | 
| 743 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 742 | if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { | 
| 744 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 743 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 745 | /* | 744 | /* | 
| 746 | * The cpu which was handling the broadcast | 745 | * The cpu which was handling the broadcast | 
| 747 | * timer marked this cpu in the broadcast | 746 | * timer marked this cpu in the broadcast | 
| @@ -805,9 +804,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) | |||
| 805 | } | 804 | } | 
| 806 | } | 805 | } | 
| 807 | out: | 806 | out: | 
| 808 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 807 | raw_spin_unlock(&tick_broadcast_lock); | 
| 809 | return ret; | 808 | return ret; | 
| 810 | } | 809 | } | 
| 810 | EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); | ||
| 811 | 811 | ||
| 812 | /* | 812 | /* | 
| 813 | * Reset the one shot broadcast for a cpu | 813 | * Reset the one shot broadcast for a cpu | 
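The enter/exit sides of tick_broadcast_oneshot_control() pivot on atomic test-and-set / test-and-clear of the CPU's bit in tick_broadcast_oneshot_mask: set the bit and shut the local device down when going idle, clear it and reprogram on the way out, with the pending/force masks catching the races. A stripped-down model of just that bit bookkeeping (one word as the mask, pending/force handling omitted):

#include <stdint.h>
#include <stdio.h>

enum tick_broadcast_state { TICK_BROADCAST_ENTER, TICK_BROADCAST_EXIT };

static uint64_t oneshot_mask;   /* stand-in for tick_broadcast_oneshot_mask */

static int oneshot_control(enum tick_broadcast_state state, int cpu)
{
    uint64_t bit = 1ULL << cpu;

    if (state == TICK_BROADCAST_ENTER) {
        if (!(oneshot_mask & bit)) {    /* test_and_set */
            oneshot_mask |= bit;
            printf("cpu%d: local device shut down, broadcast armed\n", cpu);
        }
    } else {
        if (oneshot_mask & bit) {       /* test_and_clear */
            oneshot_mask &= ~bit;
            printf("cpu%d: local device back in oneshot mode\n", cpu);
        }
    }
    return 0;   /* the real code can also return -EBUSY */
}

int main(void)
{
    oneshot_control(TICK_BROADCAST_ENTER, 1);
    oneshot_control(TICK_BROADCAST_EXIT, 1);
    return 0;
}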
| @@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
| 842 | 842 | ||
| 843 | /* Set it up only once ! */ | 843 | /* Set it up only once ! */ | 
| 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 844 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 
| 845 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 845 | int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; | 
| 846 | 846 | ||
| 847 | bc->event_handler = tick_handle_oneshot_broadcast; | 847 | bc->event_handler = tick_handle_oneshot_broadcast; | 
| 848 | 848 | ||
| @@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
| 858 | tick_broadcast_oneshot_mask, tmpmask); | 858 | tick_broadcast_oneshot_mask, tmpmask); | 
| 859 | 859 | ||
| 860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 860 | if (was_periodic && !cpumask_empty(tmpmask)) { | 
| 861 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 861 | clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); | 
| 862 | tick_broadcast_init_next_event(tmpmask, | 862 | tick_broadcast_init_next_event(tmpmask, | 
| 863 | tick_next_period); | 863 | tick_next_period); | 
| 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 864 | tick_broadcast_set_event(bc, cpu, tick_next_period, 1); | 
| @@ -894,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) | |||
| 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 894 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 895 | } | 895 | } | 
| 896 | 896 | ||
| 897 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 898 | void hotplug_cpu__broadcast_tick_pull(int deadcpu) | ||
| 899 | { | ||
| 900 | struct clock_event_device *bc; | ||
| 901 | unsigned long flags; | ||
| 902 | |||
| 903 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
| 904 | bc = tick_broadcast_device.evtdev; | ||
| 905 | |||
| 906 | if (bc && broadcast_needs_cpu(bc, deadcpu)) { | ||
| 907 | /* This moves the broadcast assignment to this CPU: */ | ||
| 908 | clockevents_program_event(bc, bc->next_event, 1); | ||
| 909 | } | ||
| 910 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | ||
| 911 | } | ||
| 897 | 912 | ||
| 898 | /* | 913 | /* | 
| 899 | * Remove a dead CPU from broadcasting | 914 | * Remove a dead CPU from broadcasting | 
| 900 | */ | 915 | */ | 
| 901 | void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | 916 | void tick_shutdown_broadcast_oneshot(unsigned int cpu) | 
| 902 | { | 917 | { | 
| 903 | unsigned long flags; | 918 | unsigned long flags; | 
| 904 | unsigned int cpu = *cpup; | ||
| 905 | 919 | ||
| 906 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 920 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | 
| 907 | 921 | ||
| @@ -913,10 +927,9 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) | |||
| 913 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 927 | cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); | 
| 914 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 928 | cpumask_clear_cpu(cpu, tick_broadcast_force_mask); | 
| 915 | 929 | ||
| 916 | broadcast_move_bc(cpu); | ||
| 917 | |||
| 918 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 930 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 
| 919 | } | 931 | } | 
| 932 | #endif | ||
| 920 | 933 | ||
| 921 | /* | 934 | /* | 
| 922 | * Check, whether the broadcast device is in one shot mode | 935 | * Check, whether the broadcast device is in one shot mode | 
| diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f7c515595b42..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
| @@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev) | |||
| 102 | 102 | ||
| 103 | tick_periodic(cpu); | 103 | tick_periodic(cpu); | 
| 104 | 104 | ||
| 105 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | 105 | if (dev->state != CLOCK_EVT_STATE_ONESHOT) | 
| 106 | return; | 106 | return; | 
| 107 | for (;;) { | 107 | for (;;) { | 
| 108 | /* | 108 | /* | 
| @@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
| 140 | 140 | ||
| 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 141 | if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && | 
| 142 | !tick_broadcast_oneshot_active()) { | 142 | !tick_broadcast_oneshot_active()) { | 
| 143 | clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); | 143 | clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); | 
| 144 | } else { | 144 | } else { | 
| 145 | unsigned long seq; | 145 | unsigned long seq; | 
| 146 | ktime_t next; | 146 | ktime_t next; | 
| @@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) | |||
| 150 | next = tick_next_period; | 150 | next = tick_next_period; | 
| 151 | } while (read_seqretry(&jiffies_lock, seq)); | 151 | } while (read_seqretry(&jiffies_lock, seq)); | 
| 152 | 152 | ||
| 153 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 153 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 154 | 154 | ||
| 155 | for (;;) { | 155 | for (;;) { | 
| 156 | if (!clockevents_program_event(dev, next, false)) | 156 | if (!clockevents_program_event(dev, next, false)) | 
| @@ -332,14 +332,16 @@ out_bc: | |||
| 332 | tick_install_broadcast_device(newdev); | 332 | tick_install_broadcast_device(newdev); | 
| 333 | } | 333 | } | 
| 334 | 334 | ||
| 335 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 335 | /* | 336 | /* | 
| 336 | * Transfer the do_timer job away from a dying cpu. | 337 | * Transfer the do_timer job away from a dying cpu. | 
| 337 | * | 338 | * | 
| 338 | * Called with interrupts disabled. | 339 | * Called with interrupts disabled. Not locking required. If | 
| 340 | * tick_do_timer_cpu is owned by this cpu, nothing can change it. | ||
| 339 | */ | 341 | */ | 
| 340 | void tick_handover_do_timer(int *cpup) | 342 | void tick_handover_do_timer(void) | 
| 341 | { | 343 | { | 
| 342 | if (*cpup == tick_do_timer_cpu) { | 344 | if (tick_do_timer_cpu == smp_processor_id()) { | 
| 343 | int cpu = cpumask_first(cpu_online_mask); | 345 | int cpu = cpumask_first(cpu_online_mask); | 
| 344 | 346 | ||
| 345 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 347 | tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : | 
| @@ -354,9 +356,9 @@ void tick_handover_do_timer(int *cpup) | |||
| 354 | * access the hardware device itself. | 356 | * access the hardware device itself. | 
| 355 | * We just set the mode and remove it from the lists. | 357 | * We just set the mode and remove it from the lists. | 
| 356 | */ | 358 | */ | 
| 357 | void tick_shutdown(unsigned int *cpup) | 359 | void tick_shutdown(unsigned int cpu) | 
| 358 | { | 360 | { | 
| 359 | struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); | 361 | struct tick_device *td = &per_cpu(tick_cpu_device, cpu); | 
| 360 | struct clock_event_device *dev = td->evtdev; | 362 | struct clock_event_device *dev = td->evtdev; | 
| 361 | 363 | ||
| 362 | td->mode = TICKDEV_MODE_PERIODIC; | 364 | td->mode = TICKDEV_MODE_PERIODIC; | 
| @@ -365,27 +367,42 @@ void tick_shutdown(unsigned int *cpup) | |||
| 365 | * Prevent that the clock events layer tries to call | 367 | * Prevent that the clock events layer tries to call | 
| 366 | * the set mode function! | 368 | * the set mode function! | 
| 367 | */ | 369 | */ | 
| 370 | dev->state = CLOCK_EVT_STATE_DETACHED; | ||
| 368 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 371 | dev->mode = CLOCK_EVT_MODE_UNUSED; | 
| 369 | clockevents_exchange_device(dev, NULL); | 372 | clockevents_exchange_device(dev, NULL); | 
| 370 | dev->event_handler = clockevents_handle_noop; | 373 | dev->event_handler = clockevents_handle_noop; | 
| 371 | td->evtdev = NULL; | 374 | td->evtdev = NULL; | 
| 372 | } | 375 | } | 
| 373 | } | 376 | } | 
| 377 | #endif | ||
| 374 | 378 | ||
| 375 | void tick_suspend(void) | 379 | /** | 
| 380 | * tick_suspend_local - Suspend the local tick device | ||
| 381 | * | ||
| 382 | * Called from the local cpu for freeze with interrupts disabled. | ||
| 383 | * | ||
| 384 | * No locks required. Nothing can change the per cpu device. | ||
| 385 | */ | ||
| 386 | void tick_suspend_local(void) | ||
| 376 | { | 387 | { | 
| 377 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 388 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 
| 378 | 389 | ||
| 379 | clockevents_shutdown(td->evtdev); | 390 | clockevents_shutdown(td->evtdev); | 
| 380 | } | 391 | } | 
| 381 | 392 | ||
| 382 | void tick_resume(void) | 393 | /** | 
| 394 | * tick_resume_local - Resume the local tick device | ||
| 395 | * | ||
| 396 | * Called from the local CPU for unfreeze or XEN resume magic. | ||
| 397 | * | ||
| 398 | * No locks required. Nothing can change the per cpu device. | ||
| 399 | */ | ||
| 400 | void tick_resume_local(void) | ||
| 383 | { | 401 | { | 
| 384 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 402 | struct tick_device *td = this_cpu_ptr(&tick_cpu_device); | 
| 385 | int broadcast = tick_resume_broadcast(); | 403 | bool broadcast = tick_resume_check_broadcast(); | 
| 386 | |||
| 387 | clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); | ||
| 388 | 404 | ||
| 405 | clockevents_tick_resume(td->evtdev); | ||
| 389 | if (!broadcast) { | 406 | if (!broadcast) { | 
| 390 | if (td->mode == TICKDEV_MODE_PERIODIC) | 407 | if (td->mode == TICKDEV_MODE_PERIODIC) | 
| 391 | tick_setup_periodic(td->evtdev, 0); | 408 | tick_setup_periodic(td->evtdev, 0); | 
| @@ -394,6 +411,35 @@ void tick_resume(void) | |||
| 394 | } | 411 | } | 
| 395 | } | 412 | } | 
| 396 | 413 | ||
| 414 | /** | ||
| 415 | * tick_suspend - Suspend the tick and the broadcast device | ||
| 416 | * | ||
| 417 | * Called from syscore_suspend() via timekeeping_suspend with only one | ||
| 418 | * CPU online and interrupts disabled or from tick_unfreeze() under | ||
| 419 | * tick_freeze_lock. | ||
| 420 | * | ||
| 421 | * No locks required. Nothing can change the per cpu device. | ||
| 422 | */ | ||
| 423 | void tick_suspend(void) | ||
| 424 | { | ||
| 425 | tick_suspend_local(); | ||
| 426 | tick_suspend_broadcast(); | ||
| 427 | } | ||
| 428 | |||
| 429 | /** | ||
| 430 | * tick_resume - Resume the tick and the broadcast device | ||
| 431 | * | ||
| 432 | * Called from syscore_resume() via timekeeping_resume with only one | ||
| 433 | * CPU online and interrupts disabled. | ||
| 434 | * | ||
| 435 | * No locks required. Nothing can change the per cpu device. | ||
| 436 | */ | ||
| 437 | void tick_resume(void) | ||
| 438 | { | ||
| 439 | tick_resume_broadcast(); | ||
| 440 | tick_resume_local(); | ||
| 441 | } | ||
| 442 | |||
| 397 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 443 | static DEFINE_RAW_SPINLOCK(tick_freeze_lock); | 
| 398 | static unsigned int tick_freeze_depth; | 444 | static unsigned int tick_freeze_depth; | 
| 399 | 445 | ||
| @@ -411,12 +457,10 @@ void tick_freeze(void) | |||
| 411 | raw_spin_lock(&tick_freeze_lock); | 457 | raw_spin_lock(&tick_freeze_lock); | 
| 412 | 458 | ||
| 413 | tick_freeze_depth++; | 459 | tick_freeze_depth++; | 
| 414 | if (tick_freeze_depth == num_online_cpus()) { | 460 | if (tick_freeze_depth == num_online_cpus()) | 
| 415 | timekeeping_suspend(); | 461 | timekeeping_suspend(); | 
| 416 | } else { | 462 | else | 
| 417 | tick_suspend(); | 463 | tick_suspend_local(); | 
| 418 | tick_suspend_broadcast(); | ||
| 419 | } | ||
| 420 | 464 | ||
| 421 | raw_spin_unlock(&tick_freeze_lock); | 465 | raw_spin_unlock(&tick_freeze_lock); | 
| 422 | } | 466 | } | 
| @@ -437,7 +481,7 @@ void tick_unfreeze(void) | |||
| 437 | if (tick_freeze_depth == num_online_cpus()) | 481 | if (tick_freeze_depth == num_online_cpus()) | 
| 438 | timekeeping_resume(); | 482 | timekeeping_resume(); | 
| 439 | else | 483 | else | 
| 440 | tick_resume(); | 484 | tick_resume_local(); | 
| 441 | 485 | ||
| 442 | tick_freeze_depth--; | 486 | tick_freeze_depth--; | 
| 443 | 487 | ||
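tick_freeze()/tick_unfreeze() above count CPUs under tick_freeze_lock: every CPU suspends or resumes its local tick, and only the last one to freeze (and the first one to unfreeze) touches timekeeping. A small model of that depth counting, with the per-CPU and global operations reduced to prints and the number of online CPUs assumed fixed:

#include <stdio.h>

#define NUM_ONLINE_CPUS 4              /* assumption for the sketch */

static unsigned int tick_freeze_depth;

static void tick_freeze_model(int cpu)
{
    /* the real code takes tick_freeze_lock around this */
    tick_freeze_depth++;
    if (tick_freeze_depth == NUM_ONLINE_CPUS)
        printf("cpu%d: last one in, suspend timekeeping\n", cpu);
    else
        printf("cpu%d: suspend local tick only\n", cpu);
}

static void tick_unfreeze_model(int cpu)
{
    if (tick_freeze_depth == NUM_ONLINE_CPUS)
        printf("cpu%d: first one out, resume timekeeping\n", cpu);
    else
        printf("cpu%d: resume local tick only\n", cpu);
    tick_freeze_depth--;
}

int main(void)
{
    for (int cpu = 0; cpu < NUM_ONLINE_CPUS; cpu++)
        tick_freeze_model(cpu);
    for (int cpu = NUM_ONLINE_CPUS - 1; cpu >= 0; cpu--)
        tick_unfreeze_model(cpu);
    return 0;
}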
| diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
| @@ -5,15 +5,12 @@ | |||
| 5 | #include <linux/tick.h> | 5 | #include <linux/tick.h> | 
| 6 | 6 | ||
| 7 | #include "timekeeping.h" | 7 | #include "timekeeping.h" | 
| 8 | #include "tick-sched.h" | ||
| 8 | 9 | ||
| 9 | extern seqlock_t jiffies_lock; | 10 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 
| 10 | 11 | ||
| 11 | #define CS_NAME_LEN 32 | 12 | # define TICK_DO_TIMER_NONE -1 | 
| 12 | 13 | # define TICK_DO_TIMER_BOOT -2 | |
| 13 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD | ||
| 14 | |||
| 15 | #define TICK_DO_TIMER_NONE -1 | ||
| 16 | #define TICK_DO_TIMER_BOOT -2 | ||
| 17 | 14 | ||
| 18 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 15 | DECLARE_PER_CPU(struct tick_device, tick_cpu_device); | 
| 19 | extern ktime_t tick_next_period; | 16 | extern ktime_t tick_next_period; | 
| @@ -23,21 +20,72 @@ extern int tick_do_timer_cpu __read_mostly; | |||
| 23 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 20 | extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); | 
| 24 | extern void tick_handle_periodic(struct clock_event_device *dev); | 21 | extern void tick_handle_periodic(struct clock_event_device *dev); | 
| 25 | extern void tick_check_new_device(struct clock_event_device *dev); | 22 | extern void tick_check_new_device(struct clock_event_device *dev); | 
| 26 | extern void tick_handover_do_timer(int *cpup); | 23 | extern void tick_shutdown(unsigned int cpu); | 
| 27 | extern void tick_shutdown(unsigned int *cpup); | ||
| 28 | extern void tick_suspend(void); | 24 | extern void tick_suspend(void); | 
| 29 | extern void tick_resume(void); | 25 | extern void tick_resume(void); | 
| 30 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 26 | extern bool tick_check_replacement(struct clock_event_device *curdev, | 
| 31 | struct clock_event_device *newdev); | 27 | struct clock_event_device *newdev); | 
| 32 | extern void tick_install_replacement(struct clock_event_device *dev); | 28 | extern void tick_install_replacement(struct clock_event_device *dev); | 
| 29 | extern int tick_is_oneshot_available(void); | ||
| 30 | extern struct tick_device *tick_get_device(int cpu); | ||
| 33 | 31 | ||
| 34 | extern void clockevents_shutdown(struct clock_event_device *dev); | 32 | extern int clockevents_tick_resume(struct clock_event_device *dev); | 
| 33 | /* Check, if the device is functional or a dummy for broadcast */ | ||
| 34 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
| 35 | { | ||
| 36 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
| 37 | } | ||
| 35 | 38 | ||
| 39 | extern void clockevents_shutdown(struct clock_event_device *dev); | ||
| 40 | extern void clockevents_exchange_device(struct clock_event_device *old, | ||
| 41 | struct clock_event_device *new); | ||
| 42 | extern void clockevents_set_state(struct clock_event_device *dev, | ||
| 43 | enum clock_event_state state); | ||
| 44 | extern int clockevents_program_event(struct clock_event_device *dev, | ||
| 45 | ktime_t expires, bool force); | ||
| 46 | extern void clockevents_handle_noop(struct clock_event_device *dev); | ||
| 47 | extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 36 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 48 | extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); | 
| 37 | 49 | ||
| 38 | /* | 50 | /* Broadcasting support */ | 
| 39 | * NO_HZ / high resolution timer shared code | 51 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 
| 40 | */ | 52 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 
| 53 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
| 54 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
| 55 | extern void tick_shutdown_broadcast(unsigned int cpu); | ||
| 56 | extern void tick_suspend_broadcast(void); | ||
| 57 | extern void tick_resume_broadcast(void); | ||
| 58 | extern bool tick_resume_check_broadcast(void); | ||
| 59 | extern void tick_broadcast_init(void); | ||
| 60 | extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
| 61 | extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 62 | extern struct tick_device *tick_get_broadcast_device(void); | ||
| 63 | extern struct cpumask *tick_get_broadcast_mask(void); | ||
| 64 | # else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ | ||
| 65 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } | ||
| 66 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } | ||
| 67 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } | ||
| 68 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
| 69 | static inline void tick_shutdown_broadcast(unsigned int cpu) { } | ||
| 70 | static inline void tick_suspend_broadcast(void) { } | ||
| 71 | static inline void tick_resume_broadcast(void) { } | ||
| 72 | static inline bool tick_resume_check_broadcast(void) { return false; } | ||
| 73 | static inline void tick_broadcast_init(void) { } | ||
| 74 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } | ||
| 75 | |||
| 76 | /* Set the periodic handler in non broadcast mode */ | ||
| 77 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) | ||
| 78 | { | ||
| 79 | dev->event_handler = tick_handle_periodic; | ||
| 80 | } | ||
| 81 | # endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ | ||
| 82 | |||
| 83 | #else /* !GENERIC_CLOCKEVENTS: */ | ||
| 84 | static inline void tick_suspend(void) { } | ||
| 85 | static inline void tick_resume(void) { } | ||
| 86 | #endif /* !GENERIC_CLOCKEVENTS */ | ||
| 87 | |||
| 88 | /* Oneshot related functions */ | ||
| 41 | #ifdef CONFIG_TICK_ONESHOT | 89 | #ifdef CONFIG_TICK_ONESHOT | 
| 42 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 90 | extern void tick_setup_oneshot(struct clock_event_device *newdev, | 
| 43 | void (*handler)(struct clock_event_device *), | 91 | void (*handler)(struct clock_event_device *), | 
| @@ -46,58 +94,42 @@ extern int tick_program_event(ktime_t expires, int force); | |||
| 46 | extern void tick_oneshot_notify(void); | 94 | extern void tick_oneshot_notify(void); | 
| 47 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 95 | extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); | 
| 48 | extern void tick_resume_oneshot(void); | 96 | extern void tick_resume_oneshot(void); | 
| 49 | # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 97 | static inline bool tick_oneshot_possible(void) { return true; } | 
| 98 | extern int tick_oneshot_mode_active(void); | ||
| 99 | extern void tick_clock_notify(void); | ||
| 100 | extern int tick_check_oneshot_change(int allow_nohz); | ||
| 101 | extern int tick_init_highres(void); | ||
| 102 | #else /* !CONFIG_TICK_ONESHOT: */ | ||
| 103 | static inline | ||
| 104 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
| 105 | void (*handler)(struct clock_event_device *), | ||
| 106 | ktime_t nextevt) { BUG(); } | ||
| 107 | static inline void tick_resume_oneshot(void) { BUG(); } | ||
| 108 | static inline int tick_program_event(ktime_t expires, int force) { return 0; } | ||
| 109 | static inline void tick_oneshot_notify(void) { } | ||
| 110 | static inline bool tick_oneshot_possible(void) { return false; } | ||
| 111 | static inline int tick_oneshot_mode_active(void) { return 0; } | ||
| 112 | static inline void tick_clock_notify(void) { } | ||
| 113 | static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } | ||
| 114 | #endif /* !CONFIG_TICK_ONESHOT */ | ||
| 115 | |||
| 116 | /* Functions related to oneshot broadcasting */ | ||
| 117 | #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) | ||
| 50 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 118 | extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); | 
| 51 | extern int tick_broadcast_oneshot_control(unsigned long reason); | ||
| 52 | extern void tick_broadcast_switch_to_oneshot(void); | 119 | extern void tick_broadcast_switch_to_oneshot(void); | 
| 53 | extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); | 120 | extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); | 
| 54 | extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); | ||
| 55 | extern int tick_broadcast_oneshot_active(void); | 121 | extern int tick_broadcast_oneshot_active(void); | 
| 56 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 122 | extern void tick_check_oneshot_broadcast_this_cpu(void); | 
| 57 | bool tick_broadcast_oneshot_available(void); | 123 | bool tick_broadcast_oneshot_available(void); | 
| 58 | # else /* BROADCAST */ | 124 | extern struct cpumask *tick_get_broadcast_oneshot_mask(void); | 
| 59 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 125 | #else /* !(BROADCAST && ONESHOT): */ | 
| 60 | { | 126 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } | 
| 61 | BUG(); | ||
| 62 | } | ||
| 63 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
| 64 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 127 | static inline void tick_broadcast_switch_to_oneshot(void) { } | 
| 65 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | 128 | static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } | 
| 66 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 129 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | 
| 67 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 130 | static inline void tick_check_oneshot_broadcast_this_cpu(void) { } | 
| 68 | static inline bool tick_broadcast_oneshot_available(void) { return true; } | 131 | static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } | 
| 69 | # endif /* !BROADCAST */ | 132 | #endif /* !(BROADCAST && ONESHOT) */ | 
| 70 | |||
| 71 | #else /* !ONESHOT */ | ||
| 72 | static inline | ||
| 73 | void tick_setup_oneshot(struct clock_event_device *newdev, | ||
| 74 | void (*handler)(struct clock_event_device *), | ||
| 75 | ktime_t nextevt) | ||
| 76 | { | ||
| 77 | BUG(); | ||
| 78 | } | ||
| 79 | static inline void tick_resume_oneshot(void) | ||
| 80 | { | ||
| 81 | BUG(); | ||
| 82 | } | ||
| 83 | static inline int tick_program_event(ktime_t expires, int force) | ||
| 84 | { | ||
| 85 | return 0; | ||
| 86 | } | ||
| 87 | static inline void tick_oneshot_notify(void) { } | ||
| 88 | static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | ||
| 89 | { | ||
| 90 | BUG(); | ||
| 91 | } | ||
| 92 | static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } | ||
| 93 | static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } | ||
| 94 | static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | ||
| 95 | { | ||
| 96 | return 0; | ||
| 97 | } | ||
| 98 | static inline int tick_broadcast_oneshot_active(void) { return 0; } | ||
| 99 | static inline bool tick_broadcast_oneshot_available(void) { return false; } | ||
| 100 | #endif /* !TICK_ONESHOT */ | ||
| 101 | 133 | ||
| 102 | /* NO_HZ_FULL internal */ | 134 | /* NO_HZ_FULL internal */ | 
| 103 | #ifdef CONFIG_NO_HZ_FULL | 135 | #ifdef CONFIG_NO_HZ_FULL | 
| @@ -105,68 +137,3 @@ extern void tick_nohz_init(void); | |||
| 105 | # else | 137 | # else | 
| 106 | static inline void tick_nohz_init(void) { } | 138 | static inline void tick_nohz_init(void) { } | 
| 107 | #endif | 139 | #endif | 
| 108 | |||
| 109 | /* | ||
| 110 | * Broadcasting support | ||
| 111 | */ | ||
| 112 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | ||
| 113 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | ||
| 114 | extern void tick_install_broadcast_device(struct clock_event_device *dev); | ||
| 115 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | ||
| 116 | extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); | ||
| 117 | extern void tick_shutdown_broadcast(unsigned int *cpup); | ||
| 118 | extern void tick_suspend_broadcast(void); | ||
| 119 | extern int tick_resume_broadcast(void); | ||
| 120 | extern void tick_broadcast_init(void); | ||
| 121 | extern void | ||
| 122 | tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); | ||
| 123 | int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 124 | |||
| 125 | #else /* !BROADCAST */ | ||
| 126 | |||
| 127 | static inline void tick_install_broadcast_device(struct clock_event_device *dev) | ||
| 128 | { | ||
| 129 | } | ||
| 130 | |||
| 131 | static inline int tick_is_broadcast_device(struct clock_event_device *dev) | ||
| 132 | { | ||
| 133 | return 0; | ||
| 134 | } | ||
| 135 | static inline int tick_device_uses_broadcast(struct clock_event_device *dev, | ||
| 136 | int cpu) | ||
| 137 | { | ||
| 138 | return 0; | ||
| 139 | } | ||
| 140 | static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } | ||
| 141 | static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } | ||
| 142 | static inline void tick_shutdown_broadcast(unsigned int *cpup) { } | ||
| 143 | static inline void tick_suspend_broadcast(void) { } | ||
| 144 | static inline int tick_resume_broadcast(void) { return 0; } | ||
| 145 | static inline void tick_broadcast_init(void) { } | ||
| 146 | static inline int tick_broadcast_update_freq(struct clock_event_device *dev, | ||
| 147 | u32 freq) { return -ENODEV; } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * Set the periodic handler in non broadcast mode | ||
| 151 | */ | ||
| 152 | static inline void tick_set_periodic_handler(struct clock_event_device *dev, | ||
| 153 | int broadcast) | ||
| 154 | { | ||
| 155 | dev->event_handler = tick_handle_periodic; | ||
| 156 | } | ||
| 157 | #endif /* !BROADCAST */ | ||
| 158 | |||
| 159 | /* | ||
| 160 | * Check, if the device is functional or a dummy for broadcast | ||
| 161 | */ | ||
| 162 | static inline int tick_device_is_functional(struct clock_event_device *dev) | ||
| 163 | { | ||
| 164 | return !(dev->features & CLOCK_EVT_FEAT_DUMMY); | ||
| 165 | } | ||
| 166 | |||
| 167 | int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); | ||
| 168 | |||
| 169 | #endif | ||
| 170 | |||
| 171 | extern void do_timer(unsigned long ticks); | ||
| 172 | extern void update_wall_time(void); | ||
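The tick-internal.h reshuffle above groups every oneshot-dependent declaration under CONFIG_TICK_ONESHOT and lets the stub of tick_broadcast_oneshot_available() defer to the new tick_oneshot_possible() helper. A minimal standalone sketch of that stub pattern follows; the config macro and helpers are modeled locally here, not pulled from the kernel headers.

#include <stdbool.h>
#include <stdio.h>

#ifdef CONFIG_TICK_ONESHOT
static bool tick_oneshot_possible(void) { return true; }
#else
static bool tick_oneshot_possible(void) { return false; }
#endif

/* Stub used when broadcast + oneshot support is not built in: availability
 * simply collapses to "is oneshot mode compiled in at all?". */
static bool tick_broadcast_oneshot_available(void)
{
	return tick_oneshot_possible();
}

int main(void)
{
	/* Callers test availability unconditionally; the #ifdefs stay in the header. */
	if (tick_broadcast_oneshot_available())
		printf("oneshot broadcast can be used\n");
	else
		printf("stay in periodic mode\n");
	return 0;
}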
| diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c | |||
| @@ -38,7 +38,7 @@ void tick_resume_oneshot(void) | |||
| 38 | { | 38 | { | 
| 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 39 | struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); | 
| 40 | 40 | ||
| 41 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 41 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 42 | clockevents_program_event(dev, ktime_get(), true); | 42 | clockevents_program_event(dev, ktime_get(), true); | 
| 43 | } | 43 | } | 
| 44 | 44 | ||
| @@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, | |||
| 50 | ktime_t next_event) | 50 | ktime_t next_event) | 
| 51 | { | 51 | { | 
| 52 | newdev->event_handler = handler; | 52 | newdev->event_handler = handler; | 
| 53 | clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); | 53 | clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); | 
| 54 | clockevents_program_event(newdev, next_event, true); | 54 | clockevents_program_event(newdev, next_event, true); | 
| 55 | } | 55 | } | 
| 56 | 56 | ||
| @@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) | |||
| 81 | 81 | ||
| 82 | td->mode = TICKDEV_MODE_ONESHOT; | 82 | td->mode = TICKDEV_MODE_ONESHOT; | 
| 83 | dev->event_handler = handler; | 83 | dev->event_handler = handler; | 
| 84 | clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); | 84 | clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); | 
| 85 | tick_broadcast_switch_to_oneshot(); | 85 | tick_broadcast_switch_to_oneshot(); | 
| 86 | return 0; | 86 | return 0; | 
| 87 | } | 87 | } | 
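The tick-oneshot.c hunks only swap clockevents_set_mode()/CLOCK_EVT_MODE_ONESHOT for clockevents_set_state()/CLOCK_EVT_STATE_ONESHOT; the setup sequence itself is unchanged. Below is a toy model of that sequence with local stand-in types, not the kernel clockevents API.

#include <stdint.h>
#include <stdio.h>

enum toy_clock_state { TOY_STATE_DETACHED, TOY_STATE_PERIODIC, TOY_STATE_ONESHOT };

struct toy_clock_event {
	void (*event_handler)(struct toy_clock_event *);
	enum toy_clock_state state;
	int64_t next_event_ns;
};

/* Stand-in for clockevents_set_state() */
static void toy_set_state(struct toy_clock_event *dev, enum toy_clock_state s)
{
	dev->state = s;
}

/* Stand-in for clockevents_program_event() */
static void toy_program_event(struct toy_clock_event *dev, int64_t expires_ns)
{
	dev->next_event_ns = expires_ns;
}

static void toy_tick_handler(struct toy_clock_event *dev)
{
	(void)dev;
}

int main(void)
{
	struct toy_clock_event dev = { 0 };

	/* Mirrors tick_setup_oneshot(): install handler, switch the device
	 * to the oneshot state, program the first expiry. */
	dev.event_handler = toy_tick_handler;
	toy_set_state(&dev, TOY_STATE_ONESHOT);
	toy_program_event(&dev, 1000000);	/* 1 ms from "now" */

	printf("state=%d next=%lld ns\n", dev.state, (long long)dev.next_event_ns);
	return 0;
}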
| diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4c4edac4528..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -34,7 +34,7 @@ | |||
| 34 | /* | 34 | /* | 
| 35 | * Per cpu nohz control structure | 35 | * Per cpu nohz control structure | 
| 36 | */ | 36 | */ | 
| 37 | DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 37 | static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); | 
| 38 | 38 | ||
| 39 | /* | 39 | /* | 
| 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 40 | * The time, when the last jiffy update happened. Protected by jiffies_lock. | 
| @@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str) | |||
| 416 | 416 | ||
| 417 | __setup("nohz=", setup_tick_nohz); | 417 | __setup("nohz=", setup_tick_nohz); | 
| 418 | 418 | ||
| 419 | int tick_nohz_tick_stopped(void) | ||
| 420 | { | ||
| 421 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | ||
| 422 | } | ||
| 423 | |||
| 419 | /** | 424 | /** | 
| 420 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 425 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 
| 421 | * | 426 | * | 
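With tick_cpu_sched made static, code outside tick-sched.c reads the per-CPU "tick stopped" flag through the new tick_nohz_tick_stopped() accessor. A standalone sketch of that accessor-over-private-state pattern, with the per-CPU machinery reduced to a plain array:

#include <stdio.h>

#define NR_CPUS 4

struct tick_sched_model { int tick_stopped; };

/* Private to this translation unit, as the patch makes tick_cpu_sched. */
static struct tick_sched_model tick_cpu_sched_model[NR_CPUS];

static int this_cpu(void) { return 0; }	/* stand-in for smp_processor_id() */

int tick_nohz_tick_stopped_model(void)
{
	return tick_cpu_sched_model[this_cpu()].tick_stopped;
}

int main(void)
{
	tick_cpu_sched_model[this_cpu()].tick_stopped = 1;
	printf("tick stopped on this cpu: %d\n", tick_nohz_tick_stopped_model());
	return 0;
}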
| diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..28b5da3e1a17 --- /dev/null +++ b/kernel/time/tick-sched.h | |||
| @@ -0,0 +1,74 @@ | |||
| 1 | #ifndef _TICK_SCHED_H | ||
| 2 | #define _TICK_SCHED_H | ||
| 3 | |||
| 4 | #include <linux/hrtimer.h> | ||
| 5 | |||
| 6 | enum tick_device_mode { | ||
| 7 | TICKDEV_MODE_PERIODIC, | ||
| 8 | TICKDEV_MODE_ONESHOT, | ||
| 9 | }; | ||
| 10 | |||
| 11 | struct tick_device { | ||
| 12 | struct clock_event_device *evtdev; | ||
| 13 | enum tick_device_mode mode; | ||
| 14 | }; | ||
| 15 | |||
| 16 | enum tick_nohz_mode { | ||
| 17 | NOHZ_MODE_INACTIVE, | ||
| 18 | NOHZ_MODE_LOWRES, | ||
| 19 | NOHZ_MODE_HIGHRES, | ||
| 20 | }; | ||
| 21 | |||
| 22 | /** | ||
| 23 | * struct tick_sched - sched tick emulation and no idle tick control/stats | ||
| 24 | * @sched_timer: hrtimer to schedule the periodic tick in high | ||
| 25 | * resolution mode | ||
| 26 | * @last_tick: Store the last tick expiry time when the tick | ||
| 27 | * timer is modified for nohz sleeps. This is necessary | ||
| 28 | * to resume the tick timer operation in the timeline | ||
| 29 | * when the CPU returns from nohz sleep. | ||
| 30 | * @tick_stopped: Indicator that the idle tick has been stopped | ||
| 31 | * @idle_jiffies: jiffies at the entry to idle for idle time accounting | ||
| 32 | * @idle_calls: Total number of idle calls | ||
| 33 | * @idle_sleeps: Number of idle calls, where the sched tick was stopped | ||
| 34 | * @idle_entrytime: Time when the idle call was entered | ||
| 35 | * @idle_waketime: Time when the idle was interrupted | ||
| 36 | * @idle_exittime: Time when the idle state was left | ||
| 37 | * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped | ||
| 38 | * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding | ||
| 39 | * @sleep_length: Duration of the current idle sleep | ||
| 40 | * @do_timer_last: CPU was the last one doing do_timer before going idle | ||
| 41 | */ | ||
| 42 | struct tick_sched { | ||
| 43 | struct hrtimer sched_timer; | ||
| 44 | unsigned long check_clocks; | ||
| 45 | enum tick_nohz_mode nohz_mode; | ||
| 46 | ktime_t last_tick; | ||
| 47 | int inidle; | ||
| 48 | int tick_stopped; | ||
| 49 | unsigned long idle_jiffies; | ||
| 50 | unsigned long idle_calls; | ||
| 51 | unsigned long idle_sleeps; | ||
| 52 | int idle_active; | ||
| 53 | ktime_t idle_entrytime; | ||
| 54 | ktime_t idle_waketime; | ||
| 55 | ktime_t idle_exittime; | ||
| 56 | ktime_t idle_sleeptime; | ||
| 57 | ktime_t iowait_sleeptime; | ||
| 58 | ktime_t sleep_length; | ||
| 59 | unsigned long last_jiffies; | ||
| 60 | unsigned long next_jiffies; | ||
| 61 | ktime_t idle_expires; | ||
| 62 | int do_timer_last; | ||
| 63 | }; | ||
| 64 | |||
| 65 | extern struct tick_sched *tick_get_tick_sched(int cpu); | ||
| 66 | |||
| 67 | extern void tick_setup_sched_timer(void); | ||
| 68 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS | ||
| 69 | extern void tick_cancel_sched_timer(int cpu); | ||
| 70 | #else | ||
| 71 | static inline void tick_cancel_sched_timer(int cpu) { } | ||
| 72 | #endif | ||
| 73 | |||
| 74 | #endif | ||
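The new private header gives kernel/time a single home for struct tick_sched and its idle statistics. As a small illustration, here is a toy reader of two of the documented counters; the structure is a reduced local copy, not the header itself.

#include <stdio.h>

struct tick_sched_stats {
	unsigned long idle_calls;	/* total idle entries (@idle_calls) */
	unsigned long idle_sleeps;	/* idle entries with the tick stopped (@idle_sleeps) */
};

static double stopped_ratio(const struct tick_sched_stats *ts)
{
	if (!ts->idle_calls)
		return 0.0;
	return (double)ts->idle_sleeps / (double)ts->idle_calls;
}

int main(void)
{
	struct tick_sched_stats ts = { .idle_calls = 1000, .idle_sleeps = 850 };

	printf("tick stopped in %.0f%% of idle entries\n", 100.0 * stopped_ratio(&ts));
	return 0;
}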
| diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -59,17 +59,15 @@ struct tk_fast { | |||
| 59 | }; | 59 | }; | 
| 60 | 60 | ||
| 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 61 | static struct tk_fast tk_fast_mono ____cacheline_aligned; | 
| 62 | static struct tk_fast tk_fast_raw ____cacheline_aligned; | ||
| 62 | 63 | ||
| 63 | /* flag for if timekeeping is suspended */ | 64 | /* flag for if timekeeping is suspended */ | 
| 64 | int __read_mostly timekeeping_suspended; | 65 | int __read_mostly timekeeping_suspended; | 
| 65 | 66 | ||
| 66 | /* Flag for if there is a persistent clock on this platform */ | ||
| 67 | bool __read_mostly persistent_clock_exist = false; | ||
| 68 | |||
| 69 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 67 | static inline void tk_normalize_xtime(struct timekeeper *tk) | 
| 70 | { | 68 | { | 
| 71 | while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { | 69 | while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { | 
| 72 | tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; | 70 | tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; | 
| 73 | tk->xtime_sec++; | 71 | tk->xtime_sec++; | 
| 74 | } | 72 | } | 
| 75 | } | 73 | } | 
| @@ -79,20 +77,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) | |||
| 79 | struct timespec64 ts; | 77 | struct timespec64 ts; | 
| 80 | 78 | ||
| 81 | ts.tv_sec = tk->xtime_sec; | 79 | ts.tv_sec = tk->xtime_sec; | 
| 82 | ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 80 | ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); | 
| 83 | return ts; | 81 | return ts; | 
| 84 | } | 82 | } | 
| 85 | 83 | ||
| 86 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 84 | static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) | 
| 87 | { | 85 | { | 
| 88 | tk->xtime_sec = ts->tv_sec; | 86 | tk->xtime_sec = ts->tv_sec; | 
| 89 | tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; | 87 | tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; | 
| 90 | } | 88 | } | 
| 91 | 89 | ||
| 92 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 90 | static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) | 
| 93 | { | 91 | { | 
| 94 | tk->xtime_sec += ts->tv_sec; | 92 | tk->xtime_sec += ts->tv_sec; | 
| 95 | tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; | 93 | tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; | 
| 96 | tk_normalize_xtime(tk); | 94 | tk_normalize_xtime(tk); | 
| 97 | } | 95 | } | 
| 98 | 96 | ||
| @@ -118,6 +116,117 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) | |||
| 118 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 116 | tk->offs_boot = ktime_add(tk->offs_boot, delta); | 
| 119 | } | 117 | } | 
| 120 | 118 | ||
| 119 | #ifdef CONFIG_DEBUG_TIMEKEEPING | ||
| 120 | #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ | ||
| 121 | /* | ||
| 122 | * These simple flag variables are managed | ||
| 123 | * without locks, which is racy, but ok since | ||
| 124 | * we don't really care about being super | ||
| 125 | * precise about how many events were seen, | ||
| 126 | * just that a problem was observed. | ||
| 127 | */ | ||
| 128 | static int timekeeping_underflow_seen; | ||
| 129 | static int timekeeping_overflow_seen; | ||
| 130 | |||
| 131 | /* last_warning is only modified under the timekeeping lock */ | ||
| 132 | static long timekeeping_last_warning; | ||
| 133 | |||
| 134 | static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
| 135 | { | ||
| 136 | |||
| 137 | cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; | ||
| 138 | const char *name = tk->tkr_mono.clock->name; | ||
| 139 | |||
| 140 | if (offset > max_cycles) { | ||
| 141 | printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", | ||
| 142 | offset, name, max_cycles); | ||
| 143 | printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); | ||
| 144 | } else { | ||
| 145 | if (offset > (max_cycles >> 1)) { | ||
| 146 | printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", | ||
| 147 | offset, name, max_cycles >> 1); | ||
| 148 | printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); | ||
| 149 | } | ||
| 150 | } | ||
| 151 | |||
| 152 | if (timekeeping_underflow_seen) { | ||
| 153 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
| 154 | printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); | ||
| 155 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
| 156 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
| 157 | timekeeping_last_warning = jiffies; | ||
| 158 | } | ||
| 159 | timekeeping_underflow_seen = 0; | ||
| 160 | } | ||
| 161 | |||
| 162 | if (timekeeping_overflow_seen) { | ||
| 163 | if (jiffies - timekeeping_last_warning > WARNING_FREQ) { | ||
| 164 | printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); | ||
| 165 | printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); | ||
| 166 | printk_deferred(" Your kernel is probably still fine.\n"); | ||
| 167 | timekeeping_last_warning = jiffies; | ||
| 168 | } | ||
| 169 | timekeeping_overflow_seen = 0; | ||
| 170 | } | ||
| 171 | } | ||
| 172 | |||
| 173 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
| 174 | { | ||
| 175 | cycle_t now, last, mask, max, delta; | ||
| 176 | unsigned int seq; | ||
| 177 | |||
| 178 | /* | ||
| 179 | * Since we're called holding a seqlock, the data may shift | ||
| 180 | * under us while we're doing the calculation. This can cause | ||
| 181 | * false positives, since we'd note a problem but throw the | ||
| 182 | * results away. So nest another seqlock here to atomically | ||
| 183 | * grab the points we are checking with. | ||
| 184 | */ | ||
| 185 | do { | ||
| 186 | seq = read_seqcount_begin(&tk_core.seq); | ||
| 187 | now = tkr->read(tkr->clock); | ||
| 188 | last = tkr->cycle_last; | ||
| 189 | mask = tkr->mask; | ||
| 190 | max = tkr->clock->max_cycles; | ||
| 191 | } while (read_seqcount_retry(&tk_core.seq, seq)); | ||
| 192 | |||
| 193 | delta = clocksource_delta(now, last, mask); | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Try to catch underflows by checking if we are seeing small | ||
| 197 | * mask-relative negative values. | ||
| 198 | */ | ||
| 199 | if (unlikely((~delta & mask) < (mask >> 3))) { | ||
| 200 | timekeeping_underflow_seen = 1; | ||
| 201 | delta = 0; | ||
| 202 | } | ||
| 203 | |||
| 204 | /* Cap delta value to the max_cycles values to avoid mult overflows */ | ||
| 205 | if (unlikely(delta > max)) { | ||
| 206 | timekeeping_overflow_seen = 1; | ||
| 207 | delta = tkr->clock->max_cycles; | ||
| 208 | } | ||
| 209 | |||
| 210 | return delta; | ||
| 211 | } | ||
| 212 | #else | ||
| 213 | static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) | ||
| 214 | { | ||
| 215 | } | ||
| 216 | static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) | ||
| 217 | { | ||
| 218 | cycle_t cycle_now, delta; | ||
| 219 | |||
| 220 | /* read clocksource */ | ||
| 221 | cycle_now = tkr->read(tkr->clock); | ||
| 222 | |||
| 223 | /* calculate the delta since the last update_wall_time */ | ||
| 224 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
| 225 | |||
| 226 | return delta; | ||
| 227 | } | ||
| 228 | #endif | ||
| 229 | |||
| 121 | /** | 230 | /** | 
| 122 | * tk_setup_internals - Set up internals to use clocksource clock. | 231 | * tk_setup_internals - Set up internals to use clocksource clock. | 
| 123 | * | 232 | * | 
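The CONFIG_DEBUG_TIMEKEEPING block above flags a suspected clocksource underflow when the mask-relative delta is "slightly negative" (its complement under the mask is small) and caps any delta above max_cycles before it can overflow the later delta * mult. A standalone demonstration of both checks on a toy 32-bit counter:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cycle_t;

static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	return (now - last) & mask;
}

static cycle_t checked_delta(cycle_t now, cycle_t last, cycle_t mask,
			     cycle_t max_cycles)
{
	cycle_t delta = clocksource_delta(now, last, mask);

	/* A slightly "negative" delta wraps to a value just below the mask,
	 * so its complement under the mask is small: treat it as an
	 * underflow and throw the update away. */
	if ((~delta & mask) < (mask >> 3)) {
		printf("underflow suspected, delta ignored\n");
		return 0;
	}

	/* Cap runaway deltas so the later delta * mult cannot overflow. */
	if (delta > max_cycles) {
		printf("overflow risk, delta capped\n");
		return max_cycles;
	}
	return delta;
}

int main(void)
{
	cycle_t mask = 0xffffffffULL;		/* toy 32-bit clocksource */
	cycle_t max_cycles = 0x10000000ULL;	/* arbitrary example cap */

	checked_delta(100, 200, mask, max_cycles);		/* counter "went backwards" */
	checked_delta(0x30000000ULL, 0, mask, max_cycles);	/* suspiciously large gap */
	return 0;
}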
| @@ -135,11 +244,16 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 135 | u64 tmp, ntpinterval; | 244 | u64 tmp, ntpinterval; | 
| 136 | struct clocksource *old_clock; | 245 | struct clocksource *old_clock; | 
| 137 | 246 | ||
| 138 | old_clock = tk->tkr.clock; | 247 | old_clock = tk->tkr_mono.clock; | 
| 139 | tk->tkr.clock = clock; | 248 | tk->tkr_mono.clock = clock; | 
| 140 | tk->tkr.read = clock->read; | 249 | tk->tkr_mono.read = clock->read; | 
| 141 | tk->tkr.mask = clock->mask; | 250 | tk->tkr_mono.mask = clock->mask; | 
| 142 | tk->tkr.cycle_last = tk->tkr.read(clock); | 251 | tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); | 
| 252 | |||
| 253 | tk->tkr_raw.clock = clock; | ||
| 254 | tk->tkr_raw.read = clock->read; | ||
| 255 | tk->tkr_raw.mask = clock->mask; | ||
| 256 | tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; | ||
| 143 | 257 | ||
| 144 | /* Do the ns -> cycle conversion first, using original mult */ | 258 | /* Do the ns -> cycle conversion first, using original mult */ | 
| 145 | tmp = NTP_INTERVAL_LENGTH; | 259 | tmp = NTP_INTERVAL_LENGTH; | 
| @@ -163,11 +277,14 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 163 | if (old_clock) { | 277 | if (old_clock) { | 
| 164 | int shift_change = clock->shift - old_clock->shift; | 278 | int shift_change = clock->shift - old_clock->shift; | 
| 165 | if (shift_change < 0) | 279 | if (shift_change < 0) | 
| 166 | tk->tkr.xtime_nsec >>= -shift_change; | 280 | tk->tkr_mono.xtime_nsec >>= -shift_change; | 
| 167 | else | 281 | else | 
| 168 | tk->tkr.xtime_nsec <<= shift_change; | 282 | tk->tkr_mono.xtime_nsec <<= shift_change; | 
| 169 | } | 283 | } | 
| 170 | tk->tkr.shift = clock->shift; | 284 | tk->tkr_raw.xtime_nsec = 0; | 
| 285 | |||
| 286 | tk->tkr_mono.shift = clock->shift; | ||
| 287 | tk->tkr_raw.shift = clock->shift; | ||
| 171 | 288 | ||
| 172 | tk->ntp_error = 0; | 289 | tk->ntp_error = 0; | 
| 173 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 290 | tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; | 
| @@ -178,7 +295,8 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) | |||
| 178 | * active clocksource. These value will be adjusted via NTP | 295 | * active clocksource. These value will be adjusted via NTP | 
| 179 | * to counteract clock drifting. | 296 | * to counteract clock drifting. | 
| 180 | */ | 297 | */ | 
| 181 | tk->tkr.mult = clock->mult; | 298 | tk->tkr_mono.mult = clock->mult; | 
| 299 | tk->tkr_raw.mult = clock->mult; | ||
| 182 | tk->ntp_err_mult = 0; | 300 | tk->ntp_err_mult = 0; | 
| 183 | } | 301 | } | 
| 184 | 302 | ||
| @@ -193,14 +311,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } | |||
| 193 | 311 | ||
| 194 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 312 | static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | 
| 195 | { | 313 | { | 
| 196 | cycle_t cycle_now, delta; | 314 | cycle_t delta; | 
| 197 | s64 nsec; | 315 | s64 nsec; | 
| 198 | 316 | ||
| 199 | /* read clocksource: */ | 317 | delta = timekeeping_get_delta(tkr); | 
| 200 | cycle_now = tkr->read(tkr->clock); | ||
| 201 | |||
| 202 | /* calculate the delta since the last update_wall_time: */ | ||
| 203 | delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); | ||
| 204 | 318 | ||
| 205 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 319 | nsec = delta * tkr->mult + tkr->xtime_nsec; | 
| 206 | nsec >>= tkr->shift; | 320 | nsec >>= tkr->shift; | 
| @@ -209,25 +323,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) | |||
| 209 | return nsec + arch_gettimeoffset(); | 323 | return nsec + arch_gettimeoffset(); | 
| 210 | } | 324 | } | 
| 211 | 325 | ||
| 212 | static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | ||
| 213 | { | ||
| 214 | struct clocksource *clock = tk->tkr.clock; | ||
| 215 | cycle_t cycle_now, delta; | ||
| 216 | s64 nsec; | ||
| 217 | |||
| 218 | /* read clocksource: */ | ||
| 219 | cycle_now = tk->tkr.read(clock); | ||
| 220 | |||
| 221 | /* calculate the delta since the last update_wall_time: */ | ||
| 222 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | ||
| 223 | |||
| 224 | /* convert delta to nanoseconds. */ | ||
| 225 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | ||
| 226 | |||
| 227 | /* If arch requires, add in get_arch_timeoffset() */ | ||
| 228 | return nsec + arch_gettimeoffset(); | ||
| 229 | } | ||
| 230 | |||
| 231 | /** | 326 | /** | 
| 232 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 327 | * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. | 
| 233 | * @tkr: Timekeeping readout base from which we take the update | 328 | * @tkr: Timekeeping readout base from which we take the update | 
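With both readouts described by a struct tk_read_base, a single timekeeping_get_ns() now performs the cycles-to-nanoseconds conversion nsec = (delta * mult + xtime_nsec) >> shift for either base. A worked standalone version of that conversion, using a reduced local struct and illustrative scaling values:

#include <inttypes.h>
#include <stdio.h>

struct tk_read_base_model {
	uint64_t cycle_last;	/* counter value at the last update */
	uint64_t mask;		/* counter width mask */
	uint32_t mult;		/* cycles -> shifted-ns multiplier */
	uint32_t shift;		/* fixed-point shift */
	uint64_t xtime_nsec;	/* accumulated shifted nanoseconds */
};

static int64_t get_ns(const struct tk_read_base_model *tkr, uint64_t now)
{
	uint64_t delta = (now - tkr->cycle_last) & tkr->mask;

	/* nsec = (delta * mult + xtime_nsec) >> shift */
	return (int64_t)((delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift);
}

int main(void)
{
	/* Example scaling: mult / 2^shift = 1000, i.e. a 1 MHz counter. */
	struct tk_read_base_model tkr = {
		.cycle_last = 1000, .mask = 0xffffffffULL,
		.mult = 1000 << 8, .shift = 8, .xtime_nsec = 0,
	};

	printf("%" PRId64 " ns since the last update\n", get_ns(&tkr, 1500));
	return 0;
}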
| @@ -267,18 +362,18 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) | |||
| 267 | * slightly wrong timestamp (a few nanoseconds). See | 362 | * slightly wrong timestamp (a few nanoseconds). See | 
| 268 | * @ktime_get_mono_fast_ns. | 363 | * @ktime_get_mono_fast_ns. | 
| 269 | */ | 364 | */ | 
| 270 | static void update_fast_timekeeper(struct tk_read_base *tkr) | 365 | static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) | 
| 271 | { | 366 | { | 
| 272 | struct tk_read_base *base = tk_fast_mono.base; | 367 | struct tk_read_base *base = tkf->base; | 
| 273 | 368 | ||
| 274 | /* Force readers off to base[1] */ | 369 | /* Force readers off to base[1] */ | 
| 275 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 370 | raw_write_seqcount_latch(&tkf->seq); | 
| 276 | 371 | ||
| 277 | /* Update base[0] */ | 372 | /* Update base[0] */ | 
| 278 | memcpy(base, tkr, sizeof(*base)); | 373 | memcpy(base, tkr, sizeof(*base)); | 
| 279 | 374 | ||
| 280 | /* Force readers back to base[0] */ | 375 | /* Force readers back to base[0] */ | 
| 281 | raw_write_seqcount_latch(&tk_fast_mono.seq); | 376 | raw_write_seqcount_latch(&tkf->seq); | 
| 282 | 377 | ||
| 283 | /* Update base[1] */ | 378 | /* Update base[1] */ | 
| 284 | memcpy(base + 1, base, sizeof(*base)); | 379 | memcpy(base + 1, base, sizeof(*base)); | 
| @@ -316,22 +411,33 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) | |||
| 316 | * of the following timestamps. Callers need to be aware of that and | 411 | * of the following timestamps. Callers need to be aware of that and | 
| 317 | * deal with it. | 412 | * deal with it. | 
| 318 | */ | 413 | */ | 
| 319 | u64 notrace ktime_get_mono_fast_ns(void) | 414 | static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) | 
| 320 | { | 415 | { | 
| 321 | struct tk_read_base *tkr; | 416 | struct tk_read_base *tkr; | 
| 322 | unsigned int seq; | 417 | unsigned int seq; | 
| 323 | u64 now; | 418 | u64 now; | 
| 324 | 419 | ||
| 325 | do { | 420 | do { | 
| 326 | seq = raw_read_seqcount(&tk_fast_mono.seq); | 421 | seq = raw_read_seqcount(&tkf->seq); | 
| 327 | tkr = tk_fast_mono.base + (seq & 0x01); | 422 | tkr = tkf->base + (seq & 0x01); | 
| 328 | now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); | 423 | now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); | 
| 424 | } while (read_seqcount_retry(&tkf->seq, seq)); | ||
| 329 | 425 | ||
| 330 | } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); | ||
| 331 | return now; | 426 | return now; | 
| 332 | } | 427 | } | 
| 428 | |||
| 429 | u64 ktime_get_mono_fast_ns(void) | ||
| 430 | { | ||
| 431 | return __ktime_get_fast_ns(&tk_fast_mono); | ||
| 432 | } | ||
| 333 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 433 | EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); | 
| 334 | 434 | ||
| 435 | u64 ktime_get_raw_fast_ns(void) | ||
| 436 | { | ||
| 437 | return __ktime_get_fast_ns(&tk_fast_raw); | ||
| 438 | } | ||
| 439 | EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); | ||
| 440 | |||
| 335 | /* Suspend-time cycles value for halted fast timekeeper. */ | 441 | /* Suspend-time cycles value for halted fast timekeeper. */ | 
| 336 | static cycle_t cycles_at_suspend; | 442 | static cycle_t cycles_at_suspend; | 
| 337 | 443 | ||
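__ktime_get_fast_ns() reads whichever of the two tk_read_base copies the latch sequence points at (seq & 0x01) and retries if the writer ran in between, which is what makes the fast path usable from NMI context. A standalone sketch of the latch indexing follows; plain variables stand in for the kernel's seqcount primitives, so this toy only shows the logic and is not itself NMI- or SMP-safe.

#include <inttypes.h>
#include <stdio.h>
#include <string.h>

struct base_model { uint64_t offset_ns; };

static struct base_model bases[2];
static volatile unsigned int seq;

static void latch_update(const struct base_model *newb)
{
	seq++;					/* readers move to bases[1] */
	memcpy(&bases[0], newb, sizeof(*newb));
	seq++;					/* readers move back to bases[0] */
	memcpy(&bases[1], &bases[0], sizeof(bases[0]));
}

static uint64_t latch_read(uint64_t raw_counter_ns)
{
	unsigned int s;
	uint64_t now;

	do {
		s = seq;
		now = bases[s & 1].offset_ns + raw_counter_ns;
	} while (s != seq);			/* retry if the writer ran */

	return now;
}

int main(void)
{
	struct base_model b = { .offset_ns = 1000000 };

	latch_update(&b);
	printf("%" PRIu64 " ns\n", latch_read(2500));
	return 0;
}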
| @@ -353,12 +459,17 @@ static cycle_t dummy_clock_read(struct clocksource *cs) | |||
| 353 | static void halt_fast_timekeeper(struct timekeeper *tk) | 459 | static void halt_fast_timekeeper(struct timekeeper *tk) | 
| 354 | { | 460 | { | 
| 355 | static struct tk_read_base tkr_dummy; | 461 | static struct tk_read_base tkr_dummy; | 
| 356 | struct tk_read_base *tkr = &tk->tkr; | 462 | struct tk_read_base *tkr = &tk->tkr_mono; | 
| 357 | 463 | ||
| 358 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 464 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | 
| 359 | cycles_at_suspend = tkr->read(tkr->clock); | 465 | cycles_at_suspend = tkr->read(tkr->clock); | 
| 360 | tkr_dummy.read = dummy_clock_read; | 466 | tkr_dummy.read = dummy_clock_read; | 
| 361 | update_fast_timekeeper(&tkr_dummy); | 467 | update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); | 
| 468 | |||
| 469 | tkr = &tk->tkr_raw; | ||
| 470 | memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); | ||
| 471 | tkr_dummy.read = dummy_clock_read; | ||
| 472 | update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); | ||
| 362 | } | 473 | } | 
| 363 | 474 | ||
| 364 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 475 | #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD | 
| @@ -369,8 +480,8 @@ static inline void update_vsyscall(struct timekeeper *tk) | |||
| 369 | 480 | ||
| 370 | xt = timespec64_to_timespec(tk_xtime(tk)); | 481 | xt = timespec64_to_timespec(tk_xtime(tk)); | 
| 371 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 482 | wm = timespec64_to_timespec(tk->wall_to_monotonic); | 
| 372 | update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, | 483 | update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, | 
| 373 | tk->tkr.cycle_last); | 484 | tk->tkr_mono.cycle_last); | 
| 374 | } | 485 | } | 
| 375 | 486 | ||
| 376 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 487 | static inline void old_vsyscall_fixup(struct timekeeper *tk) | 
| @@ -387,11 +498,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) | |||
| 387 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 498 | * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD | 
| 388 | * users are removed, this can be killed. | 499 | * users are removed, this can be killed. | 
| 389 | */ | 500 | */ | 
| 390 | remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); | 501 | remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); | 
| 391 | tk->tkr.xtime_nsec -= remainder; | 502 | tk->tkr_mono.xtime_nsec -= remainder; | 
| 392 | tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; | 503 | tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; | 
| 393 | tk->ntp_error += remainder << tk->ntp_error_shift; | 504 | tk->ntp_error += remainder << tk->ntp_error_shift; | 
| 394 | tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; | 505 | tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; | 
| 395 | } | 506 | } | 
| 396 | #else | 507 | #else | 
| 397 | #define old_vsyscall_fixup(tk) | 508 | #define old_vsyscall_fixup(tk) | 
| @@ -456,17 +567,17 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) | |||
| 456 | */ | 567 | */ | 
| 457 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 568 | seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); | 
| 458 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 569 | nsec = (u32) tk->wall_to_monotonic.tv_nsec; | 
| 459 | tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 570 | tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); | 
| 460 | 571 | ||
| 461 | /* Update the monotonic raw base */ | 572 | /* Update the monotonic raw base */ | 
| 462 | tk->base_raw = timespec64_to_ktime(tk->raw_time); | 573 | tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); | 
| 463 | 574 | ||
| 464 | /* | 575 | /* | 
| 465 | * The sum of the nanoseconds portions of xtime and | 576 | * The sum of the nanoseconds portions of xtime and | 
| 466 | * wall_to_monotonic can be greater/equal one second. Take | 577 | * wall_to_monotonic can be greater/equal one second. Take | 
| 467 | * this into account before updating tk->ktime_sec. | 578 | * this into account before updating tk->ktime_sec. | 
| 468 | */ | 579 | */ | 
| 469 | nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); | 580 | nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); | 
| 470 | if (nsec >= NSEC_PER_SEC) | 581 | if (nsec >= NSEC_PER_SEC) | 
| 471 | seconds++; | 582 | seconds++; | 
| 472 | tk->ktime_sec = seconds; | 583 | tk->ktime_sec = seconds; | 
| @@ -489,7 +600,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 489 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 600 | memcpy(&shadow_timekeeper, &tk_core.timekeeper, | 
| 490 | sizeof(tk_core.timekeeper)); | 601 | sizeof(tk_core.timekeeper)); | 
| 491 | 602 | ||
| 492 | update_fast_timekeeper(&tk->tkr); | 603 | update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); | 
| 604 | update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); | ||
| 493 | } | 605 | } | 
| 494 | 606 | ||
| 495 | /** | 607 | /** | 
| @@ -501,22 +613,23 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) | |||
| 501 | */ | 613 | */ | 
| 502 | static void timekeeping_forward_now(struct timekeeper *tk) | 614 | static void timekeeping_forward_now(struct timekeeper *tk) | 
| 503 | { | 615 | { | 
| 504 | struct clocksource *clock = tk->tkr.clock; | 616 | struct clocksource *clock = tk->tkr_mono.clock; | 
| 505 | cycle_t cycle_now, delta; | 617 | cycle_t cycle_now, delta; | 
| 506 | s64 nsec; | 618 | s64 nsec; | 
| 507 | 619 | ||
| 508 | cycle_now = tk->tkr.read(clock); | 620 | cycle_now = tk->tkr_mono.read(clock); | 
| 509 | delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); | 621 | delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); | 
| 510 | tk->tkr.cycle_last = cycle_now; | 622 | tk->tkr_mono.cycle_last = cycle_now; | 
| 623 | tk->tkr_raw.cycle_last = cycle_now; | ||
| 511 | 624 | ||
| 512 | tk->tkr.xtime_nsec += delta * tk->tkr.mult; | 625 | tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; | 
| 513 | 626 | ||
| 514 | /* If arch requires, add in get_arch_timeoffset() */ | 627 | /* If arch requires, add in get_arch_timeoffset() */ | 
| 515 | tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; | 628 | tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; | 
| 516 | 629 | ||
| 517 | tk_normalize_xtime(tk); | 630 | tk_normalize_xtime(tk); | 
| 518 | 631 | ||
| 519 | nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); | 632 | nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); | 
| 520 | timespec64_add_ns(&tk->raw_time, nsec); | 633 | timespec64_add_ns(&tk->raw_time, nsec); | 
| 521 | } | 634 | } | 
| 522 | 635 | ||
| @@ -537,7 +650,7 @@ int __getnstimeofday64(struct timespec64 *ts) | |||
| 537 | seq = read_seqcount_begin(&tk_core.seq); | 650 | seq = read_seqcount_begin(&tk_core.seq); | 
| 538 | 651 | ||
| 539 | ts->tv_sec = tk->xtime_sec; | 652 | ts->tv_sec = tk->xtime_sec; | 
| 540 | nsecs = timekeeping_get_ns(&tk->tkr); | 653 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 541 | 654 | ||
| 542 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 655 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 543 | 656 | ||
| @@ -577,8 +690,8 @@ ktime_t ktime_get(void) | |||
| 577 | 690 | ||
| 578 | do { | 691 | do { | 
| 579 | seq = read_seqcount_begin(&tk_core.seq); | 692 | seq = read_seqcount_begin(&tk_core.seq); | 
| 580 | base = tk->tkr.base_mono; | 693 | base = tk->tkr_mono.base; | 
| 581 | nsecs = timekeeping_get_ns(&tk->tkr); | 694 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 582 | 695 | ||
| 583 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 696 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 584 | 697 | ||
| @@ -603,8 +716,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) | |||
| 603 | 716 | ||
| 604 | do { | 717 | do { | 
| 605 | seq = read_seqcount_begin(&tk_core.seq); | 718 | seq = read_seqcount_begin(&tk_core.seq); | 
| 606 | base = ktime_add(tk->tkr.base_mono, *offset); | 719 | base = ktime_add(tk->tkr_mono.base, *offset); | 
| 607 | nsecs = timekeeping_get_ns(&tk->tkr); | 720 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 608 | 721 | ||
| 609 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 722 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 610 | 723 | ||
| @@ -645,8 +758,8 @@ ktime_t ktime_get_raw(void) | |||
| 645 | 758 | ||
| 646 | do { | 759 | do { | 
| 647 | seq = read_seqcount_begin(&tk_core.seq); | 760 | seq = read_seqcount_begin(&tk_core.seq); | 
| 648 | base = tk->base_raw; | 761 | base = tk->tkr_raw.base; | 
| 649 | nsecs = timekeeping_get_ns_raw(tk); | 762 | nsecs = timekeeping_get_ns(&tk->tkr_raw); | 
| 650 | 763 | ||
| 651 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 764 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 652 | 765 | ||
| @@ -674,7 +787,7 @@ void ktime_get_ts64(struct timespec64 *ts) | |||
| 674 | do { | 787 | do { | 
| 675 | seq = read_seqcount_begin(&tk_core.seq); | 788 | seq = read_seqcount_begin(&tk_core.seq); | 
| 676 | ts->tv_sec = tk->xtime_sec; | 789 | ts->tv_sec = tk->xtime_sec; | 
| 677 | nsec = timekeeping_get_ns(&tk->tkr); | 790 | nsec = timekeeping_get_ns(&tk->tkr_mono); | 
| 678 | tomono = tk->wall_to_monotonic; | 791 | tomono = tk->wall_to_monotonic; | 
| 679 | 792 | ||
| 680 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 793 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| @@ -759,8 +872,8 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) | |||
| 759 | ts_real->tv_sec = tk->xtime_sec; | 872 | ts_real->tv_sec = tk->xtime_sec; | 
| 760 | ts_real->tv_nsec = 0; | 873 | ts_real->tv_nsec = 0; | 
| 761 | 874 | ||
| 762 | nsecs_raw = timekeeping_get_ns_raw(tk); | 875 | nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); | 
| 763 | nsecs_real = timekeeping_get_ns(&tk->tkr); | 876 | nsecs_real = timekeeping_get_ns(&tk->tkr_mono); | 
| 764 | 877 | ||
| 765 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 878 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 766 | 879 | ||
| @@ -943,7 +1056,7 @@ static int change_clocksource(void *data) | |||
| 943 | */ | 1056 | */ | 
| 944 | if (try_module_get(new->owner)) { | 1057 | if (try_module_get(new->owner)) { | 
| 945 | if (!new->enable || new->enable(new) == 0) { | 1058 | if (!new->enable || new->enable(new) == 0) { | 
| 946 | old = tk->tkr.clock; | 1059 | old = tk->tkr_mono.clock; | 
| 947 | tk_setup_internals(tk, new); | 1060 | tk_setup_internals(tk, new); | 
| 948 | if (old->disable) | 1061 | if (old->disable) | 
| 949 | old->disable(old); | 1062 | old->disable(old); | 
| @@ -971,11 +1084,11 @@ int timekeeping_notify(struct clocksource *clock) | |||
| 971 | { | 1084 | { | 
| 972 | struct timekeeper *tk = &tk_core.timekeeper; | 1085 | struct timekeeper *tk = &tk_core.timekeeper; | 
| 973 | 1086 | ||
| 974 | if (tk->tkr.clock == clock) | 1087 | if (tk->tkr_mono.clock == clock) | 
| 975 | return 0; | 1088 | return 0; | 
| 976 | stop_machine(change_clocksource, clock, NULL); | 1089 | stop_machine(change_clocksource, clock, NULL); | 
| 977 | tick_clock_notify(); | 1090 | tick_clock_notify(); | 
| 978 | return tk->tkr.clock == clock ? 0 : -1; | 1091 | return tk->tkr_mono.clock == clock ? 0 : -1; | 
| 979 | } | 1092 | } | 
| 980 | 1093 | ||
| 981 | /** | 1094 | /** | 
| @@ -993,7 +1106,7 @@ void getrawmonotonic64(struct timespec64 *ts) | |||
| 993 | 1106 | ||
| 994 | do { | 1107 | do { | 
| 995 | seq = read_seqcount_begin(&tk_core.seq); | 1108 | seq = read_seqcount_begin(&tk_core.seq); | 
| 996 | nsecs = timekeeping_get_ns_raw(tk); | 1109 | nsecs = timekeeping_get_ns(&tk->tkr_raw); | 
| 997 | ts64 = tk->raw_time; | 1110 | ts64 = tk->raw_time; | 
| 998 | 1111 | ||
| 999 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1112 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| @@ -1016,7 +1129,7 @@ int timekeeping_valid_for_hres(void) | |||
| 1016 | do { | 1129 | do { | 
| 1017 | seq = read_seqcount_begin(&tk_core.seq); | 1130 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1018 | 1131 | ||
| 1019 | ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 1132 | ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; | 
| 1020 | 1133 | ||
| 1021 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1134 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 1022 | 1135 | ||
| @@ -1035,7 +1148,7 @@ u64 timekeeping_max_deferment(void) | |||
| 1035 | do { | 1148 | do { | 
| 1036 | seq = read_seqcount_begin(&tk_core.seq); | 1149 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1037 | 1150 | ||
| 1038 | ret = tk->tkr.clock->max_idle_ns; | 1151 | ret = tk->tkr_mono.clock->max_idle_ns; | 
| 1039 | 1152 | ||
| 1040 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 1153 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 
| 1041 | 1154 | ||
| @@ -1057,6 +1170,14 @@ void __weak read_persistent_clock(struct timespec *ts) | |||
| 1057 | ts->tv_nsec = 0; | 1170 | ts->tv_nsec = 0; | 
| 1058 | } | 1171 | } | 
| 1059 | 1172 | ||
| 1173 | void __weak read_persistent_clock64(struct timespec64 *ts64) | ||
| 1174 | { | ||
| 1175 | struct timespec ts; | ||
| 1176 | |||
| 1177 | read_persistent_clock(&ts); | ||
| 1178 | *ts64 = timespec_to_timespec64(ts); | ||
| 1179 | } | ||
| 1180 | |||
| 1060 | /** | 1181 | /** | 
| 1061 | * read_boot_clock - Return time of the system start. | 1182 | * read_boot_clock - Return time of the system start. | 
| 1062 | * | 1183 | * | 
| @@ -1072,6 +1193,20 @@ void __weak read_boot_clock(struct timespec *ts) | |||
| 1072 | ts->tv_nsec = 0; | 1193 | ts->tv_nsec = 0; | 
| 1073 | } | 1194 | } | 
| 1074 | 1195 | ||
| 1196 | void __weak read_boot_clock64(struct timespec64 *ts64) | ||
| 1197 | { | ||
| 1198 | struct timespec ts; | ||
| 1199 | |||
| 1200 | read_boot_clock(&ts); | ||
| 1201 | *ts64 = timespec_to_timespec64(ts); | ||
| 1202 | } | ||
| 1203 | |||
| 1204 | /* Flag for if timekeeping_resume() has injected sleeptime */ | ||
| 1205 | static bool sleeptime_injected; | ||
| 1206 | |||
| 1207 | /* Flag for if there is a persistent clock on this platform */ | ||
| 1208 | static bool persistent_clock_exists; | ||
| 1209 | |||
| 1075 | /* | 1210 | /* | 
| 1076 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 1211 | * timekeeping_init - Initializes the clocksource and common timekeeping values | 
| 1077 | */ | 1212 | */ | 
| @@ -1081,20 +1216,17 @@ void __init timekeeping_init(void) | |||
| 1081 | struct clocksource *clock; | 1216 | struct clocksource *clock; | 
| 1082 | unsigned long flags; | 1217 | unsigned long flags; | 
| 1083 | struct timespec64 now, boot, tmp; | 1218 | struct timespec64 now, boot, tmp; | 
| 1084 | struct timespec ts; | ||
| 1085 | 1219 | ||
| 1086 | read_persistent_clock(&ts); | 1220 | read_persistent_clock64(&now); | 
| 1087 | now = timespec_to_timespec64(ts); | ||
| 1088 | if (!timespec64_valid_strict(&now)) { | 1221 | if (!timespec64_valid_strict(&now)) { | 
| 1089 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 1222 | pr_warn("WARNING: Persistent clock returned invalid value!\n" | 
| 1090 | " Check your CMOS/BIOS settings.\n"); | 1223 | " Check your CMOS/BIOS settings.\n"); | 
| 1091 | now.tv_sec = 0; | 1224 | now.tv_sec = 0; | 
| 1092 | now.tv_nsec = 0; | 1225 | now.tv_nsec = 0; | 
| 1093 | } else if (now.tv_sec || now.tv_nsec) | 1226 | } else if (now.tv_sec || now.tv_nsec) | 
| 1094 | persistent_clock_exist = true; | 1227 | persistent_clock_exists = true; | 
| 1095 | 1228 | ||
| 1096 | read_boot_clock(&ts); | 1229 | read_boot_clock64(&boot); | 
| 1097 | boot = timespec_to_timespec64(ts); | ||
| 1098 | if (!timespec64_valid_strict(&boot)) { | 1230 | if (!timespec64_valid_strict(&boot)) { | 
| 1099 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 1231 | pr_warn("WARNING: Boot clock returned invalid value!\n" | 
| 1100 | " Check your CMOS/BIOS settings.\n"); | 1232 | " Check your CMOS/BIOS settings.\n"); | 
| @@ -1114,7 +1246,6 @@ void __init timekeeping_init(void) | |||
| 1114 | tk_set_xtime(tk, &now); | 1246 | tk_set_xtime(tk, &now); | 
| 1115 | tk->raw_time.tv_sec = 0; | 1247 | tk->raw_time.tv_sec = 0; | 
| 1116 | tk->raw_time.tv_nsec = 0; | 1248 | tk->raw_time.tv_nsec = 0; | 
| 1117 | tk->base_raw.tv64 = 0; | ||
| 1118 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 1249 | if (boot.tv_sec == 0 && boot.tv_nsec == 0) | 
| 1119 | boot = tk_xtime(tk); | 1250 | boot = tk_xtime(tk); | 
| 1120 | 1251 | ||
| @@ -1127,7 +1258,7 @@ void __init timekeeping_init(void) | |||
| 1127 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1258 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 
| 1128 | } | 1259 | } | 
| 1129 | 1260 | ||
| 1130 | /* time in seconds when suspend began */ | 1261 | /* time in seconds when suspend began for persistent clock */ | 
| 1131 | static struct timespec64 timekeeping_suspend_time; | 1262 | static struct timespec64 timekeeping_suspend_time; | 
| 1132 | 1263 | ||
| 1133 | /** | 1264 | /** | 
| @@ -1152,12 +1283,49 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, | |||
| 1152 | tk_debug_account_sleep_time(delta); | 1283 | tk_debug_account_sleep_time(delta); | 
| 1153 | } | 1284 | } | 
| 1154 | 1285 | ||
| 1286 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) | ||
| 1287 | /** | ||
| 1288 | * We have three kinds of time sources to use for sleep time | ||
| 1289 | * injection, the preference order is: | ||
| 1290 | * 1) non-stop clocksource | ||
| 1291 | * 2) persistent clock (ie: RTC accessible when irqs are off) | ||
| 1292 | * 3) RTC | ||
| 1293 | * | ||
| 1294 | * 1) and 2) are used by timekeeping, 3) by RTC subsystem. | ||
| 1295 | * If system has neither 1) nor 2), 3) will be used finally. | ||
| 1296 | * | ||
| 1297 | * | ||
| 1298 | * If timekeeping has injected sleeptime via either 1) or 2), | ||
| 1299 | * 3) becomes needless, so in this case we don't need to call | ||
| 1300 | * rtc_resume(), and this is what timekeeping_rtc_skipresume() | ||
| 1301 | * means. | ||
| 1302 | */ | ||
| 1303 | bool timekeeping_rtc_skipresume(void) | ||
| 1304 | { | ||
| 1305 | return sleeptime_injected; | ||
| 1306 | } | ||
| 1307 | |||
| 1308 | /** | ||
| 1309 | * Whether 1) can be used is only known in timekeeping_resume(), | ||
| 1310 | * which is invoked after rtc_suspend(), so we cannot reliably | ||
| 1311 | * skip rtc_suspend() if the system has 1). | ||
| 1312 | * | ||
| 1313 | * But if system has 2), 2) will definitely be used, so in this | ||
| 1314 | * case we don't need to call rtc_suspend(), and this is what | ||
| 1315 | * timekeeping_rtc_skipsuspend() means. | ||
| 1316 | */ | ||
| 1317 | bool timekeeping_rtc_skipsuspend(void) | ||
| 1318 | { | ||
| 1319 | return persistent_clock_exists; | ||
| 1320 | } | ||
| 1321 | |||
| 1155 | /** | 1322 | /** | 
| 1156 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 1323 | * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values | 
| 1157 | * @delta: pointer to a timespec64 delta value | 1324 | * @delta: pointer to a timespec64 delta value | 
| 1158 | * | 1325 | * | 
| 1159 | * This hook is for architectures that cannot support read_persistent_clock | 1326 | * This hook is for architectures that cannot support read_persistent_clock64 | 
| 1160 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 1327 | * because their RTC/persistent clock is only accessible when irqs are enabled. | 
| 1328 | * and also don't have an effective nonstop clocksource. | ||
| 1161 | * | 1329 | * | 
| 1162 | * This function should only be called by rtc_resume(), and allows | 1330 | * This function should only be called by rtc_resume(), and allows | 
| 1163 | * a suspend offset to be injected into the timekeeping values. | 1331 | * a suspend offset to be injected into the timekeeping values. | 
| @@ -1167,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
| 1167 | struct timekeeper *tk = &tk_core.timekeeper; | 1335 | struct timekeeper *tk = &tk_core.timekeeper; | 
| 1168 | unsigned long flags; | 1336 | unsigned long flags; | 
| 1169 | 1337 | ||
| 1170 | /* | ||
| 1171 | * Make sure we don't set the clock twice, as timekeeping_resume() | ||
| 1172 | * already did it | ||
| 1173 | */ | ||
| 1174 | if (has_persistent_clock()) | ||
| 1175 | return; | ||
| 1176 | |||
| 1177 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1338 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 
| 1178 | write_seqcount_begin(&tk_core.seq); | 1339 | write_seqcount_begin(&tk_core.seq); | 
| 1179 | 1340 | ||
| @@ -1189,26 +1350,21 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) | |||
| 1189 | /* signal hrtimers about time change */ | 1350 | /* signal hrtimers about time change */ | 
| 1190 | clock_was_set(); | 1351 | clock_was_set(); | 
| 1191 | } | 1352 | } | 
| 1353 | #endif | ||
| 1192 | 1354 | ||
| 1193 | /** | 1355 | /** | 
| 1194 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 1356 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 
| 1195 | * | ||
| 1196 | * This is for the generic clocksource timekeeping. | ||
| 1197 | * xtime/wall_to_monotonic/jiffies/etc are | ||
| 1198 | * still managed by arch specific suspend/resume code. | ||
| 1199 | */ | 1357 | */ | 
| 1200 | void timekeeping_resume(void) | 1358 | void timekeeping_resume(void) | 
| 1201 | { | 1359 | { | 
| 1202 | struct timekeeper *tk = &tk_core.timekeeper; | 1360 | struct timekeeper *tk = &tk_core.timekeeper; | 
| 1203 | struct clocksource *clock = tk->tkr.clock; | 1361 | struct clocksource *clock = tk->tkr_mono.clock; | 
| 1204 | unsigned long flags; | 1362 | unsigned long flags; | 
| 1205 | struct timespec64 ts_new, ts_delta; | 1363 | struct timespec64 ts_new, ts_delta; | 
| 1206 | struct timespec tmp; | ||
| 1207 | cycle_t cycle_now, cycle_delta; | 1364 | cycle_t cycle_now, cycle_delta; | 
| 1208 | bool suspendtime_found = false; | ||
| 1209 | 1365 | ||
| 1210 | read_persistent_clock(&tmp); | 1366 | sleeptime_injected = false; | 
| 1211 | ts_new = timespec_to_timespec64(tmp); | 1367 | read_persistent_clock64(&ts_new); | 
| 1212 | 1368 | ||
| 1213 | clockevents_resume(); | 1369 | clockevents_resume(); | 
| 1214 | clocksource_resume(); | 1370 | clocksource_resume(); | 
| @@ -1228,16 +1384,16 @@ void timekeeping_resume(void) | |||
| 1228 | * The less preferred source will only be tried if there is no better | 1384 | * The less preferred source will only be tried if there is no better | 
| 1229 | * usable source. The rtc part is handled separately in rtc core code. | 1385 | * usable source. The rtc part is handled separately in rtc core code. | 
| 1230 | */ | 1386 | */ | 
| 1231 | cycle_now = tk->tkr.read(clock); | 1387 | cycle_now = tk->tkr_mono.read(clock); | 
| 1232 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 1388 | if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && | 
| 1233 | cycle_now > tk->tkr.cycle_last) { | 1389 | cycle_now > tk->tkr_mono.cycle_last) { | 
| 1234 | u64 num, max = ULLONG_MAX; | 1390 | u64 num, max = ULLONG_MAX; | 
| 1235 | u32 mult = clock->mult; | 1391 | u32 mult = clock->mult; | 
| 1236 | u32 shift = clock->shift; | 1392 | u32 shift = clock->shift; | 
| 1237 | s64 nsec = 0; | 1393 | s64 nsec = 0; | 
| 1238 | 1394 | ||
| 1239 | cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, | 1395 | cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, | 
| 1240 | tk->tkr.mask); | 1396 | tk->tkr_mono.mask); | 
| 1241 | 1397 | ||
| 1242 | /* | 1398 | /* | 
| 1243 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 1399 | * "cycle_delta * mutl" may cause 64 bits overflow, if the | 
| @@ -1253,17 +1409,19 @@ void timekeeping_resume(void) | |||
| 1253 | nsec += ((u64) cycle_delta * mult) >> shift; | 1409 | nsec += ((u64) cycle_delta * mult) >> shift; | 
| 1254 | 1410 | ||
| 1255 | ts_delta = ns_to_timespec64(nsec); | 1411 | ts_delta = ns_to_timespec64(nsec); | 
| 1256 | suspendtime_found = true; | 1412 | sleeptime_injected = true; | 
| 1257 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 1413 | } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { | 
| 1258 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 1414 | ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); | 
| 1259 | suspendtime_found = true; | 1415 | sleeptime_injected = true; | 
| 1260 | } | 1416 | } | 
| 1261 | 1417 | ||
| 1262 | if (suspendtime_found) | 1418 | if (sleeptime_injected) | 
| 1263 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 1419 | __timekeeping_inject_sleeptime(tk, &ts_delta); | 
| 1264 | 1420 | ||
| 1265 | /* Re-base the last cycle value */ | 1421 | /* Re-base the last cycle value */ | 
| 1266 | tk->tkr.cycle_last = cycle_now; | 1422 | tk->tkr_mono.cycle_last = cycle_now; | 
| 1423 | tk->tkr_raw.cycle_last = cycle_now; | ||
| 1424 | |||
| 1267 | tk->ntp_error = 0; | 1425 | tk->ntp_error = 0; | 
| 1268 | timekeeping_suspended = 0; | 1426 | timekeeping_suspended = 0; | 
| 1269 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 1427 | timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); | 
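When a nonstop clocksource is available, timekeeping_resume() reconstructs the sleep length from the raw cycle delta, taking care (per the comment above) that cycle_delta * mult cannot overflow 64 bits. The sketch below shows one generic way to do that conversion in overflow-safe chunks; it is an illustration, not necessarily the kernel's exact arithmetic.

#include <inttypes.h>
#include <stdio.h>

static uint64_t cycles_to_ns(uint64_t delta, uint32_t mult, uint32_t shift)
{
	/* Largest chunk whose product with mult is guaranteed to fit in 64 bits. */
	uint64_t chunk = UINT64_MAX / mult;
	uint64_t ns = 0;

	while (delta > chunk) {
		ns += (chunk * mult) >> shift;
		delta -= chunk;
	}
	return ns + ((delta * mult) >> shift);
}

int main(void)
{
	/* One hour asleep on a 1 MHz counter (mult / 2^shift = 1000). */
	printf("slept for %" PRIu64 " ns\n",
	       cycles_to_ns(3600ULL * 1000000, 1000 << 8, 8));
	return 0;
}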
| @@ -1272,9 +1430,7 @@ void timekeeping_resume(void) | |||
| 1272 | 1430 | ||
| 1273 | touch_softlockup_watchdog(); | 1431 | touch_softlockup_watchdog(); | 
| 1274 | 1432 | ||
| 1275 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 1433 | tick_resume(); | 
| 1276 | |||
| 1277 | /* Resume hrtimers */ | ||
| 1278 | hrtimers_resume(); | 1434 | hrtimers_resume(); | 
| 1279 | } | 1435 | } | 
| 1280 | 1436 | ||
| @@ -1284,10 +1440,8 @@ int timekeeping_suspend(void) | |||
| 1284 | unsigned long flags; | 1440 | unsigned long flags; | 
| 1285 | struct timespec64 delta, delta_delta; | 1441 | struct timespec64 delta, delta_delta; | 
| 1286 | static struct timespec64 old_delta; | 1442 | static struct timespec64 old_delta; | 
| 1287 | struct timespec tmp; | ||
| 1288 | 1443 | ||
| 1289 | read_persistent_clock(&tmp); | 1444 | read_persistent_clock64(&timekeeping_suspend_time); | 
| 1290 | timekeeping_suspend_time = timespec_to_timespec64(tmp); | ||
| 1291 | 1445 | ||
| 1292 | /* | 1446 | /* | 
| 1293 | * On some systems the persistent_clock can not be detected at | 1447 | * On some systems the persistent_clock can not be detected at | 
| @@ -1295,31 +1449,33 @@ int timekeeping_suspend(void) | |||
| 1295 | * value returned, update the persistent_clock_exists flag. | 1449 | * value returned, update the persistent_clock_exists flag. | 
| 1296 | */ | 1450 | */ | 
| 1297 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 1451 | if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) | 
| 1298 | persistent_clock_exist = true; | 1452 | persistent_clock_exists = true; | 
| 1299 | 1453 | ||
| 1300 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 1454 | raw_spin_lock_irqsave(&timekeeper_lock, flags); | 
| 1301 | write_seqcount_begin(&tk_core.seq); | 1455 | write_seqcount_begin(&tk_core.seq); | 
| 1302 | timekeeping_forward_now(tk); | 1456 | timekeeping_forward_now(tk); | 
| 1303 | timekeeping_suspended = 1; | 1457 | timekeeping_suspended = 1; | 
| 1304 | 1458 | ||
| 1305 | /* | 1459 | if (persistent_clock_exists) { | 
| 1306 | * To avoid drift caused by repeated suspend/resumes, | ||
| 1307 | * which each can add ~1 second drift error, | ||
| 1308 | * try to compensate so the difference in system time | ||
| 1309 | * and persistent_clock time stays close to constant. | ||
| 1310 | */ | ||
| 1311 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | ||
| 1312 | delta_delta = timespec64_sub(delta, old_delta); | ||
| 1313 | if (abs(delta_delta.tv_sec) >= 2) { | ||
| 1314 | /* | 1460 | /* | 
| 1315 | * if delta_delta is too large, assume time correction | 1461 | * To avoid drift caused by repeated suspend/resumes, | 
| 1316 | * has occured and set old_delta to the current delta. | 1462 | * which each can add ~1 second drift error, | 
| 1463 | * try to compensate so the difference in system time | ||
| 1464 | * and persistent_clock time stays close to constant. | ||
| 1317 | */ | 1465 | */ | 
| 1318 | old_delta = delta; | 1466 | delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); | 
| 1319 | } else { | 1467 | delta_delta = timespec64_sub(delta, old_delta); | 
| 1320 | /* Otherwise try to adjust old_system to compensate */ | 1468 | if (abs(delta_delta.tv_sec) >= 2) { | 
| 1321 | timekeeping_suspend_time = | 1469 | /* | 
| 1322 | timespec64_add(timekeeping_suspend_time, delta_delta); | 1470 | * if delta_delta is too large, assume time correction | 
| 1471 | * has occurred and set old_delta to the current delta. | ||
| 1472 | */ | ||
| 1473 | old_delta = delta; | ||
| 1474 | } else { | ||
| 1475 | /* Otherwise try to adjust old_system to compensate */ | ||
| 1476 | timekeeping_suspend_time = | ||
| 1477 | timespec64_add(timekeeping_suspend_time, delta_delta); | ||
| 1478 | } | ||
| 1323 | } | 1479 | } | 
| 1324 | 1480 | ||
| 1325 | timekeeping_update(tk, TK_MIRROR); | 1481 | timekeeping_update(tk, TK_MIRROR); | 
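The suspend path keeps its drift compensation, now only when a persistent clock exists: if the gap between system time and the persistent clock moved by two seconds or more since the last suspend it assumes a deliberate time correction, otherwise it folds the small drift back into the recorded suspend time. A toy, seconds-only version of that decision:

#include <stdio.h>
#include <stdlib.h>

static long old_delta_sec;	/* system time minus persistent clock at the previous suspend */

static long compensated_suspend_time(long system_sec, long persistent_sec)
{
	long delta = system_sec - persistent_sec;
	long delta_delta = delta - old_delta_sec;

	if (labs(delta_delta) >= 2) {
		/* Large jump: assume a deliberate time correction happened
		 * and just remember the new offset. */
		old_delta_sec = delta;
		return persistent_sec;
	}
	/* Small drift: nudge the recorded suspend time so repeated
	 * suspend/resume cycles do not accumulate error. */
	return persistent_sec + delta_delta;
}

int main(void)
{
	old_delta_sec = 5;	/* offset remembered from the previous suspend */
	printf("adjusted suspend time: %ld s\n", compensated_suspend_time(1005, 999));
	return 0;
}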
| @@ -1327,7 +1483,7 @@ int timekeeping_suspend(void) | |||
| 1327 | write_seqcount_end(&tk_core.seq); | 1483 | write_seqcount_end(&tk_core.seq); | 
| 1328 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 1484 | raw_spin_unlock_irqrestore(&timekeeper_lock, flags); | 
| 1329 | 1485 | ||
| 1330 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 1486 | tick_suspend(); | 
| 1331 | clocksource_suspend(); | 1487 | clocksource_suspend(); | 
| 1332 | clockevents_suspend(); | 1488 | clockevents_suspend(); | 
| 1333 | 1489 | ||
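A minimal user-space sketch of the suspend drift compensation that the hunk above moves under the new persistent_clock_exists check. The timespec helpers and the sample values are stand-ins for the kernel's timespec64 routines; only the delta/delta_delta logic and the two-second threshold mirror the code.

	/* Sketch only: keeps |system time - persistent clock| roughly constant
	 * across repeated suspend/resume cycles, as the kernel code above does.
	 */
	#include <stdio.h>
	#include <stdlib.h>

	struct ts64 { long long tv_sec; long tv_nsec; };

	static struct ts64 ts_sub(struct ts64 a, struct ts64 b)
	{
		struct ts64 r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
		if (r.tv_nsec < 0) { r.tv_sec--; r.tv_nsec += 1000000000L; }
		return r;
	}

	static struct ts64 ts_add(struct ts64 a, struct ts64 b)
	{
		struct ts64 r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
		if (r.tv_nsec >= 1000000000L) { r.tv_sec++; r.tv_nsec -= 1000000000L; }
		return r;
	}

	/* persists across calls, like the static old_delta in the kernel */
	static struct ts64 old_delta;

	static struct ts64 compensate(struct ts64 xtime, struct ts64 suspend_time)
	{
		struct ts64 delta = ts_sub(xtime, suspend_time);
		struct ts64 delta_delta = ts_sub(delta, old_delta);

		if (llabs(delta_delta.tv_sec) >= 2) {
			/* large jump: assume a time correction, resync old_delta */
			old_delta = delta;
		} else {
			/* small jitter: fold it back into the suspend timestamp */
			suspend_time = ts_add(suspend_time, delta_delta);
		}
		return suspend_time;
	}

	int main(void)
	{
		struct ts64 xtime = { 1000, 0 }, suspend = { 990, 0 };

		suspend = compensate(xtime, suspend);        /* first suspend: resyncs old_delta */
		xtime.tv_sec += 100; suspend.tv_sec += 99;   /* one second of drift by next suspend */
		suspend = compensate(xtime, suspend);
		printf("compensated suspend time: %lld.%09ld\n", suspend.tv_sec, suspend.tv_nsec);
		return 0;
	}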
| @@ -1416,15 +1572,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, | |||
| 1416 | * | 1572 | * | 
| 1417 | * XXX - TODO: Doc ntp_error calculation. | 1573 | * XXX - TODO: Doc ntp_error calculation. | 
| 1418 | */ | 1574 | */ | 
| 1419 | if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { | 1575 | if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { | 
| 1420 | /* NTP adjustment caused clocksource mult overflow */ | 1576 | /* NTP adjustment caused clocksource mult overflow */ | 
| 1421 | WARN_ON_ONCE(1); | 1577 | WARN_ON_ONCE(1); | 
| 1422 | return; | 1578 | return; | 
| 1423 | } | 1579 | } | 
| 1424 | 1580 | ||
| 1425 | tk->tkr.mult += mult_adj; | 1581 | tk->tkr_mono.mult += mult_adj; | 
| 1426 | tk->xtime_interval += interval; | 1582 | tk->xtime_interval += interval; | 
| 1427 | tk->tkr.xtime_nsec -= offset; | 1583 | tk->tkr_mono.xtime_nsec -= offset; | 
| 1428 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 1584 | tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; | 
| 1429 | } | 1585 | } | 
| 1430 | 1586 | ||
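The overflow guard above works because unsigned addition wraps: if tkr_mono.mult + mult_adj wrapped around, the sum ends up smaller than mult_adj. A tiny worked example with made-up values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int mult = 0xfffffff0u;   /* hypothetical tkr_mono.mult */
		unsigned int mult_adj = 0x20u;     /* hypothetical positive NTP adjustment */

		/* 0xfffffff0 + 0x20 wraps to 0x10, which is < 0x20 */
		if (mult + mult_adj < mult_adj)
			printf("overflow detected, adjustment rejected\n");
		return 0;
	}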
| @@ -1486,13 +1642,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1486 | tk->ntp_err_mult = 0; | 1642 | tk->ntp_err_mult = 0; | 
| 1487 | } | 1643 | } | 
| 1488 | 1644 | ||
| 1489 | if (unlikely(tk->tkr.clock->maxadj && | 1645 | if (unlikely(tk->tkr_mono.clock->maxadj && | 
| 1490 | (abs(tk->tkr.mult - tk->tkr.clock->mult) | 1646 | (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) | 
| 1491 | > tk->tkr.clock->maxadj))) { | 1647 | > tk->tkr_mono.clock->maxadj))) { | 
| 1492 | printk_once(KERN_WARNING | 1648 | printk_once(KERN_WARNING | 
| 1493 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 1649 | "Adjusting %s more than 11%% (%ld vs %ld)\n", | 
| 1494 | tk->tkr.clock->name, (long)tk->tkr.mult, | 1650 | tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, | 
| 1495 | (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); | 1651 | (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); | 
| 1496 | } | 1652 | } | 
| 1497 | 1653 | ||
| 1498 | /* | 1654 | /* | 
| @@ -1509,9 +1665,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1509 | * We'll correct this error next time through this function, when | 1665 | * We'll correct this error next time through this function, when | 
| 1510 | * xtime_nsec is not as small. | 1666 | * xtime_nsec is not as small. | 
| 1511 | */ | 1667 | */ | 
| 1512 | if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { | 1668 | if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { | 
| 1513 | s64 neg = -(s64)tk->tkr.xtime_nsec; | 1669 | s64 neg = -(s64)tk->tkr_mono.xtime_nsec; | 
| 1514 | tk->tkr.xtime_nsec = 0; | 1670 | tk->tkr_mono.xtime_nsec = 0; | 
| 1515 | tk->ntp_error += neg << tk->ntp_error_shift; | 1671 | tk->ntp_error += neg << tk->ntp_error_shift; | 
| 1516 | } | 1672 | } | 
| 1517 | } | 1673 | } | 
| @@ -1526,13 +1682,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) | |||
| 1526 | */ | 1682 | */ | 
| 1527 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 1683 | static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) | 
| 1528 | { | 1684 | { | 
| 1529 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; | 1685 | u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; | 
| 1530 | unsigned int clock_set = 0; | 1686 | unsigned int clock_set = 0; | 
| 1531 | 1687 | ||
| 1532 | while (tk->tkr.xtime_nsec >= nsecps) { | 1688 | while (tk->tkr_mono.xtime_nsec >= nsecps) { | 
| 1533 | int leap; | 1689 | int leap; | 
| 1534 | 1690 | ||
| 1535 | tk->tkr.xtime_nsec -= nsecps; | 1691 | tk->tkr_mono.xtime_nsec -= nsecps; | 
| 1536 | tk->xtime_sec++; | 1692 | tk->xtime_sec++; | 
| 1537 | 1693 | ||
| 1538 | /* Figure out if its a leap sec and apply if needed */ | 1694 | /* Figure out if its a leap sec and apply if needed */ | 
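xtime_nsec is stored left-shifted by the clocksource shift, so one second in that representation is NSEC_PER_SEC << shift and the loop above peels whole seconds off. A stand-alone illustration; the shift value is arbitrary, not taken from any real clocksource:

	#include <stdio.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		unsigned int shift = 8;                     /* arbitrary example shift */
		unsigned long long xtime_nsec = (3ULL * NSEC_PER_SEC + 250000000ULL) << shift;
		unsigned long long nsecps = NSEC_PER_SEC << shift;
		unsigned long long xtime_sec = 0;

		while (xtime_nsec >= nsecps) {              /* same loop shape as above */
			xtime_nsec -= nsecps;
			xtime_sec++;
		}
		printf("accumulated %llu s, %llu ns left\n",
		       xtime_sec, xtime_nsec >> shift);     /* prints 3 s, 250000000 ns */
		return 0;
	}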
| @@ -1577,9 +1733,10 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, | |||
| 1577 | 1733 | ||
| 1578 | /* Accumulate one shifted interval */ | 1734 | /* Accumulate one shifted interval */ | 
| 1579 | offset -= interval; | 1735 | offset -= interval; | 
| 1580 | tk->tkr.cycle_last += interval; | 1736 | tk->tkr_mono.cycle_last += interval; | 
| 1737 | tk->tkr_raw.cycle_last += interval; | ||
| 1581 | 1738 | ||
| 1582 | tk->tkr.xtime_nsec += tk->xtime_interval << shift; | 1739 | tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; | 
| 1583 | *clock_set |= accumulate_nsecs_to_secs(tk); | 1740 | *clock_set |= accumulate_nsecs_to_secs(tk); | 
| 1584 | 1741 | ||
| 1585 | /* Accumulate raw time */ | 1742 | /* Accumulate raw time */ | 
| @@ -1622,14 +1779,17 @@ void update_wall_time(void) | |||
| 1622 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 1779 | #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET | 
| 1623 | offset = real_tk->cycle_interval; | 1780 | offset = real_tk->cycle_interval; | 
| 1624 | #else | 1781 | #else | 
| 1625 | offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), | 1782 | offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), | 
| 1626 | tk->tkr.cycle_last, tk->tkr.mask); | 1783 | tk->tkr_mono.cycle_last, tk->tkr_mono.mask); | 
| 1627 | #endif | 1784 | #endif | 
| 1628 | 1785 | ||
| 1629 | /* Check if there's really nothing to do */ | 1786 | /* Check if there's really nothing to do */ | 
| 1630 | if (offset < real_tk->cycle_interval) | 1787 | if (offset < real_tk->cycle_interval) | 
| 1631 | goto out; | 1788 | goto out; | 
| 1632 | 1789 | ||
| 1790 | /* Do some additional sanity checking */ | ||
| 1791 | timekeeping_check_update(real_tk, offset); | ||
| 1792 | |||
| 1633 | /* | 1793 | /* | 
| 1634 | * With NO_HZ we may have to accumulate many cycle_intervals | 1794 | * With NO_HZ we may have to accumulate many cycle_intervals | 
| 1635 | * (think "ticks") worth of time at once. To do this efficiently, | 1795 | * (think "ticks") worth of time at once. To do this efficiently, | 
| @@ -1784,8 +1944,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, | |||
| 1784 | do { | 1944 | do { | 
| 1785 | seq = read_seqcount_begin(&tk_core.seq); | 1945 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1786 | 1946 | ||
| 1787 | base = tk->tkr.base_mono; | 1947 | base = tk->tkr_mono.base; | 
| 1788 | nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; | 1948 | nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; | 
| 1789 | 1949 | ||
| 1790 | *offs_real = tk->offs_real; | 1950 | *offs_real = tk->offs_real; | 
| 1791 | *offs_boot = tk->offs_boot; | 1951 | *offs_boot = tk->offs_boot; | 
| @@ -1816,8 +1976,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, | |||
| 1816 | do { | 1976 | do { | 
| 1817 | seq = read_seqcount_begin(&tk_core.seq); | 1977 | seq = read_seqcount_begin(&tk_core.seq); | 
| 1818 | 1978 | ||
| 1819 | base = tk->tkr.base_mono; | 1979 | base = tk->tkr_mono.base; | 
| 1820 | nsecs = timekeeping_get_ns(&tk->tkr); | 1980 | nsecs = timekeeping_get_ns(&tk->tkr_mono); | 
| 1821 | 1981 | ||
| 1822 | *offs_real = tk->offs_real; | 1982 | *offs_real = tk->offs_real; | 
| 1823 | *offs_boot = tk->offs_boot; | 1983 | *offs_boot = tk->offs_boot; | 
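Both readers above follow the seqcount pattern: copy the fields after read_seqcount_begin() and redo the copy if a writer intervened (the retry check sits just past the shown context). A simplified user-space model of that pattern; it deliberately ignores the memory-barrier details a real seqlock needs:

	#include <stdatomic.h>
	#include <stdio.h>

	struct snapshot { long long base; long long nsec; };

	static _Atomic unsigned int seq;      /* even = stable, odd = write in progress */
	static struct snapshot shared;

	static struct snapshot read_snapshot(void)
	{
		struct snapshot s;
		unsigned int start;

		do {
			while ((start = atomic_load(&seq)) & 1)
				;                         /* writer active, spin */
			s = shared;                       /* tentative copy, validated below */
		} while (atomic_load(&seq) != start);     /* retry if a writer slipped in */

		return s;
	}

	static void write_snapshot(struct snapshot s)
	{
		atomic_fetch_add(&seq, 1);                /* odd: readers will retry */
		shared = s;
		atomic_fetch_add(&seq, 1);                /* even again: publish */
	}

	int main(void)
	{
		write_snapshot((struct snapshot){ .base = 42, .nsec = 7 });
		struct snapshot s = read_snapshot();
		printf("base=%lld nsec=%lld\n", s.base, s.nsec);
		return 0;
	}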
| diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 1d91416055d5..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h | |||
| @@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts); | |||
| 19 | extern int timekeeping_suspend(void); | 19 | extern int timekeeping_suspend(void); | 
| 20 | extern void timekeeping_resume(void); | 20 | extern void timekeeping_resume(void); | 
| 21 | 21 | ||
| 22 | extern void do_timer(unsigned long ticks); | ||
| 23 | extern void update_wall_time(void); | ||
| 24 | |||
| 25 | extern seqlock_t jiffies_lock; | ||
| 26 | |||
| 27 | #define CS_NAME_LEN 32 | ||
| 28 | |||
| 22 | #endif | 29 | #endif | 
| diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c | |||
| @@ -90,8 +90,18 @@ struct tvec_base { | |||
| 90 | struct tvec tv5; | 90 | struct tvec tv5; | 
| 91 | } ____cacheline_aligned; | 91 | } ____cacheline_aligned; | 
| 92 | 92 | ||
| 93 | /* | ||
| 94 | * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've | ||
| 95 | * made NULL special, hint: lock_timer_base()) and we cannot get a compile time | ||
| 96 | * pointer to per-cpu entries because we don't know where we'll map the section, | ||
| 97 | * even for the boot cpu. | ||
| 98 | * | ||
| 99 | * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the | ||
| 100 | * rest of them. | ||
| 101 | */ | ||
| 93 | struct tvec_base boot_tvec_bases; | 102 | struct tvec_base boot_tvec_bases; | 
| 94 | EXPORT_SYMBOL(boot_tvec_bases); | 103 | EXPORT_SYMBOL(boot_tvec_bases); | 
| 104 | |||
| 95 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 105 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | 
| 96 | 106 | ||
| 97 | /* Functions below help us manage 'deferrable' flag */ | 107 | /* Functions below help us manage 'deferrable' flag */ | 
| @@ -1027,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) | |||
| 1027 | EXPORT_SYMBOL(try_to_del_timer_sync); | 1037 | EXPORT_SYMBOL(try_to_del_timer_sync); | 
| 1028 | 1038 | ||
| 1029 | #ifdef CONFIG_SMP | 1039 | #ifdef CONFIG_SMP | 
| 1040 | static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); | ||
| 1041 | |||
| 1030 | /** | 1042 | /** | 
| 1031 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 1043 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 
| 1032 | * @timer: the timer to be deactivated | 1044 | * @timer: the timer to be deactivated | 
| @@ -1532,64 +1544,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) | |||
| 1532 | } | 1544 | } | 
| 1533 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1545 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 
| 1534 | 1546 | ||
| 1535 | static int init_timers_cpu(int cpu) | ||
| 1536 | { | ||
| 1537 | int j; | ||
| 1538 | struct tvec_base *base; | ||
| 1539 | static char tvec_base_done[NR_CPUS]; | ||
| 1540 | |||
| 1541 | if (!tvec_base_done[cpu]) { | ||
| 1542 | static char boot_done; | ||
| 1543 | |||
| 1544 | if (boot_done) { | ||
| 1545 | /* | ||
| 1546 | * The APs use this path later in boot | ||
| 1547 | */ | ||
| 1548 | base = kzalloc_node(sizeof(*base), GFP_KERNEL, | ||
| 1549 | cpu_to_node(cpu)); | ||
| 1550 | if (!base) | ||
| 1551 | return -ENOMEM; | ||
| 1552 | |||
| 1553 | /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ | ||
| 1554 | if (WARN_ON(base != tbase_get_base(base))) { | ||
| 1555 | kfree(base); | ||
| 1556 | return -ENOMEM; | ||
| 1557 | } | ||
| 1558 | per_cpu(tvec_bases, cpu) = base; | ||
| 1559 | } else { | ||
| 1560 | /* | ||
| 1561 | * This is for the boot CPU - we use compile-time | ||
| 1562 | * static initialisation because per-cpu memory isn't | ||
| 1563 | * ready yet and because the memory allocators are not | ||
| 1564 | * initialised either. | ||
| 1565 | */ | ||
| 1566 | boot_done = 1; | ||
| 1567 | base = &boot_tvec_bases; | ||
| 1568 | } | ||
| 1569 | spin_lock_init(&base->lock); | ||
| 1570 | tvec_base_done[cpu] = 1; | ||
| 1571 | base->cpu = cpu; | ||
| 1572 | } else { | ||
| 1573 | base = per_cpu(tvec_bases, cpu); | ||
| 1574 | } | ||
| 1575 | |||
| 1576 | |||
| 1577 | for (j = 0; j < TVN_SIZE; j++) { | ||
| 1578 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
| 1579 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
| 1580 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
| 1581 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
| 1582 | } | ||
| 1583 | for (j = 0; j < TVR_SIZE; j++) | ||
| 1584 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
| 1585 | |||
| 1586 | base->timer_jiffies = jiffies; | ||
| 1587 | base->next_timer = base->timer_jiffies; | ||
| 1588 | base->active_timers = 0; | ||
| 1589 | base->all_timers = 0; | ||
| 1590 | return 0; | ||
| 1591 | } | ||
| 1592 | |||
| 1593 | #ifdef CONFIG_HOTPLUG_CPU | 1547 | #ifdef CONFIG_HOTPLUG_CPU | 
| 1594 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 1548 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) | 
| 1595 | { | 1549 | { | 
| @@ -1631,55 +1585,86 @@ static void migrate_timers(int cpu) | |||
| 1631 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 1585 | migrate_timer_list(new_base, old_base->tv5.vec + i); | 
| 1632 | } | 1586 | } | 
| 1633 | 1587 | ||
| 1588 | old_base->active_timers = 0; | ||
| 1589 | old_base->all_timers = 0; | ||
| 1590 | |||
| 1634 | spin_unlock(&old_base->lock); | 1591 | spin_unlock(&old_base->lock); | 
| 1635 | spin_unlock_irq(&new_base->lock); | 1592 | spin_unlock_irq(&new_base->lock); | 
| 1636 | put_cpu_var(tvec_bases); | 1593 | put_cpu_var(tvec_bases); | 
| 1637 | } | 1594 | } | 
| 1638 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 1639 | 1595 | ||
| 1640 | static int timer_cpu_notify(struct notifier_block *self, | 1596 | static int timer_cpu_notify(struct notifier_block *self, | 
| 1641 | unsigned long action, void *hcpu) | 1597 | unsigned long action, void *hcpu) | 
| 1642 | { | 1598 | { | 
| 1643 | long cpu = (long)hcpu; | 1599 | switch (action) { | 
| 1644 | int err; | ||
| 1645 | |||
| 1646 | switch(action) { | ||
| 1647 | case CPU_UP_PREPARE: | ||
| 1648 | case CPU_UP_PREPARE_FROZEN: | ||
| 1649 | err = init_timers_cpu(cpu); | ||
| 1650 | if (err < 0) | ||
| 1651 | return notifier_from_errno(err); | ||
| 1652 | break; | ||
| 1653 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1654 | case CPU_DEAD: | 1600 | case CPU_DEAD: | 
| 1655 | case CPU_DEAD_FROZEN: | 1601 | case CPU_DEAD_FROZEN: | 
| 1656 | migrate_timers(cpu); | 1602 | migrate_timers((long)hcpu); | 
| 1657 | break; | 1603 | break; | 
| 1658 | #endif | ||
| 1659 | default: | 1604 | default: | 
| 1660 | break; | 1605 | break; | 
| 1661 | } | 1606 | } | 
| 1607 | |||
| 1662 | return NOTIFY_OK; | 1608 | return NOTIFY_OK; | 
| 1663 | } | 1609 | } | 
| 1664 | 1610 | ||
| 1665 | static struct notifier_block timers_nb = { | 1611 | static inline void timer_register_cpu_notifier(void) | 
| 1666 | .notifier_call = timer_cpu_notify, | 1612 | { | 
| 1667 | }; | 1613 | cpu_notifier(timer_cpu_notify, 0); | 
| 1614 | } | ||
| 1615 | #else | ||
| 1616 | static inline void timer_register_cpu_notifier(void) { } | ||
| 1617 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 1668 | 1618 | ||
| 1619 | static void __init init_timer_cpu(struct tvec_base *base, int cpu) | ||
| 1620 | { | ||
| 1621 | int j; | ||
| 1669 | 1622 | ||
| 1670 | void __init init_timers(void) | 1623 | BUG_ON(base != tbase_get_base(base)); | 
| 1624 | |||
| 1625 | base->cpu = cpu; | ||
| 1626 | per_cpu(tvec_bases, cpu) = base; | ||
| 1627 | spin_lock_init(&base->lock); | ||
| 1628 | |||
| 1629 | for (j = 0; j < TVN_SIZE; j++) { | ||
| 1630 | INIT_LIST_HEAD(base->tv5.vec + j); | ||
| 1631 | INIT_LIST_HEAD(base->tv4.vec + j); | ||
| 1632 | INIT_LIST_HEAD(base->tv3.vec + j); | ||
| 1633 | INIT_LIST_HEAD(base->tv2.vec + j); | ||
| 1634 | } | ||
| 1635 | for (j = 0; j < TVR_SIZE; j++) | ||
| 1636 | INIT_LIST_HEAD(base->tv1.vec + j); | ||
| 1637 | |||
| 1638 | base->timer_jiffies = jiffies; | ||
| 1639 | base->next_timer = base->timer_jiffies; | ||
| 1640 | } | ||
| 1641 | |||
| 1642 | static void __init init_timer_cpus(void) | ||
| 1671 | { | 1643 | { | 
| 1672 | int err; | 1644 | struct tvec_base *base; | 
| 1645 | int local_cpu = smp_processor_id(); | ||
| 1646 | int cpu; | ||
| 1673 | 1647 | ||
| 1648 | for_each_possible_cpu(cpu) { | ||
| 1649 | if (cpu == local_cpu) | ||
| 1650 | base = &boot_tvec_bases; | ||
| 1651 | #ifdef CONFIG_SMP | ||
| 1652 | else | ||
| 1653 | base = per_cpu_ptr(&__tvec_bases, cpu); | ||
| 1654 | #endif | ||
| 1655 | |||
| 1656 | init_timer_cpu(base, cpu); | ||
| 1657 | } | ||
| 1658 | } | ||
| 1659 | |||
| 1660 | void __init init_timers(void) | ||
| 1661 | { | ||
| 1674 | /* ensure there are enough low bits for flags in timer->base pointer */ | 1662 | /* ensure there are enough low bits for flags in timer->base pointer */ | 
| 1675 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 1663 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | 
| 1676 | 1664 | ||
| 1677 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1665 | init_timer_cpus(); | 
| 1678 | (void *)(long)smp_processor_id()); | ||
| 1679 | BUG_ON(err != NOTIFY_OK); | ||
| 1680 | |||
| 1681 | init_timer_stats(); | 1666 | init_timer_stats(); | 
| 1682 | register_cpu_notifier(&timers_nb); | 1667 | timer_register_cpu_notifier(); | 
| 1683 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 1668 | open_softirq(TIMER_SOFTIRQ, run_timer_softirq); | 
| 1684 | } | 1669 | } | 
| 1685 | 1670 | ||
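The BUILD_BUG_ON on the tvec_base alignment and the BUG_ON(base != tbase_get_base(base)) in init_timer_cpu() exist because timer->base stores flag bits in the low bits of an aligned tvec_base pointer. A stand-alone sketch of that tagging trick; the mask and flag values are illustrative, not the kernel's TIMER_FLAG_MASK:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define FLAG_MASK 0x3UL     /* illustrative: two low bits reserved for flags */

	struct base { int cpu; } __attribute__((aligned(4)));

	static struct base *get_base(struct base *tagged)
	{
		return (struct base *)((uintptr_t)tagged & ~FLAG_MASK);
	}

	static struct base *set_flag(struct base *b, unsigned long flag)
	{
		return (struct base *)((uintptr_t)b | (flag & FLAG_MASK));
	}

	int main(void)
	{
		static struct base b = { .cpu = 0 };

		/* alignment guarantees the low bits are free, mirroring the BUILD_BUG_ON */
		assert(((uintptr_t)&b & FLAG_MASK) == 0);

		struct base *tagged = set_flag(&b, 0x1);   /* e.g. a "deferrable" marker */
		printf("flags=%lu cpu=%d\n",
		       (unsigned long)((uintptr_t)tagged & FLAG_MASK), get_base(tagged)->cpu);
		return 0;
	}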
| diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -16,10 +16,10 @@ | |||
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> | 
| 17 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> | 
| 18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> | 
| 19 | #include <linux/tick.h> | ||
| 20 | 19 | ||
| 21 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> | 
| 22 | 21 | ||
| 22 | #include "tick-internal.h" | ||
| 23 | 23 | ||
| 24 | struct timer_list_iter { | 24 | struct timer_list_iter { | 
| 25 | int cpu; | 25 | int cpu; | 
| @@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) | |||
| 228 | print_name_offset(m, dev->set_next_event); | 228 | print_name_offset(m, dev->set_next_event); | 
| 229 | SEQ_printf(m, "\n"); | 229 | SEQ_printf(m, "\n"); | 
| 230 | 230 | ||
| 231 | SEQ_printf(m, " set_mode: "); | 231 | if (dev->set_mode) { | 
| 232 | print_name_offset(m, dev->set_mode); | 232 | SEQ_printf(m, " set_mode: "); | 
| 233 | SEQ_printf(m, "\n"); | 233 | print_name_offset(m, dev->set_mode); | 
| 234 | SEQ_printf(m, "\n"); | ||
| 235 | } else { | ||
| 236 | if (dev->set_state_shutdown) { | ||
| 237 | SEQ_printf(m, " shutdown: "); | ||
| 238 | print_name_offset(m, dev->set_state_shutdown); | ||
| 239 | SEQ_printf(m, "\n"); | ||
| 240 | } | ||
| 241 | |||
| 242 | if (dev->set_state_periodic) { | ||
| 243 | SEQ_printf(m, " periodic: "); | ||
| 244 | print_name_offset(m, dev->set_state_periodic); | ||
| 245 | SEQ_printf(m, "\n"); | ||
| 246 | } | ||
| 247 | |||
| 248 | if (dev->set_state_oneshot) { | ||
| 249 | SEQ_printf(m, " oneshot: "); | ||
| 250 | print_name_offset(m, dev->set_state_oneshot); | ||
| 251 | SEQ_printf(m, "\n"); | ||
| 252 | } | ||
| 253 | |||
| 254 | if (dev->tick_resume) { | ||
| 255 | SEQ_printf(m, " resume: "); | ||
| 256 | print_name_offset(m, dev->tick_resume); | ||
| 257 | SEQ_printf(m, "\n"); | ||
| 258 | } | ||
| 259 | } | ||
| 234 | 260 | ||
| 235 | SEQ_printf(m, " event_handler: "); | 261 | SEQ_printf(m, " event_handler: "); | 
| 236 | print_name_offset(m, dev->event_handler); | 262 | print_name_offset(m, dev->event_handler); | 
| diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..3b9a48ae153a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -432,6 +432,14 @@ config UPROBE_EVENT | |||
| 432 | This option is required if you plan to use perf-probe subcommand | 432 | This option is required if you plan to use perf-probe subcommand | 
| 433 | of perf tools on user space applications. | 433 | of perf tools on user space applications. | 
| 434 | 434 | ||
| 435 | config BPF_EVENTS | ||
| 436 | depends on BPF_SYSCALL | ||
| 437 | depends on KPROBE_EVENT | ||
| 438 | bool | ||
| 439 | default y | ||
| 440 | help | ||
| 441 | This allows the user to attach BPF programs to kprobe events. | ||
| 442 | |||
| 435 | config PROBE_EVENTS | 443 | config PROBE_EVENTS | 
| 436 | def_bool n | 444 | def_bool n | 
| 437 | 445 | ||
| @@ -599,6 +607,34 @@ config RING_BUFFER_STARTUP_TEST | |||
| 599 | 607 | ||
| 600 | If unsure, say N | 608 | If unsure, say N | 
| 601 | 609 | ||
| 610 | config TRACE_ENUM_MAP_FILE | ||
| 611 | bool "Show enum mappings for trace events" | ||
| 612 | depends on TRACING | ||
| 613 | help | ||
| 614 | The "print fmt" of the trace events will show the enum names instead | ||
| 615 | of their values. This can cause problems for user space tools that | ||
| 616 | use this string to parse the raw data as user space does not know | ||
| 617 | how to convert the string to its value. | ||
| 618 | |||
| 619 | To fix this, there's a special macro in the kernel that can be used | ||
| 620 | to convert the enum into its value. If this macro is used, then the | ||
| 621 | print fmt strings will have the enums converted to their values. | ||
| 622 | |||
| 623 | If something does not get converted properly, this option can be | ||
| 624 | used to show what enums the kernel tried to convert. | ||
| 625 | |||
| 626 | This option is for debugging the enum conversions. A file is created | ||
| 627 | in the tracing directory called "enum_map" that will show the enum | ||
| 628 | names matched with their values and what trace event system they | ||
| 629 | belong to. | ||
| 630 | |||
| 631 | Normally, the mapping of the strings to values will be freed after | ||
| 632 | boot up or module load. With this option, they will not be freed, as | ||
| 633 | they are needed for the "enum_map" file. Enabling this option will | ||
| 634 | increase the memory footprint of the running kernel. | ||
| 635 | |||
| 636 | If unsure, say N | ||
| 637 | |||
| 602 | endif # FTRACE | 638 | endif # FTRACE | 
| 603 | 639 | ||
| 604 | endif # TRACING_SUPPORT | 640 | endif # TRACING_SUPPORT | 
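The "special macro" this help text alludes to is TRACE_DEFINE_ENUM(). A fragment of a hypothetical event header using it together with __print_symbolic(); every event, enum and field name below is invented, and the TRACE_SYSTEM/include-guard boilerplate a real trace header needs is omitted:

	/* Hypothetical fragment: TRACE_DEFINE_ENUM() records the enum so the
	 * value, not the name, ends up in the event's "print fmt" string.
	 */
	#include <linux/tracepoint.h>

	enum foo_state {
		FOO_IDLE,
		FOO_BUSY,
	};

	TRACE_DEFINE_ENUM(FOO_IDLE);
	TRACE_DEFINE_ENUM(FOO_BUSY);

	TRACE_EVENT(foo_state_change,
		TP_PROTO(int state),
		TP_ARGS(state),
		TP_STRUCT__entry(
			__field(int, state)
		),
		TP_fast_assign(
			__entry->state = state;
		),
		TP_printk("state=%s",
			  __print_symbolic(__entry->state,
					   { FOO_IDLE, "idle" },
					   { FOO_BUSY, "busy" }))
	);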
| diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o | |||
| 53 | endif | 53 | endif | 
| 54 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 54 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 
| 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o | 
| 56 | obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o | ||
| 56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 57 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | 
| 57 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 58 | obj-$(CONFIG_TRACEPOINTS) += power-traces.o | 
| 58 | ifeq ($(CONFIG_PM),y) | 59 | ifeq ($(CONFIG_PM),y) | 
| diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..2d56ce501632 --- /dev/null +++ b/kernel/trace/bpf_trace.c | |||
| @@ -0,0 +1,222 @@ | |||
| 1 | /* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com | ||
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | ||
| 7 | #include <linux/kernel.h> | ||
| 8 | #include <linux/types.h> | ||
| 9 | #include <linux/slab.h> | ||
| 10 | #include <linux/bpf.h> | ||
| 11 | #include <linux/filter.h> | ||
| 12 | #include <linux/uaccess.h> | ||
| 13 | #include <linux/ctype.h> | ||
| 14 | #include "trace.h" | ||
| 15 | |||
| 16 | static DEFINE_PER_CPU(int, bpf_prog_active); | ||
| 17 | |||
| 18 | /** | ||
| 19 | * trace_call_bpf - invoke BPF program | ||
| 20 | * @prog: BPF program | ||
| 21 | * @ctx: opaque context pointer | ||
| 22 | * | ||
| 23 | * kprobe handlers execute BPF programs via this helper. | ||
| 24 | * Can be used from static tracepoints in the future. | ||
| 25 | * | ||
| 26 | * Return: BPF programs always return an integer which is interpreted by | ||
| 27 | * kprobe handler as: | ||
| 28 | * 0 - return from kprobe (event is filtered out) | ||
| 29 | * 1 - store kprobe event into ring buffer | ||
| 30 | * Other values are reserved and currently alias to 1 | ||
| 31 | */ | ||
| 32 | unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) | ||
| 33 | { | ||
| 34 | unsigned int ret; | ||
| 35 | |||
| 36 | if (in_nmi()) /* not supported yet */ | ||
| 37 | return 1; | ||
| 38 | |||
| 39 | preempt_disable(); | ||
| 40 | |||
| 41 | if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { | ||
| 42 | /* | ||
| 43 | * since some bpf program is already running on this cpu, | ||
| 44 | * don't call into another bpf program (same or different) | ||
| 45 | * and don't send kprobe event into ring-buffer, | ||
| 46 | * so return zero here | ||
| 47 | */ | ||
| 48 | ret = 0; | ||
| 49 | goto out; | ||
| 50 | } | ||
| 51 | |||
| 52 | rcu_read_lock(); | ||
| 53 | ret = BPF_PROG_RUN(prog, ctx); | ||
| 54 | rcu_read_unlock(); | ||
| 55 | |||
| 56 | out: | ||
| 57 | __this_cpu_dec(bpf_prog_active); | ||
| 58 | preempt_enable(); | ||
| 59 | |||
| 60 | return ret; | ||
| 61 | } | ||
| 62 | EXPORT_SYMBOL_GPL(trace_call_bpf); | ||
| 63 | |||
| 64 | static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 65 | { | ||
| 66 | void *dst = (void *) (long) r1; | ||
| 67 | int size = (int) r2; | ||
| 68 | void *unsafe_ptr = (void *) (long) r3; | ||
| 69 | |||
| 70 | return probe_kernel_read(dst, unsafe_ptr, size); | ||
| 71 | } | ||
| 72 | |||
| 73 | static const struct bpf_func_proto bpf_probe_read_proto = { | ||
| 74 | .func = bpf_probe_read, | ||
| 75 | .gpl_only = true, | ||
| 76 | .ret_type = RET_INTEGER, | ||
| 77 | .arg1_type = ARG_PTR_TO_STACK, | ||
| 78 | .arg2_type = ARG_CONST_STACK_SIZE, | ||
| 79 | .arg3_type = ARG_ANYTHING, | ||
| 80 | }; | ||
| 81 | |||
| 82 | static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) | ||
| 83 | { | ||
| 84 | /* NMI safe access to clock monotonic */ | ||
| 85 | return ktime_get_mono_fast_ns(); | ||
| 86 | } | ||
| 87 | |||
| 88 | static const struct bpf_func_proto bpf_ktime_get_ns_proto = { | ||
| 89 | .func = bpf_ktime_get_ns, | ||
| 90 | .gpl_only = true, | ||
| 91 | .ret_type = RET_INTEGER, | ||
| 92 | }; | ||
| 93 | |||
| 94 | /* | ||
| 95 | * limited trace_printk() | ||
| 96 | * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed | ||
| 97 | */ | ||
| 98 | static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) | ||
| 99 | { | ||
| 100 | char *fmt = (char *) (long) r1; | ||
| 101 | int mod[3] = {}; | ||
| 102 | int fmt_cnt = 0; | ||
| 103 | int i; | ||
| 104 | |||
| 105 | /* | ||
| 106 | * bpf_check()->check_func_arg()->check_stack_boundary() | ||
| 107 | * guarantees that fmt points to bpf program stack, | ||
| 108 | * fmt_size bytes of it were initialized and fmt_size > 0 | ||
| 109 | */ | ||
| 110 | if (fmt[--fmt_size] != 0) | ||
| 111 | return -EINVAL; | ||
| 112 | |||
| 113 | /* check format string for allowed specifiers */ | ||
| 114 | for (i = 0; i < fmt_size; i++) { | ||
| 115 | if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) | ||
| 116 | return -EINVAL; | ||
| 117 | |||
| 118 | if (fmt[i] != '%') | ||
| 119 | continue; | ||
| 120 | |||
| 121 | if (fmt_cnt >= 3) | ||
| 122 | return -EINVAL; | ||
| 123 | |||
| 124 | /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ | ||
| 125 | i++; | ||
| 126 | if (fmt[i] == 'l') { | ||
| 127 | mod[fmt_cnt]++; | ||
| 128 | i++; | ||
| 129 | } else if (fmt[i] == 'p') { | ||
| 130 | mod[fmt_cnt]++; | ||
| 131 | i++; | ||
| 132 | if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) | ||
| 133 | return -EINVAL; | ||
| 134 | fmt_cnt++; | ||
| 135 | continue; | ||
| 136 | } | ||
| 137 | |||
| 138 | if (fmt[i] == 'l') { | ||
| 139 | mod[fmt_cnt]++; | ||
| 140 | i++; | ||
| 141 | } | ||
| 142 | |||
| 143 | if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') | ||
| 144 | return -EINVAL; | ||
| 145 | fmt_cnt++; | ||
| 146 | } | ||
| 147 | |||
| 148 | return __trace_printk(1/* fake ip will not be printed */, fmt, | ||
| 149 | mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, | ||
| 150 | mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, | ||
| 151 | mod[2] == 2 ? r5 : mod[2] == 1 ? (long) r5 : (u32) r5); | ||
| 152 | } | ||
| 153 | |||
| 154 | static const struct bpf_func_proto bpf_trace_printk_proto = { | ||
| 155 | .func = bpf_trace_printk, | ||
| 156 | .gpl_only = true, | ||
| 157 | .ret_type = RET_INTEGER, | ||
| 158 | .arg1_type = ARG_PTR_TO_STACK, | ||
| 159 | .arg2_type = ARG_CONST_STACK_SIZE, | ||
| 160 | }; | ||
| 161 | |||
| 162 | static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) | ||
| 163 | { | ||
| 164 | switch (func_id) { | ||
| 165 | case BPF_FUNC_map_lookup_elem: | ||
| 166 | return &bpf_map_lookup_elem_proto; | ||
| 167 | case BPF_FUNC_map_update_elem: | ||
| 168 | return &bpf_map_update_elem_proto; | ||
| 169 | case BPF_FUNC_map_delete_elem: | ||
| 170 | return &bpf_map_delete_elem_proto; | ||
| 171 | case BPF_FUNC_probe_read: | ||
| 172 | return &bpf_probe_read_proto; | ||
| 173 | case BPF_FUNC_ktime_get_ns: | ||
| 174 | return &bpf_ktime_get_ns_proto; | ||
| 175 | |||
| 176 | case BPF_FUNC_trace_printk: | ||
| 177 | /* | ||
| 178 | * this program might be calling bpf_trace_printk, | ||
| 179 | * so allocate per-cpu printk buffers | ||
| 180 | */ | ||
| 181 | trace_printk_init_buffers(); | ||
| 182 | |||
| 183 | return &bpf_trace_printk_proto; | ||
| 184 | default: | ||
| 185 | return NULL; | ||
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | /* bpf+kprobe programs can access fields of 'struct pt_regs' */ | ||
| 190 | static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) | ||
| 191 | { | ||
| 192 | /* check bounds */ | ||
| 193 | if (off < 0 || off >= sizeof(struct pt_regs)) | ||
| 194 | return false; | ||
| 195 | |||
| 196 | /* only read is allowed */ | ||
| 197 | if (type != BPF_READ) | ||
| 198 | return false; | ||
| 199 | |||
| 200 | /* disallow misaligned access */ | ||
| 201 | if (off % size != 0) | ||
| 202 | return false; | ||
| 203 | |||
| 204 | return true; | ||
| 205 | } | ||
| 206 | |||
| 207 | static struct bpf_verifier_ops kprobe_prog_ops = { | ||
| 208 | .get_func_proto = kprobe_prog_func_proto, | ||
| 209 | .is_valid_access = kprobe_prog_is_valid_access, | ||
| 210 | }; | ||
| 211 | |||
| 212 | static struct bpf_prog_type_list kprobe_tl = { | ||
| 213 | .ops = &kprobe_prog_ops, | ||
| 214 | .type = BPF_PROG_TYPE_KPROBE, | ||
| 215 | }; | ||
| 216 | |||
| 217 | static int __init register_kprobe_prog_ops(void) | ||
| 218 | { | ||
| 219 | bpf_register_prog_type(&kprobe_tl); | ||
| 220 | return 0; | ||
| 221 | } | ||
| 222 | late_initcall(register_kprobe_prog_ops); | ||
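A user-space re-implementation of the format-string check in bpf_trace_printk() above, handy for seeing which format strings the helper accepts. -EINVAL is modelled as -1 and the sample strings are arbitrary:

	#include <ctype.h>
	#include <stdio.h>
	#include <string.h>

	static int check_fmt(const char *fmt, int fmt_size)
	{
		int mod[3] = {0};   /* counts 'l' modifiers per argument, as the helper does */
		int fmt_cnt = 0;
		int i;

		if (fmt[--fmt_size] != 0)
			return -1;

		for (i = 0; i < fmt_size; i++) {
			if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
				return -1;
			if (fmt[i] != '%')
				continue;
			if (fmt_cnt >= 3)
				return -1;
			i++;
			if (fmt[i] == 'l') {
				mod[fmt_cnt]++;
				i++;
			} else if (fmt[i] == 'p') {
				mod[fmt_cnt]++;
				i++;
				if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
					return -1;
				fmt_cnt++;
				continue;
			}
			if (fmt[i] == 'l') {
				mod[fmt_cnt]++;
				i++;
			}
			if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
				return -1;
			fmt_cnt++;
		}
		return 0;
	}

	int main(void)
	{
		const char *samples[] = { "cnt=%d", "%llx %p", "%s", "%d %d %d %d" };
		for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
			printf("%-16s -> %s\n", samples[i],
			       check_fmt(samples[i], strlen(samples[i]) + 1) ? "rejected" : "ok");
		return 0;
	}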
| diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..02bece4a99ea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -18,7 +18,7 @@ | |||
| 18 | #include <linux/kallsyms.h> | 18 | #include <linux/kallsyms.h> | 
| 19 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> | 
| 20 | #include <linux/suspend.h> | 20 | #include <linux/suspend.h> | 
| 21 | #include <linux/debugfs.h> | 21 | #include <linux/tracefs.h> | 
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> | 
| 23 | #include <linux/kthread.h> | 23 | #include <linux/kthread.h> | 
| 24 | #include <linux/uaccess.h> | 24 | #include <linux/uaccess.h> | 
| @@ -249,6 +249,19 @@ static void update_function_graph_func(void); | |||
| 249 | static inline void update_function_graph_func(void) { } | 249 | static inline void update_function_graph_func(void) { } | 
| 250 | #endif | 250 | #endif | 
| 251 | 251 | ||
| 252 | |||
| 253 | static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) | ||
| 254 | { | ||
| 255 | /* | ||
| 256 | * If this is a dynamic ops or we force list func, | ||
| 257 | * then it needs to call the list anyway. | ||
| 258 | */ | ||
| 259 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
| 260 | return ftrace_ops_list_func; | ||
| 261 | |||
| 262 | return ftrace_ops_get_func(ops); | ||
| 263 | } | ||
| 264 | |||
| 252 | static void update_ftrace_function(void) | 265 | static void update_ftrace_function(void) | 
| 253 | { | 266 | { | 
| 254 | ftrace_func_t func; | 267 | ftrace_func_t func; | 
| @@ -270,7 +283,7 @@ static void update_ftrace_function(void) | |||
| 270 | * then have the mcount trampoline call the function directly. | 283 | * then have the mcount trampoline call the function directly. | 
| 271 | */ | 284 | */ | 
| 272 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 285 | } else if (ftrace_ops_list->next == &ftrace_list_end) { | 
| 273 | func = ftrace_ops_get_func(ftrace_ops_list); | 286 | func = ftrace_ops_get_list_func(ftrace_ops_list); | 
| 274 | 287 | ||
| 275 | } else { | 288 | } else { | 
| 276 | /* Just use the default ftrace_ops */ | 289 | /* Just use the default ftrace_ops */ | 
| @@ -1008,7 +1021,7 @@ static struct tracer_stat function_stats __initdata = { | |||
| 1008 | .stat_show = function_stat_show | 1021 | .stat_show = function_stat_show | 
| 1009 | }; | 1022 | }; | 
| 1010 | 1023 | ||
| 1011 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1024 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) | 
| 1012 | { | 1025 | { | 
| 1013 | struct ftrace_profile_stat *stat; | 1026 | struct ftrace_profile_stat *stat; | 
| 1014 | struct dentry *entry; | 1027 | struct dentry *entry; | 
| @@ -1044,15 +1057,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | |||
| 1044 | } | 1057 | } | 
| 1045 | } | 1058 | } | 
| 1046 | 1059 | ||
| 1047 | entry = debugfs_create_file("function_profile_enabled", 0644, | 1060 | entry = tracefs_create_file("function_profile_enabled", 0644, | 
| 1048 | d_tracer, NULL, &ftrace_profile_fops); | 1061 | d_tracer, NULL, &ftrace_profile_fops); | 
| 1049 | if (!entry) | 1062 | if (!entry) | 
| 1050 | pr_warning("Could not create debugfs " | 1063 | pr_warning("Could not create tracefs " | 
| 1051 | "'function_profile_enabled' entry\n"); | 1064 | "'function_profile_enabled' entry\n"); | 
| 1052 | } | 1065 | } | 
| 1053 | 1066 | ||
| 1054 | #else /* CONFIG_FUNCTION_PROFILER */ | 1067 | #else /* CONFIG_FUNCTION_PROFILER */ | 
| 1055 | static __init void ftrace_profile_debugfs(struct dentry *d_tracer) | 1068 | static __init void ftrace_profile_tracefs(struct dentry *d_tracer) | 
| 1056 | { | 1069 | { | 
| 1057 | } | 1070 | } | 
| 1058 | #endif /* CONFIG_FUNCTION_PROFILER */ | 1071 | #endif /* CONFIG_FUNCTION_PROFILER */ | 
| @@ -4712,7 +4725,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) | |||
| 4712 | mutex_unlock(&ftrace_lock); | 4725 | mutex_unlock(&ftrace_lock); | 
| 4713 | } | 4726 | } | 
| 4714 | 4727 | ||
| 4715 | static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | 4728 | static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) | 
| 4716 | { | 4729 | { | 
| 4717 | 4730 | ||
| 4718 | trace_create_file("available_filter_functions", 0444, | 4731 | trace_create_file("available_filter_functions", 0444, | 
| @@ -5020,7 +5033,7 @@ static int __init ftrace_nodyn_init(void) | |||
| 5020 | } | 5033 | } | 
| 5021 | core_initcall(ftrace_nodyn_init); | 5034 | core_initcall(ftrace_nodyn_init); | 
| 5022 | 5035 | ||
| 5023 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 5036 | static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } | 
| 5024 | static inline void ftrace_startup_enable(int command) { } | 5037 | static inline void ftrace_startup_enable(int command) { } | 
| 5025 | static inline void ftrace_startup_all(int command) { } | 5038 | static inline void ftrace_startup_all(int command) { } | 
| 5026 | /* Keep as macros so we do not need to define the commands */ | 5039 | /* Keep as macros so we do not need to define the commands */ | 
| @@ -5209,13 +5222,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, | |||
| 5209 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 5222 | ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) | 
| 5210 | { | 5223 | { | 
| 5211 | /* | 5224 | /* | 
| 5212 | * If this is a dynamic ops or we force list func, | ||
| 5213 | * then it needs to call the list anyway. | ||
| 5214 | */ | ||
| 5215 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) | ||
| 5216 | return ftrace_ops_list_func; | ||
| 5217 | |||
| 5218 | /* | ||
| 5219 | * If the func handles its own recursion, call it directly. | 5225 | * If the func handles its own recursion, call it directly. | 
| 5220 | * Otherwise call the recursion protected function that | 5226 | * Otherwise call the recursion protected function that | 
| 5221 | * will call the ftrace ops function. | 5227 | * will call the ftrace ops function. | 
| @@ -5473,7 +5479,7 @@ static const struct file_operations ftrace_pid_fops = { | |||
| 5473 | .release = ftrace_pid_release, | 5479 | .release = ftrace_pid_release, | 
| 5474 | }; | 5480 | }; | 
| 5475 | 5481 | ||
| 5476 | static __init int ftrace_init_debugfs(void) | 5482 | static __init int ftrace_init_tracefs(void) | 
| 5477 | { | 5483 | { | 
| 5478 | struct dentry *d_tracer; | 5484 | struct dentry *d_tracer; | 
| 5479 | 5485 | ||
| @@ -5481,16 +5487,16 @@ static __init int ftrace_init_debugfs(void) | |||
| 5481 | if (IS_ERR(d_tracer)) | 5487 | if (IS_ERR(d_tracer)) | 
| 5482 | return 0; | 5488 | return 0; | 
| 5483 | 5489 | ||
| 5484 | ftrace_init_dyn_debugfs(d_tracer); | 5490 | ftrace_init_dyn_tracefs(d_tracer); | 
| 5485 | 5491 | ||
| 5486 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 5492 | trace_create_file("set_ftrace_pid", 0644, d_tracer, | 
| 5487 | NULL, &ftrace_pid_fops); | 5493 | NULL, &ftrace_pid_fops); | 
| 5488 | 5494 | ||
| 5489 | ftrace_profile_debugfs(d_tracer); | 5495 | ftrace_profile_tracefs(d_tracer); | 
| 5490 | 5496 | ||
| 5491 | return 0; | 5497 | return 0; | 
| 5492 | } | 5498 | } | 
| 5493 | fs_initcall(ftrace_init_debugfs); | 5499 | fs_initcall(ftrace_init_tracefs); | 
| 5494 | 5500 | ||
| 5495 | /** | 5501 | /** | 
| 5496 | * ftrace_kill - kill ftrace | 5502 | * ftrace_kill - kill ftrace | 
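The ftrace.c hunks above move the "dynamic ops must go through the list function" test out of ftrace_ops_get_func() into the new ftrace_ops_get_list_func() helper used by update_ftrace_function(). A reduced model of the resulting dispatch decision; every name and flag value below is a stand-in, and the recursion guard is deliberately crude:

	#include <stdbool.h>
	#include <stdio.h>

	struct ops;
	typedef void (*trace_func_t)(unsigned long ip, struct ops *op);

	struct ops {
		unsigned int flags;
		trace_func_t func;
	};

	#define FL_DYNAMIC        0x1
	#define FL_RECURSION_SAFE 0x2

	static void list_func(unsigned long ip, struct ops *op)
	{
		/* would walk every registered ops; here there is just one */
		printf("list walk   ip=%#lx\n", ip);
		op->func(ip, op);
	}

	static int in_handler;            /* crude stand-in for the recursion test */

	static void recurs_wrapper(unsigned long ip, struct ops *op)
	{
		if (in_handler++)
			goto out;             /* already tracing: drop the nested call */
		op->func(ip, op);
	out:
		in_handler--;
	}

	static void my_handler(unsigned long ip, struct ops *op)
	{
		(void)op;
		printf("handler     ip=%#lx\n", ip);
	}

	static trace_func_t get_func(struct ops *op)
	{
		/* handler copes with recursion itself: call it directly */
		return (op->flags & FL_RECURSION_SAFE) ? op->func : recurs_wrapper;
	}

	static trace_func_t get_list_func(struct ops *op, bool force_list)
	{
		/* dynamic ops (or a forced build option) always take the list walker */
		return ((op->flags & FL_DYNAMIC) || force_list) ? list_func : get_func(op);
	}

	int main(void)
	{
		struct ops direct  = { .flags = FL_RECURSION_SAFE, .func = my_handler };
		struct ops dynamic = { .flags = FL_DYNAMIC,        .func = my_handler };

		get_list_func(&direct, false)(0x1000, &direct);
		get_list_func(&dynamic, false)(0x2000, &dynamic);
		return 0;
	}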
| diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); | |||
| 2679 | 2679 | ||
| 2680 | static __always_inline int trace_recursive_lock(void) | 2680 | static __always_inline int trace_recursive_lock(void) | 
| 2681 | { | 2681 | { | 
| 2682 | unsigned int val = this_cpu_read(current_context); | 2682 | unsigned int val = __this_cpu_read(current_context); | 
| 2683 | int bit; | 2683 | int bit; | 
| 2684 | 2684 | ||
| 2685 | if (in_interrupt()) { | 2685 | if (in_interrupt()) { | 
| @@ -2696,18 +2696,14 @@ static __always_inline int trace_recursive_lock(void) | |||
| 2696 | return 1; | 2696 | return 1; | 
| 2697 | 2697 | ||
| 2698 | val |= (1 << bit); | 2698 | val |= (1 << bit); | 
| 2699 | this_cpu_write(current_context, val); | 2699 | __this_cpu_write(current_context, val); | 
| 2700 | 2700 | ||
| 2701 | return 0; | 2701 | return 0; | 
| 2702 | } | 2702 | } | 
| 2703 | 2703 | ||
| 2704 | static __always_inline void trace_recursive_unlock(void) | 2704 | static __always_inline void trace_recursive_unlock(void) | 
| 2705 | { | 2705 | { | 
| 2706 | unsigned int val = this_cpu_read(current_context); | 2706 | __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); | 
| 2707 | |||
| 2708 | val--; | ||
| 2709 | val &= this_cpu_read(current_context); | ||
| 2710 | this_cpu_write(current_context, val); | ||
| 2711 | } | 2707 | } | 
| 2712 | 2708 | ||
| 2713 | #else | 2709 | #else | 
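The one-line trace_recursive_unlock() relies on the identity val & (val - 1), which clears the lowest set bit of val, i.e. the bit the matching trace_recursive_lock() set. A short worked example:

	#include <stdio.h>

	int main(void)
	{
		unsigned int val = 0x6;              /* bits 1 and 2 set */

		/* (val - 1) = 0x5; 0x6 & 0x5 = 0x4: only the lowest set bit is cleared */
		val &= val - 1;
		printf("0x%x\n", val);               /* prints 0x4 */
		return 0;
	}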
| diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 62c6506d663f..91eecaaa43e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/notifier.h> | 20 | #include <linux/notifier.h> | 
| 21 | #include <linux/irqflags.h> | 21 | #include <linux/irqflags.h> | 
| 22 | #include <linux/debugfs.h> | 22 | #include <linux/debugfs.h> | 
| 23 | #include <linux/tracefs.h> | ||
| 23 | #include <linux/pagemap.h> | 24 | #include <linux/pagemap.h> | 
| 24 | #include <linux/hardirq.h> | 25 | #include <linux/hardirq.h> | 
| 25 | #include <linux/linkage.h> | 26 | #include <linux/linkage.h> | 
| @@ -31,6 +32,7 @@ | |||
| 31 | #include <linux/splice.h> | 32 | #include <linux/splice.h> | 
| 32 | #include <linux/kdebug.h> | 33 | #include <linux/kdebug.h> | 
| 33 | #include <linux/string.h> | 34 | #include <linux/string.h> | 
| 35 | #include <linux/mount.h> | ||
| 34 | #include <linux/rwsem.h> | 36 | #include <linux/rwsem.h> | 
| 35 | #include <linux/slab.h> | 37 | #include <linux/slab.h> | 
| 36 | #include <linux/ctype.h> | 38 | #include <linux/ctype.h> | 
| @@ -123,6 +125,42 @@ enum ftrace_dump_mode ftrace_dump_on_oops; | |||
| 123 | /* When set, tracing will stop when a WARN*() is hit */ | 125 | /* When set, tracing will stop when a WARN*() is hit */ | 
| 124 | int __disable_trace_on_warning; | 126 | int __disable_trace_on_warning; | 
| 125 | 127 | ||
| 128 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
| 129 | /* Map of enums to their values, for "enum_map" file */ | ||
| 130 | struct trace_enum_map_head { | ||
| 131 | struct module *mod; | ||
| 132 | unsigned long length; | ||
| 133 | }; | ||
| 134 | |||
| 135 | union trace_enum_map_item; | ||
| 136 | |||
| 137 | struct trace_enum_map_tail { | ||
| 138 | /* | ||
| 139 | * "end" is first and points to NULL as it must be different | ||
| 140 | * than "mod" or "enum_string" | ||
| 141 | */ | ||
| 142 | union trace_enum_map_item *next; | ||
| 143 | const char *end; /* points to NULL */ | ||
| 144 | }; | ||
| 145 | |||
| 146 | static DEFINE_MUTEX(trace_enum_mutex); | ||
| 147 | |||
| 148 | /* | ||
| 149 | * The trace_enum_maps are saved in an array with two extra elements, | ||
| 150 | * one at the beginning, and one at the end. The beginning item contains | ||
| 151 | * the count of the saved maps (head.length), and the module they | ||
| 152 | * belong to if not built in (head.mod). The ending item contains a | ||
| 153 | * pointer to the next array of saved enum_map items. | ||
| 154 | */ | ||
| 155 | union trace_enum_map_item { | ||
| 156 | struct trace_enum_map map; | ||
| 157 | struct trace_enum_map_head head; | ||
| 158 | struct trace_enum_map_tail tail; | ||
| 159 | }; | ||
| 160 | |||
| 161 | static union trace_enum_map_item *trace_enum_maps; | ||
| 162 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 163 | |||
| 126 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 164 | static int tracing_set_tracer(struct trace_array *tr, const char *buf); | 
| 127 | 165 | ||
| 128 | #define MAX_TRACER_SIZE 100 | 166 | #define MAX_TRACER_SIZE 100 | 
| @@ -3908,6 +3946,182 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = { | |||
| 3908 | .write = tracing_saved_cmdlines_size_write, | 3946 | .write = tracing_saved_cmdlines_size_write, | 
| 3909 | }; | 3947 | }; | 
| 3910 | 3948 | ||
| 3949 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | ||
| 3950 | static union trace_enum_map_item * | ||
| 3951 | update_enum_map(union trace_enum_map_item *ptr) | ||
| 3952 | { | ||
| 3953 | if (!ptr->map.enum_string) { | ||
| 3954 | if (ptr->tail.next) { | ||
| 3955 | ptr = ptr->tail.next; | ||
| 3956 | /* Set ptr to the next real item (skip head) */ | ||
| 3957 | ptr++; | ||
| 3958 | } else | ||
| 3959 | return NULL; | ||
| 3960 | } | ||
| 3961 | return ptr; | ||
| 3962 | } | ||
| 3963 | |||
| 3964 | static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 3965 | { | ||
| 3966 | union trace_enum_map_item *ptr = v; | ||
| 3967 | |||
| 3968 | /* | ||
| 3969 | * Paranoid! If ptr points to end, we don't want to increment past it. | ||
| 3970 | * This really should never happen. | ||
| 3971 | */ | ||
| 3972 | ptr = update_enum_map(ptr); | ||
| 3973 | if (WARN_ON_ONCE(!ptr)) | ||
| 3974 | return NULL; | ||
| 3975 | |||
| 3976 | ptr++; | ||
| 3977 | |||
| 3978 | (*pos)++; | ||
| 3979 | |||
| 3980 | ptr = update_enum_map(ptr); | ||
| 3981 | |||
| 3982 | return ptr; | ||
| 3983 | } | ||
| 3984 | |||
| 3985 | static void *enum_map_start(struct seq_file *m, loff_t *pos) | ||
| 3986 | { | ||
| 3987 | union trace_enum_map_item *v; | ||
| 3988 | loff_t l = 0; | ||
| 3989 | |||
| 3990 | mutex_lock(&trace_enum_mutex); | ||
| 3991 | |||
| 3992 | v = trace_enum_maps; | ||
| 3993 | if (v) | ||
| 3994 | v++; | ||
| 3995 | |||
| 3996 | while (v && l < *pos) { | ||
| 3997 | v = enum_map_next(m, v, &l); | ||
| 3998 | } | ||
| 3999 | |||
| 4000 | return v; | ||
| 4001 | } | ||
| 4002 | |||
| 4003 | static void enum_map_stop(struct seq_file *m, void *v) | ||
| 4004 | { | ||
| 4005 | mutex_unlock(&trace_enum_mutex); | ||
| 4006 | } | ||
| 4007 | |||
| 4008 | static int enum_map_show(struct seq_file *m, void *v) | ||
| 4009 | { | ||
| 4010 | union trace_enum_map_item *ptr = v; | ||
| 4011 | |||
| 4012 | seq_printf(m, "%s %ld (%s)\n", | ||
| 4013 | ptr->map.enum_string, ptr->map.enum_value, | ||
| 4014 | ptr->map.system); | ||
| 4015 | |||
| 4016 | return 0; | ||
| 4017 | } | ||
| 4018 | |||
| 4019 | static const struct seq_operations tracing_enum_map_seq_ops = { | ||
| 4020 | .start = enum_map_start, | ||
| 4021 | .next = enum_map_next, | ||
| 4022 | .stop = enum_map_stop, | ||
| 4023 | .show = enum_map_show, | ||
| 4024 | }; | ||
| 4025 | |||
| 4026 | static int tracing_enum_map_open(struct inode *inode, struct file *filp) | ||
| 4027 | { | ||
| 4028 | if (tracing_disabled) | ||
| 4029 | return -ENODEV; | ||
| 4030 | |||
| 4031 | return seq_open(filp, &tracing_enum_map_seq_ops); | ||
| 4032 | } | ||
| 4033 | |||
| 4034 | static const struct file_operations tracing_enum_map_fops = { | ||
| 4035 | .open = tracing_enum_map_open, | ||
| 4036 | .read = seq_read, | ||
| 4037 | .llseek = seq_lseek, | ||
| 4038 | .release = seq_release, | ||
| 4039 | }; | ||
| 4040 | |||
| 4041 | static inline union trace_enum_map_item * | ||
| 4042 | trace_enum_jmp_to_tail(union trace_enum_map_item *ptr) | ||
| 4043 | { | ||
| 4044 | /* Return tail of array given the head */ | ||
| 4045 | return ptr + ptr->head.length + 1; | ||
| 4046 | } | ||
| 4047 | |||
| 4048 | static void | ||
| 4049 | trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, | ||
| 4050 | int len) | ||
| 4051 | { | ||
| 4052 | struct trace_enum_map **stop; | ||
| 4053 | struct trace_enum_map **map; | ||
| 4054 | union trace_enum_map_item *map_array; | ||
| 4055 | union trace_enum_map_item *ptr; | ||
| 4056 | |||
| 4057 | stop = start + len; | ||
| 4058 | |||
| 4059 | /* | ||
| 4060 | * The trace_enum_maps contains the map plus a head and tail item, | ||
| 4061 | * where the head holds the module and length of array, and the | ||
| 4062 | * tail holds a pointer to the next list. | ||
| 4063 | */ | ||
| 4064 | map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); | ||
| 4065 | if (!map_array) { | ||
| 4066 | pr_warning("Unable to allocate trace enum mapping\n"); | ||
| 4067 | return; | ||
| 4068 | } | ||
| 4069 | |||
| 4070 | mutex_lock(&trace_enum_mutex); | ||
| 4071 | |||
| 4072 | if (!trace_enum_maps) | ||
| 4073 | trace_enum_maps = map_array; | ||
| 4074 | else { | ||
| 4075 | ptr = trace_enum_maps; | ||
| 4076 | for (;;) { | ||
| 4077 | ptr = trace_enum_jmp_to_tail(ptr); | ||
| 4078 | if (!ptr->tail.next) | ||
| 4079 | break; | ||
| 4080 | ptr = ptr->tail.next; | ||
| 4081 | |||
| 4082 | } | ||
| 4083 | ptr->tail.next = map_array; | ||
| 4084 | } | ||
| 4085 | map_array->head.mod = mod; | ||
| 4086 | map_array->head.length = len; | ||
| 4087 | map_array++; | ||
| 4088 | |||
| 4089 | for (map = start; (unsigned long)map < (unsigned long)stop; map++) { | ||
| 4090 | map_array->map = **map; | ||
| 4091 | map_array++; | ||
| 4092 | } | ||
| 4093 | memset(map_array, 0, sizeof(*map_array)); | ||
| 4094 | |||
| 4095 | mutex_unlock(&trace_enum_mutex); | ||
| 4096 | } | ||
| 4097 | |||
| 4098 | static void trace_create_enum_file(struct dentry *d_tracer) | ||
| 4099 | { | ||
| 4100 | trace_create_file("enum_map", 0444, d_tracer, | ||
| 4101 | NULL, &tracing_enum_map_fops); | ||
| 4102 | } | ||
| 4103 | |||
| 4104 | #else /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 4105 | static inline void trace_create_enum_file(struct dentry *d_tracer) { } | ||
| 4106 | static inline void trace_insert_enum_map_file(struct module *mod, | ||
| 4107 | struct trace_enum_map **start, int len) { } | ||
| 4108 | #endif /* !CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 4109 | |||
| 4110 | static void trace_insert_enum_map(struct module *mod, | ||
| 4111 | struct trace_enum_map **start, int len) | ||
| 4112 | { | ||
| 4113 | struct trace_enum_map **map; | ||
| 4114 | |||
| 4115 | if (len <= 0) | ||
| 4116 | return; | ||
| 4117 | |||
| 4118 | map = start; | ||
| 4119 | |||
| 4120 | trace_event_enum_update(map, len); | ||
| 4121 | |||
| 4122 | trace_insert_enum_map_file(mod, start, len); | ||
| 4123 | } | ||
| 4124 | |||
| 3911 | static ssize_t | 4125 | static ssize_t | 
| 3912 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 4126 | tracing_set_trace_read(struct file *filp, char __user *ubuf, | 
| 3913 | size_t cnt, loff_t *ppos) | 4127 | size_t cnt, loff_t *ppos) | 
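trace_insert_enum_map_file() above lays each allocation out as one head item, the copied map items, and a zeroed tail item whose next pointer chains to the following allocation; the seq_file callbacks walk that chain, skipping heads, and enum_map_show() prints one "name value (system)" line per map. Member order matters: a tail's second pointer stays NULL, which is what the enum_string test keys off. A stand-alone model of the layout and the walk; the enum names, values and systems are invented:

	#include <stdio.h>
	#include <stdlib.h>

	union item;

	struct map  { const char *system; const char *enum_string; long enum_value; };
	struct head { void *mod; unsigned long length; };
	struct tail { union item *next; const char *end; };   /* end stays NULL */

	union item {
		struct map  map;
		struct head head;
		struct tail tail;
	};

	/* head + n map items + zeroed tail, mirroring kmalloc(len + 2) above */
	static union item *make_chunk(const struct map *maps, unsigned long n)
	{
		union item *arr = calloc(n + 2, sizeof(*arr));
		arr[0].head.length = n;
		for (unsigned long i = 0; i < n; i++)
			arr[1 + i].map = maps[i];
		return arr;
	}

	int main(void)
	{
		const struct map a[] = { { "foo", "FOO_IDLE", 0 }, { "foo", "FOO_BUSY", 1 } };
		const struct map b[] = { { "bar", "BAR_MAX",  4 } };

		union item *first  = make_chunk(a, 2);
		union item *second = make_chunk(b, 1);

		/* chain: tail of the first chunk points at the second chunk */
		first[1 + first[0].head.length].tail.next = second;

		/* walk: start past the head, hop chunks at each tail item */
		union item *ptr = first + 1;
		while (ptr) {
			if (!ptr->map.enum_string) {                 /* tail item reached */
				ptr = ptr->tail.next ? ptr->tail.next + 1 : NULL;
				continue;
			}
			printf("%s %ld (%s)\n", ptr->map.enum_string,
			       ptr->map.enum_value, ptr->map.system);
			ptr++;
		}
		free(first);
		free(second);
		return 0;
	}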
| @@ -4105,9 +4319,24 @@ static void tracing_set_nop(struct trace_array *tr) | |||
| 4105 | tr->current_trace = &nop_trace; | 4319 | tr->current_trace = &nop_trace; | 
| 4106 | } | 4320 | } | 
| 4107 | 4321 | ||
| 4108 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | 4322 | static void update_tracer_options(struct trace_array *tr, struct tracer *t) | 
| 4109 | { | 4323 | { | 
| 4110 | static struct trace_option_dentry *topts; | 4324 | static struct trace_option_dentry *topts; | 
| 4325 | |||
| 4326 | /* Only enable if the directory has been created already. */ | ||
| 4327 | if (!tr->dir) | ||
| 4328 | return; | ||
| 4329 | |||
| 4330 | /* Currently, only the top instance has options */ | ||
| 4331 | if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) | ||
| 4332 | return; | ||
| 4333 | |||
| 4334 | destroy_trace_option_files(topts); | ||
| 4335 | topts = create_trace_option_files(tr, t); | ||
| 4336 | } | ||
| 4337 | |||
| 4338 | static int tracing_set_tracer(struct trace_array *tr, const char *buf) | ||
| 4339 | { | ||
| 4111 | struct tracer *t; | 4340 | struct tracer *t; | 
| 4112 | #ifdef CONFIG_TRACER_MAX_TRACE | 4341 | #ifdef CONFIG_TRACER_MAX_TRACE | 
| 4113 | bool had_max_tr; | 4342 | bool had_max_tr; | 
| @@ -4172,11 +4401,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) | |||
| 4172 | free_snapshot(tr); | 4401 | free_snapshot(tr); | 
| 4173 | } | 4402 | } | 
| 4174 | #endif | 4403 | #endif | 
| 4175 | /* Currently, only the top instance has options */ | 4404 | update_tracer_options(tr, t); | 
| 4176 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { | ||
| 4177 | destroy_trace_option_files(topts); | ||
| 4178 | topts = create_trace_option_files(tr, t); | ||
| 4179 | } | ||
| 4180 | 4405 | ||
| 4181 | #ifdef CONFIG_TRACER_MAX_TRACE | 4406 | #ifdef CONFIG_TRACER_MAX_TRACE | 
| 4182 | if (t->use_max_tr && !had_max_tr) { | 4407 | if (t->use_max_tr && !had_max_tr) { | 
| @@ -5817,6 +6042,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } | |||
| 5817 | 6042 | ||
| 5818 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 6043 | static struct dentry *tracing_get_dentry(struct trace_array *tr) | 
| 5819 | { | 6044 | { | 
| 6045 | if (WARN_ON(!tr->dir)) | ||
| 6046 | return ERR_PTR(-ENODEV); | ||
| 6047 | |||
| 6048 | /* Top directory uses NULL as the parent */ | ||
| 6049 | if (tr->flags & TRACE_ARRAY_FL_GLOBAL) | ||
| 6050 | return NULL; | ||
| 6051 | |||
| 6052 | /* All sub buffers have a descriptor */ | ||
| 5820 | return tr->dir; | 6053 | return tr->dir; | 
| 5821 | } | 6054 | } | 
| 5822 | 6055 | ||
| @@ -5831,10 +6064,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) | |||
| 5831 | if (IS_ERR(d_tracer)) | 6064 | if (IS_ERR(d_tracer)) | 
| 5832 | return NULL; | 6065 | return NULL; | 
| 5833 | 6066 | ||
| 5834 | tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); | 6067 | tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); | 
| 5835 | 6068 | ||
| 5836 | WARN_ONCE(!tr->percpu_dir, | 6069 | WARN_ONCE(!tr->percpu_dir, | 
| 5837 | "Could not create debugfs directory 'per_cpu/%d'\n", cpu); | 6070 | "Could not create tracefs directory 'per_cpu/%d'\n", cpu); | 
| 5838 | 6071 | ||
| 5839 | return tr->percpu_dir; | 6072 | return tr->percpu_dir; | 
| 5840 | } | 6073 | } | 
| @@ -5851,7 +6084,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, | |||
| 5851 | } | 6084 | } | 
| 5852 | 6085 | ||
| 5853 | static void | 6086 | static void | 
| 5854 | tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | 6087 | tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) | 
| 5855 | { | 6088 | { | 
| 5856 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 6089 | struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); | 
| 5857 | struct dentry *d_cpu; | 6090 | struct dentry *d_cpu; | 
| @@ -5861,9 +6094,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) | |||
| 5861 | return; | 6094 | return; | 
| 5862 | 6095 | ||
| 5863 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 6096 | snprintf(cpu_dir, 30, "cpu%ld", cpu); | 
| 5864 | d_cpu = debugfs_create_dir(cpu_dir, d_percpu); | 6097 | d_cpu = tracefs_create_dir(cpu_dir, d_percpu); | 
| 5865 | if (!d_cpu) { | 6098 | if (!d_cpu) { | 
| 5866 | pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); | 6099 | pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); | 
| 5867 | return; | 6100 | return; | 
| 5868 | } | 6101 | } | 
| 5869 | 6102 | ||
| @@ -6015,9 +6248,9 @@ struct dentry *trace_create_file(const char *name, | |||
| 6015 | { | 6248 | { | 
| 6016 | struct dentry *ret; | 6249 | struct dentry *ret; | 
| 6017 | 6250 | ||
| 6018 | ret = debugfs_create_file(name, mode, parent, data, fops); | 6251 | ret = tracefs_create_file(name, mode, parent, data, fops); | 
| 6019 | if (!ret) | 6252 | if (!ret) | 
| 6020 | pr_warning("Could not create debugfs '%s' entry\n", name); | 6253 | pr_warning("Could not create tracefs '%s' entry\n", name); | 
| 6021 | 6254 | ||
| 6022 | return ret; | 6255 | return ret; | 
| 6023 | } | 6256 | } | 
| @@ -6034,9 +6267,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) | |||
| 6034 | if (IS_ERR(d_tracer)) | 6267 | if (IS_ERR(d_tracer)) | 
| 6035 | return NULL; | 6268 | return NULL; | 
| 6036 | 6269 | ||
| 6037 | tr->options = debugfs_create_dir("options", d_tracer); | 6270 | tr->options = tracefs_create_dir("options", d_tracer); | 
| 6038 | if (!tr->options) { | 6271 | if (!tr->options) { | 
| 6039 | pr_warning("Could not create debugfs directory 'options'\n"); | 6272 | pr_warning("Could not create tracefs directory 'options'\n"); | 
| 6040 | return NULL; | 6273 | return NULL; | 
| 6041 | } | 6274 | } | 
| 6042 | 6275 | ||
| @@ -6105,7 +6338,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) | |||
| 6105 | return; | 6338 | return; | 
| 6106 | 6339 | ||
| 6107 | for (cnt = 0; topts[cnt].opt; cnt++) | 6340 | for (cnt = 0; topts[cnt].opt; cnt++) | 
| 6108 | debugfs_remove(topts[cnt].entry); | 6341 | tracefs_remove(topts[cnt].entry); | 
| 6109 | 6342 | ||
| 6110 | kfree(topts); | 6343 | kfree(topts); | 
| 6111 | } | 6344 | } | 
| @@ -6194,7 +6427,7 @@ static const struct file_operations rb_simple_fops = { | |||
| 6194 | struct dentry *trace_instance_dir; | 6427 | struct dentry *trace_instance_dir; | 
| 6195 | 6428 | ||
| 6196 | static void | 6429 | static void | 
| 6197 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); | 6430 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); | 
| 6198 | 6431 | ||
| 6199 | static int | 6432 | static int | 
| 6200 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 6433 | allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) | 
| @@ -6271,7 +6504,7 @@ static void free_trace_buffers(struct trace_array *tr) | |||
| 6271 | #endif | 6504 | #endif | 
| 6272 | } | 6505 | } | 
| 6273 | 6506 | ||
| 6274 | static int new_instance_create(const char *name) | 6507 | static int instance_mkdir(const char *name) | 
| 6275 | { | 6508 | { | 
| 6276 | struct trace_array *tr; | 6509 | struct trace_array *tr; | 
| 6277 | int ret; | 6510 | int ret; | 
| @@ -6310,17 +6543,17 @@ static int new_instance_create(const char *name) | |||
| 6310 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 6543 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 
| 6311 | goto out_free_tr; | 6544 | goto out_free_tr; | 
| 6312 | 6545 | ||
| 6313 | tr->dir = debugfs_create_dir(name, trace_instance_dir); | 6546 | tr->dir = tracefs_create_dir(name, trace_instance_dir); | 
| 6314 | if (!tr->dir) | 6547 | if (!tr->dir) | 
| 6315 | goto out_free_tr; | 6548 | goto out_free_tr; | 
| 6316 | 6549 | ||
| 6317 | ret = event_trace_add_tracer(tr->dir, tr); | 6550 | ret = event_trace_add_tracer(tr->dir, tr); | 
| 6318 | if (ret) { | 6551 | if (ret) { | 
| 6319 | debugfs_remove_recursive(tr->dir); | 6552 | tracefs_remove_recursive(tr->dir); | 
| 6320 | goto out_free_tr; | 6553 | goto out_free_tr; | 
| 6321 | } | 6554 | } | 
| 6322 | 6555 | ||
| 6323 | init_tracer_debugfs(tr, tr->dir); | 6556 | init_tracer_tracefs(tr, tr->dir); | 
| 6324 | 6557 | ||
| 6325 | list_add(&tr->list, &ftrace_trace_arrays); | 6558 | list_add(&tr->list, &ftrace_trace_arrays); | 
| 6326 | 6559 | ||
| @@ -6341,7 +6574,7 @@ static int new_instance_create(const char *name) | |||
| 6341 | 6574 | ||
| 6342 | } | 6575 | } | 
| 6343 | 6576 | ||
| 6344 | static int instance_delete(const char *name) | 6577 | static int instance_rmdir(const char *name) | 
| 6345 | { | 6578 | { | 
| 6346 | struct trace_array *tr; | 6579 | struct trace_array *tr; | 
| 6347 | int found = 0; | 6580 | int found = 0; | 
| @@ -6382,82 +6615,17 @@ static int instance_delete(const char *name) | |||
| 6382 | return ret; | 6615 | return ret; | 
| 6383 | } | 6616 | } | 
| 6384 | 6617 | ||
| 6385 | static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) | ||
| 6386 | { | ||
| 6387 | struct dentry *parent; | ||
| 6388 | int ret; | ||
| 6389 | |||
| 6390 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
| 6391 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
| 6392 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
| 6393 | return -ENOENT; | ||
| 6394 | |||
| 6395 | /* | ||
| 6396 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
| 6397 | * take the mutex. As the instances directory can not be destroyed | ||
| 6398 | * or changed in any other way, it is safe to unlock it, and | ||
| 6399 | * let the dentry try. If two users try to make the same dir at | ||
| 6400 | * the same time, then the new_instance_create() will determine the | ||
| 6401 | * winner. | ||
| 6402 | */ | ||
| 6403 | mutex_unlock(&inode->i_mutex); | ||
| 6404 | |||
| 6405 | ret = new_instance_create(dentry->d_iname); | ||
| 6406 | |||
| 6407 | mutex_lock(&inode->i_mutex); | ||
| 6408 | |||
| 6409 | return ret; | ||
| 6410 | } | ||
| 6411 | |||
| 6412 | static int instance_rmdir(struct inode *inode, struct dentry *dentry) | ||
| 6413 | { | ||
| 6414 | struct dentry *parent; | ||
| 6415 | int ret; | ||
| 6416 | |||
| 6417 | /* Paranoid: Make sure the parent is the "instances" directory */ | ||
| 6418 | parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); | ||
| 6419 | if (WARN_ON_ONCE(parent != trace_instance_dir)) | ||
| 6420 | return -ENOENT; | ||
| 6421 | |||
| 6422 | /* The caller did a dget() on dentry */ | ||
| 6423 | mutex_unlock(&dentry->d_inode->i_mutex); | ||
| 6424 | |||
| 6425 | /* | ||
| 6426 | * The inode mutex is locked, but debugfs_create_dir() will also | ||
| 6427 | * take the mutex. As the instances directory can not be destroyed | ||
| 6428 | * or changed in any other way, it is safe to unlock it, and | ||
| 6429 | * let the dentry try. If two users try to make the same dir at | ||
| 6430 | * the same time, then the instance_delete() will determine the | ||
| 6431 | * winner. | ||
| 6432 | */ | ||
| 6433 | mutex_unlock(&inode->i_mutex); | ||
| 6434 | |||
| 6435 | ret = instance_delete(dentry->d_iname); | ||
| 6436 | |||
| 6437 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); | ||
| 6438 | mutex_lock(&dentry->d_inode->i_mutex); | ||
| 6439 | |||
| 6440 | return ret; | ||
| 6441 | } | ||
| 6442 | |||
| 6443 | static const struct inode_operations instance_dir_inode_operations = { | ||
| 6444 | .lookup = simple_lookup, | ||
| 6445 | .mkdir = instance_mkdir, | ||
| 6446 | .rmdir = instance_rmdir, | ||
| 6447 | }; | ||
| 6448 | |||
| 6449 | static __init void create_trace_instances(struct dentry *d_tracer) | 6618 | static __init void create_trace_instances(struct dentry *d_tracer) | 
| 6450 | { | 6619 | { | 
| 6451 | trace_instance_dir = debugfs_create_dir("instances", d_tracer); | 6620 | trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, | 
| 6621 | instance_mkdir, | ||
| 6622 | instance_rmdir); | ||
| 6452 | if (WARN_ON(!trace_instance_dir)) | 6623 | if (WARN_ON(!trace_instance_dir)) | 
| 6453 | return; | 6624 | return; | 
| 6454 | |||
| 6455 | /* Hijack the dir inode operations, to allow mkdir */ | ||
| 6456 | trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; | ||
| 6457 | } | 6625 | } | 
| 6458 | 6626 | ||
| 6459 | static void | 6627 | static void | 
| 6460 | init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | 6628 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | 
| 6461 | { | 6629 | { | 
| 6462 | int cpu; | 6630 | int cpu; | 
| 6463 | 6631 | ||
| @@ -6511,10 +6679,32 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 6511 | #endif | 6679 | #endif | 
| 6512 | 6680 | ||
| 6513 | for_each_tracing_cpu(cpu) | 6681 | for_each_tracing_cpu(cpu) | 
| 6514 | tracing_init_debugfs_percpu(tr, cpu); | 6682 | tracing_init_tracefs_percpu(tr, cpu); | 
| 6515 | 6683 | ||
| 6516 | } | 6684 | } | 
| 6517 | 6685 | ||
| 6686 | static struct vfsmount *trace_automount(void *ignore) | ||
| 6687 | { | ||
| 6688 | struct vfsmount *mnt; | ||
| 6689 | struct file_system_type *type; | ||
| 6690 | |||
| 6691 | /* | ||
| 6692 | * To maintain backward compatibility for tools that mount | ||
| 6693 | * debugfs to get to the tracing facility, tracefs is automatically | ||
| 6694 | * mounted to the debugfs/tracing directory. | ||
| 6695 | */ | ||
| 6696 | type = get_fs_type("tracefs"); | ||
| 6697 | if (!type) | ||
| 6698 | return NULL; | ||
| 6699 | mnt = vfs_kern_mount(type, 0, "tracefs", NULL); | ||
| 6700 | put_filesystem(type); | ||
| 6701 | if (IS_ERR(mnt)) | ||
| 6702 | return NULL; | ||
| 6703 | mntget(mnt); | ||
| 6704 | |||
| 6705 | return mnt; | ||
| 6706 | } | ||
| 6707 | |||
| 6518 | /** | 6708 | /** | 
| 6519 | * tracing_init_dentry - initialize top level trace array | 6709 | * tracing_init_dentry - initialize top level trace array | 
| 6520 | * | 6710 | * | 
| @@ -6526,23 +6716,112 @@ struct dentry *tracing_init_dentry(void) | |||
| 6526 | { | 6716 | { | 
| 6527 | struct trace_array *tr = &global_trace; | 6717 | struct trace_array *tr = &global_trace; | 
| 6528 | 6718 | ||
| 6719 | /* The top level trace array uses NULL as parent */ | ||
| 6529 | if (tr->dir) | 6720 | if (tr->dir) | 
| 6530 | return tr->dir; | 6721 | return NULL; | 
| 6531 | 6722 | ||
| 6532 | if (WARN_ON(!debugfs_initialized())) | 6723 | if (WARN_ON(!debugfs_initialized())) | 
| 6533 | return ERR_PTR(-ENODEV); | 6724 | return ERR_PTR(-ENODEV); | 
| 6534 | 6725 | ||
| 6535 | tr->dir = debugfs_create_dir("tracing", NULL); | 6726 | /* | 
| 6536 | 6727 | * As there may still be users that expect the tracing | |
| 6728 | * files to exist in debugfs/tracing, we must automount | ||
| 6729 | * the tracefs file system there, so older tools still | ||
| 6730 | * work with the newer kerenl. | ||
| 6731 | */ | ||
| 6732 | tr->dir = debugfs_create_automount("tracing", NULL, | ||
| 6733 | trace_automount, NULL); | ||
| 6537 | if (!tr->dir) { | 6734 | if (!tr->dir) { | 
| 6538 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 6735 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | 
| 6539 | return ERR_PTR(-ENOMEM); | 6736 | return ERR_PTR(-ENOMEM); | 
| 6540 | } | 6737 | } | 
| 6541 | 6738 | ||
| 6542 | return tr->dir; | 6739 | return NULL; | 
| 6740 | } | ||
| 6741 | |||
| 6742 | extern struct trace_enum_map *__start_ftrace_enum_maps[]; | ||
| 6743 | extern struct trace_enum_map *__stop_ftrace_enum_maps[]; | ||
| 6744 | |||
| 6745 | static void __init trace_enum_init(void) | ||
| 6746 | { | ||
| 6747 | int len; | ||
| 6748 | |||
| 6749 | len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps; | ||
| 6750 | trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len); | ||
| 6751 | } | ||
| 6752 | |||
| 6753 | #ifdef CONFIG_MODULES | ||
| 6754 | static void trace_module_add_enums(struct module *mod) | ||
| 6755 | { | ||
| 6756 | if (!mod->num_trace_enums) | ||
| 6757 | return; | ||
| 6758 | |||
| 6759 | /* | ||
| 6760 | * Modules with bad taint do not have events created, so do | ||
| 6761 | * not bother with enums either. | ||
| 6762 | */ | ||
| 6763 | if (trace_module_has_bad_taint(mod)) | ||
| 6764 | return; | ||
| 6765 | |||
| 6766 | trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums); | ||
| 6543 | } | 6767 | } | 
| 6544 | 6768 | ||
| 6545 | static __init int tracer_init_debugfs(void) | 6769 | #ifdef CONFIG_TRACE_ENUM_MAP_FILE | 
| 6770 | static void trace_module_remove_enums(struct module *mod) | ||
| 6771 | { | ||
| 6772 | union trace_enum_map_item *map; | ||
| 6773 | union trace_enum_map_item **last = &trace_enum_maps; | ||
| 6774 | |||
| 6775 | if (!mod->num_trace_enums) | ||
| 6776 | return; | ||
| 6777 | |||
| 6778 | mutex_lock(&trace_enum_mutex); | ||
| 6779 | |||
| 6780 | map = trace_enum_maps; | ||
| 6781 | |||
| 6782 | while (map) { | ||
| 6783 | if (map->head.mod == mod) | ||
| 6784 | break; | ||
| 6785 | map = trace_enum_jmp_to_tail(map); | ||
| 6786 | last = &map->tail.next; | ||
| 6787 | map = map->tail.next; | ||
| 6788 | } | ||
| 6789 | if (!map) | ||
| 6790 | goto out; | ||
| 6791 | |||
| 6792 | *last = trace_enum_jmp_to_tail(map)->tail.next; | ||
| 6793 | kfree(map); | ||
| 6794 | out: | ||
| 6795 | mutex_unlock(&trace_enum_mutex); | ||
| 6796 | } | ||
| 6797 | #else | ||
| 6798 | static inline void trace_module_remove_enums(struct module *mod) { } | ||
| 6799 | #endif /* CONFIG_TRACE_ENUM_MAP_FILE */ | ||
| 6800 | |||
| 6801 | static int trace_module_notify(struct notifier_block *self, | ||
| 6802 | unsigned long val, void *data) | ||
| 6803 | { | ||
| 6804 | struct module *mod = data; | ||
| 6805 | |||
| 6806 | switch (val) { | ||
| 6807 | case MODULE_STATE_COMING: | ||
| 6808 | trace_module_add_enums(mod); | ||
| 6809 | break; | ||
| 6810 | case MODULE_STATE_GOING: | ||
| 6811 | trace_module_remove_enums(mod); | ||
| 6812 | break; | ||
| 6813 | } | ||
| 6814 | |||
| 6815 | return 0; | ||
| 6816 | } | ||
| 6817 | |||
| 6818 | static struct notifier_block trace_module_nb = { | ||
| 6819 | .notifier_call = trace_module_notify, | ||
| 6820 | .priority = 0, | ||
| 6821 | }; | ||
| 6822 | #endif /* CONFIG_MODULES */ | ||
| 6823 | |||
| 6824 | static __init int tracer_init_tracefs(void) | ||
| 6546 | { | 6825 | { | 
| 6547 | struct dentry *d_tracer; | 6826 | struct dentry *d_tracer; | 
| 6548 | 6827 | ||
| @@ -6552,7 +6831,7 @@ static __init int tracer_init_debugfs(void) | |||
| 6552 | if (IS_ERR(d_tracer)) | 6831 | if (IS_ERR(d_tracer)) | 
| 6553 | return 0; | 6832 | return 0; | 
| 6554 | 6833 | ||
| 6555 | init_tracer_debugfs(&global_trace, d_tracer); | 6834 | init_tracer_tracefs(&global_trace, d_tracer); | 
| 6556 | 6835 | ||
| 6557 | trace_create_file("tracing_thresh", 0644, d_tracer, | 6836 | trace_create_file("tracing_thresh", 0644, d_tracer, | 
| 6558 | &global_trace, &tracing_thresh_fops); | 6837 | &global_trace, &tracing_thresh_fops); | 
| @@ -6566,6 +6845,14 @@ static __init int tracer_init_debugfs(void) | |||
| 6566 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 6845 | trace_create_file("saved_cmdlines_size", 0644, d_tracer, | 
| 6567 | NULL, &tracing_saved_cmdlines_size_fops); | 6846 | NULL, &tracing_saved_cmdlines_size_fops); | 
| 6568 | 6847 | ||
| 6848 | trace_enum_init(); | ||
| 6849 | |||
| 6850 | trace_create_enum_file(d_tracer); | ||
| 6851 | |||
| 6852 | #ifdef CONFIG_MODULES | ||
| 6853 | register_module_notifier(&trace_module_nb); | ||
| 6854 | #endif | ||
| 6855 | |||
| 6569 | #ifdef CONFIG_DYNAMIC_FTRACE | 6856 | #ifdef CONFIG_DYNAMIC_FTRACE | 
| 6570 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 6857 | trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, | 
| 6571 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 6858 | &ftrace_update_tot_cnt, &tracing_dyn_info_fops); | 
| @@ -6575,6 +6862,10 @@ static __init int tracer_init_debugfs(void) | |||
| 6575 | 6862 | ||
| 6576 | create_trace_options_dir(&global_trace); | 6863 | create_trace_options_dir(&global_trace); | 
| 6577 | 6864 | ||
| 6865 | /* If the tracer was started via cmdline, create options for it here */ | ||
| 6866 | if (global_trace.current_trace != &nop_trace) | ||
| 6867 | update_tracer_options(&global_trace, global_trace.current_trace); | ||
| 6868 | |||
| 6578 | return 0; | 6869 | return 0; | 
| 6579 | } | 6870 | } | 
| 6580 | 6871 | ||
| @@ -6888,7 +7179,7 @@ void __init trace_init(void) | |||
| 6888 | tracepoint_printk = 0; | 7179 | tracepoint_printk = 0; | 
| 6889 | } | 7180 | } | 
| 6890 | tracer_alloc_buffers(); | 7181 | tracer_alloc_buffers(); | 
| 6891 | trace_event_init(); | 7182 | trace_event_init(); | 
| 6892 | } | 7183 | } | 
| 6893 | 7184 | ||
| 6894 | __init static int clear_boot_tracer(void) | 7185 | __init static int clear_boot_tracer(void) | 
| @@ -6910,5 +7201,5 @@ __init static int clear_boot_tracer(void) | |||
| 6910 | return 0; | 7201 | return 0; | 
| 6911 | } | 7202 | } | 
| 6912 | 7203 | ||
| 6913 | fs_initcall(tracer_init_debugfs); | 7204 | fs_initcall(tracer_init_tracefs); | 
| 6914 | late_initcall(clear_boot_tracer); | 7205 | late_initcall(clear_boot_tracer); | 
| diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d2612016de94 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -334,7 +334,7 @@ struct tracer_flags { | |||
| 334 | 334 | ||
| 335 | 335 | ||
| 336 | /** | 336 | /** | 
| 337 | * struct tracer - a specific tracer and its callbacks to interact with debugfs | 337 | * struct tracer - a specific tracer and its callbacks to interact with tracefs | 
| 338 | * @name: the name chosen to select it on the available_tracers file | 338 | * @name: the name chosen to select it on the available_tracers file | 
| 339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 339 | * @init: called when one switches to this tracer (echo name > current_tracer) | 
| 340 | * @reset: called when one switches to another tracer | 340 | * @reset: called when one switches to another tracer | 
| @@ -1309,8 +1309,10 @@ static inline void init_ftrace_syscalls(void) { } | |||
| 1309 | 1309 | ||
| 1310 | #ifdef CONFIG_EVENT_TRACING | 1310 | #ifdef CONFIG_EVENT_TRACING | 
| 1311 | void trace_event_init(void); | 1311 | void trace_event_init(void); | 
| 1312 | void trace_event_enum_update(struct trace_enum_map **map, int len); | ||
| 1312 | #else | 1313 | #else | 
| 1313 | static inline void __init trace_event_init(void) { } | 1314 | static inline void __init trace_event_init(void) { } | 
| 1315 | static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { } | ||
| 1314 | #endif | 1316 | #endif | 
| 1315 | 1317 | ||
| 1316 | extern struct trace_iterator *tracepoint_print_iter; | 1318 | extern struct trace_iterator *tracepoint_print_iter; | 
| diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, | |||
| 223 | __dynamic_array( u32, buf ) | 223 | __dynamic_array( u32, buf ) | 
| 224 | ), | 224 | ), | 
| 225 | 225 | ||
| 226 | F_printk("%pf: %s", | 226 | F_printk("%ps: %s", | 
| 227 | (void *)__entry->ip, __entry->fmt), | 227 | (void *)__entry->ip, __entry->fmt), | 
| 228 | 228 | ||
| 229 | FILTER_OTHER | 229 | FILTER_OTHER | 
| @@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, | |||
| 238 | __dynamic_array( char, buf ) | 238 | __dynamic_array( char, buf ) | 
| 239 | ), | 239 | ), | 
| 240 | 240 | ||
| 241 | F_printk("%pf: %s", | 241 | F_printk("%ps: %s", | 
| 242 | (void *)__entry->ip, __entry->buf), | 242 | (void *)__entry->ip, __entry->buf), | 
| 243 | 243 | ||
| 244 | FILTER_OTHER | 244 | FILTER_OTHER | 
| @@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, | |||
| 253 | __field( const char *, str ) | 253 | __field( const char *, str ) | 
| 254 | ), | 254 | ), | 
| 255 | 255 | ||
| 256 | F_printk("%pf: %s", | 256 | F_printk("%ps: %s", | 
| 257 | (void *)__entry->ip, __entry->str), | 257 | (void *)__entry->ip, __entry->str), | 
| 258 | 258 | ||
| 259 | FILTER_OTHER | 259 | FILTER_OTHER | 
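The three F_printk() changes above switch the specifier from %pf to %ps because the stored ip is a plain code address, not a function descriptor: %ps/%pS resolve a symbol directly from such an address, while %pf/%pF dereference a descriptor on ia64, ppc64 ABIv1 and parisc. A small hedged illustration; show_caller() is an invented helper, not from the patch:

#include <linux/kernel.h>
#include <linux/printk.h>

/* Invented helper, only to contrast the two specifiers. */
static void show_caller(void)
{
	/*
	 * _RET_IP_ is a raw text address (the caller's return address).
	 * %pS looks up the symbol for such an address; %pF/%pf expect a
	 * function descriptor and are only right for function pointers
	 * on descriptor ABIs (ia64, ppc64 ABIv1, parisc).
	 */
	pr_info("entered from %pS\n", (void *)_RET_IP_);
}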
| diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..7da1dfeb322e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -13,7 +13,7 @@ | |||
| 13 | #include <linux/workqueue.h> | 13 | #include <linux/workqueue.h> | 
| 14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> | 
| 15 | #include <linux/kthread.h> | 15 | #include <linux/kthread.h> | 
| 16 | #include <linux/debugfs.h> | 16 | #include <linux/tracefs.h> | 
| 17 | #include <linux/uaccess.h> | 17 | #include <linux/uaccess.h> | 
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> | 
| 19 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> | 
| @@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) | |||
| 480 | return; | 480 | return; | 
| 481 | 481 | ||
| 482 | if (!--dir->nr_events) { | 482 | if (!--dir->nr_events) { | 
| 483 | debugfs_remove_recursive(dir->entry); | 483 | tracefs_remove_recursive(dir->entry); | 
| 484 | list_del(&dir->list); | 484 | list_del(&dir->list); | 
| 485 | __put_system_dir(dir); | 485 | __put_system_dir(dir); | 
| 486 | } | 486 | } | 
| @@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) | |||
| 499 | } | 499 | } | 
| 500 | spin_unlock(&dir->d_lock); | 500 | spin_unlock(&dir->d_lock); | 
| 501 | 501 | ||
| 502 | debugfs_remove_recursive(dir); | 502 | tracefs_remove_recursive(dir); | 
| 503 | } | 503 | } | 
| 504 | 504 | ||
| 505 | list_del(&file->list); | 505 | list_del(&file->list); | 
| @@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
| 1526 | } else | 1526 | } else | 
| 1527 | __get_system(system); | 1527 | __get_system(system); | 
| 1528 | 1528 | ||
| 1529 | dir->entry = debugfs_create_dir(name, parent); | 1529 | dir->entry = tracefs_create_dir(name, parent); | 
| 1530 | if (!dir->entry) { | 1530 | if (!dir->entry) { | 
| 1531 | pr_warn("Failed to create system directory %s\n", name); | 1531 | pr_warn("Failed to create system directory %s\n", name); | 
| 1532 | __put_system(system); | 1532 | __put_system(system); | 
| @@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, | |||
| 1539 | dir->subsystem = system; | 1539 | dir->subsystem = system; | 
| 1540 | file->system = dir; | 1540 | file->system = dir; | 
| 1541 | 1541 | ||
| 1542 | entry = debugfs_create_file("filter", 0644, dir->entry, dir, | 1542 | entry = tracefs_create_file("filter", 0644, dir->entry, dir, | 
| 1543 | &ftrace_subsystem_filter_fops); | 1543 | &ftrace_subsystem_filter_fops); | 
| 1544 | if (!entry) { | 1544 | if (!entry) { | 
| 1545 | kfree(system->filter); | 1545 | kfree(system->filter); | 
| 1546 | system->filter = NULL; | 1546 | system->filter = NULL; | 
| 1547 | pr_warn("Could not create debugfs '%s/filter' entry\n", name); | 1547 | pr_warn("Could not create tracefs '%s/filter' entry\n", name); | 
| 1548 | } | 1548 | } | 
| 1549 | 1549 | ||
| 1550 | trace_create_file("enable", 0644, dir->entry, dir, | 1550 | trace_create_file("enable", 0644, dir->entry, dir, | 
| @@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) | |||
| 1585 | d_events = parent; | 1585 | d_events = parent; | 
| 1586 | 1586 | ||
| 1587 | name = ftrace_event_name(call); | 1587 | name = ftrace_event_name(call); | 
| 1588 | file->dir = debugfs_create_dir(name, d_events); | 1588 | file->dir = tracefs_create_dir(name, d_events); | 
| 1589 | if (!file->dir) { | 1589 | if (!file->dir) { | 
| 1590 | pr_warn("Could not create debugfs '%s' directory\n", name); | 1590 | pr_warn("Could not create tracefs '%s' directory\n", name); | 
| 1591 | return -1; | 1591 | return -1; | 
| 1592 | } | 1592 | } | 
| 1593 | 1593 | ||
| @@ -1704,6 +1704,125 @@ __register_event(struct ftrace_event_call *call, struct module *mod) | |||
| 1704 | return 0; | 1704 | return 0; | 
| 1705 | } | 1705 | } | 
| 1706 | 1706 | ||
| 1707 | static char *enum_replace(char *ptr, struct trace_enum_map *map, int len) | ||
| 1708 | { | ||
| 1709 | int rlen; | ||
| 1710 | int elen; | ||
| 1711 | |||
| 1712 | /* Find the length of the enum value as a string */ | ||
| 1713 | elen = snprintf(ptr, 0, "%ld", map->enum_value); | ||
| 1714 | /* Make sure there's enough room to replace the string with the value */ | ||
| 1715 | if (len < elen) | ||
| 1716 | return NULL; | ||
| 1717 | |||
| 1718 | snprintf(ptr, elen + 1, "%ld", map->enum_value); | ||
| 1719 | |||
| 1720 | /* Get the rest of the string of ptr */ | ||
| 1721 | rlen = strlen(ptr + len); | ||
| 1722 | memmove(ptr + elen, ptr + len, rlen); | ||
| 1723 | /* Make sure we end the new string */ | ||
| 1724 | ptr[elen + rlen] = 0; | ||
| 1725 | |||
| 1726 | return ptr + elen; | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | static void update_event_printk(struct ftrace_event_call *call, | ||
| 1730 | struct trace_enum_map *map) | ||
| 1731 | { | ||
| 1732 | char *ptr; | ||
| 1733 | int quote = 0; | ||
| 1734 | int len = strlen(map->enum_string); | ||
| 1735 | |||
| 1736 | for (ptr = call->print_fmt; *ptr; ptr++) { | ||
| 1737 | if (*ptr == '\\') { | ||
| 1738 | ptr++; | ||
| 1739 | /* paranoid */ | ||
| 1740 | if (!*ptr) | ||
| 1741 | break; | ||
| 1742 | continue; | ||
| 1743 | } | ||
| 1744 | if (*ptr == '"') { | ||
| 1745 | quote ^= 1; | ||
| 1746 | continue; | ||
| 1747 | } | ||
| 1748 | if (quote) | ||
| 1749 | continue; | ||
| 1750 | if (isdigit(*ptr)) { | ||
| 1751 | /* skip numbers */ | ||
| 1752 | do { | ||
| 1753 | ptr++; | ||
| 1754 | /* Check for alpha chars like ULL */ | ||
| 1755 | } while (isalnum(*ptr)); | ||
| 1756 | /* | ||
| 1757 | * A number must have some kind of delimiter after | ||
| 1758 | * it, and we can ignore that too. | ||
| 1759 | */ | ||
| 1760 | continue; | ||
| 1761 | } | ||
| 1762 | if (isalpha(*ptr) || *ptr == '_') { | ||
| 1763 | if (strncmp(map->enum_string, ptr, len) == 0 && | ||
| 1764 | !isalnum(ptr[len]) && ptr[len] != '_') { | ||
| 1765 | ptr = enum_replace(ptr, map, len); | ||
| 1766 | /* Hmm, enum string smaller than value */ | ||
| 1767 | if (WARN_ON_ONCE(!ptr)) | ||
| 1768 | return; | ||
| 1769 | /* | ||
| 1770 | * No need to decrement here, as enum_replace() | ||
| 1771 | * returns the pointer to the character passed | ||
| 1772 | * the enum, and two enums can not be placed | ||
| 1773 | * back to back without something in between. | ||
| 1774 | * We can skip that something in between. | ||
| 1775 | */ | ||
| 1776 | continue; | ||
| 1777 | } | ||
| 1778 | skip_more: | ||
| 1779 | do { | ||
| 1780 | ptr++; | ||
| 1781 | } while (isalnum(*ptr) || *ptr == '_'); | ||
| 1782 | /* | ||
| 1783 | * If what comes after this variable is a '.' or | ||
| 1784 | * '->' then we can continue to ignore that string. | ||
| 1785 | */ | ||
| 1786 | if (*ptr == '.' || (ptr[0] == '-' && ptr[1] == '>')) { | ||
| 1787 | ptr += *ptr == '.' ? 1 : 2; | ||
| 1788 | goto skip_more; | ||
| 1789 | } | ||
| 1790 | /* | ||
| 1791 | * Once again, we can skip the delimiter that came | ||
| 1792 | * after the string. | ||
| 1793 | */ | ||
| 1794 | continue; | ||
| 1795 | } | ||
| 1796 | } | ||
| 1797 | } | ||
| 1798 | |||
| 1799 | void trace_event_enum_update(struct trace_enum_map **map, int len) | ||
| 1800 | { | ||
| 1801 | struct ftrace_event_call *call, *p; | ||
| 1802 | const char *last_system = NULL; | ||
| 1803 | int last_i; | ||
| 1804 | int i; | ||
| 1805 | |||
| 1806 | down_write(&trace_event_sem); | ||
| 1807 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | ||
| 1808 | /* events are usually grouped together with systems */ | ||
| 1809 | if (!last_system || call->class->system != last_system) { | ||
| 1810 | last_i = 0; | ||
| 1811 | last_system = call->class->system; | ||
| 1812 | } | ||
| 1813 | |||
| 1814 | for (i = last_i; i < len; i++) { | ||
| 1815 | if (call->class->system == map[i]->system) { | ||
| 1816 | /* Save the first system if need be */ | ||
| 1817 | if (!last_i) | ||
| 1818 | last_i = i; | ||
| 1819 | update_event_printk(call, map[i]); | ||
| 1820 | } | ||
| 1821 | } | ||
| 1822 | } | ||
| 1823 | up_write(&trace_event_sem); | ||
| 1824 | } | ||
| 1825 | |||
| 1707 | static struct ftrace_event_file * | 1826 | static struct ftrace_event_file * | 
| 1708 | trace_create_new_event(struct ftrace_event_call *call, | 1827 | trace_create_new_event(struct ftrace_event_call *call, | 
| 1709 | struct trace_array *tr) | 1828 | struct trace_array *tr) | 
| @@ -1915,7 +2034,7 @@ static int trace_module_notify(struct notifier_block *self, | |||
| 1915 | 2034 | ||
| 1916 | static struct notifier_block trace_module_nb = { | 2035 | static struct notifier_block trace_module_nb = { | 
| 1917 | .notifier_call = trace_module_notify, | 2036 | .notifier_call = trace_module_notify, | 
| 1918 | .priority = 0, | 2037 | .priority = 1, /* higher than trace.c module notify */ | 
| 1919 | }; | 2038 | }; | 
| 1920 | #endif /* CONFIG_MODULES */ | 2039 | #endif /* CONFIG_MODULES */ | 
| 1921 | 2040 | ||
| @@ -2228,7 +2347,7 @@ static inline int register_event_cmds(void) { return 0; } | |||
| 2228 | /* | 2347 | /* | 
| 2229 | * The top level array has already had its ftrace_event_file | 2348 | * The top level array has already had its ftrace_event_file | 
| 2230 | * descriptors created in order to allow for early events to | 2349 | * descriptors created in order to allow for early events to | 
| 2231 | * be recorded. This function is called after the debugfs has been | 2350 | * be recorded. This function is called after the tracefs has been | 
| 2232 | * initialized, and we now have to create the files associated | 2351 | * initialized, and we now have to create the files associated | 
| 2233 | * to the events. | 2352 | * to the events. | 
| 2234 | */ | 2353 | */ | 
| @@ -2311,16 +2430,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) | |||
| 2311 | struct dentry *d_events; | 2430 | struct dentry *d_events; | 
| 2312 | struct dentry *entry; | 2431 | struct dentry *entry; | 
| 2313 | 2432 | ||
| 2314 | entry = debugfs_create_file("set_event", 0644, parent, | 2433 | entry = tracefs_create_file("set_event", 0644, parent, | 
| 2315 | tr, &ftrace_set_event_fops); | 2434 | tr, &ftrace_set_event_fops); | 
| 2316 | if (!entry) { | 2435 | if (!entry) { | 
| 2317 | pr_warn("Could not create debugfs 'set_event' entry\n"); | 2436 | pr_warn("Could not create tracefs 'set_event' entry\n"); | 
| 2318 | return -ENOMEM; | 2437 | return -ENOMEM; | 
| 2319 | } | 2438 | } | 
| 2320 | 2439 | ||
| 2321 | d_events = debugfs_create_dir("events", parent); | 2440 | d_events = tracefs_create_dir("events", parent); | 
| 2322 | if (!d_events) { | 2441 | if (!d_events) { | 
| 2323 | pr_warn("Could not create debugfs 'events' directory\n"); | 2442 | pr_warn("Could not create tracefs 'events' directory\n"); | 
| 2324 | return -ENOMEM; | 2443 | return -ENOMEM; | 
| 2325 | } | 2444 | } | 
| 2326 | 2445 | ||
| @@ -2412,7 +2531,7 @@ int event_trace_del_tracer(struct trace_array *tr) | |||
| 2412 | 2531 | ||
| 2413 | down_write(&trace_event_sem); | 2532 | down_write(&trace_event_sem); | 
| 2414 | __trace_remove_event_dirs(tr); | 2533 | __trace_remove_event_dirs(tr); | 
| 2415 | debugfs_remove_recursive(tr->event_dir); | 2534 | tracefs_remove_recursive(tr->event_dir); | 
| 2416 | up_write(&trace_event_sem); | 2535 | up_write(&trace_event_sem); | 
| 2417 | 2536 | ||
| 2418 | tr->event_dir = NULL; | 2537 | tr->event_dir = NULL; | 
| @@ -2534,10 +2653,10 @@ static __init int event_trace_init(void) | |||
| 2534 | if (IS_ERR(d_tracer)) | 2653 | if (IS_ERR(d_tracer)) | 
| 2535 | return 0; | 2654 | return 0; | 
| 2536 | 2655 | ||
| 2537 | entry = debugfs_create_file("available_events", 0444, d_tracer, | 2656 | entry = tracefs_create_file("available_events", 0444, d_tracer, | 
| 2538 | tr, &ftrace_avail_fops); | 2657 | tr, &ftrace_avail_fops); | 
| 2539 | if (!entry) | 2658 | if (!entry) | 
| 2540 | pr_warn("Could not create debugfs 'available_events' entry\n"); | 2659 | pr_warn("Could not create tracefs 'available_events' entry\n"); | 
| 2541 | 2660 | ||
| 2542 | if (trace_define_common_fields()) | 2661 | if (trace_define_common_fields()) | 
| 2543 | pr_warn("tracing: Failed to allocate common fields"); | 2662 | pr_warn("tracing: Failed to allocate common fields"); | 
| diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
| @@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ | |||
| 177 | }, \ | 177 | }, \ | 
| 178 | .event.type = etype, \ | 178 | .event.type = etype, \ | 
| 179 | .print_fmt = print, \ | 179 | .print_fmt = print, \ | 
| 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ | 180 | .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ | 
| 181 | }; \ | 181 | }; \ | 
| 182 | struct ftrace_event_call __used \ | 182 | struct ftrace_event_call __used \ | 
| 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 183 | __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; | 
| diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -6,7 +6,6 @@ | |||
| 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 6 | * is Copyright (c) Steven Rostedt <srostedt@redhat.com> | 
| 7 | * | 7 | * | 
| 8 | */ | 8 | */ | 
| 9 | #include <linux/debugfs.h> | ||
| 10 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> | 
| 11 | #include <linux/ftrace.h> | 10 | #include <linux/ftrace.h> | 
| 12 | #include <linux/slab.h> | 11 | #include <linux/slab.h> | 
| @@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, | |||
| 151 | * The curr_ret_stack is initialized to -1 and get increased | 150 | * The curr_ret_stack is initialized to -1 and get increased | 
| 152 | * in this function. So it can be less than -1 only if it was | 151 | * in this function. So it can be less than -1 only if it was | 
| 153 | * filtered out via ftrace_graph_notrace_addr() which can be | 152 | * filtered out via ftrace_graph_notrace_addr() which can be | 
| 154 | * set from set_graph_notrace file in debugfs by user. | 153 | * set from set_graph_notrace file in tracefs by user. | 
| 155 | */ | 154 | */ | 
| 156 | if (current->curr_ret_stack < -1) | 155 | if (current->curr_ret_stack < -1) | 
| 157 | return -EBUSY; | 156 | return -EBUSY; | 
| @@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { | |||
| 1432 | .llseek = generic_file_llseek, | 1431 | .llseek = generic_file_llseek, | 
| 1433 | }; | 1432 | }; | 
| 1434 | 1433 | ||
| 1435 | static __init int init_graph_debugfs(void) | 1434 | static __init int init_graph_tracefs(void) | 
| 1436 | { | 1435 | { | 
| 1437 | struct dentry *d_tracer; | 1436 | struct dentry *d_tracer; | 
| 1438 | 1437 | ||
| @@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) | |||
| 1445 | 1444 | ||
| 1446 | return 0; | 1445 | return 0; | 
| 1447 | } | 1446 | } | 
| 1448 | fs_initcall(init_graph_debugfs); | 1447 | fs_initcall(init_graph_tracefs); | 
| 1449 | 1448 | ||
| 1450 | static __init int init_graph_trace(void) | 1449 | static __init int init_graph_trace(void) | 
| 1451 | { | 1450 | { | 
| diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..d0ce590f06e1 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) | |||
| 250 | #define fetch_file_offset_string_size NULL | 250 | #define fetch_file_offset_string_size NULL | 
| 251 | 251 | ||
| 252 | /* Fetch type information table */ | 252 | /* Fetch type information table */ | 
| 253 | const struct fetch_type kprobes_fetch_type_table[] = { | 253 | static const struct fetch_type kprobes_fetch_type_table[] = { | 
| 254 | /* Special types */ | 254 | /* Special types */ | 
| 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 255 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 
| 256 | sizeof(u32), 1, "__data_loc char[]"), | 256 | sizeof(u32), 1, "__data_loc char[]"), | 
| @@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) | |||
| 760 | 760 | ||
| 761 | /* Parse fetch argument */ | 761 | /* Parse fetch argument */ | 
| 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 762 | ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, | 
| 763 | is_return, true); | 763 | is_return, true, | 
| 764 | kprobes_fetch_type_table); | ||
| 764 | if (ret) { | 765 | if (ret) { | 
| 765 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 766 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 
| 766 | goto error; | 767 | goto error; | 
| @@ -1134,11 +1135,15 @@ static void | |||
| 1134 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 1135 | kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | 
| 1135 | { | 1136 | { | 
| 1136 | struct ftrace_event_call *call = &tk->tp.call; | 1137 | struct ftrace_event_call *call = &tk->tp.call; | 
| 1138 | struct bpf_prog *prog = call->prog; | ||
| 1137 | struct kprobe_trace_entry_head *entry; | 1139 | struct kprobe_trace_entry_head *entry; | 
| 1138 | struct hlist_head *head; | 1140 | struct hlist_head *head; | 
| 1139 | int size, __size, dsize; | 1141 | int size, __size, dsize; | 
| 1140 | int rctx; | 1142 | int rctx; | 
| 1141 | 1143 | ||
| 1144 | if (prog && !trace_call_bpf(prog, regs)) | ||
| 1145 | return; | ||
| 1146 | |||
| 1142 | head = this_cpu_ptr(call->perf_events); | 1147 | head = this_cpu_ptr(call->perf_events); | 
| 1143 | if (hlist_empty(head)) | 1148 | if (hlist_empty(head)) | 
| 1144 | return; | 1149 | return; | 
| @@ -1165,11 +1170,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
| 1165 | struct pt_regs *regs) | 1170 | struct pt_regs *regs) | 
| 1166 | { | 1171 | { | 
| 1167 | struct ftrace_event_call *call = &tk->tp.call; | 1172 | struct ftrace_event_call *call = &tk->tp.call; | 
| 1173 | struct bpf_prog *prog = call->prog; | ||
| 1168 | struct kretprobe_trace_entry_head *entry; | 1174 | struct kretprobe_trace_entry_head *entry; | 
| 1169 | struct hlist_head *head; | 1175 | struct hlist_head *head; | 
| 1170 | int size, __size, dsize; | 1176 | int size, __size, dsize; | 
| 1171 | int rctx; | 1177 | int rctx; | 
| 1172 | 1178 | ||
| 1179 | if (prog && !trace_call_bpf(prog, regs)) | ||
| 1180 | return; | ||
| 1181 | |||
| 1173 | head = this_cpu_ptr(call->perf_events); | 1182 | head = this_cpu_ptr(call->perf_events); | 
| 1174 | if (hlist_empty(head)) | 1183 | if (hlist_empty(head)) | 
| 1175 | return; | 1184 | return; | 
| @@ -1286,7 +1295,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) | |||
| 1286 | kfree(call->print_fmt); | 1295 | kfree(call->print_fmt); | 
| 1287 | return -ENODEV; | 1296 | return -ENODEV; | 
| 1288 | } | 1297 | } | 
| 1289 | call->flags = 0; | 1298 | call->flags = TRACE_EVENT_FL_KPROBE; | 
| 1290 | call->class->reg = kprobe_register; | 1299 | call->class->reg = kprobe_register; | 
| 1291 | call->data = tk; | 1300 | call->data = tk; | 
| 1292 | ret = trace_add_event_call(call); | 1301 | ret = trace_add_event_call(call); | 
| @@ -1310,7 +1319,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) | |||
| 1310 | return ret; | 1319 | return ret; | 
| 1311 | } | 1320 | } | 
| 1312 | 1321 | ||
| 1313 | /* Make a debugfs interface for controlling probe points */ | 1322 | /* Make a tracefs interface for controlling probe points */ | 
| 1314 | static __init int init_kprobe_trace(void) | 1323 | static __init int init_kprobe_trace(void) | 
| 1315 | { | 1324 | { | 
| 1316 | struct dentry *d_tracer; | 1325 | struct dentry *d_tracer; | 
| @@ -1323,20 +1332,20 @@ static __init int init_kprobe_trace(void) | |||
| 1323 | if (IS_ERR(d_tracer)) | 1332 | if (IS_ERR(d_tracer)) | 
| 1324 | return 0; | 1333 | return 0; | 
| 1325 | 1334 | ||
| 1326 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | 1335 | entry = tracefs_create_file("kprobe_events", 0644, d_tracer, | 
| 1327 | NULL, &kprobe_events_ops); | 1336 | NULL, &kprobe_events_ops); | 
| 1328 | 1337 | ||
| 1329 | /* Event list interface */ | 1338 | /* Event list interface */ | 
| 1330 | if (!entry) | 1339 | if (!entry) | 
| 1331 | pr_warning("Could not create debugfs " | 1340 | pr_warning("Could not create tracefs " | 
| 1332 | "'kprobe_events' entry\n"); | 1341 | "'kprobe_events' entry\n"); | 
| 1333 | 1342 | ||
| 1334 | /* Profile interface */ | 1343 | /* Profile interface */ | 
| 1335 | entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, | 1344 | entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, | 
| 1336 | NULL, &kprobe_profile_ops); | 1345 | NULL, &kprobe_profile_ops); | 
| 1337 | 1346 | ||
| 1338 | if (!entry) | 1347 | if (!entry) | 
| 1339 | pr_warning("Could not create debugfs " | 1348 | pr_warning("Could not create tracefs " | 
| 1340 | "'kprobe_profile' entry\n"); | 1349 | "'kprobe_profile' entry\n"); | 
| 1341 | return 0; | 1350 | return 0; | 
| 1342 | } | 1351 | } | 
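The trace_kprobe.c hunks above wire BPF into the kprobe perf path: if a program is attached to the event (call->prog) and trace_call_bpf() returns 0, the sample is dropped before any buffer space is reserved, and TRACE_EVENT_FL_KPROBE marks these events as valid attach targets. The shape of that gate, reduced to a non-compilable sketch; only call->prog and trace_call_bpf() come from the patch, the rest is elided:

/*
 * Reduced sketch: an attached BPF program runs first and can veto the
 * event before any perf buffer space is reserved.
 */
static void kprobe_perf_sketch(struct ftrace_event_call *call,
			       struct pt_regs *regs)
{
	struct bpf_prog *prog = call->prog;

	if (prog && !trace_call_bpf(prog, regs))
		return;		/* the program vetoed this sample */

	/* ... size the entry, reserve perf buffer space, fill and submit ... */
}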
| diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, | |||
| 356 | 356 | ||
| 357 | /* Recursive argument parser */ | 357 | /* Recursive argument parser */ | 
| 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 358 | static int parse_probe_arg(char *arg, const struct fetch_type *t, | 
| 359 | struct fetch_param *f, bool is_return, bool is_kprobe) | 359 | struct fetch_param *f, bool is_return, bool is_kprobe, | 
| 360 | const struct fetch_type *ftbl) | ||
| 360 | { | 361 | { | 
| 361 | const struct fetch_type *ftbl; | ||
| 362 | unsigned long param; | 362 | unsigned long param; | 
| 363 | long offset; | 363 | long offset; | 
| 364 | char *tmp; | 364 | char *tmp; | 
| 365 | int ret = 0; | 365 | int ret = 0; | 
| 366 | 366 | ||
| 367 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
| 368 | BUG_ON(ftbl == NULL); | ||
| 369 | |||
| 370 | switch (arg[0]) { | 367 | switch (arg[0]) { | 
| 371 | case '$': | 368 | case '$': | 
| 372 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 369 | ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); | 
| @@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, | |||
| 447 | dprm->fetch_size = get_fetch_size_function(t, | 444 | dprm->fetch_size = get_fetch_size_function(t, | 
| 448 | dprm->fetch, ftbl); | 445 | dprm->fetch, ftbl); | 
| 449 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 446 | ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, | 
| 450 | is_kprobe); | 447 | is_kprobe, ftbl); | 
| 451 | if (ret) | 448 | if (ret) | 
| 452 | kfree(dprm); | 449 | kfree(dprm); | 
| 453 | else { | 450 | else { | 
| @@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
| 505 | 502 | ||
| 506 | /* String length checking wrapper */ | 503 | /* String length checking wrapper */ | 
| 507 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 504 | int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 
| 508 | struct probe_arg *parg, bool is_return, bool is_kprobe) | 505 | struct probe_arg *parg, bool is_return, bool is_kprobe, | 
| 506 | const struct fetch_type *ftbl) | ||
| 509 | { | 507 | { | 
| 510 | const struct fetch_type *ftbl; | ||
| 511 | const char *t; | 508 | const char *t; | 
| 512 | int ret; | 509 | int ret; | 
| 513 | 510 | ||
| 514 | ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; | ||
| 515 | BUG_ON(ftbl == NULL); | ||
| 516 | |||
| 517 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 511 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 
| 518 | pr_info("Argument is too long.: %s\n", arg); | 512 | pr_info("Argument is too long.: %s\n", arg); | 
| 519 | return -ENOSPC; | 513 | return -ENOSPC; | 
| @@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | |||
| 535 | } | 529 | } | 
| 536 | parg->offset = *size; | 530 | parg->offset = *size; | 
| 537 | *size += parg->type->size; | 531 | *size += parg->type->size; | 
| 538 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); | 532 | ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, | 
| 533 | is_kprobe, ftbl); | ||
| 539 | 534 | ||
| 540 | if (ret >= 0 && t != NULL) | 535 | if (ret >= 0 && t != NULL) | 
| 541 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 536 | ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); | 
| diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..ab283e146b70 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/seq_file.h> | 25 | #include <linux/seq_file.h> | 
| 26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> | 
| 27 | #include <linux/smp.h> | 27 | #include <linux/smp.h> | 
| 28 | #include <linux/debugfs.h> | 28 | #include <linux/tracefs.h> | 
| 29 | #include <linux/types.h> | 29 | #include <linux/types.h> | 
| 30 | #include <linux/string.h> | 30 | #include <linux/string.h> | 
| 31 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> | 
| @@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ | |||
| 229 | #define FETCH_TYPE_STRING 0 | 229 | #define FETCH_TYPE_STRING 0 | 
| 230 | #define FETCH_TYPE_STRSIZE 1 | 230 | #define FETCH_TYPE_STRSIZE 1 | 
| 231 | 231 | ||
| 232 | /* | ||
| 233 | * Fetch type information table. | ||
| 234 | * It's declared as a weak symbol due to conditional compilation. | ||
| 235 | */ | ||
| 236 | extern __weak const struct fetch_type kprobes_fetch_type_table[]; | ||
| 237 | extern __weak const struct fetch_type uprobes_fetch_type_table[]; | ||
| 238 | |||
| 239 | #ifdef CONFIG_KPROBE_EVENT | 232 | #ifdef CONFIG_KPROBE_EVENT | 
| 240 | struct symbol_cache; | 233 | struct symbol_cache; | 
| 241 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 234 | unsigned long update_symbol_cache(struct symbol_cache *sc); | 
| @@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) | |||
| 333 | } | 326 | } | 
| 334 | 327 | ||
| 335 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 328 | extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, | 
| 336 | struct probe_arg *parg, bool is_return, bool is_kprobe); | 329 | struct probe_arg *parg, bool is_return, bool is_kprobe, | 
| 330 | const struct fetch_type *ftbl); | ||
| 337 | 331 | ||
| 338 | extern int traceprobe_conflict_field_name(const char *name, | 332 | extern int traceprobe_conflict_field_name(const char *name, | 
| 339 | struct probe_arg *args, int narg); | 333 | struct probe_arg *args, int narg); | 
| diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c3e4fcfddd45..3f34496244e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -327,11 +327,11 @@ static void t_stop(struct seq_file *m, void *p) | |||
| 327 | local_irq_enable(); | 327 | local_irq_enable(); | 
| 328 | } | 328 | } | 
| 329 | 329 | ||
| 330 | static int trace_lookup_stack(struct seq_file *m, long i) | 330 | static void trace_lookup_stack(struct seq_file *m, long i) | 
| 331 | { | 331 | { | 
| 332 | unsigned long addr = stack_dump_trace[i]; | 332 | unsigned long addr = stack_dump_trace[i]; | 
| 333 | 333 | ||
| 334 | return seq_printf(m, "%pS\n", (void *)addr); | 334 | seq_printf(m, "%pS\n", (void *)addr); | 
| 335 | } | 335 | } | 
| 336 | 336 | ||
| 337 | static void print_disabled(struct seq_file *m) | 337 | static void print_disabled(struct seq_file *m) | 
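The trace_stack.c change above follows seq_printf() becoming void: seq_file users no longer get overflow information from the return value, and the seq_file core retries ->show() with a larger buffer on overflow anyway. Callers that still want the information can query the seq_file itself; a hedged sketch with an invented show_symbol() helper:

#include <linux/seq_file.h>

/* Hedged sketch; show_symbol() is an invented helper, not from the patch. */
static int show_symbol(struct seq_file *m, unsigned long addr)
{
	seq_printf(m, "%pS\n", (void *)addr);	/* returns void now */

	/* Overflow state lives in the seq_file itself. */
	return seq_has_overflowed(m) ? -1 : 0;
}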
| diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c | |||
| @@ -12,7 +12,7 @@ | |||
| 12 | #include <linux/list.h> | 12 | #include <linux/list.h> | 
| 13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> | 
| 14 | #include <linux/rbtree.h> | 14 | #include <linux/rbtree.h> | 
| 15 | #include <linux/debugfs.h> | 15 | #include <linux/tracefs.h> | 
| 16 | #include "trace_stat.h" | 16 | #include "trace_stat.h" | 
| 17 | #include "trace.h" | 17 | #include "trace.h" | 
| 18 | 18 | ||
| @@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) | |||
| 65 | 65 | ||
| 66 | static void destroy_session(struct stat_session *session) | 66 | static void destroy_session(struct stat_session *session) | 
| 67 | { | 67 | { | 
| 68 | debugfs_remove(session->file); | 68 | tracefs_remove(session->file); | 
| 69 | __reset_stat_session(session); | 69 | __reset_stat_session(session); | 
| 70 | mutex_destroy(&session->stat_mutex); | 70 | mutex_destroy(&session->stat_mutex); | 
| 71 | kfree(session); | 71 | kfree(session); | 
| @@ -279,9 +279,9 @@ static int tracing_stat_init(void) | |||
| 279 | if (IS_ERR(d_tracing)) | 279 | if (IS_ERR(d_tracing)) | 
| 280 | return 0; | 280 | return 0; | 
| 281 | 281 | ||
| 282 | stat_dir = debugfs_create_dir("trace_stat", d_tracing); | 282 | stat_dir = tracefs_create_dir("trace_stat", d_tracing); | 
| 283 | if (!stat_dir) | 283 | if (!stat_dir) | 
| 284 | pr_warning("Could not create debugfs " | 284 | pr_warning("Could not create tracefs " | 
| 285 | "'trace_stat' entry\n"); | 285 | "'trace_stat' entry\n"); | 
| 286 | return 0; | 286 | return 0; | 
| 287 | } | 287 | } | 
| @@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) | |||
| 291 | if (!stat_dir && tracing_stat_init()) | 291 | if (!stat_dir && tracing_stat_init()) | 
| 292 | return -ENODEV; | 292 | return -ENODEV; | 
| 293 | 293 | ||
| 294 | session->file = debugfs_create_file(session->ts->name, 0644, | 294 | session->file = tracefs_create_file(session->ts->name, 0644, | 
| 295 | stat_dir, | 295 | stat_dir, | 
| 296 | session, &tracing_stat_fops); | 296 | session, &tracing_stat_fops); | 
| 297 | if (!session->file) | 297 | if (!session->file) | 
| diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..d60fe62ec4fa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) | |||
| 196 | DEFINE_FETCH_file_offset(string_size) | 196 | DEFINE_FETCH_file_offset(string_size) | 
| 197 | 197 | ||
| 198 | /* Fetch type information table */ | 198 | /* Fetch type information table */ | 
| 199 | const struct fetch_type uprobes_fetch_type_table[] = { | 199 | static const struct fetch_type uprobes_fetch_type_table[] = { | 
| 200 | /* Special types */ | 200 | /* Special types */ | 
| 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 201 | [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, | 
| 202 | sizeof(u32), 1, "__data_loc char[]"), | 202 | sizeof(u32), 1, "__data_loc char[]"), | 
| @@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) | |||
| 535 | 535 | ||
| 536 | /* Parse fetch argument */ | 536 | /* Parse fetch argument */ | 
| 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 537 | ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, | 
| 538 | is_return, false); | 538 | is_return, false, | 
| 539 | uprobes_fetch_type_table); | ||
| 539 | if (ret) { | 540 | if (ret) { | 
| 540 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 541 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | 
| 541 | goto error; | 542 | goto error; | 
| @@ -1005,7 +1006,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | |||
| 1005 | return true; | 1006 | return true; | 
| 1006 | 1007 | ||
| 1007 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { | 1008 | list_for_each_entry(event, &filter->perf_events, hw.tp_list) { | 
| 1008 | if (event->hw.tp_target->mm == mm) | 1009 | if (event->hw.target->mm == mm) | 
| 1009 | return true; | 1010 | return true; | 
| 1010 | } | 1011 | } | 
| 1011 | 1012 | ||
| @@ -1015,7 +1016,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) | |||
| 1015 | static inline bool | 1016 | static inline bool | 
| 1016 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | 1017 | uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) | 
| 1017 | { | 1018 | { | 
| 1018 | return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); | 1019 | return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); | 
| 1019 | } | 1020 | } | 
| 1020 | 1021 | ||
| 1021 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | 1022 | static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | 
| @@ -1023,10 +1024,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) | |||
| 1023 | bool done; | 1024 | bool done; | 
| 1024 | 1025 | ||
| 1025 | write_lock(&tu->filter.rwlock); | 1026 | write_lock(&tu->filter.rwlock); | 
| 1026 | if (event->hw.tp_target) { | 1027 | if (event->hw.target) { | 
| 1027 | list_del(&event->hw.tp_list); | 1028 | list_del(&event->hw.tp_list); | 
| 1028 | done = tu->filter.nr_systemwide || | 1029 | done = tu->filter.nr_systemwide || | 
| 1029 | (event->hw.tp_target->flags & PF_EXITING) || | 1030 | (event->hw.target->flags & PF_EXITING) || | 
| 1030 | uprobe_filter_event(tu, event); | 1031 | uprobe_filter_event(tu, event); | 
| 1031 | } else { | 1032 | } else { | 
| 1032 | tu->filter.nr_systemwide--; | 1033 | tu->filter.nr_systemwide--; | 
| @@ -1046,7 +1047,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) | |||
| 1046 | int err; | 1047 | int err; | 
| 1047 | 1048 | ||
| 1048 | write_lock(&tu->filter.rwlock); | 1049 | write_lock(&tu->filter.rwlock); | 
| 1049 | if (event->hw.tp_target) { | 1050 | if (event->hw.target) { | 
| 1050 | /* | 1051 | /* | 
| 1051 | * event->parent != NULL means copy_process(), we can avoid | 1052 | * event->parent != NULL means copy_process(), we can avoid | 
| 1052 | * uprobe_apply(). current->mm must be probed and we can rely | 1053 | * uprobe_apply(). current->mm must be probed and we can rely | 
| diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..2316f50b07a4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -24,8 +24,33 @@ | |||
| 24 | #include <linux/kvm_para.h> | 24 | #include <linux/kvm_para.h> | 
| 25 | #include <linux/perf_event.h> | 25 | #include <linux/perf_event.h> | 
| 26 | 26 | ||
| 27 | int watchdog_user_enabled = 1; | 27 | /* | 
| 28 | * The run state of the lockup detectors is controlled by the content of the | ||
| 29 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
| 30 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
| 31 | * | ||
| 32 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
| 33 | * are variables that are only used as an 'interface' between the parameters | ||
| 34 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
| 35 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
| 36 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
| 37 | * is equal to zero. | ||
| 38 | */ | ||
| 39 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
| 40 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
| 41 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
| 42 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
| 43 | |||
| 44 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 45 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | ||
| 46 | #else | ||
| 47 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | ||
| 48 | #endif | ||
| 49 | int __read_mostly nmi_watchdog_enabled; | ||
| 50 | int __read_mostly soft_watchdog_enabled; | ||
| 51 | int __read_mostly watchdog_user_enabled; | ||
| 28 | int __read_mostly watchdog_thresh = 10; | 52 | int __read_mostly watchdog_thresh = 10; | 
| 53 | |||
| 29 | #ifdef CONFIG_SMP | 54 | #ifdef CONFIG_SMP | 
| 30 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 55 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 
| 31 | #else | 56 | #else | 
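The comment block introduced above splits 'watchdog_enabled' into one bit per lockup detector. A compressed recap of how the boot parameters in the following hunks map onto those bits; the defines and the default are taken from the diff, the summary itself is illustrative:

/* Illustrative recap; defines and default mirror the diff. */
#define NMI_WATCHDOG_ENABLED	(1 << 0)	/* hard lockup detector */
#define SOFT_WATCHDOG_ENABLED	(1 << 1)	/* soft lockup detector */

/* Default with CONFIG_HARDLOCKUP_DETECTOR: both detectors on. */
static unsigned long watchdog_enabled = SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED;

/*
 * Boot parameters become plain bit operations on watchdog_enabled:
 *   nowatchdog       ->  watchdog_enabled  = 0;
 *   nosoftlockup     ->  watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
 *   nmi_watchdog=0   ->  watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
 *   nmi_watchdog=1   ->  watchdog_enabled |=  NMI_WATCHDOG_ENABLED;
 */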
| @@ -58,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn; | |||
| 58 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 83 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 
| 59 | static int hardlockup_panic = | 84 | static int hardlockup_panic = | 
| 60 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 85 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | 
| 61 | |||
| 62 | static bool hardlockup_detector_enabled = true; | ||
| 63 | /* | 86 | /* | 
| 64 | * We may not want to enable hard lockup detection by default in all cases, | 87 | * We may not want to enable hard lockup detection by default in all cases, | 
| 65 | * for example when running the kernel as a guest on a hypervisor. In these | 88 | * for example when running the kernel as a guest on a hypervisor. In these | 
| @@ -68,14 +91,9 @@ static bool hardlockup_detector_enabled = true; | |||
| 68 | * kernel command line parameters are parsed, because otherwise it is not | 91 | * kernel command line parameters are parsed, because otherwise it is not | 
| 69 | * possible to override this in hardlockup_panic_setup(). | 92 | * possible to override this in hardlockup_panic_setup(). | 
| 70 | */ | 93 | */ | 
| 71 | void watchdog_enable_hardlockup_detector(bool val) | 94 | void hardlockup_detector_disable(void) | 
| 72 | { | ||
| 73 | hardlockup_detector_enabled = val; | ||
| 74 | } | ||
| 75 | |||
| 76 | bool watchdog_hardlockup_detector_is_enabled(void) | ||
| 77 | { | 95 | { | 
| 78 | return hardlockup_detector_enabled; | 96 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | 
| 79 | } | 97 | } | 
| 80 | 98 | ||
| 81 | static int __init hardlockup_panic_setup(char *str) | 99 | static int __init hardlockup_panic_setup(char *str) | 
| @@ -85,15 +103,9 @@ static int __init hardlockup_panic_setup(char *str) | |||
| 85 | else if (!strncmp(str, "nopanic", 7)) | 103 | else if (!strncmp(str, "nopanic", 7)) | 
| 86 | hardlockup_panic = 0; | 104 | hardlockup_panic = 0; | 
| 87 | else if (!strncmp(str, "0", 1)) | 105 | else if (!strncmp(str, "0", 1)) | 
| 88 | watchdog_user_enabled = 0; | 106 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | 
| 89 | else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) { | 107 | else if (!strncmp(str, "1", 1)) | 
| 90 | /* | 108 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | 
| 91 | * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option) | ||
| 92 | * has the same effect. | ||
| 93 | */ | ||
| 94 | watchdog_user_enabled = 1; | ||
| 95 | watchdog_enable_hardlockup_detector(true); | ||
| 96 | } | ||
| 97 | return 1; | 109 | return 1; | 
| 98 | } | 110 | } | 
| 99 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 111 | __setup("nmi_watchdog=", hardlockup_panic_setup); | 
| @@ -112,19 +124,18 @@ __setup("softlockup_panic=", softlockup_panic_setup); | |||
| 112 | 124 | ||
| 113 | static int __init nowatchdog_setup(char *str) | 125 | static int __init nowatchdog_setup(char *str) | 
| 114 | { | 126 | { | 
| 115 | watchdog_user_enabled = 0; | 127 | watchdog_enabled = 0; | 
| 116 | return 1; | 128 | return 1; | 
| 117 | } | 129 | } | 
| 118 | __setup("nowatchdog", nowatchdog_setup); | 130 | __setup("nowatchdog", nowatchdog_setup); | 
| 119 | 131 | ||
| 120 | /* deprecated */ | ||
| 121 | static int __init nosoftlockup_setup(char *str) | 132 | static int __init nosoftlockup_setup(char *str) | 
| 122 | { | 133 | { | 
| 123 | watchdog_user_enabled = 0; | 134 | watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; | 
| 124 | return 1; | 135 | return 1; | 
| 125 | } | 136 | } | 
| 126 | __setup("nosoftlockup", nosoftlockup_setup); | 137 | __setup("nosoftlockup", nosoftlockup_setup); | 
| 127 | /* */ | 138 | |
| 128 | #ifdef CONFIG_SMP | 139 | #ifdef CONFIG_SMP | 
| 129 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 140 | static int __init softlockup_all_cpu_backtrace_setup(char *str) | 
| 130 | { | 141 | { | 
| @@ -239,10 +250,11 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 239 | { | 250 | { | 
| 240 | unsigned long now = get_timestamp(); | 251 | unsigned long now = get_timestamp(); | 
| 241 | 252 | ||
| 242 | /* Warn about unreasonable delays: */ | 253 | if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) { | 
| 243 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 254 | /* Warn about unreasonable delays. */ | 
| 244 | return now - touch_ts; | 255 | if (time_after(now, touch_ts + get_softlockup_thresh())) | 
| 245 | 256 | return now - touch_ts; | |
| 257 | } | ||
| 246 | return 0; | 258 | return 0; | 
| 247 | } | 259 | } | 
| 248 | 260 | ||
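The hunk above gates the soft-lockup check on the SOFT_WATCHDOG_ENABLED bit. The following stand-alone sketch shows the same decision in user space, assuming a timestamp in seconds and a fixed threshold; the kernel's wrap-safe time_after() comparison is simplified to plain arithmetic, and the bit value is a stand-in.

#include <stdio.h>

#define SOFT_WATCHDOG_ENABLED	(1UL << 1)	/* stand-in bit value */

static unsigned long watchdog_enabled = SOFT_WATCHDOG_ENABLED;
static unsigned long softlockup_thresh = 20;	/* roughly 2 * watchdog_thresh */

/* Return the stall duration when the soft watchdog is enabled and the
 * per-CPU touch timestamp is older than the threshold, else 0. */
static unsigned long is_softlockup(unsigned long now, unsigned long touch_ts)
{
	if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
		if (now > touch_ts + softlockup_thresh)
			return now - touch_ts;
	}
	return 0;
}

int main(void)
{
	printf("%lu\n", is_softlockup(100, 90));	/* 0: within threshold */
	printf("%lu\n", is_softlockup(125, 90));	/* 35: stalled too long */
	watchdog_enabled = 0;
	printf("%lu\n", is_softlockup(125, 90));	/* 0: detector disabled */
	return 0;
}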
| @@ -477,6 +489,21 @@ static void watchdog(unsigned int cpu) | |||
| 477 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 489 | __this_cpu_write(soft_lockup_hrtimer_cnt, | 
| 478 | __this_cpu_read(hrtimer_interrupts)); | 490 | __this_cpu_read(hrtimer_interrupts)); | 
| 479 | __touch_watchdog(); | 491 | __touch_watchdog(); | 
| 492 | |||
| 493 | /* | ||
| 494 | * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the | ||
| 495 | * failure path. Check for failures that can occur asynchronously - | ||
| 496 | * for example, when CPUs are on-lined - and shut down the hardware | ||
| 497 | * perf event on each CPU accordingly. | ||
| 498 | * | ||
| 499 | * The only non-obvious place this bit can be cleared is through | ||
| 500 | * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a | ||
| 501 | * pr_info here would be too noisy as it would result in a message | ||
| 502 | * every few seconds if the hardlockup was disabled but the softlockup | ||
| 503 | * enabled. | ||
| 504 | */ | ||
| 505 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
| 506 | watchdog_nmi_disable(cpu); | ||
| 480 | } | 507 | } | 
| 481 | 508 | ||
| 482 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 509 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 
| @@ -492,14 +519,9 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 492 | struct perf_event_attr *wd_attr; | 519 | struct perf_event_attr *wd_attr; | 
| 493 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 520 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 
| 494 | 521 | ||
| 495 | /* | 522 | /* nothing to do if the hard lockup detector is disabled */ | 
| 496 | * Some kernels need to default hard lockup detection to | 523 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | 
| 497 | * 'disabled', for example a guest on a hypervisor. | 524 | goto out; | 
| 498 | */ | ||
| 499 | if (!watchdog_hardlockup_detector_is_enabled()) { | ||
| 500 | event = ERR_PTR(-ENOENT); | ||
| 501 | goto handle_err; | ||
| 502 | } | ||
| 503 | 525 | ||
| 504 | /* is it already setup and enabled? */ | 526 | /* is it already setup and enabled? */ | 
| 505 | if (event && event->state > PERF_EVENT_STATE_OFF) | 527 | if (event && event->state > PERF_EVENT_STATE_OFF) | 
| @@ -515,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu) | |||
| 515 | /* Try to register using hardware perf events */ | 537 | /* Try to register using hardware perf events */ | 
| 516 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 538 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | 
| 517 | 539 | ||
| 518 | handle_err: | ||
| 519 | /* save cpu0 error for future comparison */ | 540 | /* save cpu0 error for future comparison */ | 
| 520 | if (cpu == 0 && IS_ERR(event)) | 541 | if (cpu == 0 && IS_ERR(event)) | 
| 521 | cpu0_err = PTR_ERR(event); | 542 | cpu0_err = PTR_ERR(event); | 
| @@ -527,6 +548,18 @@ handle_err: | |||
| 527 | goto out_save; | 548 | goto out_save; | 
| 528 | } | 549 | } | 
| 529 | 550 | ||
| 551 | /* | ||
| 552 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
| 553 | * the hardware perf event. The watchdog() function checks | ||
| 554 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
| 555 | * | ||
| 556 | * The barriers are for syncing up watchdog_enabled across all the | ||
| 557 | * cpus, as clear_bit() does not use barriers. | ||
| 558 | */ | ||
| 559 | smp_mb__before_atomic(); | ||
| 560 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
| 561 | smp_mb__after_atomic(); | ||
| 562 | |||
| 530 | /* skip displaying the same error again */ | 563 | /* skip displaying the same error again */ | 
| 531 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | 564 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | 
| 532 | return PTR_ERR(event); | 565 | return PTR_ERR(event); | 
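The clear_bit()/smp_mb__*_atomic() pairing above is kernel-internal and cannot be compiled outside the tree, but the visibility pattern it implements can be illustrated with C11 atomics: one context clears the NMI bit with full ordering while other CPUs test it on their next watchdog tick. This is only an analogue of the pattern, not the kernel primitives; the bit values are stand-ins.

#include <stdatomic.h>
#include <stdio.h>

#define NMI_WATCHDOG_ENABLED	(1UL << 0)	/* stand-in bit values, not */
#define SOFT_WATCHDOG_ENABLED	(1UL << 1)	/* the kernel's definitions  */

static atomic_ulong watchdog_enabled;

/* Analogue of the failure path above: atomically clear the NMI bit with
 * full ordering so concurrent readers observe the update promptly. */
static void hardlockup_detector_shutdown(void)
{
	atomic_fetch_and_explicit(&watchdog_enabled, ~NMI_WATCHDOG_ENABLED,
				  memory_order_seq_cst);
}

int main(void)
{
	atomic_store(&watchdog_enabled,
		     NMI_WATCHDOG_ENABLED | SOFT_WATCHDOG_ENABLED);
	hardlockup_detector_shutdown();

	if (!(atomic_load(&watchdog_enabled) & NMI_WATCHDOG_ENABLED))
		printf("NMI bit cleared, remaining flags: %#lx\n",
		       (unsigned long)atomic_load(&watchdog_enabled));
	return 0;
}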
| @@ -540,6 +573,9 @@ handle_err: | |||
| 540 | else | 573 | else | 
| 541 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 574 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | 
| 542 | cpu, PTR_ERR(event)); | 575 | cpu, PTR_ERR(event)); | 
| 576 | |||
| 577 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
| 578 | |||
| 543 | return PTR_ERR(event); | 579 | return PTR_ERR(event); | 
| 544 | 580 | ||
| 545 | /* success path */ | 581 | /* success path */ | 
| @@ -567,9 +603,37 @@ static void watchdog_nmi_disable(unsigned int cpu) | |||
| 567 | cpu0_err = 0; | 603 | cpu0_err = 0; | 
| 568 | } | 604 | } | 
| 569 | } | 605 | } | 
| 606 | |||
| 607 | void watchdog_nmi_enable_all(void) | ||
| 608 | { | ||
| 609 | int cpu; | ||
| 610 | |||
| 611 | if (!watchdog_user_enabled) | ||
| 612 | return; | ||
| 613 | |||
| 614 | get_online_cpus(); | ||
| 615 | for_each_online_cpu(cpu) | ||
| 616 | watchdog_nmi_enable(cpu); | ||
| 617 | put_online_cpus(); | ||
| 618 | } | ||
| 619 | |||
| 620 | void watchdog_nmi_disable_all(void) | ||
| 621 | { | ||
| 622 | int cpu; | ||
| 623 | |||
| 624 | if (!watchdog_running) | ||
| 625 | return; | ||
| 626 | |||
| 627 | get_online_cpus(); | ||
| 628 | for_each_online_cpu(cpu) | ||
| 629 | watchdog_nmi_disable(cpu); | ||
| 630 | put_online_cpus(); | ||
| 631 | } | ||
| 570 | #else | 632 | #else | 
| 571 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 633 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | 
| 572 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 634 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | 
| 635 | void watchdog_nmi_enable_all(void) {} | ||
| 636 | void watchdog_nmi_disable_all(void) {} | ||
| 573 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 637 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 
| 574 | 638 | ||
| 575 | static struct smp_hotplug_thread watchdog_threads = { | 639 | static struct smp_hotplug_thread watchdog_threads = { | 
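watchdog_nmi_enable_all()/watchdog_nmi_disable_all() above walk the online CPUs under get_online_cpus(). There is no hotplug lock in user space, but the same "visit every online CPU" idea can be sketched by parsing the range list the kernel exports in sysfs; the helper below is purely illustrative and assumes the standard /sys/devices/system/cpu/online file.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse the online-CPU range list (e.g. "0-3,6") and visit each CPU id,
 * roughly what for_each_online_cpu() does inside the functions above. */
static void for_each_online_cpu_userspace(void (*fn)(int cpu))
{
	char buf[256];
	FILE *f = fopen("/sys/devices/system/cpu/online", "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("online cpu list");
		if (f)
			fclose(f);
		return;
	}
	fclose(f);

	for (char *tok = strtok(buf, ",\n"); tok; tok = strtok(NULL, ",\n")) {
		int lo, hi;

		if (sscanf(tok, "%d-%d", &lo, &hi) != 2)
			hi = lo = atoi(tok);
		for (int cpu = lo; cpu <= hi; cpu++)
			fn(cpu);
	}
}

static void visit(int cpu)
{
	printf("would enable the NMI watchdog on cpu%d\n", cpu);
}

int main(void)
{
	for_each_online_cpu_userspace(visit);
	return 0;
}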
| @@ -600,7 +664,7 @@ static void restart_watchdog_hrtimer(void *info) | |||
| 600 | HRTIMER_MODE_REL_PINNED); | 664 | HRTIMER_MODE_REL_PINNED); | 
| 601 | } | 665 | } | 
| 602 | 666 | ||
| 603 | static void update_timers(int cpu) | 667 | static void update_watchdog(int cpu) | 
| 604 | { | 668 | { | 
| 605 | /* | 669 | /* | 
| 606 | * Make sure that perf event counter will adapt to a new | 670 | * Make sure that perf event counter will adapt to a new | 
| @@ -615,17 +679,17 @@ static void update_timers(int cpu) | |||
| 615 | watchdog_nmi_enable(cpu); | 679 | watchdog_nmi_enable(cpu); | 
| 616 | } | 680 | } | 
| 617 | 681 | ||
| 618 | static void update_timers_all_cpus(void) | 682 | static void update_watchdog_all_cpus(void) | 
| 619 | { | 683 | { | 
| 620 | int cpu; | 684 | int cpu; | 
| 621 | 685 | ||
| 622 | get_online_cpus(); | 686 | get_online_cpus(); | 
| 623 | for_each_online_cpu(cpu) | 687 | for_each_online_cpu(cpu) | 
| 624 | update_timers(cpu); | 688 | update_watchdog(cpu); | 
| 625 | put_online_cpus(); | 689 | put_online_cpus(); | 
| 626 | } | 690 | } | 
| 627 | 691 | ||
| 628 | static int watchdog_enable_all_cpus(bool sample_period_changed) | 692 | static int watchdog_enable_all_cpus(void) | 
| 629 | { | 693 | { | 
| 630 | int err = 0; | 694 | int err = 0; | 
| 631 | 695 | ||
| @@ -635,8 +699,12 @@ static int watchdog_enable_all_cpus(bool sample_period_changed) | |||
| 635 | pr_err("Failed to create watchdog threads, disabled\n"); | 699 | pr_err("Failed to create watchdog threads, disabled\n"); | 
| 636 | else | 700 | else | 
| 637 | watchdog_running = 1; | 701 | watchdog_running = 1; | 
| 638 | } else if (sample_period_changed) { | 702 | } else { | 
| 639 | update_timers_all_cpus(); | 703 | /* | 
| 704 | * Enable/disable the lockup detectors or | ||
| 705 | * change the sample period 'on the fly'. | ||
| 706 | */ | ||
| 707 | update_watchdog_all_cpus(); | ||
| 640 | } | 708 | } | 
| 641 | 709 | ||
| 642 | return err; | 710 | return err; | 
| @@ -654,58 +722,159 @@ static void watchdog_disable_all_cpus(void) | |||
| 654 | } | 722 | } | 
| 655 | 723 | ||
| 656 | /* | 724 | /* | 
| 657 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 725 | * Update the run state of the lockup detectors. | 
| 658 | */ | 726 | */ | 
| 727 | static int proc_watchdog_update(void) | ||
| 728 | { | ||
| 729 | int err = 0; | ||
| 659 | 730 | ||
| 660 | int proc_dowatchdog(struct ctl_table *table, int write, | 731 | /* | 
| 661 | void __user *buffer, size_t *lenp, loff_t *ppos) | 732 | * Watchdog threads won't be started if they are already active. | 
| 733 | * The 'watchdog_running' variable in watchdog_*_all_cpus() takes | ||
| 734 | * care of this. If those threads are already active, the sample | ||
| 735 | * period will be updated and the lockup detectors will be enabled | ||
| 736 | * or disabled 'on the fly'. | ||
| 737 | */ | ||
| 738 | if (watchdog_enabled && watchdog_thresh) | ||
| 739 | err = watchdog_enable_all_cpus(); | ||
| 740 | else | ||
| 741 | watchdog_disable_all_cpus(); | ||
| 742 | |||
| 743 | return err; | ||
| 744 | |||
| 745 | } | ||
| 746 | |||
| 747 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 748 | |||
| 749 | /* | ||
| 750 | * common function for watchdog, nmi_watchdog and soft_watchdog parameter | ||
| 751 | * | ||
| 752 | * caller | table->data points to | 'which' contains the flag(s) | ||
| 753 | * -------------------|-----------------------|----------------------------- | ||
| 754 | * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed | ||
| 755 | * | | with SOFT_WATCHDOG_ENABLED | ||
| 756 | * -------------------|-----------------------|----------------------------- | ||
| 757 | * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED | ||
| 758 | * -------------------|-----------------------|----------------------------- | ||
| 759 | * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED | ||
| 760 | */ | ||
| 761 | static int proc_watchdog_common(int which, struct ctl_table *table, int write, | ||
| 762 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 662 | { | 763 | { | 
| 663 | int err, old_thresh, old_enabled; | 764 | int err, old, new; | 
| 664 | bool old_hardlockup; | 765 | int *watchdog_param = (int *)table->data; | 
| 665 | static DEFINE_MUTEX(watchdog_proc_mutex); | ||
| 666 | 766 | ||
| 667 | mutex_lock(&watchdog_proc_mutex); | 767 | mutex_lock(&watchdog_proc_mutex); | 
| 668 | old_thresh = ACCESS_ONCE(watchdog_thresh); | ||
| 669 | old_enabled = ACCESS_ONCE(watchdog_user_enabled); | ||
| 670 | old_hardlockup = watchdog_hardlockup_detector_is_enabled(); | ||
| 671 | 768 | ||
| 672 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 673 | if (err || !write) | ||
| 674 | goto out; | ||
| 675 | |||
| 676 | set_sample_period(); | ||
| 677 | /* | 769 | /* | 
| 678 | * Watchdog threads shouldn't be enabled if they are | 770 | * If the parameter is being read return the state of the corresponding | 
| 679 | * disabled. The 'watchdog_running' variable check in | 771 | * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the | 
| 680 | * watchdog_*_all_cpus() function takes care of this. | 772 | * run state of the lockup detectors. | 
| 681 | */ | 773 | */ | 
| 682 | if (watchdog_user_enabled && watchdog_thresh) { | 774 | if (!write) { | 
| 775 | *watchdog_param = (watchdog_enabled & which) != 0; | ||
| 776 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 777 | } else { | ||
| 778 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 779 | if (err) | ||
| 780 | goto out; | ||
| 781 | |||
| 683 | /* | 782 | /* | 
| 684 | * Prevent a change in watchdog_thresh accidentally overriding | 783 | * There is a race window between fetching the current value | 
| 685 | * the enablement of the hardlockup detector. | 784 | * from 'watchdog_enabled' and storing the new value. During | 
| 785 | * this race window, watchdog_nmi_enable() can sneak in and | ||
| 786 | * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. | ||
| 787 | * The 'cmpxchg' detects this race and the loop retries. | ||
| 686 | */ | 788 | */ | 
| 687 | if (watchdog_user_enabled != old_enabled) | 789 | do { | 
| 688 | watchdog_enable_hardlockup_detector(true); | 790 | old = watchdog_enabled; | 
| 689 | err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); | 791 | /* | 
| 690 | } else | 792 | * If the parameter value is not zero set the | 
| 691 | watchdog_disable_all_cpus(); | 793 | * corresponding bit(s), else clear it(them). | 
| 794 | */ | ||
| 795 | if (*watchdog_param) | ||
| 796 | new = old | which; | ||
| 797 | else | ||
| 798 | new = old & ~which; | ||
| 799 | } while (cmpxchg(&watchdog_enabled, old, new) != old); | ||
| 692 | 800 | ||
| 693 | /* Restore old values on failure */ | 801 | /* | 
| 694 | if (err) { | 802 | * Update the run state of the lockup detectors. | 
| 695 | watchdog_thresh = old_thresh; | 803 | * Restore 'watchdog_enabled' on failure. | 
| 696 | watchdog_user_enabled = old_enabled; | 804 | */ | 
| 697 | watchdog_enable_hardlockup_detector(old_hardlockup); | 805 | err = proc_watchdog_update(); | 
| 806 | if (err) | ||
| 807 | watchdog_enabled = old; | ||
| 698 | } | 808 | } | 
| 699 | out: | 809 | out: | 
| 700 | mutex_unlock(&watchdog_proc_mutex); | 810 | mutex_unlock(&watchdog_proc_mutex); | 
| 701 | return err; | 811 | return err; | 
| 702 | } | 812 | } | 
| 813 | |||
| 814 | /* | ||
| 815 | * /proc/sys/kernel/watchdog | ||
| 816 | */ | ||
| 817 | int proc_watchdog(struct ctl_table *table, int write, | ||
| 818 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 819 | { | ||
| 820 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, | ||
| 821 | table, write, buffer, lenp, ppos); | ||
| 822 | } | ||
| 823 | |||
| 824 | /* | ||
| 825 | * /proc/sys/kernel/nmi_watchdog | ||
| 826 | */ | ||
| 827 | int proc_nmi_watchdog(struct ctl_table *table, int write, | ||
| 828 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 829 | { | ||
| 830 | return proc_watchdog_common(NMI_WATCHDOG_ENABLED, | ||
| 831 | table, write, buffer, lenp, ppos); | ||
| 832 | } | ||
| 833 | |||
| 834 | /* | ||
| 835 | * /proc/sys/kernel/soft_watchdog | ||
| 836 | */ | ||
| 837 | int proc_soft_watchdog(struct ctl_table *table, int write, | ||
| 838 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 839 | { | ||
| 840 | return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, | ||
| 841 | table, write, buffer, lenp, ppos); | ||
| 842 | } | ||
| 843 | |||
| 844 | /* | ||
| 845 | * /proc/sys/kernel/watchdog_thresh | ||
| 846 | */ | ||
| 847 | int proc_watchdog_thresh(struct ctl_table *table, int write, | ||
| 848 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 849 | { | ||
| 850 | int err, old; | ||
| 851 | |||
| 852 | mutex_lock(&watchdog_proc_mutex); | ||
| 853 | |||
| 854 | old = ACCESS_ONCE(watchdog_thresh); | ||
| 855 | err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | ||
| 856 | |||
| 857 | if (err || !write) | ||
| 858 | goto out; | ||
| 859 | |||
| 860 | /* | ||
| 861 | * Update the sample period. | ||
| 862 | * Restore 'watchdog_thresh' on failure. | ||
| 863 | */ | ||
| 864 | set_sample_period(); | ||
| 865 | err = proc_watchdog_update(); | ||
| 866 | if (err) | ||
| 867 | watchdog_thresh = old; | ||
| 868 | out: | ||
| 869 | mutex_unlock(&watchdog_proc_mutex); | ||
| 870 | return err; | ||
| 871 | } | ||
| 703 | #endif /* CONFIG_SYSCTL */ | 872 | #endif /* CONFIG_SYSCTL */ | 
| 704 | 873 | ||
| 705 | void __init lockup_detector_init(void) | 874 | void __init lockup_detector_init(void) | 
| 706 | { | 875 | { | 
| 707 | set_sample_period(); | 876 | set_sample_period(); | 
| 708 | 877 | ||
| 709 | if (watchdog_user_enabled) | 878 | if (watchdog_enabled) | 
| 710 | watchdog_enable_all_cpus(false); | 879 | watchdog_enable_all_cpus(); | 
| 711 | } | 880 | } | 
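The net effect of the proc handlers added above is a set of separate knobs under /proc/sys/kernel, with the paths documented in the handler comments. A small user-space probe can read them back (writes require root, and a file only exists when the corresponding detector is configured into the kernel):

#include <stdio.h>

/* Read the lockup-detector sysctls handled by proc_watchdog(),
 * proc_nmi_watchdog(), proc_soft_watchdog() and proc_watchdog_thresh().
 * Availability depends on the kernel configuration, e.g. nmi_watchdog
 * needs CONFIG_HARDLOCKUP_DETECTOR. */
int main(void)
{
	const char *knobs[] = {
		"/proc/sys/kernel/watchdog",
		"/proc/sys/kernel/nmi_watchdog",
		"/proc/sys/kernel/soft_watchdog",
		"/proc/sys/kernel/watchdog_thresh",
	};

	for (unsigned i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++) {
		FILE *f = fopen(knobs[i], "r");
		int val;

		if (!f) {
			printf("%s: not available\n", knobs[i]);
			continue;
		}
		if (fscanf(f, "%d", &val) == 1)
			printf("%s = %d\n", knobs[i], val);
		fclose(f);
	}
	return 0;
}

Writing 0 or 1 to these files goes through proc_watchdog_common(), which sets or clears the corresponding bit(s) in watchdog_enabled and then updates the run state via proc_watchdog_update().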
| diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c6..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -159,6 +159,7 @@ struct worker_pool { | |||
| 159 | 159 | ||
| 160 | /* see manage_workers() for details on the two manager mutexes */ | 160 | /* see manage_workers() for details on the two manager mutexes */ | 
| 161 | struct mutex manager_arb; /* manager arbitration */ | 161 | struct mutex manager_arb; /* manager arbitration */ | 
| 162 | struct worker *manager; /* L: purely informational */ | ||
| 162 | struct mutex attach_mutex; /* attach/detach exclusion */ | 163 | struct mutex attach_mutex; /* attach/detach exclusion */ | 
| 163 | struct list_head workers; /* A: attached workers */ | 164 | struct list_head workers; /* A: attached workers */ | 
| 164 | struct completion *detach_completion; /* all workers detached */ | 165 | struct completion *detach_completion; /* all workers detached */ | 
| @@ -230,7 +231,7 @@ struct wq_device; | |||
| 230 | */ | 231 | */ | 
| 231 | struct workqueue_struct { | 232 | struct workqueue_struct { | 
| 232 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 233 | struct list_head pwqs; /* WR: all pwqs of this wq */ | 
| 233 | struct list_head list; /* PL: list of all workqueues */ | 234 | struct list_head list; /* PR: list of all workqueues */ | 
| 234 | 235 | ||
| 235 | struct mutex mutex; /* protects this wq */ | 236 | struct mutex mutex; /* protects this wq */ | 
| 236 | int work_color; /* WQ: current work color */ | 237 | int work_color; /* WQ: current work color */ | 
| @@ -257,6 +258,13 @@ struct workqueue_struct { | |||
| 257 | #endif | 258 | #endif | 
| 258 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 259 | char name[WQ_NAME_LEN]; /* I: workqueue name */ | 
| 259 | 260 | ||
| 261 | /* | ||
| 262 | * Destruction of workqueue_struct is sched-RCU protected to allow | ||
| 263 | * walking the workqueues list without grabbing wq_pool_mutex. | ||
| 264 | * This is used to dump all workqueues from sysrq. | ||
| 265 | */ | ||
| 266 | struct rcu_head rcu; | ||
| 267 | |||
| 260 | /* hot fields used during command issue, aligned to cacheline */ | 268 | /* hot fields used during command issue, aligned to cacheline */ | 
| 261 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 269 | unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ | 
| 262 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 270 | struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ | 
| @@ -288,7 +296,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; | |||
| 288 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 296 | static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ | 
| 289 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 297 | static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ | 
| 290 | 298 | ||
| 291 | static LIST_HEAD(workqueues); /* PL: list of all workqueues */ | 299 | static LIST_HEAD(workqueues); /* PR: list of all workqueues */ | 
| 292 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 300 | static bool workqueue_freezing; /* PL: have wqs started freezing? */ | 
| 293 | 301 | ||
| 294 | /* the per-cpu worker pools */ | 302 | /* the per-cpu worker pools */ | 
| @@ -324,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | |||
| 324 | static int worker_thread(void *__worker); | 332 | static int worker_thread(void *__worker); | 
| 325 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 333 | static void copy_workqueue_attrs(struct workqueue_attrs *to, | 
| 326 | const struct workqueue_attrs *from); | 334 | const struct workqueue_attrs *from); | 
| 335 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | ||
| 327 | 336 | ||
| 328 | #define CREATE_TRACE_POINTS | 337 | #define CREATE_TRACE_POINTS | 
| 329 | #include <trace/events/workqueue.h> | 338 | #include <trace/events/workqueue.h> | 
| @@ -1911,9 +1920,11 @@ static bool manage_workers(struct worker *worker) | |||
| 1911 | */ | 1920 | */ | 
| 1912 | if (!mutex_trylock(&pool->manager_arb)) | 1921 | if (!mutex_trylock(&pool->manager_arb)) | 
| 1913 | return false; | 1922 | return false; | 
| 1923 | pool->manager = worker; | ||
| 1914 | 1924 | ||
| 1915 | maybe_create_worker(pool); | 1925 | maybe_create_worker(pool); | 
| 1916 | 1926 | ||
| 1927 | pool->manager = NULL; | ||
| 1917 | mutex_unlock(&pool->manager_arb); | 1928 | mutex_unlock(&pool->manager_arb); | 
| 1918 | return true; | 1929 | return true; | 
| 1919 | } | 1930 | } | 
| @@ -2303,6 +2314,7 @@ repeat: | |||
| 2303 | struct wq_barrier { | 2314 | struct wq_barrier { | 
| 2304 | struct work_struct work; | 2315 | struct work_struct work; | 
| 2305 | struct completion done; | 2316 | struct completion done; | 
| 2317 | struct task_struct *task; /* purely informational */ | ||
| 2306 | }; | 2318 | }; | 
| 2307 | 2319 | ||
| 2308 | static void wq_barrier_func(struct work_struct *work) | 2320 | static void wq_barrier_func(struct work_struct *work) | 
| @@ -2351,6 +2363,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, | |||
| 2351 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 2363 | INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); | 
| 2352 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 2364 | __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); | 
| 2353 | init_completion(&barr->done); | 2365 | init_completion(&barr->done); | 
| 2366 | barr->task = current; | ||
| 2354 | 2367 | ||
| 2355 | /* | 2368 | /* | 
| 2356 | * If @target is currently being executed, schedule the | 2369 | * If @target is currently being executed, schedule the | 
| @@ -2989,323 +3002,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) | |||
| 2989 | } | 3002 | } | 
| 2990 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 3003 | EXPORT_SYMBOL_GPL(execute_in_process_context); | 
| 2991 | 3004 | ||
| 2992 | #ifdef CONFIG_SYSFS | ||
| 2993 | /* | ||
| 2994 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
| 2995 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
| 2996 | * following attributes. | ||
| 2997 | * | ||
| 2998 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
| 2999 | * max_active RW int : maximum number of in-flight work items | ||
| 3000 | * | ||
| 3001 | * Unbound workqueues have the following extra attributes. | ||
| 3002 | * | ||
| 3003 | * id RO int : the associated pool ID | ||
| 3004 | * nice RW int : nice value of the workers | ||
| 3005 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
| 3006 | */ | ||
| 3007 | struct wq_device { | ||
| 3008 | struct workqueue_struct *wq; | ||
| 3009 | struct device dev; | ||
| 3010 | }; | ||
| 3011 | |||
| 3012 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
| 3013 | { | ||
| 3014 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 3015 | |||
| 3016 | return wq_dev->wq; | ||
| 3017 | } | ||
| 3018 | |||
| 3019 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
| 3020 | char *buf) | ||
| 3021 | { | ||
| 3022 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3023 | |||
| 3024 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
| 3025 | } | ||
| 3026 | static DEVICE_ATTR_RO(per_cpu); | ||
| 3027 | |||
| 3028 | static ssize_t max_active_show(struct device *dev, | ||
| 3029 | struct device_attribute *attr, char *buf) | ||
| 3030 | { | ||
| 3031 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3032 | |||
| 3033 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
| 3034 | } | ||
| 3035 | |||
| 3036 | static ssize_t max_active_store(struct device *dev, | ||
| 3037 | struct device_attribute *attr, const char *buf, | ||
| 3038 | size_t count) | ||
| 3039 | { | ||
| 3040 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3041 | int val; | ||
| 3042 | |||
| 3043 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
| 3044 | return -EINVAL; | ||
| 3045 | |||
| 3046 | workqueue_set_max_active(wq, val); | ||
| 3047 | return count; | ||
| 3048 | } | ||
| 3049 | static DEVICE_ATTR_RW(max_active); | ||
| 3050 | |||
| 3051 | static struct attribute *wq_sysfs_attrs[] = { | ||
| 3052 | &dev_attr_per_cpu.attr, | ||
| 3053 | &dev_attr_max_active.attr, | ||
| 3054 | NULL, | ||
| 3055 | }; | ||
| 3056 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
| 3057 | |||
| 3058 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
| 3059 | struct device_attribute *attr, char *buf) | ||
| 3060 | { | ||
| 3061 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3062 | const char *delim = ""; | ||
| 3063 | int node, written = 0; | ||
| 3064 | |||
| 3065 | rcu_read_lock_sched(); | ||
| 3066 | for_each_node(node) { | ||
| 3067 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
| 3068 | "%s%d:%d", delim, node, | ||
| 3069 | unbound_pwq_by_node(wq, node)->pool->id); | ||
| 3070 | delim = " "; | ||
| 3071 | } | ||
| 3072 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
| 3073 | rcu_read_unlock_sched(); | ||
| 3074 | |||
| 3075 | return written; | ||
| 3076 | } | ||
| 3077 | |||
| 3078 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
| 3079 | char *buf) | ||
| 3080 | { | ||
| 3081 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3082 | int written; | ||
| 3083 | |||
| 3084 | mutex_lock(&wq->mutex); | ||
| 3085 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
| 3086 | mutex_unlock(&wq->mutex); | ||
| 3087 | |||
| 3088 | return written; | ||
| 3089 | } | ||
| 3090 | |||
| 3091 | /* prepare workqueue_attrs for sysfs store operations */ | ||
| 3092 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
| 3093 | { | ||
| 3094 | struct workqueue_attrs *attrs; | ||
| 3095 | |||
| 3096 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
| 3097 | if (!attrs) | ||
| 3098 | return NULL; | ||
| 3099 | |||
| 3100 | mutex_lock(&wq->mutex); | ||
| 3101 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
| 3102 | mutex_unlock(&wq->mutex); | ||
| 3103 | return attrs; | ||
| 3104 | } | ||
| 3105 | |||
| 3106 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
| 3107 | const char *buf, size_t count) | ||
| 3108 | { | ||
| 3109 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3110 | struct workqueue_attrs *attrs; | ||
| 3111 | int ret; | ||
| 3112 | |||
| 3113 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 3114 | if (!attrs) | ||
| 3115 | return -ENOMEM; | ||
| 3116 | |||
| 3117 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
| 3118 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
| 3119 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 3120 | else | ||
| 3121 | ret = -EINVAL; | ||
| 3122 | |||
| 3123 | free_workqueue_attrs(attrs); | ||
| 3124 | return ret ?: count; | ||
| 3125 | } | ||
| 3126 | |||
| 3127 | static ssize_t wq_cpumask_show(struct device *dev, | ||
| 3128 | struct device_attribute *attr, char *buf) | ||
| 3129 | { | ||
| 3130 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3131 | int written; | ||
| 3132 | |||
| 3133 | mutex_lock(&wq->mutex); | ||
| 3134 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
| 3135 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
| 3136 | mutex_unlock(&wq->mutex); | ||
| 3137 | return written; | ||
| 3138 | } | ||
| 3139 | |||
| 3140 | static ssize_t wq_cpumask_store(struct device *dev, | ||
| 3141 | struct device_attribute *attr, | ||
| 3142 | const char *buf, size_t count) | ||
| 3143 | { | ||
| 3144 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3145 | struct workqueue_attrs *attrs; | ||
| 3146 | int ret; | ||
| 3147 | |||
| 3148 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 3149 | if (!attrs) | ||
| 3150 | return -ENOMEM; | ||
| 3151 | |||
| 3152 | ret = cpumask_parse(buf, attrs->cpumask); | ||
| 3153 | if (!ret) | ||
| 3154 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 3155 | |||
| 3156 | free_workqueue_attrs(attrs); | ||
| 3157 | return ret ?: count; | ||
| 3158 | } | ||
| 3159 | |||
| 3160 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
| 3161 | char *buf) | ||
| 3162 | { | ||
| 3163 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3164 | int written; | ||
| 3165 | |||
| 3166 | mutex_lock(&wq->mutex); | ||
| 3167 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
| 3168 | !wq->unbound_attrs->no_numa); | ||
| 3169 | mutex_unlock(&wq->mutex); | ||
| 3170 | |||
| 3171 | return written; | ||
| 3172 | } | ||
| 3173 | |||
| 3174 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
| 3175 | const char *buf, size_t count) | ||
| 3176 | { | ||
| 3177 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 3178 | struct workqueue_attrs *attrs; | ||
| 3179 | int v, ret; | ||
| 3180 | |||
| 3181 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 3182 | if (!attrs) | ||
| 3183 | return -ENOMEM; | ||
| 3184 | |||
| 3185 | ret = -EINVAL; | ||
| 3186 | if (sscanf(buf, "%d", &v) == 1) { | ||
| 3187 | attrs->no_numa = !v; | ||
| 3188 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 3189 | } | ||
| 3190 | |||
| 3191 | free_workqueue_attrs(attrs); | ||
| 3192 | return ret ?: count; | ||
| 3193 | } | ||
| 3194 | |||
| 3195 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
| 3196 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
| 3197 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
| 3198 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
| 3199 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
| 3200 | __ATTR_NULL, | ||
| 3201 | }; | ||
| 3202 | |||
| 3203 | static struct bus_type wq_subsys = { | ||
| 3204 | .name = "workqueue", | ||
| 3205 | .dev_groups = wq_sysfs_groups, | ||
| 3206 | }; | ||
| 3207 | |||
| 3208 | static int __init wq_sysfs_init(void) | ||
| 3209 | { | ||
| 3210 | return subsys_virtual_register(&wq_subsys, NULL); | ||
| 3211 | } | ||
| 3212 | core_initcall(wq_sysfs_init); | ||
| 3213 | |||
| 3214 | static void wq_device_release(struct device *dev) | ||
| 3215 | { | ||
| 3216 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 3217 | |||
| 3218 | kfree(wq_dev); | ||
| 3219 | } | ||
| 3220 | |||
| 3221 | /** | ||
| 3222 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
| 3223 | * @wq: the workqueue to register | ||
| 3224 | * | ||
| 3225 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
| 3226 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
| 3227 | * which is the preferred method. | ||
| 3228 | * | ||
| 3229 | * Workqueue user should use this function directly iff it wants to apply | ||
| 3230 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
| 3231 | * apply_workqueue_attrs() may race against userland updating the | ||
| 3232 | * attributes. | ||
| 3233 | * | ||
| 3234 | * Return: 0 on success, -errno on failure. | ||
| 3235 | */ | ||
| 3236 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
| 3237 | { | ||
| 3238 | struct wq_device *wq_dev; | ||
| 3239 | int ret; | ||
| 3240 | |||
| 3241 | /* | ||
| 3242 | * Adjusting max_active or creating new pwqs by applyting | ||
| 3243 | * attributes breaks ordering guarantee. Disallow exposing ordered | ||
| 3244 | * workqueues. | ||
| 3245 | */ | ||
| 3246 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
| 3247 | return -EINVAL; | ||
| 3248 | |||
| 3249 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
| 3250 | if (!wq_dev) | ||
| 3251 | return -ENOMEM; | ||
| 3252 | |||
| 3253 | wq_dev->wq = wq; | ||
| 3254 | wq_dev->dev.bus = &wq_subsys; | ||
| 3255 | wq_dev->dev.init_name = wq->name; | ||
| 3256 | wq_dev->dev.release = wq_device_release; | ||
| 3257 | |||
| 3258 | /* | ||
| 3259 | * unbound_attrs are created separately. Suppress uevent until | ||
| 3260 | * everything is ready. | ||
| 3261 | */ | ||
| 3262 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
| 3263 | |||
| 3264 | ret = device_register(&wq_dev->dev); | ||
| 3265 | if (ret) { | ||
| 3266 | kfree(wq_dev); | ||
| 3267 | wq->wq_dev = NULL; | ||
| 3268 | return ret; | ||
| 3269 | } | ||
| 3270 | |||
| 3271 | if (wq->flags & WQ_UNBOUND) { | ||
| 3272 | struct device_attribute *attr; | ||
| 3273 | |||
| 3274 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
| 3275 | ret = device_create_file(&wq_dev->dev, attr); | ||
| 3276 | if (ret) { | ||
| 3277 | device_unregister(&wq_dev->dev); | ||
| 3278 | wq->wq_dev = NULL; | ||
| 3279 | return ret; | ||
| 3280 | } | ||
| 3281 | } | ||
| 3282 | } | ||
| 3283 | |||
| 3284 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
| 3285 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
| 3286 | return 0; | ||
| 3287 | } | ||
| 3288 | |||
| 3289 | /** | ||
| 3290 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
| 3291 | * @wq: the workqueue to unregister | ||
| 3292 | * | ||
| 3293 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
| 3294 | */ | ||
| 3295 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
| 3296 | { | ||
| 3297 | struct wq_device *wq_dev = wq->wq_dev; | ||
| 3298 | |||
| 3299 | if (!wq->wq_dev) | ||
| 3300 | return; | ||
| 3301 | |||
| 3302 | wq->wq_dev = NULL; | ||
| 3303 | device_unregister(&wq_dev->dev); | ||
| 3304 | } | ||
| 3305 | #else /* CONFIG_SYSFS */ | ||
| 3306 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
| 3307 | #endif /* CONFIG_SYSFS */ | ||
| 3308 | |||
| 3309 | /** | 3005 | /** | 
| 3310 | * free_workqueue_attrs - free a workqueue_attrs | 3006 | * free_workqueue_attrs - free a workqueue_attrs | 
| 3311 | * @attrs: workqueue_attrs to free | 3007 | * @attrs: workqueue_attrs to free | 
| @@ -3424,6 +3120,20 @@ static int init_worker_pool(struct worker_pool *pool) | |||
| 3424 | return 0; | 3120 | return 0; | 
| 3425 | } | 3121 | } | 
| 3426 | 3122 | ||
| 3123 | static void rcu_free_wq(struct rcu_head *rcu) | ||
| 3124 | { | ||
| 3125 | struct workqueue_struct *wq = | ||
| 3126 | container_of(rcu, struct workqueue_struct, rcu); | ||
| 3127 | |||
| 3128 | if (!(wq->flags & WQ_UNBOUND)) | ||
| 3129 | free_percpu(wq->cpu_pwqs); | ||
| 3130 | else | ||
| 3131 | free_workqueue_attrs(wq->unbound_attrs); | ||
| 3132 | |||
| 3133 | kfree(wq->rescuer); | ||
| 3134 | kfree(wq); | ||
| 3135 | } | ||
| 3136 | |||
| 3427 | static void rcu_free_pool(struct rcu_head *rcu) | 3137 | static void rcu_free_pool(struct rcu_head *rcu) | 
| 3428 | { | 3138 | { | 
| 3429 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 3139 | struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); | 
| @@ -3601,12 +3311,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) | |||
| 3601 | 3311 | ||
| 3602 | /* | 3312 | /* | 
| 3603 | * If we're the last pwq going away, @wq is already dead and no one | 3313 | * If we're the last pwq going away, @wq is already dead and no one | 
| 3604 | * is gonna access it anymore. Free it. | 3314 | * is gonna access it anymore. Schedule RCU free. | 
| 3605 | */ | 3315 | */ | 
| 3606 | if (is_last) { | 3316 | if (is_last) | 
| 3607 | free_workqueue_attrs(wq->unbound_attrs); | 3317 | call_rcu_sched(&wq->rcu, rcu_free_wq); | 
| 3608 | kfree(wq); | ||
| 3609 | } | ||
| 3610 | } | 3318 | } | 
| 3611 | 3319 | ||
| 3612 | /** | 3320 | /** | 
| @@ -4143,7 +3851,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, | |||
| 4143 | pwq_adjust_max_active(pwq); | 3851 | pwq_adjust_max_active(pwq); | 
| 4144 | mutex_unlock(&wq->mutex); | 3852 | mutex_unlock(&wq->mutex); | 
| 4145 | 3853 | ||
| 4146 | list_add(&wq->list, &workqueues); | 3854 | list_add_tail_rcu(&wq->list, &workqueues); | 
| 4147 | 3855 | ||
| 4148 | mutex_unlock(&wq_pool_mutex); | 3856 | mutex_unlock(&wq_pool_mutex); | 
| 4149 | 3857 | ||
| @@ -4199,24 +3907,20 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 4199 | * flushing is complete in case freeze races us. | 3907 | * flushing is complete in case freeze races us. | 
| 4200 | */ | 3908 | */ | 
| 4201 | mutex_lock(&wq_pool_mutex); | 3909 | mutex_lock(&wq_pool_mutex); | 
| 4202 | list_del_init(&wq->list); | 3910 | list_del_rcu(&wq->list); | 
| 4203 | mutex_unlock(&wq_pool_mutex); | 3911 | mutex_unlock(&wq_pool_mutex); | 
| 4204 | 3912 | ||
| 4205 | workqueue_sysfs_unregister(wq); | 3913 | workqueue_sysfs_unregister(wq); | 
| 4206 | 3914 | ||
| 4207 | if (wq->rescuer) { | 3915 | if (wq->rescuer) | 
| 4208 | kthread_stop(wq->rescuer->task); | 3916 | kthread_stop(wq->rescuer->task); | 
| 4209 | kfree(wq->rescuer); | ||
| 4210 | wq->rescuer = NULL; | ||
| 4211 | } | ||
| 4212 | 3917 | ||
| 4213 | if (!(wq->flags & WQ_UNBOUND)) { | 3918 | if (!(wq->flags & WQ_UNBOUND)) { | 
| 4214 | /* | 3919 | /* | 
| 4215 | * The base ref is never dropped on per-cpu pwqs. Directly | 3920 | * The base ref is never dropped on per-cpu pwqs. Directly | 
| 4216 | * free the pwqs and wq. | 3921 | * schedule RCU free. | 
| 4217 | */ | 3922 | */ | 
| 4218 | free_percpu(wq->cpu_pwqs); | 3923 | call_rcu_sched(&wq->rcu, rcu_free_wq); | 
| 4219 | kfree(wq); | ||
| 4220 | } else { | 3924 | } else { | 
| 4221 | /* | 3925 | /* | 
| 4222 | * We're the sole accessor of @wq at this point. Directly | 3926 | * We're the sole accessor of @wq at this point. Directly | 
| @@ -4437,6 +4141,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) | |||
| 4437 | } | 4141 | } | 
| 4438 | } | 4142 | } | 
| 4439 | 4143 | ||
| 4144 | static void pr_cont_pool_info(struct worker_pool *pool) | ||
| 4145 | { | ||
| 4146 | pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); | ||
| 4147 | if (pool->node != NUMA_NO_NODE) | ||
| 4148 | pr_cont(" node=%d", pool->node); | ||
| 4149 | pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); | ||
| 4150 | } | ||
| 4151 | |||
| 4152 | static void pr_cont_work(bool comma, struct work_struct *work) | ||
| 4153 | { | ||
| 4154 | if (work->func == wq_barrier_func) { | ||
| 4155 | struct wq_barrier *barr; | ||
| 4156 | |||
| 4157 | barr = container_of(work, struct wq_barrier, work); | ||
| 4158 | |||
| 4159 | pr_cont("%s BAR(%d)", comma ? "," : "", | ||
| 4160 | task_pid_nr(barr->task)); | ||
| 4161 | } else { | ||
| 4162 | pr_cont("%s %pf", comma ? "," : "", work->func); | ||
| 4163 | } | ||
| 4164 | } | ||
| 4165 | |||
| 4166 | static void show_pwq(struct pool_workqueue *pwq) | ||
| 4167 | { | ||
| 4168 | struct worker_pool *pool = pwq->pool; | ||
| 4169 | struct work_struct *work; | ||
| 4170 | struct worker *worker; | ||
| 4171 | bool has_in_flight = false, has_pending = false; | ||
| 4172 | int bkt; | ||
| 4173 | |||
| 4174 | pr_info(" pwq %d:", pool->id); | ||
| 4175 | pr_cont_pool_info(pool); | ||
| 4176 | |||
| 4177 | pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, | ||
| 4178 | !list_empty(&pwq->mayday_node) ? " MAYDAY" : ""); | ||
| 4179 | |||
| 4180 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
| 4181 | if (worker->current_pwq == pwq) { | ||
| 4182 | has_in_flight = true; | ||
| 4183 | break; | ||
| 4184 | } | ||
| 4185 | } | ||
| 4186 | if (has_in_flight) { | ||
| 4187 | bool comma = false; | ||
| 4188 | |||
| 4189 | pr_info(" in-flight:"); | ||
| 4190 | hash_for_each(pool->busy_hash, bkt, worker, hentry) { | ||
| 4191 | if (worker->current_pwq != pwq) | ||
| 4192 | continue; | ||
| 4193 | |||
| 4194 | pr_cont("%s %d%s:%pf", comma ? "," : "", | ||
| 4195 | task_pid_nr(worker->task), | ||
| 4196 | worker == pwq->wq->rescuer ? "(RESCUER)" : "", | ||
| 4197 | worker->current_func); | ||
| 4198 | list_for_each_entry(work, &worker->scheduled, entry) | ||
| 4199 | pr_cont_work(false, work); | ||
| 4200 | comma = true; | ||
| 4201 | } | ||
| 4202 | pr_cont("\n"); | ||
| 4203 | } | ||
| 4204 | |||
| 4205 | list_for_each_entry(work, &pool->worklist, entry) { | ||
| 4206 | if (get_work_pwq(work) == pwq) { | ||
| 4207 | has_pending = true; | ||
| 4208 | break; | ||
| 4209 | } | ||
| 4210 | } | ||
| 4211 | if (has_pending) { | ||
| 4212 | bool comma = false; | ||
| 4213 | |||
| 4214 | pr_info(" pending:"); | ||
| 4215 | list_for_each_entry(work, &pool->worklist, entry) { | ||
| 4216 | if (get_work_pwq(work) != pwq) | ||
| 4217 | continue; | ||
| 4218 | |||
| 4219 | pr_cont_work(comma, work); | ||
| 4220 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
| 4221 | } | ||
| 4222 | pr_cont("\n"); | ||
| 4223 | } | ||
| 4224 | |||
| 4225 | if (!list_empty(&pwq->delayed_works)) { | ||
| 4226 | bool comma = false; | ||
| 4227 | |||
| 4228 | pr_info(" delayed:"); | ||
| 4229 | list_for_each_entry(work, &pwq->delayed_works, entry) { | ||
| 4230 | pr_cont_work(comma, work); | ||
| 4231 | comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); | ||
| 4232 | } | ||
| 4233 | pr_cont("\n"); | ||
| 4234 | } | ||
| 4235 | } | ||
| 4236 | |||
| 4237 | /** | ||
| 4238 | * show_workqueue_state - dump workqueue state | ||
| 4239 | * | ||
| 4240 | * Called from a sysrq handler and prints out all busy workqueues and | ||
| 4241 | * pools. | ||
| 4242 | */ | ||
| 4243 | void show_workqueue_state(void) | ||
| 4244 | { | ||
| 4245 | struct workqueue_struct *wq; | ||
| 4246 | struct worker_pool *pool; | ||
| 4247 | unsigned long flags; | ||
| 4248 | int pi; | ||
| 4249 | |||
| 4250 | rcu_read_lock_sched(); | ||
| 4251 | |||
| 4252 | pr_info("Showing busy workqueues and worker pools:\n"); | ||
| 4253 | |||
| 4254 | list_for_each_entry_rcu(wq, &workqueues, list) { | ||
| 4255 | struct pool_workqueue *pwq; | ||
| 4256 | bool idle = true; | ||
| 4257 | |||
| 4258 | for_each_pwq(pwq, wq) { | ||
| 4259 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { | ||
| 4260 | idle = false; | ||
| 4261 | break; | ||
| 4262 | } | ||
| 4263 | } | ||
| 4264 | if (idle) | ||
| 4265 | continue; | ||
| 4266 | |||
| 4267 | pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); | ||
| 4268 | |||
| 4269 | for_each_pwq(pwq, wq) { | ||
| 4270 | spin_lock_irqsave(&pwq->pool->lock, flags); | ||
| 4271 | if (pwq->nr_active || !list_empty(&pwq->delayed_works)) | ||
| 4272 | show_pwq(pwq); | ||
| 4273 | spin_unlock_irqrestore(&pwq->pool->lock, flags); | ||
| 4274 | } | ||
| 4275 | } | ||
| 4276 | |||
| 4277 | for_each_pool(pool, pi) { | ||
| 4278 | struct worker *worker; | ||
| 4279 | bool first = true; | ||
| 4280 | |||
| 4281 | spin_lock_irqsave(&pool->lock, flags); | ||
| 4282 | if (pool->nr_workers == pool->nr_idle) | ||
| 4283 | goto next_pool; | ||
| 4284 | |||
| 4285 | pr_info("pool %d:", pool->id); | ||
| 4286 | pr_cont_pool_info(pool); | ||
| 4287 | pr_cont(" workers=%d", pool->nr_workers); | ||
| 4288 | if (pool->manager) | ||
| 4289 | pr_cont(" manager: %d", | ||
| 4290 | task_pid_nr(pool->manager->task)); | ||
| 4291 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
| 4292 | pr_cont(" %s%d", first ? "idle: " : "", | ||
| 4293 | task_pid_nr(worker->task)); | ||
| 4294 | first = false; | ||
| 4295 | } | ||
| 4296 | pr_cont("\n"); | ||
| 4297 | next_pool: | ||
| 4298 | spin_unlock_irqrestore(&pool->lock, flags); | ||
| 4299 | } | ||
| 4300 | |||
| 4301 | rcu_read_unlock_sched(); | ||
| 4302 | } | ||
| 4303 | |||
| 4440 | /* | 4304 | /* | 
| 4441 | * CPU hotplug. | 4305 | * CPU hotplug. | 
| 4442 | * | 4306 | * | 
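In the dump path above, pr_cont_work() recovers the enclosing wq_barrier from its embedded work_struct via container_of() so that flush barriers print as "BAR(<pid>)". The stand-alone sketch below shows that pattern with simplified local types and a local container_of definition (needed outside the kernel); it is an illustration of the technique, not the workqueue code itself.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {
	void (*func)(struct work_struct *work);
};

/* Simplified stand-in for struct wq_barrier: a work item plus the pid of
 * the task waiting on it, which the dump prints as "BAR(<pid>)". */
struct wq_barrier {
	struct work_struct work;
	int task_pid;
};

static void wq_barrier_func(struct work_struct *work)
{
	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);

	printf("BAR(%d)\n", barr->task_pid);
}

int main(void)
{
	struct wq_barrier barr = {
		.work = { .func = wq_barrier_func },
		.task_pid = 1234,
	};

	/* The dump path only has the work_struct pointer... */
	struct work_struct *work = &barr.work;

	/* ...and climbs back to the containing wq_barrier. */
	work->func(work);
	return 0;
}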
| @@ -4834,6 +4698,323 @@ out_unlock: | |||
| 4834 | } | 4698 | } | 
| 4835 | #endif /* CONFIG_FREEZER */ | 4699 | #endif /* CONFIG_FREEZER */ | 
| 4836 | 4700 | ||
| 4701 | #ifdef CONFIG_SYSFS | ||
| 4702 | /* | ||
| 4703 | * Workqueues with WQ_SYSFS flag set is visible to userland via | ||
| 4704 | * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the | ||
| 4705 | * following attributes. | ||
| 4706 | * | ||
| 4707 | * per_cpu RO bool : whether the workqueue is per-cpu or unbound | ||
| 4708 | * max_active RW int : maximum number of in-flight work items | ||
| 4709 | * | ||
| 4710 | * Unbound workqueues have the following extra attributes. | ||
| 4711 | * | ||
| 4712 | * id RO int : the associated pool ID | ||
| 4713 | * nice RW int : nice value of the workers | ||
| 4714 | * cpumask RW mask : bitmask of allowed CPUs for the workers | ||
| 4715 | */ | ||
| 4716 | struct wq_device { | ||
| 4717 | struct workqueue_struct *wq; | ||
| 4718 | struct device dev; | ||
| 4719 | }; | ||
| 4720 | |||
| 4721 | static struct workqueue_struct *dev_to_wq(struct device *dev) | ||
| 4722 | { | ||
| 4723 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 4724 | |||
| 4725 | return wq_dev->wq; | ||
| 4726 | } | ||
| 4727 | |||
| 4728 | static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, | ||
| 4729 | char *buf) | ||
| 4730 | { | ||
| 4731 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4732 | |||
| 4733 | return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); | ||
| 4734 | } | ||
| 4735 | static DEVICE_ATTR_RO(per_cpu); | ||
| 4736 | |||
| 4737 | static ssize_t max_active_show(struct device *dev, | ||
| 4738 | struct device_attribute *attr, char *buf) | ||
| 4739 | { | ||
| 4740 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4741 | |||
| 4742 | return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); | ||
| 4743 | } | ||
| 4744 | |||
| 4745 | static ssize_t max_active_store(struct device *dev, | ||
| 4746 | struct device_attribute *attr, const char *buf, | ||
| 4747 | size_t count) | ||
| 4748 | { | ||
| 4749 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4750 | int val; | ||
| 4751 | |||
| 4752 | if (sscanf(buf, "%d", &val) != 1 || val <= 0) | ||
| 4753 | return -EINVAL; | ||
| 4754 | |||
| 4755 | workqueue_set_max_active(wq, val); | ||
| 4756 | return count; | ||
| 4757 | } | ||
| 4758 | static DEVICE_ATTR_RW(max_active); | ||
| 4759 | |||
| 4760 | static struct attribute *wq_sysfs_attrs[] = { | ||
| 4761 | &dev_attr_per_cpu.attr, | ||
| 4762 | &dev_attr_max_active.attr, | ||
| 4763 | NULL, | ||
| 4764 | }; | ||
| 4765 | ATTRIBUTE_GROUPS(wq_sysfs); | ||
| 4766 | |||
| 4767 | static ssize_t wq_pool_ids_show(struct device *dev, | ||
| 4768 | struct device_attribute *attr, char *buf) | ||
| 4769 | { | ||
| 4770 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4771 | const char *delim = ""; | ||
| 4772 | int node, written = 0; | ||
| 4773 | |||
| 4774 | rcu_read_lock_sched(); | ||
| 4775 | for_each_node(node) { | ||
| 4776 | written += scnprintf(buf + written, PAGE_SIZE - written, | ||
| 4777 | "%s%d:%d", delim, node, | ||
| 4778 | unbound_pwq_by_node(wq, node)->pool->id); | ||
| 4779 | delim = " "; | ||
| 4780 | } | ||
| 4781 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | ||
| 4782 | rcu_read_unlock_sched(); | ||
| 4783 | |||
| 4784 | return written; | ||
| 4785 | } | ||
| 4786 | |||
| 4787 | static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, | ||
| 4788 | char *buf) | ||
| 4789 | { | ||
| 4790 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4791 | int written; | ||
| 4792 | |||
| 4793 | mutex_lock(&wq->mutex); | ||
| 4794 | written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); | ||
| 4795 | mutex_unlock(&wq->mutex); | ||
| 4796 | |||
| 4797 | return written; | ||
| 4798 | } | ||
| 4799 | |||
| 4800 | /* prepare workqueue_attrs for sysfs store operations */ | ||
| 4801 | static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | ||
| 4802 | { | ||
| 4803 | struct workqueue_attrs *attrs; | ||
| 4804 | |||
| 4805 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | ||
| 4806 | if (!attrs) | ||
| 4807 | return NULL; | ||
| 4808 | |||
| 4809 | mutex_lock(&wq->mutex); | ||
| 4810 | copy_workqueue_attrs(attrs, wq->unbound_attrs); | ||
| 4811 | mutex_unlock(&wq->mutex); | ||
| 4812 | return attrs; | ||
| 4813 | } | ||
| 4814 | |||
| 4815 | static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, | ||
| 4816 | const char *buf, size_t count) | ||
| 4817 | { | ||
| 4818 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4819 | struct workqueue_attrs *attrs; | ||
| 4820 | int ret; | ||
| 4821 | |||
| 4822 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 4823 | if (!attrs) | ||
| 4824 | return -ENOMEM; | ||
| 4825 | |||
| 4826 | if (sscanf(buf, "%d", &attrs->nice) == 1 && | ||
| 4827 | attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) | ||
| 4828 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 4829 | else | ||
| 4830 | ret = -EINVAL; | ||
| 4831 | |||
| 4832 | free_workqueue_attrs(attrs); | ||
| 4833 | return ret ?: count; | ||
| 4834 | } | ||
| 4835 | |||
| 4836 | static ssize_t wq_cpumask_show(struct device *dev, | ||
| 4837 | struct device_attribute *attr, char *buf) | ||
| 4838 | { | ||
| 4839 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4840 | int written; | ||
| 4841 | |||
| 4842 | mutex_lock(&wq->mutex); | ||
| 4843 | written = scnprintf(buf, PAGE_SIZE, "%*pb\n", | ||
| 4844 | cpumask_pr_args(wq->unbound_attrs->cpumask)); | ||
| 4845 | mutex_unlock(&wq->mutex); | ||
| 4846 | return written; | ||
| 4847 | } | ||
| 4848 | |||
| 4849 | static ssize_t wq_cpumask_store(struct device *dev, | ||
| 4850 | struct device_attribute *attr, | ||
| 4851 | const char *buf, size_t count) | ||
| 4852 | { | ||
| 4853 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4854 | struct workqueue_attrs *attrs; | ||
| 4855 | int ret; | ||
| 4856 | |||
| 4857 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 4858 | if (!attrs) | ||
| 4859 | return -ENOMEM; | ||
| 4860 | |||
| 4861 | ret = cpumask_parse(buf, attrs->cpumask); | ||
| 4862 | if (!ret) | ||
| 4863 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 4864 | |||
| 4865 | free_workqueue_attrs(attrs); | ||
| 4866 | return ret ?: count; | ||
| 4867 | } | ||
| 4868 | |||
| 4869 | static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, | ||
| 4870 | char *buf) | ||
| 4871 | { | ||
| 4872 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4873 | int written; | ||
| 4874 | |||
| 4875 | mutex_lock(&wq->mutex); | ||
| 4876 | written = scnprintf(buf, PAGE_SIZE, "%d\n", | ||
| 4877 | !wq->unbound_attrs->no_numa); | ||
| 4878 | mutex_unlock(&wq->mutex); | ||
| 4879 | |||
| 4880 | return written; | ||
| 4881 | } | ||
| 4882 | |||
| 4883 | static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, | ||
| 4884 | const char *buf, size_t count) | ||
| 4885 | { | ||
| 4886 | struct workqueue_struct *wq = dev_to_wq(dev); | ||
| 4887 | struct workqueue_attrs *attrs; | ||
| 4888 | int v, ret; | ||
| 4889 | |||
| 4890 | attrs = wq_sysfs_prep_attrs(wq); | ||
| 4891 | if (!attrs) | ||
| 4892 | return -ENOMEM; | ||
| 4893 | |||
| 4894 | ret = -EINVAL; | ||
| 4895 | if (sscanf(buf, "%d", &v) == 1) { | ||
| 4896 | attrs->no_numa = !v; | ||
| 4897 | ret = apply_workqueue_attrs(wq, attrs); | ||
| 4898 | } | ||
| 4899 | |||
| 4900 | free_workqueue_attrs(attrs); | ||
| 4901 | return ret ?: count; | ||
| 4902 | } | ||
| 4903 | |||
| 4904 | static struct device_attribute wq_sysfs_unbound_attrs[] = { | ||
| 4905 | __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), | ||
| 4906 | __ATTR(nice, 0644, wq_nice_show, wq_nice_store), | ||
| 4907 | __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), | ||
| 4908 | __ATTR(numa, 0644, wq_numa_show, wq_numa_store), | ||
| 4909 | __ATTR_NULL, | ||
| 4910 | }; | ||
| 4911 | |||
| 4912 | static struct bus_type wq_subsys = { | ||
| 4913 | .name = "workqueue", | ||
| 4914 | .dev_groups = wq_sysfs_groups, | ||
| 4915 | }; | ||
| 4916 | |||
| 4917 | static int __init wq_sysfs_init(void) | ||
| 4918 | { | ||
| 4919 | return subsys_virtual_register(&wq_subsys, NULL); | ||
| 4920 | } | ||
| 4921 | core_initcall(wq_sysfs_init); | ||
| 4922 | |||
| 4923 | static void wq_device_release(struct device *dev) | ||
| 4924 | { | ||
| 4925 | struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); | ||
| 4926 | |||
| 4927 | kfree(wq_dev); | ||
| 4928 | } | ||
| 4929 | |||
| 4930 | /** | ||
| 4931 | * workqueue_sysfs_register - make a workqueue visible in sysfs | ||
| 4932 | * @wq: the workqueue to register | ||
| 4933 | * | ||
| 4934 | * Expose @wq in sysfs under /sys/bus/workqueue/devices. | ||
| 4935 | * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set | ||
| 4936 | * which is the preferred method. | ||
| 4937 | * | ||
| 4938 | * Workqueue user should use this function directly iff it wants to apply | ||
| 4939 | * workqueue_attrs before making the workqueue visible in sysfs; otherwise, | ||
| 4940 | * apply_workqueue_attrs() may race against userland updating the | ||
| 4941 | * attributes. | ||
| 4942 | * | ||
| 4943 | * Return: 0 on success, -errno on failure. | ||
| 4944 | */ | ||
| 4945 | int workqueue_sysfs_register(struct workqueue_struct *wq) | ||
| 4946 | { | ||
| 4947 | struct wq_device *wq_dev; | ||
| 4948 | int ret; | ||
| 4949 | |||
| 4950 | /* | ||
| 4951 | * Adjusting max_active or creating new pwqs by applyting | ||
| 4952 | * attributes breaks ordering guarantee. Disallow exposing ordered | ||
| 4953 | * workqueues. | ||
| 4954 | */ | ||
| 4955 | if (WARN_ON(wq->flags & __WQ_ORDERED)) | ||
| 4956 | return -EINVAL; | ||
| 4957 | |||
| 4958 | wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); | ||
| 4959 | if (!wq_dev) | ||
| 4960 | return -ENOMEM; | ||
| 4961 | |||
| 4962 | wq_dev->wq = wq; | ||
| 4963 | wq_dev->dev.bus = &wq_subsys; | ||
| 4964 | wq_dev->dev.init_name = wq->name; | ||
| 4965 | wq_dev->dev.release = wq_device_release; | ||
| 4966 | |||
| 4967 | /* | ||
| 4968 | * unbound_attrs are created separately. Suppress uevent until | ||
| 4969 | * everything is ready. | ||
| 4970 | */ | ||
| 4971 | dev_set_uevent_suppress(&wq_dev->dev, true); | ||
| 4972 | |||
| 4973 | ret = device_register(&wq_dev->dev); | ||
| 4974 | if (ret) { | ||
| 4975 | kfree(wq_dev); | ||
| 4976 | wq->wq_dev = NULL; | ||
| 4977 | return ret; | ||
| 4978 | } | ||
| 4979 | |||
| 4980 | if (wq->flags & WQ_UNBOUND) { | ||
| 4981 | struct device_attribute *attr; | ||
| 4982 | |||
| 4983 | for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { | ||
| 4984 | ret = device_create_file(&wq_dev->dev, attr); | ||
| 4985 | if (ret) { | ||
| 4986 | device_unregister(&wq_dev->dev); | ||
| 4987 | wq->wq_dev = NULL; | ||
| 4988 | return ret; | ||
| 4989 | } | ||
| 4990 | } | ||
| 4991 | } | ||
| 4992 | |||
| 4993 | dev_set_uevent_suppress(&wq_dev->dev, false); | ||
| 4994 | kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); | ||
| 4995 | return 0; | ||
| 4996 | } | ||
| 4997 | |||
| 4998 | /** | ||
| 4999 | * workqueue_sysfs_unregister - undo workqueue_sysfs_register() | ||
| 5000 | * @wq: the workqueue to unregister | ||
| 5001 | * | ||
| 5002 | * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. | ||
| 5003 | */ | ||
| 5004 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) | ||
| 5005 | { | ||
| 5006 | struct wq_device *wq_dev = wq->wq_dev; | ||
| 5007 | |||
| 5008 | if (!wq->wq_dev) | ||
| 5009 | return; | ||
| 5010 | |||
| 5011 | wq->wq_dev = NULL; | ||
| 5012 | device_unregister(&wq_dev->dev); | ||
| 5013 | } | ||
| 5014 | #else /* CONFIG_SYSFS */ | ||
| 5015 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } | ||
| 5016 | #endif /* CONFIG_SYSFS */ | ||
| 5017 | |||
| 4837 | static void __init wq_numa_init(void) | 5018 | static void __init wq_numa_init(void) | 
| 4838 | { | 5019 | { | 
| 4839 | cpumask_var_t *tbl; | 5020 | cpumask_var_t *tbl; | 
