Diffstat (limited to 'kernel')
108 files changed, 5540 insertions, 2440 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..e9cf19155b46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg | |||
21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg | 21 | CFLAGS_REMOVE_rtmutex-debug.o = -pg |
22 | CFLAGS_REMOVE_cgroup-debug.o = -pg | 22 | CFLAGS_REMOVE_cgroup-debug.o = -pg |
23 | CFLAGS_REMOVE_sched_clock.o = -pg | 23 | CFLAGS_REMOVE_sched_clock.o = -pg |
24 | CFLAGS_REMOVE_perf_event.o = -pg | ||
25 | CFLAGS_REMOVE_irq_work.o = -pg | 24 | CFLAGS_REMOVE_irq_work.o = -pg |
26 | endif | 25 | endif |
27 | 26 | ||
@@ -103,8 +102,9 @@ obj-$(CONFIG_RING_BUFFER) += trace/ | |||
103 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 102 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
104 | obj-$(CONFIG_SMP) += sched_cpupri.o | 103 | obj-$(CONFIG_SMP) += sched_cpupri.o |
105 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 104 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
106 | obj-$(CONFIG_PERF_EVENTS) += perf_event.o | 105 | |
107 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | 106 | obj-$(CONFIG_PERF_EVENTS) += events/ |
107 | |||
108 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o | 108 | obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o |
109 | obj-$(CONFIG_PADATA) += padata.o | 109 | obj-$(CONFIG_PADATA) += padata.o |
110 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 110 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void) | |||
607 | spin_lock(&hash_lock); | 607 | spin_lock(&hash_lock); |
608 | list_for_each_entry(node, &tree->chunks, list) { | 608 | list_for_each_entry(node, &tree->chunks, list) { |
609 | struct audit_chunk *chunk = find_chunk(node); | 609 | struct audit_chunk *chunk = find_chunk(node); |
610 | /* this could be NULL if the watch is dieing else where... */ | 610 | /* this could be NULL if the watch is dying else where... */ |
611 | struct inode *inode = chunk->mark.i.inode; | 611 | struct inode *inode = chunk->mark.i.inode; |
612 | node->index |= 1U<<31; | 612 | node->index |= 1U<<31; |
613 | if (iterate_mounts(compare_root, inode, root_mnt)) | 613 | if (iterate_mounts(compare_root, inode, root_mnt)) |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..b33513a08beb 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1011,7 +1011,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
1011 | /* | 1011 | /* |
1012 | * to_send and len_sent accounting are very loose estimates. We aren't | 1012 | * to_send and len_sent accounting are very loose estimates. We aren't |
1013 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being | 1013 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being |
1014 | * within about 500 bytes (next page boundry) | 1014 | * within about 500 bytes (next page boundary) |
1015 | * | 1015 | * |
1016 | * why snprintf? an int is up to 12 digits long. if we just assumed when | 1016 | * why snprintf? an int is up to 12 digits long. if we just assumed when |
1017 | * logging that a[%d]= was going to be 16 characters long we would be wasting | 1017 | * logging that a[%d]= was going to be 16 characters long we would be wasting |
diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..32a80e08ff4b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -399,3 +399,15 @@ bool task_ns_capable(struct task_struct *t, int cap) | |||
399 | return ns_capable(task_cred_xxx(t, user)->user_ns, cap); | 399 | return ns_capable(task_cred_xxx(t, user)->user_ns, cap); |
400 | } | 400 | } |
401 | EXPORT_SYMBOL(task_ns_capable); | 401 | EXPORT_SYMBOL(task_ns_capable); |
402 | |||
403 | /** | ||
404 | * nsown_capable - Check superior capability to one's own user_ns | ||
405 | * @cap: The capability in question | ||
406 | * | ||
407 | * Return true if the current task has the given superior capability | ||
408 | * targeted at its own user namespace. | ||
409 | */ | ||
410 | bool nsown_capable(int cap) | ||
411 | { | ||
412 | return ns_capable(current_user_ns(), cap); | ||
413 | } | ||
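
The nsown_capable() helper added above is a thin wrapper around ns_capable() that targets the caller's own user namespace. A minimal sketch of the kind of permission check it enables; the surrounding function and its policy are invented for illustration, only nsown_capable() and CAP_SYS_ADMIN are real:

#include <linux/capability.h>
#include <linux/cred.h>
#include <linux/errno.h>

/* Hypothetical check: allow an operation if the caller holds
 * CAP_SYS_ADMIN in its own user namespace, without requiring any
 * capability over the initial namespace. */
static int example_ns_local_admin_check(void)
{
	if (!nsown_capable(CAP_SYS_ADMIN))
		return -EPERM;
	return 0;
}
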
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e31b220a743d..909a35510af5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -157,7 +157,7 @@ struct css_id { | |||
157 | }; | 157 | }; |
158 | 158 | ||
159 | /* | 159 | /* |
160 | * cgroup_event represents events which userspace want to recieve. | 160 | * cgroup_event represents events which userspace want to receive. |
161 | */ | 161 | */ |
162 | struct cgroup_event { | 162 | struct cgroup_event { |
163 | /* | 163 | /* |
@@ -326,12 +326,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | |||
326 | return &css_set_table[index]; | 326 | return &css_set_table[index]; |
327 | } | 327 | } |
328 | 328 | ||
329 | static void free_css_set_rcu(struct rcu_head *obj) | ||
330 | { | ||
331 | struct css_set *cg = container_of(obj, struct css_set, rcu_head); | ||
332 | kfree(cg); | ||
333 | } | ||
334 | |||
335 | /* We don't maintain the lists running through each css_set to its | 329 | /* We don't maintain the lists running through each css_set to its |
336 | * task until after the first call to cgroup_iter_start(). This | 330 | * task until after the first call to cgroup_iter_start(). This |
337 | * reduces the fork()/exit() overhead for people who have cgroups | 331 | * reduces the fork()/exit() overhead for people who have cgroups |
@@ -375,7 +369,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
375 | } | 369 | } |
376 | 370 | ||
377 | write_unlock(&css_set_lock); | 371 | write_unlock(&css_set_lock); |
378 | call_rcu(&cg->rcu_head, free_css_set_rcu); | 372 | kfree_rcu(cg, rcu_head); |
379 | } | 373 | } |
380 | 374 | ||
381 | /* | 375 | /* |
@@ -812,13 +806,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
812 | return ret; | 806 | return ret; |
813 | } | 807 | } |
814 | 808 | ||
815 | static void free_cgroup_rcu(struct rcu_head *obj) | ||
816 | { | ||
817 | struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); | ||
818 | |||
819 | kfree(cgrp); | ||
820 | } | ||
821 | |||
822 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 809 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
823 | { | 810 | { |
824 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 811 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
@@ -856,7 +843,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
856 | */ | 843 | */ |
857 | BUG_ON(!list_empty(&cgrp->pidlists)); | 844 | BUG_ON(!list_empty(&cgrp->pidlists)); |
858 | 845 | ||
859 | call_rcu(&cgrp->rcu_head, free_cgroup_rcu); | 846 | kfree_rcu(cgrp, rcu_head); |
860 | } | 847 | } |
861 | iput(inode); | 848 | iput(inode); |
862 | } | 849 | } |
@@ -4623,14 +4610,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
4623 | return ret; | 4610 | return ret; |
4624 | } | 4611 | } |
4625 | 4612 | ||
4626 | static void __free_css_id_cb(struct rcu_head *head) | ||
4627 | { | ||
4628 | struct css_id *id; | ||
4629 | |||
4630 | id = container_of(head, struct css_id, rcu_head); | ||
4631 | kfree(id); | ||
4632 | } | ||
4633 | |||
4634 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 4613 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
4635 | { | 4614 | { |
4636 | struct css_id *id = css->id; | 4615 | struct css_id *id = css->id; |
@@ -4645,7 +4624,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
4645 | spin_lock(&ss->id_lock); | 4624 | spin_lock(&ss->id_lock); |
4646 | idr_remove(&ss->idr, id->id); | 4625 | idr_remove(&ss->idr, id->id); |
4647 | spin_unlock(&ss->id_lock); | 4626 | spin_unlock(&ss->id_lock); |
4648 | call_rcu(&id->rcu_head, __free_css_id_cb); | 4627 | kfree_rcu(id, rcu_head); |
4649 | } | 4628 | } |
4650 | EXPORT_SYMBOL_GPL(free_css_id); | 4629 | EXPORT_SYMBOL_GPL(free_css_id); |
4651 | 4630 | ||
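
Several of the cgroup hunks above (and more below in kernel/events/core.c) replace a call_rcu() callback whose only job was container_of() plus kfree() with kfree_rcu(). The pattern, reduced to a sketch around a made-up struct example_obj:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
	int data;
	struct rcu_head rcu_head;
};

/* Old style: a dedicated RCU callback exists only to kfree() the
 * enclosing object after a grace period. */
static void example_obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct example_obj, rcu_head));
}

static void example_obj_release_old(struct example_obj *obj)
{
	call_rcu(&obj->rcu_head, example_obj_free_rcu);
}

/* New style: kfree_rcu() takes the object and the name of its
 * rcu_head member, so the boilerplate callback can be deleted. */
static void example_obj_release_new(struct example_obj *obj)
{
	kfree_rcu(obj, rcu_head);
}
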
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c95fc4df0faa..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void) | |||
126 | #else /* #if CONFIG_HOTPLUG_CPU */ | 126 | #else /* #if CONFIG_HOTPLUG_CPU */ |
127 | static void cpu_hotplug_begin(void) {} | 127 | static void cpu_hotplug_begin(void) {} |
128 | static void cpu_hotplug_done(void) {} | 128 | static void cpu_hotplug_done(void) {} |
129 | #endif /* #esle #if CONFIG_HOTPLUG_CPU */ | 129 | #endif /* #else #if CONFIG_HOTPLUG_CPU */ |
130 | 130 | ||
131 | /* Need to know about CPUs going up/down? */ | 131 | /* Need to know about CPUs going up/down? */ |
132 | int __ref register_cpu_notifier(struct notifier_block *nb) | 132 | int __ref register_cpu_notifier(struct notifier_block *nb) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 33eee16addb8..2bb8c2e98fff 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) | |||
1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) | 1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
1160 | { | 1160 | { |
1161 | #ifdef CONFIG_SMP | 1161 | #ifdef CONFIG_SMP |
1162 | if (val < -1 || val >= SD_LV_MAX) | 1162 | if (val < -1 || val >= sched_domain_level_max) |
1163 | return -EINVAL; | 1163 | return -EINVAL; |
1164 | #endif | 1164 | #endif |
1165 | 1165 | ||
diff --git a/kernel/cred.c b/kernel/cred.c
index 5557b55048df..8093c16b84b1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -54,6 +54,7 @@ struct cred init_cred = { | |||
54 | .cap_effective = CAP_INIT_EFF_SET, | 54 | .cap_effective = CAP_INIT_EFF_SET, |
55 | .cap_bset = CAP_INIT_BSET, | 55 | .cap_bset = CAP_INIT_BSET, |
56 | .user = INIT_USER, | 56 | .user = INIT_USER, |
57 | .user_ns = &init_user_ns, | ||
57 | .group_info = &init_groups, | 58 | .group_info = &init_groups, |
58 | #ifdef CONFIG_KEYS | 59 | #ifdef CONFIG_KEYS |
59 | .tgcred = &init_tgcred, | 60 | .tgcred = &init_tgcred, |
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
410 | goto error_put; | 411 | goto error_put; |
411 | } | 412 | } |
412 | 413 | ||
414 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
415 | * stay pinned by cred->user | ||
416 | */ | ||
417 | new->user_ns = new->user->user_ns; | ||
418 | |||
413 | #ifdef CONFIG_KEYS | 419 | #ifdef CONFIG_KEYS |
414 | /* new threads get their own thread keyrings if their parent already | 420 | /* new threads get their own thread keyrings if their parent already |
415 | * had one */ | 421 | * had one */ |
@@ -741,12 +747,6 @@ int set_create_files_as(struct cred *new, struct inode *inode) | |||
741 | } | 747 | } |
742 | EXPORT_SYMBOL(set_create_files_as); | 748 | EXPORT_SYMBOL(set_create_files_as); |
743 | 749 | ||
744 | struct user_namespace *current_user_ns(void) | ||
745 | { | ||
746 | return _current_user_ns(); | ||
747 | } | ||
748 | EXPORT_SYMBOL(current_user_ns); | ||
749 | |||
750 | #ifdef CONFIG_DEBUG_CREDENTIALS | 750 | #ifdef CONFIG_DEBUG_CREDENTIALS |
751 | 751 | ||
752 | bool creds_are_invalid(const struct cred *cred) | 752 | bool creds_are_invalid(const struct cred *cred) |
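
copy_creds() now caches the user namespace in cred->user_ns (kept alive by cred->user, so no extra reference is taken), and the exported out-of-line current_user_ns() wrapper is dropped. The replacement accessor lives in the cred headers outside this section; a hedged sketch of what such an inline boils down to, with the name prefixed to mark it as illustrative and the real definition possibly differing:

#include <linux/cred.h>

/* Sketch only: with the namespace cached in struct cred, the
 * accessor becomes a plain field read on the current credentials.
 * The actual definition lives in include/linux/cred.h. */
static inline struct user_namespace *example_current_user_ns(void)
{
	return current_cred()->user_ns;
}
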
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal: | |||
538 | 538 | ||
539 | /* | 539 | /* |
540 | * For single stepping, try to only enter on the processor | 540 | * For single stepping, try to only enter on the processor |
541 | * that was single stepping. To gaurd against a deadlock, the | 541 | * that was single stepping. To guard against a deadlock, the |
542 | * kernel will only try for the value of sstep_tries before | 542 | * kernel will only try for the value of sstep_tries before |
543 | * giving up and continuing on. | 543 | * giving up and continuing on. |
544 | */ | 544 | */ |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 6bc6e3bc4f9c..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -441,9 +441,9 @@ static int kdb_check_regs(void) | |||
441 | * symbol name, and offset to the caller. | 441 | * symbol name, and offset to the caller. |
442 | * | 442 | * |
443 | * The argument may consist of a numeric value (decimal or | 443 | * The argument may consist of a numeric value (decimal or |
444 | * hexidecimal), a symbol name, a register name (preceeded by the | 444 | * hexidecimal), a symbol name, a register name (preceded by the |
445 | * percent sign), an environment variable with a numeric value | 445 | * percent sign), an environment variable with a numeric value |
446 | * (preceeded by a dollar sign) or a simple arithmetic expression | 446 | * (preceded by a dollar sign) or a simple arithmetic expression |
447 | * consisting of a symbol name, +/-, and a numeric constant value | 447 | * consisting of a symbol name, +/-, and a numeric constant value |
448 | * (offset). | 448 | * (offset). |
449 | * Parameters: | 449 | * Parameters: |
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value) | |||
1335 | * error The hardware-defined error code | 1335 | * error The hardware-defined error code |
1336 | * reason2 kdb's current reason code. | 1336 | * reason2 kdb's current reason code. |
1337 | * Initially error but can change | 1337 | * Initially error but can change |
1338 | * acording to kdb state. | 1338 | * according to kdb state. |
1339 | * db_result Result code from break or debug point. | 1339 | * db_result Result code from break or debug point. |
1340 | * regs The exception frame at time of fault/breakpoint. | 1340 | * regs The exception frame at time of fault/breakpoint. |
1341 | * should always be valid. | 1341 | * should always be valid. |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size) | |||
545 | * Mask for process state. | 545 | * Mask for process state. |
546 | * Notes: | 546 | * Notes: |
547 | * The mask folds data from several sources into a single long value, so | 547 | * The mask folds data from several sources into a single long value, so |
548 | * be carefull not to overlap the bits. TASK_* bits are in the LSB, | 548 | * be careful not to overlap the bits. TASK_* bits are in the LSB, |
549 | * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there | 549 | * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there |
550 | * is no overlap between TASK_* and EXIT_* but that may not always be | 550 | * is no overlap between TASK_* and EXIT_* but that may not always be |
551 | * true, so EXIT_* bits are shifted left 16 bits before being stored in | 551 | * true, so EXIT_* bits are shifted left 16 bits before being stored in |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@ | |||
1 | ifdef CONFIG_FUNCTION_TRACER | ||
2 | CFLAGS_REMOVE_core.o = -pg | ||
3 | endif | ||
4 | |||
5 | obj-y := core.o | ||
6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | ||
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 27960f114efd..c09767f7db3e 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@ | |||
2 | * Performance events core code: | 2 | * Performance events core code: |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
6 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
8 | * | 8 | * |
9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
@@ -39,10 +39,10 @@ | |||
39 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
40 | 40 | ||
41 | struct remote_function_call { | 41 | struct remote_function_call { |
42 | struct task_struct *p; | 42 | struct task_struct *p; |
43 | int (*func)(void *info); | 43 | int (*func)(void *info); |
44 | void *info; | 44 | void *info; |
45 | int ret; | 45 | int ret; |
46 | }; | 46 | }; |
47 | 47 | ||
48 | static void remote_function(void *data) | 48 | static void remote_function(void *data) |
@@ -76,10 +76,10 @@ static int | |||
76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | 76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) |
77 | { | 77 | { |
78 | struct remote_function_call data = { | 78 | struct remote_function_call data = { |
79 | .p = p, | 79 | .p = p, |
80 | .func = func, | 80 | .func = func, |
81 | .info = info, | 81 | .info = info, |
82 | .ret = -ESRCH, /* No such (running) process */ | 82 | .ret = -ESRCH, /* No such (running) process */ |
83 | }; | 83 | }; |
84 | 84 | ||
85 | if (task_curr(p)) | 85 | if (task_curr(p)) |
@@ -100,10 +100,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | |||
100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | 100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) |
101 | { | 101 | { |
102 | struct remote_function_call data = { | 102 | struct remote_function_call data = { |
103 | .p = NULL, | 103 | .p = NULL, |
104 | .func = func, | 104 | .func = func, |
105 | .info = info, | 105 | .info = info, |
106 | .ret = -ENXIO, /* No such CPU */ | 106 | .ret = -ENXIO, /* No such CPU */ |
107 | }; | 107 | }; |
108 | 108 | ||
109 | smp_call_function_single(cpu, remote_function, &data, 1); | 109 | smp_call_function_single(cpu, remote_function, &data, 1); |
@@ -125,7 +125,7 @@ enum event_type_t { | |||
125 | * perf_sched_events : >0 events exist | 125 | * perf_sched_events : >0 events exist |
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
127 | */ | 127 | */ |
128 | atomic_t perf_sched_events __read_mostly; | 128 | struct jump_label_key perf_sched_events __read_mostly; |
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
130 | 130 | ||
131 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
@@ -364,6 +364,7 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
364 | } | 364 | } |
365 | 365 | ||
366 | if (mode & PERF_CGROUP_SWIN) { | 366 | if (mode & PERF_CGROUP_SWIN) { |
367 | WARN_ON_ONCE(cpuctx->cgrp); | ||
367 | /* set cgrp before ctxsw in to | 368 | /* set cgrp before ctxsw in to |
368 | * allow event_filter_match() to not | 369 | * allow event_filter_match() to not |
369 | * have to pass task around | 370 | * have to pass task around |
@@ -585,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx) | |||
585 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 586 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); |
586 | } | 587 | } |
587 | 588 | ||
588 | static void free_ctx(struct rcu_head *head) | ||
589 | { | ||
590 | struct perf_event_context *ctx; | ||
591 | |||
592 | ctx = container_of(head, struct perf_event_context, rcu_head); | ||
593 | kfree(ctx); | ||
594 | } | ||
595 | |||
596 | static void put_ctx(struct perf_event_context *ctx) | 589 | static void put_ctx(struct perf_event_context *ctx) |
597 | { | 590 | { |
598 | if (atomic_dec_and_test(&ctx->refcount)) { | 591 | if (atomic_dec_and_test(&ctx->refcount)) { |
@@ -600,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
600 | put_ctx(ctx->parent_ctx); | 593 | put_ctx(ctx->parent_ctx); |
601 | if (ctx->task) | 594 | if (ctx->task) |
602 | put_task_struct(ctx->task); | 595 | put_task_struct(ctx->task); |
603 | call_rcu(&ctx->rcu_head, free_ctx); | 596 | kfree_rcu(ctx, rcu_head); |
604 | } | 597 | } |
605 | } | 598 | } |
606 | 599 | ||
@@ -2423,6 +2416,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
2423 | if (!ctx || !ctx->nr_events) | 2416 | if (!ctx || !ctx->nr_events) |
2424 | goto out; | 2417 | goto out; |
2425 | 2418 | ||
2419 | /* | ||
2420 | * We must ctxsw out cgroup events to avoid conflict | ||
2421 | * when invoking perf_task_event_sched_in() later on | ||
2422 | * in this function. Otherwise we end up trying to | ||
2423 | * ctxswin cgroup events which are already scheduled | ||
2424 | * in. | ||
2425 | */ | ||
2426 | perf_cgroup_sched_out(current); | ||
2426 | task_ctx_sched_out(ctx, EVENT_ALL); | 2427 | task_ctx_sched_out(ctx, EVENT_ALL); |
2427 | 2428 | ||
2428 | raw_spin_lock(&ctx->lock); | 2429 | raw_spin_lock(&ctx->lock); |
@@ -2447,6 +2448,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
2447 | 2448 | ||
2448 | raw_spin_unlock(&ctx->lock); | 2449 | raw_spin_unlock(&ctx->lock); |
2449 | 2450 | ||
2451 | /* | ||
2452 | * Also calls ctxswin for cgroup events, if any: | ||
2453 | */ | ||
2450 | perf_event_context_sched_in(ctx, ctx->task); | 2454 | perf_event_context_sched_in(ctx, ctx->task); |
2451 | out: | 2455 | out: |
2452 | local_irq_restore(flags); | 2456 | local_irq_restore(flags); |
@@ -5319,14 +5323,6 @@ swevent_hlist_deref(struct swevent_htable *swhash) | |||
5319 | lockdep_is_held(&swhash->hlist_mutex)); | 5323 | lockdep_is_held(&swhash->hlist_mutex)); |
5320 | } | 5324 | } |
5321 | 5325 | ||
5322 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | ||
5323 | { | ||
5324 | struct swevent_hlist *hlist; | ||
5325 | |||
5326 | hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); | ||
5327 | kfree(hlist); | ||
5328 | } | ||
5329 | |||
5330 | static void swevent_hlist_release(struct swevent_htable *swhash) | 5326 | static void swevent_hlist_release(struct swevent_htable *swhash) |
5331 | { | 5327 | { |
5332 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); | 5328 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
@@ -5335,7 +5331,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) | |||
5335 | return; | 5331 | return; |
5336 | 5332 | ||
5337 | rcu_assign_pointer(swhash->swevent_hlist, NULL); | 5333 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
5338 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 5334 | kfree_rcu(hlist, rcu_head); |
5339 | } | 5335 | } |
5340 | 5336 | ||
5341 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 5337 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
@@ -5417,7 +5413,7 @@ fail: | |||
5417 | return err; | 5413 | return err; |
5418 | } | 5414 | } |
5419 | 5415 | ||
5420 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 5416 | struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
5421 | 5417 | ||
5422 | static void sw_perf_event_destroy(struct perf_event *event) | 5418 | static void sw_perf_event_destroy(struct perf_event *event) |
5423 | { | 5419 | { |
@@ -7433,11 +7429,11 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
7433 | } | 7429 | } |
7434 | 7430 | ||
7435 | struct cgroup_subsys perf_subsys = { | 7431 | struct cgroup_subsys perf_subsys = { |
7436 | .name = "perf_event", | 7432 | .name = "perf_event", |
7437 | .subsys_id = perf_subsys_id, | 7433 | .subsys_id = perf_subsys_id, |
7438 | .create = perf_cgroup_create, | 7434 | .create = perf_cgroup_create, |
7439 | .destroy = perf_cgroup_destroy, | 7435 | .destroy = perf_cgroup_destroy, |
7440 | .exit = perf_cgroup_exit, | 7436 | .exit = perf_cgroup_exit, |
7441 | .attach = perf_cgroup_attach, | 7437 | .attach = perf_cgroup_attach, |
7442 | }; | 7438 | }; |
7443 | #endif /* CONFIG_CGROUP_PERF */ | 7439 | #endif /* CONFIG_CGROUP_PERF */ |
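
The perf hunks above switch perf_sched_events and perf_swevent_enabled from atomic_t to struct jump_label_key, matching the jump label rework at the end of this patch. A minimal consumer-side sketch using the jump_label_enabled() helper introduced in kernel/jump_label.c below; it assumes the helper is declared via linux/jump_label.h, and it deliberately ignores the self-patching fast-path branch form, which is defined in headers not shown here:

#include <linux/jump_label.h>
#include <linux/kernel.h>

/* Sketch: slow-path query of whether any user has enabled the key.
 * Hot paths would normally rely on the patched branch instead of an
 * explicit read like this. */
static void example_report_key_state(struct jump_label_key *key)
{
	pr_info("key is %s\n",
		jump_label_enabled(key) ? "enabled" : "disabled");
}
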
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
diff --git a/kernel/exit.c b/kernel/exit.c
index 33837936b98c..20a406471525 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
841 | /* Let father know we died | 841 | /* Let father know we died |
842 | * | 842 | * |
843 | * Thread signals are configurable, but you aren't going to use | 843 | * Thread signals are configurable, but you aren't going to use |
844 | * that to send signals to arbitary processes. | 844 | * that to send signals to arbitrary processes. |
845 | * That stops right now. | 845 | * That stops right now. |
846 | * | 846 | * |
847 | * If the parent exec id doesn't match the exec id we saved | 847 | * If the parent exec id doesn't match the exec id we saved |
@@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code) | |||
1016 | /* | 1016 | /* |
1017 | * FIXME: do that only when needed, using sched_exit tracepoint | 1017 | * FIXME: do that only when needed, using sched_exit tracepoint |
1018 | */ | 1018 | */ |
1019 | flush_ptrace_hw_breakpoint(tsk); | 1019 | ptrace_put_breakpoints(tsk); |
1020 | 1020 | ||
1021 | exit_notify(tsk, group_dead); | 1021 | exit_notify(tsk, group_dead); |
1022 | #ifdef CONFIG_NUMA | 1022 | #ifdef CONFIG_NUMA |
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..c2d625fcda77 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,14 @@ int core_kernel_text(unsigned long addr) | |||
72 | return 0; | 72 | return 0; |
73 | } | 73 | } |
74 | 74 | ||
75 | int core_kernel_data(unsigned long addr) | ||
76 | { | ||
77 | if (addr >= (unsigned long)_sdata && | ||
78 | addr < (unsigned long)_edata) | ||
79 | return 1; | ||
80 | return 0; | ||
81 | } | ||
82 | |||
75 | int __kernel_text_address(unsigned long addr) | 83 | int __kernel_text_address(unsigned long addr) |
76 | { | 84 | { |
77 | if (core_kernel_text(addr)) | 85 | if (core_kernel_text(addr)) |
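
core_kernel_data() above reports whether an address lies in the kernel image's static data, i.e. between _sdata and _edata. A hedged usage sketch; the caller and its policy are invented, and the declaration is assumed to be exported through linux/kernel.h:

#include <linux/kernel.h>

/* Illustrative caller: treat objects inside the kernel image's data
 * section as statically allocated, everything else as dynamic. */
static bool example_is_static_kernel_object(const void *obj)
{
	return core_kernel_data((unsigned long)obj) != 0;
}
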
diff --git a/kernel/fork.c b/kernel/fork.c
index e7548dee636b..2b44d82b8237 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1103 | 1103 | ||
1104 | posix_cpu_timers_init(p); | 1104 | posix_cpu_timers_init(p); |
1105 | 1105 | ||
1106 | p->lock_depth = -1; /* -1 = no lock */ | ||
1107 | do_posix_clock_monotonic_gettime(&p->start_time); | 1106 | do_posix_clock_monotonic_gettime(&p->start_time); |
1108 | p->real_start_time = p->start_time; | 1107 | p->real_start_time = p->start_time; |
1109 | monotonic_to_bootbased(&p->real_start_time); | 1108 | monotonic_to_bootbased(&p->real_start_time); |
@@ -1153,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1153 | #endif | 1152 | #endif |
1154 | 1153 | ||
1155 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1154 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
1156 | sched_fork(p, clone_flags); | 1155 | sched_fork(p); |
1157 | 1156 | ||
1158 | retval = perf_event_init_task(p); | 1157 | retval = perf_event_init_task(p); |
1159 | if (retval) | 1158 | if (retval) |
@@ -1464,7 +1463,7 @@ long do_fork(unsigned long clone_flags, | |||
1464 | */ | 1463 | */ |
1465 | p->flags &= ~PF_STARTING; | 1464 | p->flags &= ~PF_STARTING; |
1466 | 1465 | ||
1467 | wake_up_new_task(p, clone_flags); | 1466 | wake_up_new_task(p); |
1468 | 1467 | ||
1469 | tracehook_report_clone_complete(trace, regs, | 1468 | tracehook_report_clone_complete(trace, regs, |
1470 | clone_flags, nr, p); | 1469 | clone_flags, nr, p); |
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void) | |||
17 | { | 17 | { |
18 | if (!unlikely(current->flags & PF_NOFREEZE)) { | 18 | if (!unlikely(current->flags & PF_NOFREEZE)) { |
19 | current->flags |= PF_FROZEN; | 19 | current->flags |= PF_FROZEN; |
20 | wmb(); | 20 | smp_wmb(); |
21 | } | 21 | } |
22 | clear_freeze_flag(current); | 22 | clear_freeze_flag(current); |
23 | } | 23 | } |
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
93 | * the task as frozen and next clears its TIF_FREEZE. | 93 | * the task as frozen and next clears its TIF_FREEZE. |
94 | */ | 94 | */ |
95 | if (!freezing(p)) { | 95 | if (!freezing(p)) { |
96 | rmb(); | 96 | smp_rmb(); |
97 | if (frozen(p)) | 97 | if (frozen(p)) |
98 | return false; | 98 | return false; |
99 | 99 | ||
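
The freezer hunks swap bare wmb()/rmb() for smp_wmb()/smp_rmb(): the ordering only has to be visible to other CPUs, not to devices, so the lighter SMP barriers suffice (and compile away on UP). The pairing the code relies on, reduced to a sketch with invented flag and data names; the barrier macros are assumed to come in via the usual arch headers:

#include <linux/kernel.h>

static int example_payload;
static int example_published;

/* Writer: make the payload globally visible before the flag that
 * announces it. */
static void example_publish(int value)
{
	example_payload = value;
	smp_wmb();	/* order payload store before the flag store */
	example_published = 1;
}

/* Reader: check the flag first, then read the payload it guards. */
static int example_consume(void)
{
	if (!example_published)
		return -1;
	smp_rmb();	/* pairs with the writer's smp_wmb() */
	return example_payload;
}
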
diff --git a/kernel/futex.c b/kernel/futex.c
index dfb924ffe65b..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1886,7 +1886,7 @@ retry: | |||
1886 | restart->futex.val = val; | 1886 | restart->futex.val = val; |
1887 | restart->futex.time = abs_time->tv64; | 1887 | restart->futex.time = abs_time->tv64; |
1888 | restart->futex.bitset = bitset; | 1888 | restart->futex.bitset = bitset; |
1889 | restart->futex.flags = flags; | 1889 | restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; |
1890 | 1890 | ||
1891 | ret = -ERESTART_RESTARTBLOCK; | 1891 | ret = -ERESTART_RESTARTBLOCK; |
1892 | 1892 | ||
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9017478c5d4c..dbbbf7d43080 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -81,7 +81,11 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
81 | } | 81 | } |
82 | }; | 82 | }; |
83 | 83 | ||
84 | static int hrtimer_clock_to_base_table[MAX_CLOCKS]; | 84 | static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { |
85 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, | ||
86 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, | ||
87 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, | ||
88 | }; | ||
85 | 89 | ||
86 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) | 90 | static inline int hrtimer_clockid_to_base(clockid_t clock_id) |
87 | { | 91 | { |
@@ -1722,10 +1726,6 @@ static struct notifier_block __cpuinitdata hrtimers_nb = { | |||
1722 | 1726 | ||
1723 | void __init hrtimers_init(void) | 1727 | void __init hrtimers_init(void) |
1724 | { | 1728 | { |
1725 | hrtimer_clock_to_base_table[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME; | ||
1726 | hrtimer_clock_to_base_table[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC; | ||
1727 | hrtimer_clock_to_base_table[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME; | ||
1728 | |||
1729 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, | 1729 | hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, |
1730 | (void *)(long)smp_processor_id()); | 1730 | (void *)(long)smp_processor_id()); |
1731 | register_cpu_notifier(&hrtimers_nb); | 1731 | register_cpu_notifier(&hrtimers_nb); |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; | |||
33 | /* | 33 | /* |
34 | * Zero means infinite timeout - no checking done: | 34 | * Zero means infinite timeout - no checking done: |
35 | */ | 35 | */ |
36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | 36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; |
37 | 37 | ||
38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | 38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; |
39 | 39 | ||
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index c574f9a12c48..d1d051b38e0b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -48,6 +48,10 @@ config IRQ_PREFLOW_FASTEOI | |||
48 | config IRQ_EDGE_EOI_HANDLER | 48 | config IRQ_EDGE_EOI_HANDLER |
49 | bool | 49 | bool |
50 | 50 | ||
51 | # Generic configurable interrupt chip implementation | ||
52 | config GENERIC_IRQ_CHIP | ||
53 | bool | ||
54 | |||
51 | # Support forced irq threading | 55 | # Support forced irq threading |
52 | config IRQ_FORCED_THREADING | 56 | config IRQ_FORCED_THREADING |
53 | bool | 57 | bool |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@ | |||
1 | 1 | ||
2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | ||
3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
4 | obj-$(CONFIG_PROC_FS) += proc.o | 5 | obj-$(CONFIG_PROC_FS) += proc.o |
5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 6 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 1dafc8652bd8..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
310 | out_unlock: | 310 | out_unlock: |
311 | raw_spin_unlock(&desc->lock); | 311 | raw_spin_unlock(&desc->lock); |
312 | } | 312 | } |
313 | EXPORT_SYMBOL_GPL(handle_simple_irq); | ||
313 | 314 | ||
314 | /** | 315 | /** |
315 | * handle_level_irq - Level type irq handler | 316 | * handle_level_irq - Level type irq handler |
@@ -415,7 +416,7 @@ out: | |||
415 | * @desc: the interrupt description structure for this irq | 416 | * @desc: the interrupt description structure for this irq |
416 | * | 417 | * |
417 | * Interrupt occures on the falling and/or rising edge of a hardware | 418 | * Interrupt occures on the falling and/or rising edge of a hardware |
418 | * signal. The occurence is latched into the irq controller hardware | 419 | * signal. The occurrence is latched into the irq controller hardware |
419 | * and must be acked in order to be reenabled. After the ack another | 420 | * and must be acked in order to be reenabled. After the ack another |
420 | * interrupt can happen on the same source even before the first one | 421 | * interrupt can happen on the same source even before the first one |
421 | * is handled by the associated event handler. If this happens it | 422 | * is handled by the associated event handler. If this happens it |
@@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
573 | if (handle != handle_bad_irq && is_chained) { | 574 | if (handle != handle_bad_irq && is_chained) { |
574 | irq_settings_set_noprobe(desc); | 575 | irq_settings_set_noprobe(desc); |
575 | irq_settings_set_norequest(desc); | 576 | irq_settings_set_norequest(desc); |
577 | irq_settings_set_nothread(desc); | ||
576 | irq_startup(desc); | 578 | irq_startup(desc); |
577 | } | 579 | } |
578 | out: | 580 | out: |
@@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | |||
612 | 614 | ||
613 | irq_put_desc_unlock(desc, flags); | 615 | irq_put_desc_unlock(desc, flags); |
614 | } | 616 | } |
617 | EXPORT_SYMBOL_GPL(irq_modify_status); | ||
615 | 618 | ||
616 | /** | 619 | /** |
617 | * irq_cpu_online - Invoke all irq_cpu_online functions. | 620 | * irq_cpu_online - Invoke all irq_cpu_online functions. |
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
index 306cba37e9a5..97a8bfadc88a 100644
--- a/kernel/irq/debug.h
+++ b/kernel/irq/debug.h
@@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
27 | P(IRQ_PER_CPU); | 27 | P(IRQ_PER_CPU); |
28 | P(IRQ_NOPROBE); | 28 | P(IRQ_NOPROBE); |
29 | P(IRQ_NOREQUEST); | 29 | P(IRQ_NOREQUEST); |
30 | P(IRQ_NOTHREAD); | ||
30 | P(IRQ_NOAUTOEN); | 31 | P(IRQ_NOAUTOEN); |
31 | 32 | ||
32 | PS(IRQS_AUTODETECT); | 33 | PS(IRQS_AUTODETECT); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..31a9db711906
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,354 @@ | |||
1 | /* | ||
2 | * Library implementing the most common irq chip callback functions | ||
3 | * | ||
4 | * Copyright (C) 2011, Thomas Gleixner | ||
5 | */ | ||
6 | #include <linux/io.h> | ||
7 | #include <linux/irq.h> | ||
8 | #include <linux/slab.h> | ||
9 | #include <linux/interrupt.h> | ||
10 | #include <linux/kernel_stat.h> | ||
11 | #include <linux/syscore_ops.h> | ||
12 | |||
13 | #include "internals.h" | ||
14 | |||
15 | static LIST_HEAD(gc_list); | ||
16 | static DEFINE_RAW_SPINLOCK(gc_lock); | ||
17 | |||
18 | static inline struct irq_chip_regs *cur_regs(struct irq_data *d) | ||
19 | { | ||
20 | return &container_of(d->chip, struct irq_chip_type, chip)->regs; | ||
21 | } | ||
22 | |||
23 | /** | ||
24 | * irq_gc_noop - NOOP function | ||
25 | * @d: irq_data | ||
26 | */ | ||
27 | void irq_gc_noop(struct irq_data *d) | ||
28 | { | ||
29 | } | ||
30 | |||
31 | /** | ||
32 | * irq_gc_mask_disable_reg - Mask chip via disable register | ||
33 | * @d: irq_data | ||
34 | * | ||
35 | * Chip has separate enable/disable registers instead of a single mask | ||
36 | * register. | ||
37 | */ | ||
38 | void irq_gc_mask_disable_reg(struct irq_data *d) | ||
39 | { | ||
40 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
41 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
42 | |||
43 | irq_gc_lock(gc); | ||
44 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); | ||
45 | gc->mask_cache &= ~mask; | ||
46 | irq_gc_unlock(gc); | ||
47 | } | ||
48 | |||
49 | /** | ||
50 | * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register | ||
51 | * @d: irq_data | ||
52 | * | ||
53 | * Chip has a single mask register. Values of this register are cached | ||
54 | * and protected by gc->lock | ||
55 | */ | ||
56 | void irq_gc_mask_set_bit(struct irq_data *d) | ||
57 | { | ||
58 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
59 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
60 | |||
61 | irq_gc_lock(gc); | ||
62 | gc->mask_cache |= mask; | ||
63 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
64 | irq_gc_unlock(gc); | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register | ||
69 | * @d: irq_data | ||
70 | * | ||
71 | * Chip has a single mask register. Values of this register are cached | ||
72 | * and protected by gc->lock | ||
73 | */ | ||
74 | void irq_gc_mask_clr_bit(struct irq_data *d) | ||
75 | { | ||
76 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
77 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
78 | |||
79 | irq_gc_lock(gc); | ||
80 | gc->mask_cache &= ~mask; | ||
81 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
82 | irq_gc_unlock(gc); | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * irq_gc_unmask_enable_reg - Unmask chip via enable register | ||
87 | * @d: irq_data | ||
88 | * | ||
89 | * Chip has separate enable/disable registers instead of a single mask | ||
90 | * register. | ||
91 | */ | ||
92 | void irq_gc_unmask_enable_reg(struct irq_data *d) | ||
93 | { | ||
94 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
95 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
96 | |||
97 | irq_gc_lock(gc); | ||
98 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); | ||
99 | gc->mask_cache |= mask; | ||
100 | irq_gc_unlock(gc); | ||
101 | } | ||
102 | |||
103 | /** | ||
104 | * irq_gc_ack - Ack pending interrupt | ||
105 | * @d: irq_data | ||
106 | */ | ||
107 | void irq_gc_ack(struct irq_data *d) | ||
108 | { | ||
109 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
110 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
111 | |||
112 | irq_gc_lock(gc); | ||
113 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
114 | irq_gc_unlock(gc); | ||
115 | } | ||
116 | |||
117 | /** | ||
118 | * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt | ||
119 | * @d: irq_data | ||
120 | */ | ||
121 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | ||
122 | { | ||
123 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
124 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
125 | |||
126 | irq_gc_lock(gc); | ||
127 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); | ||
128 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
129 | irq_gc_unlock(gc); | ||
130 | } | ||
131 | |||
132 | /** | ||
133 | * irq_gc_eoi - EOI interrupt | ||
134 | * @d: irq_data | ||
135 | */ | ||
136 | void irq_gc_eoi(struct irq_data *d) | ||
137 | { | ||
138 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
139 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
140 | |||
141 | irq_gc_lock(gc); | ||
142 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); | ||
143 | irq_gc_unlock(gc); | ||
144 | } | ||
145 | |||
146 | /** | ||
147 | * irq_gc_set_wake - Set/clr wake bit for an interrupt | ||
148 | * @d: irq_data | ||
149 | * | ||
150 | * For chips where the wake from suspend functionality is not | ||
151 | * configured in a separate register and the wakeup active state is | ||
152 | * just stored in a bitmask. | ||
153 | */ | ||
154 | int irq_gc_set_wake(struct irq_data *d, unsigned int on) | ||
155 | { | ||
156 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
157 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
158 | |||
159 | if (!(mask & gc->wake_enabled)) | ||
160 | return -EINVAL; | ||
161 | |||
162 | irq_gc_lock(gc); | ||
163 | if (on) | ||
164 | gc->wake_active |= mask; | ||
165 | else | ||
166 | gc->wake_active &= ~mask; | ||
167 | irq_gc_unlock(gc); | ||
168 | return 0; | ||
169 | } | ||
170 | |||
171 | /** | ||
172 | * irq_alloc_generic_chip - Allocate a generic chip and initialize it | ||
173 | * @name: Name of the irq chip | ||
174 | * @num_ct: Number of irq_chip_type instances associated with this | ||
175 | * @irq_base: Interrupt base nr for this chip | ||
176 | * @reg_base: Register base address (virtual) | ||
177 | * @handler: Default flow handler associated with this chip | ||
178 | * | ||
179 | * Returns an initialized irq_chip_generic structure. The chip defaults | ||
180 | * to the primary (index 0) irq_chip_type and @handler | ||
181 | */ | ||
182 | struct irq_chip_generic * | ||
183 | irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, | ||
184 | void __iomem *reg_base, irq_flow_handler_t handler) | ||
185 | { | ||
186 | struct irq_chip_generic *gc; | ||
187 | unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
188 | |||
189 | gc = kzalloc(sz, GFP_KERNEL); | ||
190 | if (gc) { | ||
191 | raw_spin_lock_init(&gc->lock); | ||
192 | gc->num_ct = num_ct; | ||
193 | gc->irq_base = irq_base; | ||
194 | gc->reg_base = reg_base; | ||
195 | gc->chip_types->chip.name = name; | ||
196 | gc->chip_types->handler = handler; | ||
197 | } | ||
198 | return gc; | ||
199 | } | ||
200 | |||
201 | /* | ||
202 | * Separate lockdep class for interrupt chip which can nest irq_desc | ||
203 | * lock. | ||
204 | */ | ||
205 | static struct lock_class_key irq_nested_lock_class; | ||
206 | |||
207 | /** | ||
208 | * irq_setup_generic_chip - Setup a range of interrupts with a generic chip | ||
209 | * @gc: Generic irq chip holding all data | ||
210 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
211 | * @flags: Flags for initialization | ||
212 | * @clr: IRQ_* bits to clear | ||
213 | * @set: IRQ_* bits to set | ||
214 | * | ||
215 | * Set up max. 32 interrupts starting from gc->irq_base. Note, this | ||
216 | * initializes all interrupts to the primary irq_chip_type and its | ||
217 | * associated handler. | ||
218 | */ | ||
219 | void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
220 | enum irq_gc_flags flags, unsigned int clr, | ||
221 | unsigned int set) | ||
222 | { | ||
223 | struct irq_chip_type *ct = gc->chip_types; | ||
224 | unsigned int i; | ||
225 | |||
226 | raw_spin_lock(&gc_lock); | ||
227 | list_add_tail(&gc->list, &gc_list); | ||
228 | raw_spin_unlock(&gc_lock); | ||
229 | |||
230 | /* Init mask cache ? */ | ||
231 | if (flags & IRQ_GC_INIT_MASK_CACHE) | ||
232 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); | ||
233 | |||
234 | for (i = gc->irq_base; msk; msk >>= 1, i++) { | ||
235 | if (!msk & 0x01) | ||
236 | continue; | ||
237 | |||
238 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | ||
239 | irq_set_lockdep_class(i, &irq_nested_lock_class); | ||
240 | |||
241 | irq_set_chip_and_handler(i, &ct->chip, ct->handler); | ||
242 | irq_set_chip_data(i, gc); | ||
243 | irq_modify_status(i, clr, set); | ||
244 | } | ||
245 | gc->irq_cnt = i - gc->irq_base; | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * irq_setup_alt_chip - Switch to alternative chip | ||
250 | * @d: irq_data for this interrupt | ||
251 | * @type Flow type to be initialized | ||
252 | * | ||
253 | * Only to be called from chip->irq_set_type() callbacks. | ||
254 | */ | ||
255 | int irq_setup_alt_chip(struct irq_data *d, unsigned int type) | ||
256 | { | ||
257 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
258 | struct irq_chip_type *ct = gc->chip_types; | ||
259 | unsigned int i; | ||
260 | |||
261 | for (i = 0; i < gc->num_ct; i++, ct++) { | ||
262 | if (ct->type & type) { | ||
263 | d->chip = &ct->chip; | ||
264 | irq_data_to_desc(d)->handle_irq = ct->handler; | ||
265 | return 0; | ||
266 | } | ||
267 | } | ||
268 | return -EINVAL; | ||
269 | } | ||
270 | |||
271 | /** | ||
272 | * irq_remove_generic_chip - Remove a chip | ||
273 | * @gc: Generic irq chip holding all data | ||
274 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
275 | * @clr: IRQ_* bits to clear | ||
276 | * @set: IRQ_* bits to set | ||
277 | * | ||
278 | * Remove up to 32 interrupts starting from gc->irq_base. | ||
279 | */ | ||
280 | void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
281 | unsigned int clr, unsigned int set) | ||
282 | { | ||
283 | unsigned int i = gc->irq_base; | ||
284 | |||
285 | raw_spin_lock(&gc_lock); | ||
286 | list_del(&gc->list); | ||
287 | raw_spin_unlock(&gc_lock); | ||
288 | |||
289 | for (; msk; msk >>= 1, i++) { | ||
290 | if (!msk & 0x01) | ||
291 | continue; | ||
292 | |||
293 | /* Remove handler first. That will mask the irq line */ | ||
294 | irq_set_handler(i, NULL); | ||
295 | irq_set_chip(i, &no_irq_chip); | ||
296 | irq_set_chip_data(i, NULL); | ||
297 | irq_modify_status(i, clr, set); | ||
298 | } | ||
299 | } | ||
300 | |||
301 | #ifdef CONFIG_PM | ||
302 | static int irq_gc_suspend(void) | ||
303 | { | ||
304 | struct irq_chip_generic *gc; | ||
305 | |||
306 | list_for_each_entry(gc, &gc_list, list) { | ||
307 | struct irq_chip_type *ct = gc->chip_types; | ||
308 | |||
309 | if (ct->chip.irq_suspend) | ||
310 | ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); | ||
311 | } | ||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | static void irq_gc_resume(void) | ||
316 | { | ||
317 | struct irq_chip_generic *gc; | ||
318 | |||
319 | list_for_each_entry(gc, &gc_list, list) { | ||
320 | struct irq_chip_type *ct = gc->chip_types; | ||
321 | |||
322 | if (ct->chip.irq_resume) | ||
323 | ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); | ||
324 | } | ||
325 | } | ||
326 | #else | ||
327 | #define irq_gc_suspend NULL | ||
328 | #define irq_gc_resume NULL | ||
329 | #endif | ||
330 | |||
331 | static void irq_gc_shutdown(void) | ||
332 | { | ||
333 | struct irq_chip_generic *gc; | ||
334 | |||
335 | list_for_each_entry(gc, &gc_list, list) { | ||
336 | struct irq_chip_type *ct = gc->chip_types; | ||
337 | |||
338 | if (ct->chip.irq_pm_shutdown) | ||
339 | ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); | ||
340 | } | ||
341 | } | ||
342 | |||
343 | static struct syscore_ops irq_gc_syscore_ops = { | ||
344 | .suspend = irq_gc_suspend, | ||
345 | .resume = irq_gc_resume, | ||
346 | .shutdown = irq_gc_shutdown, | ||
347 | }; | ||
348 | |||
349 | static int __init irq_gc_init_ops(void) | ||
350 | { | ||
351 | register_syscore_ops(&irq_gc_syscore_ops); | ||
352 | return 0; | ||
353 | } | ||
354 | device_initcall(irq_gc_init_ops); | ||
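
The generic chip library above is driven from an irqchip driver's init code: allocate a chip with irq_alloc_generic_chip(), point chip_types[0] at the register layout and the irq_gc_* helpers, then hand a block of up to 32 interrupts to irq_setup_generic_chip(). A hedged sketch for a made-up controller with separate enable/disable registers; the register offsets, irq range, and the assumption that the declarations are exposed through linux/irq.h are all illustrative:

#include <linux/init.h>
#include <linux/io.h>
#include <linux/irq.h>

#define EXAMPLE_IRQ_BASE	64	/* made up */
#define EXAMPLE_NR_IRQS		16	/* made up */

static void __init example_intc_init(void __iomem *reg_base)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("EXAMPLE-INTC", 1, EXAMPLE_IRQ_BASE,
				    reg_base, handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;
	/* Invented register layout for this fictional controller. */
	ct->regs.enable  = 0x00;
	ct->regs.disable = 0x04;
	ct->regs.ack     = 0x08;
	ct->chip.irq_mask   = irq_gc_mask_disable_reg;
	ct->chip.irq_unmask = irq_gc_unmask_enable_reg;
	ct->chip.irq_ack    = irq_gc_ack;

	/* No mask register to read back, so no init flags; clear
	 * IRQ_NOREQUEST so drivers may request these lines. */
	irq_setup_generic_chip(gc, (1U << EXAMPLE_NR_IRQS) - 1, 0,
			       IRQ_NOREQUEST, 0);
}
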
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2c039c9b9383..886e80347b32 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | static struct lock_class_key irq_desc_lock_class; | 23 | static struct lock_class_key irq_desc_lock_class; |
24 | 24 | ||
25 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | 25 | #if defined(CONFIG_SMP) |
26 | static void __init init_irq_default_affinity(void) | 26 | static void __init init_irq_default_affinity(void) |
27 | { | 27 | { |
28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | 28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); |
@@ -290,6 +290,22 @@ static int irq_expand_nr_irqs(unsigned int nr) | |||
290 | 290 | ||
291 | #endif /* !CONFIG_SPARSE_IRQ */ | 291 | #endif /* !CONFIG_SPARSE_IRQ */ |
292 | 292 | ||
293 | /** | ||
294 | * generic_handle_irq - Invoke the handler for a particular irq | ||
295 | * @irq: The irq number to handle | ||
296 | * | ||
297 | */ | ||
298 | int generic_handle_irq(unsigned int irq) | ||
299 | { | ||
300 | struct irq_desc *desc = irq_to_desc(irq); | ||
301 | |||
302 | if (!desc) | ||
303 | return -EINVAL; | ||
304 | generic_handle_irq_desc(irq, desc); | ||
305 | return 0; | ||
306 | } | ||
307 | EXPORT_SYMBOL_GPL(generic_handle_irq); | ||
308 | |||
293 | /* Dynamic interrupt handling */ | 309 | /* Dynamic interrupt handling */ |
294 | 310 | ||
295 | /** | 311 | /** |
@@ -311,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) | |||
311 | bitmap_clear(allocated_irqs, from, cnt); | 327 | bitmap_clear(allocated_irqs, from, cnt); |
312 | mutex_unlock(&sparse_irq_lock); | 328 | mutex_unlock(&sparse_irq_lock); |
313 | } | 329 | } |
330 | EXPORT_SYMBOL_GPL(irq_free_descs); | ||
314 | 331 | ||
315 | /** | 332 | /** |
316 | * irq_alloc_descs - allocate and initialize a range of irq descriptors | 333 | * irq_alloc_descs - allocate and initialize a range of irq descriptors |
@@ -351,6 +368,7 @@ err: | |||
351 | mutex_unlock(&sparse_irq_lock); | 368 | mutex_unlock(&sparse_irq_lock); |
352 | return ret; | 369 | return ret; |
353 | } | 370 | } |
371 | EXPORT_SYMBOL_GPL(irq_alloc_descs); | ||
354 | 372 | ||
355 | /** | 373 | /** |
356 | * irq_reserve_irqs - mark irqs allocated | 374 | * irq_reserve_irqs - mark irqs allocated |
@@ -430,7 +448,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | |||
430 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | 448 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; |
431 | } | 449 | } |
432 | 450 | ||
433 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
434 | unsigned int kstat_irqs(unsigned int irq) | 451 | unsigned int kstat_irqs(unsigned int irq) |
435 | { | 452 | { |
436 | struct irq_desc *desc = irq_to_desc(irq); | 453 | struct irq_desc *desc = irq_to_desc(irq); |
@@ -443,4 +460,3 @@ unsigned int kstat_irqs(unsigned int irq) | |||
443 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); | 460 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
444 | return sum; | 461 | return sum; |
445 | } | 462 | } |
446 | #endif /* CONFIG_GENERIC_HARDIRQS */ | ||
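
generic_handle_irq() above gives demultiplexing handlers (now including modular ones) a way to hand a decoded interrupt back to the core, and irq_alloc_descs()/irq_free_descs() are exported for the same audience. A hedged sketch of a typical chained demux handler; the controller base address, pending register, and child irq numbering are invented, and a real handler would also ack/mask the parent line via its chip:

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>

#define EXAMPLE_CHILD_IRQ_BASE	96	/* made up */
#define EXAMPLE_PENDING_REG	0x10	/* made up */

static void __iomem *example_intc_base;	/* mapped during init, elsewhere */

/* Sketch of a demux flow handler: read a fictional pending register
 * and let the core run the handler of each decoded child irq. */
static void example_demux_handler(unsigned int irq, struct irq_desc *desc)
{
	u32 pending = readl(example_intc_base + EXAMPLE_PENDING_REG);

	while (pending) {
		int bit = __ffs(pending);

		generic_handle_irq(EXAMPLE_CHILD_IRQ_BASE + bit);
		pending &= ~BIT(bit);
	}
}
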
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 12a80fdae11c..f7ce0021e1c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -900,7 +900,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
900 | */ | 900 | */ |
901 | new->handler = irq_nested_primary_handler; | 901 | new->handler = irq_nested_primary_handler; |
902 | } else { | 902 | } else { |
903 | irq_setup_forced_threading(new); | 903 | if (irq_settings_can_thread(desc)) |
904 | irq_setup_forced_threading(new); | ||
904 | } | 905 | } |
905 | 906 | ||
906 | /* | 907 | /* |
@@ -1051,6 +1052,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
1051 | register_irq_proc(irq, desc); | 1052 | register_irq_proc(irq, desc); |
1052 | new->dir = NULL; | 1053 | new->dir = NULL; |
1053 | register_handler_proc(irq, new); | 1054 | register_handler_proc(irq, new); |
1055 | free_cpumask_var(mask); | ||
1054 | 1056 | ||
1055 | return 0; | 1057 | return 0; |
1056 | 1058 | ||
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bc6194698dfd..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -35,7 +35,7 @@ void irq_move_masked_irq(struct irq_data *idata) | |||
35 | * do the disable, re-program, enable sequence. | 35 | * do the disable, re-program, enable sequence. |
36 | * This is *not* particularly important for level triggered | 36 | * This is *not* particularly important for level triggered |
37 | * but in a edge trigger case, we might be setting rte | 37 | * but in a edge trigger case, we might be setting rte |
38 | * when an active trigger is comming in. This could | 38 | * when an active trigger is coming in. This could |
39 | * cause some ioapics to mal-function. | 39 | * cause some ioapics to mal-function. |
40 | * Being paranoid i guess! | 40 | * Being paranoid i guess! |
41 | * | 41 | * |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index dd201bd35103..834899f2500f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -419,7 +419,7 @@ int show_interrupts(struct seq_file *p, void *v) | |||
419 | } else { | 419 | } else { |
420 | seq_printf(p, " %8s", "None"); | 420 | seq_printf(p, " %8s", "None"); |
421 | } | 421 | } |
422 | #ifdef CONFIG_GENIRC_IRQ_SHOW_LEVEL | 422 | #ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL |
423 | seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); | 423 | seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); |
424 | #endif | 424 | #endif |
425 | if (desc->name) | 425 | if (desc->name) |
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 0d91730b6330..f1667833d444 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -8,6 +8,7 @@ enum { | |||
8 | _IRQ_LEVEL = IRQ_LEVEL, | 8 | _IRQ_LEVEL = IRQ_LEVEL, |
9 | _IRQ_NOPROBE = IRQ_NOPROBE, | 9 | _IRQ_NOPROBE = IRQ_NOPROBE, |
10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, | 10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, |
11 | _IRQ_NOTHREAD = IRQ_NOTHREAD, | ||
11 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, | 12 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, |
12 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | 13 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, |
13 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | 14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, |
@@ -20,6 +21,7 @@ enum { | |||
20 | #define IRQ_LEVEL GOT_YOU_MORON | 21 | #define IRQ_LEVEL GOT_YOU_MORON |
21 | #define IRQ_NOPROBE GOT_YOU_MORON | 22 | #define IRQ_NOPROBE GOT_YOU_MORON |
22 | #define IRQ_NOREQUEST GOT_YOU_MORON | 23 | #define IRQ_NOREQUEST GOT_YOU_MORON |
24 | #define IRQ_NOTHREAD GOT_YOU_MORON | ||
23 | #define IRQ_NOAUTOEN GOT_YOU_MORON | 25 | #define IRQ_NOAUTOEN GOT_YOU_MORON |
24 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | 26 | #define IRQ_NESTED_THREAD GOT_YOU_MORON |
25 | #undef IRQF_MODIFY_MASK | 27 | #undef IRQF_MODIFY_MASK |
@@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc) | |||
94 | desc->status_use_accessors |= _IRQ_NOREQUEST; | 96 | desc->status_use_accessors |= _IRQ_NOREQUEST; |
95 | } | 97 | } |
96 | 98 | ||
99 | static inline bool irq_settings_can_thread(struct irq_desc *desc) | ||
100 | { | ||
101 | return !(desc->status_use_accessors & _IRQ_NOTHREAD); | ||
102 | } | ||
103 | |||
104 | static inline void irq_settings_clr_nothread(struct irq_desc *desc) | ||
105 | { | ||
106 | desc->status_use_accessors &= ~_IRQ_NOTHREAD; | ||
107 | } | ||
108 | |||
109 | static inline void irq_settings_set_nothread(struct irq_desc *desc) | ||
110 | { | ||
111 | desc->status_use_accessors |= _IRQ_NOTHREAD; | ||
112 | } | ||
113 | |||
97 | static inline bool irq_settings_can_probe(struct irq_desc *desc) | 114 | static inline bool irq_settings_can_probe(struct irq_desc *desc) |
98 | { | 115 | { |
99 | return !(desc->status_use_accessors & _IRQ_NOPROBE); | 116 | return !(desc->status_use_accessors & _IRQ_NOPROBE); |
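
The new IRQ_NOTHREAD setting above lets an interrupt opt out of forced irq threading: __irq_set_handler() now applies it to chained interrupts, and __setup_irq() consults irq_settings_can_thread() before threading a handler. A sketch of marking a single line non-threadable from driver code via irq_modify_status(), which this patch also exports; the irq number is made up and IRQ_NOTHREAD is assumed to be exposed through linux/irq.h:

#include <linux/irq.h>

#define EXAMPLE_TIMER_IRQ	29	/* made up */

/* Sketch: keep this interrupt as a real hardirq even when booting
 * with forced interrupt threading (threadirqs). */
static void example_mark_timer_unthreadable(void)
{
	irq_modify_status(EXAMPLE_TIMER_IRQ, 0, IRQ_NOTHREAD);
}
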
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd938330..74d1c099fbd1 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
@@ -2,43 +2,23 @@ | |||
2 | * jump label support | 2 | * jump label support |
3 | * | 3 | * |
4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | 4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> |
5 | * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> | ||
5 | * | 6 | * |
6 | */ | 7 | */ |
7 | #include <linux/jump_label.h> | ||
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/list.h> | 11 | #include <linux/list.h> |
12 | #include <linux/jhash.h> | ||
13 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
14 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
15 | #include <linux/err.h> | 14 | #include <linux/err.h> |
15 | #include <linux/jump_label.h> | ||
16 | 16 | ||
17 | #ifdef HAVE_JUMP_LABEL | 17 | #ifdef HAVE_JUMP_LABEL |
18 | 18 | ||
19 | #define JUMP_LABEL_HASH_BITS 6 | ||
20 | #define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) | ||
21 | static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; | ||
22 | |||
23 | /* mutex to protect coming/going of the the jump_label table */ | 19 | /* mutex to protect coming/going of the the jump_label table */ |
24 | static DEFINE_MUTEX(jump_label_mutex); | 20 | static DEFINE_MUTEX(jump_label_mutex); |
25 | 21 | ||
26 | struct jump_label_entry { | ||
27 | struct hlist_node hlist; | ||
28 | struct jump_entry *table; | ||
29 | int nr_entries; | ||
30 | /* hang modules off here */ | ||
31 | struct hlist_head modules; | ||
32 | unsigned long key; | ||
33 | }; | ||
34 | |||
35 | struct jump_label_module_entry { | ||
36 | struct hlist_node hlist; | ||
37 | struct jump_entry *table; | ||
38 | int nr_entries; | ||
39 | struct module *mod; | ||
40 | }; | ||
41 | |||
42 | void jump_label_lock(void) | 22 | void jump_label_lock(void) |
43 | { | 23 | { |
44 | mutex_lock(&jump_label_mutex); | 24 | mutex_lock(&jump_label_mutex); |
@@ -49,6 +29,11 @@ void jump_label_unlock(void) | |||
49 | mutex_unlock(&jump_label_mutex); | 29 | mutex_unlock(&jump_label_mutex); |
50 | } | 30 | } |
51 | 31 | ||
32 | bool jump_label_enabled(struct jump_label_key *key) | ||
33 | { | ||
34 | return !!atomic_read(&key->enabled); | ||
35 | } | ||
36 | |||
52 | static int jump_label_cmp(const void *a, const void *b) | 37 | static int jump_label_cmp(const void *a, const void *b) |
53 | { | 38 | { |
54 | const struct jump_entry *jea = a; | 39 | const struct jump_entry *jea = a; |
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) | |||
64 | } | 49 | } |
65 | 50 | ||
66 | static void | 51 | static void |
67 | sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | 52 | jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) |
68 | { | 53 | { |
69 | unsigned long size; | 54 | unsigned long size; |
70 | 55 | ||
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | |||
73 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | 58 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); |
74 | } | 59 | } |
75 | 60 | ||
76 | static struct jump_label_entry *get_jump_label_entry(jump_label_t key) | 61 | static void jump_label_update(struct jump_label_key *key, int enable); |
77 | { | ||
78 | struct hlist_head *head; | ||
79 | struct hlist_node *node; | ||
80 | struct jump_label_entry *e; | ||
81 | u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
82 | |||
83 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
84 | hlist_for_each_entry(e, node, head, hlist) { | ||
85 | if (key == e->key) | ||
86 | return e; | ||
87 | } | ||
88 | return NULL; | ||
89 | } | ||
90 | 62 | ||
91 | static struct jump_label_entry * | 63 | void jump_label_inc(struct jump_label_key *key) |
92 | add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) | ||
93 | { | 64 | { |
94 | struct hlist_head *head; | 65 | if (atomic_inc_not_zero(&key->enabled)) |
95 | struct jump_label_entry *e; | 66 | return; |
96 | u32 hash; | ||
97 | |||
98 | e = get_jump_label_entry(key); | ||
99 | if (e) | ||
100 | return ERR_PTR(-EEXIST); | ||
101 | |||
102 | e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); | ||
103 | if (!e) | ||
104 | return ERR_PTR(-ENOMEM); | ||
105 | |||
106 | hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
107 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
108 | e->key = key; | ||
109 | e->table = table; | ||
110 | e->nr_entries = nr_entries; | ||
111 | INIT_HLIST_HEAD(&(e->modules)); | ||
112 | hlist_add_head(&e->hlist, head); | ||
113 | return e; | ||
114 | } | ||
115 | 67 | ||
116 | static int | 68 | jump_label_lock(); |
117 | build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) | 69 | if (atomic_add_return(1, &key->enabled) == 1) |
118 | { | 70 | jump_label_update(key, JUMP_LABEL_ENABLE); |
119 | struct jump_entry *iter, *iter_begin; | 71 | jump_label_unlock(); |
120 | struct jump_label_entry *entry; | ||
121 | int count; | ||
122 | |||
123 | sort_jump_label_entries(start, stop); | ||
124 | iter = start; | ||
125 | while (iter < stop) { | ||
126 | entry = get_jump_label_entry(iter->key); | ||
127 | if (!entry) { | ||
128 | iter_begin = iter; | ||
129 | count = 0; | ||
130 | while ((iter < stop) && | ||
131 | (iter->key == iter_begin->key)) { | ||
132 | iter++; | ||
133 | count++; | ||
134 | } | ||
135 | entry = add_jump_label_entry(iter_begin->key, | ||
136 | count, iter_begin); | ||
137 | if (IS_ERR(entry)) | ||
138 | return PTR_ERR(entry); | ||
139 | } else { | ||
140 | WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); | ||
141 | return -1; | ||
142 | } | ||
143 | } | ||
144 | return 0; | ||
145 | } | 72 | } |
146 | 73 | ||
147 | /*** | 74 | void jump_label_dec(struct jump_label_key *key) |
148 | * jump_label_update - update jump label text | ||
149 | * @key - key value associated with a a jump label | ||
150 | * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE | ||
151 | * | ||
152 | * Will enable/disable the jump for jump label @key, depending on the | ||
153 | * value of @type. | ||
154 | * | ||
155 | */ | ||
156 | |||
157 | void jump_label_update(unsigned long key, enum jump_label_type type) | ||
158 | { | 75 | { |
159 | struct jump_entry *iter; | 76 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) |
160 | struct jump_label_entry *entry; | 77 | return; |
161 | struct hlist_node *module_node; | ||
162 | struct jump_label_module_entry *e_module; | ||
163 | int count; | ||
164 | 78 | ||
165 | jump_label_lock(); | 79 | jump_label_update(key, JUMP_LABEL_DISABLE); |
166 | entry = get_jump_label_entry((jump_label_t)key); | ||
167 | if (entry) { | ||
168 | count = entry->nr_entries; | ||
169 | iter = entry->table; | ||
170 | while (count--) { | ||
171 | if (kernel_text_address(iter->code)) | ||
172 | arch_jump_label_transform(iter, type); | ||
173 | iter++; | ||
174 | } | ||
175 | /* eanble/disable jump labels in modules */ | ||
176 | hlist_for_each_entry(e_module, module_node, &(entry->modules), | ||
177 | hlist) { | ||
178 | count = e_module->nr_entries; | ||
179 | iter = e_module->table; | ||
180 | while (count--) { | ||
181 | if (iter->key && | ||
182 | kernel_text_address(iter->code)) | ||
183 | arch_jump_label_transform(iter, type); | ||
184 | iter++; | ||
185 | } | ||
186 | } | ||
187 | } | ||
188 | jump_label_unlock(); | 80 | jump_label_unlock(); |
189 | } | 81 | } |
190 | 82 | ||
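jump_label_inc()/jump_label_dec() replace the old hash-table bookkeeping with a plain reference count on struct jump_label_key: atomic_inc_not_zero() is the lock-free fast path for keys that are already enabled, the 0 -> 1 transition patches the branch in under jump_label_mutex, and atomic_dec_and_mutex_lock() patches it back out on the last put. A sketch of how a subsystem would pair the two around a static-branch-guarded fast path, assuming the static_branch() test macro that accompanies this series:

#include <linux/jump_label.h>

static struct jump_label_key my_feature_key;

static void do_expensive_bookkeeping(void)
{
        /* hypothetical slow path, only reached while the key is enabled */
}

static inline void my_hot_path(void)
{
        if (static_branch(&my_feature_key))     /* compiles to a NOP when disabled */
                do_expensive_bookkeeping();
}

static void my_feature_enable(void)
{
        jump_label_inc(&my_feature_key);        /* 0 -> 1 patches the branch in */
}

static void my_feature_disable(void)
{
        jump_label_dec(&my_feature_key);        /* 1 -> 0 patches it back out */
}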
@@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) | |||
197 | return 0; | 89 | return 0; |
198 | } | 90 | } |
199 | 91 | ||
200 | #ifdef CONFIG_MODULES | 92 | static int __jump_label_text_reserved(struct jump_entry *iter_start, |
201 | 93 | struct jump_entry *iter_stop, void *start, void *end) | |
202 | static int module_conflict(void *start, void *end) | ||
203 | { | 94 | { |
204 | struct hlist_head *head; | ||
205 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
206 | struct jump_label_entry *e; | ||
207 | struct jump_label_module_entry *e_module; | ||
208 | struct jump_entry *iter; | 95 | struct jump_entry *iter; |
209 | int i, count; | ||
210 | int conflict = 0; | ||
211 | |||
212 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
213 | head = &jump_label_table[i]; | ||
214 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
215 | hlist_for_each_entry_safe(e_module, module_node, | ||
216 | module_node_next, | ||
217 | &(e->modules), hlist) { | ||
218 | count = e_module->nr_entries; | ||
219 | iter = e_module->table; | ||
220 | while (count--) { | ||
221 | if (addr_conflict(iter, start, end)) { | ||
222 | conflict = 1; | ||
223 | goto out; | ||
224 | } | ||
225 | iter++; | ||
226 | } | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | out: | ||
231 | return conflict; | ||
232 | } | ||
233 | |||
234 | #endif | ||
235 | |||
236 | /*** | ||
237 | * jump_label_text_reserved - check if addr range is reserved | ||
238 | * @start: start text addr | ||
239 | * @end: end text addr | ||
240 | * | ||
241 | * checks if the text addr located between @start and @end | ||
242 | * overlaps with any of the jump label patch addresses. Code | ||
243 | * that wants to modify kernel text should first verify that | ||
244 | * it does not overlap with any of the jump label addresses. | ||
245 | * Caller must hold jump_label_mutex. | ||
246 | * | ||
247 | * returns 1 if there is an overlap, 0 otherwise | ||
248 | */ | ||
249 | int jump_label_text_reserved(void *start, void *end) | ||
250 | { | ||
251 | struct jump_entry *iter; | ||
252 | struct jump_entry *iter_start = __start___jump_table; | ||
253 | struct jump_entry *iter_stop = __start___jump_table; | ||
254 | int conflict = 0; | ||
255 | 96 | ||
256 | iter = iter_start; | 97 | iter = iter_start; |
257 | while (iter < iter_stop) { | 98 | while (iter < iter_stop) { |
258 | if (addr_conflict(iter, start, end)) { | 99 | if (addr_conflict(iter, start, end)) |
259 | conflict = 1; | 100 | return 1; |
260 | goto out; | ||
261 | } | ||
262 | iter++; | 101 | iter++; |
263 | } | 102 | } |
264 | 103 | ||
265 | /* now check modules */ | 104 | return 0; |
266 | #ifdef CONFIG_MODULES | 105 | } |
267 | conflict = module_conflict(start, end); | 106 | |
268 | #endif | 107 | static void __jump_label_update(struct jump_label_key *key, |
269 | out: | 108 | struct jump_entry *entry, int enable) |
270 | return conflict; | 109 | { |
110 | for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { | ||
111 | /* | ||
112 | * entry->code set to 0 invalidates module init text sections | ||
113 | * kernel_text_address() verifies we are not in core kernel | ||
114 | * init code, see jump_label_invalidate_module_init(). | ||
115 | */ | ||
116 | if (entry->code && kernel_text_address(entry->code)) | ||
117 | arch_jump_label_transform(entry, enable); | ||
118 | } | ||
271 | } | 119 | } |
272 | 120 | ||
273 | /* | 121 | /* |
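The rewrite leans on the jump table being sorted: key->entries points at the first struct jump_entry for a key, and __jump_label_update() simply walks forward while entry->key still matches, so no hash table is needed. A standalone user-space sketch of that sort-once-then-walk-runs pattern (illustrative, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct entry { unsigned long key; unsigned long code; };

static int cmp_entry(const void *a, const void *b)
{
        const struct entry *ea = a, *eb = b;

        if (ea->key < eb->key) return -1;
        if (ea->key > eb->key) return 1;
        return 0;
}

int main(void)
{
        struct entry tab[] = {
                { 0x20, 0x1000 }, { 0x10, 0x1004 },
                { 0x20, 0x1008 }, { 0x10, 0x100c },
        };
        size_t n = sizeof(tab) / sizeof(tab[0]);

        /* sort by key, as jump_label_sort_entries() does */
        qsort(tab, n, sizeof(tab[0]), cmp_entry);

        /* walk runs of equal keys, as __jump_label_update() does */
        for (size_t i = 0; i < n; ) {
                unsigned long key = tab[i].key;

                printf("key %#lx:", key);
                for (; i < n && tab[i].key == key; i++)
                        printf(" %#lx", tab[i].code);
                printf("\n");
        }
        return 0;
}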
@@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) | |||
277 | { | 125 | { |
278 | } | 126 | } |
279 | 127 | ||
280 | static __init int init_jump_label(void) | 128 | static __init int jump_label_init(void) |
281 | { | 129 | { |
282 | int ret; | ||
283 | struct jump_entry *iter_start = __start___jump_table; | 130 | struct jump_entry *iter_start = __start___jump_table; |
284 | struct jump_entry *iter_stop = __stop___jump_table; | 131 | struct jump_entry *iter_stop = __stop___jump_table; |
132 | struct jump_label_key *key = NULL; | ||
285 | struct jump_entry *iter; | 133 | struct jump_entry *iter; |
286 | 134 | ||
287 | jump_label_lock(); | 135 | jump_label_lock(); |
288 | ret = build_jump_label_hashtable(__start___jump_table, | 136 | jump_label_sort_entries(iter_start, iter_stop); |
289 | __stop___jump_table); | 137 | |
290 | iter = iter_start; | 138 | for (iter = iter_start; iter < iter_stop; iter++) { |
291 | while (iter < iter_stop) { | ||
292 | arch_jump_label_text_poke_early(iter->code); | 139 | arch_jump_label_text_poke_early(iter->code); |
293 | iter++; | 140 | if (iter->key == (jump_label_t)(unsigned long)key) |
141 | continue; | ||
142 | |||
143 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
144 | atomic_set(&key->enabled, 0); | ||
145 | key->entries = iter; | ||
146 | #ifdef CONFIG_MODULES | ||
147 | key->next = NULL; | ||
148 | #endif | ||
294 | } | 149 | } |
295 | jump_label_unlock(); | 150 | jump_label_unlock(); |
296 | return ret; | 151 | |
152 | return 0; | ||
297 | } | 153 | } |
298 | early_initcall(init_jump_label); | 154 | early_initcall(jump_label_init); |
299 | 155 | ||
300 | #ifdef CONFIG_MODULES | 156 | #ifdef CONFIG_MODULES |
301 | 157 | ||
302 | static struct jump_label_module_entry * | 158 | struct jump_label_mod { |
303 | add_jump_label_module_entry(struct jump_label_entry *entry, | 159 | struct jump_label_mod *next; |
304 | struct jump_entry *iter_begin, | 160 | struct jump_entry *entries; |
305 | int count, struct module *mod) | 161 | struct module *mod; |
162 | }; | ||
163 | |||
164 | static int __jump_label_mod_text_reserved(void *start, void *end) | ||
165 | { | ||
166 | struct module *mod; | ||
167 | |||
168 | mod = __module_text_address((unsigned long)start); | ||
169 | if (!mod) | ||
170 | return 0; | ||
171 | |||
172 | WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); | ||
173 | |||
174 | return __jump_label_text_reserved(mod->jump_entries, | ||
175 | mod->jump_entries + mod->num_jump_entries, | ||
176 | start, end); | ||
177 | } | ||
178 | |||
179 | static void __jump_label_mod_update(struct jump_label_key *key, int enable) | ||
180 | { | ||
181 | struct jump_label_mod *mod = key->next; | ||
182 | |||
183 | while (mod) { | ||
184 | __jump_label_update(key, mod->entries, enable); | ||
185 | mod = mod->next; | ||
186 | } | ||
187 | } | ||
188 | |||
189 | /*** | ||
190 | * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() | ||
191 | * @mod: module to patch | ||
192 | * | ||
193 | * Allow for run-time selection of the optimal nops. Before the module | ||
194 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
195 | * the arch specific jump label code. | ||
196 | */ | ||
197 | void jump_label_apply_nops(struct module *mod) | ||
306 | { | 198 | { |
307 | struct jump_label_module_entry *e; | 199 | struct jump_entry *iter_start = mod->jump_entries; |
308 | 200 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | |
309 | e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); | 201 | struct jump_entry *iter; |
310 | if (!e) | 202 | |
311 | return ERR_PTR(-ENOMEM); | 203 | /* if the module doesn't have jump label entries, just return */ |
312 | e->mod = mod; | 204 | if (iter_start == iter_stop) |
313 | e->nr_entries = count; | 205 | return; |
314 | e->table = iter_begin; | 206 | |
315 | hlist_add_head(&e->hlist, &entry->modules); | 207 | for (iter = iter_start; iter < iter_stop; iter++) |
316 | return e; | 208 | arch_jump_label_text_poke_early(iter->code); |
317 | } | 209 | } |
318 | 210 | ||
319 | static int add_jump_label_module(struct module *mod) | 211 | static int jump_label_add_module(struct module *mod) |
320 | { | 212 | { |
321 | struct jump_entry *iter, *iter_begin; | 213 | struct jump_entry *iter_start = mod->jump_entries; |
322 | struct jump_label_entry *entry; | 214 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
323 | struct jump_label_module_entry *module_entry; | 215 | struct jump_entry *iter; |
324 | int count; | 216 | struct jump_label_key *key = NULL; |
217 | struct jump_label_mod *jlm; | ||
325 | 218 | ||
326 | /* if the module doesn't have jump label entries, just return */ | 219 | /* if the module doesn't have jump label entries, just return */ |
327 | if (!mod->num_jump_entries) | 220 | if (iter_start == iter_stop) |
328 | return 0; | 221 | return 0; |
329 | 222 | ||
330 | sort_jump_label_entries(mod->jump_entries, | 223 | jump_label_sort_entries(iter_start, iter_stop); |
331 | mod->jump_entries + mod->num_jump_entries); | 224 | |
332 | iter = mod->jump_entries; | 225 | for (iter = iter_start; iter < iter_stop; iter++) { |
333 | while (iter < mod->jump_entries + mod->num_jump_entries) { | 226 | if (iter->key == (jump_label_t)(unsigned long)key) |
334 | entry = get_jump_label_entry(iter->key); | 227 | continue; |
335 | iter_begin = iter; | 228 | |
336 | count = 0; | 229 | key = (struct jump_label_key *)(unsigned long)iter->key; |
337 | while ((iter < mod->jump_entries + mod->num_jump_entries) && | 230 | |
338 | (iter->key == iter_begin->key)) { | 231 | if (__module_address(iter->key) == mod) { |
339 | iter++; | 232 | atomic_set(&key->enabled, 0); |
340 | count++; | 233 | key->entries = iter; |
341 | } | 234 | key->next = NULL; |
342 | if (!entry) { | 235 | continue; |
343 | entry = add_jump_label_entry(iter_begin->key, 0, NULL); | ||
344 | if (IS_ERR(entry)) | ||
345 | return PTR_ERR(entry); | ||
346 | } | 236 | } |
347 | module_entry = add_jump_label_module_entry(entry, iter_begin, | 237 | |
348 | count, mod); | 238 | jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); |
349 | if (IS_ERR(module_entry)) | 239 | if (!jlm) |
350 | return PTR_ERR(module_entry); | 240 | return -ENOMEM; |
241 | |||
242 | jlm->mod = mod; | ||
243 | jlm->entries = iter; | ||
244 | jlm->next = key->next; | ||
245 | key->next = jlm; | ||
246 | |||
247 | if (jump_label_enabled(key)) | ||
248 | __jump_label_update(key, iter, JUMP_LABEL_ENABLE); | ||
351 | } | 249 | } |
250 | |||
352 | return 0; | 251 | return 0; |
353 | } | 252 | } |
354 | 253 | ||
355 | static void remove_jump_label_module(struct module *mod) | 254 | static void jump_label_del_module(struct module *mod) |
356 | { | 255 | { |
357 | struct hlist_head *head; | 256 | struct jump_entry *iter_start = mod->jump_entries; |
358 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | 257 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
359 | struct jump_label_entry *e; | 258 | struct jump_entry *iter; |
360 | struct jump_label_module_entry *e_module; | 259 | struct jump_label_key *key = NULL; |
361 | int i; | 260 | struct jump_label_mod *jlm, **prev; |
362 | 261 | ||
363 | /* if the module doesn't have jump label entries, just return */ | 262 | for (iter = iter_start; iter < iter_stop; iter++) { |
364 | if (!mod->num_jump_entries) | 263 | if (iter->key == (jump_label_t)(unsigned long)key) |
365 | return; | 264 | continue; |
265 | |||
266 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
267 | |||
268 | if (__module_address(iter->key) == mod) | ||
269 | continue; | ||
270 | |||
271 | prev = &key->next; | ||
272 | jlm = key->next; | ||
366 | 273 | ||
367 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | 274 | while (jlm && jlm->mod != mod) { |
368 | head = &jump_label_table[i]; | 275 | prev = &jlm->next; |
369 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | 276 | jlm = jlm->next; |
370 | hlist_for_each_entry_safe(e_module, module_node, | 277 | } |
371 | module_node_next, | 278 | |
372 | &(e->modules), hlist) { | 279 | if (jlm) { |
373 | if (e_module->mod == mod) { | 280 | *prev = jlm->next; |
374 | hlist_del(&e_module->hlist); | 281 | kfree(jlm); |
375 | kfree(e_module); | ||
376 | } | ||
377 | } | ||
378 | if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { | ||
379 | hlist_del(&e->hlist); | ||
380 | kfree(e); | ||
381 | } | ||
382 | } | 282 | } |
383 | } | 283 | } |
384 | } | 284 | } |
385 | 285 | ||
386 | static void remove_jump_label_module_init(struct module *mod) | 286 | static void jump_label_invalidate_module_init(struct module *mod) |
387 | { | 287 | { |
388 | struct hlist_head *head; | 288 | struct jump_entry *iter_start = mod->jump_entries; |
389 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | 289 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
390 | struct jump_label_entry *e; | ||
391 | struct jump_label_module_entry *e_module; | ||
392 | struct jump_entry *iter; | 290 | struct jump_entry *iter; |
393 | int i, count; | ||
394 | |||
395 | /* if the module doesn't have jump label entries, just return */ | ||
396 | if (!mod->num_jump_entries) | ||
397 | return; | ||
398 | 291 | ||
399 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | 292 | for (iter = iter_start; iter < iter_stop; iter++) { |
400 | head = &jump_label_table[i]; | 293 | if (within_module_init(iter->code, mod)) |
401 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | 294 | iter->code = 0; |
402 | hlist_for_each_entry_safe(e_module, module_node, | ||
403 | module_node_next, | ||
404 | &(e->modules), hlist) { | ||
405 | if (e_module->mod != mod) | ||
406 | continue; | ||
407 | count = e_module->nr_entries; | ||
408 | iter = e_module->table; | ||
409 | while (count--) { | ||
410 | if (within_module_init(iter->code, mod)) | ||
411 | iter->key = 0; | ||
412 | iter++; | ||
413 | } | ||
414 | } | ||
415 | } | ||
416 | } | 295 | } |
417 | } | 296 | } |
418 | 297 | ||
@@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, | |||
426 | switch (val) { | 305 | switch (val) { |
427 | case MODULE_STATE_COMING: | 306 | case MODULE_STATE_COMING: |
428 | jump_label_lock(); | 307 | jump_label_lock(); |
429 | ret = add_jump_label_module(mod); | 308 | ret = jump_label_add_module(mod); |
430 | if (ret) | 309 | if (ret) |
431 | remove_jump_label_module(mod); | 310 | jump_label_del_module(mod); |
432 | jump_label_unlock(); | 311 | jump_label_unlock(); |
433 | break; | 312 | break; |
434 | case MODULE_STATE_GOING: | 313 | case MODULE_STATE_GOING: |
435 | jump_label_lock(); | 314 | jump_label_lock(); |
436 | remove_jump_label_module(mod); | 315 | jump_label_del_module(mod); |
437 | jump_label_unlock(); | 316 | jump_label_unlock(); |
438 | break; | 317 | break; |
439 | case MODULE_STATE_LIVE: | 318 | case MODULE_STATE_LIVE: |
440 | jump_label_lock(); | 319 | jump_label_lock(); |
441 | remove_jump_label_module_init(mod); | 320 | jump_label_invalidate_module_init(mod); |
442 | jump_label_unlock(); | 321 | jump_label_unlock(); |
443 | break; | 322 | break; |
444 | } | 323 | } |
445 | return ret; | ||
446 | } | ||
447 | 324 | ||
448 | /*** | 325 | return notifier_from_errno(ret); |
449 | * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() | ||
450 | * @mod: module to patch | ||
451 | * | ||
452 | * Allow for run-time selection of the optimal nops. Before the module | ||
453 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
454 | * the arch specific jump label code. | ||
455 | */ | ||
456 | void jump_label_apply_nops(struct module *mod) | ||
457 | { | ||
458 | struct jump_entry *iter; | ||
459 | |||
460 | /* if the module doesn't have jump label entries, just return */ | ||
461 | if (!mod->num_jump_entries) | ||
462 | return; | ||
463 | |||
464 | iter = mod->jump_entries; | ||
465 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
466 | arch_jump_label_text_poke_early(iter->code); | ||
467 | iter++; | ||
468 | } | ||
469 | } | 326 | } |
470 | 327 | ||
471 | struct notifier_block jump_label_module_nb = { | 328 | struct notifier_block jump_label_module_nb = { |
472 | .notifier_call = jump_label_module_notify, | 329 | .notifier_call = jump_label_module_notify, |
473 | .priority = 0, | 330 | .priority = 1, /* higher than tracepoints */ |
474 | }; | 331 | }; |
475 | 332 | ||
476 | static __init int init_jump_label_module(void) | 333 | static __init int jump_label_init_module(void) |
477 | { | 334 | { |
478 | return register_module_notifier(&jump_label_module_nb); | 335 | return register_module_notifier(&jump_label_module_nb); |
479 | } | 336 | } |
480 | early_initcall(init_jump_label_module); | 337 | early_initcall(jump_label_init_module); |
481 | 338 | ||
482 | #endif /* CONFIG_MODULES */ | 339 | #endif /* CONFIG_MODULES */ |
483 | 340 | ||
341 | /*** | ||
342 | * jump_label_text_reserved - check if addr range is reserved | ||
343 | * @start: start text addr | ||
344 | * @end: end text addr | ||
345 | * | ||
346 | * checks if the text addr located between @start and @end | ||
347 | * overlaps with any of the jump label patch addresses. Code | ||
348 | * that wants to modify kernel text should first verify that | ||
349 | * it does not overlap with any of the jump label addresses. | ||
350 | * Caller must hold jump_label_mutex. | ||
351 | * | ||
352 | * returns 1 if there is an overlap, 0 otherwise | ||
353 | */ | ||
354 | int jump_label_text_reserved(void *start, void *end) | ||
355 | { | ||
356 | int ret = __jump_label_text_reserved(__start___jump_table, | ||
357 | __stop___jump_table, start, end); | ||
358 | |||
359 | if (ret) | ||
360 | return ret; | ||
361 | |||
362 | #ifdef CONFIG_MODULES | ||
363 | ret = __jump_label_mod_text_reserved(start, end); | ||
364 | #endif | ||
365 | return ret; | ||
366 | } | ||
367 | |||
368 | static void jump_label_update(struct jump_label_key *key, int enable) | ||
369 | { | ||
370 | struct jump_entry *entry = key->entries; | ||
371 | |||
372 | /* if there are no users, entry can be NULL */ | ||
373 | if (entry) | ||
374 | __jump_label_update(key, entry, enable); | ||
375 | |||
376 | #ifdef CONFIG_MODULES | ||
377 | __jump_label_mod_update(key, enable); | ||
378 | #endif | ||
379 | } | ||
380 | |||
484 | #endif | 381 | #endif |
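jump_label_text_reserved() remains the hook for other text patchers to avoid stomping on a jump label site: the core kernel table is checked first, then, with CONFIG_MODULES, the owning module's table via __jump_label_mod_text_reserved(). A hedged sketch of a caller honouring the "caller must hold jump_label_mutex" rule from the comment above; can_patch_text() is an illustrative helper, not an existing API:

#include <linux/jump_label.h>
#include <linux/types.h>

static int can_patch_text(void *addr, size_t len)
{
        int reserved;

        jump_label_lock();
        reserved = jump_label_text_reserved(addr, addr + len);
        jump_label_unlock();

        return !reserved;
}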
diff --git a/kernel/kexec.c b/kernel/kexec.c index 4e240a378df6..8d814cbc8109 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/vmalloc.h> | 33 | #include <linux/vmalloc.h> |
34 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
35 | #include <linux/kmsg_dump.h> | 35 | #include <linux/kmsg_dump.h> |
36 | #include <linux/syscore_ops.h> | ||
36 | 37 | ||
37 | #include <asm/page.h> | 38 | #include <asm/page.h> |
38 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
144 | /* Initialize the list of destination pages */ | 145 | /* Initialize the list of destination pages */ |
145 | INIT_LIST_HEAD(&image->dest_pages); | 146 | INIT_LIST_HEAD(&image->dest_pages); |
146 | 147 | ||
147 | /* Initialize the list of unuseable pages */ | 148 | /* Initialize the list of unusable pages */ |
148 | INIT_LIST_HEAD(&image->unuseable_pages); | 149 | INIT_LIST_HEAD(&image->unuseable_pages); |
149 | 150 | ||
150 | /* Read in the segments */ | 151 | /* Read in the segments */ |
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | |||
454 | /* Deal with the destination pages I have inadvertently allocated. | 455 | /* Deal with the destination pages I have inadvertently allocated. |
455 | * | 456 | * |
456 | * Ideally I would convert multi-page allocations into single | 457 | * Ideally I would convert multi-page allocations into single |
457 | * page allocations, and add everyting to image->dest_pages. | 458 | * page allocations, and add everything to image->dest_pages. |
458 | * | 459 | * |
459 | * For now it is simpler to just free the pages. | 460 | * For now it is simpler to just free the pages. |
460 | */ | 461 | */ |
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image) | |||
602 | /* Walk through and free any extra destination pages I may have */ | 603 | /* Walk through and free any extra destination pages I may have */ |
603 | kimage_free_page_list(&image->dest_pages); | 604 | kimage_free_page_list(&image->dest_pages); |
604 | 605 | ||
605 | /* Walk through and free any unuseable pages I have cached */ | 606 | /* Walk through and free any unusable pages I have cached */ |
606 | kimage_free_page_list(&image->unuseable_pages); | 607 | kimage_free_page_list(&image->unuseable_pages); |
607 | 608 | ||
608 | } | 609 | } |
@@ -1530,8 +1531,7 @@ int kernel_kexec(void) | |||
1530 | if (error) | 1531 | if (error) |
1531 | goto Enable_cpus; | 1532 | goto Enable_cpus; |
1532 | local_irq_disable(); | 1533 | local_irq_disable(); |
1533 | /* Suspend system devices */ | 1534 | error = syscore_suspend(); |
1534 | error = sysdev_suspend(PMSG_FREEZE); | ||
1535 | if (error) | 1535 | if (error) |
1536 | goto Enable_irqs; | 1536 | goto Enable_irqs; |
1537 | } else | 1537 | } else |
@@ -1546,7 +1546,7 @@ int kernel_kexec(void) | |||
1546 | 1546 | ||
1547 | #ifdef CONFIG_KEXEC_JUMP | 1547 | #ifdef CONFIG_KEXEC_JUMP |
1548 | if (kexec_image->preserve_context) { | 1548 | if (kexec_image->preserve_context) { |
1549 | sysdev_resume(); | 1549 | syscore_resume(); |
1550 | Enable_irqs: | 1550 | Enable_irqs: |
1551 | local_irq_enable(); | 1551 | local_irq_enable(); |
1552 | Enable_cpus: | 1552 | Enable_cpus: |
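kernel_kexec() now suspends and resumes "system core" state through syscore_suspend()/syscore_resume() instead of the old sysdev calls. For reference, a minimal sketch of the provider side: one global struct syscore_ops whose callbacks run late, on one CPU, with interrupts disabled (the names here are illustrative):

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int my_timer_suspend(void)
{
        /* save hypothetical device state; runs with IRQs disabled */
        return 0;
}

static void my_timer_resume(void)
{
        /* restore hypothetical device state */
}

static struct syscore_ops my_timer_syscore_ops = {
        .suspend = my_timer_suspend,
        .resume  = my_timer_resume,
};

static int __init my_timer_syscore_init(void)
{
        register_syscore_ops(&my_timer_syscore_ops);
        return 0;
}
device_initcall(my_timer_syscore_init);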
diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a2..5ae0ff38425f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -245,7 +245,6 @@ static void __call_usermodehelper(struct work_struct *work) | |||
245 | } | 245 | } |
246 | } | 246 | } |
247 | 247 | ||
248 | #ifdef CONFIG_PM_SLEEP | ||
249 | /* | 248 | /* |
250 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | 249 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY |
251 | * (used for preventing user land processes from being created after the user | 250 | * (used for preventing user land processes from being created after the user |
@@ -301,6 +300,15 @@ void usermodehelper_enable(void) | |||
301 | usermodehelper_disabled = 0; | 300 | usermodehelper_disabled = 0; |
302 | } | 301 | } |
303 | 302 | ||
303 | /** | ||
304 | * usermodehelper_is_disabled - check if new helpers are allowed to be started | ||
305 | */ | ||
306 | bool usermodehelper_is_disabled(void) | ||
307 | { | ||
308 | return usermodehelper_disabled; | ||
309 | } | ||
310 | EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); | ||
311 | |||
304 | static void helper_lock(void) | 312 | static void helper_lock(void) |
305 | { | 313 | { |
306 | atomic_inc(&running_helpers); | 314 | atomic_inc(&running_helpers); |
@@ -312,12 +320,6 @@ static void helper_unlock(void) | |||
312 | if (atomic_dec_and_test(&running_helpers)) | 320 | if (atomic_dec_and_test(&running_helpers)) |
313 | wake_up(&running_helpers_waitq); | 321 | wake_up(&running_helpers_waitq); |
314 | } | 322 | } |
315 | #else /* CONFIG_PM_SLEEP */ | ||
316 | #define usermodehelper_disabled 0 | ||
317 | |||
318 | static inline void helper_lock(void) {} | ||
319 | static inline void helper_unlock(void) {} | ||
320 | #endif /* CONFIG_PM_SLEEP */ | ||
321 | 323 | ||
322 | /** | 324 | /** |
323 | * call_usermodehelper_setup - prepare to call a usermode helper | 325 | * call_usermodehelper_setup - prepare to call a usermode helper |
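Exporting usermodehelper_is_disabled() lets code outside kmod.c skip spawning a helper that would only come back with -EBUSY while userspace is frozen or not yet available, instead of discovering that after queueing the work. A hedged sketch of the intended call pattern, assuming the call_usermodehelper(path, argv, envp, wait) form of this era; the helper path is illustrative:

#include <linux/errno.h>
#include <linux/kmod.h>

static int run_event_helper(void)
{
        char *argv[] = { "/sbin/my-helper", "event", NULL };   /* illustrative */
        char *envp[] = { "HOME=/",
                         "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

        if (usermodehelper_is_disabled())       /* e.g. during suspend/resume */
                return -EBUSY;

        return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
}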
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0b624e791805..3b053c04dd86 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
17 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
19 | #include <linux/capability.h> | ||
19 | 20 | ||
20 | #define KERNEL_ATTR_RO(_name) \ | 21 | #define KERNEL_ATTR_RO(_name) \ |
21 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | 22 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo); | |||
131 | 132 | ||
132 | #endif /* CONFIG_KEXEC */ | 133 | #endif /* CONFIG_KEXEC */ |
133 | 134 | ||
135 | /* whether file capabilities are enabled */ | ||
136 | static ssize_t fscaps_show(struct kobject *kobj, | ||
137 | struct kobj_attribute *attr, char *buf) | ||
138 | { | ||
139 | return sprintf(buf, "%d\n", file_caps_enabled); | ||
140 | } | ||
141 | KERNEL_ATTR_RO(fscaps); | ||
142 | |||
134 | /* | 143 | /* |
135 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 144 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
136 | */ | 145 | */ |
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj; | |||
158 | EXPORT_SYMBOL_GPL(kernel_kobj); | 167 | EXPORT_SYMBOL_GPL(kernel_kobj); |
159 | 168 | ||
160 | static struct attribute * kernel_attrs[] = { | 169 | static struct attribute * kernel_attrs[] = { |
170 | &fscaps_attr.attr, | ||
161 | #if defined(CONFIG_HOTPLUG) | 171 | #if defined(CONFIG_HOTPLUG) |
162 | &uevent_seqnum_attr.attr, | 172 | &uevent_seqnum_attr.attr, |
163 | &uevent_helper_attr.attr, | 173 | &uevent_helper_attr.attr, |
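fscaps_show() plus KERNEL_ATTR_RO() and the new kernel_attrs[] entry is all it takes to surface the value as /sys/kernel/fscaps. The same three-step recipe, sketched for a hypothetical read-only value:

/* Hypothetical read-only attribute following the same recipe as fscaps */
static ssize_t my_value_show(struct kobject *kobj,
                             struct kobj_attribute *attr, char *buf)
{
        return sprintf(buf, "%d\n", 42);        /* illustrative constant */
}
KERNEL_ATTR_RO(my_value);

/* ...and add &my_value_attr.attr to the kernel_attrs[] table above. */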
diff --git a/kernel/kthread.c b/kernel/kthread.c index 684ab3f7dd72..3b34d2732bce 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -139,7 +139,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
139 | * in @node, to get NUMA affinity for kthread stack, or else give -1. | 139 | * in @node, to get NUMA affinity for kthread stack, or else give -1. |
140 | * When woken, the thread will run @threadfn() with @data as its | 140 | * When woken, the thread will run @threadfn() with @data as its |
141 | * argument. @threadfn() can either call do_exit() directly if it is a | 141 | * argument. @threadfn() can either call do_exit() directly if it is a |
142 | * standalone thread for which noone will call kthread_stop(), or | 142 | * standalone thread for which no one will call kthread_stop(), or |
143 | * return when 'kthread_should_stop()' is true (which means | 143 | * return when 'kthread_should_stop()' is true (which means |
144 | * kthread_stop() has been called). The return value should be zero | 144 | * kthread_stop() has been called). The return value should be zero |
145 | * or a negative error number; it will be passed to kthread_stop(). | 145 | * or a negative error number; it will be passed to kthread_stop(). |
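The kernel-doc above names the two legitimate exit paths for a kthread: call do_exit() yourself when no one will ever kthread_stop() you, or loop until kthread_should_stop() and return a value that kthread_stop() hands back to the stopper. A minimal example of the second, stoppable, pattern:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>

static struct task_struct *my_task;

static int my_worker(void *data)
{
        while (!kthread_should_stop()) {
                /* do one unit of work, then nap */
                msleep_interruptible(100);
        }
        return 0;       /* handed back by kthread_stop() */
}

static int start_worker(void)
{
        my_task = kthread_run(my_worker, NULL, "my_worker");
        return IS_ERR(my_task) ? PTR_ERR(my_task) : 0;
}

static void stop_worker(void)
{
        kthread_stop(my_task);  /* wakes the thread and waits for it to return */
}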
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index ee74b35e528d..376066e10413 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk, | |||
153 | } | 153 | } |
154 | 154 | ||
155 | /** | 155 | /** |
156 | * __account_scheduler_latency - record an occured latency | 156 | * __account_scheduler_latency - record an occurred latency |
157 | * @tsk - the task struct of the task hitting the latency | 157 | * @tsk - the task struct of the task hitting the latency |
158 | * @usecs - the duration of the latency in microseconds | 158 | * @usecs - the duration of the latency in microseconds |
159 | * @inter - 1 if the sleep was interruptible, 0 if uninterruptible | 159 | * @inter - 1 if the sleep was interruptible, 0 if uninterruptible |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f5..63437d065ac8 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
490 | usage[i] = '\0'; | 490 | usage[i] = '\0'; |
491 | } | 491 | } |
492 | 492 | ||
493 | static int __print_lock_name(struct lock_class *class) | ||
494 | { | ||
495 | char str[KSYM_NAME_LEN]; | ||
496 | const char *name; | ||
497 | |||
498 | name = class->name; | ||
499 | if (!name) | ||
500 | name = __get_key_name(class->key, str); | ||
501 | |||
502 | return printk("%s", name); | ||
503 | } | ||
504 | |||
493 | static void print_lock_name(struct lock_class *class) | 505 | static void print_lock_name(struct lock_class *class) |
494 | { | 506 | { |
495 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | 507 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; |
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth) | |||
1053 | return 0; | 1065 | return 0; |
1054 | } | 1066 | } |
1055 | 1067 | ||
1068 | static void | ||
1069 | print_circular_lock_scenario(struct held_lock *src, | ||
1070 | struct held_lock *tgt, | ||
1071 | struct lock_list *prt) | ||
1072 | { | ||
1073 | struct lock_class *source = hlock_class(src); | ||
1074 | struct lock_class *target = hlock_class(tgt); | ||
1075 | struct lock_class *parent = prt->class; | ||
1076 | |||
1077 | /* | ||
1078 | * A direct locking problem where unsafe_class lock is taken | ||
1079 | * directly by safe_class lock, then all we need to show | ||
1080 | * is the deadlock scenario, as it is obvious that the | ||
1081 | * unsafe lock is taken under the safe lock. | ||
1082 | * | ||
1083 | * But if there is a chain instead, where the safe lock takes | ||
1084 | * an intermediate lock (middle_class) where this lock is | ||
1085 | * not the same as the safe lock, then the lock chain is | ||
1086 | * used to describe the problem. Otherwise we would need | ||
1087 | * to show a different CPU case for each link in the chain | ||
1088 | * from the safe_class lock to the unsafe_class lock. | ||
1089 | */ | ||
1090 | if (parent != source) { | ||
1091 | printk("Chain exists of:\n "); | ||
1092 | __print_lock_name(source); | ||
1093 | printk(" --> "); | ||
1094 | __print_lock_name(parent); | ||
1095 | printk(" --> "); | ||
1096 | __print_lock_name(target); | ||
1097 | printk("\n\n"); | ||
1098 | } | ||
1099 | |||
1100 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1101 | printk(" CPU0 CPU1\n"); | ||
1102 | printk(" ---- ----\n"); | ||
1103 | printk(" lock("); | ||
1104 | __print_lock_name(target); | ||
1105 | printk(");\n"); | ||
1106 | printk(" lock("); | ||
1107 | __print_lock_name(parent); | ||
1108 | printk(");\n"); | ||
1109 | printk(" lock("); | ||
1110 | __print_lock_name(target); | ||
1111 | printk(");\n"); | ||
1112 | printk(" lock("); | ||
1113 | __print_lock_name(source); | ||
1114 | printk(");\n"); | ||
1115 | printk("\n *** DEADLOCK ***\n\n"); | ||
1116 | } | ||
1117 | |||
1056 | /* | 1118 | /* |
1057 | * When a circular dependency is detected, print the | 1119 | * When a circular dependency is detected, print the |
1058 | * header first: | 1120 | * header first: |
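print_circular_lock_scenario() boils the classic AB-BA report down to the two-CPU picture above. The code shape that produces such a report is simply two paths taking the same pair of locks in opposite order:

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);
static DEFINE_MUTEX(lock_b);

static void path_one(void)
{
        mutex_lock(&lock_a);
        mutex_lock(&lock_b);            /* establishes A -> B */
        mutex_unlock(&lock_b);
        mutex_unlock(&lock_a);
}

static void path_two(void)
{
        mutex_lock(&lock_b);
        mutex_lock(&lock_a);            /* B -> A closes the cycle; lockdep reports it */
        mutex_unlock(&lock_a);
        mutex_unlock(&lock_b);
}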
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1096 | { | 1158 | { |
1097 | struct task_struct *curr = current; | 1159 | struct task_struct *curr = current; |
1098 | struct lock_list *parent; | 1160 | struct lock_list *parent; |
1161 | struct lock_list *first_parent; | ||
1099 | int depth; | 1162 | int depth; |
1100 | 1163 | ||
1101 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1164 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1109 | print_circular_bug_header(target, depth, check_src, check_tgt); | 1172 | print_circular_bug_header(target, depth, check_src, check_tgt); |
1110 | 1173 | ||
1111 | parent = get_lock_parent(target); | 1174 | parent = get_lock_parent(target); |
1175 | first_parent = parent; | ||
1112 | 1176 | ||
1113 | while (parent) { | 1177 | while (parent) { |
1114 | print_circular_bug_entry(parent, --depth); | 1178 | print_circular_bug_entry(parent, --depth); |
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
1116 | } | 1180 | } |
1117 | 1181 | ||
1118 | printk("\nother info that might help us debug this:\n\n"); | 1182 | printk("\nother info that might help us debug this:\n\n"); |
1183 | print_circular_lock_scenario(check_src, check_tgt, | ||
1184 | first_parent); | ||
1185 | |||
1119 | lockdep_print_held_locks(curr); | 1186 | lockdep_print_held_locks(curr); |
1120 | 1187 | ||
1121 | printk("\nstack backtrace:\n"); | 1188 | printk("\nstack backtrace:\n"); |
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1314 | printk("\n"); | 1381 | printk("\n"); |
1315 | 1382 | ||
1316 | if (depth == 0 && (entry != root)) { | 1383 | if (depth == 0 && (entry != root)) { |
1317 | printk("lockdep:%s bad BFS generated tree\n", __func__); | 1384 | printk("lockdep:%s bad path found in chain graph\n", __func__); |
1318 | break; | 1385 | break; |
1319 | } | 1386 | } |
1320 | 1387 | ||
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
1325 | return; | 1392 | return; |
1326 | } | 1393 | } |
1327 | 1394 | ||
1395 | static void | ||
1396 | print_irq_lock_scenario(struct lock_list *safe_entry, | ||
1397 | struct lock_list *unsafe_entry, | ||
1398 | struct lock_class *prev_class, | ||
1399 | struct lock_class *next_class) | ||
1400 | { | ||
1401 | struct lock_class *safe_class = safe_entry->class; | ||
1402 | struct lock_class *unsafe_class = unsafe_entry->class; | ||
1403 | struct lock_class *middle_class = prev_class; | ||
1404 | |||
1405 | if (middle_class == safe_class) | ||
1406 | middle_class = next_class; | ||
1407 | |||
1408 | /* | ||
1409 | * A direct locking problem where unsafe_class lock is taken | ||
1410 | * directly by safe_class lock, then all we need to show | ||
1411 | * is the deadlock scenario, as it is obvious that the | ||
1412 | * unsafe lock is taken under the safe lock. | ||
1413 | * | ||
1414 | * But if there is a chain instead, where the safe lock takes | ||
1415 | * an intermediate lock (middle_class) where this lock is | ||
1416 | * not the same as the safe lock, then the lock chain is | ||
1417 | * used to describe the problem. Otherwise we would need | ||
1418 | * to show a different CPU case for each link in the chain | ||
1419 | * from the safe_class lock to the unsafe_class lock. | ||
1420 | */ | ||
1421 | if (middle_class != unsafe_class) { | ||
1422 | printk("Chain exists of:\n "); | ||
1423 | __print_lock_name(safe_class); | ||
1424 | printk(" --> "); | ||
1425 | __print_lock_name(middle_class); | ||
1426 | printk(" --> "); | ||
1427 | __print_lock_name(unsafe_class); | ||
1428 | printk("\n\n"); | ||
1429 | } | ||
1430 | |||
1431 | printk(" Possible interrupt unsafe locking scenario:\n\n"); | ||
1432 | printk(" CPU0 CPU1\n"); | ||
1433 | printk(" ---- ----\n"); | ||
1434 | printk(" lock("); | ||
1435 | __print_lock_name(unsafe_class); | ||
1436 | printk(");\n"); | ||
1437 | printk(" local_irq_disable();\n"); | ||
1438 | printk(" lock("); | ||
1439 | __print_lock_name(safe_class); | ||
1440 | printk(");\n"); | ||
1441 | printk(" lock("); | ||
1442 | __print_lock_name(middle_class); | ||
1443 | printk(");\n"); | ||
1444 | printk(" <Interrupt>\n"); | ||
1445 | printk(" lock("); | ||
1446 | __print_lock_name(safe_class); | ||
1447 | printk(");\n"); | ||
1448 | printk("\n *** DEADLOCK ***\n\n"); | ||
1449 | } | ||
1450 | |||
1328 | static int | 1451 | static int |
1329 | print_bad_irq_dependency(struct task_struct *curr, | 1452 | print_bad_irq_dependency(struct task_struct *curr, |
1330 | struct lock_list *prev_root, | 1453 | struct lock_list *prev_root, |
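print_irq_lock_scenario() covers the interrupt flavour: an irq-safe lock (one also taken in hardirq context) must never come to depend on an irq-unsafe one. A sketch of the dependency it warns about; the fix is either to take unsafe_lock with interrupts disabled everywhere, or not to nest it under safe_lock:

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(safe_lock);      /* also taken from the handler below */
static DEFINE_SPINLOCK(unsafe_lock);    /* only ever taken with IRQs enabled */

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
        spin_lock(&safe_lock);          /* hardirq context: safe_lock is irq-safe */
        spin_unlock(&safe_lock);
        return IRQ_HANDLED;
}

static void unsafe_user(void)
{
        spin_lock(&unsafe_lock);        /* IRQs still on: unsafe_lock is irq-unsafe */
        spin_unlock(&unsafe_lock);
}

static void bad_dependency(void)
{
        unsigned long flags;

        spin_lock_irqsave(&safe_lock, flags);
        spin_lock(&unsafe_lock);        /* irq-safe -> irq-unsafe: lockdep complains */
        spin_unlock(&unsafe_lock);
        spin_unlock_irqrestore(&safe_lock, flags);
}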
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
1376 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); | 1499 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); |
1377 | 1500 | ||
1378 | printk("\nother info that might help us debug this:\n\n"); | 1501 | printk("\nother info that might help us debug this:\n\n"); |
1502 | print_irq_lock_scenario(backwards_entry, forwards_entry, | ||
1503 | hlock_class(prev), hlock_class(next)); | ||
1504 | |||
1379 | lockdep_print_held_locks(curr); | 1505 | lockdep_print_held_locks(curr); |
1380 | 1506 | ||
1381 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); | 1507 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); |
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void) | |||
1539 | 1665 | ||
1540 | #endif | 1666 | #endif |
1541 | 1667 | ||
1668 | static void | ||
1669 | print_deadlock_scenario(struct held_lock *nxt, | ||
1670 | struct held_lock *prv) | ||
1671 | { | ||
1672 | struct lock_class *next = hlock_class(nxt); | ||
1673 | struct lock_class *prev = hlock_class(prv); | ||
1674 | |||
1675 | printk(" Possible unsafe locking scenario:\n\n"); | ||
1676 | printk(" CPU0\n"); | ||
1677 | printk(" ----\n"); | ||
1678 | printk(" lock("); | ||
1679 | __print_lock_name(prev); | ||
1680 | printk(");\n"); | ||
1681 | printk(" lock("); | ||
1682 | __print_lock_name(next); | ||
1683 | printk(");\n"); | ||
1684 | printk("\n *** DEADLOCK ***\n\n"); | ||
1685 | printk(" May be due to missing lock nesting notation\n\n"); | ||
1686 | } | ||
1687 | |||
1542 | static int | 1688 | static int |
1543 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 1689 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
1544 | struct held_lock *next) | 1690 | struct held_lock *next) |
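print_deadlock_scenario() handles same-class recursion on one CPU, and its closing hint about "missing lock nesting notation" points at the *_nested() annotations: when taking two locks of the same class at different levels is intentional, the second acquisition is annotated with a subclass so lockdep does not treat it as an AA deadlock. A sketch (mutexes assumed initialised elsewhere):

#include <linux/lockdep.h>
#include <linux/mutex.h>

struct my_node {
        struct mutex lock;
        struct my_node *parent;
};

/* Without annotation, lockdep flags the second lock of the same class... */
static void lock_pair_plain(struct my_node *child)
{
        mutex_lock(&child->parent->lock);
        mutex_lock(&child->lock);       /* same class acquired twice */
        mutex_unlock(&child->lock);
        mutex_unlock(&child->parent->lock);
}

/* ...the nesting annotation declares the parent/child ordering as intended. */
static void lock_pair_annotated(struct my_node *child)
{
        mutex_lock(&child->parent->lock);
        mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);
        mutex_unlock(&child->lock);
        mutex_unlock(&child->parent->lock);
}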
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
1557 | print_lock(prev); | 1703 | print_lock(prev); |
1558 | 1704 | ||
1559 | printk("\nother info that might help us debug this:\n"); | 1705 | printk("\nother info that might help us debug this:\n"); |
1706 | print_deadlock_scenario(next, prev); | ||
1560 | lockdep_print_held_locks(curr); | 1707 | lockdep_print_held_locks(curr); |
1561 | 1708 | ||
1562 | printk("\nstack backtrace:\n"); | 1709 | printk("\nstack backtrace:\n"); |
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
1826 | struct list_head *hash_head = chainhashentry(chain_key); | 1973 | struct list_head *hash_head = chainhashentry(chain_key); |
1827 | struct lock_chain *chain; | 1974 | struct lock_chain *chain; |
1828 | struct held_lock *hlock_curr, *hlock_next; | 1975 | struct held_lock *hlock_curr, *hlock_next; |
1829 | int i, j, n, cn; | 1976 | int i, j; |
1830 | 1977 | ||
1831 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 1978 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
1832 | return 0; | 1979 | return 0; |
@@ -1886,15 +2033,9 @@ cache_hit: | |||
1886 | } | 2033 | } |
1887 | i++; | 2034 | i++; |
1888 | chain->depth = curr->lockdep_depth + 1 - i; | 2035 | chain->depth = curr->lockdep_depth + 1 - i; |
1889 | cn = nr_chain_hlocks; | 2036 | if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { |
1890 | while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { | 2037 | chain->base = nr_chain_hlocks; |
1891 | n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); | 2038 | nr_chain_hlocks += chain->depth; |
1892 | if (n == cn) | ||
1893 | break; | ||
1894 | cn = n; | ||
1895 | } | ||
1896 | if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { | ||
1897 | chain->base = cn; | ||
1898 | for (j = 0; j < chain->depth - 1; j++, i++) { | 2039 | for (j = 0; j < chain->depth - 1; j++, i++) { |
1899 | int lock_id = curr->held_locks[i].class_idx - 1; | 2040 | int lock_id = curr->held_locks[i].class_idx - 1; |
1900 | chain_hlocks[chain->base + j] = lock_id; | 2041 | chain_hlocks[chain->base + j] = lock_id; |
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr) | |||
2011 | #endif | 2152 | #endif |
2012 | } | 2153 | } |
2013 | 2154 | ||
2155 | static void | ||
2156 | print_usage_bug_scenario(struct held_lock *lock) | ||
2157 | { | ||
2158 | struct lock_class *class = hlock_class(lock); | ||
2159 | |||
2160 | printk(" Possible unsafe locking scenario:\n\n"); | ||
2161 | printk(" CPU0\n"); | ||
2162 | printk(" ----\n"); | ||
2163 | printk(" lock("); | ||
2164 | __print_lock_name(class); | ||
2165 | printk(");\n"); | ||
2166 | printk(" <Interrupt>\n"); | ||
2167 | printk(" lock("); | ||
2168 | __print_lock_name(class); | ||
2169 | printk(");\n"); | ||
2170 | printk("\n *** DEADLOCK ***\n\n"); | ||
2171 | } | ||
2172 | |||
2014 | static int | 2173 | static int |
2015 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 2174 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
2016 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 2175 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
2039 | 2198 | ||
2040 | print_irqtrace_events(curr); | 2199 | print_irqtrace_events(curr); |
2041 | printk("\nother info that might help us debug this:\n"); | 2200 | printk("\nother info that might help us debug this:\n"); |
2201 | print_usage_bug_scenario(this); | ||
2202 | |||
2042 | lockdep_print_held_locks(curr); | 2203 | lockdep_print_held_locks(curr); |
2043 | 2204 | ||
2044 | printk("\nstack backtrace:\n"); | 2205 | printk("\nstack backtrace:\n"); |
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2073 | struct held_lock *this, int forwards, | 2234 | struct held_lock *this, int forwards, |
2074 | const char *irqclass) | 2235 | const char *irqclass) |
2075 | { | 2236 | { |
2237 | struct lock_list *entry = other; | ||
2238 | struct lock_list *middle = NULL; | ||
2239 | int depth; | ||
2240 | |||
2076 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2241 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
2077 | return 0; | 2242 | return 0; |
2078 | 2243 | ||
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
2091 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | 2256 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); |
2092 | 2257 | ||
2093 | printk("\nother info that might help us debug this:\n"); | 2258 | printk("\nother info that might help us debug this:\n"); |
2259 | |||
2260 | /* Find a middle lock (if one exists) */ | ||
2261 | depth = get_lock_depth(other); | ||
2262 | do { | ||
2263 | if (depth == 0 && (entry != root)) { | ||
2264 | printk("lockdep:%s bad path found in chain graph\n", __func__); | ||
2265 | break; | ||
2266 | } | ||
2267 | middle = entry; | ||
2268 | entry = get_lock_parent(entry); | ||
2269 | depth--; | ||
2270 | } while (entry && entry != root && (depth >= 0)); | ||
2271 | if (forwards) | ||
2272 | print_irq_lock_scenario(root, other, | ||
2273 | middle ? middle->class : root->class, other->class); | ||
2274 | else | ||
2275 | print_irq_lock_scenario(other, root, | ||
2276 | middle ? middle->class : other->class, root->class); | ||
2277 | |||
2094 | lockdep_print_held_locks(curr); | 2278 | lockdep_print_held_locks(curr); |
2095 | 2279 | ||
2096 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); | 2280 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); |
@@ -2309,7 +2493,7 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
2309 | if (unlikely(curr->hardirqs_enabled)) { | 2493 | if (unlikely(curr->hardirqs_enabled)) { |
2310 | /* | 2494 | /* |
2311 | * Neither irq nor preemption are disabled here | 2495 | * Neither irq nor preemption are disabled here |
2312 | * so this is racy by nature but loosing one hit | 2496 | * so this is racy by nature but losing one hit |
2313 | * in a stat is not a big deal. | 2497 | * in a stat is not a big deal. |
2314 | */ | 2498 | */ |
2315 | __debug_atomic_inc(redundant_hardirqs_on); | 2499 | __debug_atomic_inc(redundant_hardirqs_on); |
@@ -2620,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
2620 | if (!graph_lock()) | 2804 | if (!graph_lock()) |
2621 | return 0; | 2805 | return 0; |
2622 | /* | 2806 | /* |
2623 | * Make sure we didnt race: | 2807 | * Make sure we didn't race: |
2624 | */ | 2808 | */ |
2625 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { | 2809 | if (unlikely(hlock_class(this)->usage_mask & new_mask)) { |
2626 | graph_unlock(); | 2810 | graph_unlock(); |
diff --git a/kernel/module.c b/kernel/module.c index 1f9f7bc56ca1..22879725678d 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
58 | #include <linux/jump_label.h> | 58 | #include <linux/jump_label.h> |
59 | #include <linux/pfn.h> | 59 | #include <linux/pfn.h> |
60 | #include <linux/bsearch.h> | ||
60 | 61 | ||
61 | #define CREATE_TRACE_POINTS | 62 | #define CREATE_TRACE_POINTS |
62 | #include <trace/events/module.h> | 63 | #include <trace/events/module.h> |
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr, | |||
240 | struct module *owner, | 241 | struct module *owner, |
241 | bool (*fn)(const struct symsearch *syms, | 242 | bool (*fn)(const struct symsearch *syms, |
242 | struct module *owner, | 243 | struct module *owner, |
243 | unsigned int symnum, void *data), | 244 | void *data), |
244 | void *data) | 245 | void *data) |
245 | { | 246 | { |
246 | unsigned int i, j; | 247 | unsigned int j; |
247 | 248 | ||
248 | for (j = 0; j < arrsize; j++) { | 249 | for (j = 0; j < arrsize; j++) { |
249 | for (i = 0; i < arr[j].stop - arr[j].start; i++) | 250 | if (fn(&arr[j], owner, data)) |
250 | if (fn(&arr[j], owner, i, data)) | 251 | return true; |
251 | return true; | ||
252 | } | 252 | } |
253 | 253 | ||
254 | return false; | 254 | return false; |
255 | } | 255 | } |
256 | 256 | ||
257 | /* Returns true as soon as fn returns true, otherwise false. */ | 257 | /* Returns true as soon as fn returns true, otherwise false. */ |
258 | bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | 258 | bool each_symbol_section(bool (*fn)(const struct symsearch *arr, |
259 | unsigned int symnum, void *data), void *data) | 259 | struct module *owner, |
260 | void *data), | ||
261 | void *data) | ||
260 | { | 262 | { |
261 | struct module *mod; | 263 | struct module *mod; |
262 | static const struct symsearch arr[] = { | 264 | static const struct symsearch arr[] = { |
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | |||
309 | } | 311 | } |
310 | return false; | 312 | return false; |
311 | } | 313 | } |
312 | EXPORT_SYMBOL_GPL(each_symbol); | 314 | EXPORT_SYMBOL_GPL(each_symbol_section); |
313 | 315 | ||
314 | struct find_symbol_arg { | 316 | struct find_symbol_arg { |
315 | /* Input */ | 317 | /* Input */ |
@@ -323,15 +325,12 @@ struct find_symbol_arg { | |||
323 | const struct kernel_symbol *sym; | 325 | const struct kernel_symbol *sym; |
324 | }; | 326 | }; |
325 | 327 | ||
326 | static bool find_symbol_in_section(const struct symsearch *syms, | 328 | static bool check_symbol(const struct symsearch *syms, |
327 | struct module *owner, | 329 | struct module *owner, |
328 | unsigned int symnum, void *data) | 330 | unsigned int symnum, void *data) |
329 | { | 331 | { |
330 | struct find_symbol_arg *fsa = data; | 332 | struct find_symbol_arg *fsa = data; |
331 | 333 | ||
332 | if (strcmp(syms->start[symnum].name, fsa->name) != 0) | ||
333 | return false; | ||
334 | |||
335 | if (!fsa->gplok) { | 334 | if (!fsa->gplok) { |
336 | if (syms->licence == GPL_ONLY) | 335 | if (syms->licence == GPL_ONLY) |
337 | return false; | 336 | return false; |
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms, | |||
365 | return true; | 364 | return true; |
366 | } | 365 | } |
367 | 366 | ||
367 | static int cmp_name(const void *va, const void *vb) | ||
368 | { | ||
369 | const char *a; | ||
370 | const struct kernel_symbol *b; | ||
371 | a = va; b = vb; | ||
372 | return strcmp(a, b->name); | ||
373 | } | ||
374 | |||
375 | static bool find_symbol_in_section(const struct symsearch *syms, | ||
376 | struct module *owner, | ||
377 | void *data) | ||
378 | { | ||
379 | struct find_symbol_arg *fsa = data; | ||
380 | struct kernel_symbol *sym; | ||
381 | |||
382 | sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, | ||
383 | sizeof(struct kernel_symbol), cmp_name); | ||
384 | |||
385 | if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) | ||
386 | return true; | ||
387 | |||
388 | return false; | ||
389 | } | ||
390 | |||
368 | /* Find a symbol and return it, along with, (optional) crc and | 391 | /* Find a symbol and return it, along with, (optional) crc and |
369 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ | 392 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ |
370 | const struct kernel_symbol *find_symbol(const char *name, | 393 | const struct kernel_symbol *find_symbol(const char *name, |
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
379 | fsa.gplok = gplok; | 402 | fsa.gplok = gplok; |
380 | fsa.warn = warn; | 403 | fsa.warn = warn; |
381 | 404 | ||
382 | if (each_symbol(find_symbol_in_section, &fsa)) { | 405 | if (each_symbol_section(find_symbol_in_section, &fsa)) { |
383 | if (owner) | 406 | if (owner) |
384 | *owner = fsa.owner; | 407 | *owner = fsa.owner; |
385 | if (crc) | 408 | if (crc) |
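find_symbol_in_section() now does a binary search over each exported-symbol section instead of the old per-symbol linear walk, using the bsearch() helper made available through <linux/bsearch.h>; cmp_name() compares the looked-up name (the key) against a struct kernel_symbol (an array element). bsearch() of course requires the per-section tables to be sorted by name, which is arranged elsewhere in this series. A standalone sketch of the same call shape with a simplified symbol struct:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sym { const char *name; unsigned long value; };

/* key is the name being looked up, elt an array element, as in cmp_name() */
static int cmp_name(const void *key, const void *elt)
{
        const struct sym *s = elt;

        return strcmp(key, s->name);
}

int main(void)
{
        /* must already be sorted by name for bsearch() to work */
        struct sym tab[] = {
                { "kfree",   0x1000 },
                { "kmalloc", 0x2000 },
                { "printk",  0x3000 },
        };
        struct sym *hit = bsearch("kmalloc", tab,
                                  sizeof(tab) / sizeof(tab[0]),
                                  sizeof(tab[0]), cmp_name);

        if (hit)
                printf("%s = %#lx\n", hit->name, hit->value);
        return 0;
}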
@@ -809,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, | |||
809 | wait_for_zero_refcount(mod); | 832 | wait_for_zero_refcount(mod); |
810 | 833 | ||
811 | mutex_unlock(&module_mutex); | 834 | mutex_unlock(&module_mutex); |
812 | /* Final destruction now noone is using it. */ | 835 | /* Final destruction now no one is using it. */ |
813 | if (mod->exit != NULL) | 836 | if (mod->exit != NULL) |
814 | mod->exit(); | 837 | mod->exit(); |
815 | blocking_notifier_call_chain(&module_notify_list, | 838 | blocking_notifier_call_chain(&module_notify_list, |
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base, | |||
1607 | } | 1630 | } |
1608 | } | 1631 | } |
1609 | 1632 | ||
1610 | /* Setting memory back to RW+NX before releasing it */ | 1633 | static void unset_module_core_ro_nx(struct module *mod) |
1611 | void unset_section_ro_nx(struct module *mod, void *module_region) | ||
1612 | { | 1634 | { |
1613 | unsigned long total_pages; | 1635 | set_page_attributes(mod->module_core + mod->core_text_size, |
1614 | 1636 | mod->module_core + mod->core_size, | |
1615 | if (mod->module_core == module_region) { | 1637 | set_memory_x); |
1616 | /* Set core as NX+RW */ | 1638 | set_page_attributes(mod->module_core, |
1617 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); | 1639 | mod->module_core + mod->core_ro_size, |
1618 | set_memory_nx((unsigned long)mod->module_core, total_pages); | 1640 | set_memory_rw); |
1619 | set_memory_rw((unsigned long)mod->module_core, total_pages); | 1641 | } |
1620 | 1642 | ||
1621 | } else if (mod->module_init == module_region) { | 1643 | static void unset_module_init_ro_nx(struct module *mod) |
1622 | /* Set init as NX+RW */ | 1644 | { |
1623 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); | 1645 | set_page_attributes(mod->module_init + mod->init_text_size, |
1624 | set_memory_nx((unsigned long)mod->module_init, total_pages); | 1646 | mod->module_init + mod->init_size, |
1625 | set_memory_rw((unsigned long)mod->module_init, total_pages); | 1647 | set_memory_x); |
1626 | } | 1648 | set_page_attributes(mod->module_init, |
1649 | mod->module_init + mod->init_ro_size, | ||
1650 | set_memory_rw); | ||
1627 | } | 1651 | } |
1628 | 1652 | ||
1629 | /* Iterate through all modules and set each module's text as RW */ | 1653 | /* Iterate through all modules and set each module's text as RW */ |
1630 | void set_all_modules_text_rw() | 1654 | void set_all_modules_text_rw(void) |
1631 | { | 1655 | { |
1632 | struct module *mod; | 1656 | struct module *mod; |
1633 | 1657 | ||
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw() | |||
1648 | } | 1672 | } |
1649 | 1673 | ||
1650 | /* Iterate through all modules and set each module's text as RO */ | 1674 | /* Iterate through all modules and set each module's text as RO */ |
1651 | void set_all_modules_text_ro() | 1675 | void set_all_modules_text_ro(void) |
1652 | { | 1676 | { |
1653 | struct module *mod; | 1677 | struct module *mod; |
1654 | 1678 | ||
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro() | |||
1669 | } | 1693 | } |
1670 | #else | 1694 | #else |
1671 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | 1695 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } |
1672 | static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } | 1696 | static void unset_module_core_ro_nx(struct module *mod) { } |
1697 | static void unset_module_init_ro_nx(struct module *mod) { } | ||
1673 | #endif | 1698 | #endif |
1674 | 1699 | ||
1675 | /* Free a module, remove from lists, etc. */ | 1700 | /* Free a module, remove from lists, etc. */ |
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod) | |||
1696 | destroy_params(mod->kp, mod->num_kp); | 1721 | destroy_params(mod->kp, mod->num_kp); |
1697 | 1722 | ||
1698 | /* This may be NULL, but that's OK */ | 1723 | /* This may be NULL, but that's OK */ |
1699 | unset_section_ro_nx(mod, mod->module_init); | 1724 | unset_module_init_ro_nx(mod); |
1700 | module_free(mod, mod->module_init); | 1725 | module_free(mod, mod->module_init); |
1701 | kfree(mod->args); | 1726 | kfree(mod->args); |
1702 | percpu_modfree(mod); | 1727 | percpu_modfree(mod); |
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod) | |||
1705 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1730 | lockdep_free_key_range(mod->module_core, mod->core_size); |
1706 | 1731 | ||
1707 | /* Finally, free the core (containing the module structure) */ | 1732 | /* Finally, free the core (containing the module structure) */ |
1708 | unset_section_ro_nx(mod, mod->module_core); | 1733 | unset_module_core_ro_nx(mod); |
1709 | module_free(mod, mod->module_core); | 1734 | module_free(mod, mod->module_core); |
1710 | 1735 | ||
1711 | #ifdef CONFIG_MPU | 1736 | #ifdef CONFIG_MPU |
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
2030 | const struct kernel_symbol *start, | 2055 | const struct kernel_symbol *start, |
2031 | const struct kernel_symbol *stop) | 2056 | const struct kernel_symbol *stop) |
2032 | { | 2057 | { |
2033 | const struct kernel_symbol *ks = start; | 2058 | return bsearch(name, start, stop - start, |
2034 | for (; ks < stop; ks++) | 2059 | sizeof(struct kernel_symbol), cmp_name); |
2035 | if (strcmp(ks->name, name) == 0) | ||
2036 | return ks; | ||
2037 | return NULL; | ||
2038 | } | 2060 | } |
2039 | 2061 | ||
2040 | static int is_exported(const char *name, unsigned long value, | 2062 | static int is_exported(const char *name, unsigned long value, |
@@ -2777,7 +2799,7 @@ static struct module *load_module(void __user *umod, | |||
2777 | mod->state = MODULE_STATE_COMING; | 2799 | mod->state = MODULE_STATE_COMING; |
2778 | 2800 | ||
2779 | /* Now sew it into the lists so we can get lockdep and oops | 2801 | /* Now sew it into the lists so we can get lockdep and oops |
2780 | * info during argument parsing. Noone should access us, since | 2802 | * info during argument parsing. No one should access us, since |
2781 | * strong_try_module_get() will fail. | 2803 | * strong_try_module_get() will fail. |
2782 | * lockdep/oops can run asynchronous, so use the RCU list insertion | 2804 | * lockdep/oops can run asynchronous, so use the RCU list insertion |
2783 | * function to insert in a way safe to concurrent readers. | 2805 | * function to insert in a way safe to concurrent readers. |
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
2931 | mod->symtab = mod->core_symtab; | 2953 | mod->symtab = mod->core_symtab; |
2932 | mod->strtab = mod->core_strtab; | 2954 | mod->strtab = mod->core_strtab; |
2933 | #endif | 2955 | #endif |
2934 | unset_section_ro_nx(mod, mod->module_init); | 2956 | unset_module_init_ro_nx(mod); |
2935 | module_free(mod, mod->module_init); | 2957 | module_free(mod, mod->module_init); |
2936 | mod->module_init = NULL; | 2958 | mod->module_init = NULL; |
2937 | mod->init_size = 0; | 2959 | mod->init_size = 0; |
2960 | mod->init_ro_size = 0; | ||
2938 | mod->init_text_size = 0; | 2961 | mod->init_text_size = 0; |
2939 | mutex_unlock(&module_mutex); | 2962 | mutex_unlock(&module_mutex); |
2940 | 2963 | ||
@@ -2971,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod, | |||
2971 | else | 2994 | else |
2972 | nextval = (unsigned long)mod->module_core+mod->core_text_size; | 2995 | nextval = (unsigned long)mod->module_core+mod->core_text_size; |
2973 | 2996 | ||
2974 | /* Scan for closest preceeding symbol, and next symbol. (ELF | 2997 | /* Scan for closest preceding symbol, and next symbol. (ELF |
2975 | starts real symbols at 1). */ | 2998 | starts real symbols at 1). */ |
2976 | for (i = 1; i < mod->num_symtab; i++) { | 2999 | for (i = 1; i < mod->num_symtab; i++) { |
2977 | if (mod->symtab[i].st_shndx == SHN_UNDEF) | 3000 | if (mod->symtab[i].st_shndx == SHN_UNDEF) |
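Note on the module.c hunks above: each_symbol_section()/check_symbol() and lookup_symbol() switch from a linear strcmp() scan to bsearch() over the exported-symbol tables, which are kept sorted by name, using the cmp_name() comparator added in this patch. A minimal userspace sketch of that lookup pattern follows; the table contents, addresses and the simplified struct are made up for illustration and are not the real __ksymtab sections.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for struct kernel_symbol (address + name). */
struct kernel_symbol {
	unsigned long value;
	const char *name;
};

/* Comparator: the key is a plain name, each element is a kernel_symbol. */
static int cmp_name(const void *va, const void *vb)
{
	const char *a = va;
	const struct kernel_symbol *b = vb;
	return strcmp(a, b->name);
}

int main(void)
{
	/* Must be sorted by name, as the real export tables are. */
	static const struct kernel_symbol tab[] = {
		{ 0x1000, "kfree"   },
		{ 0x2000, "kmalloc" },
		{ 0x3000, "printk"  },
	};
	const struct kernel_symbol *sym;

	sym = bsearch("kmalloc", tab, sizeof(tab) / sizeof(tab[0]),
		      sizeof(tab[0]), cmp_name);
	if (sym)
		printf("%s at %#lx\n", sym->name, sym->value);
	return 0;
}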
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) | |||
75 | return; | 75 | return; |
76 | 76 | ||
77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
78 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 78 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
80 | mutex_clear_owner(lock); | 80 | mutex_clear_owner(lock); |
81 | } | 81 | } |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, | |||
29 | 29 | ||
30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
31 | { | 31 | { |
32 | lock->owner = current_thread_info(); | 32 | lock->owner = current; |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/mutex.c b/kernel/mutex.c index a5889fb28ecf..2c938e2337cd 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
160 | */ | 160 | */ |
161 | 161 | ||
162 | for (;;) { | 162 | for (;;) { |
163 | struct thread_info *owner; | 163 | struct task_struct *owner; |
164 | |||
165 | /* | ||
166 | * If we own the BKL, then don't spin. The owner of | ||
167 | * the mutex might be waiting on us to release the BKL. | ||
168 | */ | ||
169 | if (unlikely(current->lock_depth >= 0)) | ||
170 | break; | ||
171 | 164 | ||
172 | /* | 165 | /* |
173 | * If there's an owner, wait for it to either | 166 | * If there's an owner, wait for it to either |
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
245 | } | 238 | } |
246 | __set_task_state(task, state); | 239 | __set_task_state(task, state); |
247 | 240 | ||
248 | /* didnt get the lock, go to sleep: */ | 241 | /* didn't get the lock, go to sleep: */ |
249 | spin_unlock_mutex(&lock->wait_lock, flags); | 242 | spin_unlock_mutex(&lock->wait_lock, flags); |
250 | preempt_enable_no_resched(); | 243 | preempt_enable_no_resched(); |
251 | schedule(); | 244 | schedule(); |
diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -19,7 +19,7 @@ | |||
19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_SMP |
20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
21 | { | 21 | { |
22 | lock->owner = current_thread_info(); | 22 | lock->owner = current; |
23 | } | 23 | } |
24 | 24 | ||
25 | static inline void mutex_clear_owner(struct mutex *lock) | 25 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/padata.c b/kernel/padata.c index 751019415d23..b91941df5e63 100644 --- a/kernel/padata.c +++ b/kernel/padata.c | |||
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd) | |||
262 | /* | 262 | /* |
263 | * This cpu has to do the parallel processing of the next | 263 | * This cpu has to do the parallel processing of the next |
264 | * object. It's waiting in the cpu's parallelization queue, | 264 | * object. It's waiting in the cpu's parallelization queue, |
265 | * so exit imediately. | 265 | * so exit immediately. |
266 | */ | 266 | */ |
267 | if (PTR_ERR(padata) == -ENODATA) { | 267 | if (PTR_ERR(padata) == -ENODATA) { |
268 | del_timer(&pd->timer); | 268 | del_timer(&pd->timer); |
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd) | |||
284 | /* | 284 | /* |
285 | * The next object that needs serialization might have arrived to | 285 | * The next object that needs serialization might have arrived to |
286 | * the reorder queues in the meantime, we will be called again | 286 | * the reorder queues in the meantime, we will be called again |
287 | * from the timer function if noone else cares for it. | 287 | * from the timer function if no one else cares for it. |
288 | */ | 288 | */ |
289 | if (atomic_read(&pd->reorder_objects) | 289 | if (atomic_read(&pd->reorder_objects) |
290 | && !(pinst->flags & PADATA_RESET)) | 290 | && !(pinst->flags & PADATA_RESET)) |
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst) | |||
515 | put_online_cpus(); | 515 | put_online_cpus(); |
516 | } | 516 | } |
517 | 517 | ||
518 | /* Replace the internal control stucture with a new one. */ | 518 | /* Replace the internal control structure with a new one. */ |
519 | static void padata_replace(struct padata_instance *pinst, | 519 | static void padata_replace(struct padata_instance *pinst, |
520 | struct parallel_data *pd_new) | 520 | struct parallel_data *pd_new) |
521 | { | 521 | { |
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) | |||
768 | } | 768 | } |
769 | 769 | ||
770 | /** | 770 | /** |
771 | * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) | 771 | * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) |
772 | * padata cpumasks. | 772 | * padata cpumasks. |
773 | * | 773 | * |
774 | * @pinst: padata instance | 774 | * @pinst: padata instance |
diff --git a/kernel/params.c b/kernel/params.c index 0da1411222b9..ed72e1330862 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -95,7 +95,7 @@ static int parse_one(char *param, | |||
95 | /* Find parameter */ | 95 | /* Find parameter */ |
96 | for (i = 0; i < num_params; i++) { | 96 | for (i = 0; i < num_params; i++) { |
97 | if (parameq(param, params[i].name)) { | 97 | if (parameq(param, params[i].name)) { |
98 | /* Noone handled NULL, so do it here. */ | 98 | /* No one handled NULL, so do it here. */ |
99 | if (!val && params[i].ops->set != param_set_bool) | 99 | if (!val && params[i].ops->set != param_set_bool) |
100 | return -EINVAL; | 100 | return -EINVAL; |
101 | DEBUGP("They are equal! Calling %p\n", | 101 | DEBUGP("They are equal! Calling %p\n", |
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); | |||
297 | int param_set_bool(const char *val, const struct kernel_param *kp) | 297 | int param_set_bool(const char *val, const struct kernel_param *kp) |
298 | { | 298 | { |
299 | bool v; | 299 | bool v; |
300 | int ret; | ||
300 | 301 | ||
301 | /* No equals means "set"... */ | 302 | /* No equals means "set"... */ |
302 | if (!val) val = "1"; | 303 | if (!val) val = "1"; |
303 | 304 | ||
304 | /* One of =[yYnN01] */ | 305 | /* One of =[yYnN01] */ |
305 | switch (val[0]) { | 306 | ret = strtobool(val, &v); |
306 | case 'y': case 'Y': case '1': | 307 | if (ret) |
307 | v = true; | 308 | return ret; |
308 | break; | ||
309 | case 'n': case 'N': case '0': | ||
310 | v = false; | ||
311 | break; | ||
312 | default: | ||
313 | return -EINVAL; | ||
314 | } | ||
315 | 309 | ||
316 | if (kp->flags & KPARAM_ISBOOL) | 310 | if (kp->flags & KPARAM_ISBOOL) |
317 | *(bool *)kp->arg = v; | 311 | *(bool *)kp->arg = v; |
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr, | |||
821 | return sprintf(buf, "%s\n", vattr->version); | 815 | return sprintf(buf, "%s\n", vattr->version); |
822 | } | 816 | } |
823 | 817 | ||
824 | extern struct module_version_attribute __start___modver[], __stop___modver[]; | 818 | extern const struct module_version_attribute *__start___modver[]; |
819 | extern const struct module_version_attribute *__stop___modver[]; | ||
825 | 820 | ||
826 | static void __init version_sysfs_builtin(void) | 821 | static void __init version_sysfs_builtin(void) |
827 | { | 822 | { |
828 | const struct module_version_attribute *vattr; | 823 | const struct module_version_attribute **p; |
829 | struct module_kobject *mk; | 824 | struct module_kobject *mk; |
830 | int err; | 825 | int err; |
831 | 826 | ||
832 | for (vattr = __start___modver; vattr < __stop___modver; vattr++) { | 827 | for (p = __start___modver; p < __stop___modver; p++) { |
828 | const struct module_version_attribute *vattr = *p; | ||
829 | |||
833 | mk = locate_module_kobject(vattr->module_name); | 830 | mk = locate_module_kobject(vattr->module_name); |
834 | if (mk) { | 831 | if (mk) { |
835 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | 832 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); |
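Note on the param_set_bool() hunk above: the open-coded switch on =[yYnN01] is replaced with a call to strtobool(). For reference, a self-contained userspace approximation of that helper is sketched below; the real one lives in the kernel's string library, and only its behaviour at this point in time is mirrored here.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace approximation of the kernel's strtobool() helper. */
static int strtobool(const char *s, bool *res)
{
	switch (s[0]) {
	case 'y': case 'Y': case '1':
		*res = true;
		return 0;
	case 'n': case 'N': case '0':
		*res = false;
		return 0;
	default:
		return -EINVAL;
	}
}

int main(void)
{
	bool v;

	if (strtobool("Y", &v) == 0)
		printf("parsed: %d\n", v);	/* prints 1 */
	if (strtobool("junk", &v) != 0)
		printf("rejected\n");
	return 0;
}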
diff --git a/kernel/pid.c b/kernel/pid.c index 02f221274265..57a8346a270e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) | |||
217 | return -1; | 217 | return -1; |
218 | } | 218 | } |
219 | 219 | ||
220 | int next_pidmap(struct pid_namespace *pid_ns, int last) | 220 | int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) |
221 | { | 221 | { |
222 | int offset; | 222 | int offset; |
223 | struct pidmap *map, *end; | 223 | struct pidmap *map, *end; |
224 | 224 | ||
225 | if (last >= PID_MAX_LIMIT) | ||
226 | return -1; | ||
227 | |||
225 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 228 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
226 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; | 229 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
227 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; | 230 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
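Note on the next_pidmap() hunk above: "last" is now unsigned and ultimately comes from a userspace-controlled /proc readdir offset, so the new PID_MAX_LIMIT guard keeps "(last + 1)/BITS_PER_PAGE" from indexing past the pidmap array. A small sketch of the guard, with illustrative constants (the real PID_MAX_LIMIT and page size depend on the configuration):

#include <stdio.h>

#define PAGE_SIZE	4096u
#define BITS_PER_PAGE	(PAGE_SIZE * 8)
#define PID_MAX_LIMIT	32768u		/* illustrative default */

/* Returns the pidmap slot for the pid after 'last', or -1 if out of range. */
static int pidmap_index(unsigned int last)
{
	if (last >= PID_MAX_LIMIT)	/* the check added above */
		return -1;
	return (last + 1) / BITS_PER_PAGE;
}

int main(void)
{
	/* Without the guard, 0x7fffffff would yield index 65535 here. */
	printf("%d %d\n", pidmap_index(100), pidmap_index(0x7fffffffu));
	return 0;
}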
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 67fea9d25d55..0791b13df7bf 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -1347,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1347 | 1347 | ||
1348 | /* | 1348 | /* |
1349 | * Now that all the timers on our list have the firing flag, | 1349 | * Now that all the timers on our list have the firing flag, |
1350 | * noone will touch their list entries but us. We'll take | 1350 | * no one will touch their list entries but us. We'll take |
1351 | * each timer's lock before clearing its firing flag, so no | 1351 | * each timer's lock before clearing its firing flag, so no |
1352 | * timer call will interfere. | 1352 | * timer call will interfere. |
1353 | */ | 1353 | */ |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4c0124919f9a..e5498d7405c3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -313,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr) | |||
313 | * restarted (i.e. we have flagged this in the sys_private entry of the | 313 | * restarted (i.e. we have flagged this in the sys_private entry of the |
314 | * info block). | 314 | * info block). |
315 | * | 315 | * |
316 | * To protect aginst the timer going away while the interrupt is queued, | 316 | * To protect against the timer going away while the interrupt is queued, |
317 | * we require that the it_requeue_pending flag be set. | 317 | * we require that the it_requeue_pending flag be set. |
318 | */ | 318 | */ |
319 | void do_schedule_next_timer(struct siginfo *info) | 319 | void do_schedule_next_timer(struct siginfo *info) |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 4603f08dc47b..87f4d24b55b0 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -18,9 +18,13 @@ config SUSPEND_FREEZER | |||
18 | 18 | ||
19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. | 19 | Turning OFF this setting is NOT recommended! If in doubt, say Y. |
20 | 20 | ||
21 | config HIBERNATE_CALLBACKS | ||
22 | bool | ||
23 | |||
21 | config HIBERNATION | 24 | config HIBERNATION |
22 | bool "Hibernation (aka 'suspend to disk')" | 25 | bool "Hibernation (aka 'suspend to disk')" |
23 | depends on SWAP && ARCH_HIBERNATION_POSSIBLE | 26 | depends on SWAP && ARCH_HIBERNATION_POSSIBLE |
27 | select HIBERNATE_CALLBACKS | ||
24 | select LZO_COMPRESS | 28 | select LZO_COMPRESS |
25 | select LZO_DECOMPRESS | 29 | select LZO_DECOMPRESS |
26 | ---help--- | 30 | ---help--- |
@@ -85,7 +89,7 @@ config PM_STD_PARTITION | |||
85 | 89 | ||
86 | config PM_SLEEP | 90 | config PM_SLEEP |
87 | def_bool y | 91 | def_bool y |
88 | depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE | 92 | depends on SUSPEND || HIBERNATE_CALLBACKS |
89 | 93 | ||
90 | config PM_SLEEP_SMP | 94 | config PM_SLEEP_SMP |
91 | def_bool y | 95 | def_bool y |
@@ -121,12 +125,6 @@ config PM_DEBUG | |||
121 | code. This is helpful when debugging and reporting PM bugs, like | 125 | code. This is helpful when debugging and reporting PM bugs, like |
122 | suspend support. | 126 | suspend support. |
123 | 127 | ||
124 | config PM_VERBOSE | ||
125 | bool "Verbose Power Management debugging" | ||
126 | depends on PM_DEBUG | ||
127 | ---help--- | ||
128 | This option enables verbose messages from the Power Management code. | ||
129 | |||
130 | config PM_ADVANCED_DEBUG | 128 | config PM_ADVANCED_DEBUG |
131 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | 129 | bool "Extra PM attributes in sysfs for low-level debugging/testing" |
132 | depends on PM_DEBUG | 130 | depends on PM_DEBUG |
@@ -225,3 +223,7 @@ config PM_OPP | |||
225 | representing individual voltage domains and provides SOC | 223 | representing individual voltage domains and provides SOC |
226 | implementations a ready to use framework to manage OPPs. | 224 | implementations a ready to use framework to manage OPPs. |
227 | For more information, read <file:Documentation/power/opp.txt> | 225 | For more information, read <file:Documentation/power/opp.txt> |
226 | |||
227 | config PM_RUNTIME_CLK | ||
228 | def_bool y | ||
229 | depends on PM_RUNTIME && HAVE_CLK | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aeabd26e3342..f9bec56d8825 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
@@ -272,9 +272,7 @@ static int create_image(int platform_mode) | |||
272 | 272 | ||
273 | local_irq_disable(); | 273 | local_irq_disable(); |
274 | 274 | ||
275 | error = sysdev_suspend(PMSG_FREEZE); | 275 | error = syscore_suspend(); |
276 | if (!error) | ||
277 | error = syscore_suspend(); | ||
278 | if (error) { | 276 | if (error) { |
279 | printk(KERN_ERR "PM: Some system devices failed to power down, " | 277 | printk(KERN_ERR "PM: Some system devices failed to power down, " |
280 | "aborting hibernation\n"); | 278 | "aborting hibernation\n"); |
@@ -299,7 +297,6 @@ static int create_image(int platform_mode) | |||
299 | 297 | ||
300 | Power_up: | 298 | Power_up: |
301 | syscore_resume(); | 299 | syscore_resume(); |
302 | sysdev_resume(); | ||
303 | /* NOTE: dpm_resume_noirq() is just a resume() for devices | 300 | /* NOTE: dpm_resume_noirq() is just a resume() for devices |
304 | * that suspended with irqs off ... no overall powerup. | 301 | * that suspended with irqs off ... no overall powerup. |
305 | */ | 302 | */ |
@@ -330,20 +327,25 @@ static int create_image(int platform_mode) | |||
330 | 327 | ||
331 | int hibernation_snapshot(int platform_mode) | 328 | int hibernation_snapshot(int platform_mode) |
332 | { | 329 | { |
330 | pm_message_t msg = PMSG_RECOVER; | ||
333 | int error; | 331 | int error; |
334 | 332 | ||
335 | error = platform_begin(platform_mode); | 333 | error = platform_begin(platform_mode); |
336 | if (error) | 334 | if (error) |
337 | goto Close; | 335 | goto Close; |
338 | 336 | ||
337 | error = dpm_prepare(PMSG_FREEZE); | ||
338 | if (error) | ||
339 | goto Complete_devices; | ||
340 | |||
339 | /* Preallocate image memory before shutting down devices. */ | 341 | /* Preallocate image memory before shutting down devices. */ |
340 | error = hibernate_preallocate_memory(); | 342 | error = hibernate_preallocate_memory(); |
341 | if (error) | 343 | if (error) |
342 | goto Close; | 344 | goto Complete_devices; |
343 | 345 | ||
344 | suspend_console(); | 346 | suspend_console(); |
345 | pm_restrict_gfp_mask(); | 347 | pm_restrict_gfp_mask(); |
346 | error = dpm_suspend_start(PMSG_FREEZE); | 348 | error = dpm_suspend(PMSG_FREEZE); |
347 | if (error) | 349 | if (error) |
348 | goto Recover_platform; | 350 | goto Recover_platform; |
349 | 351 | ||
@@ -361,13 +363,17 @@ int hibernation_snapshot(int platform_mode) | |||
361 | if (error || !in_suspend) | 363 | if (error || !in_suspend) |
362 | swsusp_free(); | 364 | swsusp_free(); |
363 | 365 | ||
364 | dpm_resume_end(in_suspend ? | 366 | msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; |
365 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 367 | dpm_resume(msg); |
366 | 368 | ||
367 | if (error || !in_suspend) | 369 | if (error || !in_suspend) |
368 | pm_restore_gfp_mask(); | 370 | pm_restore_gfp_mask(); |
369 | 371 | ||
370 | resume_console(); | 372 | resume_console(); |
373 | |||
374 | Complete_devices: | ||
375 | dpm_complete(msg); | ||
376 | |||
371 | Close: | 377 | Close: |
372 | platform_end(platform_mode); | 378 | platform_end(platform_mode); |
373 | return error; | 379 | return error; |
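Note on the hibernation_snapshot() hunk above: dpm_suspend_start() is split into dpm_prepare() plus dpm_suspend() so that dpm_complete() can be reached on every exit path, including a preallocation failure, and "msg" defaults to PMSG_RECOVER until the outcome is known. The skeleton below only illustrates that prepare/complete pairing; all helpers are stubs, the platform and console handling is omitted, and nothing here is the real PM core API.

#include <stdio.h>

enum pm_msg { PMSG_FREEZE, PMSG_THAW, PMSG_RESTORE, PMSG_RECOVER };

static int dpm_prepare(enum pm_msg m)   { (void)m; return 0; }
static int dpm_suspend(enum pm_msg m)   { (void)m; return 0; }
static void dpm_resume(enum pm_msg m)   { (void)m; }
static void dpm_complete(enum pm_msg m) { (void)m; }
static int preallocate(int fail)        { return fail ? -1 : 0; }

static int snapshot_skeleton(int fail_prealloc)
{
	enum pm_msg msg = PMSG_RECOVER;	/* worst-case default */
	int error;

	error = dpm_prepare(PMSG_FREEZE);
	if (error)
		goto Complete_devices;

	error = preallocate(fail_prealloc);
	if (error)
		goto Complete_devices;	/* ->complete() still runs */

	error = dpm_suspend(PMSG_FREEZE);
	if (!error) {
		/* ... snapshot the system here ... */
		msg = PMSG_THAW;	/* or PMSG_RESTORE/PMSG_RECOVER */
		dpm_resume(msg);
	}

Complete_devices:
	dpm_complete(msg);
	return error;
}

int main(void)
{
	printf("%d %d\n", snapshot_skeleton(0), snapshot_skeleton(1));
	return 0;
}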
@@ -406,9 +412,7 @@ static int resume_target_kernel(bool platform_mode) | |||
406 | 412 | ||
407 | local_irq_disable(); | 413 | local_irq_disable(); |
408 | 414 | ||
409 | error = sysdev_suspend(PMSG_QUIESCE); | 415 | error = syscore_suspend(); |
410 | if (!error) | ||
411 | error = syscore_suspend(); | ||
412 | if (error) | 416 | if (error) |
413 | goto Enable_irqs; | 417 | goto Enable_irqs; |
414 | 418 | ||
@@ -436,7 +440,6 @@ static int resume_target_kernel(bool platform_mode) | |||
436 | touch_softlockup_watchdog(); | 440 | touch_softlockup_watchdog(); |
437 | 441 | ||
438 | syscore_resume(); | 442 | syscore_resume(); |
439 | sysdev_resume(); | ||
440 | 443 | ||
441 | Enable_irqs: | 444 | Enable_irqs: |
442 | local_irq_enable(); | 445 | local_irq_enable(); |
@@ -522,7 +525,6 @@ int hibernation_platform_enter(void) | |||
522 | goto Platform_finish; | 525 | goto Platform_finish; |
523 | 526 | ||
524 | local_irq_disable(); | 527 | local_irq_disable(); |
525 | sysdev_suspend(PMSG_HIBERNATE); | ||
526 | syscore_suspend(); | 528 | syscore_suspend(); |
527 | if (pm_wakeup_pending()) { | 529 | if (pm_wakeup_pending()) { |
528 | error = -EAGAIN; | 530 | error = -EAGAIN; |
@@ -535,7 +537,6 @@ int hibernation_platform_enter(void) | |||
535 | 537 | ||
536 | Power_up: | 538 | Power_up: |
537 | syscore_resume(); | 539 | syscore_resume(); |
538 | sysdev_resume(); | ||
539 | local_irq_enable(); | 540 | local_irq_enable(); |
540 | enable_nonboot_cpus(); | 541 | enable_nonboot_cpus(); |
541 | 542 | ||
@@ -976,10 +977,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att | |||
976 | 977 | ||
977 | power_attr(image_size); | 978 | power_attr(image_size); |
978 | 979 | ||
980 | static ssize_t reserved_size_show(struct kobject *kobj, | ||
981 | struct kobj_attribute *attr, char *buf) | ||
982 | { | ||
983 | return sprintf(buf, "%lu\n", reserved_size); | ||
984 | } | ||
985 | |||
986 | static ssize_t reserved_size_store(struct kobject *kobj, | ||
987 | struct kobj_attribute *attr, | ||
988 | const char *buf, size_t n) | ||
989 | { | ||
990 | unsigned long size; | ||
991 | |||
992 | if (sscanf(buf, "%lu", &size) == 1) { | ||
993 | reserved_size = size; | ||
994 | return n; | ||
995 | } | ||
996 | |||
997 | return -EINVAL; | ||
998 | } | ||
999 | |||
1000 | power_attr(reserved_size); | ||
1001 | |||
979 | static struct attribute * g[] = { | 1002 | static struct attribute * g[] = { |
980 | &disk_attr.attr, | 1003 | &disk_attr.attr, |
981 | &resume_attr.attr, | 1004 | &resume_attr.attr, |
982 | &image_size_attr.attr, | 1005 | &image_size_attr.attr, |
1006 | &reserved_size_attr.attr, | ||
983 | NULL, | 1007 | NULL, |
984 | }; | 1008 | }; |
985 | 1009 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 8eaba5f27b10..2981af4ce7cb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -224,7 +224,7 @@ power_attr(state); | |||
224 | * writing to 'state'. It first should read from 'wakeup_count' and store | 224 | * writing to 'state'. It first should read from 'wakeup_count' and store |
225 | * the read value. Then, after carrying out its own preparations for the system | 225 | * the read value. Then, after carrying out its own preparations for the system |
226 | * transition to a sleep state, it should write the stored value to | 226 | * transition to a sleep state, it should write the stored value to |
227 | * 'wakeup_count'. If that fails, at least one wakeup event has occured since | 227 | * 'wakeup_count'. If that fails, at least one wakeup event has occurred since |
228 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it | 228 | * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it |
229 | * is allowed to write to 'state', but the transition will be aborted if there | 229 | * is allowed to write to 'state', but the transition will be aborted if there |
230 | * are any wakeup events detected after 'wakeup_count' was written to. | 230 | * are any wakeup events detected after 'wakeup_count' was written to. |
@@ -337,6 +337,7 @@ static int __init pm_init(void) | |||
337 | if (error) | 337 | if (error) |
338 | return error; | 338 | return error; |
339 | hibernate_image_size_init(); | 339 | hibernate_image_size_init(); |
340 | hibernate_reserved_size_init(); | ||
340 | power_kobj = kobject_create_and_add("power", NULL); | 341 | power_kobj = kobject_create_and_add("power", NULL); |
341 | if (!power_kobj) | 342 | if (!power_kobj) |
342 | return -ENOMEM; | 343 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 03634be55f62..9a00a0a26280 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -15,6 +15,7 @@ struct swsusp_info { | |||
15 | 15 | ||
16 | #ifdef CONFIG_HIBERNATION | 16 | #ifdef CONFIG_HIBERNATION |
17 | /* kernel/power/snapshot.c */ | 17 | /* kernel/power/snapshot.c */ |
18 | extern void __init hibernate_reserved_size_init(void); | ||
18 | extern void __init hibernate_image_size_init(void); | 19 | extern void __init hibernate_image_size_init(void); |
19 | 20 | ||
20 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER | 21 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER |
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void); | |||
55 | 56 | ||
56 | #else /* !CONFIG_HIBERNATION */ | 57 | #else /* !CONFIG_HIBERNATION */ |
57 | 58 | ||
59 | static inline void hibernate_reserved_size_init(void) {} | ||
58 | static inline void hibernate_image_size_init(void) {} | 60 | static inline void hibernate_image_size_init(void) {} |
59 | #endif /* !CONFIG_HIBERNATION */ | 61 | #endif /* !CONFIG_HIBERNATION */ |
60 | 62 | ||
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \ | |||
72 | 74 | ||
73 | /* Preferred image size in bytes (default 500 MB) */ | 75 | /* Preferred image size in bytes (default 500 MB) */ |
74 | extern unsigned long image_size; | 76 | extern unsigned long image_size; |
77 | /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ | ||
78 | extern unsigned long reserved_size; | ||
75 | extern int in_suspend; | 79 | extern int in_suspend; |
76 | extern dev_t swsusp_resume_device; | 80 | extern dev_t swsusp_resume_device; |
77 | extern sector_t swsusp_resume_block; | 81 | extern sector_t swsusp_resume_block; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ca0aacc24874..ace55889f702 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *); | |||
41 | static void swsusp_unset_page_forbidden(struct page *); | 41 | static void swsusp_unset_page_forbidden(struct page *); |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Number of bytes to reserve for memory allocations made by device drivers | ||
45 | * from their ->freeze() and ->freeze_noirq() callbacks so that they don't | ||
46 | * cause image creation to fail (tunable via /sys/power/reserved_size). | ||
47 | */ | ||
48 | unsigned long reserved_size; | ||
49 | |||
50 | void __init hibernate_reserved_size_init(void) | ||
51 | { | ||
52 | reserved_size = SPARE_PAGES * PAGE_SIZE; | ||
53 | } | ||
54 | |||
55 | /* | ||
44 | * Preferred image size in bytes (tunable via /sys/power/image_size). | 56 | * Preferred image size in bytes (tunable via /sys/power/image_size). |
45 | * When it is set to N, the image creating code will do its best to | 57 | * When it is set to N, swsusp will do its best to ensure the image |
46 | * ensure the image size will not exceed N bytes, but if that is | 58 | * size will not exceed N bytes, but if that is impossible, it will |
47 | * impossible, it will try to create the smallest image possible. | 59 | * try to create the smallest image possible. |
48 | */ | 60 | */ |
49 | unsigned long image_size; | 61 | unsigned long image_size; |
50 | 62 | ||
51 | void __init hibernate_image_size_init(void) | 63 | void __init hibernate_image_size_init(void) |
52 | { | 64 | { |
53 | image_size = (totalram_pages / 3) * PAGE_SIZE; | 65 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; |
54 | } | 66 | } |
55 | 67 | ||
56 | /* List of PBEs needed for restoring the pages that were allocated before | 68 | /* List of PBEs needed for restoring the pages that were allocated before |
@@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
1263 | * frame in use. We also need a number of page frames to be free during | 1275 | * frame in use. We also need a number of page frames to be free during |
1264 | * hibernation for allocations made while saving the image and for device | 1276 | * hibernation for allocations made while saving the image and for device |
1265 | * drivers, in case they need to allocate memory from their hibernation | 1277 | * drivers, in case they need to allocate memory from their hibernation |
1266 | * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, | 1278 | * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough |
1267 | * respectively, both of which are rough estimates). To make this happen, we | 1279 | * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through |
1268 | * compute the total number of available page frames and allocate at least | 1280 | * /sys/power/reserved_size, respectively). To make this happen, we compute the |
1281 | * total number of available page frames and allocate at least | ||
1269 | * | 1282 | * |
1270 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES | 1283 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 |
1284 | * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) | ||
1271 | * | 1285 | * |
1272 | * of them, which corresponds to the maximum size of a hibernation image. | 1286 | * of them, which corresponds to the maximum size of a hibernation image. |
1273 | * | 1287 | * |
@@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void) | |||
1322 | count -= totalreserve_pages; | 1336 | count -= totalreserve_pages; |
1323 | 1337 | ||
1324 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1338 | /* Compute the maximum number of saveable pages to leave in memory. */ |
1325 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; | 1339 | max_size = (count - (size + PAGES_FOR_IO)) / 2 |
1340 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); | ||
1326 | /* Compute the desired number of image pages specified by image_size. */ | 1341 | /* Compute the desired number of image pages specified by image_size. */ |
1327 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); | 1342 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); |
1328 | if (size > max_size) | 1343 | if (size > max_size) |
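Note on the hibernate_preallocate_memory() hunk above: the fixed 2 * SPARE_PAGES slack is replaced by a value derived from the runtime-tunable reserved_size. A standalone worked example of the new max_size expression is shown below; the page counts and the PAGES_FOR_IO value are illustrative only, as the real numbers depend on the machine and configuration.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGES_FOR_IO	1024UL			/* illustrative */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long count = 1000000;		/* usable page frames (~4 GB) */
	unsigned long size = 2000;		/* image metadata pages */
	unsigned long reserved_size = 1024 * 1024;	/* 1 MB for drivers */

	unsigned long max_size = (count - (size + PAGES_FOR_IO)) / 2
		- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);

	/* (1000000 - 3024)/2 - 2*256 = 498488 - 512 = 497976 */
	printf("pages that may stay in memory: %lu\n", max_size);
	return 0;
}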
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 2814c32aed51..1c41ba215419 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
@@ -163,16 +163,13 @@ static int suspend_enter(suspend_state_t state) | |||
163 | arch_suspend_disable_irqs(); | 163 | arch_suspend_disable_irqs(); |
164 | BUG_ON(!irqs_disabled()); | 164 | BUG_ON(!irqs_disabled()); |
165 | 165 | ||
166 | error = sysdev_suspend(PMSG_SUSPEND); | 166 | error = syscore_suspend(); |
167 | if (!error) | ||
168 | error = syscore_suspend(); | ||
169 | if (!error) { | 167 | if (!error) { |
170 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { | 168 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { |
171 | error = suspend_ops->enter(state); | 169 | error = suspend_ops->enter(state); |
172 | events_check_enabled = false; | 170 | events_check_enabled = false; |
173 | } | 171 | } |
174 | syscore_resume(); | 172 | syscore_resume(); |
175 | sysdev_resume(); | ||
176 | } | 173 | } |
177 | 174 | ||
178 | arch_suspend_enable_irqs(); | 175 | arch_suspend_enable_irqs(); |
@@ -213,7 +210,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
213 | goto Close; | 210 | goto Close; |
214 | } | 211 | } |
215 | suspend_console(); | 212 | suspend_console(); |
216 | pm_restrict_gfp_mask(); | ||
217 | suspend_test_start(); | 213 | suspend_test_start(); |
218 | error = dpm_suspend_start(PMSG_SUSPEND); | 214 | error = dpm_suspend_start(PMSG_SUSPEND); |
219 | if (error) { | 215 | if (error) { |
@@ -224,13 +220,12 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
224 | if (suspend_test(TEST_DEVICES)) | 220 | if (suspend_test(TEST_DEVICES)) |
225 | goto Recover_platform; | 221 | goto Recover_platform; |
226 | 222 | ||
227 | suspend_enter(state); | 223 | error = suspend_enter(state); |
228 | 224 | ||
229 | Resume_devices: | 225 | Resume_devices: |
230 | suspend_test_start(); | 226 | suspend_test_start(); |
231 | dpm_resume_end(PMSG_RESUME); | 227 | dpm_resume_end(PMSG_RESUME); |
232 | suspend_test_finish("resume devices"); | 228 | suspend_test_finish("resume devices"); |
233 | pm_restore_gfp_mask(); | ||
234 | resume_console(); | 229 | resume_console(); |
235 | Close: | 230 | Close: |
236 | if (suspend_ops->end) | 231 | if (suspend_ops->end) |
@@ -291,7 +286,9 @@ int enter_state(suspend_state_t state) | |||
291 | goto Finish; | 286 | goto Finish; |
292 | 287 | ||
293 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 288 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
289 | pm_restrict_gfp_mask(); | ||
294 | error = suspend_devices_and_enter(state); | 290 | error = suspend_devices_and_enter(state); |
291 | pm_restore_gfp_mask(); | ||
295 | 292 | ||
296 | Finish: | 293 | Finish: |
297 | pr_debug("PM: Finishing wakeup.\n"); | 294 | pr_debug("PM: Finishing wakeup.\n"); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index c36c3b9e8a84..7d02d33be699 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
135 | free_basic_memory_bitmaps(); | 135 | free_basic_memory_bitmaps(); |
136 | data = filp->private_data; | 136 | data = filp->private_data; |
137 | free_all_swap_pages(data->swap); | 137 | free_all_swap_pages(data->swap); |
138 | if (data->frozen) | 138 | if (data->frozen) { |
139 | pm_restore_gfp_mask(); | ||
139 | thaw_processes(); | 140 | thaw_processes(); |
141 | } | ||
140 | pm_notifier_call_chain(data->mode == O_RDONLY ? | 142 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 143 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
142 | atomic_inc(&snapshot_device_available); | 144 | atomic_inc(&snapshot_device_available); |
@@ -379,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
379 | * PM_HIBERNATION_PREPARE | 381 | * PM_HIBERNATION_PREPARE |
380 | */ | 382 | */ |
381 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); | 383 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); |
384 | data->ready = 0; | ||
382 | break; | 385 | break; |
383 | 386 | ||
384 | case SNAPSHOT_PLATFORM_SUPPORT: | 387 | case SNAPSHOT_PLATFORM_SUPPORT: |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 512bd017218d..7a81fc071344 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
23 | #include <linux/uaccess.h> | 23 | #include <linux/uaccess.h> |
24 | #include <linux/regset.h> | 24 | #include <linux/regset.h> |
25 | #include <linux/hw_breakpoint.h> | ||
25 | 26 | ||
26 | 27 | ||
27 | /* | 28 | /* |
@@ -923,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
923 | return ret; | 924 | return ret; |
924 | } | 925 | } |
925 | #endif /* CONFIG_COMPAT */ | 926 | #endif /* CONFIG_COMPAT */ |
927 | |||
928 | #ifdef CONFIG_HAVE_HW_BREAKPOINT | ||
929 | int ptrace_get_breakpoints(struct task_struct *tsk) | ||
930 | { | ||
931 | if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) | ||
932 | return 0; | ||
933 | |||
934 | return -1; | ||
935 | } | ||
936 | |||
937 | void ptrace_put_breakpoints(struct task_struct *tsk) | ||
938 | { | ||
939 | if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) | ||
940 | flush_ptrace_hw_breakpoint(tsk); | ||
941 | } | ||
942 | #endif /* CONFIG_HAVE_HW_BREAKPOINT */ | ||
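Note on the ptrace hunk above: ptrace_get_breakpoints()/ptrace_put_breakpoints() implement a "get fails once the count has reached zero; the last put tears down" reference count around the task's hardware breakpoints. A minimal userspace sketch of that atomic pattern follows, using C11 atomics in place of the kernel's atomic_t helpers; the initial count of 1 stands in for the reference the task itself holds.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int bp_refcnt = 1;	/* the task's own reference */

/* atomic_inc_not_zero(): take a reference only if one still exists. */
static bool get_breakpoints(void)
{
	int old = atomic_load(&bp_refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&bp_refcnt, &old, old + 1))
			return true;	/* got a reference */
	}
	return false;			/* already torn down */
}

/* atomic_dec_and_test(): whoever drops the last reference cleans up. */
static void put_breakpoints(void)
{
	if (atomic_fetch_sub(&bp_refcnt, 1) == 1)
		printf("last reference: flush hw breakpoints here\n");
}

int main(void)
{
	if (get_breakpoints()) {
		/* ... inspect or modify the breakpoints ... */
		put_breakpoints();
	}
	put_breakpoints();		/* task exit drops the initial ref */
	return 0;
}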
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f3240e987928..7784bd216b6a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | |||
142 | * Ensure that queued callbacks are all executed. | 142 | * Ensure that queued callbacks are all executed. |
143 | * If we detect that we are nested in a RCU read-side critical | 143 | * If we detect that we are nested in a RCU read-side critical |
144 | * section, we should simply fail, otherwise we would deadlock. | 144 | * section, we should simply fail, otherwise we would deadlock. |
145 | * In !PREEMPT configurations, there is no way to tell if we are | ||
146 | * in a RCU read-side critical section or not, so we never | ||
147 | * attempt any fixup and just print a warning. | ||
145 | */ | 148 | */ |
149 | #ifndef CONFIG_PREEMPT | ||
150 | WARN_ON_ONCE(1); | ||
151 | return 0; | ||
152 | #endif | ||
146 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 153 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
147 | irqs_disabled()) { | 154 | irqs_disabled()) { |
148 | WARN_ON(1); | 155 | WARN_ON_ONCE(1); |
149 | return 0; | 156 | return 0; |
150 | } | 157 | } |
151 | rcu_barrier(); | 158 | rcu_barrier(); |
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
184 | * Ensure that queued callbacks are all executed. | 191 | * Ensure that queued callbacks are all executed. |
185 | * If we detect that we are nested in a RCU read-side critical | 192 | * If we detect that we are nested in a RCU read-side critical |
186 | * section, we should simply fail, otherwise we would deadlock. | 193 | * section, we should simply fail, otherwise we would deadlock. |
194 | * In !PREEMPT configurations, there is no way to tell if we are | ||
195 | * in a RCU read-side critical section or not, so we never | ||
196 | * attempt any fixup and just print a warning. | ||
187 | */ | 197 | */ |
198 | #ifndef CONFIG_PREEMPT | ||
199 | WARN_ON_ONCE(1); | ||
200 | return 0; | ||
201 | #endif | ||
188 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 202 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
189 | irqs_disabled()) { | 203 | irqs_disabled()) { |
190 | WARN_ON(1); | 204 | WARN_ON_ONCE(1); |
191 | return 0; | 205 | return 0; |
192 | } | 206 | } |
193 | rcu_barrier(); | 207 | rcu_barrier(); |
@@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
214 | * Ensure that queued callbacks are all executed. | 228 | * Ensure that queued callbacks are all executed. |
215 | * If we detect that we are nested in a RCU read-side critical | 229 | * If we detect that we are nested in a RCU read-side critical |
216 | * section, we should simply fail, otherwise we would deadlock. | 230 | * section, we should simply fail, otherwise we would deadlock. |
217 | * Note that the machinery to reliably determine whether | 231 | * In !PREEMPT configurations, there is no way to tell if we are |
218 | * or not we are in an RCU read-side critical section | 232 | * in a RCU read-side critical section or not, so we never |
219 | * exists only in the preemptible RCU implementations | 233 | * attempt any fixup and just print a warning. |
220 | * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why | ||
221 | * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. | ||
222 | */ | 234 | */ |
235 | #ifndef CONFIG_PREEMPT | ||
236 | WARN_ON_ONCE(1); | ||
237 | return 0; | ||
238 | #endif | ||
223 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 239 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
224 | irqs_disabled()) { | 240 | irqs_disabled()) { |
225 | WARN_ON(1); | 241 | WARN_ON_ONCE(1); |
226 | return 0; | 242 | return 0; |
227 | } | 243 | } |
228 | rcu_barrier(); | 244 | rcu_barrier(); |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9a46d5..7bbac7d0f5ab 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
@@ -35,15 +35,16 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
38 | #include <linux/prefetch.h> | ||
38 | 39 | ||
39 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | 40 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ |
40 | static struct task_struct *rcu_kthread_task; | 41 | static struct task_struct *rcu_kthread_task; |
41 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | 42 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); |
42 | static unsigned long have_rcu_kthread_work; | 43 | static unsigned long have_rcu_kthread_work; |
43 | static void invoke_rcu_kthread(void); | ||
44 | 44 | ||
45 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
46 | struct rcu_ctrlblk; | 46 | struct rcu_ctrlblk; |
47 | static void invoke_rcu_kthread(void); | ||
47 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
48 | static int rcu_kthread(void *arg); | 49 | static int rcu_kthread(void *arg); |
49 | static void __call_rcu(struct rcu_head *head, | 50 | static void __call_rcu(struct rcu_head *head, |
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void) | |||
79 | #endif /* #ifdef CONFIG_NO_HZ */ | 80 | #endif /* #ifdef CONFIG_NO_HZ */ |
80 | 81 | ||
81 | /* | 82 | /* |
82 | * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). | 83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
83 | * Also disable irqs to avoid confusion due to interrupt handlers | 84 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
84 | * invoking call_rcu(). | 85 | * invoking call_rcu(). |
85 | */ | 86 | */ |
86 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 87 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
87 | { | 88 | { |
88 | unsigned long flags; | ||
89 | |||
90 | local_irq_save(flags); | ||
91 | if (rcp->rcucblist != NULL && | 89 | if (rcp->rcucblist != NULL && |
92 | rcp->donetail != rcp->curtail) { | 90 | rcp->donetail != rcp->curtail) { |
93 | rcp->donetail = rcp->curtail; | 91 | rcp->donetail = rcp->curtail; |
94 | local_irq_restore(flags); | ||
95 | return 1; | 92 | return 1; |
96 | } | 93 | } |
97 | local_irq_restore(flags); | ||
98 | 94 | ||
99 | return 0; | 95 | return 0; |
100 | } | 96 | } |
101 | 97 | ||
102 | /* | 98 | /* |
99 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
100 | * or to boost readers. | ||
101 | */ | ||
102 | static void invoke_rcu_kthread(void) | ||
103 | { | ||
104 | have_rcu_kthread_work = 1; | ||
105 | wake_up(&rcu_kthread_wq); | ||
106 | } | ||
107 | |||
108 | /* | ||
103 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | 109 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we |
104 | * are at it, given that any rcu quiescent state is also an rcu_bh | 110 | * are at it, given that any rcu quiescent state is also an rcu_bh |
105 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 111 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
106 | */ | 112 | */ |
107 | void rcu_sched_qs(int cpu) | 113 | void rcu_sched_qs(int cpu) |
108 | { | 114 | { |
115 | unsigned long flags; | ||
116 | |||
117 | local_irq_save(flags); | ||
109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 118 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
111 | invoke_rcu_kthread(); | 120 | invoke_rcu_kthread(); |
121 | local_irq_restore(flags); | ||
112 | } | 122 | } |
113 | 123 | ||
114 | /* | 124 | /* |
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu) | |||
116 | */ | 126 | */ |
117 | void rcu_bh_qs(int cpu) | 127 | void rcu_bh_qs(int cpu) |
118 | { | 128 | { |
129 | unsigned long flags; | ||
130 | |||
131 | local_irq_save(flags); | ||
119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 132 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
120 | invoke_rcu_kthread(); | 133 | invoke_rcu_kthread(); |
134 | local_irq_restore(flags); | ||
121 | } | 135 | } |
122 | 136 | ||
123 | /* | 137 | /* |
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
167 | prefetch(next); | 181 | prefetch(next); |
168 | debug_rcu_head_unqueue(list); | 182 | debug_rcu_head_unqueue(list); |
169 | local_bh_disable(); | 183 | local_bh_disable(); |
170 | list->func(list); | 184 | __rcu_reclaim(list); |
171 | local_bh_enable(); | 185 | local_bh_enable(); |
172 | list = next; | 186 | list = next; |
173 | RCU_TRACE(cb_count++); | 187 | RCU_TRACE(cb_count++); |
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg) | |||
208 | } | 222 | } |
209 | 223 | ||
210 | /* | 224 | /* |
211 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
212 | * or to boost readers. | ||
213 | */ | ||
214 | static void invoke_rcu_kthread(void) | ||
215 | { | ||
216 | unsigned long flags; | ||
217 | |||
218 | local_irq_save(flags); | ||
219 | have_rcu_kthread_work = 1; | ||
220 | wake_up(&rcu_kthread_wq); | ||
221 | local_irq_restore(flags); | ||
222 | } | ||
223 | |||
224 | /* | ||
225 | * Wait for a grace period to elapse. But it is illegal to invoke | 225 | * Wait for a grace period to elapse. But it is illegal to invoke |
226 | * synchronize_sched() from within an RCU read-side critical section. | 226 | * synchronize_sched() from within an RCU read-side critical section. |
227 | * Therefore, any legal call to synchronize_sched() is a quiescent | 227 | * Therefore, any legal call to synchronize_sched() is a quiescent |
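Note on the rcutiny.c hunks above: rcu_qsctr_help() no longer saves and restores irqs itself; rcu_sched_qs() and rcu_bh_qs() now disable irqs once around one or two helper calls, and invoke_rcu_kthread() (moved earlier in the file) relies on its callers having irqs off. The sketch below only illustrates that "caller owns the irq protection" refactor; irq control is faked with a flag and all names are illustrative.

#include <stdbool.h>
#include <stdio.h>

static bool irqs_disabled_flag;

static void fake_irq_save(void)    { irqs_disabled_flag = true; }
static void fake_irq_restore(void) { irqs_disabled_flag = false; }

static int qsctr_help(void)
{
	/* Relies on the caller having disabled irqs already. */
	return irqs_disabled_flag ? 1 : 0;
}

static void invoke_kthread(void)
{
	printf("wake callback kthread (irqs off: %d)\n", irqs_disabled_flag);
}

static void sched_qs(void)
{
	fake_irq_save();
	if (qsctr_help() + qsctr_help())	/* "+" defeats short-circuiting */
		invoke_kthread();
	fake_irq_restore();
}

int main(void)
{
	sched_qs();
	return 0;
}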
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3cb8e362e883..f259c676195f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk { | |||
100 | u8 completed; /* Last grace period completed. */ | 100 | u8 completed; /* Last grace period completed. */ |
101 | /* If all three are equal, RCU is idle. */ | 101 | /* If all three are equal, RCU is idle. */ |
102 | #ifdef CONFIG_RCU_BOOST | 102 | #ifdef CONFIG_RCU_BOOST |
103 | s8 boosted_this_gp; /* Has boosting already happened? */ | ||
104 | unsigned long boost_time; /* When to start boosting (jiffies) */ | 103 | unsigned long boost_time; /* When to start boosting (jiffies) */ |
105 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 104 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
106 | #ifdef CONFIG_RCU_TRACE | 105 | #ifdef CONFIG_RCU_TRACE |
107 | unsigned long n_grace_periods; | 106 | unsigned long n_grace_periods; |
108 | #ifdef CONFIG_RCU_BOOST | 107 | #ifdef CONFIG_RCU_BOOST |
109 | unsigned long n_tasks_boosted; | 108 | unsigned long n_tasks_boosted; |
109 | /* Total number of tasks boosted. */ | ||
110 | unsigned long n_exp_boosts; | 110 | unsigned long n_exp_boosts; |
111 | /* Number of tasks boosted for expedited GP. */ | ||
111 | unsigned long n_normal_boosts; | 112 | unsigned long n_normal_boosts; |
112 | unsigned long n_normal_balk_blkd_tasks; | 113 | /* Number of tasks boosted for normal GP. */ |
113 | unsigned long n_normal_balk_gp_tasks; | 114 | unsigned long n_balk_blkd_tasks; |
114 | unsigned long n_normal_balk_boost_tasks; | 115 | /* Refused to boost: no blocked tasks. */ |
115 | unsigned long n_normal_balk_boosted; | 116 | unsigned long n_balk_exp_gp_tasks; |
116 | unsigned long n_normal_balk_notyet; | 117 | /* Refused to boost: nothing blocking GP. */ |
117 | unsigned long n_normal_balk_nos; | 118 | unsigned long n_balk_boost_tasks; |
118 | unsigned long n_exp_balk_blkd_tasks; | 119 | /* Refused to boost: already boosting. */ |
119 | unsigned long n_exp_balk_nos; | 120 | unsigned long n_balk_notyet; |
121 | /* Refused to boost: not yet time. */ | ||
122 | unsigned long n_balk_nos; | ||
123 | /* Refused to boost: not sure why, though. */ | ||
124 | /* This can happen due to race conditions. */ | ||
120 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 125 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
121 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 126 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
122 | }; | 127 | }; |
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t) | |||
201 | 206 | ||
202 | #ifdef CONFIG_RCU_BOOST | 207 | #ifdef CONFIG_RCU_BOOST |
203 | static void rcu_initiate_boost_trace(void); | 208 | static void rcu_initiate_boost_trace(void); |
204 | static void rcu_initiate_exp_boost_trace(void); | ||
205 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 209 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
206 | 210 | ||
207 | /* | 211 | /* |
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
219 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | 223 | "N."[!rcu_preempt_ctrlblk.gp_tasks], |
220 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | 224 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); |
221 | #ifdef CONFIG_RCU_BOOST | 225 | #ifdef CONFIG_RCU_BOOST |
222 | seq_printf(m, " ttb=%c btg=", | 226 | seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", |
223 | "B."[!rcu_preempt_ctrlblk.boost_tasks]); | 227 | " ", |
224 | switch (rcu_preempt_ctrlblk.boosted_this_gp) { | 228 | "B."[!rcu_preempt_ctrlblk.boost_tasks], |
225 | case -1: | ||
226 | seq_puts(m, "exp"); | ||
227 | break; | ||
228 | case 0: | ||
229 | seq_puts(m, "no"); | ||
230 | break; | ||
231 | case 1: | ||
232 | seq_puts(m, "begun"); | ||
233 | break; | ||
234 | case 2: | ||
235 | seq_puts(m, "done"); | ||
236 | break; | ||
237 | default: | ||
238 | seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | ||
239 | } | ||
240 | seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
241 | rcu_preempt_ctrlblk.n_tasks_boosted, | 229 | rcu_preempt_ctrlblk.n_tasks_boosted, |
242 | rcu_preempt_ctrlblk.n_exp_boosts, | 230 | rcu_preempt_ctrlblk.n_exp_boosts, |
243 | rcu_preempt_ctrlblk.n_normal_boosts, | 231 | rcu_preempt_ctrlblk.n_normal_boosts, |
244 | (int)(jiffies & 0xffff), | 232 | (int)(jiffies & 0xffff), |
245 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | 233 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); |
246 | seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | 234 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", |
247 | "normal balk", | 235 | " balk", |
248 | rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | 236 | rcu_preempt_ctrlblk.n_balk_blkd_tasks, |
249 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | 237 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, |
250 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | 238 | rcu_preempt_ctrlblk.n_balk_boost_tasks, |
251 | rcu_preempt_ctrlblk.n_normal_balk_boosted, | 239 | rcu_preempt_ctrlblk.n_balk_notyet, |
252 | rcu_preempt_ctrlblk.n_normal_balk_notyet, | 240 | rcu_preempt_ctrlblk.n_balk_nos); |
253 | rcu_preempt_ctrlblk.n_normal_balk_nos); | ||
254 | seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | ||
255 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | ||
256 | rcu_preempt_ctrlblk.n_exp_balk_nos); | ||
257 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 241 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
258 | } | 242 | } |
259 | 243 | ||
@@ -271,25 +255,59 @@ static int rcu_boost(void) | |||
271 | { | 255 | { |
272 | unsigned long flags; | 256 | unsigned long flags; |
273 | struct rt_mutex mtx; | 257 | struct rt_mutex mtx; |
274 | struct list_head *np; | ||
275 | struct task_struct *t; | 258 | struct task_struct *t; |
259 | struct list_head *tb; | ||
276 | 260 | ||
277 | if (rcu_preempt_ctrlblk.boost_tasks == NULL) | 261 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && |
262 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
278 | return 0; /* Nothing to boost. */ | 263 | return 0; /* Nothing to boost. */ |
264 | |||
279 | raw_local_irq_save(flags); | 265 | raw_local_irq_save(flags); |
280 | rcu_preempt_ctrlblk.boosted_this_gp++; | 266 | |
281 | t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | 267 | /* |
282 | rcu_node_entry); | 268 | * Recheck with irqs disabled: all tasks in need of boosting |
283 | np = rcu_next_node_entry(t); | 269 | * might exit their RCU read-side critical sections on their own |
270 | * if we are preempted just before disabling irqs. | ||
271 | */ | ||
272 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
273 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
274 | raw_local_irq_restore(flags); | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | /* | ||
279 | * Preferentially boost tasks blocking expedited grace periods. | ||
280 | * This cannot starve the normal grace periods because a second | ||
281 | * expedited grace period must boost all blocked tasks, including | ||
282 | * those blocking the pre-existing normal grace period. | ||
283 | */ | ||
284 | if (rcu_preempt_ctrlblk.exp_tasks != NULL) { | ||
285 | tb = rcu_preempt_ctrlblk.exp_tasks; | ||
286 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
287 | } else { | ||
288 | tb = rcu_preempt_ctrlblk.boost_tasks; | ||
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
290 | } | ||
291 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
292 | |||
293 | /* | ||
294 | * We boost task t by manufacturing an rt_mutex that appears to | ||
295 | * be held by task t. We leave a pointer to that rt_mutex where | ||
296 | * task t can find it, and task t will release the mutex when it | ||
297 | * exits its outermost RCU read-side critical section. Then | ||
298 | * simply acquiring this artificial rt_mutex will boost task | ||
299 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
300 | */ | ||
301 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
284 | rt_mutex_init_proxy_locked(&mtx, t); | 302 | rt_mutex_init_proxy_locked(&mtx, t); |
285 | t->rcu_boost_mutex = &mtx; | 303 | t->rcu_boost_mutex = &mtx; |
286 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | 304 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; |
287 | raw_local_irq_restore(flags); | 305 | raw_local_irq_restore(flags); |
288 | rt_mutex_lock(&mtx); | 306 | rt_mutex_lock(&mtx); |
289 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | 307 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
290 | rcu_preempt_ctrlblk.boosted_this_gp++; | 308 | |
291 | rt_mutex_unlock(&mtx); | 309 | return rcu_preempt_ctrlblk.boost_tasks != NULL || |
292 | return rcu_preempt_ctrlblk.boost_tasks != NULL; | 310 | rcu_preempt_ctrlblk.exp_tasks != NULL; |
293 | } | 311 | } |
294 | 312 | ||
295 | /* | 313 | /* |
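
The rewritten rcu_boost() above rechecks both task lists with interrupts disabled and then boosts from ->exp_tasks in preference to ->boost_tasks, so readers blocking an expedited grace period are handled first. A minimal user-space sketch of just that selection order, using hypothetical stand-in types rather than the kernel's list heads:

#include <stdio.h>

struct blocked_list {
        const char *name;
        int nblocked;                   /* number of blocked readers */
};

/* Return the queue to boost from, preferring expedited blockers;
 * NULL means there is nothing to boost. */
static struct blocked_list *pick_boost_target(struct blocked_list *exp,
                                              struct blocked_list *normal)
{
        if (exp && exp->nblocked)
                return exp;
        if (normal && normal->nblocked)
                return normal;
        return NULL;
}

int main(void)
{
        struct blocked_list exp = { "exp_tasks", 1 };
        struct blocked_list normal = { "boost_tasks", 3 };
        struct blocked_list *tb = pick_boost_target(&exp, &normal);

        printf("boosting from %s\n", tb ? tb->name : "nothing");
        return 0;                       /* prints "boosting from exp_tasks" */
}
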
@@ -304,42 +322,25 @@ static int rcu_boost(void) | |||
304 | */ | 322 | */ |
305 | static int rcu_initiate_boost(void) | 323 | static int rcu_initiate_boost(void) |
306 | { | 324 | { |
307 | if (!rcu_preempt_blocked_readers_cgp()) { | 325 | if (!rcu_preempt_blocked_readers_cgp() && |
308 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | 326 | rcu_preempt_ctrlblk.exp_tasks == NULL) { |
327 | RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); | ||
309 | return 0; | 328 | return 0; |
310 | } | 329 | } |
311 | if (rcu_preempt_ctrlblk.gp_tasks != NULL && | 330 | if (rcu_preempt_ctrlblk.exp_tasks != NULL || |
312 | rcu_preempt_ctrlblk.boost_tasks == NULL && | 331 | (rcu_preempt_ctrlblk.gp_tasks != NULL && |
313 | rcu_preempt_ctrlblk.boosted_this_gp == 0 && | 332 | rcu_preempt_ctrlblk.boost_tasks == NULL && |
314 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | 333 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { |
315 | rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | 334 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) |
335 | rcu_preempt_ctrlblk.boost_tasks = | ||
336 | rcu_preempt_ctrlblk.gp_tasks; | ||
316 | invoke_rcu_kthread(); | 337 | invoke_rcu_kthread(); |
317 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
318 | } else | 338 | } else |
319 | RCU_TRACE(rcu_initiate_boost_trace()); | 339 | RCU_TRACE(rcu_initiate_boost_trace()); |
320 | return 1; | 340 | return 1; |
321 | } | 341 | } |
322 | 342 | ||
323 | /* | 343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
324 | * Initiate boosting for an expedited grace period. | ||
325 | */ | ||
326 | static void rcu_initiate_expedited_boost(void) | ||
327 | { | ||
328 | unsigned long flags; | ||
329 | |||
330 | raw_local_irq_save(flags); | ||
331 | if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | ||
332 | rcu_preempt_ctrlblk.boost_tasks = | ||
333 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
334 | rcu_preempt_ctrlblk.boosted_this_gp = -1; | ||
335 | invoke_rcu_kthread(); | ||
336 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
337 | } else | ||
338 | RCU_TRACE(rcu_initiate_exp_boost_trace()); | ||
339 | raw_local_irq_restore(flags); | ||
340 | } | ||
341 | |||
342 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); | ||
343 | 344 | ||
344 | /* | 345 | /* |
345 | * Do priority-boost accounting for the start of a new grace period. | 346 | * Do priority-boost accounting for the start of a new grace period. |
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void) | |||
347 | static void rcu_preempt_boost_start_gp(void) | 348 | static void rcu_preempt_boost_start_gp(void) |
348 | { | 349 | { |
349 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | 350 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; |
350 | if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | ||
351 | rcu_preempt_ctrlblk.boosted_this_gp = 0; | ||
352 | } | 351 | } |
353 | 352 | ||
354 | #else /* #ifdef CONFIG_RCU_BOOST */ | 353 | #else /* #ifdef CONFIG_RCU_BOOST */ |
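
RCU_BOOST_DELAY_JIFFIES above converts the millisecond CONFIG_RCU_BOOST_DELAY value into jiffies, rounding up so that a nonzero delay can never truncate to zero ticks. A stand-alone sketch of the arithmetic, with HZ and the delay chosen arbitrarily for illustration:

#include <stdio.h>

/* Same rounding-up division as the kernel's DIV_ROUND_UP(). */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long hz = 250;         /* hypothetical HZ */
        unsigned long delay_ms = 500;   /* hypothetical CONFIG_RCU_BOOST_DELAY */
        unsigned long delay_jiffies = DIV_ROUND_UP(delay_ms * hz, 1000);

        printf("%lu ms -> %lu jiffies at HZ=%lu\n", delay_ms, delay_jiffies, hz);
        return 0;                       /* prints "500 ms -> 125 jiffies at HZ=250" */
}
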
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void) | |||
372 | } | 371 | } |
373 | 372 | ||
374 | /* | 373 | /* |
375 | * If there is no RCU priority boosting, we don't initiate expedited boosting. | ||
376 | */ | ||
377 | static void rcu_initiate_expedited_boost(void) | ||
378 | { | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * If there is no RCU priority boosting, nothing to do at grace-period start. | 374 | * If there is no RCU priority boosting, nothing to do at grace-period start. |
383 | */ | 375 | */ |
384 | static void rcu_preempt_boost_start_gp(void) | 376 | static void rcu_preempt_boost_start_gp(void) |
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void) | |||
418 | if (!rcu_preempt_gp_in_progress()) | 410 | if (!rcu_preempt_gp_in_progress()) |
419 | return; | 411 | return; |
420 | /* | 412 | /* |
421 | * Check up on boosting. If there are no readers blocking the | 413 | * Check up on boosting. If there are readers blocking the |
422 | * current grace period, leave. | 414 | * current grace period, leave. |
423 | */ | 415 | */ |
424 | if (rcu_initiate_boost()) | 416 | if (rcu_initiate_boost()) |
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
578 | empty = !rcu_preempt_blocked_readers_cgp(); | 570 | empty = !rcu_preempt_blocked_readers_cgp(); |
579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | 571 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
580 | np = rcu_next_node_entry(t); | 572 | np = rcu_next_node_entry(t); |
581 | list_del(&t->rcu_node_entry); | 573 | list_del_init(&t->rcu_node_entry); |
582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | 574 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
583 | rcu_preempt_ctrlblk.gp_tasks = np; | 575 | rcu_preempt_ctrlblk.gp_tasks = np; |
584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | 576 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
587 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | 579 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) |
588 | rcu_preempt_ctrlblk.boost_tasks = np; | 580 | rcu_preempt_ctrlblk.boost_tasks = np; |
589 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 581 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
590 | INIT_LIST_HEAD(&t->rcu_node_entry); | ||
591 | 582 | ||
592 | /* | 583 | /* |
593 | * If this was the last task on the current list, and if | 584 | * If this was the last task on the current list, and if |
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void) | |||
812 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | 803 | rpcp->exp_tasks = rpcp->blkd_tasks.next; |
813 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | 804 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) |
814 | rpcp->exp_tasks = NULL; | 805 | rpcp->exp_tasks = NULL; |
815 | local_irq_restore(flags); | ||
816 | 806 | ||
817 | /* Wait for tail of ->blkd_tasks list to drain. */ | 807 | /* Wait for tail of ->blkd_tasks list to drain. */ |
818 | if (rcu_preempted_readers_exp()) | 808 | if (!rcu_preempted_readers_exp()) |
819 | rcu_initiate_expedited_boost(); | 809 | local_irq_restore(flags); |
810 | else { | ||
811 | rcu_initiate_boost(); | ||
812 | local_irq_restore(flags); | ||
820 | wait_event(sync_rcu_preempt_exp_wq, | 813 | wait_event(sync_rcu_preempt_exp_wq, |
821 | !rcu_preempted_readers_exp()); | 814 | !rcu_preempted_readers_exp()); |
815 | } | ||
822 | 816 | ||
823 | /* Clean up and exit. */ | 817 | /* Clean up and exit. */ |
824 | barrier(); /* ensure expedited GP seen before counter increment. */ | 818 | barrier(); /* ensure expedited GP seen before counter increment. */ |
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void) | |||
931 | 925 | ||
932 | static void rcu_initiate_boost_trace(void) | 926 | static void rcu_initiate_boost_trace(void) |
933 | { | 927 | { |
934 | if (rcu_preempt_ctrlblk.gp_tasks == NULL) | 928 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) |
935 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | 929 | rcu_preempt_ctrlblk.n_balk_blkd_tasks++; |
930 | else if (rcu_preempt_ctrlblk.gp_tasks == NULL && | ||
931 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
932 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; | ||
936 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | 933 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) |
937 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | 934 | rcu_preempt_ctrlblk.n_balk_boost_tasks++; |
938 | else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | ||
939 | rcu_preempt_ctrlblk.n_normal_balk_boosted++; | ||
940 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | 935 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) |
941 | rcu_preempt_ctrlblk.n_normal_balk_notyet++; | 936 | rcu_preempt_ctrlblk.n_balk_notyet++; |
942 | else | ||
943 | rcu_preempt_ctrlblk.n_normal_balk_nos++; | ||
944 | } | ||
945 | |||
946 | static void rcu_initiate_exp_boost_trace(void) | ||
947 | { | ||
948 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
949 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | ||
950 | else | 937 | else |
951 | rcu_preempt_ctrlblk.n_exp_balk_nos++; | 938 | rcu_preempt_ctrlblk.n_balk_nos++; |
952 | } | 939 | } |
953 | 940 | ||
954 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 941 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c224da41890c..2e138db03382 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -131,7 +131,7 @@ struct rcu_torture { | |||
131 | 131 | ||
132 | static LIST_HEAD(rcu_torture_freelist); | 132 | static LIST_HEAD(rcu_torture_freelist); |
133 | static struct rcu_torture __rcu *rcu_torture_current; | 133 | static struct rcu_torture __rcu *rcu_torture_current; |
134 | static long rcu_torture_current_version; | 134 | static unsigned long rcu_torture_current_version; |
135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
136 | static DEFINE_SPINLOCK(rcu_torture_lock); | 136 | static DEFINE_SPINLOCK(rcu_torture_lock); |
137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
@@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror; | |||
146 | static atomic_t n_rcu_torture_error; | 146 | static atomic_t n_rcu_torture_error; |
147 | static long n_rcu_torture_boost_ktrerror; | 147 | static long n_rcu_torture_boost_ktrerror; |
148 | static long n_rcu_torture_boost_rterror; | 148 | static long n_rcu_torture_boost_rterror; |
149 | static long n_rcu_torture_boost_allocerror; | ||
150 | static long n_rcu_torture_boost_afferror; | ||
151 | static long n_rcu_torture_boost_failure; | 149 | static long n_rcu_torture_boost_failure; |
152 | static long n_rcu_torture_boosts; | 150 | static long n_rcu_torture_boosts; |
153 | static long n_rcu_torture_timers; | 151 | static long n_rcu_torture_timers; |
@@ -163,11 +161,11 @@ static int stutter_pause_test; | |||
163 | #endif | 161 | #endif |
164 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
165 | 163 | ||
166 | #ifdef CONFIG_RCU_BOOST | 164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
167 | #define rcu_can_boost() 1 | 165 | #define rcu_can_boost() 1 |
168 | #else /* #ifdef CONFIG_RCU_BOOST */ | 166 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
169 | #define rcu_can_boost() 0 | 167 | #define rcu_can_boost() 0 |
170 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | 168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
171 | 169 | ||
172 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
173 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
@@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg) | |||
751 | n_rcu_torture_boost_rterror++; | 749 | n_rcu_torture_boost_rterror++; |
752 | } | 750 | } |
753 | 751 | ||
752 | init_rcu_head_on_stack(&rbi.rcu); | ||
754 | /* Each pass through the following loop does one boost-test cycle. */ | 753 | /* Each pass through the following loop does one boost-test cycle. */ |
755 | do { | 754 | do { |
756 | /* Wait for the next test interval. */ | 755 | /* Wait for the next test interval. */ |
@@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); | |||
810 | 809 | ||
811 | /* Clean up and exit. */ | 810 | /* Clean up and exit. */ |
812 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | 811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); |
812 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | 813 | rcutorture_shutdown_absorb("rcu_torture_boost"); |
814 | while (!kthread_should_stop() || rbi.inflight) | 814 | while (!kthread_should_stop() || rbi.inflight) |
815 | schedule_timeout_uninterruptible(1); | 815 | schedule_timeout_uninterruptible(1); |
@@ -886,7 +886,7 @@ rcu_torture_writer(void *arg) | |||
886 | old_rp->rtort_pipe_count++; | 886 | old_rp->rtort_pipe_count++; |
887 | cur_ops->deferred_free(old_rp); | 887 | cur_ops->deferred_free(old_rp); |
888 | } | 888 | } |
889 | rcu_torture_current_version++; | 889 | rcutorture_record_progress(++rcu_torture_current_version); |
890 | oldbatch = cur_ops->completed(); | 890 | oldbatch = cur_ops->completed(); |
891 | rcu_stutter_wait("rcu_torture_writer"); | 891 | rcu_stutter_wait("rcu_torture_writer"); |
892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
@@ -1066,8 +1066,8 @@ rcu_torture_printk(char *page) | |||
1066 | } | 1066 | } |
1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
1068 | cnt += sprintf(&page[cnt], | 1068 | cnt += sprintf(&page[cnt], |
1069 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1069 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
1070 | "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " | 1070 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
1071 | "rtbf: %ld rtb: %ld nt: %ld", | 1071 | "rtbf: %ld rtb: %ld nt: %ld", |
1072 | rcu_torture_current, | 1072 | rcu_torture_current, |
1073 | rcu_torture_current_version, | 1073 | rcu_torture_current_version, |
@@ -1078,16 +1078,12 @@ rcu_torture_printk(char *page) | |||
1078 | atomic_read(&n_rcu_torture_mberror), | 1078 | atomic_read(&n_rcu_torture_mberror), |
1079 | n_rcu_torture_boost_ktrerror, | 1079 | n_rcu_torture_boost_ktrerror, |
1080 | n_rcu_torture_boost_rterror, | 1080 | n_rcu_torture_boost_rterror, |
1081 | n_rcu_torture_boost_allocerror, | ||
1082 | n_rcu_torture_boost_afferror, | ||
1083 | n_rcu_torture_boost_failure, | 1081 | n_rcu_torture_boost_failure, |
1084 | n_rcu_torture_boosts, | 1082 | n_rcu_torture_boosts, |
1085 | n_rcu_torture_timers); | 1083 | n_rcu_torture_timers); |
1086 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1084 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
1087 | n_rcu_torture_boost_ktrerror != 0 || | 1085 | n_rcu_torture_boost_ktrerror != 0 || |
1088 | n_rcu_torture_boost_rterror != 0 || | 1086 | n_rcu_torture_boost_rterror != 0 || |
1089 | n_rcu_torture_boost_allocerror != 0 || | ||
1090 | n_rcu_torture_boost_afferror != 0 || | ||
1091 | n_rcu_torture_boost_failure != 0) | 1087 | n_rcu_torture_boost_failure != 0) |
1092 | cnt += sprintf(&page[cnt], " !!!"); | 1088 | cnt += sprintf(&page[cnt], " !!!"); |
1093 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1089 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
@@ -1331,6 +1327,7 @@ rcu_torture_cleanup(void) | |||
1331 | int i; | 1327 | int i; |
1332 | 1328 | ||
1333 | mutex_lock(&fullstop_mutex); | 1329 | mutex_lock(&fullstop_mutex); |
1330 | rcutorture_record_test_transition(); | ||
1334 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1331 | if (fullstop == FULLSTOP_SHUTDOWN) { |
1335 | printk(KERN_WARNING /* but going down anyway, so... */ | 1332 | printk(KERN_WARNING /* but going down anyway, so... */ |
1336 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1333 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
@@ -1486,8 +1483,6 @@ rcu_torture_init(void) | |||
1486 | atomic_set(&n_rcu_torture_error, 0); | 1483 | atomic_set(&n_rcu_torture_error, 0); |
1487 | n_rcu_torture_boost_ktrerror = 0; | 1484 | n_rcu_torture_boost_ktrerror = 0; |
1488 | n_rcu_torture_boost_rterror = 0; | 1485 | n_rcu_torture_boost_rterror = 0; |
1489 | n_rcu_torture_boost_allocerror = 0; | ||
1490 | n_rcu_torture_boost_afferror = 0; | ||
1491 | n_rcu_torture_boost_failure = 0; | 1486 | n_rcu_torture_boost_failure = 0; |
1492 | n_rcu_torture_boosts = 0; | 1487 | n_rcu_torture_boosts = 0; |
1493 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1488 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
@@ -1624,6 +1619,7 @@ rcu_torture_init(void) | |||
1624 | } | 1619 | } |
1625 | } | 1620 | } |
1626 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1621 | register_reboot_notifier(&rcutorture_shutdown_nb); |
1622 | rcutorture_record_test_transition(); | ||
1627 | mutex_unlock(&fullstop_mutex); | 1623 | mutex_unlock(&fullstop_mutex); |
1628 | return 0; | 1624 | return 0; |
1629 | 1625 | ||
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8e..f07d2f03181a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -47,6 +47,9 @@ | |||
47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
49 | #include <linux/kernel_stat.h> | 49 | #include <linux/kernel_stat.h> |
50 | #include <linux/wait.h> | ||
51 | #include <linux/kthread.h> | ||
52 | #include <linux/prefetch.h> | ||
50 | 53 | ||
51 | #include "rcutree.h" | 54 | #include "rcutree.h" |
52 | 55 | ||
@@ -79,10 +82,41 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
79 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
80 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
81 | 84 | ||
85 | static struct rcu_state *rcu_state; | ||
86 | |||
82 | int rcu_scheduler_active __read_mostly; | 87 | int rcu_scheduler_active __read_mostly; |
83 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 88 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
84 | 89 | ||
85 | /* | 90 | /* |
91 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
92 | * handle all flavors of RCU. | ||
93 | */ | ||
94 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
95 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
96 | DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
97 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
98 | static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq); | ||
99 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
100 | static char rcu_kthreads_spawnable; | ||
101 | |||
102 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | ||
103 | static void invoke_rcu_cpu_kthread(void); | ||
104 | |||
105 | #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ | ||
106 | |||
107 | /* | ||
108 | * Track the rcutorture test sequence number and the update version | ||
109 | * number within a given test. The rcutorture_testseq is incremented | ||
110 | * on every rcutorture module load and unload, so has an odd value | ||
111 | * when a test is running. The rcutorture_vernum is set to zero | ||
112 | * when rcutorture starts and is incremented on each rcutorture update. | ||
113 | * These variables enable correlating rcutorture output with the | ||
114 | * RCU tracing information. | ||
115 | */ | ||
116 | unsigned long rcutorture_testseq; | ||
117 | unsigned long rcutorture_vernum; | ||
118 | |||
119 | /* | ||
86 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 120 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
87 | * permit this function to be invoked without holding the root rcu_node | 121 | * permit this function to be invoked without holding the root rcu_node |
88 | * structure's ->lock, but of course results can be subject to change. | 122 | * structure's ->lock, but of course results can be subject to change. |
@@ -124,6 +158,7 @@ void rcu_note_context_switch(int cpu) | |||
124 | rcu_sched_qs(cpu); | 158 | rcu_sched_qs(cpu); |
125 | rcu_preempt_note_context_switch(cpu); | 159 | rcu_preempt_note_context_switch(cpu); |
126 | } | 160 | } |
161 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | ||
127 | 162 | ||
128 | #ifdef CONFIG_NO_HZ | 163 | #ifdef CONFIG_NO_HZ |
129 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 164 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
@@ -140,10 +175,8 @@ module_param(blimit, int, 0); | |||
140 | module_param(qhimark, int, 0); | 175 | module_param(qhimark, int, 0); |
141 | module_param(qlowmark, int, 0); | 176 | module_param(qlowmark, int, 0); |
142 | 177 | ||
143 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 178 | int rcu_cpu_stall_suppress __read_mostly; |
144 | int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; | ||
145 | module_param(rcu_cpu_stall_suppress, int, 0644); | 179 | module_param(rcu_cpu_stall_suppress, int, 0644); |
146 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
147 | 180 | ||
148 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 181 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
149 | static int rcu_pending(int cpu); | 182 | static int rcu_pending(int cpu); |
@@ -176,6 +209,31 @@ void rcu_bh_force_quiescent_state(void) | |||
176 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 209 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
177 | 210 | ||
178 | /* | 211 | /* |
212 | * Record the number of times rcutorture tests have been initiated and | ||
213 | * terminated. This information allows the debugfs tracing stats to be | ||
214 | * correlated to the rcutorture messages, even when the rcutorture module | ||
215 | * is being repeatedly loaded and unloaded. In other words, we cannot | ||
216 | * store this state in rcutorture itself. | ||
217 | */ | ||
218 | void rcutorture_record_test_transition(void) | ||
219 | { | ||
220 | rcutorture_testseq++; | ||
221 | rcutorture_vernum = 0; | ||
222 | } | ||
223 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | ||
224 | |||
225 | /* | ||
226 | * Record the number of writer passes through the current rcutorture test. | ||
227 | * This is also used to correlate debugfs tracing stats with the rcutorture | ||
228 | * messages. | ||
229 | */ | ||
230 | void rcutorture_record_progress(unsigned long vernum) | ||
231 | { | ||
232 | rcutorture_vernum++; | ||
233 | } | ||
234 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | ||
235 | |||
236 | /* | ||
179 | * Force a quiescent state for RCU-sched. | 237 | * Force a quiescent state for RCU-sched. |
180 | */ | 238 | */ |
181 | void rcu_sched_force_quiescent_state(void) | 239 | void rcu_sched_force_quiescent_state(void) |
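
rcutorture_record_test_transition() and rcutorture_record_progress() above follow a simple convention: the test sequence number is bumped on every rcutorture load and unload, so it is odd while a test is running, and the version number counts writer passes within the current test. A stand-alone sketch of those semantics (the real record_progress() takes the writer's version as an argument; this simplified analogue does not):

#include <stdio.h>

static unsigned long testseq;           /* odd while a test is running */
static unsigned long vernum;            /* writer passes within the test */

static void record_test_transition(void)
{
        testseq++;
        vernum = 0;
}

static void record_progress(void)
{
        vernum++;
}

int main(void)
{
        int i;

        record_test_transition();       /* module load: testseq becomes odd */
        for (i = 0; i < 3; i++)
                record_progress();      /* three writer passes */
        printf("testseq=%lu running=%s vernum=%lu\n",
               testseq, (testseq & 1) ? "yes" : "no", vernum);
        record_test_transition();       /* module unload: testseq even again */
        printf("testseq=%lu running=%s\n",
               testseq, (testseq & 1) ? "yes" : "no");
        return 0;
}
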
@@ -234,8 +292,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
234 | return 1; | 292 | return 1; |
235 | } | 293 | } |
236 | 294 | ||
237 | /* If preemptable RCU, no point in sending reschedule IPI. */ | 295 | /* If preemptible RCU, no point in sending reschedule IPI. */ |
238 | if (rdp->preemptable) | 296 | if (rdp->preemptible) |
239 | return 0; | 297 | return 0; |
240 | 298 | ||
241 | /* The CPU is online, so send it a reschedule IPI. */ | 299 | /* The CPU is online, so send it a reschedule IPI. */ |
@@ -450,8 +508,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
450 | 508 | ||
451 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 509 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
452 | 510 | ||
453 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
454 | |||
455 | int rcu_cpu_stall_suppress __read_mostly; | 511 | int rcu_cpu_stall_suppress __read_mostly; |
456 | 512 | ||
457 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 513 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
@@ -537,21 +593,24 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
537 | 593 | ||
538 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 594 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
539 | { | 595 | { |
540 | long delta; | 596 | unsigned long j; |
597 | unsigned long js; | ||
541 | struct rcu_node *rnp; | 598 | struct rcu_node *rnp; |
542 | 599 | ||
543 | if (rcu_cpu_stall_suppress) | 600 | if (rcu_cpu_stall_suppress) |
544 | return; | 601 | return; |
545 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); | 602 | j = ACCESS_ONCE(jiffies); |
603 | js = ACCESS_ONCE(rsp->jiffies_stall); | ||
546 | rnp = rdp->mynode; | 604 | rnp = rdp->mynode; |
547 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { | 605 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { |
548 | 606 | ||
549 | /* We haven't checked in, so go dump stack. */ | 607 | /* We haven't checked in, so go dump stack. */ |
550 | print_cpu_stall(rsp); | 608 | print_cpu_stall(rsp); |
551 | 609 | ||
552 | } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { | 610 | } else if (rcu_gp_in_progress(rsp) && |
611 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | ||
553 | 612 | ||
554 | /* They had two time units to dump stack, so complain. */ | 613 | /* They had a few time units to dump stack, so complain. */ |
555 | print_other_cpu_stall(rsp); | 614 | print_other_cpu_stall(rsp); |
556 | } | 615 | } |
557 | } | 616 | } |
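
check_cpu_stall() above switches from a signed delta to ULONG_CMP_GE() when comparing jiffies against the stall deadline, which keeps the test correct across jiffies-counter wrap. A stand-alone demonstration of why the wrap-safe form matters; the macro mirrors the kernel's ULONG_MAX/2 formulation:

#include <stdio.h>
#include <limits.h>

#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
        unsigned long js = ULONG_MAX - 5;       /* deadline set just before wrap */
        unsigned long j = 10;                   /* jiffies has since wrapped */

        printf("naive     j >= js: %d\n", j >= js);             /* 0: wrongly "early" */
        printf("wrap-safe j >= js: %d\n", ULONG_CMP_GE(j, js)); /* 1: deadline passed */
        return 0;
}
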
@@ -587,26 +646,6 @@ static void __init check_cpu_stall_init(void) | |||
587 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | 646 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); |
588 | } | 647 | } |
589 | 648 | ||
590 | #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
591 | |||
592 | static void record_gp_stall_check_time(struct rcu_state *rsp) | ||
593 | { | ||
594 | } | ||
595 | |||
596 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | ||
597 | { | ||
598 | } | ||
599 | |||
600 | void rcu_cpu_stall_reset(void) | ||
601 | { | ||
602 | } | ||
603 | |||
604 | static void __init check_cpu_stall_init(void) | ||
605 | { | ||
606 | } | ||
607 | |||
608 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
609 | |||
610 | /* | 649 | /* |
611 | * Update CPU-local rcu_data state to record the newly noticed grace period. | 650 | * Update CPU-local rcu_data state to record the newly noticed grace period. |
612 | * This is used both when we started the grace period and when we notice | 651 | * This is used both when we started the grace period and when we notice |
@@ -809,6 +848,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
809 | rnp->completed = rsp->completed; | 848 | rnp->completed = rsp->completed; |
810 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 849 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
811 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 850 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
851 | rcu_preempt_boost_start_gp(rnp); | ||
812 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 852 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
813 | return; | 853 | return; |
814 | } | 854 | } |
@@ -844,6 +884,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
844 | rnp->completed = rsp->completed; | 884 | rnp->completed = rsp->completed; |
845 | if (rnp == rdp->mynode) | 885 | if (rnp == rdp->mynode) |
846 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 886 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
887 | rcu_preempt_boost_start_gp(rnp); | ||
847 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 888 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
848 | } | 889 | } |
849 | 890 | ||
@@ -864,7 +905,12 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
864 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 905 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
865 | __releases(rcu_get_root(rsp)->lock) | 906 | __releases(rcu_get_root(rsp)->lock) |
866 | { | 907 | { |
908 | unsigned long gp_duration; | ||
909 | |||
867 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 910 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
911 | gp_duration = jiffies - rsp->gp_start; | ||
912 | if (gp_duration > rsp->gp_max) | ||
913 | rsp->gp_max = gp_duration; | ||
868 | rsp->completed = rsp->gpnum; | 914 | rsp->completed = rsp->gpnum; |
869 | rsp->signaled = RCU_GP_IDLE; | 915 | rsp->signaled = RCU_GP_IDLE; |
870 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 916 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
@@ -894,7 +940,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
894 | return; | 940 | return; |
895 | } | 941 | } |
896 | rnp->qsmask &= ~mask; | 942 | rnp->qsmask &= ~mask; |
897 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 943 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
898 | 944 | ||
899 | /* Other bits still set at this level, so done. */ | 945 | /* Other bits still set at this level, so done. */ |
900 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 946 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
@@ -1037,6 +1083,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) | |||
1037 | /* | 1083 | /* |
1038 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy | 1084 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy |
1039 | * and move all callbacks from the outgoing CPU to the current one. | 1085 | * and move all callbacks from the outgoing CPU to the current one. |
1086 | * There can only be one CPU hotplug operation at a time, so no other | ||
1087 | * CPU can be attempting to update rcu_cpu_kthread_task. | ||
1040 | */ | 1088 | */ |
1041 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | 1089 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) |
1042 | { | 1090 | { |
@@ -1045,6 +1093,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1045 | int need_report = 0; | 1093 | int need_report = 0; |
1046 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1094 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
1047 | struct rcu_node *rnp; | 1095 | struct rcu_node *rnp; |
1096 | struct task_struct *t; | ||
1097 | |||
1098 | /* Stop the CPU's kthread. */ | ||
1099 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1100 | if (t != NULL) { | ||
1101 | per_cpu(rcu_cpu_kthread_task, cpu) = NULL; | ||
1102 | kthread_stop(t); | ||
1103 | } | ||
1048 | 1104 | ||
1049 | /* Exclude any attempts to start a new grace period. */ | 1105 | /* Exclude any attempts to start a new grace period. */ |
1050 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1106 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
@@ -1082,6 +1138,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1138 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1139 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1084 | rcu_report_exp_rnp(rsp, rnp); | 1140 | rcu_report_exp_rnp(rsp, rnp); |
1141 | rcu_node_kthread_setaffinity(rnp, -1); | ||
1085 | } | 1142 | } |
1086 | 1143 | ||
1087 | /* | 1144 | /* |
@@ -1143,7 +1200,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1143 | next = list->next; | 1200 | next = list->next; |
1144 | prefetch(next); | 1201 | prefetch(next); |
1145 | debug_rcu_head_unqueue(list); | 1202 | debug_rcu_head_unqueue(list); |
1146 | list->func(list); | 1203 | __rcu_reclaim(list); |
1147 | list = next; | 1204 | list = next; |
1148 | if (++count >= rdp->blimit) | 1205 | if (++count >= rdp->blimit) |
1149 | break; | 1206 | break; |
@@ -1179,7 +1236,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1179 | 1236 | ||
1180 | /* Re-raise the RCU softirq if there are callbacks remaining. */ | 1237 | /* Re-raise the RCU softirq if there are callbacks remaining. */ |
1181 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1238 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1182 | raise_softirq(RCU_SOFTIRQ); | 1239 | invoke_rcu_cpu_kthread(); |
1183 | } | 1240 | } |
1184 | 1241 | ||
1185 | /* | 1242 | /* |
@@ -1225,7 +1282,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
1225 | } | 1282 | } |
1226 | rcu_preempt_check_callbacks(cpu); | 1283 | rcu_preempt_check_callbacks(cpu); |
1227 | if (rcu_pending(cpu)) | 1284 | if (rcu_pending(cpu)) |
1228 | raise_softirq(RCU_SOFTIRQ); | 1285 | invoke_rcu_cpu_kthread(); |
1229 | } | 1286 | } |
1230 | 1287 | ||
1231 | #ifdef CONFIG_SMP | 1288 | #ifdef CONFIG_SMP |
@@ -1233,6 +1290,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
1233 | /* | 1290 | /* |
1234 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1291 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
1235 | * have not yet encountered a quiescent state, using the function specified. | 1292 | * have not yet encountered a quiescent state, using the function specified. |
1293 | * Also initiate boosting for any threads blocked on the root rcu_node. | ||
1294 | * | ||
1236 | * The caller must have suppressed start of new grace periods. | 1295 | * The caller must have suppressed start of new grace periods. |
1237 | */ | 1296 | */ |
1238 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 1297 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
@@ -1251,7 +1310,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1251 | return; | 1310 | return; |
1252 | } | 1311 | } |
1253 | if (rnp->qsmask == 0) { | 1312 | if (rnp->qsmask == 0) { |
1254 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1313 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
1255 | continue; | 1314 | continue; |
1256 | } | 1315 | } |
1257 | cpu = rnp->grplo; | 1316 | cpu = rnp->grplo; |
@@ -1269,6 +1328,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
1269 | } | 1328 | } |
1270 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1329 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1271 | } | 1330 | } |
1331 | rnp = rcu_get_root(rsp); | ||
1332 | if (rnp->qsmask == 0) { | ||
1333 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1334 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1335 | } | ||
1272 | } | 1336 | } |
1273 | 1337 | ||
1274 | /* | 1338 | /* |
@@ -1389,7 +1453,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1389 | /* | 1453 | /* |
1390 | * Do softirq processing for the current CPU. | 1454 | * Do softirq processing for the current CPU. |
1391 | */ | 1455 | */ |
1392 | static void rcu_process_callbacks(struct softirq_action *unused) | 1456 | static void rcu_process_callbacks(void) |
1393 | { | 1457 | { |
1394 | /* | 1458 | /* |
1395 | * Memory references from any prior RCU read-side critical sections | 1459 | * Memory references from any prior RCU read-side critical sections |
@@ -1414,6 +1478,347 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
1414 | rcu_needs_cpu_flush(); | 1478 | rcu_needs_cpu_flush(); |
1415 | } | 1479 | } |
1416 | 1480 | ||
1481 | /* | ||
1482 | * Wake up the current CPU's kthread. This replaces raise_softirq() | ||
1483 | * in earlier versions of RCU. Note that because we are running on | ||
1484 | * the current CPU with interrupts disabled, the rcu_cpu_kthread_task | ||
1485 | * cannot disappear out from under us. | ||
1486 | */ | ||
1487 | static void invoke_rcu_cpu_kthread(void) | ||
1488 | { | ||
1489 | unsigned long flags; | ||
1490 | |||
1491 | local_irq_save(flags); | ||
1492 | __this_cpu_write(rcu_cpu_has_work, 1); | ||
1493 | if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { | ||
1494 | local_irq_restore(flags); | ||
1495 | return; | ||
1496 | } | ||
1497 | wake_up(&__get_cpu_var(rcu_cpu_wq)); | ||
1498 | local_irq_restore(flags); | ||
1499 | } | ||
1500 | |||
1501 | /* | ||
1502 | * Wake up the specified per-rcu_node-structure kthread. | ||
1503 | * Because the per-rcu_node kthreads are immortal, we don't need | ||
1504 | * to do anything to keep them alive. | ||
1505 | */ | ||
1506 | static void invoke_rcu_node_kthread(struct rcu_node *rnp) | ||
1507 | { | ||
1508 | struct task_struct *t; | ||
1509 | |||
1510 | t = rnp->node_kthread_task; | ||
1511 | if (t != NULL) | ||
1512 | wake_up_process(t); | ||
1513 | } | ||
1514 | |||
1515 | /* | ||
1516 | * Set the specified CPU's kthread to run RT or not, as specified by | ||
1517 | * the to_rt argument. The CPU-hotplug locks are held, so the task | ||
1518 | * is not going away. | ||
1519 | */ | ||
1520 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
1521 | { | ||
1522 | int policy; | ||
1523 | struct sched_param sp; | ||
1524 | struct task_struct *t; | ||
1525 | |||
1526 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1527 | if (t == NULL) | ||
1528 | return; | ||
1529 | if (to_rt) { | ||
1530 | policy = SCHED_FIFO; | ||
1531 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1532 | } else { | ||
1533 | policy = SCHED_NORMAL; | ||
1534 | sp.sched_priority = 0; | ||
1535 | } | ||
1536 | sched_setscheduler_nocheck(t, policy, &sp); | ||
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Timer handler to initiate the waking up of per-CPU kthreads that | ||
1541 | * have yielded the CPU due to excess numbers of RCU callbacks. | ||
1542 | * We wake up the per-rcu_node kthread, which in turn will wake up | ||
1543 | * the booster kthread. | ||
1544 | */ | ||
1545 | static void rcu_cpu_kthread_timer(unsigned long arg) | ||
1546 | { | ||
1547 | unsigned long flags; | ||
1548 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); | ||
1549 | struct rcu_node *rnp = rdp->mynode; | ||
1550 | |||
1551 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1552 | rnp->wakemask |= rdp->grpmask; | ||
1553 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1554 | invoke_rcu_node_kthread(rnp); | ||
1555 | } | ||
1556 | |||
1557 | /* | ||
1558 | * Drop to non-real-time priority and yield, but only after posting a | ||
1559 | * timer that will cause us to regain our real-time priority if we | ||
1560 | * remain preempted. Either way, we restore our real-time priority | ||
1561 | * before returning. | ||
1562 | */ | ||
1563 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | ||
1564 | { | ||
1565 | struct sched_param sp; | ||
1566 | struct timer_list yield_timer; | ||
1567 | |||
1568 | setup_timer_on_stack(&yield_timer, f, arg); | ||
1569 | mod_timer(&yield_timer, jiffies + 2); | ||
1570 | sp.sched_priority = 0; | ||
1571 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | ||
1572 | set_user_nice(current, 19); | ||
1573 | schedule(); | ||
1574 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1575 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
1576 | del_timer(&yield_timer); | ||
1577 | } | ||
1578 | |||
1579 | /* | ||
1580 | * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. | ||
1581 | * This can happen while the corresponding CPU is either coming online | ||
1582 | * or going offline. We cannot wait until the CPU is fully online | ||
1583 | * before starting the kthread, because the various notifier functions | ||
1584 | * can wait for RCU grace periods. So we park rcu_cpu_kthread() until | ||
1585 | * the corresponding CPU is online. | ||
1586 | * | ||
1587 | * Return 1 if the kthread needs to stop, 0 otherwise. | ||
1588 | * | ||
1589 | * Caller must disable bh. This function can momentarily enable it. | ||
1590 | */ | ||
1591 | static int rcu_cpu_kthread_should_stop(int cpu) | ||
1592 | { | ||
1593 | while (cpu_is_offline(cpu) || | ||
1594 | !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
1595 | smp_processor_id() != cpu) { | ||
1596 | if (kthread_should_stop()) | ||
1597 | return 1; | ||
1598 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
1599 | per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); | ||
1600 | local_bh_enable(); | ||
1601 | schedule_timeout_uninterruptible(1); | ||
1602 | if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
1603 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
1604 | local_bh_disable(); | ||
1605 | } | ||
1606 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1607 | return 0; | ||
1608 | } | ||
1609 | |||
1610 | /* | ||
1611 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | ||
1612 | * earlier RCU softirq. | ||
1613 | */ | ||
1614 | static int rcu_cpu_kthread(void *arg) | ||
1615 | { | ||
1616 | int cpu = (int)(long)arg; | ||
1617 | unsigned long flags; | ||
1618 | int spincnt = 0; | ||
1619 | unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); | ||
1620 | wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu); | ||
1621 | char work; | ||
1622 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | ||
1623 | |||
1624 | for (;;) { | ||
1625 | *statusp = RCU_KTHREAD_WAITING; | ||
1626 | wait_event_interruptible(*wqp, | ||
1627 | *workp != 0 || kthread_should_stop()); | ||
1628 | local_bh_disable(); | ||
1629 | if (rcu_cpu_kthread_should_stop(cpu)) { | ||
1630 | local_bh_enable(); | ||
1631 | break; | ||
1632 | } | ||
1633 | *statusp = RCU_KTHREAD_RUNNING; | ||
1634 | per_cpu(rcu_cpu_kthread_loops, cpu)++; | ||
1635 | local_irq_save(flags); | ||
1636 | work = *workp; | ||
1637 | *workp = 0; | ||
1638 | local_irq_restore(flags); | ||
1639 | if (work) | ||
1640 | rcu_process_callbacks(); | ||
1641 | local_bh_enable(); | ||
1642 | if (*workp != 0) | ||
1643 | spincnt++; | ||
1644 | else | ||
1645 | spincnt = 0; | ||
1646 | if (spincnt > 10) { | ||
1647 | *statusp = RCU_KTHREAD_YIELDING; | ||
1648 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | ||
1649 | spincnt = 0; | ||
1650 | } | ||
1651 | } | ||
1652 | *statusp = RCU_KTHREAD_STOPPED; | ||
1653 | return 0; | ||
1654 | } | ||
1655 | |||
1656 | /* | ||
1657 | * Spawn a per-CPU kthread, setting up affinity and priority. | ||
1658 | * Because the CPU hotplug lock is held, no other CPU will be attempting | ||
1659 | * to manipulate rcu_cpu_kthread_task. There might be another CPU | ||
1660 | * attempting to access it during boot, but the locking in kthread_bind() | ||
1661 | * will enforce sufficient ordering. | ||
1662 | */ | ||
1663 | static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | ||
1664 | { | ||
1665 | struct sched_param sp; | ||
1666 | struct task_struct *t; | ||
1667 | |||
1668 | if (!rcu_kthreads_spawnable || | ||
1669 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | ||
1670 | return 0; | ||
1671 | t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); | ||
1672 | if (IS_ERR(t)) | ||
1673 | return PTR_ERR(t); | ||
1674 | kthread_bind(t, cpu); | ||
1675 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
1676 | WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); | ||
1677 | per_cpu(rcu_cpu_kthread_task, cpu) = t; | ||
1678 | wake_up_process(t); | ||
1679 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1680 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1681 | return 0; | ||
1682 | } | ||
1683 | |||
1684 | /* | ||
1685 | * Per-rcu_node kthread, which is in charge of waking up the per-CPU | ||
1686 | * kthreads when needed. We ignore requests to wake up kthreads | ||
1687 | * for offline CPUs, which is OK because force_quiescent_state() | ||
1688 | * takes care of this case. | ||
1689 | */ | ||
1690 | static int rcu_node_kthread(void *arg) | ||
1691 | { | ||
1692 | int cpu; | ||
1693 | unsigned long flags; | ||
1694 | unsigned long mask; | ||
1695 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1696 | struct sched_param sp; | ||
1697 | struct task_struct *t; | ||
1698 | |||
1699 | for (;;) { | ||
1700 | rnp->node_kthread_status = RCU_KTHREAD_WAITING; | ||
1701 | wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0); | ||
1702 | rnp->node_kthread_status = RCU_KTHREAD_RUNNING; | ||
1703 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1704 | mask = rnp->wakemask; | ||
1705 | rnp->wakemask = 0; | ||
1706 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
1707 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { | ||
1708 | if ((mask & 0x1) == 0) | ||
1709 | continue; | ||
1710 | preempt_disable(); | ||
1711 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
1712 | if (!cpu_online(cpu) || t == NULL) { | ||
1713 | preempt_enable(); | ||
1714 | continue; | ||
1715 | } | ||
1716 | per_cpu(rcu_cpu_has_work, cpu) = 1; | ||
1717 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1718 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1719 | preempt_enable(); | ||
1720 | } | ||
1721 | } | ||
1722 | /* NOTREACHED */ | ||
1723 | rnp->node_kthread_status = RCU_KTHREAD_STOPPED; | ||
1724 | return 0; | ||
1725 | } | ||
1726 | |||
1727 | /* | ||
1728 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | ||
1729 | * served by the rcu_node in question. The CPU hotplug lock is still | ||
1730 | * held, so the value of rnp->qsmaskinit will be stable. | ||
1731 | * | ||
1732 | * We don't include outgoingcpu in the affinity set, use -1 if there is | ||
1733 | * no outgoing CPU. If there are no CPUs left in the affinity set, | ||
1734 | * this function allows the kthread to execute on any CPU. | ||
1735 | */ | ||
1736 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1737 | { | ||
1738 | cpumask_var_t cm; | ||
1739 | int cpu; | ||
1740 | unsigned long mask = rnp->qsmaskinit; | ||
1741 | |||
1742 | if (rnp->node_kthread_task == NULL) | ||
1743 | return; | ||
1744 | if (!alloc_cpumask_var(&cm, GFP_KERNEL)) | ||
1745 | return; | ||
1746 | cpumask_clear(cm); | ||
1747 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | ||
1748 | if ((mask & 0x1) && cpu != outgoingcpu) | ||
1749 | cpumask_set_cpu(cpu, cm); | ||
1750 | if (cpumask_weight(cm) == 0) { | ||
1751 | cpumask_setall(cm); | ||
1752 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
1753 | cpumask_clear_cpu(cpu, cm); | ||
1754 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
1755 | } | ||
1756 | set_cpus_allowed_ptr(rnp->node_kthread_task, cm); | ||
1757 | rcu_boost_kthread_setaffinity(rnp, cm); | ||
1758 | free_cpumask_var(cm); | ||
1759 | } | ||
1760 | |||
1761 | /* | ||
1762 | * Spawn a per-rcu_node kthread, setting priority and affinity. | ||
1763 | * Called during boot before online/offline can happen, or, if | ||
1764 | * during runtime, with the main CPU-hotplug locks held. So only | ||
1765 | * one of these can be executing at a time. | ||
1766 | */ | ||
1767 | static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | ||
1768 | struct rcu_node *rnp) | ||
1769 | { | ||
1770 | unsigned long flags; | ||
1771 | int rnp_index = rnp - &rsp->node[0]; | ||
1772 | struct sched_param sp; | ||
1773 | struct task_struct *t; | ||
1774 | |||
1775 | if (!rcu_kthreads_spawnable || | ||
1776 | rnp->qsmaskinit == 0) | ||
1777 | return 0; | ||
1778 | if (rnp->node_kthread_task == NULL) { | ||
1779 | t = kthread_create(rcu_node_kthread, (void *)rnp, | ||
1780 | "rcun%d", rnp_index); | ||
1781 | if (IS_ERR(t)) | ||
1782 | return PTR_ERR(t); | ||
1783 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1784 | rnp->node_kthread_task = t; | ||
1785 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1786 | wake_up_process(t); | ||
1787 | sp.sched_priority = 99; | ||
1788 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1789 | } | ||
1790 | return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); | ||
1791 | } | ||
1792 | |||
1793 | /* | ||
1794 | * Spawn all kthreads -- called as soon as the scheduler is running. | ||
1795 | */ | ||
1796 | static int __init rcu_spawn_kthreads(void) | ||
1797 | { | ||
1798 | int cpu; | ||
1799 | struct rcu_node *rnp; | ||
1800 | |||
1801 | rcu_kthreads_spawnable = 1; | ||
1802 | for_each_possible_cpu(cpu) { | ||
1803 | init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu)); | ||
1804 | per_cpu(rcu_cpu_has_work, cpu) = 0; | ||
1805 | if (cpu_online(cpu)) | ||
1806 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
1807 | } | ||
1808 | rnp = rcu_get_root(rcu_state); | ||
1809 | init_waitqueue_head(&rnp->node_wq); | ||
1810 | rcu_init_boost_waitqueue(rnp); | ||
1811 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1812 | if (NUM_RCU_NODES > 1) | ||
1813 | rcu_for_each_leaf_node(rcu_state, rnp) { | ||
1814 | init_waitqueue_head(&rnp->node_wq); | ||
1815 | rcu_init_boost_waitqueue(rnp); | ||
1816 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
1817 | } | ||
1818 | return 0; | ||
1819 | } | ||
1820 | early_initcall(rcu_spawn_kthreads); | ||
1821 | |||
1417 | static void | 1822 | static void |
1418 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1823 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
1419 | struct rcu_state *rsp) | 1824 | struct rcu_state *rsp) |
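
The block above replaces the RCU_SOFTIRQ handler with per-CPU kthreads: invoke_rcu_cpu_kthread() sets rcu_cpu_has_work and wakes the thread, which claims the flag, runs rcu_process_callbacks(), and yields if it spins too long. A rough user-space analogy of the wake/claim/process loop using POSIX threads (all names are hypothetical, and the kernel's CPU binding, real-time priority, and yield-timer logic are omitted):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq = PTHREAD_COND_INITIALIZER;
static int has_work;                    /* analogue of rcu_cpu_has_work */
static bool stop;

static void process_callbacks(void)     /* rcu_process_callbacks() analogue */
{
        printf("processing callbacks\n");
}

static void *worker(void *arg)          /* rcu_cpu_kthread() analogue */
{
        (void)arg;
        for (;;) {
                pthread_mutex_lock(&lock);
                while (!has_work && !stop)      /* like wait_event_interruptible() */
                        pthread_cond_wait(&wq, &lock);
                if (stop) {
                        pthread_mutex_unlock(&lock);
                        break;
                }
                has_work = 0;                   /* claim the work flag */
                pthread_mutex_unlock(&lock);
                process_callbacks();
        }
        return NULL;
}

static void invoke_worker(void)         /* invoke_rcu_cpu_kthread() analogue */
{
        pthread_mutex_lock(&lock);
        has_work = 1;
        pthread_cond_signal(&wq);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        invoke_worker();
        usleep(100000);                 /* let the worker run once */
        pthread_mutex_lock(&lock);
        stop = true;
        pthread_cond_signal(&wq);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;                       /* build with -lpthread */
}
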
@@ -1439,6 +1844,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1439 | /* Add the callback to our list. */ | 1844 | /* Add the callback to our list. */ |
1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1845 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1846 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1847 | rdp->qlen++; | ||
1848 | |||
1849 | /* If interrupts were disabled, don't dive into RCU core. */ | ||
1850 | if (irqs_disabled_flags(flags)) { | ||
1851 | local_irq_restore(flags); | ||
1852 | return; | ||
1853 | } | ||
1442 | 1854 | ||
1443 | /* | 1855 | /* |
1444 | * Force the grace period if too many callbacks or too long waiting. | 1856 | * Force the grace period if too many callbacks or too long waiting. |
@@ -1447,7 +1859,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
1447 | * invoking force_quiescent_state() if the newly enqueued callback | 1859 | * invoking force_quiescent_state() if the newly enqueued callback |
1448 | * is the only one waiting for a grace period to complete. | 1860 | * is the only one waiting for a grace period to complete. |
1449 | */ | 1861 | */ |
1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1862 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
1451 | 1863 | ||
1452 | /* Are we ignoring a completed grace period? */ | 1864 | /* Are we ignoring a completed grace period? */ |
1453 | rcu_process_gp_end(rsp, rdp); | 1865 | rcu_process_gp_end(rsp, rdp); |
@@ -1583,7 +1995,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1583 | * or RCU-bh, force a local reschedule. | 1995 | * or RCU-bh, force a local reschedule. |
1584 | */ | 1996 | */ |
1585 | rdp->n_rp_qs_pending++; | 1997 | rdp->n_rp_qs_pending++; |
1586 | if (!rdp->preemptable && | 1998 | if (!rdp->preemptible && |
1587 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | 1999 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, |
1588 | jiffies)) | 2000 | jiffies)) |
1589 | set_need_resched(); | 2001 | set_need_resched(); |
@@ -1760,7 +2172,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
1760 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 2172 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
1761 | */ | 2173 | */ |
1762 | static void __cpuinit | 2174 | static void __cpuinit |
1763 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | 2175 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) |
1764 | { | 2176 | { |
1765 | unsigned long flags; | 2177 | unsigned long flags; |
1766 | unsigned long mask; | 2178 | unsigned long mask; |
@@ -1772,7 +2184,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
1772 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 2184 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
1773 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 2185 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
1774 | rdp->beenonline = 1; /* We have now been online. */ | 2186 | rdp->beenonline = 1; /* We have now been online. */ |
1775 | rdp->preemptable = preemptable; | 2187 | rdp->preemptible = preemptible; |
1776 | rdp->qlen_last_fqs_check = 0; | 2188 | rdp->qlen_last_fqs_check = 0; |
1777 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2189 | rdp->n_force_qs_snap = rsp->n_force_qs; |
1778 | rdp->blimit = blimit; | 2190 | rdp->blimit = blimit; |
@@ -1813,6 +2225,19 @@ static void __cpuinit rcu_online_cpu(int cpu) | |||
1813 | rcu_preempt_init_percpu_data(cpu); | 2225 | rcu_preempt_init_percpu_data(cpu); |
1814 | } | 2226 | } |
1815 | 2227 | ||
2228 | static void __cpuinit rcu_online_kthreads(int cpu) | ||
2229 | { | ||
2230 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
2231 | struct rcu_node *rnp = rdp->mynode; | ||
2232 | |||
2233 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | ||
2234 | if (rcu_kthreads_spawnable) { | ||
2235 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
2236 | if (rnp->node_kthread_task == NULL) | ||
2237 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
2238 | } | ||
2239 | } | ||
2240 | |||
1816 | /* | 2241 | /* |
1817 | * Handle CPU online/offline notification events. | 2242 | * Handle CPU online/offline notification events. |
1818 | */ | 2243 | */ |
@@ -1820,11 +2245,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
1820 | unsigned long action, void *hcpu) | 2245 | unsigned long action, void *hcpu) |
1821 | { | 2246 | { |
1822 | long cpu = (long)hcpu; | 2247 | long cpu = (long)hcpu; |
2248 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
2249 | struct rcu_node *rnp = rdp->mynode; | ||
1823 | 2250 | ||
1824 | switch (action) { | 2251 | switch (action) { |
1825 | case CPU_UP_PREPARE: | 2252 | case CPU_UP_PREPARE: |
1826 | case CPU_UP_PREPARE_FROZEN: | 2253 | case CPU_UP_PREPARE_FROZEN: |
1827 | rcu_online_cpu(cpu); | 2254 | rcu_online_cpu(cpu); |
2255 | rcu_online_kthreads(cpu); | ||
2256 | break; | ||
2257 | case CPU_ONLINE: | ||
2258 | case CPU_DOWN_FAILED: | ||
2259 | rcu_node_kthread_setaffinity(rnp, -1); | ||
2260 | rcu_cpu_kthread_setrt(cpu, 1); | ||
2261 | break; | ||
2262 | case CPU_DOWN_PREPARE: | ||
2263 | rcu_node_kthread_setaffinity(rnp, cpu); | ||
2264 | rcu_cpu_kthread_setrt(cpu, 0); | ||
1828 | break; | 2265 | break; |
1829 | case CPU_DYING: | 2266 | case CPU_DYING: |
1830 | case CPU_DYING_FROZEN: | 2267 | case CPU_DYING_FROZEN: |
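The notifier changes in the hunk above tie the new kthreads into CPU hotplug: they are spawned at CPU_UP_PREPARE, the node kthread's affinity is unrestricted and the per-CPU kthread is made real-time once the CPU is fully online (or a down attempt fails), and at CPU_DOWN_PREPARE the node kthread is kept off the outgoing CPU and the per-CPU kthread drops real-time priority. A minimal userspace sketch of that policy follows; names and messages are illustrative only, not kernel code.

#include <stdio.h>

enum hp_event { UP_PREPARE, ONLINE, DOWN_FAILED, DOWN_PREPARE };

/* Model of the dispatch added to rcu_cpu_notify(); actions are printed
 * rather than performed. */
static void rcu_hotplug_policy(enum hp_event ev, int cpu)
{
        switch (ev) {
        case UP_PREPARE:
                printf("cpu%d: init per-CPU data, spawn CPU and node kthreads\n", cpu);
                break;
        case ONLINE:
        case DOWN_FAILED:
                printf("cpu%d: node kthread may run anywhere, CPU kthread set to RT\n", cpu);
                break;
        case DOWN_PREPARE:
                printf("cpu%d: keep node kthread off this CPU, CPU kthread set non-RT\n", cpu);
                break;
        }
}

int main(void)
{
        rcu_hotplug_policy(UP_PREPARE, 3);
        rcu_hotplug_policy(ONLINE, 3);
        rcu_hotplug_policy(DOWN_PREPARE, 3);
        return 0;
}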
@@ -1943,10 +2380,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
1943 | j / rsp->levelspread[i - 1]; | 2380 | j / rsp->levelspread[i - 1]; |
1944 | } | 2381 | } |
1945 | rnp->level = i; | 2382 | rnp->level = i; |
1946 | INIT_LIST_HEAD(&rnp->blocked_tasks[0]); | 2383 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
1947 | INIT_LIST_HEAD(&rnp->blocked_tasks[1]); | ||
1948 | INIT_LIST_HEAD(&rnp->blocked_tasks[2]); | ||
1949 | INIT_LIST_HEAD(&rnp->blocked_tasks[3]); | ||
1950 | } | 2384 | } |
1951 | } | 2385 | } |
1952 | 2386 | ||
@@ -1968,7 +2402,6 @@ void __init rcu_init(void) | |||
1968 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 2402 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
1969 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 2403 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
1970 | __rcu_init_preempt(); | 2404 | __rcu_init_preempt(); |
1971 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
1972 | 2405 | ||
1973 | /* | 2406 | /* |
1974 | * We don't need protection against CPU-hotplug here because | 2407 | * We don't need protection against CPU-hotplug here because |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e8f057e44e3e..257664815d5d 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -91,6 +91,14 @@ struct rcu_dynticks { | |||
91 | /* remains even for nmi from irq handler. */ | 91 | /* remains even for nmi from irq handler. */ |
92 | }; | 92 | }; |
93 | 93 | ||
94 | /* RCU's kthread states for tracing. */ | ||
95 | #define RCU_KTHREAD_STOPPED 0 | ||
96 | #define RCU_KTHREAD_RUNNING 1 | ||
97 | #define RCU_KTHREAD_WAITING 2 | ||
98 | #define RCU_KTHREAD_OFFCPU 3 | ||
99 | #define RCU_KTHREAD_YIELDING 4 | ||
100 | #define RCU_KTHREAD_MAX 4 | ||
101 | |||
94 | /* | 102 | /* |
95 | * Definition for node within the RCU grace-period-detection hierarchy. | 103 | * Definition for node within the RCU grace-period-detection hierarchy. |
96 | */ | 104 | */ |
@@ -109,10 +117,11 @@ struct rcu_node { | |||
109 | /* an rcu_data structure, otherwise, each */ | 117 | /* an rcu_data structure, otherwise, each */ |
110 | /* bit corresponds to a child rcu_node */ | 118 | /* bit corresponds to a child rcu_node */ |
111 | /* structure. */ | 119 | /* structure. */ |
112 | unsigned long expmask; /* Groups that have ->blocked_tasks[] */ | 120 | unsigned long expmask; /* Groups that have ->blkd_tasks */ |
113 | /* elements that need to drain to allow the */ | 121 | /* elements that need to drain to allow the */ |
114 | /* current expedited grace period to */ | 122 | /* current expedited grace period to */ |
115 | /* complete (only for TREE_PREEMPT_RCU). */ | 123 | /* complete (only for TREE_PREEMPT_RCU). */ |
124 | unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ | ||
116 | unsigned long qsmaskinit; | 125 | unsigned long qsmaskinit; |
117 | /* Per-GP initial value for qsmask & expmask. */ | 126 | /* Per-GP initial value for qsmask & expmask. */ |
118 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 127 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
@@ -122,11 +131,68 @@ struct rcu_node { | |||
122 | u8 grpnum; /* CPU/group number for next level up. */ | 131 | u8 grpnum; /* CPU/group number for next level up. */ |
123 | u8 level; /* root is at level 0. */ | 132 | u8 level; /* root is at level 0. */ |
124 | struct rcu_node *parent; | 133 | struct rcu_node *parent; |
125 | struct list_head blocked_tasks[4]; | 134 | struct list_head blkd_tasks; |
126 | /* Tasks blocked in RCU read-side critsect. */ | 135 | /* Tasks blocked in RCU read-side critical */ |
127 | /* Grace period number (->gpnum) x blocked */ | 136 | /* section. Tasks are placed at the head */ |
128 | /* by tasks on the (x & 0x1) element of the */ | 137 | /* of this list and age towards the tail. */ |
129 | /* blocked_tasks[] array. */ | 138 | struct list_head *gp_tasks; |
139 | /* Pointer to the first task blocking the */ | ||
140 | /* current grace period, or NULL if there */ | ||
141 | /* is no such task. */ | ||
142 | struct list_head *exp_tasks; | ||
143 | /* Pointer to the first task blocking the */ | ||
144 | /* current expedited grace period, or NULL */ | ||
145 | /* if there is no such task. If there */ | ||
146 | /* is no current expedited grace period, */ | ||
147 | /* then there cannot be any such task. */ | ||
148 | #ifdef CONFIG_RCU_BOOST | ||
149 | struct list_head *boost_tasks; | ||
150 | /* Pointer to first task that needs to be */ | ||
151 | /* priority boosted, or NULL if no priority */ | ||
152 | /* boosting is needed for this rcu_node */ | ||
153 | /* structure. If there are no tasks */ | ||
154 | /* queued on this rcu_node structure that */ | ||
155 | /* are blocking the current grace period, */ | ||
156 | /* there can be no such task. */ | ||
157 | unsigned long boost_time; | ||
158 | /* When to start boosting (jiffies). */ | ||
159 | struct task_struct *boost_kthread_task; | ||
160 | /* kthread that takes care of priority */ | ||
161 | /* boosting for this rcu_node structure. */ | ||
162 | wait_queue_head_t boost_wq; | ||
163 | /* Wait queue on which to park the boost */ | ||
164 | /* kthread. */ | ||
165 | unsigned int boost_kthread_status; | ||
166 | /* State of boost_kthread_task for tracing. */ | ||
167 | unsigned long n_tasks_boosted; | ||
168 | /* Total number of tasks boosted. */ | ||
169 | unsigned long n_exp_boosts; | ||
170 | /* Number of tasks boosted for expedited GP. */ | ||
171 | unsigned long n_normal_boosts; | ||
172 | /* Number of tasks boosted for normal GP. */ | ||
173 | unsigned long n_balk_blkd_tasks; | ||
174 | /* Refused to boost: no blocked tasks. */ | ||
175 | unsigned long n_balk_exp_gp_tasks; | ||
176 | /* Refused to boost: nothing blocking GP. */ | ||
177 | unsigned long n_balk_boost_tasks; | ||
178 | /* Refused to boost: already boosting. */ | ||
179 | unsigned long n_balk_notblocked; | ||
180 | /* Refused to boost: RCU RS CS still running. */ | ||
181 | unsigned long n_balk_notyet; | ||
182 | /* Refused to boost: not yet time. */ | ||
183 | unsigned long n_balk_nos; | ||
184 | /* Refused to boost: not sure why, though. */ | ||
185 | /* This can happen due to race conditions. */ | ||
186 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
187 | struct task_struct *node_kthread_task; | ||
188 | /* kthread that takes care of this rcu_node */ | ||
189 | /* structure, for example, awakening the */ | ||
190 | /* per-CPU kthreads as needed. */ | ||
191 | wait_queue_head_t node_wq; | ||
192 | /* Wait queue on which to park the per-node */ | ||
193 | /* kthread. */ | ||
194 | unsigned int node_kthread_status; | ||
195 | /* State of node_kthread_task for tracing. */ | ||
130 | } ____cacheline_internodealigned_in_smp; | 196 | } ____cacheline_internodealigned_in_smp; |
131 | 197 | ||
132 | /* | 198 | /* |
@@ -175,7 +241,7 @@ struct rcu_data { | |||
175 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 241 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
176 | bool qs_pending; /* Core waits for quiesc state. */ | 242 | bool qs_pending; /* Core waits for quiesc state. */ |
177 | bool beenonline; /* CPU online at least once. */ | 243 | bool beenonline; /* CPU online at least once. */ |
178 | bool preemptable; /* Preemptable RCU? */ | 244 | bool preemptible; /* Preemptible RCU? */ |
179 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 245 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
180 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 246 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
181 | 247 | ||
@@ -254,7 +320,6 @@ struct rcu_data { | |||
254 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 320 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
255 | 321 | ||
256 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 322 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
257 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
258 | 323 | ||
259 | #ifdef CONFIG_PROVE_RCU | 324 | #ifdef CONFIG_PROVE_RCU |
260 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | 325 | #define RCU_STALL_DELAY_DELTA (5 * HZ) |
@@ -272,13 +337,6 @@ struct rcu_data { | |||
272 | /* scheduling clock irq */ | 337 | /* scheduling clock irq */ |
273 | /* before ratting on them. */ | 338 | /* before ratting on them. */ |
274 | 339 | ||
275 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE | ||
276 | #define RCU_CPU_STALL_SUPPRESS_INIT 0 | ||
277 | #else | ||
278 | #define RCU_CPU_STALL_SUPPRESS_INIT 1 | ||
279 | #endif | ||
280 | |||
281 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
282 | 340 | ||
283 | /* | 341 | /* |
284 | * RCU global state, including node hierarchy. This hierarchy is | 342 | * RCU global state, including node hierarchy. This hierarchy is |
@@ -325,12 +383,12 @@ struct rcu_state { | |||
325 | /* due to lock unavailable. */ | 383 | /* due to lock unavailable. */ |
326 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | 384 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ |
327 | /* due to no GP active. */ | 385 | /* due to no GP active. */ |
328 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
329 | unsigned long gp_start; /* Time at which GP started, */ | 386 | unsigned long gp_start; /* Time at which GP started, */ |
330 | /* but in jiffies. */ | 387 | /* but in jiffies. */ |
331 | unsigned long jiffies_stall; /* Time at which to check */ | 388 | unsigned long jiffies_stall; /* Time at which to check */ |
332 | /* for CPU stalls. */ | 389 | /* for CPU stalls. */ |
333 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 390 | unsigned long gp_max; /* Maximum GP duration in */ |
391 | /* jiffies. */ | ||
334 | char *name; /* Name of structure. */ | 392 | char *name; /* Name of structure. */ |
335 | }; | 393 | }; |
336 | 394 | ||
@@ -361,16 +419,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | |||
361 | static void rcu_bootup_announce(void); | 419 | static void rcu_bootup_announce(void); |
362 | long rcu_batches_completed(void); | 420 | long rcu_batches_completed(void); |
363 | static void rcu_preempt_note_context_switch(int cpu); | 421 | static void rcu_preempt_note_context_switch(int cpu); |
364 | static int rcu_preempted_readers(struct rcu_node *rnp); | 422 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
365 | #ifdef CONFIG_HOTPLUG_CPU | 423 | #ifdef CONFIG_HOTPLUG_CPU |
366 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 424 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
367 | unsigned long flags); | 425 | unsigned long flags); |
368 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 426 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
369 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
370 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 427 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
371 | static void rcu_print_task_stall(struct rcu_node *rnp); | 428 | static void rcu_print_task_stall(struct rcu_node *rnp); |
372 | static void rcu_preempt_stall_reset(void); | 429 | static void rcu_preempt_stall_reset(void); |
373 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
374 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 430 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
375 | #ifdef CONFIG_HOTPLUG_CPU | 431 | #ifdef CONFIG_HOTPLUG_CPU |
376 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 432 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
@@ -390,5 +446,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | |||
390 | static void rcu_preempt_send_cbs_to_online(void); | 446 | static void rcu_preempt_send_cbs_to_online(void); |
391 | static void __init __rcu_init_preempt(void); | 447 | static void __init __rcu_init_preempt(void); |
392 | static void rcu_needs_cpu_flush(void); | 448 | static void rcu_needs_cpu_flush(void); |
449 | static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp); | ||
450 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | ||
451 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
452 | cpumask_var_t cm); | ||
453 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | ||
454 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
455 | struct rcu_node *rnp, | ||
456 | int rnp_index); | ||
393 | 457 | ||
394 | #endif /* #ifndef RCU_TREE_NONCORE */ | 458 | #endif /* #ifndef RCU_TREE_NONCORE */ |
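The header changes above replace the four ->blocked_tasks[] lists with a single ->blkd_tasks list per rcu_node plus ->gp_tasks, ->exp_tasks and (under RCU_BOOST) ->boost_tasks pointers into that list. Newly blocked readers go to the head and age toward the tail, so each pointer marks the first entry, counting from the head, that still blocks the corresponding grace period. The standalone userspace sketch below models that invariant together with the enqueue and dequeue rules used later in rcutree_plugin.h; all names are illustrative and none of this is the kernel implementation.

/*
 * Minimal userspace model of the ->blkd_tasks scheme (illustrative only).
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *e, struct list_head *head)
{
        e->next = head->next;
        e->prev = head;
        head->next->prev = e;
        head->next = e;
}

static void list_del(struct list_head *e)
{
        e->next->prev = e->prev;
        e->prev->next = e->next;
}

struct task {                          /* stands in for task_struct */
        int pid;
        struct list_head rcu_node_entry;
};

struct node {                          /* stands in for rcu_node */
        struct list_head blkd_tasks;
        struct list_head *gp_tasks;
        struct list_head *exp_tasks;
        struct list_head *boost_tasks;
};

/* Next entry toward the tail, or NULL at the end (cf. rcu_next_node_entry()). */
static struct list_head *next_entry(struct node *n, struct list_head *e)
{
        return e->next == &n->blkd_tasks ? NULL : e->next;
}

/* Enqueue a newly blocked reader (cf. rcu_preempt_note_context_switch()). */
static void enqueue(struct node *n, struct task *t, int blocks_current_gp)
{
        if (blocks_current_gp && n->gp_tasks != NULL) {
                /* Queue just before ->gp_tasks and become the new boundary. */
                list_add(&t->rcu_node_entry, n->gp_tasks->prev);
                n->gp_tasks = &t->rcu_node_entry;
        } else {
                list_add(&t->rcu_node_entry, &n->blkd_tasks);
                if (blocks_current_gp)
                        n->gp_tasks = &t->rcu_node_entry;
        }
}

/* Dequeue at rcu_read_unlock() time, keeping the boundary pointers valid. */
static void dequeue(struct node *n, struct task *t)
{
        struct list_head *np = next_entry(n, &t->rcu_node_entry);

        if (n->gp_tasks == &t->rcu_node_entry)
                n->gp_tasks = np;
        if (n->exp_tasks == &t->rcu_node_entry)
                n->exp_tasks = np;
        if (n->boost_tasks == &t->rcu_node_entry)
                n->boost_tasks = np;
        list_del(&t->rcu_node_entry);
}

int main(void)
{
        struct node n = { .gp_tasks = NULL, .exp_tasks = NULL, .boost_tasks = NULL };
        struct task a = { .pid = 1 }, b = { .pid = 2 };

        list_init(&n.blkd_tasks);
        enqueue(&n, &a, 1);            /* a blocks the current GP */
        enqueue(&n, &b, 0);            /* b blocks only a later GP */
        dequeue(&n, &a);
        printf("current GP still blocked: %s\n", n.gp_tasks ? "yes" : "no");
        return 0;
}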
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a3638710dc67..3f6559a5f5cd 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) |
3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); | 55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); |
56 | #endif | 56 | #endif |
57 | #ifndef CONFIG_RCU_CPU_STALL_DETECTOR | ||
58 | printk(KERN_INFO | ||
59 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | ||
60 | #endif | ||
61 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | 57 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
62 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 58 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
63 | #endif | 59 | #endif |
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
70 | 66 | ||
71 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); | 67 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); |
72 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 68 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
69 | static struct rcu_state *rcu_state = &rcu_preempt_state; | ||
73 | 70 | ||
74 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 71 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
75 | 72 | ||
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); | |||
78 | */ | 75 | */ |
79 | static void __init rcu_bootup_announce(void) | 76 | static void __init rcu_bootup_announce(void) |
80 | { | 77 | { |
81 | printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); | 78 | printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); |
82 | rcu_bootup_announce_oddness(); | 79 | rcu_bootup_announce_oddness(); |
83 | } | 80 | } |
84 | 81 | ||
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void) | |||
111 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 108 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
112 | 109 | ||
113 | /* | 110 | /* |
114 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 111 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
115 | * that this just means that the task currently running on the CPU is | 112 | * that this just means that the task currently running on the CPU is |
116 | * not in a quiescent state. There might be any number of tasks blocked | 113 | * not in a quiescent state. There might be any number of tasks blocked |
117 | * while in an RCU read-side critical section. | 114 | * while in an RCU read-side critical section. |
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu) | |||
134 | * We have entered the scheduler, and the current task might soon be | 131 | * We have entered the scheduler, and the current task might soon be |
135 | * context-switched away from. If this task is in an RCU read-side | 132 | * context-switched away from. If this task is in an RCU read-side |
136 | * critical section, we will no longer be able to rely on the CPU to | 133 | * critical section, we will no longer be able to rely on the CPU to |
137 | * record that fact, so we enqueue the task on the appropriate entry | 134 | * record that fact, so we enqueue the task on the blkd_tasks list. |
138 | * of the blocked_tasks[] array. The task will dequeue itself when | 135 | * The task will dequeue itself when it exits the outermost enclosing |
139 | * it exits the outermost enclosing RCU read-side critical section. | 136 | * RCU read-side critical section. Therefore, the current grace period |
140 | * Therefore, the current grace period cannot be permitted to complete | 137 | * cannot be permitted to complete until the blkd_tasks list entries |
141 | * until the blocked_tasks[] entry indexed by the low-order bit of | 138 | * predating the current grace period drain, in other words, until |
142 | * rnp->gpnum empties. | 139 | * rnp->gp_tasks becomes NULL. |
143 | * | 140 | * |
144 | * Caller must disable preemption. | 141 | * Caller must disable preemption. |
145 | */ | 142 | */ |
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
147 | { | 144 | { |
148 | struct task_struct *t = current; | 145 | struct task_struct *t = current; |
149 | unsigned long flags; | 146 | unsigned long flags; |
150 | int phase; | ||
151 | struct rcu_data *rdp; | 147 | struct rcu_data *rdp; |
152 | struct rcu_node *rnp; | 148 | struct rcu_node *rnp; |
153 | 149 | ||
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
169 | * (i.e., this CPU has not yet passed through a quiescent | 165 | * (i.e., this CPU has not yet passed through a quiescent |
170 | * state for the current grace period), then as long | 166 | * state for the current grace period), then as long |
171 | * as that task remains queued, the current grace period | 167 | * as that task remains queued, the current grace period |
172 | * cannot end. | 168 | * cannot end. Note that there is some uncertainty as |
169 | * to exactly when the current grace period started. | ||
170 | * We take a conservative approach, which can result | ||
171 | * in unnecessarily waiting on tasks that started very | ||
172 | * slightly after the current grace period began. C'est | ||
173 | * la vie!!! | ||
173 | * | 174 | * |
174 | * But first, note that the current CPU must still be | 175 | * But first, note that the current CPU must still be |
175 | * on line! | 176 | * on line! |
176 | */ | 177 | */ |
177 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 178 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); |
178 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 179 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
179 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 180 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { |
180 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 181 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); |
182 | rnp->gp_tasks = &t->rcu_node_entry; | ||
183 | #ifdef CONFIG_RCU_BOOST | ||
184 | if (rnp->boost_tasks != NULL) | ||
185 | rnp->boost_tasks = rnp->gp_tasks; | ||
186 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
187 | } else { | ||
188 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
189 | if (rnp->qsmask & rdp->grpmask) | ||
190 | rnp->gp_tasks = &t->rcu_node_entry; | ||
191 | } | ||
181 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 192 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
182 | } | 193 | } |
183 | 194 | ||
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
196 | } | 207 | } |
197 | 208 | ||
198 | /* | 209 | /* |
199 | * Tree-preemptable RCU implementation for rcu_read_lock(). | 210 | * Tree-preemptible RCU implementation for rcu_read_lock(). |
200 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | 211 | * Just increment ->rcu_read_lock_nesting, shared state will be updated |
201 | * if we block. | 212 | * if we block. |
202 | */ | 213 | */ |
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); | |||
212 | * for the specified rcu_node structure. If the caller needs a reliable | 223 | * for the specified rcu_node structure. If the caller needs a reliable |
213 | * answer, it must hold the rcu_node's ->lock. | 224 | * answer, it must hold the rcu_node's ->lock. |
214 | */ | 225 | */ |
215 | static int rcu_preempted_readers(struct rcu_node *rnp) | 226 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
216 | { | 227 | { |
217 | int phase = rnp->gpnum & 0x1; | 228 | return rnp->gp_tasks != NULL; |
218 | |||
219 | return !list_empty(&rnp->blocked_tasks[phase]) || | ||
220 | !list_empty(&rnp->blocked_tasks[phase + 2]); | ||
221 | } | 229 | } |
222 | 230 | ||
223 | /* | 231 | /* |
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
233 | unsigned long mask; | 241 | unsigned long mask; |
234 | struct rcu_node *rnp_p; | 242 | struct rcu_node *rnp_p; |
235 | 243 | ||
236 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 244 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
237 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 245 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
238 | return; /* Still need more quiescent states! */ | 246 | return; /* Still need more quiescent states! */ |
239 | } | 247 | } |
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
257 | } | 265 | } |
258 | 266 | ||
259 | /* | 267 | /* |
268 | * Advance a ->blkd_tasks-list pointer to the next entry, returning | ||
269 | * NULL instead if at the end of the list. | ||
270 | */ | ||
271 | static struct list_head *rcu_next_node_entry(struct task_struct *t, | ||
272 | struct rcu_node *rnp) | ||
273 | { | ||
274 | struct list_head *np; | ||
275 | |||
276 | np = t->rcu_node_entry.next; | ||
277 | if (np == &rnp->blkd_tasks) | ||
278 | np = NULL; | ||
279 | return np; | ||
280 | } | ||
281 | |||
282 | /* | ||
260 | * Handle special cases during rcu_read_unlock(), such as needing to | 283 | * Handle special cases during rcu_read_unlock(), such as needing to |
261 | * notify RCU core processing or task having blocked during the RCU | 284 | * notify RCU core processing or task having blocked during the RCU |
262 | * read-side critical section. | 285 | * read-side critical section. |
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
266 | int empty; | 289 | int empty; |
267 | int empty_exp; | 290 | int empty_exp; |
268 | unsigned long flags; | 291 | unsigned long flags; |
292 | struct list_head *np; | ||
269 | struct rcu_node *rnp; | 293 | struct rcu_node *rnp; |
270 | int special; | 294 | int special; |
271 | 295 | ||
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
306 | break; | 330 | break; |
307 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 331 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
308 | } | 332 | } |
309 | empty = !rcu_preempted_readers(rnp); | 333 | empty = !rcu_preempt_blocked_readers_cgp(rnp); |
310 | empty_exp = !rcu_preempted_readers_exp(rnp); | 334 | empty_exp = !rcu_preempted_readers_exp(rnp); |
311 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 335 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
336 | np = rcu_next_node_entry(t, rnp); | ||
312 | list_del_init(&t->rcu_node_entry); | 337 | list_del_init(&t->rcu_node_entry); |
338 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
339 | rnp->gp_tasks = np; | ||
340 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
341 | rnp->exp_tasks = np; | ||
342 | #ifdef CONFIG_RCU_BOOST | ||
343 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
344 | rnp->boost_tasks = np; | ||
345 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
313 | t->rcu_blocked_node = NULL; | 346 | t->rcu_blocked_node = NULL; |
314 | 347 | ||
315 | /* | 348 | /* |
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
322 | else | 355 | else |
323 | rcu_report_unblock_qs_rnp(rnp, flags); | 356 | rcu_report_unblock_qs_rnp(rnp, flags); |
324 | 357 | ||
358 | #ifdef CONFIG_RCU_BOOST | ||
359 | /* Unboost if we were boosted. */ | ||
360 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
361 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | ||
362 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
363 | t->rcu_boost_mutex = NULL; | ||
364 | } | ||
365 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
366 | |||
325 | /* | 367 | /* |
326 | * If this was the last task on the expedited lists, | 368 | * If this was the last task on the expedited lists, |
327 | * then we need to report up the rcu_node hierarchy. | 369 | * then we need to report up the rcu_node hierarchy. |
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
334 | } | 376 | } |
335 | 377 | ||
336 | /* | 378 | /* |
337 | * Tree-preemptable RCU implementation for rcu_read_unlock(). | 379 | * Tree-preemptible RCU implementation for rcu_read_unlock(). |
338 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | 380 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost |
339 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | 381 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then |
340 | * invoke rcu_read_unlock_special() to clean up after a context switch | 382 | * invoke rcu_read_unlock_special() to clean up after a context switch |
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void) | |||
356 | } | 398 | } |
357 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 399 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
358 | 400 | ||
359 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
360 | |||
361 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | 401 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE |
362 | 402 | ||
363 | /* | 403 | /* |
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); | |||
367 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | 407 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) |
368 | { | 408 | { |
369 | unsigned long flags; | 409 | unsigned long flags; |
370 | struct list_head *lp; | ||
371 | int phase; | ||
372 | struct task_struct *t; | 410 | struct task_struct *t; |
373 | 411 | ||
374 | if (rcu_preempted_readers(rnp)) { | 412 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
375 | raw_spin_lock_irqsave(&rnp->lock, flags); | 413 | return; |
376 | phase = rnp->gpnum & 0x1; | 414 | raw_spin_lock_irqsave(&rnp->lock, flags); |
377 | lp = &rnp->blocked_tasks[phase]; | 415 | t = list_entry(rnp->gp_tasks, |
378 | list_for_each_entry(t, lp, rcu_node_entry) | 416 | struct task_struct, rcu_node_entry); |
379 | sched_show_task(t); | 417 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
380 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 418 | sched_show_task(t); |
381 | } | 419 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
382 | } | 420 | } |
383 | 421 | ||
384 | /* | 422 | /* |
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
408 | */ | 446 | */ |
409 | static void rcu_print_task_stall(struct rcu_node *rnp) | 447 | static void rcu_print_task_stall(struct rcu_node *rnp) |
410 | { | 448 | { |
411 | struct list_head *lp; | ||
412 | int phase; | ||
413 | struct task_struct *t; | 449 | struct task_struct *t; |
414 | 450 | ||
415 | if (rcu_preempted_readers(rnp)) { | 451 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
416 | phase = rnp->gpnum & 0x1; | 452 | return; |
417 | lp = &rnp->blocked_tasks[phase]; | 453 | t = list_entry(rnp->gp_tasks, |
418 | list_for_each_entry(t, lp, rcu_node_entry) | 454 | struct task_struct, rcu_node_entry); |
419 | printk(" P%d", t->pid); | 455 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
420 | } | 456 | printk(" P%d", t->pid); |
421 | } | 457 | } |
422 | 458 | ||
423 | /* | 459 | /* |
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void) | |||
430 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 466 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; |
431 | } | 467 | } |
432 | 468 | ||
433 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
434 | |||
435 | /* | 469 | /* |
436 | * Check that the list of blocked tasks for the newly completed grace | 470 | * Check that the list of blocked tasks for the newly completed grace |
437 | * period is in fact empty. It is a serious bug to complete a grace | 471 | * period is in fact empty. It is a serious bug to complete a grace |
438 | * period that still has RCU readers blocked! This function must be | 472 | * period that still has RCU readers blocked! This function must be |
439 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | 473 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock |
440 | * must be held by the caller. | 474 | * must be held by the caller. |
475 | * | ||
476 | * Also, if there are blocked tasks on the list, they automatically | ||
477 | * block the newly created grace period, so set up ->gp_tasks accordingly. | ||
441 | */ | 478 | */ |
442 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 479 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
443 | { | 480 | { |
444 | WARN_ON_ONCE(rcu_preempted_readers(rnp)); | 481 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
482 | if (!list_empty(&rnp->blkd_tasks)) | ||
483 | rnp->gp_tasks = rnp->blkd_tasks.next; | ||
445 | WARN_ON_ONCE(rnp->qsmask); | 484 | WARN_ON_ONCE(rnp->qsmask); |
446 | } | 485 | } |
447 | 486 | ||
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
465 | struct rcu_node *rnp, | 504 | struct rcu_node *rnp, |
466 | struct rcu_data *rdp) | 505 | struct rcu_data *rdp) |
467 | { | 506 | { |
468 | int i; | ||
469 | struct list_head *lp; | 507 | struct list_head *lp; |
470 | struct list_head *lp_root; | 508 | struct list_head *lp_root; |
471 | int retval = 0; | 509 | int retval = 0; |
472 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 510 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
473 | struct task_struct *tp; | 511 | struct task_struct *t; |
474 | 512 | ||
475 | if (rnp == rnp_root) { | 513 | if (rnp == rnp_root) { |
476 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | 514 | WARN_ONCE(1, "Last CPU thought to be offlined?"); |
477 | return 0; /* Shouldn't happen: at least one CPU online. */ | 515 | return 0; /* Shouldn't happen: at least one CPU online. */ |
478 | } | 516 | } |
479 | WARN_ON_ONCE(rnp != rdp->mynode && | 517 | |
480 | (!list_empty(&rnp->blocked_tasks[0]) || | 518 | /* If we are on an internal node, complain bitterly. */ |
481 | !list_empty(&rnp->blocked_tasks[1]) || | 519 | WARN_ON_ONCE(rnp != rdp->mynode); |
482 | !list_empty(&rnp->blocked_tasks[2]) || | ||
483 | !list_empty(&rnp->blocked_tasks[3]))); | ||
484 | 520 | ||
485 | /* | 521 | /* |
486 | * Move tasks up to root rcu_node. Rely on the fact that the | 522 | * Move tasks up to root rcu_node. Don't try to get fancy for |
487 | * root rcu_node can be at most one ahead of the rest of the | 523 | * this corner-case operation -- just put this node's tasks |
488 | * rcu_nodes in terms of gp_num value. This fact allows us to | 524 | * at the head of the root node's list, and update the root node's |
489 | * move the blocked_tasks[] array directly, element by element. | 525 | * ->gp_tasks and ->exp_tasks pointers to those of this node, |
526 | * if non-NULL. This might result in waiting for more tasks than | ||
527 | * absolutely necessary, but this is a good performance/complexity | ||
528 | * tradeoff. | ||
490 | */ | 529 | */ |
491 | if (rcu_preempted_readers(rnp)) | 530 | if (rcu_preempt_blocked_readers_cgp(rnp)) |
492 | retval |= RCU_OFL_TASKS_NORM_GP; | 531 | retval |= RCU_OFL_TASKS_NORM_GP; |
493 | if (rcu_preempted_readers_exp(rnp)) | 532 | if (rcu_preempted_readers_exp(rnp)) |
494 | retval |= RCU_OFL_TASKS_EXP_GP; | 533 | retval |= RCU_OFL_TASKS_EXP_GP; |
495 | for (i = 0; i < 4; i++) { | 534 | lp = &rnp->blkd_tasks; |
496 | lp = &rnp->blocked_tasks[i]; | 535 | lp_root = &rnp_root->blkd_tasks; |
497 | lp_root = &rnp_root->blocked_tasks[i]; | 536 | while (!list_empty(lp)) { |
498 | while (!list_empty(lp)) { | 537 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); |
499 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 538 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
500 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 539 | list_del(&t->rcu_node_entry); |
501 | list_del(&tp->rcu_node_entry); | 540 | t->rcu_blocked_node = rnp_root; |
502 | tp->rcu_blocked_node = rnp_root; | 541 | list_add(&t->rcu_node_entry, lp_root); |
503 | list_add(&tp->rcu_node_entry, lp_root); | 542 | if (&t->rcu_node_entry == rnp->gp_tasks) |
504 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 543 | rnp_root->gp_tasks = rnp->gp_tasks; |
505 | } | 544 | if (&t->rcu_node_entry == rnp->exp_tasks) |
545 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
546 | #ifdef CONFIG_RCU_BOOST | ||
547 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
548 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
549 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
550 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
506 | } | 551 | } |
552 | |||
553 | #ifdef CONFIG_RCU_BOOST | ||
554 | /* In case root is being boosted and leaf is not. */ | ||
555 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
556 | if (rnp_root->boost_tasks != NULL && | ||
557 | rnp_root->boost_tasks != rnp_root->gp_tasks) | ||
558 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
559 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
560 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
561 | |||
562 | rnp->gp_tasks = NULL; | ||
563 | rnp->exp_tasks = NULL; | ||
507 | return retval; | 564 | return retval; |
508 | } | 565 | } |
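The rewritten rcu_preempt_offline_tasks() above no longer walks four lists; it moves each blocked reader to the head of the root rcu_node's ->blkd_tasks list and lets the root inherit ->gp_tasks/->exp_tasks when the entries they point to are moved. A compact sketch of that idea, reusing the userspace list model shown after the rcutree.h hunk (so not standalone, and still only illustrative):

/* Migrate an outgoing leaf's blocked readers to the root (model, not kernel code). */
static void migrate_blkd_tasks(struct node *root, struct node *leaf)
{
        /* Pop entries from the leaf's head and push them onto the root's head. */
        while (leaf->blkd_tasks.next != &leaf->blkd_tasks) {
                struct list_head *e = leaf->blkd_tasks.next;

                list_del(e);
                list_add(e, &root->blkd_tasks);
                if (e == leaf->gp_tasks)
                        root->gp_tasks = e;     /* root now waits on this entry */
                if (e == leaf->exp_tasks)
                        root->exp_tasks = e;
        }
        leaf->gp_tasks = NULL;
        leaf->exp_tasks = NULL;
}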
509 | 566 | ||
510 | /* | 567 | /* |
511 | * Do CPU-offline processing for preemptable RCU. | 568 | * Do CPU-offline processing for preemptible RCU. |
512 | */ | 569 | */ |
513 | static void rcu_preempt_offline_cpu(int cpu) | 570 | static void rcu_preempt_offline_cpu(int cpu) |
514 | { | 571 | { |
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
537 | } | 594 | } |
538 | 595 | ||
539 | /* | 596 | /* |
540 | * Process callbacks for preemptable RCU. | 597 | * Process callbacks for preemptible RCU. |
541 | */ | 598 | */ |
542 | static void rcu_preempt_process_callbacks(void) | 599 | static void rcu_preempt_process_callbacks(void) |
543 | { | 600 | { |
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void) | |||
546 | } | 603 | } |
547 | 604 | ||
548 | /* | 605 | /* |
549 | * Queue a preemptable-RCU callback for invocation after a grace period. | 606 | * Queue a preemptible-RCU callback for invocation after a grace period. |
550 | */ | 607 | */ |
551 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 608 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
552 | { | 609 | { |
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | |||
594 | */ | 651 | */ |
595 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | 652 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) |
596 | { | 653 | { |
597 | return !list_empty(&rnp->blocked_tasks[2]) || | 654 | return rnp->exp_tasks != NULL; |
598 | !list_empty(&rnp->blocked_tasks[3]); | ||
599 | } | 655 | } |
600 | 656 | ||
601 | /* | 657 | /* |
@@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
655 | static void | 711 | static void |
656 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 712 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
657 | { | 713 | { |
658 | int must_wait; | 714 | unsigned long flags; |
715 | int must_wait = 0; | ||
659 | 716 | ||
660 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 717 | raw_spin_lock_irqsave(&rnp->lock, flags); |
661 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 718 | if (list_empty(&rnp->blkd_tasks)) |
662 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 719 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
663 | must_wait = rcu_preempted_readers_exp(rnp); | 720 | else { |
664 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 721 | rnp->exp_tasks = rnp->blkd_tasks.next; |
722 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
723 | must_wait = 1; | ||
724 | } | ||
665 | if (!must_wait) | 725 | if (!must_wait) |
666 | rcu_report_exp_rnp(rsp, rnp); | 726 | rcu_report_exp_rnp(rsp, rnp); |
667 | } | 727 | } |
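With a single ->blkd_tasks list, expedited-grace-period setup reduces to the choice shown above: a leaf with no blocked readers reports quiescence at once, otherwise every queued reader must drain, so ->exp_tasks starts at the head of the list. In the same userspace model (again not standalone, illustrative only):

/* Returns nonzero if this node's readers must drain before the
 * expedited grace period can complete (model, not kernel code). */
static int exp_init(struct node *n)
{
        if (n->blkd_tasks.next == &n->blkd_tasks)
                return 0;                       /* empty: report quiescence now */
        n->exp_tasks = n->blkd_tasks.next;      /* all queued readers block it */
        return 1;
}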
@@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
669 | /* | 729 | /* |
670 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | 730 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea |
671 | * is to invoke synchronize_sched_expedited() to push all the tasks to | 731 | * is to invoke synchronize_sched_expedited() to push all the tasks to |
672 | * the ->blocked_tasks[] lists, move all entries from the first set of | 732 | * the ->blkd_tasks lists and wait for this list to drain. |
673 | * ->blocked_tasks[] lists to the second set, and finally wait for this | ||
674 | * second set to drain. | ||
675 | */ | 733 | */ |
676 | void synchronize_rcu_expedited(void) | 734 | void synchronize_rcu_expedited(void) |
677 | { | 735 | { |
@@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void) | |||
703 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | 761 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) |
704 | goto unlock_mb_ret; /* Others did our work for us. */ | 762 | goto unlock_mb_ret; /* Others did our work for us. */ |
705 | 763 | ||
706 | /* force all RCU readers onto blocked_tasks[]. */ | 764 | /* force all RCU readers onto ->blkd_tasks lists. */ |
707 | synchronize_sched_expedited(); | 765 | synchronize_sched_expedited(); |
708 | 766 | ||
709 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 767 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
@@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void) | |||
715 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 773 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
716 | } | 774 | } |
717 | 775 | ||
718 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 776 | /* Snapshot current state of ->blkd_tasks lists. */ |
719 | rcu_for_each_leaf_node(rsp, rnp) | 777 | rcu_for_each_leaf_node(rsp, rnp) |
720 | sync_rcu_preempt_exp_init(rsp, rnp); | 778 | sync_rcu_preempt_exp_init(rsp, rnp); |
721 | if (NUM_RCU_NODES > 1) | 779 | if (NUM_RCU_NODES > 1) |
@@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void) | |||
723 | 781 | ||
724 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 782 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
725 | 783 | ||
726 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 784 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
727 | rnp = rcu_get_root(rsp); | 785 | rnp = rcu_get_root(rsp); |
728 | wait_event(sync_rcu_preempt_exp_wq, | 786 | wait_event(sync_rcu_preempt_exp_wq, |
729 | sync_rcu_preempt_exp_done(rnp)); | 787 | sync_rcu_preempt_exp_done(rnp)); |
@@ -739,7 +797,7 @@ mb_ret: | |||
739 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 797 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
740 | 798 | ||
741 | /* | 799 | /* |
742 | * Check to see if there is any immediate preemptable-RCU-related work | 800 | * Check to see if there is any immediate preemptible-RCU-related work |
743 | * to be done. | 801 | * to be done. |
744 | */ | 802 | */ |
745 | static int rcu_preempt_pending(int cpu) | 803 | static int rcu_preempt_pending(int cpu) |
@@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu) | |||
749 | } | 807 | } |
750 | 808 | ||
751 | /* | 809 | /* |
752 | * Does preemptable RCU need the CPU to stay out of dynticks mode? | 810 | * Does preemptible RCU need the CPU to stay out of dynticks mode? |
753 | */ | 811 | */ |
754 | static int rcu_preempt_needs_cpu(int cpu) | 812 | static int rcu_preempt_needs_cpu(int cpu) |
755 | { | 813 | { |
@@ -766,7 +824,7 @@ void rcu_barrier(void) | |||
766 | EXPORT_SYMBOL_GPL(rcu_barrier); | 824 | EXPORT_SYMBOL_GPL(rcu_barrier); |
767 | 825 | ||
768 | /* | 826 | /* |
769 | * Initialize preemptable RCU's per-CPU data. | 827 | * Initialize preemptible RCU's per-CPU data. |
770 | */ | 828 | */ |
771 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 829 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
772 | { | 830 | { |
@@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
774 | } | 832 | } |
775 | 833 | ||
776 | /* | 834 | /* |
777 | * Move preemptable RCU's callbacks from dying CPU to other online CPU. | 835 | * Move preemptible RCU's callbacks from dying CPU to other online CPU. |
778 | */ | 836 | */ |
779 | static void rcu_preempt_send_cbs_to_online(void) | 837 | static void rcu_preempt_send_cbs_to_online(void) |
780 | { | 838 | { |
@@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void) | |||
782 | } | 840 | } |
783 | 841 | ||
784 | /* | 842 | /* |
785 | * Initialize preemptable RCU's state structures. | 843 | * Initialize preemptible RCU's state structures. |
786 | */ | 844 | */ |
787 | static void __init __rcu_init_preempt(void) | 845 | static void __init __rcu_init_preempt(void) |
788 | { | 846 | { |
@@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void) | |||
790 | } | 848 | } |
791 | 849 | ||
792 | /* | 850 | /* |
793 | * Check for a task exiting while in a preemptable-RCU read-side | 851 | * Check for a task exiting while in a preemptible-RCU read-side |
794 | * critical section, clean up if so. No need to issue warnings, | 852 | * critical section, clean up if so. No need to issue warnings, |
795 | * as debug_check_no_locks_held() already does this if lockdep | 853 | * as debug_check_no_locks_held() already does this if lockdep |
796 | * is enabled. | 854 | * is enabled. |
@@ -802,11 +860,13 @@ void exit_rcu(void) | |||
802 | if (t->rcu_read_lock_nesting == 0) | 860 | if (t->rcu_read_lock_nesting == 0) |
803 | return; | 861 | return; |
804 | t->rcu_read_lock_nesting = 1; | 862 | t->rcu_read_lock_nesting = 1; |
805 | rcu_read_unlock(); | 863 | __rcu_read_unlock(); |
806 | } | 864 | } |
807 | 865 | ||
808 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 866 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
809 | 867 | ||
868 | static struct rcu_state *rcu_state = &rcu_sched_state; | ||
869 | |||
810 | /* | 870 | /* |
811 | * Tell them what RCU they are running. | 871 | * Tell them what RCU they are running. |
812 | */ | 872 | */ |
@@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void) | |||
836 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 896 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
837 | 897 | ||
838 | /* | 898 | /* |
839 | * Because preemptable RCU does not exist, we never have to check for | 899 | * Because preemptible RCU does not exist, we never have to check for |
840 | * CPUs being in quiescent states. | 900 | * CPUs being in quiescent states. |
841 | */ | 901 | */ |
842 | static void rcu_preempt_note_context_switch(int cpu) | 902 | static void rcu_preempt_note_context_switch(int cpu) |
@@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
844 | } | 904 | } |
845 | 905 | ||
846 | /* | 906 | /* |
847 | * Because preemptable RCU does not exist, there are never any preempted | 907 | * Because preemptible RCU does not exist, there are never any preempted |
848 | * RCU readers. | 908 | * RCU readers. |
849 | */ | 909 | */ |
850 | static int rcu_preempted_readers(struct rcu_node *rnp) | 910 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
851 | { | 911 | { |
852 | return 0; | 912 | return 0; |
853 | } | 913 | } |
@@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
862 | 922 | ||
863 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 923 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
864 | 924 | ||
865 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
866 | |||
867 | /* | 925 | /* |
868 | * Because preemptable RCU does not exist, we never have to check for | 926 | * Because preemptible RCU does not exist, we never have to check for |
869 | * tasks blocked within RCU read-side critical sections. | 927 | * tasks blocked within RCU read-side critical sections. |
870 | */ | 928 | */ |
871 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | 929 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) |
@@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
873 | } | 931 | } |
874 | 932 | ||
875 | /* | 933 | /* |
876 | * Because preemptable RCU does not exist, we never have to check for | 934 | * Because preemptible RCU does not exist, we never have to check for |
877 | * tasks blocked within RCU read-side critical sections. | 935 | * tasks blocked within RCU read-side critical sections. |
878 | */ | 936 | */ |
879 | static void rcu_print_task_stall(struct rcu_node *rnp) | 937 | static void rcu_print_task_stall(struct rcu_node *rnp) |
@@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void) | |||
888 | { | 946 | { |
889 | } | 947 | } |
890 | 948 | ||
891 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
892 | |||
893 | /* | 949 | /* |
894 | * Because there is no preemptable RCU, there can be no readers blocked, | 950 | * Because there is no preemptible RCU, there can be no readers blocked, |
895 | * so there is no need to check for blocked tasks. So check only for | 951 | * so there is no need to check for blocked tasks. So check only for |
896 | * bogus qsmask values. | 952 | * bogus qsmask values. |
897 | */ | 953 | */ |
@@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
903 | #ifdef CONFIG_HOTPLUG_CPU | 959 | #ifdef CONFIG_HOTPLUG_CPU |
904 | 960 | ||
905 | /* | 961 | /* |
906 | * Because preemptable RCU does not exist, it never needs to migrate | 962 | * Because preemptible RCU does not exist, it never needs to migrate |
907 | * tasks that were blocked within RCU read-side critical sections, and | 963 | * tasks that were blocked within RCU read-side critical sections, and |
908 | * such non-existent tasks cannot possibly have been blocking the current | 964 | * such non-existent tasks cannot possibly have been blocking the current |
909 | * grace period. | 965 | * grace period. |
@@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
916 | } | 972 | } |
917 | 973 | ||
918 | /* | 974 | /* |
919 | * Because preemptable RCU does not exist, it never needs CPU-offline | 975 | * Because preemptible RCU does not exist, it never needs CPU-offline |
920 | * processing. | 976 | * processing. |
921 | */ | 977 | */ |
922 | static void rcu_preempt_offline_cpu(int cpu) | 978 | static void rcu_preempt_offline_cpu(int cpu) |
@@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu) | |||
926 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 982 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
927 | 983 | ||
928 | /* | 984 | /* |
929 | * Because preemptable RCU does not exist, it never has any callbacks | 985 | * Because preemptible RCU does not exist, it never has any callbacks |
930 | * to check. | 986 | * to check. |
931 | */ | 987 | */ |
932 | static void rcu_preempt_check_callbacks(int cpu) | 988 | static void rcu_preempt_check_callbacks(int cpu) |
@@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
934 | } | 990 | } |
935 | 991 | ||
936 | /* | 992 | /* |
937 | * Because preemptable RCU does not exist, it never has any callbacks | 993 | * Because preemptible RCU does not exist, it never has any callbacks |
938 | * to process. | 994 | * to process. |
939 | */ | 995 | */ |
940 | static void rcu_preempt_process_callbacks(void) | 996 | static void rcu_preempt_process_callbacks(void) |
@@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void) | |||
943 | 999 | ||
944 | /* | 1000 | /* |
945 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1001 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
946 | * But because preemptable RCU does not exist, map to rcu-sched. | 1002 | * But because preemptible RCU does not exist, map to rcu-sched. |
947 | */ | 1003 | */ |
948 | void synchronize_rcu_expedited(void) | 1004 | void synchronize_rcu_expedited(void) |
949 | { | 1005 | { |
@@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
954 | #ifdef CONFIG_HOTPLUG_CPU | 1010 | #ifdef CONFIG_HOTPLUG_CPU |
955 | 1011 | ||
956 | /* | 1012 | /* |
957 | * Because preemptable RCU does not exist, there is never any need to | 1013 | * Because preemptible RCU does not exist, there is never any need to |
958 | * report on tasks preempted in RCU read-side critical sections during | 1014 | * report on tasks preempted in RCU read-side critical sections during |
959 | * expedited RCU grace periods. | 1015 | * expedited RCU grace periods. |
960 | */ | 1016 | */ |
@@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
966 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1022 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
967 | 1023 | ||
968 | /* | 1024 | /* |
969 | * Because preemptable RCU does not exist, it never has any work to do. | 1025 | * Because preemptible RCU does not exist, it never has any work to do. |
970 | */ | 1026 | */ |
971 | static int rcu_preempt_pending(int cpu) | 1027 | static int rcu_preempt_pending(int cpu) |
972 | { | 1028 | { |
@@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu) | |||
974 | } | 1030 | } |
975 | 1031 | ||
976 | /* | 1032 | /* |
977 | * Because preemptable RCU does not exist, it never needs any CPU. | 1033 | * Because preemptible RCU does not exist, it never needs any CPU. |
978 | */ | 1034 | */ |
979 | static int rcu_preempt_needs_cpu(int cpu) | 1035 | static int rcu_preempt_needs_cpu(int cpu) |
980 | { | 1036 | { |
@@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu) | |||
982 | } | 1038 | } |
983 | 1039 | ||
984 | /* | 1040 | /* |
985 | * Because preemptable RCU does not exist, rcu_barrier() is just | 1041 | * Because preemptible RCU does not exist, rcu_barrier() is just |
986 | * another name for rcu_barrier_sched(). | 1042 | * another name for rcu_barrier_sched(). |
987 | */ | 1043 | */ |
988 | void rcu_barrier(void) | 1044 | void rcu_barrier(void) |
@@ -992,7 +1048,7 @@ void rcu_barrier(void) | |||
992 | EXPORT_SYMBOL_GPL(rcu_barrier); | 1048 | EXPORT_SYMBOL_GPL(rcu_barrier); |
993 | 1049 | ||
994 | /* | 1050 | /* |
995 | * Because preemptable RCU does not exist, there is no per-CPU | 1051 | * Because preemptible RCU does not exist, there is no per-CPU |
996 | * data to initialize. | 1052 | * data to initialize. |
997 | */ | 1053 | */ |
998 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 1054 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
@@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
1000 | } | 1056 | } |
1001 | 1057 | ||
1002 | /* | 1058 | /* |
1003 | * Because there is no preemptable RCU, there are no callbacks to move. | 1059 | * Because there is no preemptible RCU, there are no callbacks to move. |
1004 | */ | 1060 | */ |
1005 | static void rcu_preempt_send_cbs_to_online(void) | 1061 | static void rcu_preempt_send_cbs_to_online(void) |
1006 | { | 1062 | { |
1007 | } | 1063 | } |
1008 | 1064 | ||
1009 | /* | 1065 | /* |
1010 | * Because preemptable RCU does not exist, it need not be initialized. | 1066 | * Because preemptible RCU does not exist, it need not be initialized. |
1011 | */ | 1067 | */ |
1012 | static void __init __rcu_init_preempt(void) | 1068 | static void __init __rcu_init_preempt(void) |
1013 | { | 1069 | { |
@@ -1015,6 +1071,276 @@ static void __init __rcu_init_preempt(void) | |||
1015 | 1071 | ||
1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1072 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1017 | 1073 | ||
1074 | #ifdef CONFIG_RCU_BOOST | ||
1075 | |||
1076 | #include "rtmutex_common.h" | ||
1077 | |||
1078 | #ifdef CONFIG_RCU_TRACE | ||
1079 | |||
1080 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1081 | { | ||
1082 | if (list_empty(&rnp->blkd_tasks)) | ||
1083 | rnp->n_balk_blkd_tasks++; | ||
1084 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
1085 | rnp->n_balk_exp_gp_tasks++; | ||
1086 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
1087 | rnp->n_balk_boost_tasks++; | ||
1088 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
1089 | rnp->n_balk_notblocked++; | ||
1090 | else if (rnp->gp_tasks != NULL && | ||
1091 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
1092 | rnp->n_balk_notyet++; | ||
1093 | else | ||
1094 | rnp->n_balk_nos++; | ||
1095 | } | ||
1096 | |||
1097 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
1098 | |||
1099 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1100 | { | ||
1101 | } | ||
1102 | |||
1103 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
1104 | |||
1105 | /* | ||
1106 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | ||
1107 | * or ->boost_tasks, advancing the pointer to the next task in the | ||
1108 | * ->blkd_tasks list. | ||
1109 | * | ||
1110 | * Note that irqs must be enabled: boosting the task can block. | ||
1111 | * Returns 1 if there are more tasks needing to be boosted. | ||
1112 | */ | ||
1113 | static int rcu_boost(struct rcu_node *rnp) | ||
1114 | { | ||
1115 | unsigned long flags; | ||
1116 | struct rt_mutex mtx; | ||
1117 | struct task_struct *t; | ||
1118 | struct list_head *tb; | ||
1119 | |||
1120 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | ||
1121 | return 0; /* Nothing left to boost. */ | ||
1122 | |||
1123 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1124 | |||
1125 | /* | ||
1126 | * Recheck under the lock: all tasks in need of boosting | ||
1127 | * might exit their RCU read-side critical sections on their own. | ||
1128 | */ | ||
1129 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { | ||
1130 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1131 | return 0; | ||
1132 | } | ||
1133 | |||
1134 | /* | ||
1135 | * Preferentially boost tasks blocking expedited grace periods. | ||
1136 | * This cannot starve the normal grace periods because a second | ||
1137 | * expedited grace period must boost all blocked tasks, including | ||
1138 | * those blocking the pre-existing normal grace period. | ||
1139 | */ | ||
1140 | if (rnp->exp_tasks != NULL) { | ||
1141 | tb = rnp->exp_tasks; | ||
1142 | rnp->n_exp_boosts++; | ||
1143 | } else { | ||
1144 | tb = rnp->boost_tasks; | ||
1145 | rnp->n_normal_boosts++; | ||
1146 | } | ||
1147 | rnp->n_tasks_boosted++; | ||
1148 | |||
1149 | /* | ||
1150 | * We boost task t by manufacturing an rt_mutex that appears to | ||
1151 | * be held by task t. We leave a pointer to that rt_mutex where | ||
1152 | * task t can find it, and task t will release the mutex when it | ||
1153 | * exits its outermost RCU read-side critical section. Then | ||
1154 | * simply acquiring this artificial rt_mutex will boost task | ||
1155 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
1156 | * | ||
1157 | * Note that task t must acquire rnp->lock to remove itself from | ||
1158 | * the ->blkd_tasks list, which it will do from exit() if from | ||
1159 | * nowhere else. We therefore are guaranteed that task t will | ||
1160 | * stay around at least until we drop rnp->lock. Note that | ||
1161 | * rnp->lock also resolves races between our priority boosting | ||
1162 | * and task t's exiting its outermost RCU read-side critical | ||
1163 | * section. | ||
1164 | */ | ||
1165 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
1166 | rt_mutex_init_proxy_locked(&mtx, t); | ||
1167 | t->rcu_boost_mutex = &mtx; | ||
1168 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | ||
1169 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1170 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | ||
1171 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
1172 | |||
1173 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | ||
1174 | } | ||
1175 | |||
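The acquire/release pair above is the entire boosting mechanism: blocking on a priority-inheritance mutex that the preempted reader appears to own lends the booster's priority to that reader until it leaves its critical section. The effect can be sketched in user space with a POSIX priority-inheritance mutex; this is a hypothetical analogue for illustration, not the kernel rt_mutex API, and the priority lending is only observable when the threads run under real-time policies (e.g. SCHED_FIFO) with distinct priorities, which requires privileges.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t pi_mutex;	/* stands in for the proxy-locked rt_mutex */

static void *preempted_reader(void *arg)
{
	pthread_mutex_lock(&pi_mutex);		/* reader "owns" the mutex, as if proxy-locked */
	sleep(1);				/* long read-side critical section */
	pthread_mutex_unlock(&pi_mutex);	/* analogous to dropping the boost at rcu_read_unlock() */
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_t reader;

	pthread_mutexattr_init(&attr);
	/* With PTHREAD_PRIO_INHERIT, a blocked higher-priority waiter boosts the owner. */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&pi_mutex, &attr);

	pthread_create(&reader, NULL, preempted_reader, NULL);
	usleep(100 * 1000);			/* give the reader time to take the mutex */

	/* Blocking here lends this thread's priority to the reader, just as
	 * rt_mutex_lock(&mtx) above lends the booster's priority to task t. */
	pthread_mutex_lock(&pi_mutex);
	pthread_mutex_unlock(&pi_mutex);

	pthread_join(reader, NULL);
	printf("reader released the mutex; boost over\n");
	return 0;
}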
1176 | /* | ||
1177 | * Timer handler to initiate waking up of boost kthreads that | ||
1178 | * have yielded the CPU due to excessive numbers of tasks to | ||
1179 | * boost. We wake up the per-rcu_node kthread, which in turn | ||
1180 | * will wake up the booster kthread. | ||
1181 | */ | ||
1182 | static void rcu_boost_kthread_timer(unsigned long arg) | ||
1183 | { | ||
1184 | invoke_rcu_node_kthread((struct rcu_node *)arg); | ||
1185 | } | ||
1186 | |||
1187 | /* | ||
1188 | * Priority-boosting kthread. One per leaf rcu_node and one for the | ||
1189 | * root rcu_node. | ||
1190 | */ | ||
1191 | static int rcu_boost_kthread(void *arg) | ||
1192 | { | ||
1193 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1194 | int spincnt = 0; | ||
1195 | int more2boost; | ||
1196 | |||
1197 | for (;;) { | ||
1198 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | ||
1199 | wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || | ||
1200 | rnp->exp_tasks); | ||
1201 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | ||
1202 | more2boost = rcu_boost(rnp); | ||
1203 | if (more2boost) | ||
1204 | spincnt++; | ||
1205 | else | ||
1206 | spincnt = 0; | ||
1207 | if (spincnt > 10) { | ||
1208 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | ||
1209 | spincnt = 0; | ||
1210 | } | ||
1211 | } | ||
1212 | /* NOTREACHED */ | ||
1213 | return 0; | ||
1214 | } | ||
1215 | |||
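The loop above is an instance of a common kthread pattern: sleep until work shows up, do one pass, and voluntarily yield once more than ten consecutive passes still leave work behind, so the booster cannot monopolize a CPU. A condensed user-space rendering of that control flow, with a made-up work counter standing in for the ->blkd_tasks bookkeeping:

#include <sched.h>
#include <stdio.h>

static int work_remaining = 25;		/* hypothetical stand-in for tasks left to boost */

static int do_one_pass(void)
{
	if (work_remaining == 0)
		return 0;		/* nothing to do: rcu_boost() would return 0 */
	work_remaining--;
	return work_remaining != 0;	/* nonzero means "more tasks still need boosting" */
}

int main(void)
{
	int spincnt = 0;

	while (work_remaining) {
		if (do_one_pass())
			spincnt++;	/* count consecutive busy passes */
		else
			spincnt = 0;	/* queue drained: reset the counter */

		if (spincnt > 10) {	/* same threshold as rcu_boost_kthread() */
			printf("yielding after %d consecutive passes\n", spincnt);
			sched_yield();	/* crude stand-in for rcu_yield() */
			spincnt = 0;
		}
	}
	return 0;
}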
1216 | /* | ||
1217 | * Check to see if it is time to start boosting RCU readers that are | ||
1218 | * blocking the current grace period, and, if so, tell the per-rcu_node | ||
1219 | * kthread to start boosting them. If there is an expedited grace | ||
1220 | * period in progress, it is always time to boost. | ||
1221 | * | ||
1222 | * The caller must hold rnp->lock, which this function releases, | ||
1223 | * but irqs remain disabled. The ->boost_kthread_task is immortal, | ||
1224 | * so we don't need to worry about it going away. | ||
1225 | */ | ||
1226 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1227 | { | ||
1228 | struct task_struct *t; | ||
1229 | |||
1230 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | ||
1231 | rnp->n_balk_exp_gp_tasks++; | ||
1232 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1233 | return; | ||
1234 | } | ||
1235 | if (rnp->exp_tasks != NULL || | ||
1236 | (rnp->gp_tasks != NULL && | ||
1237 | rnp->boost_tasks == NULL && | ||
1238 | rnp->qsmask == 0 && | ||
1239 | ULONG_CMP_GE(jiffies, rnp->boost_time))) { | ||
1240 | if (rnp->exp_tasks == NULL) | ||
1241 | rnp->boost_tasks = rnp->gp_tasks; | ||
1242 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1243 | t = rnp->boost_kthread_task; | ||
1244 | if (t != NULL) | ||
1245 | wake_up_process(t); | ||
1246 | } else { | ||
1247 | rcu_initiate_boost_trace(rnp); | ||
1248 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1249 | } | ||
1250 | } | ||
1251 | |||
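ULONG_CMP_GE(jiffies, rnp->boost_time) above is a wraparound-safe ordering test for free-running counters: compare the difference rather than the raw values, so the decision stays correct when jiffies overflows. A minimal demonstration of the idea; the CMP_GE macro below is written from scratch in the time_after() style and is not copied from the kernel headers:

#include <stdio.h>
#include <limits.h>

/* Wraparound-safe "a >= b" for counters such as jiffies. */
#define CMP_GE(a, b)	((long)((a) - (b)) >= 0)

int main(void)
{
	unsigned long boost_time = ULONG_MAX - 5;	/* deadline armed just before the wrap */
	unsigned long now = 10;				/* the counter has since wrapped */

	/* A naive comparison gets this wrong; the subtraction form does not. */
	printf("naive now >= boost_time: %d\n", now >= boost_time);
	printf("wrap-safe CMP_GE       : %d\n", CMP_GE(now, boost_time));
	return 0;
}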
1252 | /* | ||
1253 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | ||
1254 | * held, so no one should be messing with the existence of the boost | ||
1255 | * kthread. | ||
1256 | */ | ||
1257 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
1258 | cpumask_var_t cm) | ||
1259 | { | ||
1260 | struct task_struct *t; | ||
1261 | |||
1262 | t = rnp->boost_kthread_task; | ||
1263 | if (t != NULL) | ||
1264 | set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); | ||
1265 | } | ||
1266 | |||
1267 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
1268 | |||
1269 | /* | ||
1270 | * Do priority-boost accounting for the start of a new grace period. | ||
1271 | */ | ||
1272 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1273 | { | ||
1274 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
1275 | } | ||
1276 | |||
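RCU_BOOST_DELAY_JIFFIES converts CONFIG_RCU_BOOST_DELAY, which is specified in milliseconds, into a jiffies delta, rounding up so that a small nonzero delay never truncates to zero ticks. A stand-alone check of the arithmetic, using a 500 ms delay and a few common HZ values purely as illustrative numbers:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	const unsigned long delay_ms = 500;	/* hypothetical CONFIG_RCU_BOOST_DELAY */
	const unsigned long hz_values[] = { 100, 250, 300, 1000 };
	size_t i;

	for (i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
		unsigned long hz = hz_values[i];

		/* Same expression as RCU_BOOST_DELAY_JIFFIES, with HZ substituted. */
		printf("HZ=%4lu: %lu ms -> %lu jiffies\n",
		       hz, delay_ms, DIV_ROUND_UP(delay_ms * hz, 1000));
	}
	return 0;
}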
1277 | /* | ||
1278 | * Initialize the RCU-boost waitqueue. | ||
1279 | */ | ||
1280 | static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) | ||
1281 | { | ||
1282 | init_waitqueue_head(&rnp->boost_wq); | ||
1283 | } | ||
1284 | |||
1285 | /* | ||
1286 | * Create an RCU-boost kthread for the specified node if one does not | ||
1287 | * already exist. We only create this kthread for preemptible RCU. | ||
1288 | * Returns zero if all is well, a negated errno otherwise. | ||
1289 | */ | ||
1290 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
1291 | struct rcu_node *rnp, | ||
1292 | int rnp_index) | ||
1293 | { | ||
1294 | unsigned long flags; | ||
1295 | struct sched_param sp; | ||
1296 | struct task_struct *t; | ||
1297 | |||
1298 | if (&rcu_preempt_state != rsp) | ||
1299 | return 0; | ||
1300 | if (rnp->boost_kthread_task != NULL) | ||
1301 | return 0; | ||
1302 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | ||
1303 | "rcub%d", rnp_index); | ||
1304 | if (IS_ERR(t)) | ||
1305 | return PTR_ERR(t); | ||
1306 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1307 | rnp->boost_kthread_task = t; | ||
1308 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1309 | wake_up_process(t); | ||
1310 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1311 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1312 | return 0; | ||
1313 | } | ||
1314 | |||
1315 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
1316 | |||
1317 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1318 | { | ||
1319 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1320 | } | ||
1321 | |||
1322 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
1323 | cpumask_var_t cm) | ||
1324 | { | ||
1325 | } | ||
1326 | |||
1327 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1328 | { | ||
1329 | } | ||
1330 | |||
1331 | static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp) | ||
1332 | { | ||
1333 | } | ||
1334 | |||
1335 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
1336 | struct rcu_node *rnp, | ||
1337 | int rnp_index) | ||
1338 | { | ||
1339 | return 0; | ||
1340 | } | ||
1341 | |||
1342 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
1343 | |||
1018 | #ifndef CONFIG_SMP | 1344 | #ifndef CONFIG_SMP |
1019 | 1345 | ||
1020 | void synchronize_sched_expedited(void) | 1346 | void synchronize_sched_expedited(void) |
@@ -1187,8 +1513,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | |||
1187 | * | 1513 | * |
1188 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 1514 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
1189 | * disabled, we do one pass of force_quiescent_state(), then do a | 1515 | * disabled, we do one pass of force_quiescent_state(), then do a |
1190 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | 1516 | * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked |
1191 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | 1517 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
1192 | */ | 1518 | */ |
1193 | int rcu_needs_cpu(int cpu) | 1519 | int rcu_needs_cpu(int cpu) |
1194 | { | 1520 | { |
@@ -1239,7 +1565,7 @@ int rcu_needs_cpu(int cpu) | |||
1239 | 1565 | ||
1240 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 1566 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ |
1241 | if (c) | 1567 | if (c) |
1242 | raise_softirq(RCU_SOFTIRQ); | 1568 | invoke_rcu_cpu_kthread(); |
1243 | return c; | 1569 | return c; |
1244 | } | 1570 | } |
1245 | 1571 | ||
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c8e97853b970..aa0fd72b4bc7 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,6 +46,18 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
50 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); | ||
51 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
52 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
53 | |||
54 | static char convert_kthread_status(unsigned int kthread_status) | ||
55 | { | ||
56 | if (kthread_status > RCU_KTHREAD_MAX) | ||
57 | return '?'; | ||
58 | return "SRWOY"[kthread_status]; | ||
59 | } | ||
60 | |||
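convert_kthread_status() leans on the fact that a string literal is an array, so "SRWOY"[status] picks one character per state; the ".N"-style flags added to print_one_rcu_data() below use the same idiom with a two-character string and a boolean index. A tiny stand-alone illustration, with state names invented for the example rather than taken from the RCU_KTHREAD_* definitions:

#include <stdio.h>

enum kthread_state { KT_STOPPED, KT_RUNNING, KT_WAITING, KT_OFFCPU, KT_YIELDING };

static char state_char(unsigned int s)
{
	if (s > KT_YIELDING)	/* out-of-range values print '?', as in the tracer */
		return '?';
	return "SRWOY"[s];	/* index the string literal like any other array */
}

int main(void)
{
	printf("%c\n", state_char(KT_RUNNING));	/* prints 'R' */
	printf("%c\n", state_char(42));		/* prints '?' */
	return 0;
}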
49 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 61 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
50 | { | 62 | { |
51 | if (!rdp->beenonline) | 63 | if (!rdp->beenonline) |
@@ -64,7 +76,21 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
64 | rdp->dynticks_fqs); | 76 | rdp->dynticks_fqs); |
65 | #endif /* #ifdef CONFIG_NO_HZ */ | 77 | #endif /* #ifdef CONFIG_NO_HZ */ |
66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 78 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
67 | seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); | 79 | seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld", |
80 | rdp->qlen, | ||
81 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
82 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
83 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
84 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
85 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
86 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
87 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], | ||
88 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
89 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
90 | rdp->cpu)), | ||
91 | per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), | ||
92 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff, | ||
93 | rdp->blimit); | ||
68 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | 94 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", |
69 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 95 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
70 | } | 96 | } |
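The new qs=%c%c%c%c field decodes the per-CPU callback list, a single chain carved into done/waiting/next-ready/next segments by an array of tail pointers; a segment is non-empty exactly when its tail differs from the tail of the segment before it. A reduced model of that layout and of the emptiness tests, with simplified names that are assumptions for the example and not the actual rcu_data fields:

#include <stdio.h>
#include <stddef.h>

struct cb { struct cb *next; };

enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, SEG_COUNT };

struct cblist {
	struct cb *head;		/* like rdp->nxtlist */
	struct cb **tails[SEG_COUNT];	/* like rdp->nxttail[]: each points at a ->next slot */
};

static void cblist_init(struct cblist *cl)
{
	int i;

	cl->head = NULL;
	for (i = 0; i < SEG_COUNT; i++)
		cl->tails[i] = &cl->head;	/* empty list: every tail points at the head pointer */
}

/* Enqueue a callback into the trailing NEXT segment, the way call_rcu()
 * appends at *nxttail[RCU_NEXT_TAIL]. */
static void cblist_enqueue(struct cblist *cl, struct cb *cb)
{
	cb->next = NULL;
	*cl->tails[SEG_NEXT] = cb;
	cl->tails[SEG_NEXT] = &cb->next;
}

int main(void)
{
	struct cblist cl;
	struct cb a, b;

	cblist_init(&cl);
	cblist_enqueue(&cl, &a);
	cblist_enqueue(&cl, &b);

	/* Same tests the tracer prints: '.' for an empty segment, a letter otherwise. */
	printf("%c%c%c%c\n",
	       ".N"[cl.tails[SEG_NEXT_READY] != cl.tails[SEG_NEXT]],
	       ".R"[cl.tails[SEG_WAIT] != cl.tails[SEG_NEXT_READY]],
	       ".W"[cl.tails[SEG_DONE] != cl.tails[SEG_WAIT]],
	       ".D"[&cl.head != cl.tails[SEG_DONE]]);	/* prints "N..." here */
	return 0;
}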
@@ -121,7 +147,18 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
121 | rdp->dynticks_fqs); | 147 | rdp->dynticks_fqs); |
122 | #endif /* #ifdef CONFIG_NO_HZ */ | 148 | #endif /* #ifdef CONFIG_NO_HZ */ |
123 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 149 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
124 | seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); | 150 | seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen, |
151 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
152 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
153 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
154 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
155 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
156 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
157 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]], | ||
158 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
159 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
160 | rdp->cpu)), | ||
161 | rdp->blimit); | ||
125 | seq_printf(m, ",%lu,%lu,%lu\n", | 162 | seq_printf(m, ",%lu,%lu,%lu\n", |
126 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 163 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
127 | } | 164 | } |
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = { | |||
157 | .release = single_release, | 194 | .release = single_release, |
158 | }; | 195 | }; |
159 | 196 | ||
197 | #ifdef CONFIG_RCU_BOOST | ||
198 | |||
199 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | ||
200 | { | ||
201 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " | ||
202 | "j=%04x bt=%04x\n", | ||
203 | rnp->grplo, rnp->grphi, | ||
204 | "T."[list_empty(&rnp->blkd_tasks)], | ||
205 | "N."[!rnp->gp_tasks], | ||
206 | "E."[!rnp->exp_tasks], | ||
207 | "B."[!rnp->boost_tasks], | ||
208 | convert_kthread_status(rnp->boost_kthread_status), | ||
209 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | ||
210 | rnp->n_normal_boosts, | ||
211 | (int)(jiffies & 0xffff), | ||
212 | (int)(rnp->boost_time & 0xffff)); | ||
213 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | ||
214 | " balk", | ||
215 | rnp->n_balk_blkd_tasks, | ||
216 | rnp->n_balk_exp_gp_tasks, | ||
217 | rnp->n_balk_boost_tasks, | ||
218 | rnp->n_balk_notblocked, | ||
219 | rnp->n_balk_notyet, | ||
220 | rnp->n_balk_nos); | ||
221 | } | ||
222 | |||
223 | static int show_rcu_node_boost(struct seq_file *m, void *unused) | ||
224 | { | ||
225 | struct rcu_node *rnp; | ||
226 | |||
227 | rcu_for_each_leaf_node(&rcu_preempt_state, rnp) | ||
228 | print_one_rcu_node_boost(m, rnp); | ||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | static int rcu_node_boost_open(struct inode *inode, struct file *file) | ||
233 | { | ||
234 | return single_open(file, show_rcu_node_boost, NULL); | ||
235 | } | ||
236 | |||
237 | static const struct file_operations rcu_node_boost_fops = { | ||
238 | .owner = THIS_MODULE, | ||
239 | .open = rcu_node_boost_open, | ||
240 | .read = seq_read, | ||
241 | .llseek = seq_lseek, | ||
242 | .release = single_release, | ||
243 | }; | ||
244 | |||
245 | /* | ||
246 | * Create the rcuboost debugfs entry. Standard error return. | ||
247 | */ | ||
248 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
249 | { | ||
250 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
251 | &rcu_node_boost_fops); | ||
252 | } | ||
253 | |||
254 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
255 | |||
256 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
257 | { | ||
258 | return 0; /* There cannot be an error if we didn't create it! */ | ||
259 | } | ||
260 | |||
261 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
262 | |||
160 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 263 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
161 | { | 264 | { |
162 | unsigned long gpnum; | 265 | unsigned long gpnum; |
163 | int level = 0; | 266 | int level = 0; |
164 | int phase; | ||
165 | struct rcu_node *rnp; | 267 | struct rcu_node *rnp; |
166 | 268 | ||
167 | gpnum = rsp->gpnum; | 269 | gpnum = rsp->gpnum; |
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
178 | seq_puts(m, "\n"); | 280 | seq_puts(m, "\n"); |
179 | level = rnp->level; | 281 | level = rnp->level; |
180 | } | 282 | } |
181 | phase = gpnum & 0x1; | 283 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", |
182 | seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", | ||
183 | rnp->qsmask, rnp->qsmaskinit, | 284 | rnp->qsmask, rnp->qsmaskinit, |
184 | "T."[list_empty(&rnp->blocked_tasks[phase])], | 285 | ".G"[rnp->gp_tasks != NULL], |
185 | "E."[list_empty(&rnp->blocked_tasks[phase + 2])], | 286 | ".E"[rnp->exp_tasks != NULL], |
186 | "T."[list_empty(&rnp->blocked_tasks[!phase])], | 287 | ".T"[!list_empty(&rnp->blkd_tasks)], |
187 | "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], | ||
188 | rnp->grplo, rnp->grphi, rnp->grpnum); | 288 | rnp->grplo, rnp->grphi, rnp->grpnum); |
189 | } | 289 | } |
190 | seq_puts(m, "\n"); | 290 | seq_puts(m, "\n"); |
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = { | |||
216 | .release = single_release, | 316 | .release = single_release, |
217 | }; | 317 | }; |
218 | 318 | ||
319 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | ||
320 | { | ||
321 | unsigned long flags; | ||
322 | unsigned long completed; | ||
323 | unsigned long gpnum; | ||
324 | unsigned long gpage; | ||
325 | unsigned long gpmax; | ||
326 | struct rcu_node *rnp = &rsp->node[0]; | ||
327 | |||
328 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
329 | completed = rsp->completed; | ||
330 | gpnum = rsp->gpnum; | ||
331 | if (rsp->completed == rsp->gpnum) | ||
332 | gpage = 0; | ||
333 | else | ||
334 | gpage = jiffies - rsp->gp_start; | ||
335 | gpmax = rsp->gp_max; | ||
336 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
337 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | ||
338 | rsp->name, completed, gpnum, gpage, gpmax); | ||
339 | } | ||
340 | |||
219 | static int show_rcugp(struct seq_file *m, void *unused) | 341 | static int show_rcugp(struct seq_file *m, void *unused) |
220 | { | 342 | { |
221 | #ifdef CONFIG_TREE_PREEMPT_RCU | 343 | #ifdef CONFIG_TREE_PREEMPT_RCU |
222 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", | 344 | show_one_rcugp(m, &rcu_preempt_state); |
223 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | ||
224 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 345 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
225 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", | 346 | show_one_rcugp(m, &rcu_sched_state); |
226 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 347 | show_one_rcugp(m, &rcu_bh_state); |
227 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", | ||
228 | rcu_bh_state.completed, rcu_bh_state.gpnum); | ||
229 | return 0; | 348 | return 0; |
230 | } | 349 | } |
231 | 350 | ||
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = { | |||
298 | .release = single_release, | 417 | .release = single_release, |
299 | }; | 418 | }; |
300 | 419 | ||
420 | static int show_rcutorture(struct seq_file *m, void *unused) | ||
421 | { | ||
422 | seq_printf(m, "rcutorture test sequence: %lu %s\n", | ||
423 | rcutorture_testseq >> 1, | ||
424 | (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); | ||
425 | seq_printf(m, "rcutorture update version number: %lu\n", | ||
426 | rcutorture_vernum); | ||
427 | return 0; | ||
428 | } | ||
429 | |||
430 | static int rcutorture_open(struct inode *inode, struct file *file) | ||
431 | { | ||
432 | return single_open(file, show_rcutorture, NULL); | ||
433 | } | ||
434 | |||
435 | static const struct file_operations rcutorture_fops = { | ||
436 | .owner = THIS_MODULE, | ||
437 | .open = rcutorture_open, | ||
438 | .read = seq_read, | ||
439 | .llseek = seq_lseek, | ||
440 | .release = single_release, | ||
441 | }; | ||
442 | |||
301 | static struct dentry *rcudir; | 443 | static struct dentry *rcudir; |
302 | 444 | ||
303 | static int __init rcutree_trace_init(void) | 445 | static int __init rcutree_trace_init(void) |
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void) | |||
318 | if (!retval) | 460 | if (!retval) |
319 | goto free_out; | 461 | goto free_out; |
320 | 462 | ||
463 | if (rcu_boost_trace_create_file(rcudir)) | ||
464 | goto free_out; | ||
465 | |||
321 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 466 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); |
322 | if (!retval) | 467 | if (!retval) |
323 | goto free_out; | 468 | goto free_out; |
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void) | |||
331 | NULL, &rcu_pending_fops); | 476 | NULL, &rcu_pending_fops); |
332 | if (!retval) | 477 | if (!retval) |
333 | goto free_out; | 478 | goto free_out; |
479 | |||
480 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | ||
481 | NULL, &rcutorture_fops); | ||
482 | if (!retval) | ||
483 | goto free_out; | ||
334 | return 0; | 484 | return 0; |
335 | free_out: | 485 | free_out: |
336 | debugfs_remove_recursive(rcudir); | 486 | debugfs_remove_recursive(rcudir); |
diff --git a/kernel/sched.c b/kernel/sched.c index a8845516ace6..c62acf45d3b9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
231 | #endif | 231 | #endif |
232 | 232 | ||
233 | /* | 233 | /* |
234 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 234 | * sched_domains_mutex serializes calls to init_sched_domains, |
235 | * detach_destroy_domains and partition_sched_domains. | 235 | * detach_destroy_domains and partition_sched_domains. |
236 | */ | 236 | */ |
237 | static DEFINE_MUTEX(sched_domains_mutex); | 237 | static DEFINE_MUTEX(sched_domains_mutex); |
@@ -312,6 +312,9 @@ struct cfs_rq { | |||
312 | 312 | ||
313 | u64 exec_clock; | 313 | u64 exec_clock; |
314 | u64 min_vruntime; | 314 | u64 min_vruntime; |
315 | #ifndef CONFIG_64BIT | ||
316 | u64 min_vruntime_copy; | ||
317 | #endif | ||
315 | 318 | ||
316 | struct rb_root tasks_timeline; | 319 | struct rb_root tasks_timeline; |
317 | struct rb_node *rb_leftmost; | 320 | struct rb_node *rb_leftmost; |
@@ -325,7 +328,9 @@ struct cfs_rq { | |||
325 | */ | 328 | */ |
326 | struct sched_entity *curr, *next, *last, *skip; | 329 | struct sched_entity *curr, *next, *last, *skip; |
327 | 330 | ||
331 | #ifdef CONFIG_SCHED_DEBUG | ||
328 | unsigned int nr_spread_over; | 332 | unsigned int nr_spread_over; |
333 | #endif | ||
329 | 334 | ||
330 | #ifdef CONFIG_FAIR_GROUP_SCHED | 335 | #ifdef CONFIG_FAIR_GROUP_SCHED |
331 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 336 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
@@ -417,6 +422,7 @@ struct rt_rq { | |||
417 | */ | 422 | */ |
418 | struct root_domain { | 423 | struct root_domain { |
419 | atomic_t refcount; | 424 | atomic_t refcount; |
425 | struct rcu_head rcu; | ||
420 | cpumask_var_t span; | 426 | cpumask_var_t span; |
421 | cpumask_var_t online; | 427 | cpumask_var_t online; |
422 | 428 | ||
@@ -460,7 +466,7 @@ struct rq { | |||
460 | u64 nohz_stamp; | 466 | u64 nohz_stamp; |
461 | unsigned char nohz_balance_kick; | 467 | unsigned char nohz_balance_kick; |
462 | #endif | 468 | #endif |
463 | unsigned int skip_clock_update; | 469 | int skip_clock_update; |
464 | 470 | ||
465 | /* capture load from *all* tasks on this cpu: */ | 471 | /* capture load from *all* tasks on this cpu: */ |
466 | struct load_weight load; | 472 | struct load_weight load; |
@@ -553,6 +559,10 @@ struct rq { | |||
553 | unsigned int ttwu_count; | 559 | unsigned int ttwu_count; |
554 | unsigned int ttwu_local; | 560 | unsigned int ttwu_local; |
555 | #endif | 561 | #endif |
562 | |||
563 | #ifdef CONFIG_SMP | ||
564 | struct task_struct *wake_list; | ||
565 | #endif | ||
556 | }; | 566 | }; |
557 | 567 | ||
558 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 568 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq) | |||
571 | 581 | ||
572 | #define rcu_dereference_check_sched_domain(p) \ | 582 | #define rcu_dereference_check_sched_domain(p) \ |
573 | rcu_dereference_check((p), \ | 583 | rcu_dereference_check((p), \ |
574 | rcu_read_lock_sched_held() || \ | 584 | rcu_read_lock_held() || \ |
575 | lockdep_is_held(&sched_domains_mutex)) | 585 | lockdep_is_held(&sched_domains_mutex)) |
576 | 586 | ||
577 | /* | 587 | /* |
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq) | |||
596 | * Return the group to which this task belongs. | 606 | * Return the group to which this task belongs. |
597 | * | 607 | * |
598 | * We use task_subsys_state_check() and extend the RCU verification | 608 | * We use task_subsys_state_check() and extend the RCU verification |
599 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 609 | * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() |
600 | * holds that lock for each task it moves into the cgroup. Therefore | 610 | * holds that lock for each task it moves into the cgroup. Therefore |
601 | * by holding that lock, we pin the task to the current cgroup. | 611 | * by holding that lock, we pin the task to the current cgroup. |
602 | */ | 612 | */ |
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
606 | struct cgroup_subsys_state *css; | 616 | struct cgroup_subsys_state *css; |
607 | 617 | ||
608 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 618 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
609 | lockdep_is_held(&task_rq(p)->lock)); | 619 | lockdep_is_held(&p->pi_lock)); |
610 | tg = container_of(css, struct task_group, css); | 620 | tg = container_of(css, struct task_group, css); |
611 | 621 | ||
612 | return autogroup_task_group(p, tg); | 622 | return autogroup_task_group(p, tg); |
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq) | |||
642 | { | 652 | { |
643 | s64 delta; | 653 | s64 delta; |
644 | 654 | ||
645 | if (rq->skip_clock_update) | 655 | if (rq->skip_clock_update > 0) |
646 | return; | 656 | return; |
647 | 657 | ||
648 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 658 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
838 | return rq->curr == p; | 848 | return rq->curr == p; |
839 | } | 849 | } |
840 | 850 | ||
841 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
842 | static inline int task_running(struct rq *rq, struct task_struct *p) | 851 | static inline int task_running(struct rq *rq, struct task_struct *p) |
843 | { | 852 | { |
853 | #ifdef CONFIG_SMP | ||
854 | return p->on_cpu; | ||
855 | #else | ||
844 | return task_current(rq, p); | 856 | return task_current(rq, p); |
857 | #endif | ||
845 | } | 858 | } |
846 | 859 | ||
860 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
847 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 861 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
848 | { | 862 | { |
863 | #ifdef CONFIG_SMP | ||
864 | /* | ||
865 | * We can optimise this out completely for !SMP, because the | ||
866 | * SMP rebalancing from interrupt is the only thing that cares | ||
867 | * here. | ||
868 | */ | ||
869 | next->on_cpu = 1; | ||
870 | #endif | ||
849 | } | 871 | } |
850 | 872 | ||
851 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 873 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
852 | { | 874 | { |
875 | #ifdef CONFIG_SMP | ||
876 | /* | ||
877 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
878 | * We must ensure this doesn't happen until the switch is completely | ||
879 | * finished. | ||
880 | */ | ||
881 | smp_wmb(); | ||
882 | prev->on_cpu = 0; | ||
883 | #endif | ||
853 | #ifdef CONFIG_DEBUG_SPINLOCK | 884 | #ifdef CONFIG_DEBUG_SPINLOCK |
854 | /* this is a valid case when another task releases the spinlock */ | 885 | /* this is a valid case when another task releases the spinlock */ |
855 | rq->lock.owner = current; | 886 | rq->lock.owner = current; |
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
865 | } | 896 | } |
866 | 897 | ||
867 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 898 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
868 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
869 | { | ||
870 | #ifdef CONFIG_SMP | ||
871 | return p->oncpu; | ||
872 | #else | ||
873 | return task_current(rq, p); | ||
874 | #endif | ||
875 | } | ||
876 | |||
877 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 899 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
878 | { | 900 | { |
879 | #ifdef CONFIG_SMP | 901 | #ifdef CONFIG_SMP |
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
882 | * SMP rebalancing from interrupt is the only thing that cares | 904 | * SMP rebalancing from interrupt is the only thing that cares |
883 | * here. | 905 | * here. |
884 | */ | 906 | */ |
885 | next->oncpu = 1; | 907 | next->on_cpu = 1; |
886 | #endif | 908 | #endif |
887 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 909 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
888 | raw_spin_unlock_irq(&rq->lock); | 910 | raw_spin_unlock_irq(&rq->lock); |
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
895 | { | 917 | { |
896 | #ifdef CONFIG_SMP | 918 | #ifdef CONFIG_SMP |
897 | /* | 919 | /* |
898 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 920 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
899 | * We must ensure this doesn't happen until the switch is completely | 921 | * We must ensure this doesn't happen until the switch is completely |
900 | * finished. | 922 | * finished. |
901 | */ | 923 | */ |
902 | smp_wmb(); | 924 | smp_wmb(); |
903 | prev->oncpu = 0; | 925 | prev->on_cpu = 0; |
904 | #endif | 926 | #endif |
905 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 927 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
906 | local_irq_enable(); | 928 | local_irq_enable(); |
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
909 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 931 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
910 | 932 | ||
911 | /* | 933 | /* |
912 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 934 | * __task_rq_lock - lock the rq @p resides on. |
913 | * against ttwu(). | ||
914 | */ | ||
915 | static inline int task_is_waking(struct task_struct *p) | ||
916 | { | ||
917 | return unlikely(p->state == TASK_WAKING); | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
922 | * Must be called interrupts disabled. | ||
923 | */ | 935 | */ |
924 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 936 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
925 | __acquires(rq->lock) | 937 | __acquires(rq->lock) |
926 | { | 938 | { |
927 | struct rq *rq; | 939 | struct rq *rq; |
928 | 940 | ||
941 | lockdep_assert_held(&p->pi_lock); | ||
942 | |||
929 | for (;;) { | 943 | for (;;) { |
930 | rq = task_rq(p); | 944 | rq = task_rq(p); |
931 | raw_spin_lock(&rq->lock); | 945 | raw_spin_lock(&rq->lock); |
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
936 | } | 950 | } |
937 | 951 | ||
938 | /* | 952 | /* |
939 | * task_rq_lock - lock the runqueue a given task resides on and disable | 953 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
940 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
941 | * explicitly disabling preemption. | ||
942 | */ | 954 | */ |
943 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 955 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
956 | __acquires(p->pi_lock) | ||
944 | __acquires(rq->lock) | 957 | __acquires(rq->lock) |
945 | { | 958 | { |
946 | struct rq *rq; | 959 | struct rq *rq; |
947 | 960 | ||
948 | for (;;) { | 961 | for (;;) { |
949 | local_irq_save(*flags); | 962 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
950 | rq = task_rq(p); | 963 | rq = task_rq(p); |
951 | raw_spin_lock(&rq->lock); | 964 | raw_spin_lock(&rq->lock); |
952 | if (likely(rq == task_rq(p))) | 965 | if (likely(rq == task_rq(p))) |
953 | return rq; | 966 | return rq; |
954 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 967 | raw_spin_unlock(&rq->lock); |
968 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
955 | } | 969 | } |
956 | } | 970 | } |
957 | 971 | ||
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
961 | raw_spin_unlock(&rq->lock); | 975 | raw_spin_unlock(&rq->lock); |
962 | } | 976 | } |
963 | 977 | ||
964 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 978 | static inline void |
979 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
965 | __releases(rq->lock) | 980 | __releases(rq->lock) |
981 | __releases(p->pi_lock) | ||
966 | { | 982 | { |
967 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 983 | raw_spin_unlock(&rq->lock); |
984 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
968 | } | 985 | } |
969 | 986 | ||
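Both __task_rq_lock() and task_rq_lock() use the same optimistic pattern: read which runqueue the task currently sits on, lock it, then recheck that the task has not been migrated in the meantime and retry if it has; task_rq_lock() additionally nests rq->lock inside p->pi_lock and releases in the opposite order. The retry idea in isolation, as a hypothetical user-space sketch that deliberately ignores the memory-ordering annotations a production implementation would need:

#include <pthread.h>
#include <stdio.h>

struct home {
	pthread_mutex_t lock;
};

struct obj {
	struct home *home;	/* may be changed by whoever holds the current home's lock */
};

/* Lock the home an object currently resides on: read, lock, recheck, retry.
 * This mirrors __task_rq_lock(): task_rq(p) is only stable once rq->lock is held. */
static struct home *obj_lock(struct obj *o)
{
	for (;;) {
		struct home *h = o->home;

		pthread_mutex_lock(&h->lock);
		if (h == o->home)
			return h;		/* still the same home: we won the race */
		pthread_mutex_unlock(&h->lock);	/* object migrated meanwhile: retry */
	}
}

int main(void)
{
	struct home h0 = { PTHREAD_MUTEX_INITIALIZER };
	struct obj o = { &h0 };
	struct home *locked = obj_lock(&o);

	printf("locked home %p\n", (void *)locked);
	pthread_mutex_unlock(&locked->lock);
	return 0;
}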
970 | /* | 987 | /* |
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void) | |||
1193 | int i; | 1210 | int i; |
1194 | struct sched_domain *sd; | 1211 | struct sched_domain *sd; |
1195 | 1212 | ||
1213 | rcu_read_lock(); | ||
1196 | for_each_domain(cpu, sd) { | 1214 | for_each_domain(cpu, sd) { |
1197 | for_each_cpu(i, sched_domain_span(sd)) | 1215 | for_each_cpu(i, sched_domain_span(sd)) { |
1198 | if (!idle_cpu(i)) | 1216 | if (!idle_cpu(i)) { |
1199 | return i; | 1217 | cpu = i; |
1218 | goto unlock; | ||
1219 | } | ||
1220 | } | ||
1200 | } | 1221 | } |
1222 | unlock: | ||
1223 | rcu_read_unlock(); | ||
1201 | return cpu; | 1224 | return cpu; |
1202 | } | 1225 | } |
1203 | /* | 1226 | /* |
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
1307 | { | 1330 | { |
1308 | u64 tmp; | 1331 | u64 tmp; |
1309 | 1332 | ||
1333 | tmp = (u64)delta_exec * weight; | ||
1334 | |||
1310 | if (!lw->inv_weight) { | 1335 | if (!lw->inv_weight) { |
1311 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1336 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) |
1312 | lw->inv_weight = 1; | 1337 | lw->inv_weight = 1; |
1313 | else | 1338 | else |
1314 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1339 | lw->inv_weight = WMULT_CONST / lw->weight; |
1315 | / (lw->weight+1); | ||
1316 | } | 1340 | } |
1317 | 1341 | ||
1318 | tmp = (u64)delta_exec * weight; | ||
1319 | /* | 1342 | /* |
1320 | * Check whether we'd overflow the 64-bit multiplication: | 1343 | * Check whether we'd overflow the 64-bit multiplication: |
1321 | */ | 1344 | */ |
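Hoisting the tmp computation and switching inv_weight to WMULT_CONST / lw->weight is the usual reciprocal fixed-point trick: the division by lw->weight is paid once, and each call afterwards multiplies by the cached reciprocal and shifts. A quick numeric check of the approximation; WMULT_CONST and WMULT_SHIFT are set to plausible illustrative values rather than quoted from sched.c, and the overflow guard that the real calc_delta_mine() applies is omitted:

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST	(1ULL << 32)	/* illustrative value only */
#define WMULT_SHIFT	32

/* Approximate delta * weight / total via a precomputed reciprocal. */
static uint64_t calc_delta(uint64_t delta, uint32_t weight, uint32_t total)
{
	uint64_t inv = WMULT_CONST / total;	/* corresponds to lw->inv_weight */

	return (delta * weight * inv) >> WMULT_SHIFT;
}

int main(void)
{
	uint64_t delta = 4000000;	/* 4 ms of exec time, in nanoseconds */
	uint32_t weight = 1024;		/* one nice-0 task's load weight */
	uint32_t total = 3072;		/* e.g. three nice-0 tasks on the cfs_rq */

	printf("approx: %llu  exact: %llu\n",
	       (unsigned long long)calc_delta(delta, weight, total),
	       (unsigned long long)(delta * weight / total));
	return 0;
}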
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1773 | update_rq_clock(rq); | 1796 | update_rq_clock(rq); |
1774 | sched_info_queued(p); | 1797 | sched_info_queued(p); |
1775 | p->sched_class->enqueue_task(rq, p, flags); | 1798 | p->sched_class->enqueue_task(rq, p, flags); |
1776 | p->se.on_rq = 1; | ||
1777 | } | 1799 | } |
1778 | 1800 | ||
1779 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1801 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
1781 | update_rq_clock(rq); | 1803 | update_rq_clock(rq); |
1782 | sched_info_dequeued(p); | 1804 | sched_info_dequeued(p); |
1783 | p->sched_class->dequeue_task(rq, p, flags); | 1805 | p->sched_class->dequeue_task(rq, p, flags); |
1784 | p->se.on_rq = 0; | ||
1785 | } | 1806 | } |
1786 | 1807 | ||
1787 | /* | 1808 | /* |
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
2116 | * A queue event has occurred, and we're going to schedule. In | 2137 | * A queue event has occurred, and we're going to schedule. In |
2117 | * this case, we can save a useless back to back clock update. | 2138 | * this case, we can save a useless back to back clock update. |
2118 | */ | 2139 | */ |
2119 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | 2140 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) |
2120 | rq->skip_clock_update = 1; | 2141 | rq->skip_clock_update = 1; |
2121 | } | 2142 | } |
2122 | 2143 | ||
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
2162 | */ | 2183 | */ |
2163 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2184 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
2164 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2185 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
2186 | |||
2187 | #ifdef CONFIG_LOCKDEP | ||
2188 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
2189 | lockdep_is_held(&task_rq(p)->lock))); | ||
2190 | #endif | ||
2165 | #endif | 2191 | #endif |
2166 | 2192 | ||
2167 | trace_sched_migrate_task(p, new_cpu); | 2193 | trace_sched_migrate_task(p, new_cpu); |
@@ -2182,19 +2208,6 @@ struct migration_arg { | |||
2182 | static int migration_cpu_stop(void *data); | 2208 | static int migration_cpu_stop(void *data); |
2183 | 2209 | ||
2184 | /* | 2210 | /* |
2185 | * The task's runqueue lock must be held. | ||
2186 | * Returns true if you have to wait for migration thread. | ||
2187 | */ | ||
2188 | static bool migrate_task(struct task_struct *p, struct rq *rq) | ||
2189 | { | ||
2190 | /* | ||
2191 | * If the task is not on a runqueue (and not running), then | ||
2192 | * the next wake-up will properly place the task. | ||
2193 | */ | ||
2194 | return p->se.on_rq || task_running(rq, p); | ||
2195 | } | ||
2196 | |||
2197 | /* | ||
2198 | * wait_task_inactive - wait for a thread to unschedule. | 2211 | * wait_task_inactive - wait for a thread to unschedule. |
2199 | * | 2212 | * |
2200 | * If @match_state is nonzero, it's the @p->state value just checked and | 2213 | * If @match_state is nonzero, it's the @p->state value just checked and |
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2251 | rq = task_rq_lock(p, &flags); | 2264 | rq = task_rq_lock(p, &flags); |
2252 | trace_sched_wait_task(p); | 2265 | trace_sched_wait_task(p); |
2253 | running = task_running(rq, p); | 2266 | running = task_running(rq, p); |
2254 | on_rq = p->se.on_rq; | 2267 | on_rq = p->on_rq; |
2255 | ncsw = 0; | 2268 | ncsw = 0; |
2256 | if (!match_state || p->state == match_state) | 2269 | if (!match_state || p->state == match_state) |
2257 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2270 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
2258 | task_rq_unlock(rq, &flags); | 2271 | task_rq_unlock(rq, p, &flags); |
2259 | 2272 | ||
2260 | /* | 2273 | /* |
2261 | * If it changed from the expected state, bail out now. | 2274 | * If it changed from the expected state, bail out now. |
@@ -2309,7 +2322,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
2309 | * Cause a process which is running on another CPU to enter | 2322 | * Cause a process which is running on another CPU to enter |
2310 | * kernel-mode, without any delay. (to get signals handled.) | 2323 | * kernel-mode, without any delay. (to get signals handled.) |
2311 | * | 2324 | * |
2312 | * NOTE: this function doesnt have to take the runqueue lock, | 2325 | * NOTE: this function doesn't have to take the runqueue lock, |
2313 | * because all it wants to ensure is that the remote task enters | 2326 | * because all it wants to ensure is that the remote task enters |
2314 | * the kernel. If the IPI races and the task has been migrated | 2327 | * the kernel. If the IPI races and the task has been migrated |
2315 | * to another CPU then no harm is done and the purpose has been | 2328 | * to another CPU then no harm is done and the purpose has been |
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
2330 | 2343 | ||
2331 | #ifdef CONFIG_SMP | 2344 | #ifdef CONFIG_SMP |
2332 | /* | 2345 | /* |
2333 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2346 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
2334 | */ | 2347 | */ |
2335 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2348 | static int select_fallback_rq(int cpu, struct task_struct *p) |
2336 | { | 2349 | { |
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2363 | } | 2376 | } |
2364 | 2377 | ||
2365 | /* | 2378 | /* |
2366 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2379 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
2367 | */ | 2380 | */ |
2368 | static inline | 2381 | static inline |
2369 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2382 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
2370 | { | 2383 | { |
2371 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2384 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
2372 | 2385 | ||
2373 | /* | 2386 | /* |
2374 | * In order not to call set_task_cpu() on a blocking task we need | 2387 | * In order not to call set_task_cpu() on a blocking task we need |
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample) | |||
2394 | } | 2407 | } |
2395 | #endif | 2408 | #endif |
2396 | 2409 | ||
2397 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2410 | static void |
2398 | bool is_sync, bool is_migrate, bool is_local, | 2411 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
2399 | unsigned long en_flags) | ||
2400 | { | 2412 | { |
2413 | #ifdef CONFIG_SCHEDSTATS | ||
2414 | struct rq *rq = this_rq(); | ||
2415 | |||
2416 | #ifdef CONFIG_SMP | ||
2417 | int this_cpu = smp_processor_id(); | ||
2418 | |||
2419 | if (cpu == this_cpu) { | ||
2420 | schedstat_inc(rq, ttwu_local); | ||
2421 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2422 | } else { | ||
2423 | struct sched_domain *sd; | ||
2424 | |||
2425 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2426 | rcu_read_lock(); | ||
2427 | for_each_domain(this_cpu, sd) { | ||
2428 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2429 | schedstat_inc(sd, ttwu_wake_remote); | ||
2430 | break; | ||
2431 | } | ||
2432 | } | ||
2433 | rcu_read_unlock(); | ||
2434 | } | ||
2435 | #endif /* CONFIG_SMP */ | ||
2436 | |||
2437 | schedstat_inc(rq, ttwu_count); | ||
2401 | schedstat_inc(p, se.statistics.nr_wakeups); | 2438 | schedstat_inc(p, se.statistics.nr_wakeups); |
2402 | if (is_sync) | 2439 | |
2440 | if (wake_flags & WF_SYNC) | ||
2403 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2441 | schedstat_inc(p, se.statistics.nr_wakeups_sync); |
2404 | if (is_migrate) | 2442 | |
2443 | if (cpu != task_cpu(p)) | ||
2405 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2444 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); |
2406 | if (is_local) | ||
2407 | schedstat_inc(p, se.statistics.nr_wakeups_local); | ||
2408 | else | ||
2409 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | ||
2410 | 2445 | ||
2446 | #endif /* CONFIG_SCHEDSTATS */ | ||
2447 | } | ||
2448 | |||
2449 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
2450 | { | ||
2411 | activate_task(rq, p, en_flags); | 2451 | activate_task(rq, p, en_flags); |
2452 | p->on_rq = 1; | ||
2453 | |||
2454 | /* if a worker is waking up, notify workqueue */ | ||
2455 | if (p->flags & PF_WQ_WORKER) | ||
2456 | wq_worker_waking_up(p, cpu_of(rq)); | ||
2412 | } | 2457 | } |
2413 | 2458 | ||
2414 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2459 | /* |
2415 | int wake_flags, bool success) | 2460 | * Mark the task runnable and perform wakeup-preemption. |
2461 | */ | ||
2462 | static void | ||
2463 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2416 | { | 2464 | { |
2417 | trace_sched_wakeup(p, success); | 2465 | trace_sched_wakeup(p, true); |
2418 | check_preempt_curr(rq, p, wake_flags); | 2466 | check_preempt_curr(rq, p, wake_flags); |
2419 | 2467 | ||
2420 | p->state = TASK_RUNNING; | 2468 | p->state = TASK_RUNNING; |
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2433 | rq->idle_stamp = 0; | 2481 | rq->idle_stamp = 0; |
2434 | } | 2482 | } |
2435 | #endif | 2483 | #endif |
2436 | /* if a worker is waking up, notify workqueue */ | 2484 | } |
2437 | if ((p->flags & PF_WQ_WORKER) && success) | 2485 | |
2438 | wq_worker_waking_up(p, cpu_of(rq)); | 2486 | static void |
2487 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
2488 | { | ||
2489 | #ifdef CONFIG_SMP | ||
2490 | if (p->sched_contributes_to_load) | ||
2491 | rq->nr_uninterruptible--; | ||
2492 | #endif | ||
2493 | |||
2494 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
2495 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2496 | } | ||
2497 | |||
2498 | /* | ||
2499 | * Called in case the task @p isn't fully descheduled from its runqueue; | ||
2500 | * in this case we must do a remote wakeup. It's a 'light' wakeup though, | ||
2501 | * since all we need to do is flip p->state to TASK_RUNNING, because | ||
2502 | * the task is still ->on_rq. | ||
2503 | */ | ||
2504 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
2505 | { | ||
2506 | struct rq *rq; | ||
2507 | int ret = 0; | ||
2508 | |||
2509 | rq = __task_rq_lock(p); | ||
2510 | if (p->on_rq) { | ||
2511 | ttwu_do_wakeup(rq, p, wake_flags); | ||
2512 | ret = 1; | ||
2513 | } | ||
2514 | __task_rq_unlock(rq); | ||
2515 | |||
2516 | return ret; | ||
2517 | } | ||
2518 | |||
2519 | #ifdef CONFIG_SMP | ||
2520 | static void sched_ttwu_pending(void) | ||
2521 | { | ||
2522 | struct rq *rq = this_rq(); | ||
2523 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
2524 | |||
2525 | if (!list) | ||
2526 | return; | ||
2527 | |||
2528 | raw_spin_lock(&rq->lock); | ||
2529 | |||
2530 | while (list) { | ||
2531 | struct task_struct *p = list; | ||
2532 | list = list->wake_entry; | ||
2533 | ttwu_do_activate(rq, p, 0); | ||
2534 | } | ||
2535 | |||
2536 | raw_spin_unlock(&rq->lock); | ||
2537 | } | ||
2538 | |||
2539 | void scheduler_ipi(void) | ||
2540 | { | ||
2541 | sched_ttwu_pending(); | ||
2542 | } | ||
2543 | |||
2544 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
2545 | { | ||
2546 | struct rq *rq = cpu_rq(cpu); | ||
2547 | struct task_struct *next = rq->wake_list; | ||
2548 | |||
2549 | for (;;) { | ||
2550 | struct task_struct *old = next; | ||
2551 | |||
2552 | p->wake_entry = next; | ||
2553 | next = cmpxchg(&rq->wake_list, old, p); | ||
2554 | if (next == old) | ||
2555 | break; | ||
2556 | } | ||
2557 | |||
2558 | if (!next) | ||
2559 | smp_send_reschedule(cpu); | ||
2560 | } | ||
2561 | #endif | ||
2562 | |||
2563 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
2564 | { | ||
2565 | struct rq *rq = cpu_rq(cpu); | ||
2566 | |||
2567 | #if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) | ||
2568 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
2569 | ttwu_queue_remote(p, cpu); | ||
2570 | return; | ||
2571 | } | ||
2572 | #endif | ||
2573 | |||
2574 | raw_spin_lock(&rq->lock); | ||
2575 | ttwu_do_activate(rq, p, 0); | ||
2576 | raw_spin_unlock(&rq->lock); | ||
2439 | } | 2577 | } |
2440 | 2578 | ||
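ttwu_queue_remote() and sched_ttwu_pending() form a lock-free producer/consumer pair: wakers push a task onto rq->wake_list with cmpxchg() and send a reschedule IPI only when the list was previously empty, while the target CPU grabs the entire list with xchg() and activates each entry under its own rq->lock. A self-contained C11 sketch of the same list discipline, using standard atomics in place of the kernel primitives and printf in place of activation:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct task {
	struct task *wake_entry;
	const char *name;
};

static _Atomic(struct task *) wake_list;	/* plays the role of rq->wake_list */

/* Producer: lock-free push, as in ttwu_queue_remote(). Returns nonzero when
 * the list was empty, i.e. when the kernel would send the reschedule IPI. */
static int push_wakeup(struct task *p)
{
	struct task *next = atomic_load(&wake_list);

	do {
		p->wake_entry = next;
	} while (!atomic_compare_exchange_weak(&wake_list, &next, p));

	return next == NULL;
}

/* Consumer: grab the whole list in one shot, as in sched_ttwu_pending(). */
static void drain_wakeups(void)
{
	struct task *list = atomic_exchange(&wake_list, (struct task *)NULL);

	while (list) {
		struct task *p = list;

		list = list->wake_entry;
		printf("activating %s\n", p->name);	/* ttwu_do_activate() in the kernel */
	}
}

int main(void)
{
	struct task a = { NULL, "a" }, b = { NULL, "b" };

	if (push_wakeup(&a))
		printf("list was empty: would send reschedule IPI\n");
	push_wakeup(&b);
	drain_wakeups();	/* prints b then a: pushes are LIFO at the list head */
	return 0;
}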
2441 | /** | 2579 | /** |
@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
2453 | * Returns %true if @p was woken up, %false if it was already running | 2591 | * Returns %true if @p was woken up, %false if it was already running |
2454 | * or @state didn't match @p's state. | 2592 | * or @state didn't match @p's state. |
2455 | */ | 2593 | */ |
2456 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2594 | static int |
2457 | int wake_flags) | 2595 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
2458 | { | 2596 | { |
2459 | int cpu, orig_cpu, this_cpu, success = 0; | ||
2460 | unsigned long flags; | 2597 | unsigned long flags; |
2461 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2598 | int cpu, success = 0; |
2462 | struct rq *rq; | ||
2463 | |||
2464 | this_cpu = get_cpu(); | ||
2465 | 2599 | ||
2466 | smp_wmb(); | 2600 | smp_wmb(); |
2467 | rq = task_rq_lock(p, &flags); | 2601 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2468 | if (!(p->state & state)) | 2602 | if (!(p->state & state)) |
2469 | goto out; | 2603 | goto out; |
2470 | 2604 | ||
2471 | if (p->se.on_rq) | 2605 | success = 1; /* we're going to change ->state */ |
2472 | goto out_running; | ||
2473 | |||
2474 | cpu = task_cpu(p); | 2606 | cpu = task_cpu(p); |
2475 | orig_cpu = cpu; | ||
2476 | 2607 | ||
2477 | #ifdef CONFIG_SMP | 2608 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2478 | if (unlikely(task_running(rq, p))) | 2609 | goto stat; |
2479 | goto out_activate; | ||
2480 | 2610 | ||
2611 | #ifdef CONFIG_SMP | ||
2481 | /* | 2612 | /* |
2482 | * In order to handle concurrent wakeups and release the rq->lock | 2613 | * If the owning (remote) cpu is still in the middle of schedule() with |
2483 | * we put the task in TASK_WAKING state. | 2614 | * this task as prev, wait until it's done referencing the task. |
2484 | * | ||
2485 | * First fix up the nr_uninterruptible count: | ||
2486 | */ | 2615 | */ |
2487 | if (task_contributes_to_load(p)) { | 2616 | while (p->on_cpu) { |
2488 | if (likely(cpu_online(orig_cpu))) | 2617 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
2489 | rq->nr_uninterruptible--; | 2618 | /* |
2490 | else | 2619 | * If called from interrupt context we could have landed in the |
2491 | this_rq()->nr_uninterruptible--; | 2620 | * middle of schedule(); in this case we should take care not |
2492 | } | 2621 | * to spin on ->on_cpu if p is current, since that would |
2493 | p->state = TASK_WAKING; | 2622 | * deadlock. |
2494 | 2623 | */ | |
2495 | if (p->sched_class->task_waking) { | 2624 | if (p == current) { |
2496 | p->sched_class->task_waking(rq, p); | 2625 | ttwu_queue(p, cpu); |
2497 | en_flags |= ENQUEUE_WAKING; | 2626 | goto stat; |
2627 | } | ||
2628 | #endif | ||
2629 | cpu_relax(); | ||
2498 | } | 2630 | } |
2499 | |||
2500 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | ||
2501 | if (cpu != orig_cpu) | ||
2502 | set_task_cpu(p, cpu); | ||
2503 | __task_rq_unlock(rq); | ||
2504 | |||
2505 | rq = cpu_rq(cpu); | ||
2506 | raw_spin_lock(&rq->lock); | ||
2507 | |||
2508 | /* | 2631 | /* |
2509 | * We migrated the task without holding either rq->lock, however | 2632 | * Pairs with the smp_wmb() in finish_lock_switch(). |
2510 | * since the task is not on the task list itself, nobody else | ||
2511 | * will try and migrate the task, hence the rq should match the | ||
2512 | * cpu we just moved it to. | ||
2513 | */ | 2633 | */ |
2514 | WARN_ON(task_cpu(p) != cpu); | 2634 | smp_rmb(); |
2515 | WARN_ON(p->state != TASK_WAKING); | ||
2516 | 2635 | ||
2517 | #ifdef CONFIG_SCHEDSTATS | 2636 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2518 | schedstat_inc(rq, ttwu_count); | 2637 | p->state = TASK_WAKING; |
2519 | if (cpu == this_cpu) | ||
2520 | schedstat_inc(rq, ttwu_local); | ||
2521 | else { | ||
2522 | struct sched_domain *sd; | ||
2523 | for_each_domain(this_cpu, sd) { | ||
2524 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2525 | schedstat_inc(sd, ttwu_wake_remote); | ||
2526 | break; | ||
2527 | } | ||
2528 | } | ||
2529 | } | ||
2530 | #endif /* CONFIG_SCHEDSTATS */ | ||
2531 | 2638 | ||
2532 | out_activate: | 2639 | if (p->sched_class->task_waking) |
2640 | p->sched_class->task_waking(p); | ||
2641 | |||
2642 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
2643 | if (task_cpu(p) != cpu) | ||
2644 | set_task_cpu(p, cpu); | ||
2533 | #endif /* CONFIG_SMP */ | 2645 | #endif /* CONFIG_SMP */ |
2534 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2646 | |
2535 | cpu == this_cpu, en_flags); | 2647 | ttwu_queue(p, cpu); |
2536 | success = 1; | 2648 | stat: |
2537 | out_running: | 2649 | ttwu_stat(p, cpu, wake_flags); |
2538 | ttwu_post_activation(p, rq, wake_flags, success); | ||
2539 | out: | 2650 | out: |
2540 | task_rq_unlock(rq, &flags); | 2651 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2541 | put_cpu(); | ||
2542 | 2652 | ||
2543 | return success; | 2653 | return success; |
2544 | } | 2654 | } |
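The while (p->on_cpu) spin followed by smp_rmb() pairs with the smp_wmb() ahead of prev->on_cpu = 0 in finish_lock_switch(): the waker must not inspect the task's scheduling state until the CPU switching it out has finished writing it. In portable terms this is the usual release/acquire handshake; a hypothetical C11 sketch of the pairing, not the kernel's barrier API:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int task_state;			/* data published by the "previous" CPU */
static atomic_int on_cpu = 1;		/* plays the role of p->on_cpu */

static void *finish_switch(void *arg)
{
	task_state = 42;		/* writes performed while still on the CPU */
	/* Release store: like smp_wmb(); prev->on_cpu = 0; in finish_lock_switch(). */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t prev_cpu;

	pthread_create(&prev_cpu, NULL, finish_switch, NULL);

	/* Acquire spin: like while (p->on_cpu) cpu_relax(); smp_rmb(); in ttwu. */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;			/* cpu_relax() equivalent omitted */

	printf("saw task_state=%d after on_cpu cleared\n", task_state);
	pthread_join(prev_cpu, NULL);
	return 0;
}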
@@ -2547,31 +2657,34 @@ out: | |||
2547 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2657 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2548 | * @p: the thread to be awakened | 2658 | * @p: the thread to be awakened |
2549 | * | 2659 | * |
2550 | * Put @p on the run-queue if it's not already there. The caller must | 2660 | * Put @p on the run-queue if it's not already there. The caller must |
2551 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2661 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2552 | * the current task. this_rq() stays locked over invocation. | 2662 | * the current task. |
2553 | */ | 2663 | */ |
2554 | static void try_to_wake_up_local(struct task_struct *p) | 2664 | static void try_to_wake_up_local(struct task_struct *p) |
2555 | { | 2665 | { |
2556 | struct rq *rq = task_rq(p); | 2666 | struct rq *rq = task_rq(p); |
2557 | bool success = false; | ||
2558 | 2667 | ||
2559 | BUG_ON(rq != this_rq()); | 2668 | BUG_ON(rq != this_rq()); |
2560 | BUG_ON(p == current); | 2669 | BUG_ON(p == current); |
2561 | lockdep_assert_held(&rq->lock); | 2670 | lockdep_assert_held(&rq->lock); |
2562 | 2671 | ||
2672 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
2673 | raw_spin_unlock(&rq->lock); | ||
2674 | raw_spin_lock(&p->pi_lock); | ||
2675 | raw_spin_lock(&rq->lock); | ||
2676 | } | ||
2677 | |||
2563 | if (!(p->state & TASK_NORMAL)) | 2678 | if (!(p->state & TASK_NORMAL)) |
2564 | return; | 2679 | goto out; |
2565 | 2680 | ||
2566 | if (!p->se.on_rq) { | 2681 | if (!p->on_rq) |
2567 | if (likely(!task_running(rq, p))) { | 2682 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2568 | schedstat_inc(rq, ttwu_count); | 2683 | |
2569 | schedstat_inc(rq, ttwu_local); | 2684 | ttwu_do_wakeup(rq, p, 0); |
2570 | } | 2685 | ttwu_stat(p, smp_processor_id(), 0); |
2571 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2686 | out: |
2572 | success = true; | 2687 | raw_spin_unlock(&p->pi_lock); |
2573 | } | ||
2574 | ttwu_post_activation(p, rq, 0, success); | ||
2575 | } | 2688 | } |
2576 | 2689 | ||
2577 | /** | 2690 | /** |
@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
2604 | */ | 2717 | */ |
2605 | static void __sched_fork(struct task_struct *p) | 2718 | static void __sched_fork(struct task_struct *p) |
2606 | { | 2719 | { |
2720 | p->on_rq = 0; | ||
2721 | |||
2722 | p->se.on_rq = 0; | ||
2607 | p->se.exec_start = 0; | 2723 | p->se.exec_start = 0; |
2608 | p->se.sum_exec_runtime = 0; | 2724 | p->se.sum_exec_runtime = 0; |
2609 | p->se.prev_sum_exec_runtime = 0; | 2725 | p->se.prev_sum_exec_runtime = 0; |
2610 | p->se.nr_migrations = 0; | 2726 | p->se.nr_migrations = 0; |
2611 | p->se.vruntime = 0; | 2727 | p->se.vruntime = 0; |
2728 | INIT_LIST_HEAD(&p->se.group_node); | ||
2612 | 2729 | ||
2613 | #ifdef CONFIG_SCHEDSTATS | 2730 | #ifdef CONFIG_SCHEDSTATS |
2614 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2731 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
2615 | #endif | 2732 | #endif |
2616 | 2733 | ||
2617 | INIT_LIST_HEAD(&p->rt.run_list); | 2734 | INIT_LIST_HEAD(&p->rt.run_list); |
2618 | p->se.on_rq = 0; | ||
2619 | INIT_LIST_HEAD(&p->se.group_node); | ||
2620 | 2735 | ||
2621 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2736 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
2622 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2737 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p) | |||
2626 | /* | 2741 | /* |
2627 | * fork()/clone()-time setup: | 2742 | * fork()/clone()-time setup: |
2628 | */ | 2743 | */ |
2629 | void sched_fork(struct task_struct *p, int clone_flags) | 2744 | void sched_fork(struct task_struct *p) |
2630 | { | 2745 | { |
2746 | unsigned long flags; | ||
2631 | int cpu = get_cpu(); | 2747 | int cpu = get_cpu(); |
2632 | 2748 | ||
2633 | __sched_fork(p); | 2749 | __sched_fork(p); |
@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2678 | * | 2794 | * |
2679 | * Silence PROVE_RCU. | 2795 | * Silence PROVE_RCU. |
2680 | */ | 2796 | */ |
2681 | rcu_read_lock(); | 2797 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2682 | set_task_cpu(p, cpu); | 2798 | set_task_cpu(p, cpu); |
2683 | rcu_read_unlock(); | 2799 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2684 | 2800 | ||
2685 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2801 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2686 | if (likely(sched_info_on())) | 2802 | if (likely(sched_info_on())) |
2687 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2803 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
2688 | #endif | 2804 | #endif |
2689 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2805 | #if defined(CONFIG_SMP) |
2690 | p->oncpu = 0; | 2806 | p->on_cpu = 0; |
2691 | #endif | 2807 | #endif |
2692 | #ifdef CONFIG_PREEMPT | 2808 | #ifdef CONFIG_PREEMPT |
2693 | /* Want to start with kernel preemption disabled. */ | 2809 | /* Want to start with kernel preemption disabled. */ |
@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2707 | * that must be done for every newly created context, then puts the task | 2823 | * that must be done for every newly created context, then puts the task |
2708 | * on the runqueue and wakes it. | 2824 | * on the runqueue and wakes it. |
2709 | */ | 2825 | */ |
2710 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2826 | void wake_up_new_task(struct task_struct *p) |
2711 | { | 2827 | { |
2712 | unsigned long flags; | 2828 | unsigned long flags; |
2713 | struct rq *rq; | 2829 | struct rq *rq; |
2714 | int cpu __maybe_unused = get_cpu(); | ||
2715 | 2830 | ||
2831 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
2716 | #ifdef CONFIG_SMP | 2832 | #ifdef CONFIG_SMP |
2717 | rq = task_rq_lock(p, &flags); | ||
2718 | p->state = TASK_WAKING; | ||
2719 | |||
2720 | /* | 2833 | /* |
2721 | * Fork balancing, do it here and not earlier because: | 2834 | * Fork balancing, do it here and not earlier because: |
2722 | * - cpus_allowed can change in the fork path | 2835 | * - cpus_allowed can change in the fork path |
2723 | * - any previously selected cpu might disappear through hotplug | 2836 | * - any previously selected cpu might disappear through hotplug |
2724 | * | ||
2725 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
2726 | * without people poking at ->cpus_allowed. | ||
2727 | */ | 2837 | */ |
2728 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2838 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
2729 | set_task_cpu(p, cpu); | ||
2730 | |||
2731 | p->state = TASK_RUNNING; | ||
2732 | task_rq_unlock(rq, &flags); | ||
2733 | #endif | 2839 | #endif |
2734 | 2840 | ||
2735 | rq = task_rq_lock(p, &flags); | 2841 | rq = __task_rq_lock(p); |
2736 | activate_task(rq, p, 0); | 2842 | activate_task(rq, p, 0); |
2737 | trace_sched_wakeup_new(p, 1); | 2843 | p->on_rq = 1; |
2844 | trace_sched_wakeup_new(p, true); | ||
2738 | check_preempt_curr(rq, p, WF_FORK); | 2845 | check_preempt_curr(rq, p, WF_FORK); |
2739 | #ifdef CONFIG_SMP | 2846 | #ifdef CONFIG_SMP |
2740 | if (p->sched_class->task_woken) | 2847 | if (p->sched_class->task_woken) |
2741 | p->sched_class->task_woken(rq, p); | 2848 | p->sched_class->task_woken(rq, p); |
2742 | #endif | 2849 | #endif |
2743 | task_rq_unlock(rq, &flags); | 2850 | task_rq_unlock(rq, p, &flags); |
2744 | put_cpu(); | ||
2745 | } | 2851 | } |
2746 | 2852 | ||
2747 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2853 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
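From here on task_rq_unlock() takes the task as an extra argument because task_rq_lock() now pairs p->pi_lock with the runqueue lock; pi_lock, not a transient TASK_WAKING state, is what serializes wakeups against affinity and priority changes. A sketch of what such a combined helper looks like, assuming the usual retry loop for tasks that migrate between the two acquisitions (the real helper sits earlier in sched.c and is not part of this hunk):

	static struct rq *task_rq_lock_sketch(struct task_struct *p,
					      unsigned long *flags)
		__acquires(p->pi_lock)
		__acquires(rq->lock)
	{
		struct rq *rq;

		for (;;) {
			raw_spin_lock_irqsave(&p->pi_lock, *flags);
			rq = task_rq(p);
			raw_spin_lock(&rq->lock);
			if (likely(rq == task_rq(p)))
				return rq;	/* both locks held, task pinned */
			/* Task moved to another rq in between: retry. */
			raw_spin_unlock(&rq->lock);
			raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
		}
	}

	static void task_rq_unlock_sketch(struct rq *rq, struct task_struct *p,
					  unsigned long *flags)
		__releases(rq->lock)
		__releases(p->pi_lock)
	{
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}

This is also why wake_up_new_task() above can call select_task_rq() with only pi_lock held: ->cpus_allowed updates go through the same pi_lock, so the mask cannot change underneath it.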
@@ -3450,27 +3556,22 @@ void sched_exec(void) | |||
3450 | { | 3556 | { |
3451 | struct task_struct *p = current; | 3557 | struct task_struct *p = current; |
3452 | unsigned long flags; | 3558 | unsigned long flags; |
3453 | struct rq *rq; | ||
3454 | int dest_cpu; | 3559 | int dest_cpu; |
3455 | 3560 | ||
3456 | rq = task_rq_lock(p, &flags); | 3561 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
3457 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3562 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
3458 | if (dest_cpu == smp_processor_id()) | 3563 | if (dest_cpu == smp_processor_id()) |
3459 | goto unlock; | 3564 | goto unlock; |
3460 | 3565 | ||
3461 | /* | 3566 | if (likely(cpu_active(dest_cpu))) { |
3462 | * select_task_rq() can race against ->cpus_allowed | ||
3463 | */ | ||
3464 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
3465 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { | ||
3466 | struct migration_arg arg = { p, dest_cpu }; | 3567 | struct migration_arg arg = { p, dest_cpu }; |
3467 | 3568 | ||
3468 | task_rq_unlock(rq, &flags); | 3569 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3469 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3570 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
3470 | return; | 3571 | return; |
3471 | } | 3572 | } |
3472 | unlock: | 3573 | unlock: |
3473 | task_rq_unlock(rq, &flags); | 3574 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
3474 | } | 3575 | } |
3475 | 3576 | ||
3476 | #endif | 3577 | #endif |
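sched_exec() now needs only p->pi_lock to pick a destination CPU; the move itself is handed to the cpu-stop machinery, the same pattern set_cpus_allowed_ptr() uses further down. Roughly (the migration_arg layout mirrors the one defined earlier in this file):

	struct migration_arg {
		struct task_struct	*task;
		int			dest_cpu;
	};

	/*
	 * Hand the actual move to the stopper thread on the task's CPU.
	 * No rq or pi locks may be held here: stop_one_cpu() blocks until
	 * migration_cpu_stop() has run on that CPU.
	 */
	static void kick_migration_sketch(struct task_struct *p, int dest_cpu)
	{
		struct migration_arg arg = { p, dest_cpu };

		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
	}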
@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
3507 | 3608 | ||
3508 | rq = task_rq_lock(p, &flags); | 3609 | rq = task_rq_lock(p, &flags); |
3509 | ns = do_task_delta_exec(p, rq); | 3610 | ns = do_task_delta_exec(p, rq); |
3510 | task_rq_unlock(rq, &flags); | 3611 | task_rq_unlock(rq, p, &flags); |
3511 | 3612 | ||
3512 | return ns; | 3613 | return ns; |
3513 | } | 3614 | } |
@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3525 | 3626 | ||
3526 | rq = task_rq_lock(p, &flags); | 3627 | rq = task_rq_lock(p, &flags); |
3527 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3628 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
3528 | task_rq_unlock(rq, &flags); | 3629 | task_rq_unlock(rq, p, &flags); |
3529 | 3630 | ||
3530 | return ns; | 3631 | return ns; |
3531 | } | 3632 | } |
@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
3549 | rq = task_rq_lock(p, &flags); | 3650 | rq = task_rq_lock(p, &flags); |
3550 | thread_group_cputime(p, &totals); | 3651 | thread_group_cputime(p, &totals); |
3551 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3652 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
3552 | task_rq_unlock(rq, &flags); | 3653 | task_rq_unlock(rq, p, &flags); |
3553 | 3654 | ||
3554 | return ns; | 3655 | return ns; |
3555 | } | 3656 | } |
@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
3903 | /* | 4004 | /* |
3904 | * This function gets called by the timer code, with HZ frequency. | 4005 | * This function gets called by the timer code, with HZ frequency. |
3905 | * We call it with interrupts disabled. | 4006 | * We call it with interrupts disabled. |
3906 | * | ||
3907 | * It also gets called by the fork code, when changing the parent's | ||
3908 | * timeslices. | ||
3909 | */ | 4007 | */ |
3910 | void scheduler_tick(void) | 4008 | void scheduler_tick(void) |
3911 | { | 4009 | { |
@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev) | |||
4025 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4123 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
4026 | 4124 | ||
4027 | schedstat_inc(this_rq(), sched_count); | 4125 | schedstat_inc(this_rq(), sched_count); |
4028 | #ifdef CONFIG_SCHEDSTATS | ||
4029 | if (unlikely(prev->lock_depth >= 0)) { | ||
4030 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); | ||
4031 | schedstat_inc(prev, sched_info.bkl_count); | ||
4032 | } | ||
4033 | #endif | ||
4034 | } | 4126 | } |
4035 | 4127 | ||
4036 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4128 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
4037 | { | 4129 | { |
4038 | if (prev->se.on_rq) | 4130 | if (prev->on_rq || rq->skip_clock_update < 0) |
4039 | update_rq_clock(rq); | 4131 | update_rq_clock(rq); |
4040 | prev->sched_class->put_prev_task(rq, prev); | 4132 | prev->sched_class->put_prev_task(rq, prev); |
4041 | } | 4133 | } |
@@ -4097,11 +4189,13 @@ need_resched: | |||
4097 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4189 | if (unlikely(signal_pending_state(prev->state, prev))) { |
4098 | prev->state = TASK_RUNNING; | 4190 | prev->state = TASK_RUNNING; |
4099 | } else { | 4191 | } else { |
4192 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
4193 | prev->on_rq = 0; | ||
4194 | |||
4100 | /* | 4195 | /* |
4101 | * If a worker is going to sleep, notify and | 4196 | * If a worker went to sleep, notify and ask workqueue |
4102 | * ask workqueue whether it wants to wake up a | 4197 | * whether it wants to wake up a task to maintain |
4103 | * task to maintain concurrency. If so, wake | 4198 | * concurrency. |
4104 | * up the task. | ||
4105 | */ | 4199 | */ |
4106 | if (prev->flags & PF_WQ_WORKER) { | 4200 | if (prev->flags & PF_WQ_WORKER) { |
4107 | struct task_struct *to_wakeup; | 4201 | struct task_struct *to_wakeup; |
@@ -4110,21 +4204,20 @@ need_resched: | |||
4110 | if (to_wakeup) | 4204 | if (to_wakeup) |
4111 | try_to_wake_up_local(to_wakeup); | 4205 | try_to_wake_up_local(to_wakeup); |
4112 | } | 4206 | } |
4113 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 4207 | |
4208 | /* | ||
4209 | * If we are going to sleep and we have plugged IO | ||
4210 | * queued, make sure to submit it to avoid deadlocks. | ||
4211 | */ | ||
4212 | if (blk_needs_flush_plug(prev)) { | ||
4213 | raw_spin_unlock(&rq->lock); | ||
4214 | blk_schedule_flush_plug(prev); | ||
4215 | raw_spin_lock(&rq->lock); | ||
4216 | } | ||
4114 | } | 4217 | } |
4115 | switch_count = &prev->nvcsw; | 4218 | switch_count = &prev->nvcsw; |
4116 | } | 4219 | } |
4117 | 4220 | ||
4118 | /* | ||
4119 | * If we are going to sleep and we have plugged IO queued, make | ||
4120 | * sure to submit it to avoid deadlocks. | ||
4121 | */ | ||
4122 | if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) { | ||
4123 | raw_spin_unlock(&rq->lock); | ||
4124 | blk_flush_plug(prev); | ||
4125 | raw_spin_lock(&rq->lock); | ||
4126 | } | ||
4127 | |||
4128 | pre_schedule(rq, prev); | 4221 | pre_schedule(rq, prev); |
4129 | 4222 | ||
4130 | if (unlikely(!rq->nr_running)) | 4223 | if (unlikely(!rq->nr_running)) |
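In the sleep path of schedule() above, the plugged-I/O flush now runs with rq->lock held, so the lock is dropped around the call and blk_schedule_flush_plug() defers the actual submission work rather than issuing it directly from scheduler context. The shape of that fragment in isolation, with a comment on why the unlock/lock pair is required:

	if (blk_needs_flush_plug(prev)) {
		/*
		 * Block-layer code may sleep or take non-raw locks, so
		 * rq->lock cannot be held across the flush.
		 */
		raw_spin_unlock(&rq->lock);
		blk_schedule_flush_plug(prev);
		raw_spin_lock(&rq->lock);
	}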
@@ -4161,70 +4254,53 @@ need_resched: | |||
4161 | EXPORT_SYMBOL(schedule); | 4254 | EXPORT_SYMBOL(schedule); |
4162 | 4255 | ||
4163 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4256 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
4164 | /* | ||
4165 | * Look out! "owner" is an entirely speculative pointer | ||
4166 | * access and not reliable. | ||
4167 | */ | ||
4168 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | ||
4169 | { | ||
4170 | unsigned int cpu; | ||
4171 | struct rq *rq; | ||
4172 | 4257 | ||
4173 | if (!sched_feat(OWNER_SPIN)) | 4258 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
4174 | return 0; | 4259 | { |
4260 | bool ret = false; | ||
4175 | 4261 | ||
4176 | #ifdef CONFIG_DEBUG_PAGEALLOC | 4262 | rcu_read_lock(); |
4177 | /* | 4263 | if (lock->owner != owner) |
4178 | * Need to access the cpu field knowing that | 4264 | goto fail; |
4179 | * DEBUG_PAGEALLOC could have unmapped it if | ||
4180 | * the mutex owner just released it and exited. | ||
4181 | */ | ||
4182 | if (probe_kernel_address(&owner->cpu, cpu)) | ||
4183 | return 0; | ||
4184 | #else | ||
4185 | cpu = owner->cpu; | ||
4186 | #endif | ||
4187 | 4265 | ||
4188 | /* | 4266 | /* |
4189 | * Even if the access succeeded (likely case), | 4267 | * Ensure we emit the owner->on_cpu, dereference _after_ checking |
4190 | * the cpu field may no longer be valid. | 4268 | * lock->owner still matches owner, if that fails, owner might |
4269 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
4270 | * ensures the memory stays valid. | ||
4191 | */ | 4271 | */ |
4192 | if (cpu >= nr_cpumask_bits) | 4272 | barrier(); |
4193 | return 0; | ||
4194 | 4273 | ||
4195 | /* | 4274 | ret = owner->on_cpu; |
4196 | * We need to validate that we can do a | 4275 | fail: |
4197 | * get_cpu() and that we have the percpu area. | 4276 | rcu_read_unlock(); |
4198 | */ | ||
4199 | if (!cpu_online(cpu)) | ||
4200 | return 0; | ||
4201 | 4277 | ||
4202 | rq = cpu_rq(cpu); | 4278 | return ret; |
4279 | } | ||
4203 | 4280 | ||
4204 | for (;;) { | 4281 | /* |
4205 | /* | 4282 | * Look out! "owner" is an entirely speculative pointer |
4206 | * Owner changed, break to re-assess state. | 4283 | * access and not reliable. |
4207 | */ | 4284 | */ |
4208 | if (lock->owner != owner) { | 4285 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
4209 | /* | 4286 | { |
4210 | * If the lock has switched to a different owner, | 4287 | if (!sched_feat(OWNER_SPIN)) |
4211 | * we likely have heavy contention. Return 0 to quit | 4288 | return 0; |
4212 | * optimistic spinning and not contend further: | ||
4213 | */ | ||
4214 | if (lock->owner) | ||
4215 | return 0; | ||
4216 | break; | ||
4217 | } | ||
4218 | 4289 | ||
4219 | /* | 4290 | while (owner_running(lock, owner)) { |
4220 | * Is that owner really running on that cpu? | 4291 | if (need_resched()) |
4221 | */ | ||
4222 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
4223 | return 0; | 4292 | return 0; |
4224 | 4293 | ||
4225 | arch_mutex_cpu_relax(); | 4294 | arch_mutex_cpu_relax(); |
4226 | } | 4295 | } |
4227 | 4296 | ||
4297 | /* | ||
4298 | * If the owner changed to another task there is likely | ||
4299 | * heavy contention, stop spinning. | ||
4300 | */ | ||
4301 | if (lock->owner) | ||
4302 | return 0; | ||
4303 | |||
4228 | return 1; | 4304 | return 1; |
4229 | } | 4305 | } |
4230 | #endif | 4306 | #endif |
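The rewritten spin loop no longer resolves the owner's runqueue at all: with lock->owner now a task_struct pointer and the new on_cpu field, the spinner only needs to know whether that task is still executing somewhere. The guts of that check, compacted into one sketch (rcu_read_lock() only guarantees the owner's task_struct is not freed while it is inspected):

	static inline bool owner_still_running(struct mutex *lock,
					       struct task_struct *owner)
	{
		bool ret = false;

		rcu_read_lock();
		if (lock->owner == owner) {
			/*
			 * Check lock->owner before dereferencing: if it no
			 * longer matches, 'owner' may point at an exiting
			 * task and only the RCU read side keeps the memory
			 * valid while it still matches.
			 */
			barrier();
			ret = owner->on_cpu;
		}
		rcu_read_unlock();

		return ret;
	}

The caller spins while this returns true and need_resched() is clear; after the loop it keeps trying (returns 1) only if the lock now appears ownerless, since a changed owner indicates contention heavy enough that spinning stops paying off.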
@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
4684 | */ | 4760 | */ |
4685 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4761 | void rt_mutex_setprio(struct task_struct *p, int prio) |
4686 | { | 4762 | { |
4687 | unsigned long flags; | ||
4688 | int oldprio, on_rq, running; | 4763 | int oldprio, on_rq, running; |
4689 | struct rq *rq; | 4764 | struct rq *rq; |
4690 | const struct sched_class *prev_class; | 4765 | const struct sched_class *prev_class; |
4691 | 4766 | ||
4692 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4767 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4693 | 4768 | ||
4694 | rq = task_rq_lock(p, &flags); | 4769 | rq = __task_rq_lock(p); |
4695 | 4770 | ||
4696 | trace_sched_pi_setprio(p, prio); | 4771 | trace_sched_pi_setprio(p, prio); |
4697 | oldprio = p->prio; | 4772 | oldprio = p->prio; |
4698 | prev_class = p->sched_class; | 4773 | prev_class = p->sched_class; |
4699 | on_rq = p->se.on_rq; | 4774 | on_rq = p->on_rq; |
4700 | running = task_current(rq, p); | 4775 | running = task_current(rq, p); |
4701 | if (on_rq) | 4776 | if (on_rq) |
4702 | dequeue_task(rq, p, 0); | 4777 | dequeue_task(rq, p, 0); |
@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4716 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4791 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4717 | 4792 | ||
4718 | check_class_changed(rq, p, prev_class, oldprio); | 4793 | check_class_changed(rq, p, prev_class, oldprio); |
4719 | task_rq_unlock(rq, &flags); | 4794 | __task_rq_unlock(rq); |
4720 | } | 4795 | } |
4721 | 4796 | ||
4722 | #endif | 4797 | #endif |
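rt_mutex_setprio() can use the bare __task_rq_lock() here because the rt-mutex code already holds p->pi_lock; apart from that it follows the usual shape for changing a task's scheduling attributes, the same one sched_setscheduler() uses below. That shape, stripped to a skeleton (the mutate callback is a hypothetical stand-in for whatever prio/policy/class change the caller performs):

	/* Skeleton only: rq->lock (and p->pi_lock where required) held. */
	static void change_sched_attr_sketch(struct rq *rq, struct task_struct *p,
					     void (*mutate)(struct task_struct *p))
	{
		int queued = p->on_rq;			/* was p->se.on_rq */
		int running = task_current(rq, p);

		if (queued)
			dequeue_task(rq, p, 0);
		if (running)
			put_prev_task(rq, p);

		mutate(p);				/* prio / policy / class */

		if (running)
			p->sched_class->set_curr_task(rq);
		if (queued)
			enqueue_task(rq, p, 0);		/* flags vary per caller */
	}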
@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4744 | p->static_prio = NICE_TO_PRIO(nice); | 4819 | p->static_prio = NICE_TO_PRIO(nice); |
4745 | goto out_unlock; | 4820 | goto out_unlock; |
4746 | } | 4821 | } |
4747 | on_rq = p->se.on_rq; | 4822 | on_rq = p->on_rq; |
4748 | if (on_rq) | 4823 | if (on_rq) |
4749 | dequeue_task(rq, p, 0); | 4824 | dequeue_task(rq, p, 0); |
4750 | 4825 | ||
@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4764 | resched_task(rq->curr); | 4839 | resched_task(rq->curr); |
4765 | } | 4840 | } |
4766 | out_unlock: | 4841 | out_unlock: |
4767 | task_rq_unlock(rq, &flags); | 4842 | task_rq_unlock(rq, p, &flags); |
4768 | } | 4843 | } |
4769 | EXPORT_SYMBOL(set_user_nice); | 4844 | EXPORT_SYMBOL(set_user_nice); |
4770 | 4845 | ||
@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
4878 | static void | 4953 | static void |
4879 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 4954 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4880 | { | 4955 | { |
4881 | BUG_ON(p->se.on_rq); | ||
4882 | |||
4883 | p->policy = policy; | 4956 | p->policy = policy; |
4884 | p->rt_priority = prio; | 4957 | p->rt_priority = prio; |
4885 | p->normal_prio = normal_prio(p); | 4958 | p->normal_prio = normal_prio(p); |
@@ -4994,20 +5067,17 @@ recheck: | |||
4994 | /* | 5067 | /* |
4995 | * make sure no PI-waiters arrive (or leave) while we are | 5068 | * make sure no PI-waiters arrive (or leave) while we are |
4996 | * changing the priority of the task: | 5069 | * changing the priority of the task: |
4997 | */ | 5070 | * |
4998 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5071 | * To be able to change p->policy safely, the appropriate |
4999 | /* | ||
5000 | * To be able to change p->policy safely, the apropriate | ||
5001 | * runqueue lock must be held. | 5072 | * runqueue lock must be held. |
5002 | */ | 5073 | */ |
5003 | rq = __task_rq_lock(p); | 5074 | rq = task_rq_lock(p, &flags); |
5004 | 5075 | ||
5005 | /* | 5076 | /* |
5006 | * Changing the policy of the stop threads is a very bad idea | 5077 | * Changing the policy of the stop threads is a very bad idea |
5007 | */ | 5078 | */ |
5008 | if (p == rq->stop) { | 5079 | if (p == rq->stop) { |
5009 | __task_rq_unlock(rq); | 5080 | task_rq_unlock(rq, p, &flags); |
5010 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5011 | return -EINVAL; | 5081 | return -EINVAL; |
5012 | } | 5082 | } |
5013 | 5083 | ||
@@ -5031,8 +5101,7 @@ recheck: | |||
5031 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5101 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
5032 | task_group(p)->rt_bandwidth.rt_runtime == 0 && | 5102 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
5033 | !task_group_is_autogroup(task_group(p))) { | 5103 | !task_group_is_autogroup(task_group(p))) { |
5034 | __task_rq_unlock(rq); | 5104 | task_rq_unlock(rq, p, &flags); |
5035 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5036 | return -EPERM; | 5105 | return -EPERM; |
5037 | } | 5106 | } |
5038 | } | 5107 | } |
@@ -5041,11 +5110,10 @@ recheck: | |||
5041 | /* recheck policy now with rq lock held */ | 5110 | /* recheck policy now with rq lock held */ |
5042 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5111 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
5043 | policy = oldpolicy = -1; | 5112 | policy = oldpolicy = -1; |
5044 | __task_rq_unlock(rq); | 5113 | task_rq_unlock(rq, p, &flags); |
5045 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5046 | goto recheck; | 5114 | goto recheck; |
5047 | } | 5115 | } |
5048 | on_rq = p->se.on_rq; | 5116 | on_rq = p->on_rq; |
5049 | running = task_current(rq, p); | 5117 | running = task_current(rq, p); |
5050 | if (on_rq) | 5118 | if (on_rq) |
5051 | deactivate_task(rq, p, 0); | 5119 | deactivate_task(rq, p, 0); |
@@ -5064,8 +5132,7 @@ recheck: | |||
5064 | activate_task(rq, p, 0); | 5132 | activate_task(rq, p, 0); |
5065 | 5133 | ||
5066 | check_class_changed(rq, p, prev_class, oldprio); | 5134 | check_class_changed(rq, p, prev_class, oldprio); |
5067 | __task_rq_unlock(rq); | 5135 | task_rq_unlock(rq, p, &flags); |
5068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
5069 | 5136 | ||
5070 | rt_mutex_adjust_pi(p); | 5137 | rt_mutex_adjust_pi(p); |
5071 | 5138 | ||
@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5316 | { | 5383 | { |
5317 | struct task_struct *p; | 5384 | struct task_struct *p; |
5318 | unsigned long flags; | 5385 | unsigned long flags; |
5319 | struct rq *rq; | ||
5320 | int retval; | 5386 | int retval; |
5321 | 5387 | ||
5322 | get_online_cpus(); | 5388 | get_online_cpus(); |
@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
5331 | if (retval) | 5397 | if (retval) |
5332 | goto out_unlock; | 5398 | goto out_unlock; |
5333 | 5399 | ||
5334 | rq = task_rq_lock(p, &flags); | 5400 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
5335 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5401 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
5336 | task_rq_unlock(rq, &flags); | 5402 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
5337 | 5403 | ||
5338 | out_unlock: | 5404 | out_unlock: |
5339 | rcu_read_unlock(); | 5405 | rcu_read_unlock(); |
@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
5658 | 5724 | ||
5659 | rq = task_rq_lock(p, &flags); | 5725 | rq = task_rq_lock(p, &flags); |
5660 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5726 | time_slice = p->sched_class->get_rr_interval(rq, p); |
5661 | task_rq_unlock(rq, &flags); | 5727 | task_rq_unlock(rq, p, &flags); |
5662 | 5728 | ||
5663 | rcu_read_unlock(); | 5729 | rcu_read_unlock(); |
5664 | jiffies_to_timespec(time_slice, &t); | 5730 | jiffies_to_timespec(time_slice, &t); |
@@ -5716,7 +5782,7 @@ void show_state_filter(unsigned long state_filter) | |||
5716 | do_each_thread(g, p) { | 5782 | do_each_thread(g, p) { |
5717 | /* | 5783 | /* |
5718 | * reset the NMI-timeout, listing all files on a slow | 5784 | * reset the NMI-timeout, listing all files on a slow |
5719 | * console might take alot of time: | 5785 | * console might take a lot of time: |
5720 | */ | 5786 | */ |
5721 | touch_nmi_watchdog(); | 5787 | touch_nmi_watchdog(); |
5722 | if (!state_filter || (p->state & state_filter)) | 5788 | if (!state_filter || (p->state & state_filter)) |
@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5776 | rcu_read_unlock(); | 5842 | rcu_read_unlock(); |
5777 | 5843 | ||
5778 | rq->curr = rq->idle = idle; | 5844 | rq->curr = rq->idle = idle; |
5779 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5845 | #if defined(CONFIG_SMP) |
5780 | idle->oncpu = 1; | 5846 | idle->on_cpu = 1; |
5781 | #endif | 5847 | #endif |
5782 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5848 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
5783 | 5849 | ||
5784 | /* Set the preempt count _outside_ the spinlocks! */ | 5850 | /* Set the preempt count _outside_ the spinlocks! */ |
5785 | #if defined(CONFIG_PREEMPT) | ||
5786 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
5787 | #else | ||
5788 | task_thread_info(idle)->preempt_count = 0; | 5851 | task_thread_info(idle)->preempt_count = 0; |
5789 | #endif | 5852 | |
5790 | /* | 5853 | /* |
5791 | * The idle tasks have their own, simple scheduling class: | 5854 | * The idle tasks have their own, simple scheduling class: |
5792 | */ | 5855 | */ |
@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
5881 | unsigned int dest_cpu; | 5944 | unsigned int dest_cpu; |
5882 | int ret = 0; | 5945 | int ret = 0; |
5883 | 5946 | ||
5884 | /* | ||
5885 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
5886 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
5887 | */ | ||
5888 | again: | ||
5889 | while (task_is_waking(p)) | ||
5890 | cpu_relax(); | ||
5891 | rq = task_rq_lock(p, &flags); | 5947 | rq = task_rq_lock(p, &flags); |
5892 | if (task_is_waking(p)) { | 5948 | |
5893 | task_rq_unlock(rq, &flags); | 5949 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
5894 | goto again; | 5950 | goto out; |
5895 | } | ||
5896 | 5951 | ||
5897 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 5952 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
5898 | ret = -EINVAL; | 5953 | ret = -EINVAL; |
5899 | goto out; | 5954 | goto out; |
5900 | } | 5955 | } |
5901 | 5956 | ||
5902 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 5957 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
5903 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
5904 | ret = -EINVAL; | 5958 | ret = -EINVAL; |
5905 | goto out; | 5959 | goto out; |
5906 | } | 5960 | } |
@@ -5917,16 +5971,16 @@ again: | |||
5917 | goto out; | 5971 | goto out; |
5918 | 5972 | ||
5919 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5973 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5920 | if (migrate_task(p, rq)) { | 5974 | if (p->on_rq) { |
5921 | struct migration_arg arg = { p, dest_cpu }; | 5975 | struct migration_arg arg = { p, dest_cpu }; |
5922 | /* Need help from migration thread: drop lock and wait. */ | 5976 | /* Need help from migration thread: drop lock and wait. */ |
5923 | task_rq_unlock(rq, &flags); | 5977 | task_rq_unlock(rq, p, &flags); |
5924 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 5978 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
5925 | tlb_migrate_finish(p->mm); | 5979 | tlb_migrate_finish(p->mm); |
5926 | return 0; | 5980 | return 0; |
5927 | } | 5981 | } |
5928 | out: | 5982 | out: |
5929 | task_rq_unlock(rq, &flags); | 5983 | task_rq_unlock(rq, p, &flags); |
5930 | 5984 | ||
5931 | return ret; | 5985 | return ret; |
5932 | } | 5986 | } |
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5954 | rq_src = cpu_rq(src_cpu); | 6008 | rq_src = cpu_rq(src_cpu); |
5955 | rq_dest = cpu_rq(dest_cpu); | 6009 | rq_dest = cpu_rq(dest_cpu); |
5956 | 6010 | ||
6011 | raw_spin_lock(&p->pi_lock); | ||
5957 | double_rq_lock(rq_src, rq_dest); | 6012 | double_rq_lock(rq_src, rq_dest); |
5958 | /* Already moved. */ | 6013 | /* Already moved. */ |
5959 | if (task_cpu(p) != src_cpu) | 6014 | if (task_cpu(p) != src_cpu) |
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5966 | * If we're not on a rq, the next wake-up will ensure we're | 6021 | * If we're not on a rq, the next wake-up will ensure we're |
5967 | * placed properly. | 6022 | * placed properly. |
5968 | */ | 6023 | */ |
5969 | if (p->se.on_rq) { | 6024 | if (p->on_rq) { |
5970 | deactivate_task(rq_src, p, 0); | 6025 | deactivate_task(rq_src, p, 0); |
5971 | set_task_cpu(p, dest_cpu); | 6026 | set_task_cpu(p, dest_cpu); |
5972 | activate_task(rq_dest, p, 0); | 6027 | activate_task(rq_dest, p, 0); |
@@ -5976,6 +6031,7 @@ done: | |||
5976 | ret = 1; | 6031 | ret = 1; |
5977 | fail: | 6032 | fail: |
5978 | double_rq_unlock(rq_src, rq_dest); | 6033 | double_rq_unlock(rq_src, rq_dest); |
6034 | raw_spin_unlock(&p->pi_lock); | ||
5979 | return ret; | 6035 | return ret; |
5980 | } | 6036 | } |
5981 | 6037 | ||
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6316 | 6372 | ||
6317 | #ifdef CONFIG_HOTPLUG_CPU | 6373 | #ifdef CONFIG_HOTPLUG_CPU |
6318 | case CPU_DYING: | 6374 | case CPU_DYING: |
6375 | sched_ttwu_pending(); | ||
6319 | /* Update our root-domain */ | 6376 | /* Update our root-domain */ |
6320 | raw_spin_lock_irqsave(&rq->lock, flags); | 6377 | raw_spin_lock_irqsave(&rq->lock, flags); |
6321 | if (rq->rd) { | 6378 | if (rq->rd) { |
@@ -6331,6 +6388,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6331 | break; | 6388 | break; |
6332 | #endif | 6389 | #endif |
6333 | } | 6390 | } |
6391 | |||
6392 | update_max_interval(); | ||
6393 | |||
6334 | return NOTIFY_OK; | 6394 | return NOTIFY_OK; |
6335 | } | 6395 | } |
6336 | 6396 | ||
@@ -6391,6 +6451,8 @@ early_initcall(migration_init); | |||
6391 | 6451 | ||
6392 | #ifdef CONFIG_SMP | 6452 | #ifdef CONFIG_SMP |
6393 | 6453 | ||
6454 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
6455 | |||
6394 | #ifdef CONFIG_SCHED_DEBUG | 6456 | #ifdef CONFIG_SCHED_DEBUG |
6395 | 6457 | ||
6396 | static __read_mostly int sched_domain_debug_enabled; | 6458 | static __read_mostly int sched_domain_debug_enabled; |
@@ -6486,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
6486 | 6548 | ||
6487 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6549 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
6488 | { | 6550 | { |
6489 | cpumask_var_t groupmask; | ||
6490 | int level = 0; | 6551 | int level = 0; |
6491 | 6552 | ||
6492 | if (!sched_domain_debug_enabled) | 6553 | if (!sched_domain_debug_enabled) |
@@ -6499,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
6499 | 6560 | ||
6500 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6561 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
6501 | 6562 | ||
6502 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
6503 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
6504 | return; | ||
6505 | } | ||
6506 | |||
6507 | for (;;) { | 6563 | for (;;) { |
6508 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6564 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
6509 | break; | 6565 | break; |
6510 | level++; | 6566 | level++; |
6511 | sd = sd->parent; | 6567 | sd = sd->parent; |
6512 | if (!sd) | 6568 | if (!sd) |
6513 | break; | 6569 | break; |
6514 | } | 6570 | } |
6515 | free_cpumask_var(groupmask); | ||
6516 | } | 6571 | } |
6517 | #else /* !CONFIG_SCHED_DEBUG */ | 6572 | #else /* !CONFIG_SCHED_DEBUG */ |
6518 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6573 | # define sched_domain_debug(sd, cpu) do { } while (0) |
@@ -6569,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
6569 | return 1; | 6624 | return 1; |
6570 | } | 6625 | } |
6571 | 6626 | ||
6572 | static void free_rootdomain(struct root_domain *rd) | 6627 | static void free_rootdomain(struct rcu_head *rcu) |
6573 | { | 6628 | { |
6574 | synchronize_sched(); | 6629 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
6575 | 6630 | ||
6576 | cpupri_cleanup(&rd->cpupri); | 6631 | cpupri_cleanup(&rd->cpupri); |
6577 | |||
6578 | free_cpumask_var(rd->rto_mask); | 6632 | free_cpumask_var(rd->rto_mask); |
6579 | free_cpumask_var(rd->online); | 6633 | free_cpumask_var(rd->online); |
6580 | free_cpumask_var(rd->span); | 6634 | free_cpumask_var(rd->span); |
@@ -6615,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
6615 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6669 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6616 | 6670 | ||
6617 | if (old_rd) | 6671 | if (old_rd) |
6618 | free_rootdomain(old_rd); | 6672 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
6619 | } | 6673 | } |
6620 | 6674 | ||
6621 | static int init_rootdomain(struct root_domain *rd) | 6675 | static int init_rootdomain(struct root_domain *rd) |
@@ -6666,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void) | |||
6666 | return rd; | 6720 | return rd; |
6667 | } | 6721 | } |
6668 | 6722 | ||
6723 | static void free_sched_domain(struct rcu_head *rcu) | ||
6724 | { | ||
6725 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
6726 | if (atomic_dec_and_test(&sd->groups->ref)) | ||
6727 | kfree(sd->groups); | ||
6728 | kfree(sd); | ||
6729 | } | ||
6730 | |||
6731 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
6732 | { | ||
6733 | call_rcu(&sd->rcu, free_sched_domain); | ||
6734 | } | ||
6735 | |||
6736 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
6737 | { | ||
6738 | for (; sd; sd = sd->parent) | ||
6739 | destroy_sched_domain(sd, cpu); | ||
6740 | } | ||
6741 | |||
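Root domains and sched domains are now reclaimed through RCU callbacks instead of a blocking synchronize_sched() on the detach path: the object embeds a struct rcu_head, the callback recovers the enclosing object with container_of(), and readers hold the appropriate RCU read side while traversing. The idiom in isolation, on a hypothetical struct foo:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		/* ... payload ... */
		struct rcu_head rcu;		/* embedded callback head */
	};

	static void foo_free_rcu(struct rcu_head *rcu)
	{
		struct foo *f = container_of(rcu, struct foo, rcu);

		kfree(f);			/* runs after a grace period */
	}

	static void foo_retire(struct foo *f)
	{
		/* Replaces a blocking synchronize_*() on the update path. */
		call_rcu(&f->rcu, foo_free_rcu);
	}

The root domain uses call_rcu_sched() rather than call_rcu(), presumably because its readers run in preempt-disabled scheduler context rather than under rcu_read_lock().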
6669 | /* | 6742 | /* |
6670 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6743 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
6671 | * hold the hotplug lock. | 6744 | * hold the hotplug lock. |
@@ -6676,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6676 | struct rq *rq = cpu_rq(cpu); | 6749 | struct rq *rq = cpu_rq(cpu); |
6677 | struct sched_domain *tmp; | 6750 | struct sched_domain *tmp; |
6678 | 6751 | ||
6679 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
6680 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
6681 | |||
6682 | /* Remove the sched domains which do not contribute to scheduling. */ | 6752 | /* Remove the sched domains which do not contribute to scheduling. */ |
6683 | for (tmp = sd; tmp; ) { | 6753 | for (tmp = sd; tmp; ) { |
6684 | struct sched_domain *parent = tmp->parent; | 6754 | struct sched_domain *parent = tmp->parent; |
@@ -6689,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6689 | tmp->parent = parent->parent; | 6759 | tmp->parent = parent->parent; |
6690 | if (parent->parent) | 6760 | if (parent->parent) |
6691 | parent->parent->child = tmp; | 6761 | parent->parent->child = tmp; |
6762 | destroy_sched_domain(parent, cpu); | ||
6692 | } else | 6763 | } else |
6693 | tmp = tmp->parent; | 6764 | tmp = tmp->parent; |
6694 | } | 6765 | } |
6695 | 6766 | ||
6696 | if (sd && sd_degenerate(sd)) { | 6767 | if (sd && sd_degenerate(sd)) { |
6768 | tmp = sd; | ||
6697 | sd = sd->parent; | 6769 | sd = sd->parent; |
6770 | destroy_sched_domain(tmp, cpu); | ||
6698 | if (sd) | 6771 | if (sd) |
6699 | sd->child = NULL; | 6772 | sd->child = NULL; |
6700 | } | 6773 | } |
@@ -6702,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
6702 | sched_domain_debug(sd, cpu); | 6775 | sched_domain_debug(sd, cpu); |
6703 | 6776 | ||
6704 | rq_attach_root(rq, rd); | 6777 | rq_attach_root(rq, rd); |
6778 | tmp = rq->sd; | ||
6705 | rcu_assign_pointer(rq->sd, sd); | 6779 | rcu_assign_pointer(rq->sd, sd); |
6780 | destroy_sched_domains(tmp, cpu); | ||
6706 | } | 6781 | } |
6707 | 6782 | ||
6708 | /* cpus with isolated domains */ | 6783 | /* cpus with isolated domains */ |
@@ -6718,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str) | |||
6718 | 6793 | ||
6719 | __setup("isolcpus=", isolated_cpu_setup); | 6794 | __setup("isolcpus=", isolated_cpu_setup); |
6720 | 6795 | ||
6721 | /* | ||
6722 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
6723 | * to a function which identifies what group(along with sched group) a CPU | ||
6724 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
6725 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6726 | * | ||
6727 | * init_sched_build_groups will build a circular linked list of the groups | ||
6728 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6729 | * and ->cpu_power to 0. | ||
6730 | */ | ||
6731 | static void | ||
6732 | init_sched_build_groups(const struct cpumask *span, | ||
6733 | const struct cpumask *cpu_map, | ||
6734 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
6735 | struct sched_group **sg, | ||
6736 | struct cpumask *tmpmask), | ||
6737 | struct cpumask *covered, struct cpumask *tmpmask) | ||
6738 | { | ||
6739 | struct sched_group *first = NULL, *last = NULL; | ||
6740 | int i; | ||
6741 | |||
6742 | cpumask_clear(covered); | ||
6743 | |||
6744 | for_each_cpu(i, span) { | ||
6745 | struct sched_group *sg; | ||
6746 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
6747 | int j; | ||
6748 | |||
6749 | if (cpumask_test_cpu(i, covered)) | ||
6750 | continue; | ||
6751 | |||
6752 | cpumask_clear(sched_group_cpus(sg)); | ||
6753 | sg->cpu_power = 0; | ||
6754 | |||
6755 | for_each_cpu(j, span) { | ||
6756 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
6757 | continue; | ||
6758 | |||
6759 | cpumask_set_cpu(j, covered); | ||
6760 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6761 | } | ||
6762 | if (!first) | ||
6763 | first = sg; | ||
6764 | if (last) | ||
6765 | last->next = sg; | ||
6766 | last = sg; | ||
6767 | } | ||
6768 | last->next = first; | ||
6769 | } | ||
6770 | |||
6771 | #define SD_NODES_PER_DOMAIN 16 | 6796 | #define SD_NODES_PER_DOMAIN 16 |
6772 | 6797 | ||
6773 | #ifdef CONFIG_NUMA | 6798 | #ifdef CONFIG_NUMA |
@@ -6784,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
6784 | */ | 6809 | */ |
6785 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 6810 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
6786 | { | 6811 | { |
6787 | int i, n, val, min_val, best_node = 0; | 6812 | int i, n, val, min_val, best_node = -1; |
6788 | 6813 | ||
6789 | min_val = INT_MAX; | 6814 | min_val = INT_MAX; |
6790 | 6815 | ||
@@ -6808,7 +6833,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
6808 | } | 6833 | } |
6809 | } | 6834 | } |
6810 | 6835 | ||
6811 | node_set(best_node, *used_nodes); | 6836 | if (best_node != -1) |
6837 | node_set(best_node, *used_nodes); | ||
6812 | return best_node; | 6838 | return best_node; |
6813 | } | 6839 | } |
6814 | 6840 | ||
@@ -6834,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
6834 | 6860 | ||
6835 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6861 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6836 | int next_node = find_next_best_node(node, &used_nodes); | 6862 | int next_node = find_next_best_node(node, &used_nodes); |
6837 | 6863 | if (next_node < 0) | |
6864 | break; | ||
6838 | cpumask_or(span, span, cpumask_of_node(next_node)); | 6865 | cpumask_or(span, span, cpumask_of_node(next_node)); |
6839 | } | 6866 | } |
6840 | } | 6867 | } |
6868 | |||
6869 | static const struct cpumask *cpu_node_mask(int cpu) | ||
6870 | { | ||
6871 | lockdep_assert_held(&sched_domains_mutex); | ||
6872 | |||
6873 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
6874 | |||
6875 | return sched_domains_tmpmask; | ||
6876 | } | ||
6877 | |||
6878 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
6879 | { | ||
6880 | return cpu_possible_mask; | ||
6881 | } | ||
6841 | #endif /* CONFIG_NUMA */ | 6882 | #endif /* CONFIG_NUMA */ |
6842 | 6883 | ||
6843 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6884 | static const struct cpumask *cpu_cpu_mask(int cpu) |
6885 | { | ||
6886 | return cpumask_of_node(cpu_to_node(cpu)); | ||
6887 | } | ||
6844 | 6888 | ||
6845 | /* | 6889 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6846 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
6847 | * | ||
6848 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
6849 | * and struct sched_domain. ) | ||
6850 | */ | ||
6851 | struct static_sched_group { | ||
6852 | struct sched_group sg; | ||
6853 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
6854 | }; | ||
6855 | 6890 | ||
6856 | struct static_sched_domain { | 6891 | struct sd_data { |
6857 | struct sched_domain sd; | 6892 | struct sched_domain **__percpu sd; |
6858 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 6893 | struct sched_group **__percpu sg; |
6859 | }; | 6894 | }; |
6860 | 6895 | ||
6861 | struct s_data { | 6896 | struct s_data { |
6862 | #ifdef CONFIG_NUMA | 6897 | struct sched_domain ** __percpu sd; |
6863 | int sd_allnodes; | ||
6864 | cpumask_var_t domainspan; | ||
6865 | cpumask_var_t covered; | ||
6866 | cpumask_var_t notcovered; | ||
6867 | #endif | ||
6868 | cpumask_var_t nodemask; | ||
6869 | cpumask_var_t this_sibling_map; | ||
6870 | cpumask_var_t this_core_map; | ||
6871 | cpumask_var_t this_book_map; | ||
6872 | cpumask_var_t send_covered; | ||
6873 | cpumask_var_t tmpmask; | ||
6874 | struct sched_group **sched_group_nodes; | ||
6875 | struct root_domain *rd; | 6898 | struct root_domain *rd; |
6876 | }; | 6899 | }; |
6877 | 6900 | ||
6878 | enum s_alloc { | 6901 | enum s_alloc { |
6879 | sa_sched_groups = 0, | ||
6880 | sa_rootdomain, | 6902 | sa_rootdomain, |
6881 | sa_tmpmask, | 6903 | sa_sd, |
6882 | sa_send_covered, | 6904 | sa_sd_storage, |
6883 | sa_this_book_map, | ||
6884 | sa_this_core_map, | ||
6885 | sa_this_sibling_map, | ||
6886 | sa_nodemask, | ||
6887 | sa_sched_group_nodes, | ||
6888 | #ifdef CONFIG_NUMA | ||
6889 | sa_notcovered, | ||
6890 | sa_covered, | ||
6891 | sa_domainspan, | ||
6892 | #endif | ||
6893 | sa_none, | 6905 | sa_none, |
6894 | }; | 6906 | }; |
6895 | 6907 | ||
6896 | /* | 6908 | struct sched_domain_topology_level; |
6897 | * SMT sched-domains: | ||
6898 | */ | ||
6899 | #ifdef CONFIG_SCHED_SMT | ||
6900 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
6901 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
6902 | 6909 | ||
6903 | static int | 6910 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
6904 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 6911 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
6905 | struct sched_group **sg, struct cpumask *unused) | ||
6906 | { | ||
6907 | if (sg) | ||
6908 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
6909 | return cpu; | ||
6910 | } | ||
6911 | #endif /* CONFIG_SCHED_SMT */ | ||
6912 | 6912 | ||
6913 | /* | 6913 | struct sched_domain_topology_level { |
6914 | * multi-core sched-domains: | 6914 | sched_domain_init_f init; |
6915 | */ | 6915 | sched_domain_mask_f mask; |
6916 | #ifdef CONFIG_SCHED_MC | 6916 | struct sd_data data; |
6917 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6917 | }; |
6918 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | ||
6919 | |||
6920 | static int | ||
6921 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | ||
6922 | struct sched_group **sg, struct cpumask *mask) | ||
6923 | { | ||
6924 | int group; | ||
6925 | #ifdef CONFIG_SCHED_SMT | ||
6926 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6927 | group = cpumask_first(mask); | ||
6928 | #else | ||
6929 | group = cpu; | ||
6930 | #endif | ||
6931 | if (sg) | ||
6932 | *sg = &per_cpu(sched_group_core, group).sg; | ||
6933 | return group; | ||
6934 | } | ||
6935 | #endif /* CONFIG_SCHED_MC */ | ||
6936 | 6918 | ||
6937 | /* | 6919 | /* |
6938 | * book sched-domains: | 6920 | * Assumes the sched_domain tree is fully constructed |
6939 | */ | 6921 | */ |
6940 | #ifdef CONFIG_SCHED_BOOK | 6922 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
6941 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6942 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6943 | |||
6944 | static int | ||
6945 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, | ||
6946 | struct sched_group **sg, struct cpumask *mask) | ||
6947 | { | 6923 | { |
6948 | int group = cpu; | 6924 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
6949 | #ifdef CONFIG_SCHED_MC | 6925 | struct sched_domain *child = sd->child; |
6950 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6951 | group = cpumask_first(mask); | ||
6952 | #elif defined(CONFIG_SCHED_SMT) | ||
6953 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6954 | group = cpumask_first(mask); | ||
6955 | #endif | ||
6956 | if (sg) | ||
6957 | *sg = &per_cpu(sched_group_book, group).sg; | ||
6958 | return group; | ||
6959 | } | ||
6960 | #endif /* CONFIG_SCHED_BOOK */ | ||
6961 | 6926 | ||
6962 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6927 | if (child) |
6963 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6928 | cpu = cpumask_first(sched_domain_span(child)); |
6964 | 6929 | ||
6965 | static int | ||
6966 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | ||
6967 | struct sched_group **sg, struct cpumask *mask) | ||
6968 | { | ||
6969 | int group; | ||
6970 | #ifdef CONFIG_SCHED_BOOK | ||
6971 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6972 | group = cpumask_first(mask); | ||
6973 | #elif defined(CONFIG_SCHED_MC) | ||
6974 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6975 | group = cpumask_first(mask); | ||
6976 | #elif defined(CONFIG_SCHED_SMT) | ||
6977 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6978 | group = cpumask_first(mask); | ||
6979 | #else | ||
6980 | group = cpu; | ||
6981 | #endif | ||
6982 | if (sg) | 6930 | if (sg) |
6983 | *sg = &per_cpu(sched_group_phys, group).sg; | 6931 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
6984 | return group; | 6932 | |
6933 | return cpu; | ||
6985 | } | 6934 | } |
6986 | 6935 | ||
6987 | #ifdef CONFIG_NUMA | ||
6988 | /* | 6936 | /* |
6989 | * The init_sched_build_groups can't handle what we want to do with node | 6937 | * build_sched_groups takes the cpumask we wish to span, and a pointer |
6990 | * groups, so roll our own. Now each node has its own list of groups which | 6938 | * to a function which identifies what group(along with sched group) a CPU |
6991 | * gets dynamically allocated. | 6939 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids |
6940 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
6941 | * | ||
6942 | * build_sched_groups will build a circular linked list of the groups | ||
6943 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6944 | * and ->cpu_power to 0. | ||
6992 | */ | 6945 | */ |
6993 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | 6946 | static void |
6994 | static struct sched_group ***sched_group_nodes_bycpu; | 6947 | build_sched_groups(struct sched_domain *sd) |
6995 | |||
6996 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | ||
6997 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | ||
6998 | |||
6999 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | ||
7000 | struct sched_group **sg, | ||
7001 | struct cpumask *nodemask) | ||
7002 | { | ||
7003 | int group; | ||
7004 | |||
7005 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | ||
7006 | group = cpumask_first(nodemask); | ||
7007 | |||
7008 | if (sg) | ||
7009 | *sg = &per_cpu(sched_group_allnodes, group).sg; | ||
7010 | return group; | ||
7011 | } | ||
7012 | |||
7013 | static void init_numa_sched_groups_power(struct sched_group *group_head) | ||
7014 | { | ||
7015 | struct sched_group *sg = group_head; | ||
7016 | int j; | ||
7017 | |||
7018 | if (!sg) | ||
7019 | return; | ||
7020 | do { | ||
7021 | for_each_cpu(j, sched_group_cpus(sg)) { | ||
7022 | struct sched_domain *sd; | ||
7023 | |||
7024 | sd = &per_cpu(phys_domains, j).sd; | ||
7025 | if (j != group_first_cpu(sd->groups)) { | ||
7026 | /* | ||
7027 | * Only add "power" once for each | ||
7028 | * physical package. | ||
7029 | */ | ||
7030 | continue; | ||
7031 | } | ||
7032 | |||
7033 | sg->cpu_power += sd->groups->cpu_power; | ||
7034 | } | ||
7035 | sg = sg->next; | ||
7036 | } while (sg != group_head); | ||
7037 | } | ||
7038 | |||
7039 | static int build_numa_sched_groups(struct s_data *d, | ||
7040 | const struct cpumask *cpu_map, int num) | ||
7041 | { | 6948 | { |
7042 | struct sched_domain *sd; | 6949 | struct sched_group *first = NULL, *last = NULL; |
7043 | struct sched_group *sg, *prev; | 6950 | struct sd_data *sdd = sd->private; |
7044 | int n, j; | 6951 | const struct cpumask *span = sched_domain_span(sd); |
7045 | 6952 | struct cpumask *covered; | |
7046 | cpumask_clear(d->covered); | 6953 | int i; |
7047 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
7048 | if (cpumask_empty(d->nodemask)) { | ||
7049 | d->sched_group_nodes[num] = NULL; | ||
7050 | goto out; | ||
7051 | } | ||
7052 | |||
7053 | sched_domain_node_span(num, d->domainspan); | ||
7054 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
7055 | |||
7056 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7057 | GFP_KERNEL, num); | ||
7058 | if (!sg) { | ||
7059 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
7060 | num); | ||
7061 | return -ENOMEM; | ||
7062 | } | ||
7063 | d->sched_group_nodes[num] = sg; | ||
7064 | |||
7065 | for_each_cpu(j, d->nodemask) { | ||
7066 | sd = &per_cpu(node_domains, j).sd; | ||
7067 | sd->groups = sg; | ||
7068 | } | ||
7069 | 6954 | ||
7070 | sg->cpu_power = 0; | 6955 | lockdep_assert_held(&sched_domains_mutex); |
7071 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 6956 | covered = sched_domains_tmpmask; |
7072 | sg->next = sg; | ||
7073 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
7074 | 6957 | ||
7075 | prev = sg; | 6958 | cpumask_clear(covered); |
7076 | for (j = 0; j < nr_node_ids; j++) { | ||
7077 | n = (num + j) % nr_node_ids; | ||
7078 | cpumask_complement(d->notcovered, d->covered); | ||
7079 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
7080 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
7081 | if (cpumask_empty(d->tmpmask)) | ||
7082 | break; | ||
7083 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
7084 | if (cpumask_empty(d->tmpmask)) | ||
7085 | continue; | ||
7086 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7087 | GFP_KERNEL, num); | ||
7088 | if (!sg) { | ||
7089 | printk(KERN_WARNING | ||
7090 | "Can not alloc domain group for node %d\n", j); | ||
7091 | return -ENOMEM; | ||
7092 | } | ||
7093 | sg->cpu_power = 0; | ||
7094 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
7095 | sg->next = prev->next; | ||
7096 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
7097 | prev->next = sg; | ||
7098 | prev = sg; | ||
7099 | } | ||
7100 | out: | ||
7101 | return 0; | ||
7102 | } | ||
7103 | #endif /* CONFIG_NUMA */ | ||
7104 | |||
7105 | #ifdef CONFIG_NUMA | ||
7106 | /* Free memory allocated for various sched_group structures */ | ||
7107 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7108 | struct cpumask *nodemask) | ||
7109 | { | ||
7110 | int cpu, i; | ||
7111 | 6959 | ||
7112 | for_each_cpu(cpu, cpu_map) { | 6960 | for_each_cpu(i, span) { |
7113 | struct sched_group **sched_group_nodes | 6961 | struct sched_group *sg; |
7114 | = sched_group_nodes_bycpu[cpu]; | 6962 | int group = get_group(i, sdd, &sg); |
6963 | int j; | ||
7115 | 6964 | ||
7116 | if (!sched_group_nodes) | 6965 | if (cpumask_test_cpu(i, covered)) |
7117 | continue; | 6966 | continue; |
7118 | 6967 | ||
7119 | for (i = 0; i < nr_node_ids; i++) { | 6968 | cpumask_clear(sched_group_cpus(sg)); |
7120 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 6969 | sg->cpu_power = 0; |
7121 | 6970 | ||
7122 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 6971 | for_each_cpu(j, span) { |
7123 | if (cpumask_empty(nodemask)) | 6972 | if (get_group(j, sdd, NULL) != group) |
7124 | continue; | 6973 | continue; |
7125 | 6974 | ||
7126 | if (sg == NULL) | 6975 | cpumask_set_cpu(j, covered); |
7127 | continue; | 6976 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
7128 | sg = sg->next; | ||
7129 | next_sg: | ||
7130 | oldsg = sg; | ||
7131 | sg = sg->next; | ||
7132 | kfree(oldsg); | ||
7133 | if (oldsg != sched_group_nodes[i]) | ||
7134 | goto next_sg; | ||
7135 | } | 6977 | } |
7136 | kfree(sched_group_nodes); | 6978 | |
7137 | sched_group_nodes_bycpu[cpu] = NULL; | 6979 | if (!first) |
6980 | first = sg; | ||
6981 | if (last) | ||
6982 | last->next = sg; | ||
6983 | last = sg; | ||
7138 | } | 6984 | } |
6985 | last->next = first; | ||
7139 | } | 6986 | } |
7140 | #else /* !CONFIG_NUMA */ | ||
7141 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
7142 | struct cpumask *nodemask) | ||
7143 | { | ||
7144 | } | ||
7145 | #endif /* CONFIG_NUMA */ | ||
7146 | 6987 | ||
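The replacement code above is organized around topology levels: each level carries an init function (generated by SD_INIT_FUNC further down), a cpumask function such as cpu_cpu_mask() or cpu_node_mask(), and per-cpu sd/sg storage in its sd_data. An illustrative sketch of how such a table might be declared and walked for one CPU; the level list and the loop are examples of the pattern, not the in-tree default topology definition:

	/* Illustrative only; the real table and walk live elsewhere in sched.c. */
	static struct sched_domain_topology_level example_topology[] = {
	#ifdef CONFIG_SCHED_MC
		{ sd_init_MC,  cpu_coregroup_mask, },
	#endif
		{ sd_init_CPU, cpu_cpu_mask, },
		{ NULL, },
	};

	static void build_levels_for_cpu(struct sched_domain_topology_level *tl,
					 int cpu, const struct cpumask *cpu_map)
	{
		struct sched_domain *child = NULL;

		for (; tl->init; tl++) {
			/* Initializes the pre-allocated per-cpu sched_domain. */
			struct sched_domain *sd = tl->init(tl, cpu);

			cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
			sd->child = child;
			if (child)
				child->parent = sd;
			child = sd;		/* next level wraps this one */
		}
	}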
7147 | /* | 6988 | /* |
7148 | * Initialize sched groups cpu_power. | 6989 | * Initialize sched groups cpu_power. |
@@ -7156,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
7156 | */ | 6997 | */ |
7157 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 6998 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
7158 | { | 6999 | { |
7159 | struct sched_domain *child; | ||
7160 | struct sched_group *group; | ||
7161 | long power; | ||
7162 | int weight; | ||
7163 | |||
7164 | WARN_ON(!sd || !sd->groups); | 7000 | WARN_ON(!sd || !sd->groups); |
7165 | 7001 | ||
7166 | if (cpu != group_first_cpu(sd->groups)) | 7002 | if (cpu != group_first_cpu(sd->groups)) |
@@ -7168,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7168 | 7004 | ||
7169 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7005 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); |
7170 | 7006 | ||
7171 | child = sd->child; | 7007 | update_group_power(sd, cpu); |
7172 | |||
7173 | sd->groups->cpu_power = 0; | ||
7174 | |||
7175 | if (!child) { | ||
7176 | power = SCHED_LOAD_SCALE; | ||
7177 | weight = cpumask_weight(sched_domain_span(sd)); | ||
7178 | /* | ||
7179 | * SMT siblings share the power of a single core. | ||
7180 | * Usually multiple threads get a better yield out of | ||
7181 | * that one core than a single thread would have, | ||
7182 | * reflect that in sd->smt_gain. | ||
7183 | */ | ||
7184 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
7185 | power *= sd->smt_gain; | ||
7186 | power /= weight; | ||
7187 | power >>= SCHED_LOAD_SHIFT; | ||
7188 | } | ||
7189 | sd->groups->cpu_power += power; | ||
7190 | return; | ||
7191 | } | ||
7192 | |||
7193 | /* | ||
7194 | * Add cpu_power of each child group to this groups cpu_power. | ||
7195 | */ | ||
7196 | group = child->groups; | ||
7197 | do { | ||
7198 | sd->groups->cpu_power += group->cpu_power; | ||
7199 | group = group->next; | ||
7200 | } while (group != child->groups); | ||
7201 | } | 7008 | } |
7202 | 7009 | ||
7203 | /* | 7010 | /* |
@@ -7211,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
7211 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7018 | # define SD_INIT_NAME(sd, type) do { } while (0) |
7212 | #endif | 7019 | #endif |
7213 | 7020 | ||
7214 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7021 | #define SD_INIT_FUNC(type) \ |
7215 | 7022 | static noinline struct sched_domain * \ | |
7216 | #define SD_INIT_FUNC(type) \ | 7023 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
7217 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7024 | { \ |
7218 | { \ | 7025 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
7219 | memset(sd, 0, sizeof(*sd)); \ | 7026 | *sd = SD_##type##_INIT; \ |
7220 | *sd = SD_##type##_INIT; \ | 7027 | SD_INIT_NAME(sd, type); \ |
7221 | sd->level = SD_LV_##type; \ | 7028 | sd->private = &tl->data; \ |
7222 | SD_INIT_NAME(sd, type); \ | 7029 | return sd; \ |
7223 | } | 7030 | } |
7224 | 7031 | ||
7225 | SD_INIT_FUNC(CPU) | 7032 | SD_INIT_FUNC(CPU) |
@@ -7238,13 +7045,14 @@ SD_INIT_FUNC(CPU) | |||
7238 | #endif | 7045 | #endif |
7239 | 7046 | ||
7240 | static int default_relax_domain_level = -1; | 7047 | static int default_relax_domain_level = -1; |
7048 | int sched_domain_level_max; | ||
7241 | 7049 | ||
7242 | static int __init setup_relax_domain_level(char *str) | 7050 | static int __init setup_relax_domain_level(char *str) |
7243 | { | 7051 | { |
7244 | unsigned long val; | 7052 | unsigned long val; |
7245 | 7053 | ||
7246 | val = simple_strtoul(str, NULL, 0); | 7054 | val = simple_strtoul(str, NULL, 0); |
7247 | if (val < SD_LV_MAX) | 7055 | if (val < sched_domain_level_max) |
7248 | default_relax_domain_level = val; | 7056 | default_relax_domain_level = val; |
7249 | 7057 | ||
7250 | return 1; | 7058 | return 1; |
@@ -7272,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
7272 | } | 7080 | } |
7273 | } | 7081 | } |
7274 | 7082 | ||
7083 | static void __sdt_free(const struct cpumask *cpu_map); | ||
7084 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
7085 | |||
7275 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7086 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
7276 | const struct cpumask *cpu_map) | 7087 | const struct cpumask *cpu_map) |
7277 | { | 7088 | { |
7278 | switch (what) { | 7089 | switch (what) { |
7279 | case sa_sched_groups: | ||
7280 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
7281 | d->sched_group_nodes = NULL; | ||
7282 | case sa_rootdomain: | 7090 | case sa_rootdomain: |
7283 | free_rootdomain(d->rd); /* fall through */ | 7091 | if (!atomic_read(&d->rd->refcount)) |
7284 | case sa_tmpmask: | 7092 | free_rootdomain(&d->rd->rcu); /* fall through */ |
7285 | free_cpumask_var(d->tmpmask); /* fall through */ | 7093 | case sa_sd: |
7286 | case sa_send_covered: | 7094 | free_percpu(d->sd); /* fall through */ |
7287 | free_cpumask_var(d->send_covered); /* fall through */ | 7095 | case sa_sd_storage: |
7288 | case sa_this_book_map: | 7096 | __sdt_free(cpu_map); /* fall through */ |
7289 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
7290 | case sa_this_core_map: | ||
7291 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
7292 | case sa_this_sibling_map: | ||
7293 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
7294 | case sa_nodemask: | ||
7295 | free_cpumask_var(d->nodemask); /* fall through */ | ||
7296 | case sa_sched_group_nodes: | ||
7297 | #ifdef CONFIG_NUMA | ||
7298 | kfree(d->sched_group_nodes); /* fall through */ | ||
7299 | case sa_notcovered: | ||
7300 | free_cpumask_var(d->notcovered); /* fall through */ | ||
7301 | case sa_covered: | ||
7302 | free_cpumask_var(d->covered); /* fall through */ | ||
7303 | case sa_domainspan: | ||
7304 | free_cpumask_var(d->domainspan); /* fall through */ | ||
7305 | #endif | ||
7306 | case sa_none: | 7097 | case sa_none: |
7307 | break; | 7098 | break; |
7308 | } | 7099 | } |
@@ -7311,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
7311 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7102 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
7312 | const struct cpumask *cpu_map) | 7103 | const struct cpumask *cpu_map) |
7313 | { | 7104 | { |
7314 | #ifdef CONFIG_NUMA | 7105 | memset(d, 0, sizeof(*d)); |
7315 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7106 | |
7316 | return sa_none; | 7107 | if (__sdt_alloc(cpu_map)) |
7317 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7108 | return sa_sd_storage; |
7318 | return sa_domainspan; | 7109 | d->sd = alloc_percpu(struct sched_domain *); |
7319 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7110 | if (!d->sd) |
7320 | return sa_covered; | 7111 | return sa_sd_storage; |
7321 | /* Allocate the per-node list of sched groups */ | ||
7322 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
7323 | sizeof(struct sched_group *), GFP_KERNEL); | ||
7324 | if (!d->sched_group_nodes) { | ||
7325 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
7326 | return sa_notcovered; | ||
7327 | } | ||
7328 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
7329 | #endif | ||
7330 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
7331 | return sa_sched_group_nodes; | ||
7332 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
7333 | return sa_nodemask; | ||
7334 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
7335 | return sa_this_sibling_map; | ||
7336 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) | ||
7337 | return sa_this_core_map; | ||
7338 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7339 | return sa_this_book_map; | ||
7340 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
7341 | return sa_send_covered; | ||
7342 | d->rd = alloc_rootdomain(); | 7112 | d->rd = alloc_rootdomain(); |
7343 | if (!d->rd) { | 7113 | if (!d->rd) |
7344 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7114 | return sa_sd; |
7345 | return sa_tmpmask; | ||
7346 | } | ||
7347 | return sa_rootdomain; | 7115 | return sa_rootdomain; |
7348 | } | 7116 | } |
7349 | 7117 | ||
7350 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7118 | /* |
7351 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7119 | * NULL the sd_data elements we've used to build the sched_domain and |
7120 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
7121 | * will not free the data we're using. | ||
7122 | */ | ||
7123 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
7352 | { | 7124 | { |
7353 | struct sched_domain *sd = NULL; | 7125 | struct sd_data *sdd = sd->private; |
7354 | #ifdef CONFIG_NUMA | 7126 | struct sched_group *sg = sd->groups; |
7355 | struct sched_domain *parent; | ||
7356 | |||
7357 | d->sd_allnodes = 0; | ||
7358 | if (cpumask_weight(cpu_map) > | ||
7359 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
7360 | sd = &per_cpu(allnodes_domains, i).sd; | ||
7361 | SD_INIT(sd, ALLNODES); | ||
7362 | set_domain_attribute(sd, attr); | ||
7363 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
7364 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7365 | d->sd_allnodes = 1; | ||
7366 | } | ||
7367 | parent = sd; | ||
7368 | |||
7369 | sd = &per_cpu(node_domains, i).sd; | ||
7370 | SD_INIT(sd, NODE); | ||
7371 | set_domain_attribute(sd, attr); | ||
7372 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
7373 | sd->parent = parent; | ||
7374 | if (parent) | ||
7375 | parent->child = sd; | ||
7376 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
7377 | #endif | ||
7378 | return sd; | ||
7379 | } | ||
7380 | 7127 | ||
7381 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7128 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
7382 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7129 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
7383 | struct sched_domain *parent, int i) | ||
7384 | { | ||
7385 | struct sched_domain *sd; | ||
7386 | sd = &per_cpu(phys_domains, i).sd; | ||
7387 | SD_INIT(sd, CPU); | ||
7388 | set_domain_attribute(sd, attr); | ||
7389 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
7390 | sd->parent = parent; | ||
7391 | if (parent) | ||
7392 | parent->child = sd; | ||
7393 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7394 | return sd; | ||
7395 | } | ||
7396 | 7130 | ||
7397 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | 7131 | if (cpu == cpumask_first(sched_group_cpus(sg))) { |
7398 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7132 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg); |
7399 | struct sched_domain *parent, int i) | 7133 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
7400 | { | 7134 | } |
7401 | struct sched_domain *sd = parent; | ||
7402 | #ifdef CONFIG_SCHED_BOOK | ||
7403 | sd = &per_cpu(book_domains, i).sd; | ||
7404 | SD_INIT(sd, BOOK); | ||
7405 | set_domain_attribute(sd, attr); | ||
7406 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7407 | sd->parent = parent; | ||
7408 | parent->child = sd; | ||
7409 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7410 | #endif | ||
7411 | return sd; | ||
7412 | } | 7135 | } |
7413 | 7136 | ||
7414 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7137 | #ifdef CONFIG_SCHED_SMT |
7415 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7138 | static const struct cpumask *cpu_smt_mask(int cpu) |
7416 | struct sched_domain *parent, int i) | ||
7417 | { | 7139 | { |
7418 | struct sched_domain *sd = parent; | 7140 | return topology_thread_cpumask(cpu); |
7419 | #ifdef CONFIG_SCHED_MC | ||
7420 | sd = &per_cpu(core_domains, i).sd; | ||
7421 | SD_INIT(sd, MC); | ||
7422 | set_domain_attribute(sd, attr); | ||
7423 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
7424 | sd->parent = parent; | ||
7425 | parent->child = sd; | ||
7426 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7427 | #endif | ||
7428 | return sd; | ||
7429 | } | 7141 | } |
7430 | |||
7431 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
7432 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7433 | struct sched_domain *parent, int i) | ||
7434 | { | ||
7435 | struct sched_domain *sd = parent; | ||
7436 | #ifdef CONFIG_SCHED_SMT | ||
7437 | sd = &per_cpu(cpu_domains, i).sd; | ||
7438 | SD_INIT(sd, SIBLING); | ||
7439 | set_domain_attribute(sd, attr); | ||
7440 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
7441 | sd->parent = parent; | ||
7442 | parent->child = sd; | ||
7443 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7444 | #endif | 7142 | #endif |
7445 | return sd; | ||
7446 | } | ||
7447 | 7143 | ||
7448 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7144 | /* |
7449 | const struct cpumask *cpu_map, int cpu) | 7145 | * Topology list, bottom-up. |
7450 | { | 7146 | */ |
7451 | switch (l) { | 7147 | static struct sched_domain_topology_level default_topology[] = { |
7452 | #ifdef CONFIG_SCHED_SMT | 7148 | #ifdef CONFIG_SCHED_SMT |
7453 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7149 | { sd_init_SIBLING, cpu_smt_mask, }, |
7454 | cpumask_and(d->this_sibling_map, cpu_map, | ||
7455 | topology_thread_cpumask(cpu)); | ||
7456 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
7457 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
7458 | &cpu_to_cpu_group, | ||
7459 | d->send_covered, d->tmpmask); | ||
7460 | break; | ||
7461 | #endif | 7150 | #endif |
7462 | #ifdef CONFIG_SCHED_MC | 7151 | #ifdef CONFIG_SCHED_MC |
7463 | case SD_LV_MC: /* set up multi-core groups */ | 7152 | { sd_init_MC, cpu_coregroup_mask, }, |
7464 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
7465 | if (cpu == cpumask_first(d->this_core_map)) | ||
7466 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
7467 | &cpu_to_core_group, | ||
7468 | d->send_covered, d->tmpmask); | ||
7469 | break; | ||
7470 | #endif | 7153 | #endif |
7471 | #ifdef CONFIG_SCHED_BOOK | 7154 | #ifdef CONFIG_SCHED_BOOK |
7472 | case SD_LV_BOOK: /* set up book groups */ | 7155 | { sd_init_BOOK, cpu_book_mask, }, |
7473 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7474 | if (cpu == cpumask_first(d->this_book_map)) | ||
7475 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7476 | &cpu_to_book_group, | ||
7477 | d->send_covered, d->tmpmask); | ||
7478 | break; | ||
7479 | #endif | 7156 | #endif |
7480 | case SD_LV_CPU: /* set up physical groups */ | 7157 | { sd_init_CPU, cpu_cpu_mask, }, |
7481 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | ||
7482 | if (!cpumask_empty(d->nodemask)) | ||
7483 | init_sched_build_groups(d->nodemask, cpu_map, | ||
7484 | &cpu_to_phys_group, | ||
7485 | d->send_covered, d->tmpmask); | ||
7486 | break; | ||
7487 | #ifdef CONFIG_NUMA | 7158 | #ifdef CONFIG_NUMA |
7488 | case SD_LV_ALLNODES: | 7159 | { sd_init_NODE, cpu_node_mask, }, |
7489 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7160 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
7490 | d->send_covered, d->tmpmask); | ||
7491 | break; | ||
7492 | #endif | 7161 | #endif |
7493 | default: | 7162 | { NULL, }, |
7494 | break; | 7163 | }; |
7164 | |||
7165 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
7166 | |||
7167 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
7168 | { | ||
7169 | struct sched_domain_topology_level *tl; | ||
7170 | int j; | ||
7171 | |||
7172 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7173 | struct sd_data *sdd = &tl->data; | ||
7174 | |||
7175 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
7176 | if (!sdd->sd) | ||
7177 | return -ENOMEM; | ||
7178 | |||
7179 | sdd->sg = alloc_percpu(struct sched_group *); | ||
7180 | if (!sdd->sg) | ||
7181 | return -ENOMEM; | ||
7182 | |||
7183 | for_each_cpu(j, cpu_map) { | ||
7184 | struct sched_domain *sd; | ||
7185 | struct sched_group *sg; | ||
7186 | |||
7187 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
7188 | GFP_KERNEL, cpu_to_node(j)); | ||
7189 | if (!sd) | ||
7190 | return -ENOMEM; | ||
7191 | |||
7192 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
7193 | |||
7194 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
7195 | GFP_KERNEL, cpu_to_node(j)); | ||
7196 | if (!sg) | ||
7197 | return -ENOMEM; | ||
7198 | |||
7199 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
7200 | } | ||
7201 | } | ||
7202 | |||
7203 | return 0; | ||
7204 | } | ||
7205 | |||
7206 | static void __sdt_free(const struct cpumask *cpu_map) | ||
7207 | { | ||
7208 | struct sched_domain_topology_level *tl; | ||
7209 | int j; | ||
7210 | |||
7211 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
7212 | struct sd_data *sdd = &tl->data; | ||
7213 | |||
7214 | for_each_cpu(j, cpu_map) { | ||
7215 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
7216 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
7217 | } | ||
7218 | free_percpu(sdd->sd); | ||
7219 | free_percpu(sdd->sg); | ||
7495 | } | 7220 | } |
7496 | } | 7221 | } |
7497 | 7222 | ||
7223 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
7224 | struct s_data *d, const struct cpumask *cpu_map, | ||
7225 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
7226 | int cpu) | ||
7227 | { | ||
7228 | struct sched_domain *sd = tl->init(tl, cpu); | ||
7229 | if (!sd) | ||
7230 | return child; | ||
7231 | |||
7232 | set_domain_attribute(sd, attr); | ||
7233 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
7234 | if (child) { | ||
7235 | sd->level = child->level + 1; | ||
7236 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
7237 | child->parent = sd; | ||
7238 | } | ||
7239 | sd->child = child; | ||
7240 | |||
7241 | return sd; | ||
7242 | } | ||
7243 | |||
7498 | /* | 7244 | /* |
7499 | * Build sched domains for a given set of cpus and attach the sched domains | 7245 | * Build sched domains for a given set of cpus and attach the sched domains |
7500 | * to the individual cpus | 7246 | * to the individual cpus |
7501 | */ | 7247 | */ |
7502 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7248 | static int build_sched_domains(const struct cpumask *cpu_map, |
7503 | struct sched_domain_attr *attr) | 7249 | struct sched_domain_attr *attr) |
7504 | { | 7250 | { |
7505 | enum s_alloc alloc_state = sa_none; | 7251 | enum s_alloc alloc_state = sa_none; |
7506 | struct s_data d; | ||
7507 | struct sched_domain *sd; | 7252 | struct sched_domain *sd; |
7508 | int i; | 7253 | struct s_data d; |
7509 | #ifdef CONFIG_NUMA | 7254 | int i, ret = -ENOMEM; |
7510 | d.sd_allnodes = 0; | ||
7511 | #endif | ||
7512 | 7255 | ||
7513 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7256 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
7514 | if (alloc_state != sa_rootdomain) | 7257 | if (alloc_state != sa_rootdomain) |
7515 | goto error; | 7258 | goto error; |
7516 | alloc_state = sa_sched_groups; | ||
7517 | 7259 | ||
7518 | /* | 7260 | /* Set up domains for cpus specified by the cpu_map. */ |
7519 | * Set up domains for cpus specified by the cpu_map. | ||
7520 | */ | ||
7521 | for_each_cpu(i, cpu_map) { | 7261 | for_each_cpu(i, cpu_map) { |
7522 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | 7262 | struct sched_domain_topology_level *tl; |
7523 | cpu_map); | ||
7524 | 7263 | ||
7525 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7264 | sd = NULL; |
7526 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7265 | for (tl = sched_domain_topology; tl->init; tl++) |
7527 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | 7266 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); |
7528 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
7529 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
7530 | } | ||
7531 | |||
7532 | for_each_cpu(i, cpu_map) { | ||
7533 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | ||
7534 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
7535 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
7536 | } | ||
7537 | |||
7538 | /* Set up physical groups */ | ||
7539 | for (i = 0; i < nr_node_ids; i++) | ||
7540 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | ||
7541 | 7267 | ||
7542 | #ifdef CONFIG_NUMA | 7268 | while (sd->child) |
7543 | /* Set up node groups */ | 7269 | sd = sd->child; |
7544 | if (d.sd_allnodes) | ||
7545 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
7546 | |||
7547 | for (i = 0; i < nr_node_ids; i++) | ||
7548 | if (build_numa_sched_groups(&d, cpu_map, i)) | ||
7549 | goto error; | ||
7550 | #endif | ||
7551 | 7270 | ||
7552 | /* Calculate CPU power for physical packages and nodes */ | 7271 | *per_cpu_ptr(d.sd, i) = sd; |
7553 | #ifdef CONFIG_SCHED_SMT | ||
7554 | for_each_cpu(i, cpu_map) { | ||
7555 | sd = &per_cpu(cpu_domains, i).sd; | ||
7556 | init_sched_groups_power(i, sd); | ||
7557 | } | ||
7558 | #endif | ||
7559 | #ifdef CONFIG_SCHED_MC | ||
7560 | for_each_cpu(i, cpu_map) { | ||
7561 | sd = &per_cpu(core_domains, i).sd; | ||
7562 | init_sched_groups_power(i, sd); | ||
7563 | } | 7272 | } |
7564 | #endif | ||
7565 | #ifdef CONFIG_SCHED_BOOK | ||
7566 | for_each_cpu(i, cpu_map) { | ||
7567 | sd = &per_cpu(book_domains, i).sd; | ||
7568 | init_sched_groups_power(i, sd); | ||
7569 | } | ||
7570 | #endif | ||
7571 | 7273 | ||
7274 | /* Build the groups for the domains */ | ||
7572 | for_each_cpu(i, cpu_map) { | 7275 | for_each_cpu(i, cpu_map) { |
7573 | sd = &per_cpu(phys_domains, i).sd; | 7276 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
7574 | init_sched_groups_power(i, sd); | 7277 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
7575 | } | 7278 | get_group(i, sd->private, &sd->groups); |
7279 | atomic_inc(&sd->groups->ref); | ||
7576 | 7280 | ||
7577 | #ifdef CONFIG_NUMA | 7281 | if (i != cpumask_first(sched_domain_span(sd))) |
7578 | for (i = 0; i < nr_node_ids; i++) | 7282 | continue; |
7579 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | ||
7580 | 7283 | ||
7581 | if (d.sd_allnodes) { | 7284 | build_sched_groups(sd); |
7582 | struct sched_group *sg; | 7285 | } |
7286 | } | ||
7583 | 7287 | ||
7584 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7288 | /* Calculate CPU power for physical packages and nodes */ |
7585 | d.tmpmask); | 7289 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
7586 | init_numa_sched_groups_power(sg); | 7290 | if (!cpumask_test_cpu(i, cpu_map)) |
7291 | continue; | ||
7292 | |||
7293 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7294 | claim_allocations(i, sd); | ||
7295 | init_sched_groups_power(i, sd); | ||
7296 | } | ||
7587 | } | 7297 | } |
7588 | #endif | ||
7589 | 7298 | ||
7590 | /* Attach the domains */ | 7299 | /* Attach the domains */ |
7300 | rcu_read_lock(); | ||
7591 | for_each_cpu(i, cpu_map) { | 7301 | for_each_cpu(i, cpu_map) { |
7592 | #ifdef CONFIG_SCHED_SMT | 7302 | sd = *per_cpu_ptr(d.sd, i); |
7593 | sd = &per_cpu(cpu_domains, i).sd; | ||
7594 | #elif defined(CONFIG_SCHED_MC) | ||
7595 | sd = &per_cpu(core_domains, i).sd; | ||
7596 | #elif defined(CONFIG_SCHED_BOOK) | ||
7597 | sd = &per_cpu(book_domains, i).sd; | ||
7598 | #else | ||
7599 | sd = &per_cpu(phys_domains, i).sd; | ||
7600 | #endif | ||
7601 | cpu_attach_domain(sd, d.rd, i); | 7303 | cpu_attach_domain(sd, d.rd, i); |
7602 | } | 7304 | } |
7305 | rcu_read_unlock(); | ||
7603 | 7306 | ||
7604 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7307 | ret = 0; |
7605 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
7606 | return 0; | ||
7607 | |||
7608 | error: | 7308 | error: |
7609 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7309 | __free_domain_allocs(&d, alloc_state, cpu_map); |
7610 | return -ENOMEM; | 7310 | return ret; |
7611 | } | ||
7612 | |||
7613 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
7614 | { | ||
7615 | return __build_sched_domains(cpu_map, NULL); | ||
7616 | } | 7311 | } |
7617 | 7312 | ||
7618 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7313 | static cpumask_var_t *doms_cur; /* current sched domains */ |
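
Taken as a whole, the hunk above replaces the per-level __build_*_sched_domain() helpers and the pile of s_data cpumasks with one table-driven pass over sched_domain_topology. Condensed from the code above (error handling, span_weight/get_group bookkeeping and the claim_allocations()/init_sched_groups_power() pass elided), the new build_sched_domains() boils down to:

	struct s_data d;
	struct sched_domain *sd;
	int i;

	__visit_domain_allocation_hell(&d, cpu_map);	/* __sdt_alloc() + percpu d.sd + root domain */

	for_each_cpu(i, cpu_map) {
		struct sched_domain_topology_level *tl;

		sd = NULL;				/* stack SMT -> MC -> BOOK -> CPU -> NUMA */
		for (tl = sched_domain_topology; tl->init; tl++)
			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);

		while (sd->child)			/* remember the lowest level for this cpu */
			sd = sd->child;
		*per_cpu_ptr(d.sd, i) = sd;
	}

	for_each_cpu(i, cpu_map)			/* groups are built once per span, by its first cpu */
		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent)
			if (i == cpumask_first(sched_domain_span(sd)))
				build_sched_groups(sd);

	rcu_read_lock();				/* old domains are now reclaimed via RCU */
	for_each_cpu(i, cpu_map)
		cpu_attach_domain(*per_cpu_ptr(d.sd, i), d.rd, i);
	rcu_read_unlock();
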
@@ -7667,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
7667 | * For now this just excludes isolated cpus, but could be used to | 7362 | * For now this just excludes isolated cpus, but could be used to |
7668 | * exclude other special cases in the future. | 7363 | * exclude other special cases in the future. |
7669 | */ | 7364 | */ |
7670 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7365 | static int init_sched_domains(const struct cpumask *cpu_map) |
7671 | { | 7366 | { |
7672 | int err; | 7367 | int err; |
7673 | 7368 | ||
@@ -7678,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
7678 | doms_cur = &fallback_doms; | 7373 | doms_cur = &fallback_doms; |
7679 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7374 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
7680 | dattr_cur = NULL; | 7375 | dattr_cur = NULL; |
7681 | err = build_sched_domains(doms_cur[0]); | 7376 | err = build_sched_domains(doms_cur[0], NULL); |
7682 | register_sched_domain_sysctl(); | 7377 | register_sched_domain_sysctl(); |
7683 | 7378 | ||
7684 | return err; | 7379 | return err; |
7685 | } | 7380 | } |
7686 | 7381 | ||
7687 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
7688 | struct cpumask *tmpmask) | ||
7689 | { | ||
7690 | free_sched_groups(cpu_map, tmpmask); | ||
7691 | } | ||
7692 | |||
7693 | /* | 7382 | /* |
7694 | * Detach sched domains from a group of cpus specified in cpu_map | 7383 | * Detach sched domains from a group of cpus specified in cpu_map |
7695 | * These cpus will now be attached to the NULL domain | 7384 | * These cpus will now be attached to the NULL domain |
7696 | */ | 7385 | */ |
7697 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7386 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
7698 | { | 7387 | { |
7699 | /* Save because hotplug lock held. */ | ||
7700 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
7701 | int i; | 7388 | int i; |
7702 | 7389 | ||
7390 | rcu_read_lock(); | ||
7703 | for_each_cpu(i, cpu_map) | 7391 | for_each_cpu(i, cpu_map) |
7704 | cpu_attach_domain(NULL, &def_root_domain, i); | 7392 | cpu_attach_domain(NULL, &def_root_domain, i); |
7705 | synchronize_sched(); | 7393 | rcu_read_unlock(); |
7706 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
7707 | } | 7394 | } |
7708 | 7395 | ||
7709 | /* handle null as "default" */ | 7396 | /* handle null as "default" */ |
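
The static tmpmask and the synchronize_sched() call can be dropped because domain teardown is now RCU-deferred: free_rootdomain() takes an rcu_head (&d->rd->rcu above), and, as the sched_fair.c and sched_rt.c hunks below show, every walk of the domain tree is bracketed by rcu_read_lock(). The read-side pattern the rest of this patch converges on is simply:

	struct sched_domain *sd;

	rcu_read_lock();			/* pins the domain tree while we walk it */
	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;
		/* inspect or balance this level */
	}
	rcu_read_unlock();
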
@@ -7792,8 +7479,7 @@ match1: | |||
7792 | goto match2; | 7479 | goto match2; |
7793 | } | 7480 | } |
7794 | /* no match - add a new doms_new */ | 7481 | /* no match - add a new doms_new */ |
7795 | __build_sched_domains(doms_new[i], | 7482 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
7796 | dattr_new ? dattr_new + i : NULL); | ||
7797 | match2: | 7483 | match2: |
7798 | ; | 7484 | ; |
7799 | } | 7485 | } |
@@ -7812,7 +7498,7 @@ match2: | |||
7812 | } | 7498 | } |
7813 | 7499 | ||
7814 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7500 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7815 | static void arch_reinit_sched_domains(void) | 7501 | static void reinit_sched_domains(void) |
7816 | { | 7502 | { |
7817 | get_online_cpus(); | 7503 | get_online_cpus(); |
7818 | 7504 | ||
@@ -7845,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
7845 | else | 7531 | else |
7846 | sched_mc_power_savings = level; | 7532 | sched_mc_power_savings = level; |
7847 | 7533 | ||
7848 | arch_reinit_sched_domains(); | 7534 | reinit_sched_domains(); |
7849 | 7535 | ||
7850 | return count; | 7536 | return count; |
7851 | } | 7537 | } |
@@ -7964,14 +7650,9 @@ void __init sched_init_smp(void) | |||
7964 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7650 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
7965 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7651 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
7966 | 7652 | ||
7967 | #if defined(CONFIG_NUMA) | ||
7968 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
7969 | GFP_KERNEL); | ||
7970 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
7971 | #endif | ||
7972 | get_online_cpus(); | 7653 | get_online_cpus(); |
7973 | mutex_lock(&sched_domains_mutex); | 7654 | mutex_lock(&sched_domains_mutex); |
7974 | arch_init_sched_domains(cpu_active_mask); | 7655 | init_sched_domains(cpu_active_mask); |
7975 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7656 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
7976 | if (cpumask_empty(non_isolated_cpus)) | 7657 | if (cpumask_empty(non_isolated_cpus)) |
7977 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7658 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
@@ -8278,6 +7959,7 @@ void __init sched_init(void) | |||
8278 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 7959 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
8279 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 7960 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
8280 | #ifdef CONFIG_SMP | 7961 | #ifdef CONFIG_SMP |
7962 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
8281 | #ifdef CONFIG_NO_HZ | 7963 | #ifdef CONFIG_NO_HZ |
8282 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 7964 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
8283 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 7965 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
@@ -8337,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8337 | int old_prio = p->prio; | 8019 | int old_prio = p->prio; |
8338 | int on_rq; | 8020 | int on_rq; |
8339 | 8021 | ||
8340 | on_rq = p->se.on_rq; | 8022 | on_rq = p->on_rq; |
8341 | if (on_rq) | 8023 | if (on_rq) |
8342 | deactivate_task(rq, p, 0); | 8024 | deactivate_task(rq, p, 0); |
8343 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8025 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
@@ -8550,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8550 | { | 8232 | { |
8551 | struct rt_rq *rt_rq; | 8233 | struct rt_rq *rt_rq; |
8552 | struct sched_rt_entity *rt_se; | 8234 | struct sched_rt_entity *rt_se; |
8553 | struct rq *rq; | ||
8554 | int i; | 8235 | int i; |
8555 | 8236 | ||
8556 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8237 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
@@ -8564,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8564 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8245 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
8565 | 8246 | ||
8566 | for_each_possible_cpu(i) { | 8247 | for_each_possible_cpu(i) { |
8567 | rq = cpu_rq(i); | ||
8568 | |||
8569 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8248 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
8570 | GFP_KERNEL, cpu_to_node(i)); | 8249 | GFP_KERNEL, cpu_to_node(i)); |
8571 | if (!rt_rq) | 8250 | if (!rt_rq) |
@@ -8680,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8680 | rq = task_rq_lock(tsk, &flags); | 8359 | rq = task_rq_lock(tsk, &flags); |
8681 | 8360 | ||
8682 | running = task_current(rq, tsk); | 8361 | running = task_current(rq, tsk); |
8683 | on_rq = tsk->se.on_rq; | 8362 | on_rq = tsk->on_rq; |
8684 | 8363 | ||
8685 | if (on_rq) | 8364 | if (on_rq) |
8686 | dequeue_task(rq, tsk, 0); | 8365 | dequeue_task(rq, tsk, 0); |
@@ -8699,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk) | |||
8699 | if (on_rq) | 8378 | if (on_rq) |
8700 | enqueue_task(rq, tsk, 0); | 8379 | enqueue_task(rq, tsk, 0); |
8701 | 8380 | ||
8702 | task_rq_unlock(rq, &flags); | 8381 | task_rq_unlock(rq, tsk, &flags); |
8703 | } | 8382 | } |
8704 | #endif /* CONFIG_CGROUP_SCHED */ | 8383 | #endif /* CONFIG_CGROUP_SCHED */ |
8705 | 8384 | ||
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 5946ac515602..429242f3c484 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c | |||
@@ -179,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p) | |||
179 | struct autogroup *ag = autogroup_create(); | 179 | struct autogroup *ag = autogroup_create(); |
180 | 180 | ||
181 | autogroup_move_group(p, ag); | 181 | autogroup_move_group(p, ag); |
182 | /* drop extra refrence added by autogroup_create() */ | 182 | /* drop extra reference added by autogroup_create() */ |
183 | autogroup_kref_put(ag); | 183 | autogroup_kref_put(ag); |
184 | } | 184 | } |
185 | EXPORT_SYMBOL(sched_autogroup_create_attach); | 185 | EXPORT_SYMBOL(sched_autogroup_create_attach); |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a4158..a6710a112b4f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
152 | read_lock_irqsave(&tasklist_lock, flags); | 152 | read_lock_irqsave(&tasklist_lock, flags); |
153 | 153 | ||
154 | do_each_thread(g, p) { | 154 | do_each_thread(g, p) { |
155 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 155 | if (!p->on_rq || task_cpu(p) != rq_cpu) |
156 | continue; | 156 | continue; |
157 | 157 | ||
158 | print_task(m, rq, p); | 158 | print_task(m, rq, p); |
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
296 | P(ttwu_count); | 296 | P(ttwu_count); |
297 | P(ttwu_local); | 297 | P(ttwu_local); |
298 | 298 | ||
299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", | ||
300 | rq->rq_sched_info.bkl_count); | ||
301 | |||
302 | #undef P | 299 | #undef P |
303 | #undef P64 | 300 | #undef P64 |
304 | #endif | 301 | #endif |
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
441 | P(se.statistics.wait_count); | 438 | P(se.statistics.wait_count); |
442 | PN(se.statistics.iowait_sum); | 439 | PN(se.statistics.iowait_sum); |
443 | P(se.statistics.iowait_count); | 440 | P(se.statistics.iowait_count); |
444 | P(sched_info.bkl_count); | ||
445 | P(se.nr_migrations); | 441 | P(se.nr_migrations); |
446 | P(se.statistics.nr_migrations_cold); | 442 | P(se.statistics.nr_migrations_cold); |
447 | P(se.statistics.nr_failed_migrations_affine); | 443 | P(se.statistics.nr_failed_migrations_affine); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c7ec5c8e7b44..37f22626225e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
358 | } | 358 | } |
359 | 359 | ||
360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
361 | #ifndef CONFIG_64BIT | ||
362 | smp_wmb(); | ||
363 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
364 | #endif | ||
361 | } | 365 | } |
362 | 366 | ||
363 | /* | 367 | /* |
@@ -1340,6 +1344,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1340 | hrtick_update(rq); | 1344 | hrtick_update(rq); |
1341 | } | 1345 | } |
1342 | 1346 | ||
1347 | static void set_next_buddy(struct sched_entity *se); | ||
1348 | |||
1343 | /* | 1349 | /* |
1344 | * The dequeue_task method is called before nr_running is | 1350 | * The dequeue_task method is called before nr_running is |
1345 | * decreased. We remove the task from the rbtree and | 1351 | * decreased. We remove the task from the rbtree and |
@@ -1349,14 +1355,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1349 | { | 1355 | { |
1350 | struct cfs_rq *cfs_rq; | 1356 | struct cfs_rq *cfs_rq; |
1351 | struct sched_entity *se = &p->se; | 1357 | struct sched_entity *se = &p->se; |
1358 | int task_sleep = flags & DEQUEUE_SLEEP; | ||
1352 | 1359 | ||
1353 | for_each_sched_entity(se) { | 1360 | for_each_sched_entity(se) { |
1354 | cfs_rq = cfs_rq_of(se); | 1361 | cfs_rq = cfs_rq_of(se); |
1355 | dequeue_entity(cfs_rq, se, flags); | 1362 | dequeue_entity(cfs_rq, se, flags); |
1356 | 1363 | ||
1357 | /* Don't dequeue parent if it has other entities besides us */ | 1364 | /* Don't dequeue parent if it has other entities besides us */ |
1358 | if (cfs_rq->load.weight) | 1365 | if (cfs_rq->load.weight) { |
1366 | /* | ||
1367 | * Bias pick_next to pick a task from this cfs_rq, as | ||
1368 | * p is sleeping when it is within its sched_slice. | ||
1369 | */ | ||
1370 | if (task_sleep && parent_entity(se)) | ||
1371 | set_next_buddy(parent_entity(se)); | ||
1359 | break; | 1372 | break; |
1373 | } | ||
1360 | flags |= DEQUEUE_SLEEP; | 1374 | flags |= DEQUEUE_SLEEP; |
1361 | } | 1375 | } |
1362 | 1376 | ||
@@ -1372,12 +1386,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1372 | 1386 | ||
1373 | #ifdef CONFIG_SMP | 1387 | #ifdef CONFIG_SMP |
1374 | 1388 | ||
1375 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1389 | static void task_waking_fair(struct task_struct *p) |
1376 | { | 1390 | { |
1377 | struct sched_entity *se = &p->se; | 1391 | struct sched_entity *se = &p->se; |
1378 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1392 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1393 | u64 min_vruntime; | ||
1379 | 1394 | ||
1380 | se->vruntime -= cfs_rq->min_vruntime; | 1395 | #ifndef CONFIG_64BIT |
1396 | u64 min_vruntime_copy; | ||
1397 | |||
1398 | do { | ||
1399 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | ||
1400 | smp_rmb(); | ||
1401 | min_vruntime = cfs_rq->min_vruntime; | ||
1402 | } while (min_vruntime != min_vruntime_copy); | ||
1403 | #else | ||
1404 | min_vruntime = cfs_rq->min_vruntime; | ||
1405 | #endif | ||
1406 | |||
1407 | se->vruntime -= min_vruntime; | ||
1381 | } | 1408 | } |
1382 | 1409 | ||
1383 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1410 | #ifdef CONFIG_FAIR_GROUP_SCHED |
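
The copy/barrier pair added to update_min_vruntime() earlier in this file and the retry loop here form a small seqlock-like protocol, so a 32-bit reader can sample the 64-bit min_vruntime without the runqueue lock (task_waking_fair() has lost its struct rq argument and can no longer assume rq->lock is held). Pulled out of the two hunks, the protocol is:

	/* writer side (update_min_vruntime), 32-bit kernels only */
	cfs_rq->min_vruntime = new_min;			/* primary value first */
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; /* shadow copy second */

	/* lockless reader (task_waking_fair): retry until both halves agree */
	do {
		min_vruntime_copy = cfs_rq->min_vruntime_copy;
		smp_rmb();
		min_vruntime = cfs_rq->min_vruntime;
	} while (min_vruntime != min_vruntime_copy);

On 64-bit kernels the load is atomic anyway, hence the #ifndef CONFIG_64BIT guards.
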
@@ -1622,6 +1649,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1622 | /* | 1649 | /* |
1623 | * Otherwise, iterate the domains and find an eligible idle cpu. | 1650 | * Otherwise, iterate the domains and find an eligible idle cpu. |

1624 | */ | 1651 | */ |
1652 | rcu_read_lock(); | ||
1625 | for_each_domain(target, sd) { | 1653 | for_each_domain(target, sd) { |
1626 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 1654 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
1627 | break; | 1655 | break; |
@@ -1641,6 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1641 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 1669 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) |
1642 | break; | 1670 | break; |
1643 | } | 1671 | } |
1672 | rcu_read_unlock(); | ||
1644 | 1673 | ||
1645 | return target; | 1674 | return target; |
1646 | } | 1675 | } |
@@ -1657,7 +1686,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
1657 | * preempt must be disabled. | 1686 | * preempt must be disabled. |
1658 | */ | 1687 | */ |
1659 | static int | 1688 | static int |
1660 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | 1689 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
1661 | { | 1690 | { |
1662 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1691 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
1663 | int cpu = smp_processor_id(); | 1692 | int cpu = smp_processor_id(); |
@@ -1673,6 +1702,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1673 | new_cpu = prev_cpu; | 1702 | new_cpu = prev_cpu; |
1674 | } | 1703 | } |
1675 | 1704 | ||
1705 | rcu_read_lock(); | ||
1676 | for_each_domain(cpu, tmp) { | 1706 | for_each_domain(cpu, tmp) { |
1677 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 1707 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
1678 | continue; | 1708 | continue; |
@@ -1723,9 +1753,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1723 | 1753 | ||
1724 | if (affine_sd) { | 1754 | if (affine_sd) { |
1725 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1755 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1726 | return select_idle_sibling(p, cpu); | 1756 | prev_cpu = cpu; |
1727 | else | 1757 | |
1728 | return select_idle_sibling(p, prev_cpu); | 1758 | new_cpu = select_idle_sibling(p, prev_cpu); |
1759 | goto unlock; | ||
1729 | } | 1760 | } |
1730 | 1761 | ||
1731 | while (sd) { | 1762 | while (sd) { |
@@ -1766,6 +1797,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1766 | } | 1797 | } |
1767 | /* while loop will break here if sd == NULL */ | 1798 | /* while loop will break here if sd == NULL */ |
1768 | } | 1799 | } |
1800 | unlock: | ||
1801 | rcu_read_unlock(); | ||
1769 | 1802 | ||
1770 | return new_cpu; | 1803 | return new_cpu; |
1771 | } | 1804 | } |
@@ -1789,10 +1822,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | |||
1789 | * This is especially important for buddies when the leftmost | 1822 | * This is especially important for buddies when the leftmost |
1790 | * task is higher priority than the buddy. | 1823 | * task is higher priority than the buddy. |
1791 | */ | 1824 | */ |
1792 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1825 | return calc_delta_fair(gran, se); |
1793 | gran = calc_delta_fair(gran, se); | ||
1794 | |||
1795 | return gran; | ||
1796 | } | 1826 | } |
1797 | 1827 | ||
1798 | /* | 1828 | /* |
@@ -1826,26 +1856,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
1826 | 1856 | ||
1827 | static void set_last_buddy(struct sched_entity *se) | 1857 | static void set_last_buddy(struct sched_entity *se) |
1828 | { | 1858 | { |
1829 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1859 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1830 | for_each_sched_entity(se) | 1860 | return; |
1831 | cfs_rq_of(se)->last = se; | 1861 | |
1832 | } | 1862 | for_each_sched_entity(se) |
1863 | cfs_rq_of(se)->last = se; | ||
1833 | } | 1864 | } |
1834 | 1865 | ||
1835 | static void set_next_buddy(struct sched_entity *se) | 1866 | static void set_next_buddy(struct sched_entity *se) |
1836 | { | 1867 | { |
1837 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1868 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
1838 | for_each_sched_entity(se) | 1869 | return; |
1839 | cfs_rq_of(se)->next = se; | 1870 | |
1840 | } | 1871 | for_each_sched_entity(se) |
1872 | cfs_rq_of(se)->next = se; | ||
1841 | } | 1873 | } |
1842 | 1874 | ||
1843 | static void set_skip_buddy(struct sched_entity *se) | 1875 | static void set_skip_buddy(struct sched_entity *se) |
1844 | { | 1876 | { |
1845 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1877 | for_each_sched_entity(se) |
1846 | for_each_sched_entity(se) | 1878 | cfs_rq_of(se)->skip = se; |
1847 | cfs_rq_of(se)->skip = se; | ||
1848 | } | ||
1849 | } | 1879 | } |
1850 | 1880 | ||
1851 | /* | 1881 | /* |
@@ -1857,12 +1887,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1857 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1887 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1858 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1888 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1859 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1889 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
1890 | int next_buddy_marked = 0; | ||
1860 | 1891 | ||
1861 | if (unlikely(se == pse)) | 1892 | if (unlikely(se == pse)) |
1862 | return; | 1893 | return; |
1863 | 1894 | ||
1864 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) | 1895 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
1865 | set_next_buddy(pse); | 1896 | set_next_buddy(pse); |
1897 | next_buddy_marked = 1; | ||
1898 | } | ||
1866 | 1899 | ||
1867 | /* | 1900 | /* |
1868 | * We can come here with TIF_NEED_RESCHED already set from new task | 1901 | * We can come here with TIF_NEED_RESCHED already set from new task |
@@ -1890,8 +1923,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1890 | update_curr(cfs_rq); | 1923 | update_curr(cfs_rq); |
1891 | find_matching_se(&se, &pse); | 1924 | find_matching_se(&se, &pse); |
1892 | BUG_ON(!pse); | 1925 | BUG_ON(!pse); |
1893 | if (wakeup_preempt_entity(se, pse) == 1) | 1926 | if (wakeup_preempt_entity(se, pse) == 1) { |
1927 | /* | ||
1928 | * Bias pick_next to pick the sched entity that is | ||
1929 | * triggering this preemption. | ||
1930 | */ | ||
1931 | if (!next_buddy_marked) | ||
1932 | set_next_buddy(pse); | ||
1894 | goto preempt; | 1933 | goto preempt; |
1934 | } | ||
1895 | 1935 | ||
1896 | return; | 1936 | return; |
1897 | 1937 | ||
@@ -2102,23 +2142,22 @@ static unsigned long | |||
2102 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2142 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2103 | unsigned long max_load_move, struct sched_domain *sd, | 2143 | unsigned long max_load_move, struct sched_domain *sd, |
2104 | enum cpu_idle_type idle, int *all_pinned, | 2144 | enum cpu_idle_type idle, int *all_pinned, |
2105 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2145 | struct cfs_rq *busiest_cfs_rq) |
2106 | { | 2146 | { |
2107 | int loops = 0, pulled = 0, pinned = 0; | 2147 | int loops = 0, pulled = 0; |
2108 | long rem_load_move = max_load_move; | 2148 | long rem_load_move = max_load_move; |
2109 | struct task_struct *p, *n; | 2149 | struct task_struct *p, *n; |
2110 | 2150 | ||
2111 | if (max_load_move == 0) | 2151 | if (max_load_move == 0) |
2112 | goto out; | 2152 | goto out; |
2113 | 2153 | ||
2114 | pinned = 1; | ||
2115 | |||
2116 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { | 2154 | list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { |
2117 | if (loops++ > sysctl_sched_nr_migrate) | 2155 | if (loops++ > sysctl_sched_nr_migrate) |
2118 | break; | 2156 | break; |
2119 | 2157 | ||
2120 | if ((p->se.load.weight >> 1) > rem_load_move || | 2158 | if ((p->se.load.weight >> 1) > rem_load_move || |
2121 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) | 2159 | !can_migrate_task(p, busiest, this_cpu, sd, idle, |
2160 | all_pinned)) | ||
2122 | continue; | 2161 | continue; |
2123 | 2162 | ||
2124 | pull_task(busiest, p, this_rq, this_cpu); | 2163 | pull_task(busiest, p, this_rq, this_cpu); |
@@ -2141,9 +2180,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2141 | */ | 2180 | */ |
2142 | if (rem_load_move <= 0) | 2181 | if (rem_load_move <= 0) |
2143 | break; | 2182 | break; |
2144 | |||
2145 | if (p->prio < *this_best_prio) | ||
2146 | *this_best_prio = p->prio; | ||
2147 | } | 2183 | } |
2148 | out: | 2184 | out: |
2149 | /* | 2185 | /* |
@@ -2153,9 +2189,6 @@ out: | |||
2153 | */ | 2189 | */ |
2154 | schedstat_add(sd, lb_gained[idle], pulled); | 2190 | schedstat_add(sd, lb_gained[idle], pulled); |
2155 | 2191 | ||
2156 | if (all_pinned) | ||
2157 | *all_pinned = pinned; | ||
2158 | |||
2159 | return max_load_move - rem_load_move; | 2192 | return max_load_move - rem_load_move; |
2160 | } | 2193 | } |
2161 | 2194 | ||
@@ -2206,7 +2239,7 @@ static unsigned long | |||
2206 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2239 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2207 | unsigned long max_load_move, | 2240 | unsigned long max_load_move, |
2208 | struct sched_domain *sd, enum cpu_idle_type idle, | 2241 | struct sched_domain *sd, enum cpu_idle_type idle, |
2209 | int *all_pinned, int *this_best_prio) | 2242 | int *all_pinned) |
2210 | { | 2243 | { |
2211 | long rem_load_move = max_load_move; | 2244 | long rem_load_move = max_load_move; |
2212 | int busiest_cpu = cpu_of(busiest); | 2245 | int busiest_cpu = cpu_of(busiest); |
@@ -2231,7 +2264,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2231 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2264 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
2232 | 2265 | ||
2233 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 2266 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
2234 | rem_load, sd, idle, all_pinned, this_best_prio, | 2267 | rem_load, sd, idle, all_pinned, |
2235 | busiest_cfs_rq); | 2268 | busiest_cfs_rq); |
2236 | 2269 | ||
2237 | if (!moved_load) | 2270 | if (!moved_load) |
@@ -2257,11 +2290,11 @@ static unsigned long | |||
2257 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2290 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2258 | unsigned long max_load_move, | 2291 | unsigned long max_load_move, |
2259 | struct sched_domain *sd, enum cpu_idle_type idle, | 2292 | struct sched_domain *sd, enum cpu_idle_type idle, |
2260 | int *all_pinned, int *this_best_prio) | 2293 | int *all_pinned) |
2261 | { | 2294 | { |
2262 | return balance_tasks(this_rq, this_cpu, busiest, | 2295 | return balance_tasks(this_rq, this_cpu, busiest, |
2263 | max_load_move, sd, idle, all_pinned, | 2296 | max_load_move, sd, idle, all_pinned, |
2264 | this_best_prio, &busiest->cfs); | 2297 | &busiest->cfs); |
2265 | } | 2298 | } |
2266 | #endif | 2299 | #endif |
2267 | 2300 | ||
@@ -2278,12 +2311,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2278 | int *all_pinned) | 2311 | int *all_pinned) |
2279 | { | 2312 | { |
2280 | unsigned long total_load_moved = 0, load_moved; | 2313 | unsigned long total_load_moved = 0, load_moved; |
2281 | int this_best_prio = this_rq->curr->prio; | ||
2282 | 2314 | ||
2283 | do { | 2315 | do { |
2284 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 2316 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
2285 | max_load_move - total_load_moved, | 2317 | max_load_move - total_load_moved, |
2286 | sd, idle, all_pinned, &this_best_prio); | 2318 | sd, idle, all_pinned); |
2287 | 2319 | ||
2288 | total_load_moved += load_moved; | 2320 | total_load_moved += load_moved; |
2289 | 2321 | ||
@@ -2652,7 +2684,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2652 | /* | 2684 | /* |
2653 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | 2685 | * Only siblings can have significantly less than SCHED_LOAD_SCALE |
2654 | */ | 2686 | */ |
2655 | if (sd->level != SD_LV_SIBLING) | 2687 | if (!(sd->flags & SD_SHARE_CPUPOWER)) |
2656 | return 0; | 2688 | return 0; |
2657 | 2689 | ||
2658 | /* | 2690 | /* |
@@ -3062,7 +3094,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3062 | 3094 | ||
3063 | /* | 3095 | /* |
3064 | * if *imbalance is less than the average load per runnable task | 3096 | * if *imbalance is less than the average load per runnable task |
3065 | * there is no gaurantee that any tasks will be moved so we'll have | 3097 | * there is no guarantee that any tasks will be moved so we'll have |
3066 | * a think about bumping its value to force at least one task to be | 3098 | * a think about bumping its value to force at least one task to be |
3067 | * moved | 3099 | * moved |
3068 | */ | 3100 | */ |
@@ -3127,6 +3159,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3127 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3159 | if (!sds.busiest || sds.busiest_nr_running == 0) |
3128 | goto out_balanced; | 3160 | goto out_balanced; |
3129 | 3161 | ||
3162 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
3163 | |||
3130 | /* | 3164 | /* |
3131 | * If the busiest group is imbalanced the below checks don't | 3165 | * If the busiest group is imbalanced the below checks don't |
3132 | * work because they assume all things are equal, which typically | 3166 | * work because they assume all things are equal, which typically |
@@ -3151,7 +3185,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3151 | * Don't pull any tasks if this group is already above the domain | 3185 | * Don't pull any tasks if this group is already above the domain |
3152 | * average load. | 3186 | * average load. |
3153 | */ | 3187 | */ |
3154 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | ||
3155 | if (sds.this_load >= sds.avg_load) | 3188 | if (sds.this_load >= sds.avg_load) |
3156 | goto out_balanced; | 3189 | goto out_balanced; |
3157 | 3190 | ||
@@ -3340,6 +3373,7 @@ redo: | |||
3340 | * still unbalanced. ld_moved simply stays zero, so it is | 3373 | * still unbalanced. ld_moved simply stays zero, so it is |
3341 | * correctly treated as an imbalance. | 3374 | * correctly treated as an imbalance. |
3342 | */ | 3375 | */ |
3376 | all_pinned = 1; | ||
3343 | local_irq_save(flags); | 3377 | local_irq_save(flags); |
3344 | double_rq_lock(this_rq, busiest); | 3378 | double_rq_lock(this_rq, busiest); |
3345 | ld_moved = move_tasks(this_rq, this_cpu, busiest, | 3379 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
@@ -3467,6 +3501,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3467 | raw_spin_unlock(&this_rq->lock); | 3501 | raw_spin_unlock(&this_rq->lock); |
3468 | 3502 | ||
3469 | update_shares(this_cpu); | 3503 | update_shares(this_cpu); |
3504 | rcu_read_lock(); | ||
3470 | for_each_domain(this_cpu, sd) { | 3505 | for_each_domain(this_cpu, sd) { |
3471 | unsigned long interval; | 3506 | unsigned long interval; |
3472 | int balance = 1; | 3507 | int balance = 1; |
@@ -3488,6 +3523,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
3488 | break; | 3523 | break; |
3489 | } | 3524 | } |
3490 | } | 3525 | } |
3526 | rcu_read_unlock(); | ||
3491 | 3527 | ||
3492 | raw_spin_lock(&this_rq->lock); | 3528 | raw_spin_lock(&this_rq->lock); |
3493 | 3529 | ||
@@ -3536,6 +3572,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3536 | double_lock_balance(busiest_rq, target_rq); | 3572 | double_lock_balance(busiest_rq, target_rq); |
3537 | 3573 | ||
3538 | /* Search for an sd spanning us and the target CPU. */ | 3574 | /* Search for an sd spanning us and the target CPU. */ |
3575 | rcu_read_lock(); | ||
3539 | for_each_domain(target_cpu, sd) { | 3576 | for_each_domain(target_cpu, sd) { |
3540 | if ((sd->flags & SD_LOAD_BALANCE) && | 3577 | if ((sd->flags & SD_LOAD_BALANCE) && |
3541 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 3578 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) |
@@ -3551,6 +3588,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
3551 | else | 3588 | else |
3552 | schedstat_inc(sd, alb_failed); | 3589 | schedstat_inc(sd, alb_failed); |
3553 | } | 3590 | } |
3591 | rcu_read_unlock(); | ||
3554 | double_unlock_balance(busiest_rq, target_rq); | 3592 | double_unlock_balance(busiest_rq, target_rq); |
3555 | out_unlock: | 3593 | out_unlock: |
3556 | busiest_rq->active_balance = 0; | 3594 | busiest_rq->active_balance = 0; |
@@ -3677,6 +3715,7 @@ static int find_new_ilb(int cpu) | |||
3677 | { | 3715 | { |
3678 | struct sched_domain *sd; | 3716 | struct sched_domain *sd; |
3679 | struct sched_group *ilb_group; | 3717 | struct sched_group *ilb_group; |
3718 | int ilb = nr_cpu_ids; | ||
3680 | 3719 | ||
3681 | /* | 3720 | /* |
3682 | * Have idle load balancer selection from semi-idle packages only | 3721 | * Have idle load balancer selection from semi-idle packages only |
@@ -3692,20 +3731,25 @@ static int find_new_ilb(int cpu) | |||
3692 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | 3731 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
3693 | goto out_done; | 3732 | goto out_done; |
3694 | 3733 | ||
3734 | rcu_read_lock(); | ||
3695 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3735 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
3696 | ilb_group = sd->groups; | 3736 | ilb_group = sd->groups; |
3697 | 3737 | ||
3698 | do { | 3738 | do { |
3699 | if (is_semi_idle_group(ilb_group)) | 3739 | if (is_semi_idle_group(ilb_group)) { |
3700 | return cpumask_first(nohz.grp_idle_mask); | 3740 | ilb = cpumask_first(nohz.grp_idle_mask); |
3741 | goto unlock; | ||
3742 | } | ||
3701 | 3743 | ||
3702 | ilb_group = ilb_group->next; | 3744 | ilb_group = ilb_group->next; |
3703 | 3745 | ||
3704 | } while (ilb_group != sd->groups); | 3746 | } while (ilb_group != sd->groups); |
3705 | } | 3747 | } |
3748 | unlock: | ||
3749 | rcu_read_unlock(); | ||
3706 | 3750 | ||
3707 | out_done: | 3751 | out_done: |
3708 | return nr_cpu_ids; | 3752 | return ilb; |
3709 | } | 3753 | } |
3710 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3754 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
3711 | static inline int find_new_ilb(int call_cpu) | 3755 | static inline int find_new_ilb(int call_cpu) |
@@ -3820,6 +3864,17 @@ void select_nohz_load_balancer(int stop_tick) | |||
3820 | 3864 | ||
3821 | static DEFINE_SPINLOCK(balancing); | 3865 | static DEFINE_SPINLOCK(balancing); |
3822 | 3866 | ||
3867 | static unsigned long __read_mostly max_load_balance_interval = HZ/10; | ||
3868 | |||
3869 | /* | ||
3870 | * Scale the max load_balance interval with the number of CPUs in the system. | ||
3871 | * This trades load-balance latency on larger machines for less cross talk. | ||
3872 | */ | ||
3873 | static void update_max_interval(void) | ||
3874 | { | ||
3875 | max_load_balance_interval = HZ*num_online_cpus()/10; | ||
3876 | } | ||
3877 | |||
3823 | /* | 3878 | /* |
3824 | * It checks each scheduling domain to see if it is due to be balanced, | 3879 | * It checks each scheduling domain to see if it is due to be balanced, |
3825 | * and initiates a balancing operation if so. | 3880 | * and initiates a balancing operation if so. |
@@ -3839,6 +3894,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3839 | 3894 | ||
3840 | update_shares(cpu); | 3895 | update_shares(cpu); |
3841 | 3896 | ||
3897 | rcu_read_lock(); | ||
3842 | for_each_domain(cpu, sd) { | 3898 | for_each_domain(cpu, sd) { |
3843 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3899 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3844 | continue; | 3900 | continue; |
@@ -3849,10 +3905,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3849 | 3905 | ||
3850 | /* scale ms to jiffies */ | 3906 | /* scale ms to jiffies */ |
3851 | interval = msecs_to_jiffies(interval); | 3907 | interval = msecs_to_jiffies(interval); |
3852 | if (unlikely(!interval)) | 3908 | interval = clamp(interval, 1UL, max_load_balance_interval); |
3853 | interval = 1; | ||
3854 | if (interval > HZ*num_online_cpus()/10) | ||
3855 | interval = HZ*num_online_cpus()/10; | ||
3856 | 3909 | ||
3857 | need_serialize = sd->flags & SD_SERIALIZE; | 3910 | need_serialize = sd->flags & SD_SERIALIZE; |
3858 | 3911 | ||
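
The two open-coded bounds above collapse into clamp(interval, 1UL, max_load_balance_interval), with update_max_interval() (added in the previous hunk, presumably wired up to the CPU hotplug notifiers elsewhere in this patch) keeping the ceiling at HZ*num_online_cpus()/10. As a worked example with HZ=1000: on an 8-CPU box the ceiling is 1000*8/10 = 800 jiffies (0.8 s), so a domain whose balance_interval has backed off to 2000 ms is clamped to 800 jiffies, while an interval that rounds down to 0 jiffies after msecs_to_jiffies() is raised back to 1.
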
@@ -3887,6 +3940,7 @@ out: | |||
3887 | if (!balance) | 3940 | if (!balance) |
3888 | break; | 3941 | break; |
3889 | } | 3942 | } |
3943 | rcu_read_unlock(); | ||
3890 | 3944 | ||
3891 | /* | 3945 | /* |
3892 | * next_balance will be updated only when there is a need. | 3946 | * next_balance will be updated only when there is a need. |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..be40f7371ee1 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) | |||
64 | * Decrement CPU power based on irq activity | 64 | * Decrement CPU power based on irq activity |
65 | */ | 65 | */ |
66 | SCHED_FEAT(NONIRQ_POWER, 1) | 66 | SCHED_FEAT(NONIRQ_POWER, 1) |
67 | |||
68 | /* | ||
69 | * Queue remote wakeups on the target CPU and process them | ||
70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | ||
71 | */ | ||
72 | SCHED_FEAT(TTWU_QUEUE, 1) | ||
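
TTWU_QUEUE is a runtime-tunable feature bit like the others in this file: sched_feat(TTWU_QUEUE) tests the corresponding bit in sysctl_sched_features, and with CONFIG_SCHED_DEBUG it can be flipped through the sched_features debugfs file (writing NO_TTWU_QUEUE disables it). The wakeup-path consumer lives in sched.c rather than in this hunk; its shape is roughly the following (hypothetical excerpt, function names assumed, not part of this hunk):

	/* in the try_to_wake_up() path, before touching the remote runqueue */
	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
		/* park the wakeup on the target cpu's list; it is processed
		 * from that cpu's scheduler IPI instead of taking its rq->lock */
		ttwu_queue_remote(p, cpu);
		return;
	}
	/* otherwise: lock the target rq and perform the wakeup directly */
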
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a6396427..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -7,7 +7,7 @@ | |||
7 | 7 | ||
8 | #ifdef CONFIG_SMP | 8 | #ifdef CONFIG_SMP |
9 | static int | 9 | static int |
10 | select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 10 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
11 | { | 11 | { |
12 | return task_cpu(p); /* IDLE tasks are never migrated */ | 12 | return task_cpu(p); /* IDLE tasks are never migrated */ |
13 | } | 13 | } |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index db308cb08b75..64b2a37c07d0 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
184 | } | 184 | } |
185 | 185 | ||
186 | typedef struct task_group *rt_rq_iter_t; | ||
187 | |||
188 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
189 | for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ | ||
190 | (&iter->list != &task_groups) && \ | ||
191 | (rt_rq = iter->rt_rq[cpu_of(rq)]); \ | ||
192 | iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) | ||
193 | |||
186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 194 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
187 | { | 195 | { |
188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | 196 | list_add_rcu(&rt_rq->leaf_rt_rq_list, |
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
288 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 296 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
289 | } | 297 | } |
290 | 298 | ||
299 | typedef struct rt_rq *rt_rq_iter_t; | ||
300 | |||
301 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
302 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
303 | |||
291 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 304 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
292 | { | 305 | { |
293 | } | 306 | } |
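With RT group scheduling disabled, the for_each_rt_rq() variant above degenerates to a single pass over rq->rt; a minimal usage sketch (the loop body is illustrative):

/* Sketch: the body runs exactly once, with rt_rq == &rq->rt. */
static void walk_rt_rqs(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	for_each_rt_rq(rt_rq, iter, rq) {
		/* inspect or reset per-rt_rq state here */
	}
}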
@@ -402,12 +415,13 @@ next: | |||
402 | static void __disable_runtime(struct rq *rq) | 415 | static void __disable_runtime(struct rq *rq) |
403 | { | 416 | { |
404 | struct root_domain *rd = rq->rd; | 417 | struct root_domain *rd = rq->rd; |
418 | rt_rq_iter_t iter; | ||
405 | struct rt_rq *rt_rq; | 419 | struct rt_rq *rt_rq; |
406 | 420 | ||
407 | if (unlikely(!scheduler_running)) | 421 | if (unlikely(!scheduler_running)) |
408 | return; | 422 | return; |
409 | 423 | ||
410 | for_each_leaf_rt_rq(rt_rq, rq) { | 424 | for_each_rt_rq(rt_rq, iter, rq) { |
411 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 425 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
412 | s64 want; | 426 | s64 want; |
413 | int i; | 427 | int i; |
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq) | |||
487 | 501 | ||
488 | static void __enable_runtime(struct rq *rq) | 502 | static void __enable_runtime(struct rq *rq) |
489 | { | 503 | { |
504 | rt_rq_iter_t iter; | ||
490 | struct rt_rq *rt_rq; | 505 | struct rt_rq *rt_rq; |
491 | 506 | ||
492 | if (unlikely(!scheduler_running)) | 507 | if (unlikely(!scheduler_running)) |
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq) | |||
495 | /* | 510 | /* |
496 | * Reset each runqueue's bandwidth settings | 511 | * Reset each runqueue's bandwidth settings |
497 | */ | 512 | */ |
498 | for_each_leaf_rt_rq(rt_rq, rq) { | 513 | for_each_rt_rq(rt_rq, iter, rq) { |
499 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 514 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
500 | 515 | ||
501 | raw_spin_lock(&rt_b->rt_runtime_lock); | 516 | raw_spin_lock(&rt_b->rt_runtime_lock); |
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
562 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | 577 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { |
563 | rt_rq->rt_throttled = 0; | 578 | rt_rq->rt_throttled = 0; |
564 | enqueue = 1; | 579 | enqueue = 1; |
580 | |||
581 | /* | ||
582 | * Force a clock update if the CPU was idle, | ||
583 | * lest wakeup -> unthrottle time accumulate. | ||
584 | */ | ||
585 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | ||
586 | rq->skip_clock_update = -1; | ||
565 | } | 587 | } |
566 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 588 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
567 | idle = 0; | 589 | idle = 0; |
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq) | |||
977 | static int find_lowest_rq(struct task_struct *task); | 999 | static int find_lowest_rq(struct task_struct *task); |
978 | 1000 | ||
979 | static int | 1001 | static int |
980 | select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 1002 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
981 | { | 1003 | { |
1004 | struct task_struct *curr; | ||
1005 | struct rq *rq; | ||
1006 | int cpu; | ||
1007 | |||
982 | if (sd_flag != SD_BALANCE_WAKE) | 1008 | if (sd_flag != SD_BALANCE_WAKE) |
983 | return smp_processor_id(); | 1009 | return smp_processor_id(); |
984 | 1010 | ||
1011 | cpu = task_cpu(p); | ||
1012 | rq = cpu_rq(cpu); | ||
1013 | |||
1014 | rcu_read_lock(); | ||
1015 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
1016 | |||
985 | /* | 1017 | /* |
986 | * If the current task is an RT task, then | 1018 | * If the current task on @p's runqueue is an RT task, then |
987 | * try to see if we can wake this RT task up on another | 1019 | * try to see if we can wake this RT task up on another |
988 | * runqueue. Otherwise simply start this RT task | 1020 | * runqueue. Otherwise simply start this RT task |
989 | * on its current runqueue. | 1021 | * on its current runqueue. |
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | |||
997 | * lock? | 1029 | * lock? |
998 | * | 1030 | * |
999 | * For equal prio tasks, we just let the scheduler sort it out. | 1031 | * For equal prio tasks, we just let the scheduler sort it out. |
1032 | * | ||
1033 | * Otherwise, just let it ride on the affined RQ and the | ||
1034 | * post-schedule router will push the preempted task away | ||
1035 | * | ||
1036 | * This test is optimistic, if we get it wrong the load-balancer | ||
1037 | * will have to sort it out. | ||
1000 | */ | 1038 | */ |
1001 | if (unlikely(rt_task(rq->curr)) && | 1039 | if (curr && unlikely(rt_task(curr)) && |
1002 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1040 | (curr->rt.nr_cpus_allowed < 2 || |
1003 | rq->curr->prio < p->prio) && | 1041 | curr->prio < p->prio) && |
1004 | (p->rt.nr_cpus_allowed > 1)) { | 1042 | (p->rt.nr_cpus_allowed > 1)) { |
1005 | int cpu = find_lowest_rq(p); | 1043 | int target = find_lowest_rq(p); |
1006 | 1044 | ||
1007 | return (cpu == -1) ? task_cpu(p) : cpu; | 1045 | if (target != -1) |
1046 | cpu = target; | ||
1008 | } | 1047 | } |
1048 | rcu_read_unlock(); | ||
1009 | 1049 | ||
1010 | /* | 1050 | return cpu; |
1011 | * Otherwise, just let it ride on the affined RQ and the | ||
1012 | * post-schedule router will push the preempted task away | ||
1013 | */ | ||
1014 | return task_cpu(p); | ||
1015 | } | 1051 | } |
1016 | 1052 | ||
1017 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1053 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1136 | * The previous task needs to be made eligible for pushing | 1172 | * The previous task needs to be made eligible for pushing |
1137 | * if it is still active | 1173 | * if it is still active |
1138 | */ | 1174 | */ |
1139 | if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) | 1175 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) |
1140 | enqueue_pushable_task(rq, p); | 1176 | enqueue_pushable_task(rq, p); |
1141 | } | 1177 | } |
1142 | 1178 | ||
@@ -1287,7 +1323,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1287 | !cpumask_test_cpu(lowest_rq->cpu, | 1323 | !cpumask_test_cpu(lowest_rq->cpu, |
1288 | &task->cpus_allowed) || | 1324 | &task->cpus_allowed) || |
1289 | task_running(rq, task) || | 1325 | task_running(rq, task) || |
1290 | !task->se.on_rq)) { | 1326 | !task->on_rq)) { |
1291 | 1327 | ||
1292 | raw_spin_unlock(&lowest_rq->lock); | 1328 | raw_spin_unlock(&lowest_rq->lock); |
1293 | lowest_rq = NULL; | 1329 | lowest_rq = NULL; |
@@ -1321,7 +1357,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
1321 | BUG_ON(task_current(rq, p)); | 1357 | BUG_ON(task_current(rq, p)); |
1322 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1358 | BUG_ON(p->rt.nr_cpus_allowed <= 1); |
1323 | 1359 | ||
1324 | BUG_ON(!p->se.on_rq); | 1360 | BUG_ON(!p->on_rq); |
1325 | BUG_ON(!rt_task(p)); | 1361 | BUG_ON(!rt_task(p)); |
1326 | 1362 | ||
1327 | return p; | 1363 | return p; |
@@ -1378,7 +1414,7 @@ retry: | |||
1378 | task = pick_next_pushable_task(rq); | 1414 | task = pick_next_pushable_task(rq); |
1379 | if (task_cpu(next_task) == rq->cpu && task == next_task) { | 1415 | if (task_cpu(next_task) == rq->cpu && task == next_task) { |
1380 | /* | 1416 | /* |
1381 | * If we get here, the task hasnt moved at all, but | 1417 | * If we get here, the task hasn't moved at all, but |
1382 | * it has failed to push. We will not try again, | 1418 | * it has failed to push. We will not try again, |
1383 | * since the other cpus will pull from us when they | 1419 | * since the other cpus will pull from us when they |
1384 | * are ready. | 1420 | * are ready. |
@@ -1467,7 +1503,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1467 | */ | 1503 | */ |
1468 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1504 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
1469 | WARN_ON(p == src_rq->curr); | 1505 | WARN_ON(p == src_rq->curr); |
1470 | WARN_ON(!p->se.on_rq); | 1506 | WARN_ON(!p->on_rq); |
1471 | 1507 | ||
1472 | /* | 1508 | /* |
1473 | * There's a chance that p is higher in priority | 1509 | * There's a chance that p is higher in priority |
@@ -1488,7 +1524,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
1488 | /* | 1524 | /* |
1489 | * We continue with the search, just in | 1525 | * We continue with the search, just in |
1490 | * case there's an even higher prio task | 1526 | * case there's an even higher prio task |
1491 | * in another runqueue. (low likelyhood | 1527 | * in another runqueue. (low likelihood |
1492 | * but possible) | 1528 | * but possible) |
1493 | */ | 1529 | */ |
1494 | } | 1530 | } |
@@ -1538,7 +1574,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
1538 | * Update the migration status of the RQ if we have an RT task | 1574 | * Update the migration status of the RQ if we have an RT task |
1539 | * which is running AND changing its weight value. | 1575 | * which is running AND changing its weight value. |
1540 | */ | 1576 | */ |
1541 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | 1577 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { |
1542 | struct rq *rq = task_rq(p); | 1578 | struct rq *rq = task_rq(p); |
1543 | 1579 | ||
1544 | if (!task_current(rq, p)) { | 1580 | if (!task_current(rq, p)) { |
@@ -1608,7 +1644,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
1608 | * we may need to handle the pulling of RT tasks | 1644 | * we may need to handle the pulling of RT tasks |
1609 | * now. | 1645 | * now. |
1610 | */ | 1646 | */ |
1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) | 1647 | if (p->on_rq && !rq->rt.rt_nr_running) |
1612 | pull_rt_task(rq); | 1648 | pull_rt_task(rq); |
1613 | } | 1649 | } |
1614 | 1650 | ||
@@ -1638,7 +1674,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1638 | * If that current running task is also an RT task | 1674 | * If that current running task is also an RT task |
1639 | * then see if we can move to another run queue. | 1675 | * then see if we can move to another run queue. |
1640 | */ | 1676 | */ |
1641 | if (p->se.on_rq && rq->curr != p) { | 1677 | if (p->on_rq && rq->curr != p) { |
1642 | #ifdef CONFIG_SMP | 1678 | #ifdef CONFIG_SMP |
1643 | if (rq->rt.overloaded && push_rt_task(rq) && | 1679 | if (rq->rt.overloaded && push_rt_task(rq) && |
1644 | /* Don't resched if we changed runqueues */ | 1680 | /* Don't resched if we changed runqueues */ |
@@ -1657,7 +1693,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
1657 | static void | 1693 | static void |
1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1694 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1659 | { | 1695 | { |
1660 | if (!p->se.on_rq) | 1696 | if (!p->on_rq) |
1661 | return; | 1697 | return; |
1662 | 1698 | ||
1663 | if (rq->curr == p) { | 1699 | if (rq->curr == p) { |
@@ -1796,10 +1832,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | |||
1796 | 1832 | ||
1797 | static void print_rt_stats(struct seq_file *m, int cpu) | 1833 | static void print_rt_stats(struct seq_file *m, int cpu) |
1798 | { | 1834 | { |
1835 | rt_rq_iter_t iter; | ||
1799 | struct rt_rq *rt_rq; | 1836 | struct rt_rq *rt_rq; |
1800 | 1837 | ||
1801 | rcu_read_lock(); | 1838 | rcu_read_lock(); |
1802 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | 1839 | for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) |
1803 | print_rt_rq(m, cpu, rt_rq); | 1840 | print_rt_rq(m, cpu, rt_rq); |
1804 | rcu_read_unlock(); | 1841 | rcu_read_unlock(); |
1805 | } | 1842 | } |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fdac..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -9,8 +9,7 @@ | |||
9 | 9 | ||
10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
11 | static int | 11 | static int |
12 | select_task_rq_stop(struct rq *rq, struct task_struct *p, | 12 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) |
13 | int sd_flag, int flags) | ||
14 | { | 13 | { |
15 | return task_cpu(p); /* stop tasks as never migrate */ | 14 | return task_cpu(p); /* stop tasks as never migrate */ |
16 | } | 15 | } |
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
26 | { | 25 | { |
27 | struct task_struct *stop = rq->stop; | 26 | struct task_struct *stop = rq->stop; |
28 | 27 | ||
29 | if (stop && stop->se.on_rq) | 28 | if (stop && stop->on_rq) |
30 | return stop; | 29 | return stop; |
31 | 30 | ||
32 | return NULL; | 31 | return NULL; |
diff --git a/kernel/signal.c b/kernel/signal.c index 35c5f4b05344..ad5e818baacc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2068,7 +2068,7 @@ relock: | |||
2068 | for (;;) { | 2068 | for (;;) { |
2069 | struct k_sigaction *ka; | 2069 | struct k_sigaction *ka; |
2070 | /* | 2070 | /* |
2071 | * Tracing can induce an artifical signal and choose sigaction. | 2071 | * Tracing can induce an artificial signal and choose sigaction. |
2072 | * The return value in @signr determines the default action, | 2072 | * The return value in @signr determines the default action, |
2073 | * but @info->si_signo is the signal number we will report. | 2073 | * but @info->si_signo is the signal number we will report. |
2074 | */ | 2074 | */ |
@@ -2941,8 +2941,8 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, | |||
2941 | /** | 2941 | /** |
2942 | * sys_rt_sigaction - alter an action taken by a process | 2942 | * sys_rt_sigaction - alter an action taken by a process |
2943 | * @sig: signal to be sent | 2943 | * @sig: signal to be sent |
2944 | * @act: the thread group ID of the thread | 2944 | * @act: new sigaction |
2945 | * @oact: the PID of the thread | 2945 | * @oact: used to save the previous sigaction |
2946 | * @sigsetsize: size of sigset_t type | 2946 | * @sigsetsize: size of sigset_t type |
2947 | */ | 2947 | */ |
2948 | SYSCALL_DEFINE4(rt_sigaction, int, sig, | 2948 | SYSCALL_DEFINE4(rt_sigaction, int, sig, |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 735d87095172..13960170cad4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | |||
58 | 58 | ||
59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
61 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 61 | "TASKLET", "SCHED", "HRTIMER" |
62 | }; | 62 | }; |
63 | 63 | ||
64 | /* | 64 | /* |
@@ -567,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data) | |||
567 | /** | 567 | /** |
568 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks | 568 | * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks |
569 | * @ttimer: tasklet_hrtimer which is initialized | 569 | * @ttimer: tasklet_hrtimer which is initialized |
570 | * @function: hrtimer callback funtion which gets called from softirq context | 570 | * @function: hrtimer callback function which gets called from softirq context |
571 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) | 571 | * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) |
572 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) | 572 | * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) |
573 | */ | 573 | */ |
diff --git a/kernel/sys.c b/kernel/sys.c index af468edf096a..e4128b278f23 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -314,8 +314,8 @@ void kernel_restart_prepare(char *cmd) | |||
314 | { | 314 | { |
315 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 315 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
316 | system_state = SYSTEM_RESTART; | 316 | system_state = SYSTEM_RESTART; |
317 | usermodehelper_disable(); | ||
317 | device_shutdown(); | 318 | device_shutdown(); |
318 | sysdev_shutdown(); | ||
319 | syscore_shutdown(); | 319 | syscore_shutdown(); |
320 | } | 320 | } |
321 | 321 | ||
@@ -344,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
344 | blocking_notifier_call_chain(&reboot_notifier_list, | 344 | blocking_notifier_call_chain(&reboot_notifier_list, |
345 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | 345 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); |
346 | system_state = state; | 346 | system_state = state; |
347 | usermodehelper_disable(); | ||
347 | device_shutdown(); | 348 | device_shutdown(); |
348 | } | 349 | } |
349 | /** | 350 | /** |
@@ -354,7 +355,6 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
354 | void kernel_halt(void) | 355 | void kernel_halt(void) |
355 | { | 356 | { |
356 | kernel_shutdown_prepare(SYSTEM_HALT); | 357 | kernel_shutdown_prepare(SYSTEM_HALT); |
357 | sysdev_shutdown(); | ||
358 | syscore_shutdown(); | 358 | syscore_shutdown(); |
359 | printk(KERN_EMERG "System halted.\n"); | 359 | printk(KERN_EMERG "System halted.\n"); |
360 | kmsg_dump(KMSG_DUMP_HALT); | 360 | kmsg_dump(KMSG_DUMP_HALT); |
@@ -374,7 +374,6 @@ void kernel_power_off(void) | |||
374 | if (pm_power_off_prepare) | 374 | if (pm_power_off_prepare) |
375 | pm_power_off_prepare(); | 375 | pm_power_off_prepare(); |
376 | disable_nonboot_cpus(); | 376 | disable_nonboot_cpus(); |
377 | sysdev_shutdown(); | ||
378 | syscore_shutdown(); | 377 | syscore_shutdown(); |
379 | printk(KERN_EMERG "Power down.\n"); | 378 | printk(KERN_EMERG "Power down.\n"); |
380 | kmsg_dump(KMSG_DUMP_POWEROFF); | 379 | kmsg_dump(KMSG_DUMP_POWEROFF); |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index b0425991e9ac..e2fd74b8e8c2 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
@@ -1,5 +1,5 @@ | |||
1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
2 | obj-y += timeconv.o posix-clock.o | 2 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
3 | 3 | ||
4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000000..9265014cb4db --- /dev/null +++ b/kernel/time/alarmtimer.c | |||
@@ -0,0 +1,694 @@ | |||
1 | /* | ||
2 | * Alarmtimer interface | ||
3 | * | ||
4 | * This interface provides a timer which is similar to hrtimers, | ||
5 | * but triggers an RTC alarm if the box is suspended. | ||
6 | * | ||
7 | * This interface is influenced by the Android RTC Alarm timer | ||
8 | * interface. | ||
9 | * | ||
10 | * Copyright (C) 2010 IBM Corporation | ||
11 | * | ||
12 | * Author: John Stultz <john.stultz@linaro.org> | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify | ||
15 | * it under the terms of the GNU General Public License version 2 as | ||
16 | * published by the Free Software Foundation. | ||
17 | */ | ||
18 | #include <linux/time.h> | ||
19 | #include <linux/hrtimer.h> | ||
20 | #include <linux/timerqueue.h> | ||
21 | #include <linux/rtc.h> | ||
22 | #include <linux/alarmtimer.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/platform_device.h> | ||
25 | #include <linux/posix-timers.h> | ||
26 | #include <linux/workqueue.h> | ||
27 | #include <linux/freezer.h> | ||
28 | |||
29 | /** | ||
30 | * struct alarm_base - Alarm timer bases | ||
31 | * @lock: Lock for synchronized access to the base | ||
32 | * @timerqueue: Timerqueue head managing the list of events | ||
33 | * @timer: hrtimer used to schedule events while running | ||
34 | * @gettime: Function to read the time correlating to the base | ||
35 | * @base_clockid: clockid for the base | ||
36 | */ | ||
37 | static struct alarm_base { | ||
38 | spinlock_t lock; | ||
39 | struct timerqueue_head timerqueue; | ||
40 | struct hrtimer timer; | ||
41 | ktime_t (*gettime)(void); | ||
42 | clockid_t base_clockid; | ||
43 | } alarm_bases[ALARM_NUMTYPE]; | ||
44 | |||
45 | #ifdef CONFIG_RTC_CLASS | ||
46 | /* rtc timer and device for setting alarm wakeups at suspend */ | ||
47 | static struct rtc_timer rtctimer; | ||
48 | static struct rtc_device *rtcdev; | ||
49 | #endif | ||
50 | |||
51 | /* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ | ||
52 | static ktime_t freezer_delta; | ||
53 | static DEFINE_SPINLOCK(freezer_delta_lock); | ||
54 | |||
55 | |||
56 | /** | ||
57 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue | ||
58 | * @base: pointer to the base where the timer is being run | ||
59 | * @alarm: pointer to alarm being enqueued. | ||
60 | * | ||
61 | * Adds the alarm to an alarm_base timerqueue and if necessary sets | ||
62 | * an hrtimer to run. | ||
63 | * | ||
64 | * Must hold base->lock when calling. | ||
65 | */ | ||
66 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | ||
67 | { | ||
68 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
69 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | ||
70 | hrtimer_try_to_cancel(&base->timer); | ||
71 | hrtimer_start(&base->timer, alarm->node.expires, | ||
72 | HRTIMER_MODE_ABS); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | /** | ||
77 | * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue | ||
78 | * @base: pointer to the base where the timer is running | ||
79 | * @alarm: pointer to alarm being removed | ||
80 | * | ||
81 | * Removes the alarm from an alarm_base timerqueue and if necessary sets | ||
82 | * a new timer to run. | ||
83 | * | ||
84 | * Must hold base->lock when calling. | ||
85 | */ | ||
86 | static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | ||
87 | { | ||
88 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | ||
89 | |||
90 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
91 | if (next == &alarm->node) { | ||
92 | hrtimer_try_to_cancel(&base->timer); | ||
93 | next = timerqueue_getnext(&base->timerqueue); | ||
94 | if (!next) | ||
95 | return; | ||
96 | hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); | ||
97 | } | ||
98 | } | ||
99 | |||
100 | |||
101 | /** | ||
102 | * alarmtimer_fired - Handles alarm hrtimer being fired. | ||
103 | * @timer: pointer to hrtimer being run | ||
104 | * | ||
105 | * When an alarm timer fires, this runs through the timerqueue to | ||
106 | * see which alarms expired, and runs those. If there are more alarm | ||
107 | * timers queued for the future, we set the hrtimer to fire when | ||
108 | * the next future alarm timer expires. | ||
109 | */ | ||
110 | static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | ||
111 | { | ||
112 | struct alarm_base *base = container_of(timer, struct alarm_base, timer); | ||
113 | struct timerqueue_node *next; | ||
114 | unsigned long flags; | ||
115 | ktime_t now; | ||
116 | int ret = HRTIMER_NORESTART; | ||
117 | |||
118 | spin_lock_irqsave(&base->lock, flags); | ||
119 | now = base->gettime(); | ||
120 | while ((next = timerqueue_getnext(&base->timerqueue))) { | ||
121 | struct alarm *alarm; | ||
122 | ktime_t expired = next->expires; | ||
123 | |||
124 | if (expired.tv64 >= now.tv64) | ||
125 | break; | ||
126 | |||
127 | alarm = container_of(next, struct alarm, node); | ||
128 | |||
129 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
130 | alarm->enabled = 0; | ||
131 | /* Re-add periodic timers */ | ||
132 | if (alarm->period.tv64) { | ||
133 | alarm->node.expires = ktime_add(expired, alarm->period); | ||
134 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
135 | alarm->enabled = 1; | ||
136 | } | ||
137 | spin_unlock_irqrestore(&base->lock, flags); | ||
138 | if (alarm->function) | ||
139 | alarm->function(alarm); | ||
140 | spin_lock_irqsave(&base->lock, flags); | ||
141 | } | ||
142 | |||
143 | if (next) { | ||
144 | hrtimer_set_expires(&base->timer, next->expires); | ||
145 | ret = HRTIMER_RESTART; | ||
146 | } | ||
147 | spin_unlock_irqrestore(&base->lock, flags); | ||
148 | |||
149 | return ret; | ||
150 | |||
151 | } | ||
152 | |||
153 | #ifdef CONFIG_RTC_CLASS | ||
154 | /** | ||
155 | * alarmtimer_suspend - Suspend time callback | ||
156 | * @dev: unused | ||
157 | * @state: unused | ||
158 | * | ||
159 | * When we are going into suspend, we look through the bases | ||
160 | * to see which is the soonest timer to expire. We then | ||
161 | * set an rtc timer to fire that far into the future, which | ||
162 | * will wake us from suspend. | ||
163 | */ | ||
164 | static int alarmtimer_suspend(struct device *dev) | ||
165 | { | ||
166 | struct rtc_time tm; | ||
167 | ktime_t min, now; | ||
168 | unsigned long flags; | ||
169 | int i; | ||
170 | |||
171 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
172 | min = freezer_delta; | ||
173 | freezer_delta = ktime_set(0, 0); | ||
174 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
175 | |||
176 | /* If we have no rtcdev, just return */ | ||
177 | if (!rtcdev) | ||
178 | return 0; | ||
179 | |||
180 | /* Find the soonest timer to expire */ | ||
181 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
182 | struct alarm_base *base = &alarm_bases[i]; | ||
183 | struct timerqueue_node *next; | ||
184 | ktime_t delta; | ||
185 | |||
186 | spin_lock_irqsave(&base->lock, flags); | ||
187 | next = timerqueue_getnext(&base->timerqueue); | ||
188 | spin_unlock_irqrestore(&base->lock, flags); | ||
189 | if (!next) | ||
190 | continue; | ||
191 | delta = ktime_sub(next->expires, base->gettime()); | ||
192 | if (!min.tv64 || (delta.tv64 < min.tv64)) | ||
193 | min = delta; | ||
194 | } | ||
195 | if (min.tv64 == 0) | ||
196 | return 0; | ||
197 | |||
198 | /* XXX - Should we enforce a minimum sleep time? */ | ||
199 | WARN_ON(min.tv64 < NSEC_PER_SEC); | ||
200 | |||
201 | /* Setup an rtc timer to fire that far in the future */ | ||
202 | rtc_timer_cancel(rtcdev, &rtctimer); | ||
203 | rtc_read_time(rtcdev, &tm); | ||
204 | now = rtc_tm_to_ktime(tm); | ||
205 | now = ktime_add(now, min); | ||
206 | |||
207 | rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0)); | ||
208 | |||
209 | return 0; | ||
210 | } | ||
211 | #else | ||
212 | static int alarmtimer_suspend(struct device *dev) | ||
213 | { | ||
214 | return 0; | ||
215 | } | ||
216 | #endif | ||
217 | |||
218 | static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | ||
219 | { | ||
220 | ktime_t delta; | ||
221 | unsigned long flags; | ||
222 | struct alarm_base *base = &alarm_bases[type]; | ||
223 | |||
224 | delta = ktime_sub(absexp, base->gettime()); | ||
225 | |||
226 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
227 | if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) | ||
228 | freezer_delta = delta; | ||
229 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
230 | } | ||
231 | |||
232 | |||
233 | /** | ||
234 | * alarm_init - Initialize an alarm structure | ||
235 | * @alarm: ptr to alarm to be initialized | ||
236 | * @type: the type of the alarm | ||
237 | * @function: callback that is run when the alarm fires | ||
238 | */ | ||
239 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | ||
240 | void (*function)(struct alarm *)) | ||
241 | { | ||
242 | timerqueue_init(&alarm->node); | ||
243 | alarm->period = ktime_set(0, 0); | ||
244 | alarm->function = function; | ||
245 | alarm->type = type; | ||
246 | alarm->enabled = 0; | ||
247 | } | ||
248 | |||
249 | /** | ||
250 | * alarm_start - Sets an alarm to fire | ||
251 | * @alarm: ptr to alarm to set | ||
252 | * @start: time to run the alarm | ||
253 | * @period: period at which the alarm will recur | ||
254 | */ | ||
255 | void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) | ||
256 | { | ||
257 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
258 | unsigned long flags; | ||
259 | |||
260 | spin_lock_irqsave(&base->lock, flags); | ||
261 | if (alarm->enabled) | ||
262 | alarmtimer_remove(base, alarm); | ||
263 | alarm->node.expires = start; | ||
264 | alarm->period = period; | ||
265 | alarmtimer_enqueue(base, alarm); | ||
266 | alarm->enabled = 1; | ||
267 | spin_unlock_irqrestore(&base->lock, flags); | ||
268 | } | ||
269 | |||
270 | /** | ||
271 | * alarm_cancel - Tries to cancel an alarm timer | ||
272 | * @alarm: ptr to alarm to be canceled | ||
273 | */ | ||
274 | void alarm_cancel(struct alarm *alarm) | ||
275 | { | ||
276 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
277 | unsigned long flags; | ||
278 | |||
279 | spin_lock_irqsave(&base->lock, flags); | ||
280 | if (alarm->enabled) | ||
281 | alarmtimer_remove(base, alarm); | ||
282 | alarm->enabled = 0; | ||
283 | spin_unlock_irqrestore(&base->lock, flags); | ||
284 | } | ||
285 | |||
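alarm_init(), alarm_start() and alarm_cancel() make up the in-kernel side of the API; a hedged usage sketch (the callback, the 10 second timeout and the caller are made up for illustration):

/* Sketch: fire a callback once, 10 seconds of wall time from now. */
static void my_alarm_cb(struct alarm *a)
{
	pr_info("alarm fired\n");
}

static void my_alarm_example(void)
{
	static struct alarm my_alarm;
	ktime_t expires;

	alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_cb);
	expires = ktime_add(ktime_get_real(), ktime_set(10, 0));
	alarm_start(&my_alarm, expires, ktime_set(0, 0));	/* period 0 => one-shot */

	/* ... later, if the alarm is no longer wanted: */
	alarm_cancel(&my_alarm);
}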
286 | |||
287 | /** | ||
288 | * clock2alarm - helper that converts from clockid to alarmtypes | ||
289 | * @clockid: clockid. | ||
290 | */ | ||
291 | static enum alarmtimer_type clock2alarm(clockid_t clockid) | ||
292 | { | ||
293 | if (clockid == CLOCK_REALTIME_ALARM) | ||
294 | return ALARM_REALTIME; | ||
295 | if (clockid == CLOCK_BOOTTIME_ALARM) | ||
296 | return ALARM_BOOTTIME; | ||
297 | return -1; | ||
298 | } | ||
299 | |||
300 | /** | ||
301 | * alarm_handle_timer - Callback for posix timers | ||
302 | * @alarm: alarm that fired | ||
303 | * | ||
304 | * Posix timer callback for expired alarm timers. | ||
305 | */ | ||
306 | static void alarm_handle_timer(struct alarm *alarm) | ||
307 | { | ||
308 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | ||
309 | it.alarmtimer); | ||
310 | if (posix_timer_event(ptr, 0) != 0) | ||
311 | ptr->it_overrun++; | ||
312 | } | ||
313 | |||
314 | /** | ||
315 | * alarm_clock_getres - posix getres interface | ||
316 | * @which_clock: clockid | ||
317 | * @tp: timespec to fill | ||
318 | * | ||
319 | * Returns the granularity of the underlying alarm base clock | ||
320 | */ | ||
321 | static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
322 | { | ||
323 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | ||
324 | |||
325 | return hrtimer_get_res(baseid, tp); | ||
326 | } | ||
327 | |||
328 | /** | ||
329 | * alarm_clock_get - posix clock_get interface | ||
330 | * @which_clock: clockid | ||
331 | * @tp: timespec to fill. | ||
332 | * | ||
333 | * Provides the underlying alarm base time. | ||
334 | */ | ||
335 | static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | ||
336 | { | ||
337 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | ||
338 | |||
339 | *tp = ktime_to_timespec(base->gettime()); | ||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | /** | ||
344 | * alarm_timer_create - posix timer_create interface | ||
345 | * @new_timer: k_itimer pointer to manage | ||
346 | * | ||
347 | * Initializes the k_itimer structure. | ||
348 | */ | ||
349 | static int alarm_timer_create(struct k_itimer *new_timer) | ||
350 | { | ||
351 | enum alarmtimer_type type; | ||
352 | struct alarm_base *base; | ||
353 | |||
354 | if (!capable(CAP_WAKE_ALARM)) | ||
355 | return -EPERM; | ||
356 | |||
357 | type = clock2alarm(new_timer->it_clock); | ||
358 | base = &alarm_bases[type]; | ||
359 | alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); | ||
360 | return 0; | ||
361 | } | ||
362 | |||
363 | /** | ||
364 | * alarm_timer_get - posix timer_get interface | ||
365 | * @timr: k_itimer pointer | ||
366 | * @cur_setting: itimerspec data to fill | ||
367 | * | ||
368 | * Copies the itimerspec data out from the k_itimer | ||
369 | */ | ||
370 | static void alarm_timer_get(struct k_itimer *timr, | ||
371 | struct itimerspec *cur_setting) | ||
372 | { | ||
373 | cur_setting->it_interval = | ||
374 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
375 | cur_setting->it_value = | ||
376 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
377 | return; | ||
378 | } | ||
379 | |||
380 | /** | ||
381 | * alarm_timer_del - posix timer_del interface | ||
382 | * @timr: k_itimer pointer to be deleted | ||
383 | * | ||
384 | * Cancels any programmed alarms for the given timer. | ||
385 | */ | ||
386 | static int alarm_timer_del(struct k_itimer *timr) | ||
387 | { | ||
388 | alarm_cancel(&timr->it.alarmtimer); | ||
389 | return 0; | ||
390 | } | ||
391 | |||
392 | /** | ||
393 | * alarm_timer_set - posix timer_set interface | ||
394 | * @timr: k_itimer pointer to be set | ||
395 | * @flags: timer flags | ||
396 | * @new_setting: itimerspec to be used | ||
397 | * @old_setting: itimerspec being replaced | ||
398 | * | ||
399 | * Sets the timer to new_setting, and starts the timer. | ||
400 | */ | ||
401 | static int alarm_timer_set(struct k_itimer *timr, int flags, | ||
402 | struct itimerspec *new_setting, | ||
403 | struct itimerspec *old_setting) | ||
404 | { | ||
405 | /* Save old values */ | ||
406 | old_setting->it_interval = | ||
407 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
408 | old_setting->it_value = | ||
409 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
410 | |||
411 | /* If the timer was already set, cancel it */ | ||
412 | alarm_cancel(&timr->it.alarmtimer); | ||
413 | |||
414 | /* start the timer */ | ||
415 | alarm_start(&timr->it.alarmtimer, | ||
416 | timespec_to_ktime(new_setting->it_value), | ||
417 | timespec_to_ktime(new_setting->it_interval)); | ||
418 | return 0; | ||
419 | } | ||
420 | |||
421 | /** | ||
422 | * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep | ||
423 | * @alarm: ptr to alarm that fired | ||
424 | * | ||
425 | * Wakes up the task that set the alarmtimer | ||
426 | */ | ||
427 | static void alarmtimer_nsleep_wakeup(struct alarm *alarm) | ||
428 | { | ||
429 | struct task_struct *task = (struct task_struct *)alarm->data; | ||
430 | |||
431 | alarm->data = NULL; | ||
432 | if (task) | ||
433 | wake_up_process(task); | ||
434 | } | ||
435 | |||
436 | /** | ||
437 | * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation | ||
438 | * @alarm: ptr to alarmtimer | ||
439 | * @absexp: absolute expiration time | ||
440 | * | ||
441 | * Sets the alarm timer and sleeps until it is fired or interrupted. | ||
442 | */ | ||
443 | static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | ||
444 | { | ||
445 | alarm->data = (void *)current; | ||
446 | do { | ||
447 | set_current_state(TASK_INTERRUPTIBLE); | ||
448 | alarm_start(alarm, absexp, ktime_set(0, 0)); | ||
449 | if (likely(alarm->data)) | ||
450 | schedule(); | ||
451 | |||
452 | alarm_cancel(alarm); | ||
453 | } while (alarm->data && !signal_pending(current)); | ||
454 | |||
455 | __set_current_state(TASK_RUNNING); | ||
456 | |||
457 | return (alarm->data == NULL); | ||
458 | } | ||
459 | |||
460 | |||
461 | /** | ||
462 | * update_rmtp - Update remaining timespec value | ||
463 | * @exp: expiration time | ||
464 | * @type: timer type | ||
465 | * @rmtp: user pointer to remaining timespec value | ||
466 | * | ||
467 | * Helper function that fills in rmtp value with time between | ||
468 | * now and the exp value | ||
469 | */ | ||
470 | static int update_rmtp(ktime_t exp, enum alarmtimer_type type, | ||
471 | struct timespec __user *rmtp) | ||
472 | { | ||
473 | struct timespec rmt; | ||
474 | ktime_t rem; | ||
475 | |||
476 | rem = ktime_sub(exp, alarm_bases[type].gettime()); | ||
477 | |||
478 | if (rem.tv64 <= 0) | ||
479 | return 0; | ||
480 | rmt = ktime_to_timespec(rem); | ||
481 | |||
482 | if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) | ||
483 | return -EFAULT; | ||
484 | |||
485 | return 1; | ||
486 | |||
487 | } | ||
488 | |||
489 | /** | ||
490 | * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep | ||
491 | * @restart: ptr to restart block | ||
492 | * | ||
493 | * Handles restarted clock_nanosleep calls | ||
494 | */ | ||
495 | static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) | ||
496 | { | ||
497 | enum alarmtimer_type type = restart->nanosleep.index; | ||
498 | ktime_t exp; | ||
499 | struct timespec __user *rmtp; | ||
500 | struct alarm alarm; | ||
501 | int ret = 0; | ||
502 | |||
503 | exp.tv64 = restart->nanosleep.expires; | ||
504 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
505 | |||
506 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
507 | goto out; | ||
508 | |||
509 | if (freezing(current)) | ||
510 | alarmtimer_freezerset(exp, type); | ||
511 | |||
512 | rmtp = restart->nanosleep.rmtp; | ||
513 | if (rmtp) { | ||
514 | ret = update_rmtp(exp, type, rmtp); | ||
515 | if (ret <= 0) | ||
516 | goto out; | ||
517 | } | ||
518 | |||
519 | |||
520 | /* The other values in restart are already filled in */ | ||
521 | ret = -ERESTART_RESTARTBLOCK; | ||
522 | out: | ||
523 | return ret; | ||
524 | } | ||
525 | |||
526 | /** | ||
527 | * alarm_timer_nsleep - alarmtimer nanosleep | ||
528 | * @which_clock: clockid | ||
529 | * @flags: determines abstime or relative | ||
530 | * @tsreq: requested sleep time (abs or rel) | ||
531 | * @rmtp: remaining sleep time saved | ||
532 | * | ||
533 | * Handles clock_nanosleep calls against _ALARM clockids | ||
534 | */ | ||
535 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | ||
536 | struct timespec *tsreq, struct timespec __user *rmtp) | ||
537 | { | ||
538 | enum alarmtimer_type type = clock2alarm(which_clock); | ||
539 | struct alarm alarm; | ||
540 | ktime_t exp; | ||
541 | int ret = 0; | ||
542 | struct restart_block *restart; | ||
543 | |||
544 | if (!capable(CAP_WAKE_ALARM)) | ||
545 | return -EPERM; | ||
546 | |||
547 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
548 | |||
549 | exp = timespec_to_ktime(*tsreq); | ||
550 | /* Convert (if necessary) to absolute time */ | ||
551 | if (flags != TIMER_ABSTIME) { | ||
552 | ktime_t now = alarm_bases[type].gettime(); | ||
553 | exp = ktime_add(now, exp); | ||
554 | } | ||
555 | |||
556 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
557 | goto out; | ||
558 | |||
559 | if (freezing(current)) | ||
560 | alarmtimer_freezerset(exp, type); | ||
561 | |||
562 | /* abs timers don't set remaining time or restart */ | ||
563 | if (flags == TIMER_ABSTIME) { | ||
564 | ret = -ERESTARTNOHAND; | ||
565 | goto out; | ||
566 | } | ||
567 | |||
568 | if (rmtp) { | ||
569 | ret = update_rmtp(exp, type, rmtp); | ||
570 | if (ret <= 0) | ||
571 | goto out; | ||
572 | } | ||
573 | |||
574 | restart = &current_thread_info()->restart_block; | ||
575 | restart->fn = alarm_timer_nsleep_restart; | ||
576 | restart->nanosleep.index = type; | ||
577 | restart->nanosleep.expires = exp.tv64; | ||
578 | restart->nanosleep.rmtp = rmtp; | ||
579 | ret = -ERESTART_RESTARTBLOCK; | ||
580 | |||
581 | out: | ||
582 | return ret; | ||
583 | } | ||
584 | |||
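From userspace the new clockids are reached through the usual posix clock syscalls; a hedged example of an absolute sleep on CLOCK_REALTIME_ALARM (the numeric clockid fallback is an assumption for older headers, and the caller needs CAP_WAKE_ALARM or the kernel returns EPERM):

/* Userspace sketch: sleep until an absolute CLOCK_REALTIME_ALARM deadline. */
#include <stdio.h>
#include <string.h>
#include <time.h>

#ifndef CLOCK_REALTIME_ALARM
#define CLOCK_REALTIME_ALARM 8	/* value assumed from the uapi headers */
#endif

int main(void)
{
	struct timespec deadline;
	int err;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 60;	/* one minute from now */

	err = clock_nanosleep(CLOCK_REALTIME_ALARM, TIMER_ABSTIME,
			      &deadline, NULL);
	if (err)
		fprintf(stderr, "clock_nanosleep: %s\n", strerror(err));
	return 0;
}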
585 | |||
586 | /* Suspend hook structures */ | ||
587 | static const struct dev_pm_ops alarmtimer_pm_ops = { | ||
588 | .suspend = alarmtimer_suspend, | ||
589 | }; | ||
590 | |||
591 | static struct platform_driver alarmtimer_driver = { | ||
592 | .driver = { | ||
593 | .name = "alarmtimer", | ||
594 | .pm = &alarmtimer_pm_ops, | ||
595 | } | ||
596 | }; | ||
597 | |||
598 | /** | ||
599 | * alarmtimer_init - Initialize alarm timer code | ||
600 | * | ||
601 | * This function initializes the alarm bases and registers | ||
602 | * the posix clock ids. | ||
603 | */ | ||
604 | static int __init alarmtimer_init(void) | ||
605 | { | ||
606 | int error = 0; | ||
607 | int i; | ||
608 | struct k_clock alarm_clock = { | ||
609 | .clock_getres = alarm_clock_getres, | ||
610 | .clock_get = alarm_clock_get, | ||
611 | .timer_create = alarm_timer_create, | ||
612 | .timer_set = alarm_timer_set, | ||
613 | .timer_del = alarm_timer_del, | ||
614 | .timer_get = alarm_timer_get, | ||
615 | .nsleep = alarm_timer_nsleep, | ||
616 | }; | ||
617 | |||
618 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); | ||
619 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); | ||
620 | |||
621 | /* Initialize alarm bases */ | ||
622 | alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; | ||
623 | alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; | ||
624 | alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; | ||
625 | alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; | ||
626 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
627 | timerqueue_init_head(&alarm_bases[i].timerqueue); | ||
628 | spin_lock_init(&alarm_bases[i].lock); | ||
629 | hrtimer_init(&alarm_bases[i].timer, | ||
630 | alarm_bases[i].base_clockid, | ||
631 | HRTIMER_MODE_ABS); | ||
632 | alarm_bases[i].timer.function = alarmtimer_fired; | ||
633 | } | ||
634 | error = platform_driver_register(&alarmtimer_driver); | ||
635 | platform_device_register_simple("alarmtimer", -1, NULL, 0); | ||
636 | |||
637 | return error; | ||
638 | } | ||
639 | device_initcall(alarmtimer_init); | ||
640 | |||
641 | #ifdef CONFIG_RTC_CLASS | ||
642 | /** | ||
643 | * has_wakealarm - check rtc device has wakealarm ability | ||
644 | * @dev: current device | ||
645 | * @name_ptr: name to be returned | ||
646 | * | ||
647 | * This helper function checks to see if the rtc device can wake | ||
648 | * from suspend. | ||
649 | */ | ||
650 | static int __init has_wakealarm(struct device *dev, void *name_ptr) | ||
651 | { | ||
652 | struct rtc_device *candidate = to_rtc_device(dev); | ||
653 | |||
654 | if (!candidate->ops->set_alarm) | ||
655 | return 0; | ||
656 | if (!device_may_wakeup(candidate->dev.parent)) | ||
657 | return 0; | ||
658 | |||
659 | *(const char **)name_ptr = dev_name(dev); | ||
660 | return 1; | ||
661 | } | ||
662 | |||
663 | /** | ||
664 | * alarmtimer_init_late - Late initializing of alarmtimer code | ||
665 | * | ||
666 | * This function locates an rtc device to use for wakealarms. | ||
667 | * Run as late_initcall to make sure rtc devices have been | ||
668 | * registered. | ||
669 | */ | ||
670 | static int __init alarmtimer_init_late(void) | ||
671 | { | ||
672 | char *str; | ||
673 | |||
674 | /* Find an rtc device and init the rtc_timer */ | ||
675 | class_find_device(rtc_class, NULL, &str, has_wakealarm); | ||
676 | if (str) | ||
677 | rtcdev = rtc_class_open(str); | ||
678 | if (!rtcdev) { | ||
679 | printk(KERN_WARNING "No RTC device found, ALARM timers will" | ||
680 | " not wake from suspend"); | ||
681 | } | ||
682 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
683 | |||
684 | return 0; | ||
685 | } | ||
686 | #else | ||
687 | static int __init alarmtimer_init_late(void) | ||
688 | { | ||
689 | printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers" | ||
690 | " will not wake from suspend"); | ||
691 | return 0; | ||
692 | } | ||
693 | #endif | ||
694 | late_initcall(alarmtimer_init_late); | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0d74b9ba90c8..22a9da9a9c96 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -194,6 +194,70 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
194 | } | 194 | } |
195 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 195 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
196 | 196 | ||
197 | static void clockevents_config(struct clock_event_device *dev, | ||
198 | u32 freq) | ||
199 | { | ||
200 | unsigned long sec; | ||
201 | |||
202 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
203 | return; | ||
204 | |||
205 | /* | ||
206 | * Calculate the maximum number of seconds we can sleep. Limit | ||
207 | * to 10 minutes for hardware which can program more than | ||
208 | * 32bit ticks so we still get reasonable conversion values. | ||
209 | */ | ||
210 | sec = dev->max_delta_ticks; | ||
211 | do_div(sec, freq); | ||
212 | if (!sec) | ||
213 | sec = 1; | ||
214 | else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) | ||
215 | sec = 600; | ||
216 | |||
217 | clockevents_calc_mult_shift(dev, freq, sec); | ||
218 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | ||
219 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | ||
220 | } | ||
221 | |||
222 | /** | ||
223 | * clockevents_config_and_register - Configure and register a clock event device | ||
224 | * @dev: device to register | ||
225 | * @freq: The clock frequency | ||
226 | * @min_delta: The minimum clock ticks to program in oneshot mode | ||
227 | * @max_delta: The maximum clock ticks to program in oneshot mode | ||
228 | * | ||
229 | * min/max_delta can be 0 for devices which do not support oneshot mode. | ||
230 | */ | ||
231 | void clockevents_config_and_register(struct clock_event_device *dev, | ||
232 | u32 freq, unsigned long min_delta, | ||
233 | unsigned long max_delta) | ||
234 | { | ||
235 | dev->min_delta_ticks = min_delta; | ||
236 | dev->max_delta_ticks = max_delta; | ||
237 | clockevents_config(dev, freq); | ||
238 | clockevents_register_device(dev); | ||
239 | } | ||
240 | |||
241 | /** | ||
242 | * clockevents_update_freq - Update frequency and reprogram a clock event device. | ||
243 | * @dev: device to modify | ||
244 | * @freq: new device frequency | ||
245 | * | ||
246 | * Reconfigure and reprogram a clock event device in oneshot | ||
247 | * mode. Must be called on the cpu for which the device delivers per | ||
248 | * cpu timer events with interrupts disabled! Returns 0 on success, | ||
249 | * -ETIME when the event is in the past. | ||
250 | */ | ||
251 | int clockevents_update_freq(struct clock_event_device *dev, u32 freq) | ||
252 | { | ||
253 | clockevents_config(dev, freq); | ||
254 | |||
255 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | ||
256 | return 0; | ||
257 | |||
258 | return clockevents_program_event(dev, dev->next_event, ktime_get()); | ||
259 | } | ||
260 | |||
197 | /* | 261 | /* |
198 | * Noop handler when we shut down an event device | 262 | * Noop handler when we shut down an event device |
199 | */ | 263 | */ |
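clockevents_config_and_register() folds the mult/shift setup into device registration; a hedged sketch of how a per-cpu timer driver might use it (the device fields, callbacks and 1 MHz frequency are placeholders, not a real driver):

/* Sketch: register a oneshot-capable clock event device running at 1 MHz. */
static int my_set_next_event(unsigned long delta, struct clock_event_device *evt)
{
	/* program the hardware comparator 'delta' ticks ahead */
	return 0;
}

static void my_set_mode(enum clock_event_mode mode, struct clock_event_device *evt)
{
	/* switch the hardware between periodic/oneshot/shutdown */
}

static struct clock_event_device my_clkevt = {
	.name		= "my-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT,
	.rating		= 300,
	.set_next_event	= my_set_next_event,
	.set_mode	= my_set_mode,
};

static void __init my_timer_init(void)
{
	/* 1 MHz clock, programmable between 2 and 0xffffffff ticks */
	clockevents_config_and_register(&my_clkevt, 1000000, 2, 0xffffffff);
}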
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6519cf62d9cd..d9d5f8c885f6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
626 | list_add(&cs->list, entry); | 626 | list_add(&cs->list, entry); |
627 | } | 627 | } |
628 | 628 | ||
629 | |||
630 | /* | ||
631 | * Maximum time we expect to go between ticks. This includes idle | ||
632 | * tickless time. It provides the trade off between selecting a | ||
633 | * mult/shift pair that is very precise but can only handle a short | ||
634 | * period of time, vs. a mult/shift pair that can handle long periods | ||
635 | * of time but isn't as precise. | ||
636 | * | ||
637 | * This is a subsystem constant, and actual hardware limitations | ||
638 | * may override it (ie: clocksources that wrap every 3 seconds). | ||
639 | */ | ||
640 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | ||
641 | |||
642 | /** | 629 | /** |
643 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 630 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
644 | * @t: clocksource to be registered | 631 | * @t: clocksource to be registered |
@@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
652 | */ | 639 | */ |
653 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 640 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
654 | { | 641 | { |
642 | unsigned long sec; | ||
643 | |||
655 | /* | 644 | /* |
656 | * Ideally we want to use some of the limits used in | 645 | * Calc the maximum number of seconds which we can run before |
657 | * clocksource_max_deferment, to provide a more informed | 646 | * wrapping around. For clocksources which have a mask > 32bit |
658 | * MAX_UPDATE_LENGTH. But for now this just gets the | 647 | * we need to limit the max sleep time to have a good |
659 | * register interface working properly. | 648 | * conversion precision. 10 minutes is still a reasonable |
649 | * amount. That results in a shift value of 24 for a | ||
650 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
651 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
652 | * margin as we do in clocksource_max_deferment() | ||
660 | */ | 653 | */ |
654 | sec = (cs->mask - (cs->mask >> 5)); | ||
655 | do_div(sec, freq); | ||
656 | do_div(sec, scale); | ||
657 | if (!sec) | ||
658 | sec = 1; | ||
659 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
660 | sec = 600; | ||
661 | |||
661 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 662 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
662 | NSEC_PER_SEC/scale, | 663 | NSEC_PER_SEC / scale, sec * scale); |
663 | MAX_UPDATE_LENGTH*scale); | ||
664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 664 | cs->max_idle_ns = clocksource_max_deferment(cs); |
665 | } | 665 | } |
666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
@@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
685 | /* Add clocksource to the clocksource list */ | 685 | /* Add clocksource to the clocksource list */ |
686 | mutex_lock(&clocksource_mutex); | 686 | mutex_lock(&clocksource_mutex); |
687 | clocksource_enqueue(cs); | 687 | clocksource_enqueue(cs); |
688 | clocksource_select(); | ||
689 | clocksource_enqueue_watchdog(cs); | 688 | clocksource_enqueue_watchdog(cs); |
689 | clocksource_select(); | ||
690 | mutex_unlock(&clocksource_mutex); | 690 | mutex_unlock(&clocksource_mutex); |
691 | return 0; | 691 | return 0; |
692 | } | 692 | } |
@@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs) | |||
706 | 706 | ||
707 | mutex_lock(&clocksource_mutex); | 707 | mutex_lock(&clocksource_mutex); |
708 | clocksource_enqueue(cs); | 708 | clocksource_enqueue(cs); |
709 | clocksource_select(); | ||
710 | clocksource_enqueue_watchdog(cs); | 709 | clocksource_enqueue_watchdog(cs); |
710 | clocksource_select(); | ||
711 | mutex_unlock(&clocksource_mutex); | 711 | mutex_unlock(&clocksource_mutex); |
712 | return 0; | 712 | return 0; |
713 | } | 713 | } |
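As a worked example of the new bound (numbers are illustrative): a 32-bit clocksource running at 100 MHz gives sec = (0xffffffff - 0xffffffff/32) / 100000000, roughly 41 seconds, so the mult/shift pair only needs to stay precise over about 41 s between updates. A 56-bit clocksource at 19.2 MHz would compute on the order of 3.6e9 seconds and, because its mask exceeds UINT_MAX, is capped at 600 s, the 10 minute limit mentioned in the comment.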
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index b2fa506667c0..a470154e0408 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -34,7 +34,7 @@ | |||
34 | * inaccuracies caused by missed or lost timer | 34 | * inaccuracies caused by missed or lost timer |
35 | * interrupts and the inability for the timer | 35 | * interrupts and the inability for the timer |
36 | * interrupt hardware to accurately tick at the | 36 | * interrupt hardware to accurately tick at the |
37 | * requested HZ value. It is also not reccomended | 37 | * requested HZ value. It is also not recommended |
38 | * for "tick-less" systems. | 38 | * for "tick-less" systems. |
39 | */ | 39 | */ |
40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | 40 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) |
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 25028dd4fa18..c340ca658f37 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c | |||
@@ -19,7 +19,6 @@ | |||
19 | */ | 19 | */ |
20 | #include <linux/device.h> | 20 | #include <linux/device.h> |
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/mutex.h> | ||
23 | #include <linux/posix-clock.h> | 22 | #include <linux/posix-clock.h> |
24 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
25 | #include <linux/syscalls.h> | 24 | #include <linux/syscalls.h> |
@@ -34,19 +33,19 @@ static struct posix_clock *get_posix_clock(struct file *fp) | |||
34 | { | 33 | { |
35 | struct posix_clock *clk = fp->private_data; | 34 | struct posix_clock *clk = fp->private_data; |
36 | 35 | ||
37 | mutex_lock(&clk->mutex); | 36 | down_read(&clk->rwsem); |
38 | 37 | ||
39 | if (!clk->zombie) | 38 | if (!clk->zombie) |
40 | return clk; | 39 | return clk; |
41 | 40 | ||
42 | mutex_unlock(&clk->mutex); | 41 | up_read(&clk->rwsem); |
43 | 42 | ||
44 | return NULL; | 43 | return NULL; |
45 | } | 44 | } |
46 | 45 | ||
47 | static void put_posix_clock(struct posix_clock *clk) | 46 | static void put_posix_clock(struct posix_clock *clk) |
48 | { | 47 | { |
49 | mutex_unlock(&clk->mutex); | 48 | up_read(&clk->rwsem); |
50 | } | 49 | } |
51 | 50 | ||
52 | static ssize_t posix_clock_read(struct file *fp, char __user *buf, | 51 | static ssize_t posix_clock_read(struct file *fp, char __user *buf, |
@@ -156,7 +155,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) | |||
156 | struct posix_clock *clk = | 155 | struct posix_clock *clk = |
157 | container_of(inode->i_cdev, struct posix_clock, cdev); | 156 | container_of(inode->i_cdev, struct posix_clock, cdev); |
158 | 157 | ||
159 | mutex_lock(&clk->mutex); | 158 | down_read(&clk->rwsem); |
160 | 159 | ||
161 | if (clk->zombie) { | 160 | if (clk->zombie) { |
162 | err = -ENODEV; | 161 | err = -ENODEV; |
@@ -172,7 +171,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp) | |||
172 | fp->private_data = clk; | 171 | fp->private_data = clk; |
173 | } | 172 | } |
174 | out: | 173 | out: |
175 | mutex_unlock(&clk->mutex); | 174 | up_read(&clk->rwsem); |
176 | return err; | 175 | return err; |
177 | } | 176 | } |
178 | 177 | ||
@@ -211,25 +210,20 @@ int posix_clock_register(struct posix_clock *clk, dev_t devid) | |||
211 | int err; | 210 | int err; |
212 | 211 | ||
213 | kref_init(&clk->kref); | 212 | kref_init(&clk->kref); |
214 | mutex_init(&clk->mutex); | 213 | init_rwsem(&clk->rwsem); |
215 | 214 | ||
216 | cdev_init(&clk->cdev, &posix_clock_file_operations); | 215 | cdev_init(&clk->cdev, &posix_clock_file_operations); |
217 | clk->cdev.owner = clk->ops.owner; | 216 | clk->cdev.owner = clk->ops.owner; |
218 | err = cdev_add(&clk->cdev, devid, 1); | 217 | err = cdev_add(&clk->cdev, devid, 1); |
219 | if (err) | ||
220 | goto no_cdev; | ||
221 | 218 | ||
222 | return err; | 219 | return err; |
223 | no_cdev: | ||
224 | mutex_destroy(&clk->mutex); | ||
225 | return err; | ||
226 | } | 220 | } |
227 | EXPORT_SYMBOL_GPL(posix_clock_register); | 221 | EXPORT_SYMBOL_GPL(posix_clock_register); |
228 | 222 | ||
229 | static void delete_clock(struct kref *kref) | 223 | static void delete_clock(struct kref *kref) |
230 | { | 224 | { |
231 | struct posix_clock *clk = container_of(kref, struct posix_clock, kref); | 225 | struct posix_clock *clk = container_of(kref, struct posix_clock, kref); |
232 | mutex_destroy(&clk->mutex); | 226 | |
233 | if (clk->release) | 227 | if (clk->release) |
234 | clk->release(clk); | 228 | clk->release(clk); |
235 | } | 229 | } |
@@ -238,9 +232,9 @@ void posix_clock_unregister(struct posix_clock *clk) | |||
238 | { | 232 | { |
239 | cdev_del(&clk->cdev); | 233 | cdev_del(&clk->cdev); |
240 | 234 | ||
241 | mutex_lock(&clk->mutex); | 235 | down_write(&clk->rwsem); |
242 | clk->zombie = true; | 236 | clk->zombie = true; |
243 | mutex_unlock(&clk->mutex); | 237 | up_write(&clk->rwsem); |
244 | 238 | ||
245 | kref_put(&clk->kref, delete_clock); | 239 | kref_put(&clk->kref, delete_clock); |
246 | } | 240 | } |
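The move from a mutex to an rwsem lets many chardev operations read the clock concurrently while unregistration takes the write side once to mark it dead; a minimal sketch of the pattern (generic names, not the posix-clock API):

/* Sketch: readers bail out once the object is marked dead under the
 * write lock; until then any number of them may run in parallel. */
struct guarded_obj {
	struct rw_semaphore	rwsem;
	bool			zombie;
};

static int guarded_use(struct guarded_obj *obj)
{
	down_read(&obj->rwsem);
	if (obj->zombie) {
		up_read(&obj->rwsem);
		return -ENODEV;
	}
	/* ... operate on obj ... */
	up_read(&obj->rwsem);
	return 0;
}

static void guarded_kill(struct guarded_obj *obj)
{
	down_write(&obj->rwsem);
	obj->zombie = true;
	up_write(&obj->rwsem);
}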
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da800ffa810c..723c7637e55a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -522,10 +522,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, | |||
522 | */ | 522 | */ |
523 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 523 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
524 | { | 524 | { |
525 | int cpu = smp_processor_id(); | ||
526 | |||
525 | /* Set it up only once ! */ | 527 | /* Set it up only once ! */ |
526 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 528 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
527 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 529 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; |
528 | int cpu = smp_processor_id(); | ||
529 | 530 | ||
530 | bc->event_handler = tick_handle_oneshot_broadcast; | 531 | bc->event_handler = tick_handle_oneshot_broadcast; |
531 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 532 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
@@ -551,6 +552,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
551 | tick_broadcast_set_event(tick_next_period, 1); | 552 | tick_broadcast_set_event(tick_next_period, 1); |
552 | } else | 553 | } else |
553 | bc->next_event.tv64 = KTIME_MAX; | 554 | bc->next_event.tv64 = KTIME_MAX; |
555 | } else { | ||
556 | /* | ||
557 | * The first cpu which switches to oneshot mode sets | ||
558 | * the bit for all other cpus which are in the general | ||
559 | * (periodic) broadcast mask. So the bit is set and | ||
560 | * would prevent the first broadcast enter after this | ||
561 | * from programming the bc device. | ||
562 | */ | ||
563 | tick_broadcast_clear_oneshot(cpu); | ||
554 | } | 564 | } |
555 | } | 565 | } |
556 | 566 | ||
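The new else branch clears the calling CPU's bit from the oneshot mask: the first CPU to switch already marked every CPU found in the periodic mask, and a pre-set bit would stop this CPU's first broadcast enter from programming the broadcast device. A toy bitmask sketch of that bookkeeping, with a plain unsigned long standing in for the kernel cpumask; every name below is invented for illustration.

/* Toy sketch of the oneshot-mask bookkeeping described in the comment above. */
#include <stdio.h>

static unsigned long periodic_mask = 0x0e;   /* CPUs 1-3 use periodic broadcast */
static unsigned long oneshot_mask;

/* One-time setup by the first CPU that switches: it marks every CPU that was
 * in the periodic mask, including CPUs that have not switched yet. */
static void oneshot_setup_once(void)
{
	oneshot_mask = periodic_mask;
}

/* Every later caller must drop its own pre-set bit, otherwise its first
 * broadcast enter would be treated as already programmed. */
static void oneshot_clear_self(int cpu)
{
	oneshot_mask &= ~(1UL << cpu);
}

int main(void)
{
	oneshot_setup_once();
	oneshot_clear_self(2);                        /* CPU 2 switches to oneshot */
	printf("oneshot mask: %#lx\n", oneshot_mask); /* 0xa: CPUs 1 and 3 remain */
	return 0;
}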
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ad5d576755e..8e6a05a5915a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -596,6 +596,58 @@ void __init timekeeping_init(void) | |||
596 | static struct timespec timekeeping_suspend_time; | 596 | static struct timespec timekeeping_suspend_time; |
597 | 597 | ||
598 | /** | 598 | /** |
599 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | ||
600 | * @delta: pointer to a timespec delta value | ||
601 | * | ||
602 | * Takes a timespec offset measuring a suspend interval and properly | ||
603 | * adds the sleep offset to the timekeeping variables. | ||
604 | */ | ||
605 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | ||
606 | { | ||
607 | xtime = timespec_add(xtime, *delta); | ||
608 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); | ||
609 | total_sleep_time = timespec_add(total_sleep_time, *delta); | ||
610 | } | ||
611 | |||
612 | |||
613 | /** | ||
614 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values | ||
615 | * @delta: pointer to a timespec delta value | ||
616 | * | ||
617 | * This hook is for architectures that cannot support read_persistent_clock | ||
618 | * because their RTC/persistent clock is only accessible when irqs are enabled. | ||
619 | * | ||
620 | * This function should only be called by rtc_resume(), and allows | ||
621 | * a suspend offset to be injected into the timekeeping values. | ||
622 | */ | ||
623 | void timekeeping_inject_sleeptime(struct timespec *delta) | ||
624 | { | ||
625 | unsigned long flags; | ||
626 | struct timespec ts; | ||
627 | |||
628 | /* Make sure we don't set the clock twice */ | ||
629 | read_persistent_clock(&ts); | ||
630 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | ||
631 | return; | ||
632 | |||
633 | write_seqlock_irqsave(&xtime_lock, flags); | ||
634 | timekeeping_forward_now(); | ||
635 | |||
636 | __timekeeping_inject_sleeptime(delta); | ||
637 | |||
638 | timekeeper.ntp_error = 0; | ||
639 | ntp_clear(); | ||
640 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
641 | timekeeper.mult); | ||
642 | |||
643 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
644 | |||
645 | /* signal hrtimers about time change */ | ||
646 | clock_was_set(); | ||
647 | } | ||
648 | |||
649 | |||
650 | /** | ||
599 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 651 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
600 | * | 652 | * |
601 | * This is for the generic clocksource timekeeping. | 653 | * This is for the generic clocksource timekeeping. |
@@ -615,9 +667,7 @@ static void timekeeping_resume(void) | |||
615 | 667 | ||
616 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 668 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
617 | ts = timespec_sub(ts, timekeeping_suspend_time); | 669 | ts = timespec_sub(ts, timekeeping_suspend_time); |
618 | xtime = timespec_add(xtime, ts); | 670 | __timekeeping_inject_sleeptime(&ts); |
619 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | ||
620 | total_sleep_time = timespec_add(total_sleep_time, ts); | ||
621 | } | 671 | } |
622 | /* re-base the last cycle value */ | 672 | /* re-base the last cycle value */ |
623 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 673 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
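timekeeping_inject_sleeptime() exists for RTC drivers whose persistent clock cannot be read with interrupts off; the shared helper simply shifts xtime, wall_to_monotonic and total_sleep_time by the same suspend delta. A standalone sketch of that arithmetic on struct timespec follows; ts_add()/ts_sub() are local stand-ins that mirror the kernel's timespec_add/timespec_sub, not the kernel helpers themselves.

/* Standalone sketch of __timekeeping_inject_sleeptime()'s arithmetic. */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

static struct timespec ts_add(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
	if (r.tv_nsec >= NSEC_PER_SEC) {
		r.tv_sec++;
		r.tv_nsec -= NSEC_PER_SEC;
	}
	return r;
}

static struct timespec ts_sub(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
	if (r.tv_nsec < 0) {
		r.tv_sec--;
		r.tv_nsec += NSEC_PER_SEC;
	}
	return r;
}

static struct timespec xtime, wall_to_monotonic, total_sleep_time;

/* The same three updates the new helper performs under xtime_lock. */
static void inject_sleeptime(const struct timespec *delta)
{
	xtime = ts_add(xtime, *delta);
	wall_to_monotonic = ts_sub(wall_to_monotonic, *delta);
	total_sleep_time = ts_add(total_sleep_time, *delta);
}

int main(void)
{
	struct timespec slept = { .tv_sec = 42, .tv_nsec = 500000000 };

	xtime.tv_sec = 1000;
	wall_to_monotonic.tv_sec = -1000;
	inject_sleeptime(&slept);
	printf("xtime=%lld.%09ld total_sleep=%lld.%09ld\n",
	       (long long)xtime.tv_sec, xtime.tv_nsec,
	       (long long)total_sleep_time.tv_sec, total_sleep_time.tv_nsec);
	return 0;
}

Both timekeeping_resume() and the new rtc_resume() hook end up in this one place, so the wall clock, the monotonic offset and the accumulated sleep time can never drift apart.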
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 2f3b585b8d7d..a5d0a3a85dd8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, | |||
236 | unsigned int timer_flag) | 236 | unsigned int timer_flag) |
237 | { | 237 | { |
238 | /* | 238 | /* |
239 | * It doesnt matter which lock we take: | 239 | * It doesn't matter which lock we take: |
240 | */ | 240 | */ |
241 | raw_spinlock_t *lock; | 241 | raw_spinlock_t *lock; |
242 | struct entry *entry, input; | 242 | struct entry *entry, input; |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61d7d59f4a1a..2ad39e556cb4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -141,7 +141,7 @@ if FTRACE | |||
141 | config FUNCTION_TRACER | 141 | config FUNCTION_TRACER |
142 | bool "Kernel Function Tracer" | 142 | bool "Kernel Function Tracer" |
143 | depends on HAVE_FUNCTION_TRACER | 143 | depends on HAVE_FUNCTION_TRACER |
144 | select FRAME_POINTER if !ARM_UNWIND && !S390 | 144 | select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE |
145 | select KALLSYMS | 145 | select KALLSYMS |
146 | select GENERIC_TRACER | 146 | select GENERIC_TRACER |
147 | select CONTEXT_SWITCH_TRACER | 147 | select CONTEXT_SWITCH_TRACER |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7aa40f8e182d..6957aa298dfa 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
@@ -850,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q) | |||
850 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); | 850 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); |
851 | } | 851 | } |
852 | 852 | ||
853 | static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) | 853 | static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
854 | unsigned int depth, bool explicit) | ||
854 | { | 855 | { |
855 | struct blk_trace *bt = q->blk_trace; | 856 | struct blk_trace *bt = q->blk_trace; |
856 | 857 | ||
857 | if (bt) { | 858 | if (bt) { |
858 | unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; | 859 | __be64 rpdu = cpu_to_be64(depth); |
859 | __be64 rpdu = cpu_to_be64(pdu); | 860 | u32 what; |
860 | 861 | ||
861 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, | 862 | if (explicit) |
862 | sizeof(rpdu), &rpdu); | 863 | what = BLK_TA_UNPLUG_IO; |
863 | } | 864 | else |
864 | } | 865 | what = BLK_TA_UNPLUG_TIMER; |
865 | |||
866 | static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) | ||
867 | { | ||
868 | struct blk_trace *bt = q->blk_trace; | ||
869 | |||
870 | if (bt) { | ||
871 | unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE]; | ||
872 | __be64 rpdu = cpu_to_be64(pdu); | ||
873 | 866 | ||
874 | __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0, | 867 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); |
875 | sizeof(rpdu), &rpdu); | ||
876 | } | 868 | } |
877 | } | 869 | } |
878 | 870 | ||
@@ -1015,9 +1007,7 @@ static void blk_register_tracepoints(void) | |||
1015 | WARN_ON(ret); | 1007 | WARN_ON(ret); |
1016 | ret = register_trace_block_plug(blk_add_trace_plug, NULL); | 1008 | ret = register_trace_block_plug(blk_add_trace_plug, NULL); |
1017 | WARN_ON(ret); | 1009 | WARN_ON(ret); |
1018 | ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | 1010 | ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); |
1019 | WARN_ON(ret); | ||
1020 | ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | ||
1021 | WARN_ON(ret); | 1011 | WARN_ON(ret); |
1022 | ret = register_trace_block_split(blk_add_trace_split, NULL); | 1012 | ret = register_trace_block_split(blk_add_trace_split, NULL); |
1023 | WARN_ON(ret); | 1013 | WARN_ON(ret); |
@@ -1032,8 +1022,7 @@ static void blk_unregister_tracepoints(void) | |||
1032 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); | 1022 | unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); |
1033 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); | 1023 | unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); |
1034 | unregister_trace_block_split(blk_add_trace_split, NULL); | 1024 | unregister_trace_block_split(blk_add_trace_split, NULL); |
1035 | unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); | 1025 | unregister_trace_block_unplug(blk_add_trace_unplug, NULL); |
1036 | unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); | ||
1037 | unregister_trace_block_plug(blk_add_trace_plug, NULL); | 1026 | unregister_trace_block_plug(blk_add_trace_plug, NULL); |
1038 | unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); | 1027 | unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); |
1039 | unregister_trace_block_getrq(blk_add_trace_getrq, NULL); | 1028 | unregister_trace_block_getrq(blk_add_trace_getrq, NULL); |
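The two blktrace unplug callbacks collapse into one handler that picks the trace action from the new explicit flag and ships the queue depth as a big-endian 64-bit payload. A userspace sketch of that selection and encoding; htobe64() stands in for the kernel's cpu_to_be64(), and the DEMO_TA_* constants are illustrative substitutes for BLK_TA_UNPLUG_IO/BLK_TA_UNPLUG_TIMER.

/* Sketch of the merged unplug handler's event selection and pdu encoding. */
#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { DEMO_TA_UNPLUG_IO = 1, DEMO_TA_UNPLUG_TIMER = 2 };

static void add_unplug_trace(unsigned int depth, bool explicit_unplug)
{
	uint64_t rpdu = htobe64(depth);               /* payload is big-endian */
	int what = explicit_unplug ? DEMO_TA_UNPLUG_IO : DEMO_TA_UNPLUG_TIMER;

	printf("what=%d pdu(be)=%#018llx\n", what, (unsigned long long)rpdu);
}

int main(void)
{
	add_unplug_trace(16, true);    /* explicit unplug */
	add_unplug_trace(4, false);    /* timer unplug */
	return 0;
}

Passing the depth in from the tracepoint also removes the handler's dependence on q->rq.count[], which is why the old per-queue computation disappears above.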
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c075f4ea6b94..d017c2c82c44 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -39,20 +39,26 @@ | |||
39 | #include "trace_stat.h" | 39 | #include "trace_stat.h" |
40 | 40 | ||
41 | #define FTRACE_WARN_ON(cond) \ | 41 | #define FTRACE_WARN_ON(cond) \ |
42 | do { \ | 42 | ({ \ |
43 | if (WARN_ON(cond)) \ | 43 | int ___r = cond; \ |
44 | if (WARN_ON(___r)) \ | ||
44 | ftrace_kill(); \ | 45 | ftrace_kill(); \ |
45 | } while (0) | 46 | ___r; \ |
47 | }) | ||
46 | 48 | ||
47 | #define FTRACE_WARN_ON_ONCE(cond) \ | 49 | #define FTRACE_WARN_ON_ONCE(cond) \ |
48 | do { \ | 50 | ({ \ |
49 | if (WARN_ON_ONCE(cond)) \ | 51 | int ___r = cond; \ |
52 | if (WARN_ON_ONCE(___r)) \ | ||
50 | ftrace_kill(); \ | 53 | ftrace_kill(); \ |
51 | } while (0) | 54 | ___r; \ |
55 | }) | ||
52 | 56 | ||
53 | /* hash bits for specific function selection */ | 57 | /* hash bits for specific function selection */ |
54 | #define FTRACE_HASH_BITS 7 | 58 | #define FTRACE_HASH_BITS 7 |
55 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) | 59 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) |
60 | #define FTRACE_HASH_DEFAULT_BITS 10 | ||
61 | #define FTRACE_HASH_MAX_BITS 12 | ||
56 | 62 | ||
57 | /* ftrace_enabled is a method to turn ftrace on or off */ | 63 | /* ftrace_enabled is a method to turn ftrace on or off */ |
58 | int ftrace_enabled __read_mostly; | 64 | int ftrace_enabled __read_mostly; |
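FTRACE_WARN_ON() moves from a do { } while (0) wrapper to a GNU statement expression so the macro can both react to the condition and hand its value back to the caller, e.g. inside another if (). A tiny standalone illustration of the idiom; demo_warn_on() and kill_demo() are made-up stand-ins for WARN_ON() and ftrace_kill(), and the ({ ... }) form relies on the GCC/clang extension.

/* Why the ({ ... }) form: evaluate the condition once, react to it,
 * and still return the result to the caller. */
#include <stdio.h>

static int demo_warn_on(int cond)
{
	if (cond)
		fprintf(stderr, "warning triggered\n");
	return cond;
}

static void kill_demo(void)
{
	fprintf(stderr, "tracing disabled\n");
}

#define DEMO_WARN_ON(cond)		\
({					\
	int ___r = !!(cond);		\
	if (demo_warn_on(___r))		\
		kill_demo();		\
	___r;				\
})

int main(void)
{
	/* The old do { } while (0) form could not be used as an expression: */
	if (DEMO_WARN_ON(1 + 1 != 2))
		return 1;
	printf("condition was fine, macro returned 0\n");
	return 0;
}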
@@ -81,23 +87,29 @@ static struct ftrace_ops ftrace_list_end __read_mostly = | |||
81 | .func = ftrace_stub, | 87 | .func = ftrace_stub, |
82 | }; | 88 | }; |
83 | 89 | ||
84 | static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; | 90 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
91 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | ||
85 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 92 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
86 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 93 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
87 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 94 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
95 | static struct ftrace_ops global_ops; | ||
96 | |||
97 | static void | ||
98 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | ||
88 | 99 | ||
89 | /* | 100 | /* |
90 | * Traverse the ftrace_list, invoking all entries. The reason that we | 101 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
91 | * can use rcu_dereference_raw() is that elements removed from this list | 102 | * can use rcu_dereference_raw() is that elements removed from this list |
92 | * are simply leaked, so there is no need to interact with a grace-period | 103 | * are simply leaked, so there is no need to interact with a grace-period |
93 | * mechanism. The rcu_dereference_raw() calls are needed to handle | 104 | * mechanism. The rcu_dereference_raw() calls are needed to handle |
94 | * concurrent insertions into the ftrace_list. | 105 | * concurrent insertions into the ftrace_global_list. |
95 | * | 106 | * |
96 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 107 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
97 | */ | 108 | */ |
98 | static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) | 109 | static void ftrace_global_list_func(unsigned long ip, |
110 | unsigned long parent_ip) | ||
99 | { | 111 | { |
100 | struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ | 112 | struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/ |
101 | 113 | ||
102 | while (op != &ftrace_list_end) { | 114 | while (op != &ftrace_list_end) { |
103 | op->func(ip, parent_ip); | 115 | op->func(ip, parent_ip); |
@@ -147,46 +159,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | |||
147 | } | 159 | } |
148 | #endif | 160 | #endif |
149 | 161 | ||
150 | static int __register_ftrace_function(struct ftrace_ops *ops) | 162 | static void update_global_ops(void) |
151 | { | 163 | { |
152 | ops->next = ftrace_list; | 164 | ftrace_func_t func; |
165 | |||
153 | /* | 166 | /* |
154 | * We are entering ops into the ftrace_list but another | 167 | * If there's only one function registered, then call that |
155 | * CPU might be walking that list. We need to make sure | 168 | * function directly. Otherwise, we need to iterate over the |
156 | * the ops->next pointer is valid before another CPU sees | 169 | * registered callers. |
157 | * the ops pointer included into the ftrace_list. | ||
158 | */ | 170 | */ |
159 | rcu_assign_pointer(ftrace_list, ops); | 171 | if (ftrace_global_list == &ftrace_list_end || |
172 | ftrace_global_list->next == &ftrace_list_end) | ||
173 | func = ftrace_global_list->func; | ||
174 | else | ||
175 | func = ftrace_global_list_func; | ||
160 | 176 | ||
161 | if (ftrace_enabled) { | 177 | /* If we filter on pids, update to use the pid function */ |
162 | ftrace_func_t func; | 178 | if (!list_empty(&ftrace_pids)) { |
179 | set_ftrace_pid_function(func); | ||
180 | func = ftrace_pid_func; | ||
181 | } | ||
163 | 182 | ||
164 | if (ops->next == &ftrace_list_end) | 183 | global_ops.func = func; |
165 | func = ops->func; | 184 | } |
166 | else | ||
167 | func = ftrace_list_func; | ||
168 | 185 | ||
169 | if (!list_empty(&ftrace_pids)) { | 186 | static void update_ftrace_function(void) |
170 | set_ftrace_pid_function(func); | 187 | { |
171 | func = ftrace_pid_func; | 188 | ftrace_func_t func; |
172 | } | 189 | |
190 | update_global_ops(); | ||
191 | |||
192 | /* | ||
193 | * If we are at the end of the list and this ops is | ||
194 | * not dynamic, then have the mcount trampoline call | ||
195 | * the function directly | ||
196 | */ | ||
197 | if (ftrace_ops_list == &ftrace_list_end || | ||
198 | (ftrace_ops_list->next == &ftrace_list_end && | ||
199 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) | ||
200 | func = ftrace_ops_list->func; | ||
201 | else | ||
202 | func = ftrace_ops_list_func; | ||
173 | 203 | ||
174 | /* | ||
175 | * For one func, simply call it directly. | ||
176 | * For more than one func, call the chain. | ||
177 | */ | ||
178 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 204 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
179 | ftrace_trace_function = func; | 205 | ftrace_trace_function = func; |
180 | #else | 206 | #else |
181 | __ftrace_trace_function = func; | 207 | __ftrace_trace_function = func; |
182 | ftrace_trace_function = ftrace_test_stop_func; | 208 | ftrace_trace_function = ftrace_test_stop_func; |
183 | #endif | 209 | #endif |
184 | } | 210 | } |
185 | 211 | ||
186 | return 0; | 212 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
213 | { | ||
214 | ops->next = *list; | ||
215 | /* | ||
216 | * We are entering ops into the list but another | ||
217 | * CPU might be walking that list. We need to make sure | ||
218 | * the ops->next pointer is valid before another CPU sees | ||
219 | * the ops pointer included into the list. | ||
220 | */ | ||
221 | rcu_assign_pointer(*list, ops); | ||
187 | } | 222 | } |
188 | 223 | ||
189 | static int __unregister_ftrace_function(struct ftrace_ops *ops) | 224 | static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
190 | { | 225 | { |
191 | struct ftrace_ops **p; | 226 | struct ftrace_ops **p; |
192 | 227 | ||
@@ -194,13 +229,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
194 | * If we are removing the last function, then simply point | 229 | * If we are removing the last function, then simply point |
195 | * to the ftrace_stub. | 230 | * to the ftrace_stub. |
196 | */ | 231 | */ |
197 | if (ftrace_list == ops && ops->next == &ftrace_list_end) { | 232 | if (*list == ops && ops->next == &ftrace_list_end) { |
198 | ftrace_trace_function = ftrace_stub; | 233 | *list = &ftrace_list_end; |
199 | ftrace_list = &ftrace_list_end; | ||
200 | return 0; | 234 | return 0; |
201 | } | 235 | } |
202 | 236 | ||
203 | for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) | 237 | for (p = list; *p != &ftrace_list_end; p = &(*p)->next) |
204 | if (*p == ops) | 238 | if (*p == ops) |
205 | break; | 239 | break; |
206 | 240 | ||
@@ -208,53 +242,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
208 | return -1; | 242 | return -1; |
209 | 243 | ||
210 | *p = (*p)->next; | 244 | *p = (*p)->next; |
245 | return 0; | ||
246 | } | ||
211 | 247 | ||
212 | if (ftrace_enabled) { | 248 | static int __register_ftrace_function(struct ftrace_ops *ops) |
213 | /* If we only have one func left, then call that directly */ | 249 | { |
214 | if (ftrace_list->next == &ftrace_list_end) { | 250 | if (ftrace_disabled) |
215 | ftrace_func_t func = ftrace_list->func; | 251 | return -ENODEV; |
216 | 252 | ||
217 | if (!list_empty(&ftrace_pids)) { | 253 | if (FTRACE_WARN_ON(ops == &global_ops)) |
218 | set_ftrace_pid_function(func); | 254 | return -EINVAL; |
219 | func = ftrace_pid_func; | 255 | |
220 | } | 256 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
221 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 257 | return -EBUSY; |
222 | ftrace_trace_function = func; | 258 | |
223 | #else | 259 | if (!core_kernel_data((unsigned long)ops)) |
224 | __ftrace_trace_function = func; | 260 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
225 | #endif | 261 | |
226 | } | 262 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
227 | } | 263 | int first = ftrace_global_list == &ftrace_list_end; |
264 | add_ftrace_ops(&ftrace_global_list, ops); | ||
265 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
266 | if (first) | ||
267 | add_ftrace_ops(&ftrace_ops_list, &global_ops); | ||
268 | } else | ||
269 | add_ftrace_ops(&ftrace_ops_list, ops); | ||
270 | |||
271 | if (ftrace_enabled) | ||
272 | update_ftrace_function(); | ||
228 | 273 | ||
229 | return 0; | 274 | return 0; |
230 | } | 275 | } |
231 | 276 | ||
232 | static void ftrace_update_pid_func(void) | 277 | static int __unregister_ftrace_function(struct ftrace_ops *ops) |
233 | { | 278 | { |
234 | ftrace_func_t func; | 279 | int ret; |
235 | 280 | ||
236 | if (ftrace_trace_function == ftrace_stub) | 281 | if (ftrace_disabled) |
237 | return; | 282 | return -ENODEV; |
238 | 283 | ||
239 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 284 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
240 | func = ftrace_trace_function; | 285 | return -EBUSY; |
241 | #else | ||
242 | func = __ftrace_trace_function; | ||
243 | #endif | ||
244 | 286 | ||
245 | if (!list_empty(&ftrace_pids)) { | 287 | if (FTRACE_WARN_ON(ops == &global_ops)) |
246 | set_ftrace_pid_function(func); | 288 | return -EINVAL; |
247 | func = ftrace_pid_func; | ||
248 | } else { | ||
249 | if (func == ftrace_pid_func) | ||
250 | func = ftrace_pid_function; | ||
251 | } | ||
252 | 289 | ||
253 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 290 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
254 | ftrace_trace_function = func; | 291 | ret = remove_ftrace_ops(&ftrace_global_list, ops); |
255 | #else | 292 | if (!ret && ftrace_global_list == &ftrace_list_end) |
256 | __ftrace_trace_function = func; | 293 | ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); |
257 | #endif | 294 | if (!ret) |
295 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
296 | } else | ||
297 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); | ||
298 | |||
299 | if (ret < 0) | ||
300 | return ret; | ||
301 | |||
302 | if (ftrace_enabled) | ||
303 | update_ftrace_function(); | ||
304 | |||
305 | /* | ||
306 | * Dynamic ops may be freed, we must make sure that all | ||
307 | * callers are done before leaving this function. | ||
308 | */ | ||
309 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
310 | synchronize_sched(); | ||
311 | |||
312 | return 0; | ||
313 | } | ||
314 | |||
315 | static void ftrace_update_pid_func(void) | ||
316 | { | ||
317 | /* Only do something if we are tracing something */ | ||
318 | if (ftrace_trace_function == ftrace_stub) | ||
319 | return; | ||
320 | |||
321 | update_ftrace_function(); | ||
258 | } | 322 | } |
259 | 323 | ||
260 | #ifdef CONFIG_FUNCTION_PROFILER | 324 | #ifdef CONFIG_FUNCTION_PROFILER |
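update_ftrace_function() picks the cheapest dispatch that is still correct: with exactly one non-dynamic ops registered, mcount jumps straight to its callback, otherwise it goes through a list-walking thunk. A tiny sketch of that single-callback-versus-iterator decision with plain function pointers; all demo_* names are invented, and the registration/locking details of the real code are omitted.

/* Sketch of the "call one callback directly vs. walk the list" decision. */
#include <stdio.h>

typedef void (*trace_fn)(unsigned long ip);

struct demo_ops {
	struct demo_ops *next;
	trace_fn func;
};

static struct demo_ops demo_list_end = { .next = &demo_list_end };
static struct demo_ops *demo_ops_list = &demo_list_end;
static trace_fn demo_trace_function;

/* The slow path: iterate every registered ops. */
static void demo_list_func(unsigned long ip)
{
	struct demo_ops *op;

	for (op = demo_ops_list; op != &demo_list_end; op = op->next)
		op->func(ip);
}

static void demo_update_function(void)
{
	/* Exactly one ops registered?  Then dispatch to it directly. */
	if (demo_ops_list != &demo_list_end &&
	    demo_ops_list->next == &demo_list_end)
		demo_trace_function = demo_ops_list->func;
	else
		demo_trace_function = demo_list_func;
}

static void cb_a(unsigned long ip) { printf("A traced %#lx\n", ip); }
static void cb_b(unsigned long ip) { printf("B traced %#lx\n", ip); }

static void demo_register(struct demo_ops *ops)
{
	ops->next = demo_ops_list;
	demo_ops_list = ops;
	demo_update_function();
}

int main(void)
{
	struct demo_ops a = { .func = cb_a }, b = { .func = cb_b };

	demo_register(&a);
	demo_trace_function(0x1234);    /* direct call to cb_a */
	demo_register(&b);
	demo_trace_function(0x1234);    /* goes through demo_list_func */
	return 0;
}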
@@ -888,8 +952,35 @@ enum { | |||
888 | FTRACE_START_FUNC_RET = (1 << 3), | 952 | FTRACE_START_FUNC_RET = (1 << 3), |
889 | FTRACE_STOP_FUNC_RET = (1 << 4), | 953 | FTRACE_STOP_FUNC_RET = (1 << 4), |
890 | }; | 954 | }; |
955 | struct ftrace_func_entry { | ||
956 | struct hlist_node hlist; | ||
957 | unsigned long ip; | ||
958 | }; | ||
891 | 959 | ||
892 | static int ftrace_filtered; | 960 | struct ftrace_hash { |
961 | unsigned long size_bits; | ||
962 | struct hlist_head *buckets; | ||
963 | unsigned long count; | ||
964 | struct rcu_head rcu; | ||
965 | }; | ||
966 | |||
967 | /* | ||
968 | * We make these constant because no one should touch them, | ||
969 | * but they are used as the default "empty hash", to avoid allocating | ||
970 | * it all the time. These are in a read only section such that if | ||
971 | * anyone does try to modify it, it will cause an exception. | ||
972 | */ | ||
973 | static const struct hlist_head empty_buckets[1]; | ||
974 | static const struct ftrace_hash empty_hash = { | ||
975 | .buckets = (struct hlist_head *)empty_buckets, | ||
976 | }; | ||
977 | #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) | ||
978 | |||
979 | static struct ftrace_ops global_ops = { | ||
980 | .func = ftrace_stub, | ||
981 | .notrace_hash = EMPTY_HASH, | ||
982 | .filter_hash = EMPTY_HASH, | ||
983 | }; | ||
893 | 984 | ||
894 | static struct dyn_ftrace *ftrace_new_addrs; | 985 | static struct dyn_ftrace *ftrace_new_addrs; |
895 | 986 | ||
@@ -912,6 +1003,269 @@ static struct ftrace_page *ftrace_pages; | |||
912 | 1003 | ||
913 | static struct dyn_ftrace *ftrace_free_records; | 1004 | static struct dyn_ftrace *ftrace_free_records; |
914 | 1005 | ||
1006 | static struct ftrace_func_entry * | ||
1007 | ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | ||
1008 | { | ||
1009 | unsigned long key; | ||
1010 | struct ftrace_func_entry *entry; | ||
1011 | struct hlist_head *hhd; | ||
1012 | struct hlist_node *n; | ||
1013 | |||
1014 | if (!hash->count) | ||
1015 | return NULL; | ||
1016 | |||
1017 | if (hash->size_bits > 0) | ||
1018 | key = hash_long(ip, hash->size_bits); | ||
1019 | else | ||
1020 | key = 0; | ||
1021 | |||
1022 | hhd = &hash->buckets[key]; | ||
1023 | |||
1024 | hlist_for_each_entry_rcu(entry, n, hhd, hlist) { | ||
1025 | if (entry->ip == ip) | ||
1026 | return entry; | ||
1027 | } | ||
1028 | return NULL; | ||
1029 | } | ||
1030 | |||
1031 | static void __add_hash_entry(struct ftrace_hash *hash, | ||
1032 | struct ftrace_func_entry *entry) | ||
1033 | { | ||
1034 | struct hlist_head *hhd; | ||
1035 | unsigned long key; | ||
1036 | |||
1037 | if (hash->size_bits) | ||
1038 | key = hash_long(entry->ip, hash->size_bits); | ||
1039 | else | ||
1040 | key = 0; | ||
1041 | |||
1042 | hhd = &hash->buckets[key]; | ||
1043 | hlist_add_head(&entry->hlist, hhd); | ||
1044 | hash->count++; | ||
1045 | } | ||
1046 | |||
1047 | static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) | ||
1048 | { | ||
1049 | struct ftrace_func_entry *entry; | ||
1050 | |||
1051 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
1052 | if (!entry) | ||
1053 | return -ENOMEM; | ||
1054 | |||
1055 | entry->ip = ip; | ||
1056 | __add_hash_entry(hash, entry); | ||
1057 | |||
1058 | return 0; | ||
1059 | } | ||
1060 | |||
1061 | static void | ||
1062 | free_hash_entry(struct ftrace_hash *hash, | ||
1063 | struct ftrace_func_entry *entry) | ||
1064 | { | ||
1065 | hlist_del(&entry->hlist); | ||
1066 | kfree(entry); | ||
1067 | hash->count--; | ||
1068 | } | ||
1069 | |||
1070 | static void | ||
1071 | remove_hash_entry(struct ftrace_hash *hash, | ||
1072 | struct ftrace_func_entry *entry) | ||
1073 | { | ||
1074 | hlist_del(&entry->hlist); | ||
1075 | hash->count--; | ||
1076 | } | ||
1077 | |||
1078 | static void ftrace_hash_clear(struct ftrace_hash *hash) | ||
1079 | { | ||
1080 | struct hlist_head *hhd; | ||
1081 | struct hlist_node *tp, *tn; | ||
1082 | struct ftrace_func_entry *entry; | ||
1083 | int size = 1 << hash->size_bits; | ||
1084 | int i; | ||
1085 | |||
1086 | if (!hash->count) | ||
1087 | return; | ||
1088 | |||
1089 | for (i = 0; i < size; i++) { | ||
1090 | hhd = &hash->buckets[i]; | ||
1091 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) | ||
1092 | free_hash_entry(hash, entry); | ||
1093 | } | ||
1094 | FTRACE_WARN_ON(hash->count); | ||
1095 | } | ||
1096 | |||
1097 | static void free_ftrace_hash(struct ftrace_hash *hash) | ||
1098 | { | ||
1099 | if (!hash || hash == EMPTY_HASH) | ||
1100 | return; | ||
1101 | ftrace_hash_clear(hash); | ||
1102 | kfree(hash->buckets); | ||
1103 | kfree(hash); | ||
1104 | } | ||
1105 | |||
1106 | static void __free_ftrace_hash_rcu(struct rcu_head *rcu) | ||
1107 | { | ||
1108 | struct ftrace_hash *hash; | ||
1109 | |||
1110 | hash = container_of(rcu, struct ftrace_hash, rcu); | ||
1111 | free_ftrace_hash(hash); | ||
1112 | } | ||
1113 | |||
1114 | static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | ||
1115 | { | ||
1116 | if (!hash || hash == EMPTY_HASH) | ||
1117 | return; | ||
1118 | call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); | ||
1119 | } | ||
1120 | |||
1121 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | ||
1122 | { | ||
1123 | struct ftrace_hash *hash; | ||
1124 | int size; | ||
1125 | |||
1126 | hash = kzalloc(sizeof(*hash), GFP_KERNEL); | ||
1127 | if (!hash) | ||
1128 | return NULL; | ||
1129 | |||
1130 | size = 1 << size_bits; | ||
1131 | hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); | ||
1132 | |||
1133 | if (!hash->buckets) { | ||
1134 | kfree(hash); | ||
1135 | return NULL; | ||
1136 | } | ||
1137 | |||
1138 | hash->size_bits = size_bits; | ||
1139 | |||
1140 | return hash; | ||
1141 | } | ||
1142 | |||
1143 | static struct ftrace_hash * | ||
1144 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | ||
1145 | { | ||
1146 | struct ftrace_func_entry *entry; | ||
1147 | struct ftrace_hash *new_hash; | ||
1148 | struct hlist_node *tp; | ||
1149 | int size; | ||
1150 | int ret; | ||
1151 | int i; | ||
1152 | |||
1153 | new_hash = alloc_ftrace_hash(size_bits); | ||
1154 | if (!new_hash) | ||
1155 | return NULL; | ||
1156 | |||
1157 | /* Empty hash? */ | ||
1158 | if (!hash || !hash->count) | ||
1159 | return new_hash; | ||
1160 | |||
1161 | size = 1 << hash->size_bits; | ||
1162 | for (i = 0; i < size; i++) { | ||
1163 | hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { | ||
1164 | ret = add_hash_entry(new_hash, entry->ip); | ||
1165 | if (ret < 0) | ||
1166 | goto free_hash; | ||
1167 | } | ||
1168 | } | ||
1169 | |||
1170 | FTRACE_WARN_ON(new_hash->count != hash->count); | ||
1171 | |||
1172 | return new_hash; | ||
1173 | |||
1174 | free_hash: | ||
1175 | free_ftrace_hash(new_hash); | ||
1176 | return NULL; | ||
1177 | } | ||
1178 | |||
1179 | static int | ||
1180 | ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src) | ||
1181 | { | ||
1182 | struct ftrace_func_entry *entry; | ||
1183 | struct hlist_node *tp, *tn; | ||
1184 | struct hlist_head *hhd; | ||
1185 | struct ftrace_hash *old_hash; | ||
1186 | struct ftrace_hash *new_hash; | ||
1187 | unsigned long key; | ||
1188 | int size = src->count; | ||
1189 | int bits = 0; | ||
1190 | int i; | ||
1191 | |||
1192 | /* | ||
1193 | * If the new source is empty, just free dst and assign it | ||
1194 | * the empty_hash. | ||
1195 | */ | ||
1196 | if (!src->count) { | ||
1197 | free_ftrace_hash_rcu(*dst); | ||
1198 | rcu_assign_pointer(*dst, EMPTY_HASH); | ||
1199 | return 0; | ||
1200 | } | ||
1201 | |||
1202 | /* | ||
1203 | * Make the hash size about 1/2 the # found | ||
1204 | */ | ||
1205 | for (size /= 2; size; size >>= 1) | ||
1206 | bits++; | ||
1207 | |||
1208 | /* Don't allocate too much */ | ||
1209 | if (bits > FTRACE_HASH_MAX_BITS) | ||
1210 | bits = FTRACE_HASH_MAX_BITS; | ||
1211 | |||
1212 | new_hash = alloc_ftrace_hash(bits); | ||
1213 | if (!new_hash) | ||
1214 | return -ENOMEM; | ||
1215 | |||
1216 | size = 1 << src->size_bits; | ||
1217 | for (i = 0; i < size; i++) { | ||
1218 | hhd = &src->buckets[i]; | ||
1219 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { | ||
1220 | if (bits > 0) | ||
1221 | key = hash_long(entry->ip, bits); | ||
1222 | else | ||
1223 | key = 0; | ||
1224 | remove_hash_entry(src, entry); | ||
1225 | __add_hash_entry(new_hash, entry); | ||
1226 | } | ||
1227 | } | ||
1228 | |||
1229 | old_hash = *dst; | ||
1230 | rcu_assign_pointer(*dst, new_hash); | ||
1231 | free_ftrace_hash_rcu(old_hash); | ||
1232 | |||
1233 | return 0; | ||
1234 | } | ||
1235 | |||
1236 | /* | ||
1237 | * Test the hashes for this ops to see if we want to call | ||
1238 | * the ops->func or not. | ||
1239 | * | ||
1240 | * It's a match if the ip is in the ops->filter_hash or | ||
1241 | * the filter_hash does not exist or is empty, | ||
1242 | * AND | ||
1243 | * the ip is not in the ops->notrace_hash. | ||
1244 | * | ||
1245 | * This needs to be called with preemption disabled as | ||
1246 | * the hashes are freed with call_rcu_sched(). | ||
1247 | */ | ||
1248 | static int | ||
1249 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
1250 | { | ||
1251 | struct ftrace_hash *filter_hash; | ||
1252 | struct ftrace_hash *notrace_hash; | ||
1253 | int ret; | ||
1254 | |||
1255 | filter_hash = rcu_dereference_raw(ops->filter_hash); | ||
1256 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | ||
1257 | |||
1258 | if ((!filter_hash || !filter_hash->count || | ||
1259 | ftrace_lookup_ip(filter_hash, ip)) && | ||
1260 | (!notrace_hash || !notrace_hash->count || | ||
1261 | !ftrace_lookup_ip(notrace_hash, ip))) | ||
1262 | ret = 1; | ||
1263 | else | ||
1264 | ret = 0; | ||
1265 | |||
1266 | return ret; | ||
1267 | } | ||
1268 | |||
915 | /* | 1269 | /* |
916 | * This is a double for. Do not use 'break' to break out of the loop, | 1270 | * This is a double for. Do not use 'break' to break out of the loop, |
917 | * you must use a goto. | 1271 | * you must use a goto. |
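The new ftrace_hash keeps filtered/notraced instruction pointers in power-of-two buckets keyed by hash_long(ip, size_bits), so ftrace_ops_test() can answer "should this ops see this ip?" with two lookups under RCU. A compact userspace analogue of the bucket scheme with singly linked chains; demo_hash, demo_lookup(), the hash constant and demo_ops_test() are illustrative, not the kernel hlist/hash_long implementation, and the RCU protection is left out.

/* Userspace analogue of the ftrace_hash bucket scheme. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_HASH_BITS 4   /* 16 buckets; the kernel default above is 10 bits */

struct demo_entry {
	struct demo_entry *next;
	uint64_t ip;
};

struct demo_hash {
	struct demo_entry *buckets[1 << DEMO_HASH_BITS];
	unsigned long count;
};

static unsigned long demo_key(uint64_t ip)
{
	/* Multiplicative hash folded down to DEMO_HASH_BITS. */
	return (unsigned long)((ip * 0x9E3779B97F4A7C15ULL) >> (64 - DEMO_HASH_BITS));
}

static struct demo_entry *demo_lookup(struct demo_hash *hash, uint64_t ip)
{
	struct demo_entry *e;

	for (e = hash->buckets[demo_key(ip)]; e; e = e->next)
		if (e->ip == ip)
			return e;
	return NULL;
}

static int demo_add(struct demo_hash *hash, uint64_t ip)
{
	unsigned long key = demo_key(ip);
	struct demo_entry *e = malloc(sizeof(*e));

	if (!e)
		return -1;
	e->ip = ip;
	e->next = hash->buckets[key];
	hash->buckets[key] = e;
	hash->count++;
	return 0;
}

/* Mirror of ftrace_ops_test(): trace ip if the filter hash is empty or
 * contains it, and the notrace hash does not contain it. */
static int demo_ops_test(struct demo_hash *filter, struct demo_hash *notrace,
			 uint64_t ip)
{
	return (!filter->count || demo_lookup(filter, ip)) &&
	       !demo_lookup(notrace, ip);
}

int main(void)
{
	struct demo_hash filter = { .count = 0 }, notrace = { .count = 0 };

	demo_add(&filter, 0x10);
	demo_add(&notrace, 0x20);
	printf("0x10 traced? %d\n", demo_ops_test(&filter, &notrace, 0x10));
	printf("0x20 traced? %d\n", demo_ops_test(&filter, &notrace, 0x20));
	printf("0x30 traced? %d\n", demo_ops_test(&filter, &notrace, 0x30));
	return 0;
}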
@@ -926,6 +1280,105 @@ static struct dyn_ftrace *ftrace_free_records; | |||
926 | } \ | 1280 | } \ |
927 | } | 1281 | } |
928 | 1282 | ||
1283 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | ||
1284 | int filter_hash, | ||
1285 | bool inc) | ||
1286 | { | ||
1287 | struct ftrace_hash *hash; | ||
1288 | struct ftrace_hash *other_hash; | ||
1289 | struct ftrace_page *pg; | ||
1290 | struct dyn_ftrace *rec; | ||
1291 | int count = 0; | ||
1292 | int all = 0; | ||
1293 | |||
1294 | /* Only update if the ops has been registered */ | ||
1295 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
1296 | return; | ||
1297 | |||
1298 | /* | ||
1299 | * In the filter_hash case: | ||
1300 | * If the count is zero, we update all records. | ||
1301 | * Otherwise we just update the items in the hash. | ||
1302 | * | ||
1303 | * In the notrace_hash case: | ||
1304 | * We enable the update in the hash. | ||
1305 | * As disabling notrace means enabling the tracing, | ||
1306 | * and enabling notrace means disabling, the inc variable | ||
1307 | * gets inverted. | ||
1308 | */ | ||
1309 | if (filter_hash) { | ||
1310 | hash = ops->filter_hash; | ||
1311 | other_hash = ops->notrace_hash; | ||
1312 | if (!hash || !hash->count) | ||
1313 | all = 1; | ||
1314 | } else { | ||
1315 | inc = !inc; | ||
1316 | hash = ops->notrace_hash; | ||
1317 | other_hash = ops->filter_hash; | ||
1318 | /* | ||
1319 | * If the notrace hash has no items, | ||
1320 | * then there's nothing to do. | ||
1321 | */ | ||
1322 | if (hash && !hash->count) | ||
1323 | return; | ||
1324 | } | ||
1325 | |||
1326 | do_for_each_ftrace_rec(pg, rec) { | ||
1327 | int in_other_hash = 0; | ||
1328 | int in_hash = 0; | ||
1329 | int match = 0; | ||
1330 | |||
1331 | if (all) { | ||
1332 | /* | ||
1333 | * Only the filter_hash affects all records. | ||
1334 | * Update if the record is not in the notrace hash. | ||
1335 | */ | ||
1336 | if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) | ||
1337 | match = 1; | ||
1338 | } else { | ||
1339 | in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); | ||
1340 | in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); | ||
1341 | |||
1342 | /* | ||
1343 | * Filter update: match if the ip is in the filter hash but not in notrace. Notrace update: match if the ip is in the notrace hash and is also in the filter hash, or the filter hash is empty. | ||
1344 | */ | ||
1345 | if (filter_hash && in_hash && !in_other_hash) | ||
1346 | match = 1; | ||
1347 | else if (!filter_hash && in_hash && | ||
1348 | (in_other_hash || !other_hash->count)) | ||
1349 | match = 1; | ||
1350 | } | ||
1351 | if (!match) | ||
1352 | continue; | ||
1353 | |||
1354 | if (inc) { | ||
1355 | rec->flags++; | ||
1356 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | ||
1357 | return; | ||
1358 | } else { | ||
1359 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | ||
1360 | return; | ||
1361 | rec->flags--; | ||
1362 | } | ||
1363 | count++; | ||
1364 | /* Shortcut, if we handled all records, we are done. */ | ||
1365 | if (!all && count == hash->count) | ||
1366 | return; | ||
1367 | } while_for_each_ftrace_rec(); | ||
1368 | } | ||
1369 | |||
1370 | static void ftrace_hash_rec_disable(struct ftrace_ops *ops, | ||
1371 | int filter_hash) | ||
1372 | { | ||
1373 | __ftrace_hash_rec_update(ops, filter_hash, 0); | ||
1374 | } | ||
1375 | |||
1376 | static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | ||
1377 | int filter_hash) | ||
1378 | { | ||
1379 | __ftrace_hash_rec_update(ops, filter_hash, 1); | ||
1380 | } | ||
1381 | |||
929 | static void ftrace_free_rec(struct dyn_ftrace *rec) | 1382 | static void ftrace_free_rec(struct dyn_ftrace *rec) |
930 | { | 1383 | { |
931 | rec->freelist = ftrace_free_records; | 1384 | rec->freelist = ftrace_free_records; |
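__ftrace_hash_rec_update() turns each record's low flag bits into a reference count: every registered ops whose hashes select an ip bumps the count, and __ftrace_replace_code() later patches the mcount site in only while that count is non-zero. A small sketch of that counting scheme; demo_rec, DEMO_FL_* and the helpers are invented names, and the per-hash matching above is reduced to a plain inc/dec for brevity.

/* Sketch of the per-record reference counting introduced above. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_FL_ENABLED (1UL << 31)
#define DEMO_FL_MASK    DEMO_FL_ENABLED   /* everything below is the refcount */

struct demo_rec {
	unsigned long ip;
	unsigned long flags;
};

static void hash_rec_update(struct demo_rec *rec, bool inc)
{
	if (inc)
		rec->flags++;                     /* one more ops traces this ip */
	else
		rec->flags--;                     /* one fewer */
}

/* Mirror of __ftrace_replace_code(): keep the call site patched in only
 * while at least one ops references the record. */
static bool want_enabled(const struct demo_rec *rec)
{
	return (rec->flags & ~DEMO_FL_MASK) != 0;
}

int main(void)
{
	struct demo_rec rec = { .ip = 0x1000, .flags = 0 };

	hash_rec_update(&rec, true);              /* first ops registers */
	hash_rec_update(&rec, true);              /* second ops registers */
	printf("refs=%lu enable=%d\n", rec.flags & ~DEMO_FL_MASK, want_enabled(&rec));

	hash_rec_update(&rec, false);             /* one ops unregisters */
	hash_rec_update(&rec, false);             /* last one unregisters */
	printf("refs=%lu enable=%d\n", rec.flags & ~DEMO_FL_MASK, want_enabled(&rec));
	return 0;
}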
@@ -1047,18 +1500,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
1047 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1500 | ftrace_addr = (unsigned long)FTRACE_ADDR; |
1048 | 1501 | ||
1049 | /* | 1502 | /* |
1050 | * If this record is not to be traced or we want to disable it, | 1503 | * If we are enabling tracing: |
1051 | * then disable it. | 1504 | * |
1505 | * If the record has a ref count, then we need to enable it | ||
1506 | * because someone is using it. | ||
1052 | * | 1507 | * |
1053 | * If we want to enable it and filtering is off, then enable it. | 1508 | * Otherwise we make sure it's disabled. |
1054 | * | 1509 | * |
1055 | * If we want to enable it and filtering is on, enable it only if | 1510 | * If we are disabling tracing, then disable all records that |
1056 | * it's filtered | 1511 | * are enabled. |
1057 | */ | 1512 | */ |
1058 | if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { | 1513 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
1059 | if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) | 1514 | flag = FTRACE_FL_ENABLED; |
1060 | flag = FTRACE_FL_ENABLED; | ||
1061 | } | ||
1062 | 1515 | ||
1063 | /* If the state of this record hasn't changed, then do nothing */ | 1516 | /* If the state of this record hasn't changed, then do nothing */ |
1064 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1517 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
@@ -1079,19 +1532,16 @@ static void ftrace_replace_code(int enable) | |||
1079 | struct ftrace_page *pg; | 1532 | struct ftrace_page *pg; |
1080 | int failed; | 1533 | int failed; |
1081 | 1534 | ||
1535 | if (unlikely(ftrace_disabled)) | ||
1536 | return; | ||
1537 | |||
1082 | do_for_each_ftrace_rec(pg, rec) { | 1538 | do_for_each_ftrace_rec(pg, rec) { |
1083 | /* | 1539 | /* Skip over free records */ |
1084 | * Skip over free records, records that have | 1540 | if (rec->flags & FTRACE_FL_FREE) |
1085 | * failed and not converted. | ||
1086 | */ | ||
1087 | if (rec->flags & FTRACE_FL_FREE || | ||
1088 | rec->flags & FTRACE_FL_FAILED || | ||
1089 | !(rec->flags & FTRACE_FL_CONVERTED)) | ||
1090 | continue; | 1541 | continue; |
1091 | 1542 | ||
1092 | failed = __ftrace_replace_code(rec, enable); | 1543 | failed = __ftrace_replace_code(rec, enable); |
1093 | if (failed) { | 1544 | if (failed) { |
1094 | rec->flags |= FTRACE_FL_FAILED; | ||
1095 | ftrace_bug(failed, rec->ip); | 1545 | ftrace_bug(failed, rec->ip); |
1096 | /* Stop processing */ | 1546 | /* Stop processing */ |
1097 | return; | 1547 | return; |
@@ -1107,10 +1557,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | |||
1107 | 1557 | ||
1108 | ip = rec->ip; | 1558 | ip = rec->ip; |
1109 | 1559 | ||
1560 | if (unlikely(ftrace_disabled)) | ||
1561 | return 0; | ||
1562 | |||
1110 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); | 1563 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); |
1111 | if (ret) { | 1564 | if (ret) { |
1112 | ftrace_bug(ret, ip); | 1565 | ftrace_bug(ret, ip); |
1113 | rec->flags |= FTRACE_FL_FAILED; | ||
1114 | return 0; | 1566 | return 0; |
1115 | } | 1567 | } |
1116 | return 1; | 1568 | return 1; |
@@ -1171,6 +1623,7 @@ static void ftrace_run_update_code(int command) | |||
1171 | 1623 | ||
1172 | static ftrace_func_t saved_ftrace_func; | 1624 | static ftrace_func_t saved_ftrace_func; |
1173 | static int ftrace_start_up; | 1625 | static int ftrace_start_up; |
1626 | static int global_start_up; | ||
1174 | 1627 | ||
1175 | static void ftrace_startup_enable(int command) | 1628 | static void ftrace_startup_enable(int command) |
1176 | { | 1629 | { |
@@ -1185,19 +1638,36 @@ static void ftrace_startup_enable(int command) | |||
1185 | ftrace_run_update_code(command); | 1638 | ftrace_run_update_code(command); |
1186 | } | 1639 | } |
1187 | 1640 | ||
1188 | static void ftrace_startup(int command) | 1641 | static void ftrace_startup(struct ftrace_ops *ops, int command) |
1189 | { | 1642 | { |
1643 | bool hash_enable = true; | ||
1644 | |||
1190 | if (unlikely(ftrace_disabled)) | 1645 | if (unlikely(ftrace_disabled)) |
1191 | return; | 1646 | return; |
1192 | 1647 | ||
1193 | ftrace_start_up++; | 1648 | ftrace_start_up++; |
1194 | command |= FTRACE_ENABLE_CALLS; | 1649 | command |= FTRACE_ENABLE_CALLS; |
1195 | 1650 | ||
1651 | /* ops marked global share the filter hashes */ | ||
1652 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
1653 | ops = &global_ops; | ||
1654 | /* Don't update hash if global is already set */ | ||
1655 | if (global_start_up) | ||
1656 | hash_enable = false; | ||
1657 | global_start_up++; | ||
1658 | } | ||
1659 | |||
1660 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
1661 | if (hash_enable) | ||
1662 | ftrace_hash_rec_enable(ops, 1); | ||
1663 | |||
1196 | ftrace_startup_enable(command); | 1664 | ftrace_startup_enable(command); |
1197 | } | 1665 | } |
1198 | 1666 | ||
1199 | static void ftrace_shutdown(int command) | 1667 | static void ftrace_shutdown(struct ftrace_ops *ops, int command) |
1200 | { | 1668 | { |
1669 | bool hash_disable = true; | ||
1670 | |||
1201 | if (unlikely(ftrace_disabled)) | 1671 | if (unlikely(ftrace_disabled)) |
1202 | return; | 1672 | return; |
1203 | 1673 | ||
@@ -1209,6 +1679,23 @@ static void ftrace_shutdown(int command) | |||
1209 | */ | 1679 | */ |
1210 | WARN_ON_ONCE(ftrace_start_up < 0); | 1680 | WARN_ON_ONCE(ftrace_start_up < 0); |
1211 | 1681 | ||
1682 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
1683 | ops = &global_ops; | ||
1684 | global_start_up--; | ||
1685 | WARN_ON_ONCE(global_start_up < 0); | ||
1686 | /* Don't update hash if global still has users */ | ||
1687 | if (global_start_up) { | ||
1688 | WARN_ON_ONCE(!ftrace_start_up); | ||
1689 | hash_disable = false; | ||
1690 | } | ||
1691 | } | ||
1692 | |||
1693 | if (hash_disable) | ||
1694 | ftrace_hash_rec_disable(ops, 1); | ||
1695 | |||
1696 | if (ops != &global_ops || !global_start_up) | ||
1697 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
1698 | |||
1212 | if (!ftrace_start_up) | 1699 | if (!ftrace_start_up) |
1213 | command |= FTRACE_DISABLE_CALLS; | 1700 | command |= FTRACE_DISABLE_CALLS; |
1214 | 1701 | ||
@@ -1268,15 +1755,15 @@ static int ftrace_update_code(struct module *mod) | |||
1268 | p->flags = 0L; | 1755 | p->flags = 0L; |
1269 | 1756 | ||
1270 | /* | 1757 | /* |
1271 | * Do the initial record convertion from mcount jump | 1758 | * Do the initial record conversion from mcount jump |
1272 | * to the NOP instructions. | 1759 | * to the NOP instructions. |
1273 | */ | 1760 | */ |
1274 | if (!ftrace_code_disable(mod, p)) { | 1761 | if (!ftrace_code_disable(mod, p)) { |
1275 | ftrace_free_rec(p); | 1762 | ftrace_free_rec(p); |
1276 | continue; | 1763 | /* Game over */ |
1764 | break; | ||
1277 | } | 1765 | } |
1278 | 1766 | ||
1279 | p->flags |= FTRACE_FL_CONVERTED; | ||
1280 | ftrace_update_cnt++; | 1767 | ftrace_update_cnt++; |
1281 | 1768 | ||
1282 | /* | 1769 | /* |
@@ -1351,9 +1838,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) | |||
1351 | enum { | 1838 | enum { |
1352 | FTRACE_ITER_FILTER = (1 << 0), | 1839 | FTRACE_ITER_FILTER = (1 << 0), |
1353 | FTRACE_ITER_NOTRACE = (1 << 1), | 1840 | FTRACE_ITER_NOTRACE = (1 << 1), |
1354 | FTRACE_ITER_FAILURES = (1 << 2), | 1841 | FTRACE_ITER_PRINTALL = (1 << 2), |
1355 | FTRACE_ITER_PRINTALL = (1 << 3), | 1842 | FTRACE_ITER_HASH = (1 << 3), |
1356 | FTRACE_ITER_HASH = (1 << 4), | 1843 | FTRACE_ITER_ENABLED = (1 << 4), |
1357 | }; | 1844 | }; |
1358 | 1845 | ||
1359 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1846 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
@@ -1365,6 +1852,8 @@ struct ftrace_iterator { | |||
1365 | struct dyn_ftrace *func; | 1852 | struct dyn_ftrace *func; |
1366 | struct ftrace_func_probe *probe; | 1853 | struct ftrace_func_probe *probe; |
1367 | struct trace_parser parser; | 1854 | struct trace_parser parser; |
1855 | struct ftrace_hash *hash; | ||
1856 | struct ftrace_ops *ops; | ||
1368 | int hidx; | 1857 | int hidx; |
1369 | int idx; | 1858 | int idx; |
1370 | unsigned flags; | 1859 | unsigned flags; |
@@ -1461,8 +1950,12 @@ static void * | |||
1461 | t_next(struct seq_file *m, void *v, loff_t *pos) | 1950 | t_next(struct seq_file *m, void *v, loff_t *pos) |
1462 | { | 1951 | { |
1463 | struct ftrace_iterator *iter = m->private; | 1952 | struct ftrace_iterator *iter = m->private; |
1953 | struct ftrace_ops *ops = &global_ops; | ||
1464 | struct dyn_ftrace *rec = NULL; | 1954 | struct dyn_ftrace *rec = NULL; |
1465 | 1955 | ||
1956 | if (unlikely(ftrace_disabled)) | ||
1957 | return NULL; | ||
1958 | |||
1466 | if (iter->flags & FTRACE_ITER_HASH) | 1959 | if (iter->flags & FTRACE_ITER_HASH) |
1467 | return t_hash_next(m, pos); | 1960 | return t_hash_next(m, pos); |
1468 | 1961 | ||
@@ -1483,17 +1976,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
1483 | rec = &iter->pg->records[iter->idx++]; | 1976 | rec = &iter->pg->records[iter->idx++]; |
1484 | if ((rec->flags & FTRACE_FL_FREE) || | 1977 | if ((rec->flags & FTRACE_FL_FREE) || |
1485 | 1978 | ||
1486 | (!(iter->flags & FTRACE_ITER_FAILURES) && | ||
1487 | (rec->flags & FTRACE_FL_FAILED)) || | ||
1488 | |||
1489 | ((iter->flags & FTRACE_ITER_FAILURES) && | ||
1490 | !(rec->flags & FTRACE_FL_FAILED)) || | ||
1491 | |||
1492 | ((iter->flags & FTRACE_ITER_FILTER) && | 1979 | ((iter->flags & FTRACE_ITER_FILTER) && |
1493 | !(rec->flags & FTRACE_FL_FILTER)) || | 1980 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || |
1494 | 1981 | ||
1495 | ((iter->flags & FTRACE_ITER_NOTRACE) && | 1982 | ((iter->flags & FTRACE_ITER_NOTRACE) && |
1496 | !(rec->flags & FTRACE_FL_NOTRACE))) { | 1983 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || |
1984 | |||
1985 | ((iter->flags & FTRACE_ITER_ENABLED) && | ||
1986 | !(rec->flags & ~FTRACE_FL_MASK))) { | ||
1987 | |||
1497 | rec = NULL; | 1988 | rec = NULL; |
1498 | goto retry; | 1989 | goto retry; |
1499 | } | 1990 | } |
@@ -1517,10 +2008,15 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
1517 | static void *t_start(struct seq_file *m, loff_t *pos) | 2008 | static void *t_start(struct seq_file *m, loff_t *pos) |
1518 | { | 2009 | { |
1519 | struct ftrace_iterator *iter = m->private; | 2010 | struct ftrace_iterator *iter = m->private; |
2011 | struct ftrace_ops *ops = &global_ops; | ||
1520 | void *p = NULL; | 2012 | void *p = NULL; |
1521 | loff_t l; | 2013 | loff_t l; |
1522 | 2014 | ||
1523 | mutex_lock(&ftrace_lock); | 2015 | mutex_lock(&ftrace_lock); |
2016 | |||
2017 | if (unlikely(ftrace_disabled)) | ||
2018 | return NULL; | ||
2019 | |||
1524 | /* | 2020 | /* |
1525 | * If an lseek was done, then reset and start from beginning. | 2021 | * If an lseek was done, then reset and start from beginning. |
1526 | */ | 2022 | */ |
@@ -1532,7 +2028,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
1532 | * off, we can short cut and just print out that all | 2028 | * off, we can short cut and just print out that all |
1533 | * functions are enabled. | 2029 | * functions are enabled. |
1534 | */ | 2030 | */ |
1535 | if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { | 2031 | if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { |
1536 | if (*pos > 0) | 2032 | if (*pos > 0) |
1537 | return t_hash_start(m, pos); | 2033 | return t_hash_start(m, pos); |
1538 | iter->flags |= FTRACE_ITER_PRINTALL; | 2034 | iter->flags |= FTRACE_ITER_PRINTALL; |
@@ -1590,7 +2086,11 @@ static int t_show(struct seq_file *m, void *v) | |||
1590 | if (!rec) | 2086 | if (!rec) |
1591 | return 0; | 2087 | return 0; |
1592 | 2088 | ||
1593 | seq_printf(m, "%ps\n", (void *)rec->ip); | 2089 | seq_printf(m, "%ps", (void *)rec->ip); |
2090 | if (iter->flags & FTRACE_ITER_ENABLED) | ||
2091 | seq_printf(m, " (%ld)", | ||
2092 | rec->flags & ~FTRACE_FL_MASK); | ||
2093 | seq_printf(m, "\n"); | ||
1594 | 2094 | ||
1595 | return 0; | 2095 | return 0; |
1596 | } | 2096 | } |
@@ -1630,44 +2130,46 @@ ftrace_avail_open(struct inode *inode, struct file *file) | |||
1630 | } | 2130 | } |
1631 | 2131 | ||
1632 | static int | 2132 | static int |
1633 | ftrace_failures_open(struct inode *inode, struct file *file) | 2133 | ftrace_enabled_open(struct inode *inode, struct file *file) |
1634 | { | 2134 | { |
1635 | int ret; | ||
1636 | struct seq_file *m; | ||
1637 | struct ftrace_iterator *iter; | 2135 | struct ftrace_iterator *iter; |
2136 | int ret; | ||
2137 | |||
2138 | if (unlikely(ftrace_disabled)) | ||
2139 | return -ENODEV; | ||
2140 | |||
2141 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | ||
2142 | if (!iter) | ||
2143 | return -ENOMEM; | ||
2144 | |||
2145 | iter->pg = ftrace_pages_start; | ||
2146 | iter->flags = FTRACE_ITER_ENABLED; | ||
1638 | 2147 | ||
1639 | ret = ftrace_avail_open(inode, file); | 2148 | ret = seq_open(file, &show_ftrace_seq_ops); |
1640 | if (!ret) { | 2149 | if (!ret) { |
1641 | m = file->private_data; | 2150 | struct seq_file *m = file->private_data; |
1642 | iter = m->private; | 2151 | |
1643 | iter->flags = FTRACE_ITER_FAILURES; | 2152 | m->private = iter; |
2153 | } else { | ||
2154 | kfree(iter); | ||
1644 | } | 2155 | } |
1645 | 2156 | ||
1646 | return ret; | 2157 | return ret; |
1647 | } | 2158 | } |
1648 | 2159 | ||
1649 | 2160 | static void ftrace_filter_reset(struct ftrace_hash *hash) | |
1650 | static void ftrace_filter_reset(int enable) | ||
1651 | { | 2161 | { |
1652 | struct ftrace_page *pg; | ||
1653 | struct dyn_ftrace *rec; | ||
1654 | unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
1655 | |||
1656 | mutex_lock(&ftrace_lock); | 2162 | mutex_lock(&ftrace_lock); |
1657 | if (enable) | 2163 | ftrace_hash_clear(hash); |
1658 | ftrace_filtered = 0; | ||
1659 | do_for_each_ftrace_rec(pg, rec) { | ||
1660 | if (rec->flags & FTRACE_FL_FAILED) | ||
1661 | continue; | ||
1662 | rec->flags &= ~type; | ||
1663 | } while_for_each_ftrace_rec(); | ||
1664 | mutex_unlock(&ftrace_lock); | 2164 | mutex_unlock(&ftrace_lock); |
1665 | } | 2165 | } |
1666 | 2166 | ||
1667 | static int | 2167 | static int |
1668 | ftrace_regex_open(struct inode *inode, struct file *file, int enable) | 2168 | ftrace_regex_open(struct ftrace_ops *ops, int flag, |
2169 | struct inode *inode, struct file *file) | ||
1669 | { | 2170 | { |
1670 | struct ftrace_iterator *iter; | 2171 | struct ftrace_iterator *iter; |
2172 | struct ftrace_hash *hash; | ||
1671 | int ret = 0; | 2173 | int ret = 0; |
1672 | 2174 | ||
1673 | if (unlikely(ftrace_disabled)) | 2175 | if (unlikely(ftrace_disabled)) |
@@ -1682,21 +2184,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
1682 | return -ENOMEM; | 2184 | return -ENOMEM; |
1683 | } | 2185 | } |
1684 | 2186 | ||
2187 | if (flag & FTRACE_ITER_NOTRACE) | ||
2188 | hash = ops->notrace_hash; | ||
2189 | else | ||
2190 | hash = ops->filter_hash; | ||
2191 | |||
2192 | iter->ops = ops; | ||
2193 | iter->flags = flag; | ||
2194 | |||
2195 | if (file->f_mode & FMODE_WRITE) { | ||
2196 | mutex_lock(&ftrace_lock); | ||
2197 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); | ||
2198 | mutex_unlock(&ftrace_lock); | ||
2199 | |||
2200 | if (!iter->hash) { | ||
2201 | trace_parser_put(&iter->parser); | ||
2202 | kfree(iter); | ||
2203 | return -ENOMEM; | ||
2204 | } | ||
2205 | } | ||
2206 | |||
1685 | mutex_lock(&ftrace_regex_lock); | 2207 | mutex_lock(&ftrace_regex_lock); |
2208 | |||
1686 | if ((file->f_mode & FMODE_WRITE) && | 2209 | if ((file->f_mode & FMODE_WRITE) && |
1687 | (file->f_flags & O_TRUNC)) | 2210 | (file->f_flags & O_TRUNC)) |
1688 | ftrace_filter_reset(enable); | 2211 | ftrace_filter_reset(iter->hash); |
1689 | 2212 | ||
1690 | if (file->f_mode & FMODE_READ) { | 2213 | if (file->f_mode & FMODE_READ) { |
1691 | iter->pg = ftrace_pages_start; | 2214 | iter->pg = ftrace_pages_start; |
1692 | iter->flags = enable ? FTRACE_ITER_FILTER : | ||
1693 | FTRACE_ITER_NOTRACE; | ||
1694 | 2215 | ||
1695 | ret = seq_open(file, &show_ftrace_seq_ops); | 2216 | ret = seq_open(file, &show_ftrace_seq_ops); |
1696 | if (!ret) { | 2217 | if (!ret) { |
1697 | struct seq_file *m = file->private_data; | 2218 | struct seq_file *m = file->private_data; |
1698 | m->private = iter; | 2219 | m->private = iter; |
1699 | } else { | 2220 | } else { |
2221 | /* Failed */ | ||
2222 | free_ftrace_hash(iter->hash); | ||
1700 | trace_parser_put(&iter->parser); | 2223 | trace_parser_put(&iter->parser); |
1701 | kfree(iter); | 2224 | kfree(iter); |
1702 | } | 2225 | } |
@@ -1710,13 +2233,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
1710 | static int | 2233 | static int |
1711 | ftrace_filter_open(struct inode *inode, struct file *file) | 2234 | ftrace_filter_open(struct inode *inode, struct file *file) |
1712 | { | 2235 | { |
1713 | return ftrace_regex_open(inode, file, 1); | 2236 | return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, |
2237 | inode, file); | ||
1714 | } | 2238 | } |
1715 | 2239 | ||
1716 | static int | 2240 | static int |
1717 | ftrace_notrace_open(struct inode *inode, struct file *file) | 2241 | ftrace_notrace_open(struct inode *inode, struct file *file) |
1718 | { | 2242 | { |
1719 | return ftrace_regex_open(inode, file, 0); | 2243 | return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, |
2244 | inode, file); | ||
1720 | } | 2245 | } |
1721 | 2246 | ||
1722 | static loff_t | 2247 | static loff_t |
@@ -1761,86 +2286,99 @@ static int ftrace_match(char *str, char *regex, int len, int type) | |||
1761 | } | 2286 | } |
1762 | 2287 | ||
1763 | static int | 2288 | static int |
1764 | ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) | 2289 | enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) |
2290 | { | ||
2291 | struct ftrace_func_entry *entry; | ||
2292 | int ret = 0; | ||
2293 | |||
2294 | entry = ftrace_lookup_ip(hash, rec->ip); | ||
2295 | if (not) { | ||
2296 | /* Do nothing if it doesn't exist */ | ||
2297 | if (!entry) | ||
2298 | return 0; | ||
2299 | |||
2300 | free_hash_entry(hash, entry); | ||
2301 | } else { | ||
2302 | /* Do nothing if it exists */ | ||
2303 | if (entry) | ||
2304 | return 0; | ||
2305 | |||
2306 | ret = add_hash_entry(hash, rec->ip); | ||
2307 | } | ||
2308 | return ret; | ||
2309 | } | ||
2310 | |||
2311 | static int | ||
2312 | ftrace_match_record(struct dyn_ftrace *rec, char *mod, | ||
2313 | char *regex, int len, int type) | ||
1765 | { | 2314 | { |
1766 | char str[KSYM_SYMBOL_LEN]; | 2315 | char str[KSYM_SYMBOL_LEN]; |
2316 | char *modname; | ||
2317 | |||
2318 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
2319 | |||
2320 | if (mod) { | ||
2321 | /* module lookup requires matching the module */ | ||
2322 | if (!modname || strcmp(modname, mod)) | ||
2323 | return 0; | ||
2324 | |||
2325 | /* blank search means to match all funcs in the mod */ | ||
2326 | if (!len) | ||
2327 | return 1; | ||
2328 | } | ||
1767 | 2329 | ||
1768 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | ||
1769 | return ftrace_match(str, regex, len, type); | 2330 | return ftrace_match(str, regex, len, type); |
1770 | } | 2331 | } |
1771 | 2332 | ||
1772 | static int ftrace_match_records(char *buff, int len, int enable) | 2333 | static int |
2334 | match_records(struct ftrace_hash *hash, char *buff, | ||
2335 | int len, char *mod, int not) | ||
1773 | { | 2336 | { |
1774 | unsigned int search_len; | 2337 | unsigned search_len = 0; |
1775 | struct ftrace_page *pg; | 2338 | struct ftrace_page *pg; |
1776 | struct dyn_ftrace *rec; | 2339 | struct dyn_ftrace *rec; |
1777 | unsigned long flag; | 2340 | int type = MATCH_FULL; |
1778 | char *search; | 2341 | char *search = buff; |
1779 | int type; | ||
1780 | int not; | ||
1781 | int found = 0; | 2342 | int found = 0; |
2343 | int ret; | ||
1782 | 2344 | ||
1783 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | 2345 | if (len) { |
1784 | type = filter_parse_regex(buff, len, &search, ¬); | 2346 | type = filter_parse_regex(buff, len, &search, ¬); |
1785 | 2347 | search_len = strlen(search); | |
1786 | search_len = strlen(search); | 2348 | } |
1787 | 2349 | ||
1788 | mutex_lock(&ftrace_lock); | 2350 | mutex_lock(&ftrace_lock); |
1789 | do_for_each_ftrace_rec(pg, rec) { | ||
1790 | 2351 | ||
1791 | if (rec->flags & FTRACE_FL_FAILED) | 2352 | if (unlikely(ftrace_disabled)) |
1792 | continue; | 2353 | goto out_unlock; |
1793 | 2354 | ||
1794 | if (ftrace_match_record(rec, search, search_len, type)) { | 2355 | do_for_each_ftrace_rec(pg, rec) { |
1795 | if (not) | 2356 | |
1796 | rec->flags &= ~flag; | 2357 | if (ftrace_match_record(rec, mod, search, search_len, type)) { |
1797 | else | 2358 | ret = enter_record(hash, rec, not); |
1798 | rec->flags |= flag; | 2359 | if (ret < 0) { |
2360 | found = ret; | ||
2361 | goto out_unlock; | ||
2362 | } | ||
1799 | found = 1; | 2363 | found = 1; |
1800 | } | 2364 | } |
1801 | /* | ||
1802 | * Only enable filtering if we have a function that | ||
1803 | * is filtered on. | ||
1804 | */ | ||
1805 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
1806 | ftrace_filtered = 1; | ||
1807 | } while_for_each_ftrace_rec(); | 2365 | } while_for_each_ftrace_rec(); |
2366 | out_unlock: | ||
1808 | mutex_unlock(&ftrace_lock); | 2367 | mutex_unlock(&ftrace_lock); |
1809 | 2368 | ||
1810 | return found; | 2369 | return found; |
1811 | } | 2370 | } |
1812 | 2371 | ||
1813 | static int | 2372 | static int |
1814 | ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, | 2373 | ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) |
1815 | char *regex, int len, int type) | ||
1816 | { | 2374 | { |
1817 | char str[KSYM_SYMBOL_LEN]; | 2375 | return match_records(hash, buff, len, NULL, 0); |
1818 | char *modname; | ||
1819 | |||
1820 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
1821 | |||
1822 | if (!modname || strcmp(modname, mod)) | ||
1823 | return 0; | ||
1824 | |||
1825 | /* blank search means to match all funcs in the mod */ | ||
1826 | if (len) | ||
1827 | return ftrace_match(str, regex, len, type); | ||
1828 | else | ||
1829 | return 1; | ||
1830 | } | 2376 | } |
1831 | 2377 | ||
1832 | static int ftrace_match_module_records(char *buff, char *mod, int enable) | 2378 | static int |
2379 | ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) | ||
1833 | { | 2380 | { |
1834 | unsigned search_len = 0; | ||
1835 | struct ftrace_page *pg; | ||
1836 | struct dyn_ftrace *rec; | ||
1837 | int type = MATCH_FULL; | ||
1838 | char *search = buff; | ||
1839 | unsigned long flag; | ||
1840 | int not = 0; | 2381 | int not = 0; |
1841 | int found = 0; | ||
1842 | |||
1843 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
1844 | 2382 | ||
1845 | /* blank or '*' mean the same */ | 2383 | /* blank or '*' mean the same */ |
1846 | if (strcmp(buff, "*") == 0) | 2384 | if (strcmp(buff, "*") == 0) |
@@ -1852,32 +2390,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
1852 | not = 1; | 2390 | not = 1; |
1853 | } | 2391 | } |
1854 | 2392 | ||
1855 | if (strlen(buff)) { | 2393 | return match_records(hash, buff, strlen(buff), mod, not); |
1856 | type = filter_parse_regex(buff, strlen(buff), &search, ¬); | ||
1857 | search_len = strlen(search); | ||
1858 | } | ||
1859 | |||
1860 | mutex_lock(&ftrace_lock); | ||
1861 | do_for_each_ftrace_rec(pg, rec) { | ||
1862 | |||
1863 | if (rec->flags & FTRACE_FL_FAILED) | ||
1864 | continue; | ||
1865 | |||
1866 | if (ftrace_match_module_record(rec, mod, | ||
1867 | search, search_len, type)) { | ||
1868 | if (not) | ||
1869 | rec->flags &= ~flag; | ||
1870 | else | ||
1871 | rec->flags |= flag; | ||
1872 | found = 1; | ||
1873 | } | ||
1874 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
1875 | ftrace_filtered = 1; | ||
1876 | |||
1877 | } while_for_each_ftrace_rec(); | ||
1878 | mutex_unlock(&ftrace_lock); | ||
1879 | |||
1880 | return found; | ||
1881 | } | 2394 | } |
1882 | 2395 | ||
1883 | /* | 2396 | /* |
@@ -1888,7 +2401,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
1888 | static int | 2401 | static int |
1889 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | 2402 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) |
1890 | { | 2403 | { |
2404 | struct ftrace_ops *ops = &global_ops; | ||
2405 | struct ftrace_hash *hash; | ||
1891 | char *mod; | 2406 | char *mod; |
2407 | int ret = -EINVAL; | ||
1892 | 2408 | ||
1893 | /* | 2409 | /* |
1894 | * cmd == 'mod' because we only registered this func | 2410 | * cmd == 'mod' because we only registered this func |
@@ -1900,15 +2416,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | |||
1900 | 2416 | ||
1901 | /* we must have a module name */ | 2417 | /* we must have a module name */ |
1902 | if (!param) | 2418 | if (!param) |
1903 | return -EINVAL; | 2419 | return ret; |
1904 | 2420 | ||
1905 | mod = strsep(¶m, ":"); | 2421 | mod = strsep(¶m, ":"); |
1906 | if (!strlen(mod)) | 2422 | if (!strlen(mod)) |
1907 | return -EINVAL; | 2423 | return ret; |
1908 | 2424 | ||
1909 | if (ftrace_match_module_records(func, mod, enable)) | 2425 | if (enable) |
1910 | return 0; | 2426 | hash = ops->filter_hash; |
1911 | return -EINVAL; | 2427 | else |
2428 | hash = ops->notrace_hash; | ||
2429 | |||
2430 | ret = ftrace_match_module_records(hash, func, mod); | ||
2431 | if (!ret) | ||
2432 | ret = -EINVAL; | ||
2433 | if (ret < 0) | ||
2434 | return ret; | ||
2435 | |||
2436 | return 0; | ||
1912 | } | 2437 | } |
1913 | 2438 | ||
1914 | static struct ftrace_func_command ftrace_mod_cmd = { | 2439 | static struct ftrace_func_command ftrace_mod_cmd = { |
@@ -1959,6 +2484,7 @@ static int ftrace_probe_registered; | |||
1959 | 2484 | ||
1960 | static void __enable_ftrace_function_probe(void) | 2485 | static void __enable_ftrace_function_probe(void) |
1961 | { | 2486 | { |
2487 | int ret; | ||
1962 | int i; | 2488 | int i; |
1963 | 2489 | ||
1964 | if (ftrace_probe_registered) | 2490 | if (ftrace_probe_registered) |
@@ -1973,13 +2499,16 @@ static void __enable_ftrace_function_probe(void) | |||
1973 | if (i == FTRACE_FUNC_HASHSIZE) | 2499 | if (i == FTRACE_FUNC_HASHSIZE) |
1974 | return; | 2500 | return; |
1975 | 2501 | ||
1976 | __register_ftrace_function(&trace_probe_ops); | 2502 | ret = __register_ftrace_function(&trace_probe_ops); |
1977 | ftrace_startup(0); | 2503 | if (!ret) |
2504 | ftrace_startup(&trace_probe_ops, 0); | ||
2505 | |||
1978 | ftrace_probe_registered = 1; | 2506 | ftrace_probe_registered = 1; |
1979 | } | 2507 | } |
1980 | 2508 | ||
1981 | static void __disable_ftrace_function_probe(void) | 2509 | static void __disable_ftrace_function_probe(void) |
1982 | { | 2510 | { |
2511 | int ret; | ||
1983 | int i; | 2512 | int i; |
1984 | 2513 | ||
1985 | if (!ftrace_probe_registered) | 2514 | if (!ftrace_probe_registered) |
@@ -1992,8 +2521,10 @@ static void __disable_ftrace_function_probe(void) | |||
1992 | } | 2521 | } |
1993 | 2522 | ||
1994 | /* no more funcs left */ | 2523 | /* no more funcs left */ |
1995 | __unregister_ftrace_function(&trace_probe_ops); | 2524 | ret = __unregister_ftrace_function(&trace_probe_ops); |
1996 | ftrace_shutdown(0); | 2525 | if (!ret) |
2526 | ftrace_shutdown(&trace_probe_ops, 0); | ||
2527 | |||
1997 | ftrace_probe_registered = 0; | 2528 | ftrace_probe_registered = 0; |
1998 | } | 2529 | } |
1999 | 2530 | ||
@@ -2029,12 +2560,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
2029 | return -EINVAL; | 2560 | return -EINVAL; |
2030 | 2561 | ||
2031 | mutex_lock(&ftrace_lock); | 2562 | mutex_lock(&ftrace_lock); |
2032 | do_for_each_ftrace_rec(pg, rec) { | ||
2033 | 2563 | ||
2034 | if (rec->flags & FTRACE_FL_FAILED) | 2564 | if (unlikely(ftrace_disabled)) |
2035 | continue; | 2565 | goto out_unlock; |
2566 | |||
2567 | do_for_each_ftrace_rec(pg, rec) { | ||
2036 | 2568 | ||
2037 | if (!ftrace_match_record(rec, search, len, type)) | 2569 | if (!ftrace_match_record(rec, NULL, search, len, type)) |
2038 | continue; | 2570 | continue; |
2039 | 2571 | ||
2040 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | 2572 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); |
@@ -2195,18 +2727,22 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) | |||
2195 | return ret; | 2727 | return ret; |
2196 | } | 2728 | } |
2197 | 2729 | ||
2198 | static int ftrace_process_regex(char *buff, int len, int enable) | 2730 | static int ftrace_process_regex(struct ftrace_hash *hash, |
2731 | char *buff, int len, int enable) | ||
2199 | { | 2732 | { |
2200 | char *func, *command, *next = buff; | 2733 | char *func, *command, *next = buff; |
2201 | struct ftrace_func_command *p; | 2734 | struct ftrace_func_command *p; |
2202 | int ret = -EINVAL; | 2735 | int ret; |
2203 | 2736 | ||
2204 | func = strsep(&next, ":"); | 2737 | func = strsep(&next, ":"); |
2205 | 2738 | ||
2206 | if (!next) { | 2739 | if (!next) { |
2207 | if (ftrace_match_records(func, len, enable)) | 2740 | ret = ftrace_match_records(hash, func, len); |
2208 | return 0; | 2741 | if (!ret) |
2209 | return ret; | 2742 | ret = -EINVAL; |
2743 | if (ret < 0) | ||
2744 | return ret; | ||
2745 | return 0; | ||
2210 | } | 2746 | } |
2211 | 2747 | ||
2212 | /* command found */ | 2748 | /* command found */ |
@@ -2239,6 +2775,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2239 | 2775 | ||
2240 | mutex_lock(&ftrace_regex_lock); | 2776 | mutex_lock(&ftrace_regex_lock); |
2241 | 2777 | ||
2778 | ret = -ENODEV; | ||
2779 | if (unlikely(ftrace_disabled)) | ||
2780 | goto out_unlock; | ||
2781 | |||
2242 | if (file->f_mode & FMODE_READ) { | 2782 | if (file->f_mode & FMODE_READ) { |
2243 | struct seq_file *m = file->private_data; | 2783 | struct seq_file *m = file->private_data; |
2244 | iter = m->private; | 2784 | iter = m->private; |
@@ -2250,7 +2790,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2250 | 2790 | ||
2251 | if (read >= 0 && trace_parser_loaded(parser) && | 2791 | if (read >= 0 && trace_parser_loaded(parser) && |
2252 | !trace_parser_cont(parser)) { | 2792 | !trace_parser_cont(parser)) { |
2253 | ret = ftrace_process_regex(parser->buffer, | 2793 | ret = ftrace_process_regex(iter->hash, parser->buffer, |
2254 | parser->idx, enable); | 2794 | parser->idx, enable); |
2255 | trace_parser_clear(parser); | 2795 | trace_parser_clear(parser); |
2256 | if (ret) | 2796 | if (ret) |
@@ -2278,22 +2818,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, | |||
2278 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); | 2818 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); |
2279 | } | 2819 | } |
2280 | 2820 | ||
2281 | static void | 2821 | static int |
2282 | ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | 2822 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, |
2823 | int reset, int enable) | ||
2283 | { | 2824 | { |
2825 | struct ftrace_hash **orig_hash; | ||
2826 | struct ftrace_hash *hash; | ||
2827 | int ret; | ||
2828 | |||
2829 | /* All global ops use the global ops filters */ | ||
2830 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) | ||
2831 | ops = &global_ops; | ||
2832 | |||
2284 | if (unlikely(ftrace_disabled)) | 2833 | if (unlikely(ftrace_disabled)) |
2285 | return; | 2834 | return -ENODEV; |
2835 | |||
2836 | if (enable) | ||
2837 | orig_hash = &ops->filter_hash; | ||
2838 | else | ||
2839 | orig_hash = &ops->notrace_hash; | ||
2840 | |||
2841 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | ||
2842 | if (!hash) | ||
2843 | return -ENOMEM; | ||
2286 | 2844 | ||
2287 | mutex_lock(&ftrace_regex_lock); | 2845 | mutex_lock(&ftrace_regex_lock); |
2288 | if (reset) | 2846 | if (reset) |
2289 | ftrace_filter_reset(enable); | 2847 | ftrace_filter_reset(hash); |
2290 | if (buf) | 2848 | if (buf) |
2291 | ftrace_match_records(buf, len, enable); | 2849 | ftrace_match_records(hash, buf, len); |
2850 | |||
2851 | mutex_lock(&ftrace_lock); | ||
2852 | ret = ftrace_hash_move(orig_hash, hash); | ||
2853 | mutex_unlock(&ftrace_lock); | ||
2854 | |||
2292 | mutex_unlock(&ftrace_regex_lock); | 2855 | mutex_unlock(&ftrace_regex_lock); |
2856 | |||
2857 | free_ftrace_hash(hash); | ||
2858 | return ret; | ||
2293 | } | 2859 | } |
2294 | 2860 | ||
2295 | /** | 2861 | /** |
2296 | * ftrace_set_filter - set a function to filter on in ftrace | 2862 | * ftrace_set_filter - set a function to filter on in ftrace |
2863 | * @ops - the ops to set the filter with | ||
2297 | * @buf - the string that holds the function filter text. | 2864 | * @buf - the string that holds the function filter text. |
2298 | * @len - the length of the string. | 2865 | * @len - the length of the string. |
2299 | * @reset - non zero to reset all filters before applying this filter. | 2866 | * @reset - non zero to reset all filters before applying this filter. |
@@ -2301,13 +2868,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | |||
2301 | * Filters denote which functions should be enabled when tracing is enabled. | 2868 | * Filters denote which functions should be enabled when tracing is enabled. |
2302 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | 2869 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. |
2303 | */ | 2870 | */ |
2304 | void ftrace_set_filter(unsigned char *buf, int len, int reset) | 2871 | void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, |
2872 | int len, int reset) | ||
2305 | { | 2873 | { |
2306 | ftrace_set_regex(buf, len, reset, 1); | 2874 | ftrace_set_regex(ops, buf, len, reset, 1); |
2307 | } | 2875 | } |
2876 | EXPORT_SYMBOL_GPL(ftrace_set_filter); | ||
2308 | 2877 | ||
2309 | /** | 2878 | /** |
2310 | * ftrace_set_notrace - set a function to not trace in ftrace | 2879 | * ftrace_set_notrace - set a function to not trace in ftrace |
2880 | * @ops - the ops to set the notrace filter with | ||
2311 | * @buf - the string that holds the function notrace text. | 2881 | * @buf - the string that holds the function notrace text. |
2312 | * @len - the length of the string. | 2882 | * @len - the length of the string. |
2313 | * @reset - non zero to reset all filters before applying this filter. | 2883 | * @reset - non zero to reset all filters before applying this filter. |
@@ -2316,10 +2886,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) | |||
2316 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | 2886 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled |
2317 | * for tracing. | 2887 | * for tracing. |
2318 | */ | 2888 | */ |
2319 | void ftrace_set_notrace(unsigned char *buf, int len, int reset) | 2889 | void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, |
2890 | int len, int reset) | ||
2320 | { | 2891 | { |
2321 | ftrace_set_regex(buf, len, reset, 0); | 2892 | ftrace_set_regex(ops, buf, len, reset, 0); |
2322 | } | 2893 | } |
2894 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | ||
2895 | /** | ||
2896 | * ftrace_set_filter - set a function to filter on in ftrace | ||
2897 | * @ops - the ops to set the filter with | ||
2898 | * @buf - the string that holds the function filter text. | ||
2899 | * @len - the length of the string. | ||
2900 | * @reset - non zero to reset all filters before applying this filter. | ||
2901 | * | ||
2902 | * Filters denote which functions should be enabled when tracing is enabled. | ||
2903 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | ||
2904 | */ | ||
2905 | void ftrace_set_global_filter(unsigned char *buf, int len, int reset) | ||
2906 | { | ||
2907 | ftrace_set_regex(&global_ops, buf, len, reset, 1); | ||
2908 | } | ||
2909 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); | ||
2910 | |||
2911 | /** | ||
2912 | * ftrace_set_notrace - set a function to not trace in ftrace | ||
2913 | * @ops - the ops to set the notrace filter with | ||
2914 | * @buf - the string that holds the function notrace text. | ||
2915 | * @len - the length of the string. | ||
2916 | * @reset - non zero to reset all filters before applying this filter. | ||
2917 | * | ||
2918 | * Notrace Filters denote which functions should not be enabled when tracing | ||
2919 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | ||
2920 | * for tracing. | ||
2921 | */ | ||
2922 | void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) | ||
2923 | { | ||
2924 | ftrace_set_regex(&global_ops, buf, len, reset, 0); | ||
2925 | } | ||
2926 | EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); | ||
2323 | 2927 | ||
2324 | /* | 2928 | /* |
2325 | * command line interface to allow users to set filters on boot up. | 2929 | * command line interface to allow users to set filters on boot up. |
@@ -2370,22 +2974,23 @@ static void __init set_ftrace_early_graph(char *buf) | |||
2370 | } | 2974 | } |
2371 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 2975 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
2372 | 2976 | ||
2373 | static void __init set_ftrace_early_filter(char *buf, int enable) | 2977 | static void __init |
2978 | set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) | ||
2374 | { | 2979 | { |
2375 | char *func; | 2980 | char *func; |
2376 | 2981 | ||
2377 | while (buf) { | 2982 | while (buf) { |
2378 | func = strsep(&buf, ","); | 2983 | func = strsep(&buf, ","); |
2379 | ftrace_set_regex(func, strlen(func), 0, enable); | 2984 | ftrace_set_regex(ops, func, strlen(func), 0, enable); |
2380 | } | 2985 | } |
2381 | } | 2986 | } |
2382 | 2987 | ||
2383 | static void __init set_ftrace_early_filters(void) | 2988 | static void __init set_ftrace_early_filters(void) |
2384 | { | 2989 | { |
2385 | if (ftrace_filter_buf[0]) | 2990 | if (ftrace_filter_buf[0]) |
2386 | set_ftrace_early_filter(ftrace_filter_buf, 1); | 2991 | set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); |
2387 | if (ftrace_notrace_buf[0]) | 2992 | if (ftrace_notrace_buf[0]) |
2388 | set_ftrace_early_filter(ftrace_notrace_buf, 0); | 2993 | set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); |
2389 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 2994 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
2390 | if (ftrace_graph_buf[0]) | 2995 | if (ftrace_graph_buf[0]) |
2391 | set_ftrace_early_graph(ftrace_graph_buf); | 2996 | set_ftrace_early_graph(ftrace_graph_buf); |
@@ -2393,11 +2998,14 @@ static void __init set_ftrace_early_filters(void) | |||
2393 | } | 2998 | } |
2394 | 2999 | ||
2395 | static int | 3000 | static int |
2396 | ftrace_regex_release(struct inode *inode, struct file *file, int enable) | 3001 | ftrace_regex_release(struct inode *inode, struct file *file) |
2397 | { | 3002 | { |
2398 | struct seq_file *m = (struct seq_file *)file->private_data; | 3003 | struct seq_file *m = (struct seq_file *)file->private_data; |
2399 | struct ftrace_iterator *iter; | 3004 | struct ftrace_iterator *iter; |
3005 | struct ftrace_hash **orig_hash; | ||
2400 | struct trace_parser *parser; | 3006 | struct trace_parser *parser; |
3007 | int filter_hash; | ||
3008 | int ret; | ||
2401 | 3009 | ||
2402 | mutex_lock(&ftrace_regex_lock); | 3010 | mutex_lock(&ftrace_regex_lock); |
2403 | if (file->f_mode & FMODE_READ) { | 3011 | if (file->f_mode & FMODE_READ) { |
@@ -2410,33 +3018,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) | |||
2410 | parser = &iter->parser; | 3018 | parser = &iter->parser; |
2411 | if (trace_parser_loaded(parser)) { | 3019 | if (trace_parser_loaded(parser)) { |
2412 | parser->buffer[parser->idx] = 0; | 3020 | parser->buffer[parser->idx] = 0; |
2413 | ftrace_match_records(parser->buffer, parser->idx, enable); | 3021 | ftrace_match_records(iter->hash, parser->buffer, parser->idx); |
2414 | } | 3022 | } |
2415 | 3023 | ||
2416 | mutex_lock(&ftrace_lock); | ||
2417 | if (ftrace_start_up && ftrace_enabled) | ||
2418 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
2419 | mutex_unlock(&ftrace_lock); | ||
2420 | |||
2421 | trace_parser_put(parser); | 3024 | trace_parser_put(parser); |
3025 | |||
3026 | if (file->f_mode & FMODE_WRITE) { | ||
3027 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | ||
3028 | |||
3029 | if (filter_hash) | ||
3030 | orig_hash = &iter->ops->filter_hash; | ||
3031 | else | ||
3032 | orig_hash = &iter->ops->notrace_hash; | ||
3033 | |||
3034 | mutex_lock(&ftrace_lock); | ||
3035 | /* | ||
3036 | * Remove the current set, update the hash and add | ||
3037 | * them back. | ||
3038 | */ | ||
3039 | ftrace_hash_rec_disable(iter->ops, filter_hash); | ||
3040 | ret = ftrace_hash_move(orig_hash, iter->hash); | ||
3041 | if (!ret) { | ||
3042 | ftrace_hash_rec_enable(iter->ops, filter_hash); | ||
3043 | if (iter->ops->flags & FTRACE_OPS_FL_ENABLED | ||
3044 | && ftrace_enabled) | ||
3045 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
3046 | } | ||
3047 | mutex_unlock(&ftrace_lock); | ||
3048 | } | ||
3049 | free_ftrace_hash(iter->hash); | ||
2422 | kfree(iter); | 3050 | kfree(iter); |
2423 | 3051 | ||
2424 | mutex_unlock(&ftrace_regex_lock); | 3052 | mutex_unlock(&ftrace_regex_lock); |
2425 | return 0; | 3053 | return 0; |
2426 | } | 3054 | } |
2427 | 3055 | ||
2428 | static int | ||
2429 | ftrace_filter_release(struct inode *inode, struct file *file) | ||
2430 | { | ||
2431 | return ftrace_regex_release(inode, file, 1); | ||
2432 | } | ||
2433 | |||
2434 | static int | ||
2435 | ftrace_notrace_release(struct inode *inode, struct file *file) | ||
2436 | { | ||
2437 | return ftrace_regex_release(inode, file, 0); | ||
2438 | } | ||
2439 | |||
2440 | static const struct file_operations ftrace_avail_fops = { | 3056 | static const struct file_operations ftrace_avail_fops = { |
2441 | .open = ftrace_avail_open, | 3057 | .open = ftrace_avail_open, |
2442 | .read = seq_read, | 3058 | .read = seq_read, |
@@ -2444,8 +3060,8 @@ static const struct file_operations ftrace_avail_fops = { | |||
2444 | .release = seq_release_private, | 3060 | .release = seq_release_private, |
2445 | }; | 3061 | }; |
2446 | 3062 | ||
2447 | static const struct file_operations ftrace_failures_fops = { | 3063 | static const struct file_operations ftrace_enabled_fops = { |
2448 | .open = ftrace_failures_open, | 3064 | .open = ftrace_enabled_open, |
2449 | .read = seq_read, | 3065 | .read = seq_read, |
2450 | .llseek = seq_lseek, | 3066 | .llseek = seq_lseek, |
2451 | .release = seq_release_private, | 3067 | .release = seq_release_private, |
@@ -2456,7 +3072,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
2456 | .read = seq_read, | 3072 | .read = seq_read, |
2457 | .write = ftrace_filter_write, | 3073 | .write = ftrace_filter_write, |
2458 | .llseek = ftrace_regex_lseek, | 3074 | .llseek = ftrace_regex_lseek, |
2459 | .release = ftrace_filter_release, | 3075 | .release = ftrace_regex_release, |
2460 | }; | 3076 | }; |
2461 | 3077 | ||
2462 | static const struct file_operations ftrace_notrace_fops = { | 3078 | static const struct file_operations ftrace_notrace_fops = { |
@@ -2464,7 +3080,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
2464 | .read = seq_read, | 3080 | .read = seq_read, |
2465 | .write = ftrace_notrace_write, | 3081 | .write = ftrace_notrace_write, |
2466 | .llseek = ftrace_regex_lseek, | 3082 | .llseek = ftrace_regex_lseek, |
2467 | .release = ftrace_notrace_release, | 3083 | .release = ftrace_regex_release, |
2468 | }; | 3084 | }; |
2469 | 3085 | ||
2470 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3086 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
@@ -2573,9 +3189,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2573 | bool exists; | 3189 | bool exists; |
2574 | int i; | 3190 | int i; |
2575 | 3191 | ||
2576 | if (ftrace_disabled) | ||
2577 | return -ENODEV; | ||
2578 | |||
2579 | /* decode regex */ | 3192 | /* decode regex */ |
2580 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 3193 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
2581 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) | 3194 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
@@ -2584,12 +3197,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
2584 | search_len = strlen(search); | 3197 | search_len = strlen(search); |
2585 | 3198 | ||
2586 | mutex_lock(&ftrace_lock); | 3199 | mutex_lock(&ftrace_lock); |
3200 | |||
3201 | if (unlikely(ftrace_disabled)) { | ||
3202 | mutex_unlock(&ftrace_lock); | ||
3203 | return -ENODEV; | ||
3204 | } | ||
3205 | |||
2587 | do_for_each_ftrace_rec(pg, rec) { | 3206 | do_for_each_ftrace_rec(pg, rec) { |
2588 | 3207 | ||
2589 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 3208 | if (rec->flags & FTRACE_FL_FREE) |
2590 | continue; | 3209 | continue; |
2591 | 3210 | ||
2592 | if (ftrace_match_record(rec, search, search_len, type)) { | 3211 | if (ftrace_match_record(rec, NULL, search, search_len, type)) { |
2593 | /* if it is in the array */ | 3212 | /* if it is in the array */ |
2594 | exists = false; | 3213 | exists = false; |
2595 | for (i = 0; i < *idx; i++) { | 3214 | for (i = 0; i < *idx; i++) { |
@@ -2679,8 +3298,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
2679 | trace_create_file("available_filter_functions", 0444, | 3298 | trace_create_file("available_filter_functions", 0444, |
2680 | d_tracer, NULL, &ftrace_avail_fops); | 3299 | d_tracer, NULL, &ftrace_avail_fops); |
2681 | 3300 | ||
2682 | trace_create_file("failures", 0444, | 3301 | trace_create_file("enabled_functions", 0444, |
2683 | d_tracer, NULL, &ftrace_failures_fops); | 3302 | d_tracer, NULL, &ftrace_enabled_fops); |
2684 | 3303 | ||
2685 | trace_create_file("set_ftrace_filter", 0644, d_tracer, | 3304 | trace_create_file("set_ftrace_filter", 0644, d_tracer, |
2686 | NULL, &ftrace_filter_fops); | 3305 | NULL, &ftrace_filter_fops); |
@@ -2703,7 +3322,6 @@ static int ftrace_process_locs(struct module *mod, | |||
2703 | { | 3322 | { |
2704 | unsigned long *p; | 3323 | unsigned long *p; |
2705 | unsigned long addr; | 3324 | unsigned long addr; |
2706 | unsigned long flags; | ||
2707 | 3325 | ||
2708 | mutex_lock(&ftrace_lock); | 3326 | mutex_lock(&ftrace_lock); |
2709 | p = start; | 3327 | p = start; |
@@ -2720,10 +3338,7 @@ static int ftrace_process_locs(struct module *mod, | |||
2720 | ftrace_record_ip(addr); | 3338 | ftrace_record_ip(addr); |
2721 | } | 3339 | } |
2722 | 3340 | ||
2723 | /* disable interrupts to prevent kstop machine */ | ||
2724 | local_irq_save(flags); | ||
2725 | ftrace_update_code(mod); | 3341 | ftrace_update_code(mod); |
2726 | local_irq_restore(flags); | ||
2727 | mutex_unlock(&ftrace_lock); | 3342 | mutex_unlock(&ftrace_lock); |
2728 | 3343 | ||
2729 | return 0; | 3344 | return 0; |
@@ -2735,10 +3350,11 @@ void ftrace_release_mod(struct module *mod) | |||
2735 | struct dyn_ftrace *rec; | 3350 | struct dyn_ftrace *rec; |
2736 | struct ftrace_page *pg; | 3351 | struct ftrace_page *pg; |
2737 | 3352 | ||
3353 | mutex_lock(&ftrace_lock); | ||
3354 | |||
2738 | if (ftrace_disabled) | 3355 | if (ftrace_disabled) |
2739 | return; | 3356 | goto out_unlock; |
2740 | 3357 | ||
2741 | mutex_lock(&ftrace_lock); | ||
2742 | do_for_each_ftrace_rec(pg, rec) { | 3358 | do_for_each_ftrace_rec(pg, rec) { |
2743 | if (within_module_core(rec->ip, mod)) { | 3359 | if (within_module_core(rec->ip, mod)) { |
2744 | /* | 3360 | /* |
@@ -2749,6 +3365,7 @@ void ftrace_release_mod(struct module *mod) | |||
2749 | ftrace_free_rec(rec); | 3365 | ftrace_free_rec(rec); |
2750 | } | 3366 | } |
2751 | } while_for_each_ftrace_rec(); | 3367 | } while_for_each_ftrace_rec(); |
3368 | out_unlock: | ||
2752 | mutex_unlock(&ftrace_lock); | 3369 | mutex_unlock(&ftrace_lock); |
2753 | } | 3370 | } |
2754 | 3371 | ||
@@ -2835,6 +3452,10 @@ void __init ftrace_init(void) | |||
2835 | 3452 | ||
2836 | #else | 3453 | #else |
2837 | 3454 | ||
3455 | static struct ftrace_ops global_ops = { | ||
3456 | .func = ftrace_stub, | ||
3457 | }; | ||
3458 | |||
2838 | static int __init ftrace_nodyn_init(void) | 3459 | static int __init ftrace_nodyn_init(void) |
2839 | { | 3460 | { |
2840 | ftrace_enabled = 1; | 3461 | ftrace_enabled = 1; |
@@ -2845,12 +3466,38 @@ device_initcall(ftrace_nodyn_init); | |||
2845 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 3466 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
2846 | static inline void ftrace_startup_enable(int command) { } | 3467 | static inline void ftrace_startup_enable(int command) { } |
2847 | /* Keep as macros so we do not need to define the commands */ | 3468 | /* Keep as macros so we do not need to define the commands */ |
2848 | # define ftrace_startup(command) do { } while (0) | 3469 | # define ftrace_startup(ops, command) do { } while (0) |
2849 | # define ftrace_shutdown(command) do { } while (0) | 3470 | # define ftrace_shutdown(ops, command) do { } while (0) |
2850 | # define ftrace_startup_sysctl() do { } while (0) | 3471 | # define ftrace_startup_sysctl() do { } while (0) |
2851 | # define ftrace_shutdown_sysctl() do { } while (0) | 3472 | # define ftrace_shutdown_sysctl() do { } while (0) |
3473 | |||
3474 | static inline int | ||
3475 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
3476 | { | ||
3477 | return 1; | ||
3478 | } | ||
3479 | |||
2852 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 3480 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
2853 | 3481 | ||
3482 | static void | ||
3483 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | ||
3484 | { | ||
3485 | struct ftrace_ops *op; | ||
3486 | |||
3487 | /* | ||
3488 | * Some of the ops may be dynamically allocated, | ||
3489 | * they must be freed after a synchronize_sched(). | ||
3490 | */ | ||
3491 | preempt_disable_notrace(); | ||
3492 | op = rcu_dereference_raw(ftrace_ops_list); | ||
3493 | while (op != &ftrace_list_end) { | ||
3494 | if (ftrace_ops_test(op, ip)) | ||
3495 | op->func(ip, parent_ip); | ||
3496 | op = rcu_dereference_raw(op->next); | ||
3497 | }; | ||
3498 | preempt_enable_notrace(); | ||
3499 | } | ||
3500 | |||
2854 | static void clear_ftrace_swapper(void) | 3501 | static void clear_ftrace_swapper(void) |
2855 | { | 3502 | { |
2856 | struct task_struct *p; | 3503 | struct task_struct *p; |
@@ -3143,19 +3790,23 @@ void ftrace_kill(void) | |||
3143 | */ | 3790 | */ |
3144 | int register_ftrace_function(struct ftrace_ops *ops) | 3791 | int register_ftrace_function(struct ftrace_ops *ops) |
3145 | { | 3792 | { |
3146 | int ret; | 3793 | int ret = -1; |
3147 | |||
3148 | if (unlikely(ftrace_disabled)) | ||
3149 | return -1; | ||
3150 | 3794 | ||
3151 | mutex_lock(&ftrace_lock); | 3795 | mutex_lock(&ftrace_lock); |
3152 | 3796 | ||
3797 | if (unlikely(ftrace_disabled)) | ||
3798 | goto out_unlock; | ||
3799 | |||
3153 | ret = __register_ftrace_function(ops); | 3800 | ret = __register_ftrace_function(ops); |
3154 | ftrace_startup(0); | 3801 | if (!ret) |
3802 | ftrace_startup(ops, 0); | ||
3155 | 3803 | ||
3804 | |||
3805 | out_unlock: | ||
3156 | mutex_unlock(&ftrace_lock); | 3806 | mutex_unlock(&ftrace_lock); |
3157 | return ret; | 3807 | return ret; |
3158 | } | 3808 | } |
3809 | EXPORT_SYMBOL_GPL(register_ftrace_function); | ||
3159 | 3810 | ||
3160 | /** | 3811 | /** |
3161 | * unregister_ftrace_function - unregister a function for profiling. | 3812 | * unregister_ftrace_function - unregister a function for profiling. |
@@ -3169,25 +3820,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops) | |||
3169 | 3820 | ||
3170 | mutex_lock(&ftrace_lock); | 3821 | mutex_lock(&ftrace_lock); |
3171 | ret = __unregister_ftrace_function(ops); | 3822 | ret = __unregister_ftrace_function(ops); |
3172 | ftrace_shutdown(0); | 3823 | if (!ret) |
3824 | ftrace_shutdown(ops, 0); | ||
3173 | mutex_unlock(&ftrace_lock); | 3825 | mutex_unlock(&ftrace_lock); |
3174 | 3826 | ||
3175 | return ret; | 3827 | return ret; |
3176 | } | 3828 | } |
3829 | EXPORT_SYMBOL_GPL(unregister_ftrace_function); | ||
3177 | 3830 | ||
3178 | int | 3831 | int |
3179 | ftrace_enable_sysctl(struct ctl_table *table, int write, | 3832 | ftrace_enable_sysctl(struct ctl_table *table, int write, |
3180 | void __user *buffer, size_t *lenp, | 3833 | void __user *buffer, size_t *lenp, |
3181 | loff_t *ppos) | 3834 | loff_t *ppos) |
3182 | { | 3835 | { |
3183 | int ret; | 3836 | int ret = -ENODEV; |
3184 | |||
3185 | if (unlikely(ftrace_disabled)) | ||
3186 | return -ENODEV; | ||
3187 | 3837 | ||
3188 | mutex_lock(&ftrace_lock); | 3838 | mutex_lock(&ftrace_lock); |
3189 | 3839 | ||
3190 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 3840 | if (unlikely(ftrace_disabled)) |
3841 | goto out; | ||
3842 | |||
3843 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
3191 | 3844 | ||
3192 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) | 3845 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) |
3193 | goto out; | 3846 | goto out; |
@@ -3199,11 +3852,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
3199 | ftrace_startup_sysctl(); | 3852 | ftrace_startup_sysctl(); |
3200 | 3853 | ||
3201 | /* we are starting ftrace again */ | 3854 | /* we are starting ftrace again */ |
3202 | if (ftrace_list != &ftrace_list_end) { | 3855 | if (ftrace_ops_list != &ftrace_list_end) { |
3203 | if (ftrace_list->next == &ftrace_list_end) | 3856 | if (ftrace_ops_list->next == &ftrace_list_end) |
3204 | ftrace_trace_function = ftrace_list->func; | 3857 | ftrace_trace_function = ftrace_ops_list->func; |
3205 | else | 3858 | else |
3206 | ftrace_trace_function = ftrace_list_func; | 3859 | ftrace_trace_function = ftrace_ops_list_func; |
3207 | } | 3860 | } |
3208 | 3861 | ||
3209 | } else { | 3862 | } else { |
@@ -3392,7 +4045,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
3392 | ftrace_graph_return = retfunc; | 4045 | ftrace_graph_return = retfunc; |
3393 | ftrace_graph_entry = entryfunc; | 4046 | ftrace_graph_entry = entryfunc; |
3394 | 4047 | ||
3395 | ftrace_startup(FTRACE_START_FUNC_RET); | 4048 | ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); |
3396 | 4049 | ||
3397 | out: | 4050 | out: |
3398 | mutex_unlock(&ftrace_lock); | 4051 | mutex_unlock(&ftrace_lock); |
@@ -3409,7 +4062,7 @@ void unregister_ftrace_graph(void) | |||
3409 | ftrace_graph_active--; | 4062 | ftrace_graph_active--; |
3410 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 4063 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
3411 | ftrace_graph_entry = ftrace_graph_entry_stub; | 4064 | ftrace_graph_entry = ftrace_graph_entry_stub; |
3412 | ftrace_shutdown(FTRACE_STOP_FUNC_RET); | 4065 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
3413 | unregister_pm_notifier(&ftrace_suspend_notifier); | 4066 | unregister_pm_notifier(&ftrace_suspend_notifier); |
3414 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 4067 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
3415 | 4068 | ||
@@ -3425,7 +4078,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) | |||
3425 | atomic_set(&t->tracing_graph_pause, 0); | 4078 | atomic_set(&t->tracing_graph_pause, 0); |
3426 | atomic_set(&t->trace_overrun, 0); | 4079 | atomic_set(&t->trace_overrun, 0); |
3427 | t->ftrace_timestamp = 0; | 4080 | t->ftrace_timestamp = 0; |
3428 | /* make curr_ret_stack visable before we add the ret_stack */ | 4081 | /* make curr_ret_stack visible before we add the ret_stack */ |
3429 | smp_wmb(); | 4082 | smp_wmb(); |
3430 | t->ret_stack = ret_stack; | 4083 | t->ret_stack = ret_stack; |
3431 | } | 4084 | } |
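
The ftrace.c hunks above drop the per-record FTRACE_FL_FILTER/FTRACE_FL_NOTRACE flags in favor of per-ops filter and notrace hashes, and add ftrace_ops_list_func(), which walks every registered ftrace_ops and calls a handler only when that ops' filter accepts the instruction pointer. The user-space sketch below illustrates just that dispatch shape; sample_ops, ops_test() and list_func() are invented names for the example, and a plain array stands in for the kernel's ftrace_hash.

	/*
	 * Minimal sketch of per-ops filtered dispatch: each registered ops
	 * carries its own filter, and the list walker consults it before
	 * calling the handler. Illustrative only, not the kernel API.
	 */
	#include <stdio.h>
	#include <stdbool.h>

	struct sample_ops {
		void (*func)(unsigned long ip, unsigned long parent_ip);
		const unsigned long *filter;	/* NULL means "trace everything" */
		int nr_filter;
		struct sample_ops *next;
	};

	/* stand-in for ftrace_ops_test(): does this ops want this ip? */
	static bool ops_test(struct sample_ops *ops, unsigned long ip)
	{
		int i;

		if (!ops->filter)		/* empty filter: match all */
			return true;
		for (i = 0; i < ops->nr_filter; i++)
			if (ops->filter[i] == ip)
				return true;
		return false;
	}

	/* stand-in for ftrace_ops_list_func(): walk all ops, filter per ops */
	static void list_func(struct sample_ops *head,
			      unsigned long ip, unsigned long parent_ip)
	{
		struct sample_ops *op;

		for (op = head; op; op = op->next)
			if (ops_test(op, ip))
				op->func(ip, parent_ip);
	}

	static void handler_a(unsigned long ip, unsigned long pip)
	{
		printf("A traced %#lx\n", ip);
	}

	static void handler_b(unsigned long ip, unsigned long pip)
	{
		printf("B traced %#lx\n", ip);
	}

	int main(void)
	{
		unsigned long only_one[] = { 0x1000 };
		struct sample_ops b = { handler_b, only_one, 1, NULL };
		struct sample_ops a = { handler_a, NULL, 0, &b };

		list_func(&a, 0x1000, 0);	/* A and B both fire */
		list_func(&a, 0x2000, 0);	/* only A fires */
		return 0;
	}
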
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d9c8bcafb120..0ef7b4b2a1f7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -1478,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage) | |||
1478 | return local_read(&bpage->entries) & RB_WRITE_MASK; | 1478 | return local_read(&bpage->entries) & RB_WRITE_MASK; |
1479 | } | 1479 | } |
1480 | 1480 | ||
1481 | /* Size is determined by what has been commited */ | 1481 | /* Size is determined by what has been committed */ |
1482 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1482 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
1483 | { | 1483 | { |
1484 | return rb_page_commit(bpage); | 1484 | return rb_page_commit(bpage); |
@@ -2932,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
2932 | /* | 2932 | /* |
2933 | * cpu_buffer->pages just needs to point to the buffer, it | 2933 | * cpu_buffer->pages just needs to point to the buffer, it |
2934 | * has no specific buffer page to point to. Lets move it out | 2934 | * has no specific buffer page to point to. Lets move it out |
2935 | * of our way so we don't accidently swap it. | 2935 | * of our way so we don't accidentally swap it. |
2936 | */ | 2936 | */ |
2937 | cpu_buffer->pages = reader->list.prev; | 2937 | cpu_buffer->pages = reader->list.prev; |
2938 | 2938 | ||
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9541c27c1cf2..ee9c921d7f21 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, | |||
1110 | 1110 | ||
1111 | entry->preempt_count = pc & 0xff; | 1111 | entry->preempt_count = pc & 0xff; |
1112 | entry->pid = (tsk) ? tsk->pid : 0; | 1112 | entry->pid = (tsk) ? tsk->pid : 0; |
1113 | entry->padding = 0; | ||
1113 | entry->flags = | 1114 | entry->flags = |
1114 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT | 1115 | #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT |
1115 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | | 1116 | (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | |
@@ -2013,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
2013 | { | 2014 | { |
2014 | enum print_line_t ret; | 2015 | enum print_line_t ret; |
2015 | 2016 | ||
2016 | if (iter->lost_events) | 2017 | if (iter->lost_events && |
2017 | trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", | 2018 | !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", |
2018 | iter->cpu, iter->lost_events); | 2019 | iter->cpu, iter->lost_events)) |
2020 | return TRACE_TYPE_PARTIAL_LINE; | ||
2019 | 2021 | ||
2020 | if (iter->trace && iter->trace->print_line) { | 2022 | if (iter->trace && iter->trace->print_line) { |
2021 | ret = iter->trace->print_line(iter); | 2023 | ret = iter->trace->print_line(iter); |
@@ -3229,6 +3231,14 @@ waitagain: | |||
3229 | 3231 | ||
3230 | if (iter->seq.len >= cnt) | 3232 | if (iter->seq.len >= cnt) |
3231 | break; | 3233 | break; |
3234 | |||
3235 | /* | ||
3236 | * Setting the full flag means we reached the trace_seq buffer | ||
3237 | * size and we should leave by partial output condition above. | ||
3238 | * One of the trace_seq_* functions is not used properly. | ||
3239 | */ | ||
3240 | WARN_ONCE(iter->seq.full, "full flag set for trace type %d", | ||
3241 | iter->ent->type); | ||
3232 | } | 3242 | } |
3233 | trace_access_unlock(iter->cpu_file); | 3243 | trace_access_unlock(iter->cpu_file); |
3234 | trace_event_read_unlock(); | 3244 | trace_event_read_unlock(); |
@@ -3239,7 +3249,7 @@ waitagain: | |||
3239 | trace_seq_init(&iter->seq); | 3249 | trace_seq_init(&iter->seq); |
3240 | 3250 | ||
3241 | /* | 3251 | /* |
3242 | * If there was nothing to send to user, inspite of consuming trace | 3252 | * If there was nothing to send to user, in spite of consuming trace |
3243 | * entries, go back to wait for more entries. | 3253 | * entries, go back to wait for more entries. |
3244 | */ | 3254 | */ |
3245 | if (sret == -EBUSY) | 3255 | if (sret == -EBUSY) |
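
The trace.c changes above make print_trace_line() report a failed trace_seq_printf() of the lost-events banner as TRACE_TYPE_PARTIAL_LINE, and warn once if an output handler leaves the seq buffer's full flag set without signalling a partial line. The standalone sketch below only illustrates that convention of a bounded buffer whose writer reports failure instead of silently truncating; seq_buf and seq_printf() here are hypothetical stand-ins, not the kernel's trace_seq API.

	/*
	 * A writer into a fixed buffer returns 0 once the record no longer
	 * fits, and the caller treats that as "partial line" rather than
	 * accepting truncated output.
	 */
	#include <stdio.h>
	#include <stdarg.h>

	struct seq_buf {
		char buf[64];
		size_t len;
		int full;
	};

	/* returns 1 on success, 0 if the record did not fit (buffer full) */
	static int seq_printf(struct seq_buf *s, const char *fmt, ...)
	{
		va_list ap;
		int ret;

		if (s->full)
			return 0;

		va_start(ap, fmt);
		ret = vsnprintf(s->buf + s->len, sizeof(s->buf) - s->len, fmt, ap);
		va_end(ap);

		if (ret < 0 || (size_t)ret >= sizeof(s->buf) - s->len) {
			s->buf[s->len] = '\0';	/* discard the partial write */
			s->full = 1;
			return 0;
		}
		s->len += ret;
		return 1;
	}

	int main(void)
	{
		struct seq_buf s = { .len = 0, .full = 0 };
		int i;

		for (i = 0; i < 20; i++) {
			if (!seq_printf(&s, "event %d lost 0 ", i)) {
				printf("partial line at record %d\n", i);
				break;
			}
		}
		printf("wrote: %s\n", s.buf);
		return 0;
	}
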
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5e9dfc6286dd..6b69c4bd306f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]); | |||
419 | extern unsigned long ftrace_update_tot_cnt; | 419 | extern unsigned long ftrace_update_tot_cnt; |
420 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 420 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
421 | extern int DYN_FTRACE_TEST_NAME(void); | 421 | extern int DYN_FTRACE_TEST_NAME(void); |
422 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | ||
423 | extern int DYN_FTRACE_TEST_NAME2(void); | ||
422 | #endif | 424 | #endif |
423 | 425 | ||
424 | extern int ring_buffer_expanded; | 426 | extern int ring_buffer_expanded; |
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 685a67d55db0..6302747a1398 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c | |||
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void) | |||
46 | } | 46 | } |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * trace_clock(): 'inbetween' trace clock. Not completely serialized, | 49 | * trace_clock(): 'between' trace clock. Not completely serialized, |
50 | * but not completely incorrect when crossing CPUs either. | 50 | * but not completely incorrect when crossing CPUs either. |
51 | * | 51 | * |
52 | * This is based on cpu_clock(), which will allow at most ~1 jiffy of | 52 | * This is based on cpu_clock(), which will allow at most ~1 jiffy of |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 1516cb3ec549..e32744c84d94 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
@@ -27,7 +27,7 @@ | |||
27 | * in the structure. | 27 | * in the structure. |
28 | * | 28 | * |
29 | * * for structures within structures, the format of the internal | 29 | * * for structures within structures, the format of the internal |
30 | * structure is layed out. This allows the internal structure | 30 | * structure is laid out. This allows the internal structure |
31 | * to be deciphered for the format file. Although these macros | 31 | * to be deciphered for the format file. Although these macros |
32 | * may become out of sync with the internal structure, they | 32 | * may become out of sync with the internal structure, they |
33 | * will create a compile error if it happens. Since the | 33 | * will create a compile error if it happens. Since the |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e88f74fe1d4c..2fe110341359 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -116,6 +116,7 @@ static int trace_define_common_fields(void) | |||
116 | __common_field(unsigned char, flags); | 116 | __common_field(unsigned char, flags); |
117 | __common_field(unsigned char, preempt_count); | 117 | __common_field(unsigned char, preempt_count); |
118 | __common_field(int, pid); | 118 | __common_field(int, pid); |
119 | __common_field(int, padding); | ||
119 | 120 | ||
120 | return ret; | 121 | return ret; |
121 | } | 122 | } |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8f..8d0e1cc4e974 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
149 | static struct ftrace_ops trace_ops __read_mostly = | 149 | static struct ftrace_ops trace_ops __read_mostly = |
150 | { | 150 | { |
151 | .func = function_trace_call, | 151 | .func = function_trace_call, |
152 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
152 | }; | 153 | }; |
153 | 154 | ||
154 | static struct ftrace_ops trace_stack_ops __read_mostly = | 155 | static struct ftrace_ops trace_stack_ops __read_mostly = |
155 | { | 156 | { |
156 | .func = function_stack_trace_call, | 157 | .func = function_stack_trace_call, |
158 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
157 | }; | 159 | }; |
158 | 160 | ||
159 | /* Our two options */ | 161 | /* Our two options */ |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 76b05980225c..962cdb24ed81 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
905 | * | 905 | * |
906 | * returns 1 if | 906 | * returns 1 if |
907 | * - we are inside irq code | 907 | * - we are inside irq code |
908 | * - we just extered irq code | 908 | * - we just entered irq code |
909 | * | 909 | * |
910 | * returns 0 if | 910 | * returns 0 if |
911 | * - funcgraph-interrupts option is set | 911 | * - funcgraph-interrupts option is set |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 92b6e1e12d98..c77424be284d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = { | |||
80 | * skip the latency if the sequence has changed - some other section | 80 | * skip the latency if the sequence has changed - some other section |
81 | * did a maximum and could disturb our measurement with serial console | 81 | * did a maximum and could disturb our measurement with serial console |
82 | * printouts, etc. Truly coinciding maximum latencies should be rare | 82 | * printouts, etc. Truly coinciding maximum latencies should be rare |
83 | * and what happens together happens separately as well, so this doesnt | 83 | * and what happens together happens separately as well, so this doesn't |
84 | * decrease the validity of the maximum found: | 84 | * decrease the validity of the maximum found: |
85 | */ | 85 | */ |
86 | static __cacheline_aligned_in_smp unsigned long max_sequence; | 86 | static __cacheline_aligned_in_smp unsigned long max_sequence; |
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
153 | static struct ftrace_ops trace_ops __read_mostly = | 153 | static struct ftrace_ops trace_ops __read_mostly = |
154 | { | 154 | { |
155 | .func = irqsoff_tracer_call, | 155 | .func = irqsoff_tracer_call, |
156 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
156 | }; | 157 | }; |
157 | #endif /* CONFIG_FUNCTION_TRACER */ | 158 | #endif /* CONFIG_FUNCTION_TRACER */ |
158 | 159 | ||
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8435b43b1782..f925c45f0afa 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = { | |||
53 | "common_preempt_count", | 53 | "common_preempt_count", |
54 | "common_pid", | 54 | "common_pid", |
55 | "common_tgid", | 55 | "common_tgid", |
56 | "common_lock_depth", | ||
57 | FIELD_STRING_IP, | 56 | FIELD_STRING_IP, |
58 | FIELD_STRING_RETIP, | 57 | FIELD_STRING_RETIP, |
59 | FIELD_STRING_FUNC, | 58 | FIELD_STRING_FUNC, |
@@ -1839,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp) | |||
1839 | kfree(tp->call.print_fmt); | 1838 | kfree(tp->call.print_fmt); |
1840 | } | 1839 | } |
1841 | 1840 | ||
1842 | /* Make a debugfs interface for controling probe points */ | 1841 | /* Make a debugfs interface for controlling probe points */ |
1843 | static __init int init_kprobe_trace(void) | 1842 | static __init int init_kprobe_trace(void) |
1844 | { | 1843 | { |
1845 | struct dentry *d_tracer; | 1844 | struct dentry *d_tracer; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 456be9063c2d..cf535ccedc86 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -830,6 +830,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); | |||
830 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, | 830 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, |
831 | struct trace_event *event) | 831 | struct trace_event *event) |
832 | { | 832 | { |
833 | if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) | ||
834 | return TRACE_TYPE_PARTIAL_LINE; | ||
835 | |||
833 | return TRACE_TYPE_HANDLED; | 836 | return TRACE_TYPE_HANDLED; |
834 | } | 837 | } |
835 | 838 | ||
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..dff763b7baf1 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); | |||
32 | 32 | ||
33 | struct trace_bprintk_fmt { | 33 | struct trace_bprintk_fmt { |
34 | struct list_head list; | 34 | struct list_head list; |
35 | char fmt[0]; | 35 | const char *fmt; |
36 | }; | 36 | }; |
37 | 37 | ||
38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) | 38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) |
@@ -49,6 +49,7 @@ static | |||
49 | void hold_module_trace_bprintk_format(const char **start, const char **end) | 49 | void hold_module_trace_bprintk_format(const char **start, const char **end) |
50 | { | 50 | { |
51 | const char **iter; | 51 | const char **iter; |
52 | char *fmt; | ||
52 | 53 | ||
53 | mutex_lock(&btrace_mutex); | 54 | mutex_lock(&btrace_mutex); |
54 | for (iter = start; iter < end; iter++) { | 55 | for (iter = start; iter < end; iter++) { |
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
58 | continue; | 59 | continue; |
59 | } | 60 | } |
60 | 61 | ||
61 | tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) | 62 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); |
62 | + strlen(*iter) + 1, GFP_KERNEL); | 63 | if (tb_fmt) |
63 | if (tb_fmt) { | 64 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); |
65 | if (tb_fmt && fmt) { | ||
64 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); | 66 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); |
65 | strcpy(tb_fmt->fmt, *iter); | 67 | strcpy(fmt, *iter); |
68 | tb_fmt->fmt = fmt; | ||
66 | *iter = tb_fmt->fmt; | 69 | *iter = tb_fmt->fmt; |
67 | } else | 70 | } else { |
71 | kfree(tb_fmt); | ||
68 | *iter = NULL; | 72 | *iter = NULL; |
73 | } | ||
69 | } | 74 | } |
70 | mutex_unlock(&btrace_mutex); | 75 | mutex_unlock(&btrace_mutex); |
71 | } | 76 | } |
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, | |||
84 | return 0; | 89 | return 0; |
85 | } | 90 | } |
86 | 91 | ||
92 | /* | ||
93 | * The debugfs/tracing/printk_formats file maps the addresses with | ||
94 | * the ASCII formats that are used in the bprintk events in the | ||
95 | * buffer. For userspace tools to be able to decode the events from | ||
96 | * the buffer, they need to be able to map the address with the format. | ||
97 | * | ||
98 | * The addresses of the bprintk formats are in their own section | ||
99 | * __trace_printk_fmt. But for modules we copy them into a link list. | ||
100 | * The code to print the formats and their addresses passes around the | ||
101 | * address of the fmt string. If the fmt address passed into the seq | ||
102 | * functions is within the kernel core __trace_printk_fmt section, then | ||
103 | * it simply uses the next pointer in the list. | ||
104 | * | ||
105 | * When the fmt pointer is outside the kernel core __trace_printk_fmt | ||
106 | * section, then we need to read the link list pointers. The trick is | ||
107 | * we pass the address of the string to the seq function just like | ||
108 | * we do for the kernel core formats. To get back the structure that | ||
109 | * holds the format, we simply use containerof() and then go to the | ||
110 | * next format in the list. | ||
111 | */ | ||
112 | static const char ** | ||
113 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
114 | { | ||
115 | struct trace_bprintk_fmt *mod_fmt; | ||
116 | |||
117 | if (list_empty(&trace_bprintk_fmt_list)) | ||
118 | return NULL; | ||
119 | |||
120 | /* | ||
121 | * v will point to the address of the fmt record from t_next | ||
122 | * v will be NULL from t_start. | ||
123 | * If this is the first pointer or called from start | ||
124 | * then we need to walk the list. | ||
125 | */ | ||
126 | if (!v || start_index == *pos) { | ||
127 | struct trace_bprintk_fmt *p; | ||
128 | |||
129 | /* search the module list */ | ||
130 | list_for_each_entry(p, &trace_bprintk_fmt_list, list) { | ||
131 | if (start_index == *pos) | ||
132 | return &p->fmt; | ||
133 | start_index++; | ||
134 | } | ||
135 | /* pos > index */ | ||
136 | return NULL; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * v points to the address of the fmt field in the mod list | ||
141 | * structure that holds the module print format. | ||
142 | */ | ||
143 | mod_fmt = container_of(v, typeof(*mod_fmt), fmt); | ||
144 | if (mod_fmt->list.next == &trace_bprintk_fmt_list) | ||
145 | return NULL; | ||
146 | |||
147 | mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); | ||
148 | |||
149 | return &mod_fmt->fmt; | ||
150 | } | ||
151 | |||
152 | static void format_mod_start(void) | ||
153 | { | ||
154 | mutex_lock(&btrace_mutex); | ||
155 | } | ||
156 | |||
157 | static void format_mod_stop(void) | ||
158 | { | ||
159 | mutex_unlock(&btrace_mutex); | ||
160 | } | ||
161 | |||
87 | #else /* !CONFIG_MODULES */ | 162 | #else /* !CONFIG_MODULES */ |
88 | __init static int | 163 | __init static int |
89 | module_trace_bprintk_format_notify(struct notifier_block *self, | 164 | module_trace_bprintk_format_notify(struct notifier_block *self, |
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, | |||
91 | { | 166 | { |
92 | return 0; | 167 | return 0; |
93 | } | 168 | } |
169 | static inline const char ** | ||
170 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
171 | { | ||
172 | return NULL; | ||
173 | } | ||
174 | static inline void format_mod_start(void) { } | ||
175 | static inline void format_mod_stop(void) { } | ||
94 | #endif /* CONFIG_MODULES */ | 176 | #endif /* CONFIG_MODULES */ |
95 | 177 | ||
96 | 178 | ||
@@ -153,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) | |||
153 | } | 235 | } |
154 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); | 236 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); |
155 | 237 | ||
238 | static const char **find_next(void *v, loff_t *pos) | ||
239 | { | ||
240 | const char **fmt = v; | ||
241 | int start_index; | ||
242 | |||
243 | if (!fmt) | ||
244 | fmt = __start___trace_bprintk_fmt + *pos; | ||
245 | |||
246 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | ||
247 | |||
248 | if (*pos < start_index) | ||
249 | return fmt; | ||
250 | |||
251 | return find_next_mod_format(start_index, v, fmt, pos); | ||
252 | } | ||
253 | |||
156 | static void * | 254 | static void * |
157 | t_start(struct seq_file *m, loff_t *pos) | 255 | t_start(struct seq_file *m, loff_t *pos) |
158 | { | 256 | { |
159 | const char **fmt = __start___trace_bprintk_fmt + *pos; | 257 | format_mod_start(); |
160 | 258 | return find_next(NULL, pos); | |
161 | if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) | ||
162 | return NULL; | ||
163 | return fmt; | ||
164 | } | 259 | } |
165 | 260 | ||
166 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) | 261 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) |
167 | { | 262 | { |
168 | (*pos)++; | 263 | (*pos)++; |
169 | return t_start(m, pos); | 264 | return find_next(v, pos); |
170 | } | 265 | } |
171 | 266 | ||
172 | static int t_show(struct seq_file *m, void *v) | 267 | static int t_show(struct seq_file *m, void *v) |
@@ -205,6 +300,7 @@ static int t_show(struct seq_file *m, void *v) | |||
205 | 300 | ||
206 | static void t_stop(struct seq_file *m, void *p) | 301 | static void t_stop(struct seq_file *m, void *p) |
207 | { | 302 | { |
303 | format_mod_stop(); | ||
208 | } | 304 | } |
209 | 305 | ||
210 | static const struct seq_operations show_format_seq_ops = { | 306 | static const struct seq_operations show_format_seq_ops = { |
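
The comment block added to trace_printk.c above describes how find_next_mod_format() hands out the address of a structure's fmt member and later recovers the enclosing trace_bprintk_fmt with container_of() in order to step to the next entry in the module list. The small sketch below demonstrates just that pointer-to-member walk; fmt_node and next_fmt() are invented for the example, and a plain next pointer stands in for the kernel's list_head.

	/*
	 * Given only the address of the fmt member, recover the enclosing
	 * node with container_of() and advance to the next node's fmt.
	 */
	#include <stdio.h>
	#include <stddef.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct fmt_node {
		struct fmt_node *next;
		const char *fmt;
	};

	/* given a pointer to a fmt member, return a pointer to the next fmt */
	static const char **next_fmt(const char **fmt_ptr)
	{
		struct fmt_node *node = container_of(fmt_ptr, struct fmt_node, fmt);

		if (!node->next)
			return NULL;
		return &node->next->fmt;
	}

	int main(void)
	{
		struct fmt_node c = { NULL, "mod fmt three\n" };
		struct fmt_node b = { &c,   "mod fmt two\n" };
		struct fmt_node a = { &b,   "mod fmt one\n" };
		const char **p;

		/* walk the chain using only addresses of the fmt members */
		for (p = &a.fmt; p; p = next_fmt(p))
			printf("%s", *p);
		return 0;
	}
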
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559ed59f..f029dd4fd2ca 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
129 | static struct ftrace_ops trace_ops __read_mostly = | 129 | static struct ftrace_ops trace_ops __read_mostly = |
130 | { | 130 | { |
131 | .func = wakeup_tracer_call, | 131 | .func = wakeup_tracer_call, |
132 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
132 | }; | 133 | }; |
133 | #endif /* CONFIG_FUNCTION_TRACER */ | 134 | #endif /* CONFIG_FUNCTION_TRACER */ |
134 | 135 | ||
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 659732eba07c..288541f977fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) | |||
101 | 101 | ||
102 | #ifdef CONFIG_DYNAMIC_FTRACE | 102 | #ifdef CONFIG_DYNAMIC_FTRACE |
103 | 103 | ||
104 | static int trace_selftest_test_probe1_cnt; | ||
105 | static void trace_selftest_test_probe1_func(unsigned long ip, | ||
106 | unsigned long pip) | ||
107 | { | ||
108 | trace_selftest_test_probe1_cnt++; | ||
109 | } | ||
110 | |||
111 | static int trace_selftest_test_probe2_cnt; | ||
112 | static void trace_selftest_test_probe2_func(unsigned long ip, | ||
113 | unsigned long pip) | ||
114 | { | ||
115 | trace_selftest_test_probe2_cnt++; | ||
116 | } | ||
117 | |||
118 | static int trace_selftest_test_probe3_cnt; | ||
119 | static void trace_selftest_test_probe3_func(unsigned long ip, | ||
120 | unsigned long pip) | ||
121 | { | ||
122 | trace_selftest_test_probe3_cnt++; | ||
123 | } | ||
124 | |||
125 | static int trace_selftest_test_global_cnt; | ||
126 | static void trace_selftest_test_global_func(unsigned long ip, | ||
127 | unsigned long pip) | ||
128 | { | ||
129 | trace_selftest_test_global_cnt++; | ||
130 | } | ||
131 | |||
132 | static int trace_selftest_test_dyn_cnt; | ||
133 | static void trace_selftest_test_dyn_func(unsigned long ip, | ||
134 | unsigned long pip) | ||
135 | { | ||
136 | trace_selftest_test_dyn_cnt++; | ||
137 | } | ||
138 | |||
139 | static struct ftrace_ops test_probe1 = { | ||
140 | .func = trace_selftest_test_probe1_func, | ||
141 | }; | ||
142 | |||
143 | static struct ftrace_ops test_probe2 = { | ||
144 | .func = trace_selftest_test_probe2_func, | ||
145 | }; | ||
146 | |||
147 | static struct ftrace_ops test_probe3 = { | ||
148 | .func = trace_selftest_test_probe3_func, | ||
149 | }; | ||
150 | |||
151 | static struct ftrace_ops test_global = { | ||
152 | .func = trace_selftest_test_global_func, | ||
153 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
154 | }; | ||
155 | |||
156 | static void print_counts(void) | ||
157 | { | ||
158 | printk("(%d %d %d %d %d) ", | ||
159 | trace_selftest_test_probe1_cnt, | ||
160 | trace_selftest_test_probe2_cnt, | ||
161 | trace_selftest_test_probe3_cnt, | ||
162 | trace_selftest_test_global_cnt, | ||
163 | trace_selftest_test_dyn_cnt); | ||
164 | } | ||
165 | |||
166 | static void reset_counts(void) | ||
167 | { | ||
168 | trace_selftest_test_probe1_cnt = 0; | ||
169 | trace_selftest_test_probe2_cnt = 0; | ||
170 | trace_selftest_test_probe3_cnt = 0; | ||
171 | trace_selftest_test_global_cnt = 0; | ||
172 | trace_selftest_test_dyn_cnt = 0; | ||
173 | } | ||
174 | |||
175 | static int trace_selftest_ops(int cnt) | ||
176 | { | ||
177 | int save_ftrace_enabled = ftrace_enabled; | ||
178 | struct ftrace_ops *dyn_ops; | ||
179 | char *func1_name; | ||
180 | char *func2_name; | ||
181 | int len1; | ||
182 | int len2; | ||
183 | int ret = -1; | ||
184 | |||
185 | printk(KERN_CONT "PASSED\n"); | ||
186 | pr_info("Testing dynamic ftrace ops #%d: ", cnt); | ||
187 | |||
188 | ftrace_enabled = 1; | ||
189 | reset_counts(); | ||
190 | |||
191 | /* Handle PPC64 '.' name */ | ||
192 | func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
193 | func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); | ||
194 | len1 = strlen(func1_name); | ||
195 | len2 = strlen(func2_name); | ||
196 | |||
197 | /* | ||
198 | * Probe 1 will trace function 1. | ||
199 | * Probe 2 will trace function 2. | ||
200 | * Probe 3 will trace functions 1 and 2. | ||
201 | */ | ||
202 | ftrace_set_filter(&test_probe1, func1_name, len1, 1); | ||
203 | ftrace_set_filter(&test_probe2, func2_name, len2, 1); | ||
204 | ftrace_set_filter(&test_probe3, func1_name, len1, 1); | ||
205 | ftrace_set_filter(&test_probe3, func2_name, len2, 0); | ||
206 | |||
207 | register_ftrace_function(&test_probe1); | ||
208 | register_ftrace_function(&test_probe2); | ||
209 | register_ftrace_function(&test_probe3); | ||
210 | register_ftrace_function(&test_global); | ||
211 | |||
212 | DYN_FTRACE_TEST_NAME(); | ||
213 | |||
214 | print_counts(); | ||
215 | |||
216 | if (trace_selftest_test_probe1_cnt != 1) | ||
217 | goto out; | ||
218 | if (trace_selftest_test_probe2_cnt != 0) | ||
219 | goto out; | ||
220 | if (trace_selftest_test_probe3_cnt != 1) | ||
221 | goto out; | ||
222 | if (trace_selftest_test_global_cnt == 0) | ||
223 | goto out; | ||
224 | |||
225 | DYN_FTRACE_TEST_NAME2(); | ||
226 | |||
227 | print_counts(); | ||
228 | |||
229 | if (trace_selftest_test_probe1_cnt != 1) | ||
230 | goto out; | ||
231 | if (trace_selftest_test_probe2_cnt != 1) | ||
232 | goto out; | ||
233 | if (trace_selftest_test_probe3_cnt != 2) | ||
234 | goto out; | ||
235 | |||
236 | /* Add a dynamic probe */ | ||
237 | dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); | ||
238 | if (!dyn_ops) { | ||
239 | printk("MEMORY ERROR "); | ||
240 | goto out; | ||
241 | } | ||
242 | |||
243 | dyn_ops->func = trace_selftest_test_dyn_func; | ||
244 | |||
245 | register_ftrace_function(dyn_ops); | ||
246 | |||
247 | trace_selftest_test_global_cnt = 0; | ||
248 | |||
249 | DYN_FTRACE_TEST_NAME(); | ||
250 | |||
251 | print_counts(); | ||
252 | |||
253 | if (trace_selftest_test_probe1_cnt != 2) | ||
254 | goto out_free; | ||
255 | if (trace_selftest_test_probe2_cnt != 1) | ||
256 | goto out_free; | ||
257 | if (trace_selftest_test_probe3_cnt != 3) | ||
258 | goto out_free; | ||
259 | if (trace_selftest_test_global_cnt == 0) | ||
260 | goto out_free; | ||
261 | if (trace_selftest_test_dyn_cnt == 0) | ||
262 | goto out_free; | ||
263 | |||
264 | DYN_FTRACE_TEST_NAME2(); | ||
265 | |||
266 | print_counts(); | ||
267 | |||
268 | if (trace_selftest_test_probe1_cnt != 2) | ||
269 | goto out_free; | ||
270 | if (trace_selftest_test_probe2_cnt != 2) | ||
271 | goto out_free; | ||
272 | if (trace_selftest_test_probe3_cnt != 4) | ||
273 | goto out_free; | ||
274 | |||
275 | ret = 0; | ||
276 | out_free: | ||
277 | unregister_ftrace_function(dyn_ops); | ||
278 | kfree(dyn_ops); | ||
279 | |||
280 | out: | ||
281 | /* Purposely unregister in the same order */ | ||
282 | unregister_ftrace_function(&test_probe1); | ||
283 | unregister_ftrace_function(&test_probe2); | ||
284 | unregister_ftrace_function(&test_probe3); | ||
285 | unregister_ftrace_function(&test_global); | ||
286 | |||
287 | /* Make sure everything is off */ | ||
288 | reset_counts(); | ||
289 | DYN_FTRACE_TEST_NAME(); | ||
290 | DYN_FTRACE_TEST_NAME(); | ||
291 | |||
292 | if (trace_selftest_test_probe1_cnt || | ||
293 | trace_selftest_test_probe2_cnt || | ||
294 | trace_selftest_test_probe3_cnt || | ||
295 | trace_selftest_test_global_cnt || | ||
296 | trace_selftest_test_dyn_cnt) | ||
297 | ret = -1; | ||
298 | |||
299 | ftrace_enabled = save_ftrace_enabled; | ||
300 | |||
301 | return ret; | ||
302 | } | ||
303 | |||
104 | /* Test dynamic code modification and ftrace filters */ | 304 | /* Test dynamic code modification and ftrace filters */ |
105 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | 305 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, |
106 | struct trace_array *tr, | 306 | struct trace_array *tr, |
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
131 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 331 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
132 | 332 | ||
133 | /* filter only on our function */ | 333 | /* filter only on our function */ |
134 | ftrace_set_filter(func_name, strlen(func_name), 1); | 334 | ftrace_set_global_filter(func_name, strlen(func_name), 1); |
135 | 335 | ||
136 | /* enable tracing */ | 336 | /* enable tracing */ |
137 | ret = tracer_init(trace, tr); | 337 | ret = tracer_init(trace, tr); |
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
166 | 366 | ||
167 | /* check the trace buffer */ | 367 | /* check the trace buffer */ |
168 | ret = trace_test_buffer(tr, &count); | 368 | ret = trace_test_buffer(tr, &count); |
169 | trace->reset(tr); | ||
170 | tracing_start(); | 369 | tracing_start(); |
171 | 370 | ||
172 | /* we should only have one item */ | 371 | /* we should only have one item */ |
173 | if (!ret && count != 1) { | 372 | if (!ret && count != 1) { |
373 | trace->reset(tr); | ||
174 | printk(KERN_CONT ".. filter failed count=%ld ..", count); | 374 | printk(KERN_CONT ".. filter failed count=%ld ..", count); |
175 | ret = -1; | 375 | ret = -1; |
176 | goto out; | 376 | goto out; |
177 | } | 377 | } |
178 | 378 | ||
379 | /* Test the ops with global tracing running */ | ||
380 | ret = trace_selftest_ops(1); | ||
381 | trace->reset(tr); | ||
382 | |||
179 | out: | 383 | out: |
180 | ftrace_enabled = save_ftrace_enabled; | 384 | ftrace_enabled = save_ftrace_enabled; |
181 | tracer_enabled = save_tracer_enabled; | 385 | tracer_enabled = save_tracer_enabled; |
182 | 386 | ||
183 | /* Enable tracing on all functions again */ | 387 | /* Enable tracing on all functions again */ |
184 | ftrace_set_filter(NULL, 0, 1); | 388 | ftrace_set_global_filter(NULL, 0, 1); |
389 | |||
390 | /* Test the ops with global tracing off */ | ||
391 | if (!ret) | ||
392 | ret = trace_selftest_ops(2); | ||
185 | 393 | ||
186 | return ret; | 394 | return ret; |
187 | } | 395 | } |
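The selftest above exercises the split between per-ops and global filtering: ftrace_set_filter() now takes the ftrace_ops whose filter hash it should edit, while ftrace_set_global_filter() keeps the old catch-all behavior. A hedged sketch of the per-ops side is below, using the same call shape as the selftest; the my_* names and the "kmalloc*" pattern are hypothetical.

#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/init.h>

static void my_func_cb(unsigned long ip, unsigned long parent_ip)
{
	/* Hit only for functions matching this ops' own filter hash. */
}

static struct ftrace_ops my_ops = {
	.func = my_func_cb,
};

static int __init my_ops_init(void)
{
	char *pat = "kmalloc*";

	/* Edit this ops' filter only; the global filter is untouched. */
	ftrace_set_filter(&my_ops, pat, strlen(pat), 1);

	return register_ftrace_function(&my_ops);
}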
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5bf..b4c475a0a48b 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c | |||
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) | |||
5 | /* used to call mcount */ | 5 | /* used to call mcount */ |
6 | return 0; | 6 | return 0; |
7 | } | 7 | } |
8 | |||
9 | int DYN_FTRACE_TEST_NAME2(void) | ||
10 | { | ||
11 | /* used to call mcount */ | ||
12 | return 0; | ||
13 | } | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead0c239..b0b53b8e4c25 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
134 | { | 134 | { |
135 | .func = stack_trace_call, | 135 | .func = stack_trace_call, |
136 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
136 | }; | 137 | }; |
137 | 138 | ||
138 | static ssize_t | 139 | static ssize_t |
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af4889e..b219f1449c54 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
251 | { | 251 | { |
252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); | 252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); |
253 | 253 | ||
254 | if (elem->regfunc && !elem->state && active) | 254 | if (elem->regfunc && !jump_label_enabled(&elem->key) && active) |
255 | elem->regfunc(); | 255 | elem->regfunc(); |
256 | else if (elem->unregfunc && elem->state && !active) | 256 | else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) |
257 | elem->unregfunc(); | 257 | elem->unregfunc(); |
258 | 258 | ||
259 | /* | 259 | /* |
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
264 | * is used. | 264 | * is used. |
265 | */ | 265 | */ |
266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
267 | if (!elem->state && active) { | 267 | if (active && !jump_label_enabled(&elem->key)) |
268 | jump_label_enable(&elem->state); | 268 | jump_label_inc(&elem->key); |
269 | elem->state = active; | 269 | else if (!active && jump_label_enabled(&elem->key)) |
270 | } else if (elem->state && !active) { | 270 | jump_label_dec(&elem->key); |
271 | jump_label_disable(&elem->state); | ||
272 | elem->state = active; | ||
273 | } | ||
274 | } | 271 | } |
275 | 272 | ||
276 | /* | 273 | /* |
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
281 | */ | 278 | */ |
282 | static void disable_tracepoint(struct tracepoint *elem) | 279 | static void disable_tracepoint(struct tracepoint *elem) |
283 | { | 280 | { |
284 | if (elem->unregfunc && elem->state) | 281 | if (elem->unregfunc && jump_label_enabled(&elem->key)) |
285 | elem->unregfunc(); | 282 | elem->unregfunc(); |
286 | 283 | ||
287 | if (elem->state) { | 284 | if (jump_label_enabled(&elem->key)) |
288 | jump_label_disable(&elem->state); | 285 | jump_label_dec(&elem->key); |
289 | elem->state = 0; | ||
290 | } | ||
291 | rcu_assign_pointer(elem->funcs, NULL); | 286 | rcu_assign_pointer(elem->funcs, NULL); |
292 | } | 287 | } |
293 | 288 | ||
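The tracepoint conversion above drops the open-coded elem->state flag in favor of the refcounted jump-label API: jump_label_inc()/jump_label_dec() adjust the key's count and jump_label_enabled() tests it. A small sketch of that pattern follows; the struct jump_label_key type name is assumed from the API of this era, and my_key/my_feature_set are hypothetical.

#include <linux/jump_label.h>

static struct jump_label_key my_key;	/* type name assumed for this era */

static void my_feature_set(int active)
{
	/* Refcounted enable/disable, mirroring set_tracepoint() above. */
	if (active && !jump_label_enabled(&my_key))
		jump_label_inc(&my_key);	/* patch the static branch in  */
	else if (!active && jump_label_enabled(&my_key))
		jump_label_dec(&my_key);	/* patch the static branch out */
}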
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index eb27fd3430a2..92cb706c7fc8 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register); | |||
20 | 20 | ||
21 | /* | 21 | /* |
22 | * Removes a registered user return notifier. Must be called from atomic | 22 | * Removes a registered user return notifier. Must be called from atomic |
23 | * context, and from the same cpu registration occured in. | 23 | * context, and from the same cpu registration occurred in. |
24 | */ | 24 | */ |
25 | void user_return_notifier_unregister(struct user_return_notifier *urn) | 25 | void user_return_notifier_unregister(struct user_return_notifier *urn) |
26 | { | 26 | { |
diff --git a/kernel/wait.c b/kernel/wait.c index b0310eb6cc1e..f45ea8d2a1ce 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait); | |||
142 | * woken up through the queue. | 142 | * woken up through the queue. |
143 | * | 143 | * |
144 | * This prevents waiter starvation where an exclusive waiter | 144 | * This prevents waiter starvation where an exclusive waiter |
145 | * aborts and is woken up concurrently and noone wakes up | 145 | * aborts and is woken up concurrently and no one wakes up |
146 | * the next waiter. | 146 | * the next waiter. |
147 | */ | 147 | */ |
148 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, | 148 | void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 140dce750450..14733d4d156b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -430,9 +430,12 @@ static int watchdog_enable(int cpu) | |||
430 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); | 430 | p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); |
431 | if (IS_ERR(p)) { | 431 | if (IS_ERR(p)) { |
432 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); | 432 | printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); |
433 | if (!err) | 433 | if (!err) { |
434 | /* if hardlockup hasn't already set this */ | 434 | /* if hardlockup hasn't already set this */ |
435 | err = PTR_ERR(p); | 435 | err = PTR_ERR(p); |
436 | /* and disable the perf event */ | ||
437 | watchdog_nmi_disable(cpu); | ||
438 | } | ||
436 | goto out; | 439 | goto out; |
437 | } | 440 | } |
438 | kthread_bind(p, cpu); | 441 | kthread_bind(p, cpu); |
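The watchdog fix above makes the error path symmetric: when the softlockup kthread cannot be created, the hard-lockup perf event that was already enabled is torn down as well. A hedged sketch of that unwind-on-failure shape follows, with demo_* stand-ins rather than the real watchdog helpers.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

/* Stand-ins for the real watchdog_nmi_enable()/watchdog_nmi_disable(). */
static int demo_nmi_enable(int cpu)  { return 0; }
static void demo_nmi_disable(int cpu) { }

static int demo_thread_fn(void *data)
{
	return 0;
}

static int demo_enable(int cpu)
{
	struct task_struct *p;
	int err;

	err = demo_nmi_enable(cpu);		/* resource A: the perf NMI event */
	if (err)
		return err;

	p = kthread_create(demo_thread_fn, NULL, "demo/%d", cpu); /* resource B */
	if (IS_ERR(p)) {
		demo_nmi_disable(cpu);		/* unwind A when B fails */
		return PTR_ERR(p);
	}

	kthread_bind(p, cpu);
	wake_up_process(p);
	return 0;
}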
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04ef830690ec..e3378e8d3a5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -1291,8 +1291,14 @@ __acquires(&gcwq->lock) | |||
1291 | return true; | 1291 | return true; |
1292 | spin_unlock_irq(&gcwq->lock); | 1292 | spin_unlock_irq(&gcwq->lock); |
1293 | 1293 | ||
1294 | /* CPU has come up inbetween, retry migration */ | 1294 | /* |
1295 | * We've raced with CPU hot[un]plug. Give it a breather | ||
1296 | * and retry migration. cond_resched() is required here; | ||
1297 | * otherwise, we might deadlock against cpu_stop trying to | ||
1298 | * bring down the CPU on a non-preemptive kernel. | ||
1299 | */ | ||
1295 | cpu_relax(); | 1300 | cpu_relax(); |
1301 | cond_resched(); | ||
1296 | } | 1302 | } |
1297 | } | 1303 | } |
1298 | 1304 | ||
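The workqueue comment spells out why the busy retry needs both cpu_relax() and cond_resched(): without the voluntary reschedule, a non-preemptible kernel could never run the cpu_stop work the loop is waiting on. A minimal sketch of that retry idiom follows; demo_try() is a hypothetical stand-in for the migration attempt.

#include <linux/sched.h>	/* cond_resched() */
#include <linux/types.h>
#include <asm/processor.h>	/* cpu_relax() */

/* Hypothetical: one attempt at the operation; true when it succeeded. */
static bool demo_try(void)
{
	return true;
}

static void demo_retry_until_done(void)
{
	while (!demo_try()) {
		/*
		 * Busy-wait politely: cpu_relax() hints the spin to the
		 * hardware, cond_resched() yields so that cpu_stop (or any
		 * other work we depend on) can still run on a
		 * non-preemptible kernel instead of deadlocking behind us.
		 */
		cpu_relax();
		cond_resched();
	}
}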