Diffstat (limited to 'kernel')
140 files changed, 11146 insertions, 5414 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 
 config MUTEX_SPIN_ON_OWNER
-def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf987b95b356..24e7cb0ba26a 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY
 
 config PREEMPT
 bool "Preemptible Kernel (Low-Latency Desktop)"
+select PREEMPT_COUNT
 help
 This option reduces the latency of the kernel by making
 all kernel code (that is not executing in a critical section)
@@ -52,3 +53,5 @@ config PREEMPT
 
 endchoice
 
+config PREEMPT_COUNT
+bool
\ No newline at end of file
diff --git a/kernel/Makefile b/kernel/Makefile
index 85cbfb31e73e..eca595e2fd52 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
-async.o range.o jump_label.o
+async.o range.o
 obj-y += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,11 +101,13 @@ obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
+obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -126,11 +126,10 @@ targets += config_data.gz
 $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 $(call if_changed,gzip)
 
-quiet_cmd_ikconfiggz = IKCFG $@
-cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
-$(call if_changed,ikconfiggz)
+$(call filechk,ikconfiggz)
 
 $(obj)/time.o: $(obj)/timeconst.h
 
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb913c77..d5fe7af0de2e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,12 +49,13 @@ asynchronous and synchronous parts of the kernel.
 */
 
 #include <linux/async.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
 #include <linux/module.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
-#include <asm/atomic.h>
 
 static async_cookie_t next_cookie = 1;
 
@@ -128,7 +129,8 @@ static void async_run_entry_fn(struct work_struct *work)
 
 /* 2) run (and print duration) */
 if (initcall_debug && system_state == SYSTEM_BOOTING) {
-printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
+printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+(long long)entry->cookie,
 entry->func, task_pid_nr(current));
 calltime = ktime_get();
 }
@@ -136,7 +138,7 @@ static void async_run_entry_fn(struct work_struct *work)
 if (initcall_debug && system_state == SYSTEM_BOOTING) {
 rettime = ktime_get();
 delta = ktime_sub(rettime, calltime);
-printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
 (long long)entry->cookie,
 entry->func,
 (long long)ktime_to_ns(delta) >> 10);
@@ -270,7 +272,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
 ktime_t starttime, delta, endtime;
 
 if (initcall_debug && system_state == SYSTEM_BOOTING) {
-printk("async_waiting @ %i\n", task_pid_nr(current));
+printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
 starttime = ktime_get();
 }
 
@@ -280,7 +282,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie,
 endtime = ktime_get();
 delta = ktime_sub(endtime, starttime);
 
-printk("async_continuing @ %i after %lli usec\n",
+printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
 task_pid_nr(current),
 (long long)ktime_to_ns(delta) >> 10);
 }
diff --git a/kernel/audit.c b/kernel/audit.c
index 939500317066..0a1355ca3d79 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -43,7 +43,7 @@
 
 #include <linux/init.h>
 #include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -55,6 +55,9 @@
 #include <net/sock.h>
 #include <net/netlink.h>
 #include <linux/skbuff.h>
+#ifdef CONFIG_SECURITY
+#include <linux/security.h>
+#endif
 #include <linux/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
@@ -1502,6 +1505,32 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 }
 }
 
+#ifdef CONFIG_SECURITY
+/**
+* audit_log_secctx - Converts and logs SELinux context
+* @ab: audit_buffer
+* @secid: security number
+*
+* This is a helper function that calls security_secid_to_secctx to convert
+* secid to secctx and then adds the (converted) SELinux context to the audit
+* log by calling audit_log_format, thus also preventing leak of internal secid
+* to userspace. If secid cannot be converted audit_panic is called.
+*/
+void audit_log_secctx(struct audit_buffer *ab, u32 secid)
+{
+u32 len;
+char *secctx;
+
+if (security_secid_to_secctx(secid, &secctx, &len)) {
+audit_panic("Cannot convert secid to context");
+} else {
+audit_log_format(ab, " obj=%s", secctx);
+security_release_secctx(secctx, len);
+}
+}
+EXPORT_SYMBOL(audit_log_secctx);
+#endif
+
 EXPORT_SYMBOL(audit_log_start);
 EXPORT_SYMBOL(audit_log_end);
 EXPORT_SYMBOL(audit_log_format);
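The exported audit_log_secctx() helper above is meant to be called while building an audit record. A minimal caller sketch follows; the record type, format string, and use of security_task_getsecid() are illustrative assumptions, not taken from this diff:

    struct audit_buffer *ab;
    u32 secid;

    security_task_getsecid(current, &secid);            /* obtain a secid to log (assumption) */
    ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_AVC);   /* record type chosen as an example */
    if (ab) {
        audit_log_format(ab, "op=example pid=%d", task_pid_nr(current));
        audit_log_secctx(ab, secid);   /* appends " obj=<context>" or calls audit_panic() */
        audit_log_end(ab);
    }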
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda04b126..5bf0790497e7 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -93,16 +93,10 @@ static inline void get_tree(struct audit_tree *tree)
 atomic_inc(&tree->count);
 }
 
-static void __put_tree(struct rcu_head *rcu)
-{
-struct audit_tree *tree = container_of(rcu, struct audit_tree, head);
-kfree(tree);
-}
-
 static inline void put_tree(struct audit_tree *tree)
 {
 if (atomic_dec_and_test(&tree->count))
-call_rcu(&tree->head, __put_tree);
+kfree_rcu(tree, head);
 }
 
 /* to avoid bringing the entire thing in audit.h */
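The hunk above is one of several kfree_rcu() conversions in this series: when an RCU callback does nothing but kfree() the enclosing object, kfree_rcu(ptr, rcu_member) replaces the open-coded call_rcu() plus callback. A generic sketch of the pattern, using a hypothetical structure rather than kernel code:

    struct foo {
        int data;
        struct rcu_head rcu;
    };

    static void foo_put(struct foo *f)
    {
        /* before: call_rcu(&f->rcu, foo_free_rcu) with a kfree-only callback */
        kfree_rcu(f, rcu);   /* frees f after a grace period; no callback needed */
    }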
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..ce4b054acee5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -44,7 +44,7 @@
 
 #include <linux/init.h>
 #include <asm/types.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/mm.h>
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
 
 /* Determine if any context name data matches a rule's watch data */
 /* Compare a task_struct with an audit_rule. Return 1 on match, 0
-* otherwise. */
+* otherwise.
+*
+* If task_creation is true, this is an explicit indication that we are
+* filtering a task rule at task creation time. This and tsk == current are
+* the only situations where tsk->cred may be accessed without an rcu read lock.
+*/
 static int audit_filter_rules(struct task_struct *tsk,
 struct audit_krule *rule,
 struct audit_context *ctx,
 struct audit_names *name,
-enum audit_state *state)
+enum audit_state *state,
+bool task_creation)
 {
-const struct cred *cred = get_task_cred(tsk);
+const struct cred *cred;
 int i, j, need_sid = 1;
 u32 sid;
 
+cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
 for (i = 0; i < rule->field_count; i++) {
 struct audit_field *f = &rule->fields[i];
 int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
 break;
 }
 
-if (!result) {
-put_cred(cred);
+if (!result)
 return 0;
-}
 }
 
 if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
 }
-put_cred(cred);
 return 1;
 }
 
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
 
 rcu_read_lock();
 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
-if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+&state, true)) {
 if (state == AUDIT_RECORD_CONTEXT)
 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
 rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
 list_for_each_entry_rcu(e, list, list) {
 if ((e->rule.mask[word] & bit) == bit &&
 audit_filter_rules(tsk, &e->rule, ctx, NULL,
-&state)) {
+&state, false)) {
 rcu_read_unlock();
 ctx->current_state = state;
 return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 
 list_for_each_entry_rcu(e, list, list) {
 if ((e->rule.mask[word] & bit) == bit &&
-audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+audit_filter_rules(tsk, &e->rule, ctx, n,
+&state, false)) {
 rcu_read_unlock();
 ctx->current_state = state;
 return;
diff --git a/kernel/capability.c b/kernel/capability.c
index bf0c734d0c12..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
 */
 
 const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
 
 EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
 
 int file_caps_enabled = 1;
 
@@ -399,3 +395,15 @@ bool task_ns_capable(struct task_struct *t, int cap)
 return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
 }
 EXPORT_SYMBOL(task_ns_capable);
+
+/**
+* nsown_capable - Check superior capability to one's own user_ns
+* @cap: The capability in question
+*
+* Return true if the current task has the given superior capability
+* targeted at its own user namespace.
+*/
+bool nsown_capable(int cap)
+{
+return ns_capable(current_user_ns(), cap);
+}
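nsown_capable() is a convenience wrapper over ns_capable() for checks that only concern the caller's own user namespace. A hypothetical caller, shown only to illustrate the intended use and not part of this diff:

    static int example_namespace_local_op(void)
    {
        /* privileged only within the caller's own user namespace */
        if (!nsown_capable(CAP_NET_ADMIN))
            return -EPERM;
        /* ...perform the namespace-local operation... */
        return 0;
    }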
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25c7eb52de1a..1d2b6ceea95d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -27,9 +27,11 @@ | |||
| 27 | */ | 27 | */ |
| 28 | 28 | ||
| 29 | #include <linux/cgroup.h> | 29 | #include <linux/cgroup.h> |
| 30 | #include <linux/cred.h> | ||
| 30 | #include <linux/ctype.h> | 31 | #include <linux/ctype.h> |
| 31 | #include <linux/errno.h> | 32 | #include <linux/errno.h> |
| 32 | #include <linux/fs.h> | 33 | #include <linux/fs.h> |
| 34 | #include <linux/init_task.h> | ||
| 33 | #include <linux/kernel.h> | 35 | #include <linux/kernel.h> |
| 34 | #include <linux/list.h> | 36 | #include <linux/list.h> |
| 35 | #include <linux/mm.h> | 37 | #include <linux/mm.h> |
| @@ -57,8 +59,9 @@ | |||
| 57 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ | 59 | #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
| 58 | #include <linux/eventfd.h> | 60 | #include <linux/eventfd.h> |
| 59 | #include <linux/poll.h> | 61 | #include <linux/poll.h> |
| 62 | #include <linux/flex_array.h> /* used in cgroup_attach_proc */ | ||
| 60 | 63 | ||
| 61 | #include <asm/atomic.h> | 64 | #include <linux/atomic.h> |
| 62 | 65 | ||
| 63 | static DEFINE_MUTEX(cgroup_mutex); | 66 | static DEFINE_MUTEX(cgroup_mutex); |
| 64 | 67 | ||
| @@ -326,12 +329,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) | |||
| 326 | return &css_set_table[index]; | 329 | return &css_set_table[index]; |
| 327 | } | 330 | } |
| 328 | 331 | ||
| 329 | static void free_css_set_rcu(struct rcu_head *obj) | ||
| 330 | { | ||
| 331 | struct css_set *cg = container_of(obj, struct css_set, rcu_head); | ||
| 332 | kfree(cg); | ||
| 333 | } | ||
| 334 | |||
| 335 | /* We don't maintain the lists running through each css_set to its | 332 | /* We don't maintain the lists running through each css_set to its |
| 336 | * task until after the first call to cgroup_iter_start(). This | 333 | * task until after the first call to cgroup_iter_start(). This |
| 337 | * reduces the fork()/exit() overhead for people who have cgroups | 334 | * reduces the fork()/exit() overhead for people who have cgroups |
| @@ -375,7 +372,7 @@ static void __put_css_set(struct css_set *cg, int taskexit) | |||
| 375 | } | 372 | } |
| 376 | 373 | ||
| 377 | write_unlock(&css_set_lock); | 374 | write_unlock(&css_set_lock); |
| 378 | call_rcu(&cg->rcu_head, free_css_set_rcu); | 375 | kfree_rcu(cg, rcu_head); |
| 379 | } | 376 | } |
| 380 | 377 | ||
| 381 | /* | 378 | /* |
| @@ -812,13 +809,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) | |||
| 812 | return ret; | 809 | return ret; |
| 813 | } | 810 | } |
| 814 | 811 | ||
| 815 | static void free_cgroup_rcu(struct rcu_head *obj) | ||
| 816 | { | ||
| 817 | struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head); | ||
| 818 | |||
| 819 | kfree(cgrp); | ||
| 820 | } | ||
| 821 | |||
| 822 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 812 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
| 823 | { | 813 | { |
| 824 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 814 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
| @@ -856,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
| 856 | */ | 846 | */ |
| 857 | BUG_ON(!list_empty(&cgrp->pidlists)); | 847 | BUG_ON(!list_empty(&cgrp->pidlists)); |
| 858 | 848 | ||
| 859 | call_rcu(&cgrp->rcu_head, free_cgroup_rcu); | 849 | kfree_rcu(cgrp, rcu_head); |
| 860 | } | 850 | } |
| 861 | iput(inode); | 851 | iput(inode); |
| 862 | } | 852 | } |
| @@ -1526,6 +1516,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1526 | struct cgroup *root_cgrp = &root->top_cgroup; | 1516 | struct cgroup *root_cgrp = &root->top_cgroup; |
| 1527 | struct inode *inode; | 1517 | struct inode *inode; |
| 1528 | struct cgroupfs_root *existing_root; | 1518 | struct cgroupfs_root *existing_root; |
| 1519 | const struct cred *cred; | ||
| 1529 | int i; | 1520 | int i; |
| 1530 | 1521 | ||
| 1531 | BUG_ON(sb->s_root != NULL); | 1522 | BUG_ON(sb->s_root != NULL); |
| @@ -1605,7 +1596,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1605 | BUG_ON(!list_empty(&root_cgrp->children)); | 1596 | BUG_ON(!list_empty(&root_cgrp->children)); |
| 1606 | BUG_ON(root->number_of_cgroups != 1); | 1597 | BUG_ON(root->number_of_cgroups != 1); |
| 1607 | 1598 | ||
| 1599 | cred = override_creds(&init_cred); | ||
| 1608 | cgroup_populate_dir(root_cgrp); | 1600 | cgroup_populate_dir(root_cgrp); |
| 1601 | revert_creds(cred); | ||
| 1609 | mutex_unlock(&cgroup_mutex); | 1602 | mutex_unlock(&cgroup_mutex); |
| 1610 | mutex_unlock(&inode->i_mutex); | 1603 | mutex_unlock(&inode->i_mutex); |
| 1611 | } else { | 1604 | } else { |
| @@ -1709,7 +1702,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
| 1709 | { | 1702 | { |
| 1710 | char *start; | 1703 | char *start; |
| 1711 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, | 1704 | struct dentry *dentry = rcu_dereference_check(cgrp->dentry, |
| 1712 | rcu_read_lock_held() || | ||
| 1713 | cgroup_lock_is_held()); | 1705 | cgroup_lock_is_held()); |
| 1714 | 1706 | ||
| 1715 | if (!dentry || cgrp == dummytop) { | 1707 | if (!dentry || cgrp == dummytop) { |
| @@ -1735,7 +1727,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
| 1735 | break; | 1727 | break; |
| 1736 | 1728 | ||
| 1737 | dentry = rcu_dereference_check(cgrp->dentry, | 1729 | dentry = rcu_dereference_check(cgrp->dentry, |
| 1738 | rcu_read_lock_held() || | ||
| 1739 | cgroup_lock_is_held()); | 1730 | cgroup_lock_is_held()); |
| 1740 | if (!cgrp->parent) | 1731 | if (!cgrp->parent) |
| 1741 | continue; | 1732 | continue; |
| @@ -1748,6 +1739,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) | |||
| 1748 | } | 1739 | } |
| 1749 | EXPORT_SYMBOL_GPL(cgroup_path); | 1740 | EXPORT_SYMBOL_GPL(cgroup_path); |
| 1750 | 1741 | ||
| 1742 | /* | ||
| 1743 | * cgroup_task_migrate - move a task from one cgroup to another. | ||
| 1744 | * | ||
| 1745 | * 'guarantee' is set if the caller promises that a new css_set for the task | ||
| 1746 | * will already exist. If not set, this function might sleep, and can fail with | ||
| 1747 | * -ENOMEM. Otherwise, it can only fail with -ESRCH. | ||
| 1748 | */ | ||
| 1749 | static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, | ||
| 1750 | struct task_struct *tsk, bool guarantee) | ||
| 1751 | { | ||
| 1752 | struct css_set *oldcg; | ||
| 1753 | struct css_set *newcg; | ||
| 1754 | |||
| 1755 | /* | ||
| 1756 | * get old css_set. we need to take task_lock and refcount it, because | ||
| 1757 | * an exiting task can change its css_set to init_css_set and drop its | ||
| 1758 | * old one without taking cgroup_mutex. | ||
| 1759 | */ | ||
| 1760 | task_lock(tsk); | ||
| 1761 | oldcg = tsk->cgroups; | ||
| 1762 | get_css_set(oldcg); | ||
| 1763 | task_unlock(tsk); | ||
| 1764 | |||
| 1765 | /* locate or allocate a new css_set for this task. */ | ||
| 1766 | if (guarantee) { | ||
| 1767 | /* we know the css_set we want already exists. */ | ||
| 1768 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | ||
| 1769 | read_lock(&css_set_lock); | ||
| 1770 | newcg = find_existing_css_set(oldcg, cgrp, template); | ||
| 1771 | BUG_ON(!newcg); | ||
| 1772 | get_css_set(newcg); | ||
| 1773 | read_unlock(&css_set_lock); | ||
| 1774 | } else { | ||
| 1775 | might_sleep(); | ||
| 1776 | /* find_css_set will give us newcg already referenced. */ | ||
| 1777 | newcg = find_css_set(oldcg, cgrp); | ||
| 1778 | if (!newcg) { | ||
| 1779 | put_css_set(oldcg); | ||
| 1780 | return -ENOMEM; | ||
| 1781 | } | ||
| 1782 | } | ||
| 1783 | put_css_set(oldcg); | ||
| 1784 | |||
| 1785 | /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ | ||
| 1786 | task_lock(tsk); | ||
| 1787 | if (tsk->flags & PF_EXITING) { | ||
| 1788 | task_unlock(tsk); | ||
| 1789 | put_css_set(newcg); | ||
| 1790 | return -ESRCH; | ||
| 1791 | } | ||
| 1792 | rcu_assign_pointer(tsk->cgroups, newcg); | ||
| 1793 | task_unlock(tsk); | ||
| 1794 | |||
| 1795 | /* Update the css_set linked lists if we're using them */ | ||
| 1796 | write_lock(&css_set_lock); | ||
| 1797 | if (!list_empty(&tsk->cg_list)) | ||
| 1798 | list_move(&tsk->cg_list, &newcg->tasks); | ||
| 1799 | write_unlock(&css_set_lock); | ||
| 1800 | |||
| 1801 | /* | ||
| 1802 | * We just gained a reference on oldcg by taking it from the task. As | ||
| 1803 | * trading it for newcg is protected by cgroup_mutex, we're safe to drop | ||
| 1804 | * it here; it will be freed under RCU. | ||
| 1805 | */ | ||
| 1806 | put_css_set(oldcg); | ||
| 1807 | |||
| 1808 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | ||
| 1809 | return 0; | ||
| 1810 | } | ||
| 1811 | |||
| 1751 | /** | 1812 | /** |
| 1752 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' | 1813 | * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' |
| 1753 | * @cgrp: the cgroup the task is attaching to | 1814 | * @cgrp: the cgroup the task is attaching to |
| @@ -1758,11 +1819,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); | |||
| 1758 | */ | 1819 | */ |
| 1759 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1820 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
| 1760 | { | 1821 | { |
| 1761 | int retval = 0; | 1822 | int retval; |
| 1762 | struct cgroup_subsys *ss, *failed_ss = NULL; | 1823 | struct cgroup_subsys *ss, *failed_ss = NULL; |
| 1763 | struct cgroup *oldcgrp; | 1824 | struct cgroup *oldcgrp; |
| 1764 | struct css_set *cg; | ||
| 1765 | struct css_set *newcg; | ||
| 1766 | struct cgroupfs_root *root = cgrp->root; | 1825 | struct cgroupfs_root *root = cgrp->root; |
| 1767 | 1826 | ||
| 1768 | /* Nothing to do if the task is already in that cgroup */ | 1827 | /* Nothing to do if the task is already in that cgroup */ |
| @@ -1772,7 +1831,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1772 | 1831 | ||
| 1773 | for_each_subsys(root, ss) { | 1832 | for_each_subsys(root, ss) { |
| 1774 | if (ss->can_attach) { | 1833 | if (ss->can_attach) { |
| 1775 | retval = ss->can_attach(ss, cgrp, tsk, false); | 1834 | retval = ss->can_attach(ss, cgrp, tsk); |
| 1776 | if (retval) { | 1835 | if (retval) { |
| 1777 | /* | 1836 | /* |
| 1778 | * Remember on which subsystem the can_attach() | 1837 | * Remember on which subsystem the can_attach() |
| @@ -1784,46 +1843,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1784 | goto out; | 1843 | goto out; |
| 1785 | } | 1844 | } |
| 1786 | } | 1845 | } |
| 1846 | if (ss->can_attach_task) { | ||
| 1847 | retval = ss->can_attach_task(cgrp, tsk); | ||
| 1848 | if (retval) { | ||
| 1849 | failed_ss = ss; | ||
| 1850 | goto out; | ||
| 1851 | } | ||
| 1852 | } | ||
| 1787 | } | 1853 | } |
| 1788 | 1854 | ||
| 1789 | task_lock(tsk); | 1855 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); |
| 1790 | cg = tsk->cgroups; | 1856 | if (retval) |
| 1791 | get_css_set(cg); | ||
| 1792 | task_unlock(tsk); | ||
| 1793 | /* | ||
| 1794 | * Locate or allocate a new css_set for this task, | ||
| 1795 | * based on its final set of cgroups | ||
| 1796 | */ | ||
| 1797 | newcg = find_css_set(cg, cgrp); | ||
| 1798 | put_css_set(cg); | ||
| 1799 | if (!newcg) { | ||
| 1800 | retval = -ENOMEM; | ||
| 1801 | goto out; | 1857 | goto out; |
| 1802 | } | ||
| 1803 | |||
| 1804 | task_lock(tsk); | ||
| 1805 | if (tsk->flags & PF_EXITING) { | ||
| 1806 | task_unlock(tsk); | ||
| 1807 | put_css_set(newcg); | ||
| 1808 | retval = -ESRCH; | ||
| 1809 | goto out; | ||
| 1810 | } | ||
| 1811 | rcu_assign_pointer(tsk->cgroups, newcg); | ||
| 1812 | task_unlock(tsk); | ||
| 1813 | |||
| 1814 | /* Update the css_set linked lists if we're using them */ | ||
| 1815 | write_lock(&css_set_lock); | ||
| 1816 | if (!list_empty(&tsk->cg_list)) | ||
| 1817 | list_move(&tsk->cg_list, &newcg->tasks); | ||
| 1818 | write_unlock(&css_set_lock); | ||
| 1819 | 1858 | ||
| 1820 | for_each_subsys(root, ss) { | 1859 | for_each_subsys(root, ss) { |
| 1860 | if (ss->pre_attach) | ||
| 1861 | ss->pre_attach(cgrp); | ||
| 1862 | if (ss->attach_task) | ||
| 1863 | ss->attach_task(cgrp, tsk); | ||
| 1821 | if (ss->attach) | 1864 | if (ss->attach) |
| 1822 | ss->attach(ss, cgrp, oldcgrp, tsk, false); | 1865 | ss->attach(ss, cgrp, oldcgrp, tsk); |
| 1823 | } | 1866 | } |
| 1824 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1867 | |
| 1825 | synchronize_rcu(); | 1868 | synchronize_rcu(); |
| 1826 | put_css_set(cg); | ||
| 1827 | 1869 | ||
| 1828 | /* | 1870 | /* |
| 1829 | * wake up rmdir() waiter. the rmdir should fail since the cgroup | 1871 | * wake up rmdir() waiter. the rmdir should fail since the cgroup |
| @@ -1842,7 +1884,7 @@ out: | |||
| 1842 | */ | 1884 | */ |
| 1843 | break; | 1885 | break; |
| 1844 | if (ss->cancel_attach) | 1886 | if (ss->cancel_attach) |
| 1845 | ss->cancel_attach(ss, cgrp, tsk, false); | 1887 | ss->cancel_attach(ss, cgrp, tsk); |
| 1846 | } | 1888 | } |
| 1847 | } | 1889 | } |
| 1848 | return retval; | 1890 | return retval; |
| @@ -1873,49 +1915,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) | |||
| 1873 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); | 1915 | EXPORT_SYMBOL_GPL(cgroup_attach_task_all); |
| 1874 | 1916 | ||
| 1875 | /* | 1917 | /* |
| 1876 | * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex | 1918 | * cgroup_attach_proc works in two stages, the first of which prefetches all |
| 1877 | * held. May take task_lock of task | 1919 | * new css_sets needed (to make sure we have enough memory before committing |
| 1920 | * to the move) and stores them in a list of entries of the following type. | ||
| 1921 | * TODO: possible optimization: use css_set->rcu_head for chaining instead | ||
| 1922 | */ | ||
| 1923 | struct cg_list_entry { | ||
| 1924 | struct css_set *cg; | ||
| 1925 | struct list_head links; | ||
| 1926 | }; | ||
| 1927 | |||
| 1928 | static bool css_set_check_fetched(struct cgroup *cgrp, | ||
| 1929 | struct task_struct *tsk, struct css_set *cg, | ||
| 1930 | struct list_head *newcg_list) | ||
| 1931 | { | ||
| 1932 | struct css_set *newcg; | ||
| 1933 | struct cg_list_entry *cg_entry; | ||
| 1934 | struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; | ||
| 1935 | |||
| 1936 | read_lock(&css_set_lock); | ||
| 1937 | newcg = find_existing_css_set(cg, cgrp, template); | ||
| 1938 | if (newcg) | ||
| 1939 | get_css_set(newcg); | ||
| 1940 | read_unlock(&css_set_lock); | ||
| 1941 | |||
| 1942 | /* doesn't exist at all? */ | ||
| 1943 | if (!newcg) | ||
| 1944 | return false; | ||
| 1945 | /* see if it's already in the list */ | ||
| 1946 | list_for_each_entry(cg_entry, newcg_list, links) { | ||
| 1947 | if (cg_entry->cg == newcg) { | ||
| 1948 | put_css_set(newcg); | ||
| 1949 | return true; | ||
| 1950 | } | ||
| 1951 | } | ||
| 1952 | |||
| 1953 | /* not found */ | ||
| 1954 | put_css_set(newcg); | ||
| 1955 | return false; | ||
| 1956 | } | ||
| 1957 | |||
| 1958 | /* | ||
| 1959 | * Find the new css_set and store it in the list in preparation for moving the | ||
| 1960 | * given task to the given cgroup. Returns 0 or -ENOMEM. | ||
| 1961 | */ | ||
| 1962 | static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, | ||
| 1963 | struct list_head *newcg_list) | ||
| 1964 | { | ||
| 1965 | struct css_set *newcg; | ||
| 1966 | struct cg_list_entry *cg_entry; | ||
| 1967 | |||
| 1968 | /* ensure a new css_set will exist for this thread */ | ||
| 1969 | newcg = find_css_set(cg, cgrp); | ||
| 1970 | if (!newcg) | ||
| 1971 | return -ENOMEM; | ||
| 1972 | /* add it to the list */ | ||
| 1973 | cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); | ||
| 1974 | if (!cg_entry) { | ||
| 1975 | put_css_set(newcg); | ||
| 1976 | return -ENOMEM; | ||
| 1977 | } | ||
| 1978 | cg_entry->cg = newcg; | ||
| 1979 | list_add(&cg_entry->links, newcg_list); | ||
| 1980 | return 0; | ||
| 1981 | } | ||
| 1982 | |||
| 1983 | /** | ||
| 1984 | * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup | ||
| 1985 | * @cgrp: the cgroup to attach to | ||
| 1986 | * @leader: the threadgroup leader task_struct of the group to be attached | ||
| 1987 | * | ||
| 1988 | * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will | ||
| 1989 | * take task_lock of each thread in leader's threadgroup individually in turn. | ||
| 1990 | */ | ||
| 1991 | int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) | ||
| 1992 | { | ||
| 1993 | int retval, i, group_size; | ||
| 1994 | struct cgroup_subsys *ss, *failed_ss = NULL; | ||
| 1995 | bool cancel_failed_ss = false; | ||
| 1996 | /* guaranteed to be initialized later, but the compiler needs this */ | ||
| 1997 | struct cgroup *oldcgrp = NULL; | ||
| 1998 | struct css_set *oldcg; | ||
| 1999 | struct cgroupfs_root *root = cgrp->root; | ||
| 2000 | /* threadgroup list cursor and array */ | ||
| 2001 | struct task_struct *tsk; | ||
| 2002 | struct flex_array *group; | ||
| 2003 | /* | ||
| 2004 | * we need to make sure we have css_sets for all the tasks we're | ||
| 2005 | * going to move -before- we actually start moving them, so that in | ||
| 2006 | * case we get an ENOMEM we can bail out before making any changes. | ||
| 2007 | */ | ||
| 2008 | struct list_head newcg_list; | ||
| 2009 | struct cg_list_entry *cg_entry, *temp_nobe; | ||
| 2010 | |||
| 2011 | /* | ||
| 2012 | * step 0: in order to do expensive, possibly blocking operations for | ||
| 2013 | * every thread, we cannot iterate the thread group list, since it needs | ||
| 2014 | * rcu or tasklist locked. instead, build an array of all threads in the | ||
| 2015 | * group - threadgroup_fork_lock prevents new threads from appearing, | ||
| 2016 | * and if threads exit, this will just be an over-estimate. | ||
| 2017 | */ | ||
| 2018 | group_size = get_nr_threads(leader); | ||
| 2019 | /* flex_array supports very large thread-groups better than kmalloc. */ | ||
| 2020 | group = flex_array_alloc(sizeof(struct task_struct *), group_size, | ||
| 2021 | GFP_KERNEL); | ||
| 2022 | if (!group) | ||
| 2023 | return -ENOMEM; | ||
| 2024 | /* pre-allocate to guarantee space while iterating in rcu read-side. */ | ||
| 2025 | retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); | ||
| 2026 | if (retval) | ||
| 2027 | goto out_free_group_list; | ||
| 2028 | |||
| 2029 | /* prevent changes to the threadgroup list while we take a snapshot. */ | ||
| 2030 | rcu_read_lock(); | ||
| 2031 | if (!thread_group_leader(leader)) { | ||
| 2032 | /* | ||
| 2033 | * a race with de_thread from another thread's exec() may strip | ||
| 2034 | * us of our leadership, making while_each_thread unsafe to use | ||
| 2035 | * on this task. if this happens, there is no choice but to | ||
| 2036 | * throw this task away and try again (from cgroup_procs_write); | ||
| 2037 | * this is "double-double-toil-and-trouble-check locking". | ||
| 2038 | */ | ||
| 2039 | rcu_read_unlock(); | ||
| 2040 | retval = -EAGAIN; | ||
| 2041 | goto out_free_group_list; | ||
| 2042 | } | ||
| 2043 | /* take a reference on each task in the group to go in the array. */ | ||
| 2044 | tsk = leader; | ||
| 2045 | i = 0; | ||
| 2046 | do { | ||
| 2047 | /* as per above, nr_threads may decrease, but not increase. */ | ||
| 2048 | BUG_ON(i >= group_size); | ||
| 2049 | get_task_struct(tsk); | ||
| 2050 | /* | ||
| 2051 | * saying GFP_ATOMIC has no effect here because we did prealloc | ||
| 2052 | * earlier, but it's good form to communicate our expectations. | ||
| 2053 | */ | ||
| 2054 | retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); | ||
| 2055 | BUG_ON(retval != 0); | ||
| 2056 | i++; | ||
| 2057 | } while_each_thread(leader, tsk); | ||
| 2058 | /* remember the number of threads in the array for later. */ | ||
| 2059 | group_size = i; | ||
| 2060 | rcu_read_unlock(); | ||
| 2061 | |||
| 2062 | /* | ||
| 2063 | * step 1: check that we can legitimately attach to the cgroup. | ||
| 2064 | */ | ||
| 2065 | for_each_subsys(root, ss) { | ||
| 2066 | if (ss->can_attach) { | ||
| 2067 | retval = ss->can_attach(ss, cgrp, leader); | ||
| 2068 | if (retval) { | ||
| 2069 | failed_ss = ss; | ||
| 2070 | goto out_cancel_attach; | ||
| 2071 | } | ||
| 2072 | } | ||
| 2073 | /* a callback to be run on every thread in the threadgroup. */ | ||
| 2074 | if (ss->can_attach_task) { | ||
| 2075 | /* run on each task in the threadgroup. */ | ||
| 2076 | for (i = 0; i < group_size; i++) { | ||
| 2077 | tsk = flex_array_get_ptr(group, i); | ||
| 2078 | retval = ss->can_attach_task(cgrp, tsk); | ||
| 2079 | if (retval) { | ||
| 2080 | failed_ss = ss; | ||
| 2081 | cancel_failed_ss = true; | ||
| 2082 | goto out_cancel_attach; | ||
| 2083 | } | ||
| 2084 | } | ||
| 2085 | } | ||
| 2086 | } | ||
| 2087 | |||
| 2088 | /* | ||
| 2089 | * step 2: make sure css_sets exist for all threads to be migrated. | ||
| 2090 | * we use find_css_set, which allocates a new one if necessary. | ||
| 2091 | */ | ||
| 2092 | INIT_LIST_HEAD(&newcg_list); | ||
| 2093 | for (i = 0; i < group_size; i++) { | ||
| 2094 | tsk = flex_array_get_ptr(group, i); | ||
| 2095 | /* nothing to do if this task is already in the cgroup */ | ||
| 2096 | oldcgrp = task_cgroup_from_root(tsk, root); | ||
| 2097 | if (cgrp == oldcgrp) | ||
| 2098 | continue; | ||
| 2099 | /* get old css_set pointer */ | ||
| 2100 | task_lock(tsk); | ||
| 2101 | if (tsk->flags & PF_EXITING) { | ||
| 2102 | /* ignore this task if it's going away */ | ||
| 2103 | task_unlock(tsk); | ||
| 2104 | continue; | ||
| 2105 | } | ||
| 2106 | oldcg = tsk->cgroups; | ||
| 2107 | get_css_set(oldcg); | ||
| 2108 | task_unlock(tsk); | ||
| 2109 | /* see if the new one for us is already in the list? */ | ||
| 2110 | if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { | ||
| 2111 | /* was already there, nothing to do. */ | ||
| 2112 | put_css_set(oldcg); | ||
| 2113 | } else { | ||
| 2114 | /* we don't already have it. get new one. */ | ||
| 2115 | retval = css_set_prefetch(cgrp, oldcg, &newcg_list); | ||
| 2116 | put_css_set(oldcg); | ||
| 2117 | if (retval) | ||
| 2118 | goto out_list_teardown; | ||
| 2119 | } | ||
| 2120 | } | ||
| 2121 | |||
| 2122 | /* | ||
| 2123 | * step 3: now that we're guaranteed success wrt the css_sets, proceed | ||
| 2124 | * to move all tasks to the new cgroup, calling ss->attach_task for each | ||
| 2125 | * one along the way. there are no failure cases after here, so this is | ||
| 2126 | * the commit point. | ||
| 2127 | */ | ||
| 2128 | for_each_subsys(root, ss) { | ||
| 2129 | if (ss->pre_attach) | ||
| 2130 | ss->pre_attach(cgrp); | ||
| 2131 | } | ||
| 2132 | for (i = 0; i < group_size; i++) { | ||
| 2133 | tsk = flex_array_get_ptr(group, i); | ||
| 2134 | /* leave current thread as it is if it's already there */ | ||
| 2135 | oldcgrp = task_cgroup_from_root(tsk, root); | ||
| 2136 | if (cgrp == oldcgrp) | ||
| 2137 | continue; | ||
| 2138 | /* attach each task to each subsystem */ | ||
| 2139 | for_each_subsys(root, ss) { | ||
| 2140 | if (ss->attach_task) | ||
| 2141 | ss->attach_task(cgrp, tsk); | ||
| 2142 | } | ||
| 2143 | /* if the thread is PF_EXITING, it can just get skipped. */ | ||
| 2144 | retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); | ||
| 2145 | BUG_ON(retval != 0 && retval != -ESRCH); | ||
| 2146 | } | ||
| 2147 | /* nothing is sensitive to fork() after this point. */ | ||
| 2148 | |||
| 2149 | /* | ||
| 2150 | * step 4: do expensive, non-thread-specific subsystem callbacks. | ||
| 2151 | * TODO: if ever a subsystem needs to know the oldcgrp for each task | ||
| 2152 | * being moved, this call will need to be reworked to communicate that. | ||
| 2153 | */ | ||
| 2154 | for_each_subsys(root, ss) { | ||
| 2155 | if (ss->attach) | ||
| 2156 | ss->attach(ss, cgrp, oldcgrp, leader); | ||
| 2157 | } | ||
| 2158 | |||
| 2159 | /* | ||
| 2160 | * step 5: success! and cleanup | ||
| 2161 | */ | ||
| 2162 | synchronize_rcu(); | ||
| 2163 | cgroup_wakeup_rmdir_waiter(cgrp); | ||
| 2164 | retval = 0; | ||
| 2165 | out_list_teardown: | ||
| 2166 | /* clean up the list of prefetched css_sets. */ | ||
| 2167 | list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { | ||
| 2168 | list_del(&cg_entry->links); | ||
| 2169 | put_css_set(cg_entry->cg); | ||
| 2170 | kfree(cg_entry); | ||
| 2171 | } | ||
| 2172 | out_cancel_attach: | ||
| 2173 | /* same deal as in cgroup_attach_task */ | ||
| 2174 | if (retval) { | ||
| 2175 | for_each_subsys(root, ss) { | ||
| 2176 | if (ss == failed_ss) { | ||
| 2177 | if (cancel_failed_ss && ss->cancel_attach) | ||
| 2178 | ss->cancel_attach(ss, cgrp, leader); | ||
| 2179 | break; | ||
| 2180 | } | ||
| 2181 | if (ss->cancel_attach) | ||
| 2182 | ss->cancel_attach(ss, cgrp, leader); | ||
| 2183 | } | ||
| 2184 | } | ||
| 2185 | /* clean up the array of referenced threads in the group. */ | ||
| 2186 | for (i = 0; i < group_size; i++) { | ||
| 2187 | tsk = flex_array_get_ptr(group, i); | ||
| 2188 | put_task_struct(tsk); | ||
| 2189 | } | ||
| 2190 | out_free_group_list: | ||
| 2191 | flex_array_free(group); | ||
| 2192 | return retval; | ||
| 2193 | } | ||
| 2194 | |||
| 2195 | /* | ||
| 2196 | * Find the task_struct of the task to attach by vpid and pass it along to the | ||
| 2197 | * function to attach either it or all tasks in its threadgroup. Will take | ||
| 2198 | * cgroup_mutex; may take task_lock of task. | ||
| 1878 | */ | 2199 | */ |
| 1879 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) | 2200 | static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) |
| 1880 | { | 2201 | { |
| 1881 | struct task_struct *tsk; | 2202 | struct task_struct *tsk; |
| 1882 | const struct cred *cred = current_cred(), *tcred; | 2203 | const struct cred *cred = current_cred(), *tcred; |
| 1883 | int ret; | 2204 | int ret; |
| 1884 | 2205 | ||
| 2206 | if (!cgroup_lock_live_group(cgrp)) | ||
| 2207 | return -ENODEV; | ||
| 2208 | |||
| 1885 | if (pid) { | 2209 | if (pid) { |
| 1886 | rcu_read_lock(); | 2210 | rcu_read_lock(); |
| 1887 | tsk = find_task_by_vpid(pid); | 2211 | tsk = find_task_by_vpid(pid); |
| 1888 | if (!tsk || tsk->flags & PF_EXITING) { | 2212 | if (!tsk) { |
| 1889 | rcu_read_unlock(); | 2213 | rcu_read_unlock(); |
| 2214 | cgroup_unlock(); | ||
| 2215 | return -ESRCH; | ||
| 2216 | } | ||
| 2217 | if (threadgroup) { | ||
| 2218 | /* | ||
| 2219 | * RCU protects this access, since tsk was found in the | ||
| 2220 | * tid map. a race with de_thread may cause group_leader | ||
| 2221 | * to stop being the leader, but cgroup_attach_proc will | ||
| 2222 | * detect it later. | ||
| 2223 | */ | ||
| 2224 | tsk = tsk->group_leader; | ||
| 2225 | } else if (tsk->flags & PF_EXITING) { | ||
| 2226 | /* optimization for the single-task-only case */ | ||
| 2227 | rcu_read_unlock(); | ||
| 2228 | cgroup_unlock(); | ||
| 1890 | return -ESRCH; | 2229 | return -ESRCH; |
| 1891 | } | 2230 | } |
| 1892 | 2231 | ||
| 2232 | /* | ||
| 2233 | * even if we're attaching all tasks in the thread group, we | ||
| 2234 | * only need to check permissions on one of them. | ||
| 2235 | */ | ||
| 1893 | tcred = __task_cred(tsk); | 2236 | tcred = __task_cred(tsk); |
| 1894 | if (cred->euid && | 2237 | if (cred->euid && |
| 1895 | cred->euid != tcred->uid && | 2238 | cred->euid != tcred->uid && |
| 1896 | cred->euid != tcred->suid) { | 2239 | cred->euid != tcred->suid) { |
| 1897 | rcu_read_unlock(); | 2240 | rcu_read_unlock(); |
| 2241 | cgroup_unlock(); | ||
| 1898 | return -EACCES; | 2242 | return -EACCES; |
| 1899 | } | 2243 | } |
| 1900 | get_task_struct(tsk); | 2244 | get_task_struct(tsk); |
| 1901 | rcu_read_unlock(); | 2245 | rcu_read_unlock(); |
| 1902 | } else { | 2246 | } else { |
| 1903 | tsk = current; | 2247 | if (threadgroup) |
| 2248 | tsk = current->group_leader; | ||
| 2249 | else | ||
| 2250 | tsk = current; | ||
| 1904 | get_task_struct(tsk); | 2251 | get_task_struct(tsk); |
| 1905 | } | 2252 | } |
| 1906 | 2253 | ||
| 1907 | ret = cgroup_attach_task(cgrp, tsk); | 2254 | if (threadgroup) { |
| 2255 | threadgroup_fork_write_lock(tsk); | ||
| 2256 | ret = cgroup_attach_proc(cgrp, tsk); | ||
| 2257 | threadgroup_fork_write_unlock(tsk); | ||
| 2258 | } else { | ||
| 2259 | ret = cgroup_attach_task(cgrp, tsk); | ||
| 2260 | } | ||
| 1908 | put_task_struct(tsk); | 2261 | put_task_struct(tsk); |
| 2262 | cgroup_unlock(); | ||
| 1909 | return ret; | 2263 | return ret; |
| 1910 | } | 2264 | } |
| 1911 | 2265 | ||
| 1912 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) | 2266 | static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) |
| 1913 | { | 2267 | { |
| 2268 | return attach_task_by_pid(cgrp, pid, false); | ||
| 2269 | } | ||
| 2270 | |||
| 2271 | static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) | ||
| 2272 | { | ||
| 1914 | int ret; | 2273 | int ret; |
| 1915 | if (!cgroup_lock_live_group(cgrp)) | 2274 | do { |
| 1916 | return -ENODEV; | 2275 | /* |
| 1917 | ret = attach_task_by_pid(cgrp, pid); | 2276 | * attach_proc fails with -EAGAIN if threadgroup leadership |
| 1918 | cgroup_unlock(); | 2277 | * changes in the middle of the operation, in which case we need |
| 2278 | * to find the task_struct for the new leader and start over. | ||
| 2279 | */ | ||
| 2280 | ret = attach_task_by_pid(cgrp, tgid, true); | ||
| 2281 | } while (ret == -EAGAIN); | ||
| 1919 | return ret; | 2282 | return ret; |
| 1920 | } | 2283 | } |
| 1921 | 2284 | ||
| @@ -3182,7 +3545,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, | |||
| 3182 | } | 3545 | } |
| 3183 | 3546 | ||
| 3184 | /* the process need read permission on control file */ | 3547 | /* the process need read permission on control file */ |
| 3185 | ret = file_permission(cfile, MAY_READ); | 3548 | /* AV: shouldn't we check that it's been opened for read instead? */ |
| 3549 | ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ); | ||
| 3186 | if (ret < 0) | 3550 | if (ret < 0) |
| 3187 | goto fail; | 3551 | goto fail; |
| 3188 | 3552 | ||
| @@ -3272,9 +3636,9 @@ static struct cftype files[] = { | |||
| 3272 | { | 3636 | { |
| 3273 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", | 3637 | .name = CGROUP_FILE_GENERIC_PREFIX "procs", |
| 3274 | .open = cgroup_procs_open, | 3638 | .open = cgroup_procs_open, |
| 3275 | /* .write_u64 = cgroup_procs_write, TODO */ | 3639 | .write_u64 = cgroup_procs_write, |
| 3276 | .release = cgroup_pidlist_release, | 3640 | .release = cgroup_pidlist_release, |
| 3277 | .mode = S_IRUGO, | 3641 | .mode = S_IRUGO | S_IWUSR, |
| 3278 | }, | 3642 | }, |
| 3279 | { | 3643 | { |
| 3280 | .name = "notify_on_release", | 3644 | .name = "notify_on_release", |
| @@ -4270,122 +4634,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 4270 | } | 4634 | } |
| 4271 | 4635 | ||
| 4272 | /** | 4636 | /** |
| 4273 | * cgroup_clone - clone the cgroup the given subsystem is attached to | ||
| 4274 | * @tsk: the task to be moved | ||
| 4275 | * @subsys: the given subsystem | ||
| 4276 | * @nodename: the name for the new cgroup | ||
| 4277 | * | ||
| 4278 | * Duplicate the current cgroup in the hierarchy that the given | ||
| 4279 | * subsystem is attached to, and move this task into the new | ||
| 4280 | * child. | ||
| 4281 | */ | ||
| 4282 | int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, | ||
| 4283 | char *nodename) | ||
| 4284 | { | ||
| 4285 | struct dentry *dentry; | ||
| 4286 | int ret = 0; | ||
| 4287 | struct cgroup *parent, *child; | ||
| 4288 | struct inode *inode; | ||
| 4289 | struct css_set *cg; | ||
| 4290 | struct cgroupfs_root *root; | ||
| 4291 | struct cgroup_subsys *ss; | ||
| 4292 | |||
| 4293 | /* We shouldn't be called by an unregistered subsystem */ | ||
| 4294 | BUG_ON(!subsys->active); | ||
| 4295 | |||
| 4296 | /* First figure out what hierarchy and cgroup we're dealing | ||
| 4297 | * with, and pin them so we can drop cgroup_mutex */ | ||
| 4298 | mutex_lock(&cgroup_mutex); | ||
| 4299 | again: | ||
| 4300 | root = subsys->root; | ||
| 4301 | if (root == &rootnode) { | ||
| 4302 | mutex_unlock(&cgroup_mutex); | ||
| 4303 | return 0; | ||
| 4304 | } | ||
| 4305 | |||
| 4306 | /* Pin the hierarchy */ | ||
| 4307 | if (!atomic_inc_not_zero(&root->sb->s_active)) { | ||
| 4308 | /* We race with the final deactivate_super() */ | ||
| 4309 | mutex_unlock(&cgroup_mutex); | ||
| 4310 | return 0; | ||
| 4311 | } | ||
| 4312 | |||
| 4313 | /* Keep the cgroup alive */ | ||
| 4314 | task_lock(tsk); | ||
| 4315 | parent = task_cgroup(tsk, subsys->subsys_id); | ||
| 4316 | cg = tsk->cgroups; | ||
| 4317 | get_css_set(cg); | ||
| 4318 | task_unlock(tsk); | ||
| 4319 | |||
| 4320 | mutex_unlock(&cgroup_mutex); | ||
| 4321 | |||
| 4322 | /* Now do the VFS work to create a cgroup */ | ||
| 4323 | inode = parent->dentry->d_inode; | ||
| 4324 | |||
| 4325 | /* Hold the parent directory mutex across this operation to | ||
| 4326 | * stop anyone else deleting the new cgroup */ | ||
| 4327 | mutex_lock(&inode->i_mutex); | ||
| 4328 | dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); | ||
| 4329 | if (IS_ERR(dentry)) { | ||
| 4330 | printk(KERN_INFO | ||
| 4331 | "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, | ||
| 4332 | PTR_ERR(dentry)); | ||
| 4333 | ret = PTR_ERR(dentry); | ||
| 4334 | goto out_release; | ||
| 4335 | } | ||
| 4336 | |||
| 4337 | /* Create the cgroup directory, which also creates the cgroup */ | ||
| 4338 | ret = vfs_mkdir(inode, dentry, 0755); | ||
| 4339 | child = __d_cgrp(dentry); | ||
| 4340 | dput(dentry); | ||
| 4341 | if (ret) { | ||
| 4342 | printk(KERN_INFO | ||
| 4343 | "Failed to create cgroup %s: %d\n", nodename, | ||
| 4344 | ret); | ||
| 4345 | goto out_release; | ||
| 4346 | } | ||
| 4347 | |||
| 4348 | /* The cgroup now exists. Retake cgroup_mutex and check | ||
| 4349 | * that we're still in the same state that we thought we | ||
| 4350 | * were. */ | ||
| 4351 | mutex_lock(&cgroup_mutex); | ||
| 4352 | if ((root != subsys->root) || | ||
| 4353 | (parent != task_cgroup(tsk, subsys->subsys_id))) { | ||
| 4354 | /* Aargh, we raced ... */ | ||
| 4355 | mutex_unlock(&inode->i_mutex); | ||
| 4356 | put_css_set(cg); | ||
| 4357 | |||
| 4358 | deactivate_super(root->sb); | ||
| 4359 | /* The cgroup is still accessible in the VFS, but | ||
| 4360 | * we're not going to try to rmdir() it at this | ||
| 4361 | * point. */ | ||
| 4362 | printk(KERN_INFO | ||
| 4363 | "Race in cgroup_clone() - leaking cgroup %s\n", | ||
| 4364 | nodename); | ||
| 4365 | goto again; | ||
| 4366 | } | ||
| 4367 | |||
| 4368 | /* do any required auto-setup */ | ||
| 4369 | for_each_subsys(root, ss) { | ||
| 4370 | if (ss->post_clone) | ||
| 4371 | ss->post_clone(ss, child); | ||
| 4372 | } | ||
| 4373 | |||
| 4374 | /* All seems fine. Finish by moving the task into the new cgroup */ | ||
| 4375 | ret = cgroup_attach_task(child, tsk); | ||
| 4376 | mutex_unlock(&cgroup_mutex); | ||
| 4377 | |||
| 4378 | out_release: | ||
| 4379 | mutex_unlock(&inode->i_mutex); | ||
| 4380 | |||
| 4381 | mutex_lock(&cgroup_mutex); | ||
| 4382 | put_css_set(cg); | ||
| 4383 | mutex_unlock(&cgroup_mutex); | ||
| 4384 | deactivate_super(root->sb); | ||
| 4385 | return ret; | ||
| 4386 | } | ||
| 4387 | |||
| 4388 | /** | ||
| 4389 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp | 4637 | * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp |
| 4390 | * @cgrp: the cgroup in question | 4638 | * @cgrp: the cgroup in question |
| 4391 | * @task: the task in question | 4639 | * @task: the task in question |
| @@ -4569,8 +4817,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) | |||
| 4569 | * on this or this is under rcu_read_lock(). Once css->id is allocated, | 4817 | * on this or this is under rcu_read_lock(). Once css->id is allocated, |
| 4570 | * it's unchanged until freed. | 4818 | * it's unchanged until freed. |
| 4571 | */ | 4819 | */ |
| 4572 | cssid = rcu_dereference_check(css->id, | 4820 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); |
| 4573 | rcu_read_lock_held() || atomic_read(&css->refcnt)); | ||
| 4574 | 4821 | ||
| 4575 | if (cssid) | 4822 | if (cssid) |
| 4576 | return cssid->id; | 4823 | return cssid->id; |
| @@ -4582,8 +4829,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) | |||
| 4582 | { | 4829 | { |
| 4583 | struct css_id *cssid; | 4830 | struct css_id *cssid; |
| 4584 | 4831 | ||
| 4585 | cssid = rcu_dereference_check(css->id, | 4832 | cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt)); |
| 4586 | rcu_read_lock_held() || atomic_read(&css->refcnt)); | ||
| 4587 | 4833 | ||
| 4588 | if (cssid) | 4834 | if (cssid) |
| 4589 | return cssid->depth; | 4835 | return cssid->depth; |
| @@ -4623,14 +4869,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child, | |||
| 4623 | return ret; | 4869 | return ret; |
| 4624 | } | 4870 | } |
| 4625 | 4871 | ||
| 4626 | static void __free_css_id_cb(struct rcu_head *head) | ||
| 4627 | { | ||
| 4628 | struct css_id *id; | ||
| 4629 | |||
| 4630 | id = container_of(head, struct css_id, rcu_head); | ||
| 4631 | kfree(id); | ||
| 4632 | } | ||
| 4633 | |||
| 4634 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | 4872 | void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) |
| 4635 | { | 4873 | { |
| 4636 | struct css_id *id = css->id; | 4874 | struct css_id *id = css->id; |
| @@ -4645,7 +4883,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) | |||
| 4645 | spin_lock(&ss->id_lock); | 4883 | spin_lock(&ss->id_lock); |
| 4646 | idr_remove(&ss->idr, id->id); | 4884 | idr_remove(&ss->idr, id->id); |
| 4647 | spin_unlock(&ss->id_lock); | 4885 | spin_unlock(&ss->id_lock); |
| 4648 | call_rcu(&id->rcu_head, __free_css_id_cb); | 4886 | kfree_rcu(id, rcu_head); |
| 4649 | } | 4887 | } |
| 4650 | EXPORT_SYMBOL_GPL(free_css_id); | 4888 | EXPORT_SYMBOL_GPL(free_css_id); |
| 4651 | 4889 | ||
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e7bebb7c6c38..e691818d7e45 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
| @@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss, | |||
| 160 | */ | 160 | */ |
| 161 | static int freezer_can_attach(struct cgroup_subsys *ss, | 161 | static int freezer_can_attach(struct cgroup_subsys *ss, |
| 162 | struct cgroup *new_cgroup, | 162 | struct cgroup *new_cgroup, |
| 163 | struct task_struct *task, bool threadgroup) | 163 | struct task_struct *task) |
| 164 | { | 164 | { |
| 165 | struct freezer *freezer; | 165 | struct freezer *freezer; |
| 166 | 166 | ||
| @@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss, | |||
| 172 | if (freezer->state != CGROUP_THAWED) | 172 | if (freezer->state != CGROUP_THAWED) |
| 173 | return -EBUSY; | 173 | return -EBUSY; |
| 174 | 174 | ||
| 175 | return 0; | ||
| 176 | } | ||
| 177 | |||
| 178 | static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | ||
| 179 | { | ||
| 175 | rcu_read_lock(); | 180 | rcu_read_lock(); |
| 176 | if (__cgroup_freezing_or_frozen(task)) { | 181 | if (__cgroup_freezing_or_frozen(tsk)) { |
| 177 | rcu_read_unlock(); | 182 | rcu_read_unlock(); |
| 178 | return -EBUSY; | 183 | return -EBUSY; |
| 179 | } | 184 | } |
| 180 | rcu_read_unlock(); | 185 | rcu_read_unlock(); |
| 181 | |||
| 182 | if (threadgroup) { | ||
| 183 | struct task_struct *c; | ||
| 184 | |||
| 185 | rcu_read_lock(); | ||
| 186 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
| 187 | if (__cgroup_freezing_or_frozen(c)) { | ||
| 188 | rcu_read_unlock(); | ||
| 189 | return -EBUSY; | ||
| 190 | } | ||
| 191 | } | ||
| 192 | rcu_read_unlock(); | ||
| 193 | } | ||
| 194 | |||
| 195 | return 0; | 186 | return 0; |
| 196 | } | 187 | } |
| 197 | 188 | ||
| @@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = { | |||
| 390 | .populate = freezer_populate, | 381 | .populate = freezer_populate, |
| 391 | .subsys_id = freezer_subsys_id, | 382 | .subsys_id = freezer_subsys_id, |
| 392 | .can_attach = freezer_can_attach, | 383 | .can_attach = freezer_can_attach, |
| 384 | .can_attach_task = freezer_can_attach_task, | ||
| 385 | .pre_attach = NULL, | ||
| 386 | .attach_task = NULL, | ||
| 393 | .attach = NULL, | 387 | .attach = NULL, |
| 394 | .fork = freezer_fork, | 388 | .fork = freezer_fork, |
| 395 | .exit = NULL, | 389 | .exit = NULL, |
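The freezer hunk shows the new split in the cgroup attach path: the threadgroup loop moves out of ->can_attach() into a per-thread ->can_attach_task() that the cgroup core calls for every thread being moved. A hedged sketch of how a subsystem of this series wires the two callbacks; the example_* names are hypothetical and a real subsystem also fills in .subsys_id and the other fields seen in the hunk above:

static int example_can_attach(struct cgroup_subsys *ss,
			      struct cgroup *cgrp, struct task_struct *task)
{
	/* cgroup-wide admission check, run once per attach operation */
	return 0;
}

static int example_can_attach_task(struct cgroup *cgrp,
				   struct task_struct *tsk)
{
	/* per-thread check, run for every thread in the group being moved */
	return 0;
}

struct cgroup_subsys example_subsys = {
	.name			= "example",
	.can_attach		= example_can_attach,
	.can_attach_task	= example_can_attach_task,
};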
diff --git a/kernel/compat.c b/kernel/compat.c index 38b1d2c1cbe8..e2435ee9993a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -158,6 +158,7 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user | |||
| 158 | __put_user(ts->tv_sec, &cts->tv_sec) || | 158 | __put_user(ts->tv_sec, &cts->tv_sec) || |
| 159 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; | 159 | __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; |
| 160 | } | 160 | } |
| 161 | EXPORT_SYMBOL_GPL(put_compat_timespec); | ||
| 161 | 162 | ||
| 162 | static long compat_nanosleep_restart(struct restart_block *restart) | 163 | static long compat_nanosleep_restart(struct restart_block *restart) |
| 163 | { | 164 | { |
| @@ -293,6 +294,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) | |||
| 293 | return compat_jiffies_to_clock_t(jiffies); | 294 | return compat_jiffies_to_clock_t(jiffies); |
| 294 | } | 295 | } |
| 295 | 296 | ||
| 297 | #ifdef __ARCH_WANT_SYS_SIGPENDING | ||
| 298 | |||
| 296 | /* | 299 | /* |
| 297 | * Assumption: old_sigset_t and compat_old_sigset_t are both | 300 | * Assumption: old_sigset_t and compat_old_sigset_t are both |
| 298 | * types that can be passed to put_user()/get_user(). | 301 | * types that can be passed to put_user()/get_user(). |
| @@ -312,6 +315,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) | |||
| 312 | return ret; | 315 | return ret; |
| 313 | } | 316 | } |
| 314 | 317 | ||
| 318 | #endif | ||
| 319 | |||
| 320 | #ifdef __ARCH_WANT_SYS_SIGPROCMASK | ||
| 321 | |||
| 315 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | 322 | asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, |
| 316 | compat_old_sigset_t __user *oset) | 323 | compat_old_sigset_t __user *oset) |
| 317 | { | 324 | { |
| @@ -333,6 +340,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, | |||
| 333 | return ret; | 340 | return ret; |
| 334 | } | 341 | } |
| 335 | 342 | ||
| 343 | #endif | ||
| 344 | |||
| 336 | asmlinkage long compat_sys_setrlimit(unsigned int resource, | 345 | asmlinkage long compat_sys_setrlimit(unsigned int resource, |
| 337 | struct compat_rlimit __user *rlim) | 346 | struct compat_rlimit __user *rlim) |
| 338 | { | 347 | { |
| @@ -882,6 +891,7 @@ sigset_from_compat (sigset_t *set, compat_sigset_t *compat) | |||
| 882 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); | 891 | case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); |
| 883 | } | 892 | } |
| 884 | } | 893 | } |
| 894 | EXPORT_SYMBOL_GPL(sigset_from_compat); | ||
| 885 | 895 | ||
| 886 | asmlinkage long | 896 | asmlinkage long |
| 887 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | 897 | compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, |
| @@ -890,10 +900,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
| 890 | { | 900 | { |
| 891 | compat_sigset_t s32; | 901 | compat_sigset_t s32; |
| 892 | sigset_t s; | 902 | sigset_t s; |
| 893 | int sig; | ||
| 894 | struct timespec t; | 903 | struct timespec t; |
| 895 | siginfo_t info; | 904 | siginfo_t info; |
| 896 | long ret, timeout = 0; | 905 | long ret; |
| 897 | 906 | ||
| 898 | if (sigsetsize != sizeof(sigset_t)) | 907 | if (sigsetsize != sizeof(sigset_t)) |
| 899 | return -EINVAL; | 908 | return -EINVAL; |
| @@ -901,51 +910,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
| 901 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) | 910 | if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) |
| 902 | return -EFAULT; | 911 | return -EFAULT; |
| 903 | sigset_from_compat(&s, &s32); | 912 | sigset_from_compat(&s, &s32); |
| 904 | sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
| 905 | signotset(&s); | ||
| 906 | 913 | ||
| 907 | if (uts) { | 914 | if (uts) { |
| 908 | if (get_compat_timespec (&t, uts)) | 915 | if (get_compat_timespec(&t, uts)) |
| 909 | return -EFAULT; | 916 | return -EFAULT; |
| 910 | if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 | ||
| 911 | || t.tv_sec < 0) | ||
| 912 | return -EINVAL; | ||
| 913 | } | 917 | } |
| 914 | 918 | ||
| 915 | spin_lock_irq(¤t->sighand->siglock); | 919 | ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); |
| 916 | sig = dequeue_signal(current, &s, &info); | ||
| 917 | if (!sig) { | ||
| 918 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 919 | if (uts) | ||
| 920 | timeout = timespec_to_jiffies(&t) | ||
| 921 | +(t.tv_sec || t.tv_nsec); | ||
| 922 | if (timeout) { | ||
| 923 | current->real_blocked = current->blocked; | ||
| 924 | sigandsets(¤t->blocked, ¤t->blocked, &s); | ||
| 925 | |||
| 926 | recalc_sigpending(); | ||
| 927 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 928 | |||
| 929 | timeout = schedule_timeout_interruptible(timeout); | ||
| 930 | |||
| 931 | spin_lock_irq(¤t->sighand->siglock); | ||
| 932 | sig = dequeue_signal(current, &s, &info); | ||
| 933 | current->blocked = current->real_blocked; | ||
| 934 | siginitset(¤t->real_blocked, 0); | ||
| 935 | recalc_sigpending(); | ||
| 936 | } | ||
| 937 | } | ||
| 938 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 939 | 920 | ||
| 940 | if (sig) { | 921 | if (ret > 0 && uinfo) { |
| 941 | ret = sig; | 922 | if (copy_siginfo_to_user32(uinfo, &info)) |
| 942 | if (uinfo) { | 923 | ret = -EFAULT; |
| 943 | if (copy_siginfo_to_user32(uinfo, &info)) | ||
| 944 | ret = -EFAULT; | ||
| 945 | } | ||
| 946 | }else { | ||
| 947 | ret = timeout?-EINTR:-EAGAIN; | ||
| 948 | } | 924 | } |
| 925 | |||
| 949 | return ret; | 926 | return ret; |
| 950 | 927 | ||
| 951 | } | 928 | } |
| @@ -1016,11 +993,8 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat | |||
| 1016 | sigset_from_compat(&newset, &newset32); | 993 | sigset_from_compat(&newset, &newset32); |
| 1017 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 994 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
| 1018 | 995 | ||
| 1019 | spin_lock_irq(¤t->sighand->siglock); | ||
| 1020 | current->saved_sigmask = current->blocked; | 996 | current->saved_sigmask = current->blocked; |
| 1021 | current->blocked = newset; | 997 | set_current_blocked(&newset); |
| 1022 | recalc_sigpending(); | ||
| 1023 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 1024 | 998 | ||
| 1025 | current->state = TASK_INTERRUPTIBLE; | 999 | current->state = TASK_INTERRUPTIBLE; |
| 1026 | schedule(); | 1000 | schedule(); |
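Both compat paths now lean on the native helpers instead of open-coding the signal bookkeeping: compat_sys_rt_sigtimedwait() converts the sigset and delegates the wait to do_sigtimedwait(), and compat_sys_rt_sigsuspend() installs the new mask through set_current_blocked() rather than poking ->blocked under siglock by hand. A condensed sketch of the blocked-mask update, assuming in-kernel context and that newset32 was already copied from user space as in the hunk above:

/* Sketch of the helper-based mask update (error handling omitted). */
sigset_t newset;

sigset_from_compat(&newset, &newset32);
sigdelsetmask(&newset, sigmask(SIGKILL) | sigmask(SIGSTOP));

current->saved_sigmask = current->blocked;
set_current_blocked(&newset);	/* takes siglock and recalculates pending */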
diff --git a/kernel/configs.c b/kernel/configs.c index b4066b44a99d..42e8fa075eed 100644 --- a/kernel/configs.c +++ b/kernel/configs.c | |||
| @@ -92,8 +92,8 @@ static void __exit ikconfig_cleanup(void) | |||
| 92 | module_init(ikconfig_init); | 92 | module_init(ikconfig_init); |
| 93 | module_exit(ikconfig_cleanup); | 93 | module_exit(ikconfig_cleanup); |
| 94 | 94 | ||
| 95 | #endif /* CONFIG_IKCONFIG_PROC */ | ||
| 96 | |||
| 95 | MODULE_LICENSE("GPL"); | 97 | MODULE_LICENSE("GPL"); |
| 96 | MODULE_AUTHOR("Randy Dunlap"); | 98 | MODULE_AUTHOR("Randy Dunlap"); |
| 97 | MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); | 99 | MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel"); |
| 98 | |||
| 99 | #endif /* CONFIG_IKCONFIG_PROC */ | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 33eee16addb8..10131fdaff70 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -55,7 +55,7 @@ | |||
| 55 | #include <linux/sort.h> | 55 | #include <linux/sort.h> |
| 56 | 56 | ||
| 57 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
| 58 | #include <asm/atomic.h> | 58 | #include <linux/atomic.h> |
| 59 | #include <linux/mutex.h> | 59 | #include <linux/mutex.h> |
| 60 | #include <linux/workqueue.h> | 60 | #include <linux/workqueue.h> |
| 61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
| @@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) | |||
| 1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) | 1159 | static int update_relax_domain_level(struct cpuset *cs, s64 val) |
| 1160 | { | 1160 | { |
| 1161 | #ifdef CONFIG_SMP | 1161 | #ifdef CONFIG_SMP |
| 1162 | if (val < -1 || val >= SD_LV_MAX) | 1162 | if (val < -1 || val >= sched_domain_level_max) |
| 1163 | return -EINVAL; | 1163 | return -EINVAL; |
| 1164 | #endif | 1164 | #endif |
| 1165 | 1165 | ||
| @@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
| 1367 | return val; | 1367 | return val; |
| 1368 | } | 1368 | } |
| 1369 | 1369 | ||
| 1370 | /* Protected by cgroup_lock */ | ||
| 1371 | static cpumask_var_t cpus_attach; | ||
| 1372 | |||
| 1373 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | 1370 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ |
| 1374 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | 1371 | static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, |
| 1375 | struct task_struct *tsk, bool threadgroup) | 1372 | struct task_struct *tsk) |
| 1376 | { | 1373 | { |
| 1377 | int ret; | ||
| 1378 | struct cpuset *cs = cgroup_cs(cont); | 1374 | struct cpuset *cs = cgroup_cs(cont); |
| 1379 | 1375 | ||
| 1380 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1376 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
| @@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, | |||
| 1391 | if (tsk->flags & PF_THREAD_BOUND) | 1387 | if (tsk->flags & PF_THREAD_BOUND) |
| 1392 | return -EINVAL; | 1388 | return -EINVAL; |
| 1393 | 1389 | ||
| 1394 | ret = security_task_setscheduler(tsk); | ||
| 1395 | if (ret) | ||
| 1396 | return ret; | ||
| 1397 | if (threadgroup) { | ||
| 1398 | struct task_struct *c; | ||
| 1399 | |||
| 1400 | rcu_read_lock(); | ||
| 1401 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
| 1402 | ret = security_task_setscheduler(c); | ||
| 1403 | if (ret) { | ||
| 1404 | rcu_read_unlock(); | ||
| 1405 | return ret; | ||
| 1406 | } | ||
| 1407 | } | ||
| 1408 | rcu_read_unlock(); | ||
| 1409 | } | ||
| 1410 | return 0; | 1390 | return 0; |
| 1411 | } | 1391 | } |
| 1412 | 1392 | ||
| 1413 | static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, | 1393 | static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) |
| 1414 | struct cpuset *cs) | 1394 | { |
| 1395 | return security_task_setscheduler(task); | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | /* | ||
| 1399 | * Protected by cgroup_lock. The nodemasks must be stored globally because | ||
| 1400 | * dynamically allocating them is not allowed in pre_attach, and they must | ||
| 1401 | * persist among pre_attach, attach_task, and attach. | ||
| 1402 | */ | ||
| 1403 | static cpumask_var_t cpus_attach; | ||
| 1404 | static nodemask_t cpuset_attach_nodemask_from; | ||
| 1405 | static nodemask_t cpuset_attach_nodemask_to; | ||
| 1406 | |||
| 1407 | /* Set-up work for before attaching each task. */ | ||
| 1408 | static void cpuset_pre_attach(struct cgroup *cont) | ||
| 1409 | { | ||
| 1410 | struct cpuset *cs = cgroup_cs(cont); | ||
| 1411 | |||
| 1412 | if (cs == &top_cpuset) | ||
| 1413 | cpumask_copy(cpus_attach, cpu_possible_mask); | ||
| 1414 | else | ||
| 1415 | guarantee_online_cpus(cs, cpus_attach); | ||
| 1416 | |||
| 1417 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | ||
| 1418 | } | ||
| 1419 | |||
| 1420 | /* Per-thread attachment work. */ | ||
| 1421 | static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) | ||
| 1415 | { | 1422 | { |
| 1416 | int err; | 1423 | int err; |
| 1424 | struct cpuset *cs = cgroup_cs(cont); | ||
| 1425 | |||
| 1417 | /* | 1426 | /* |
| 1418 | * can_attach beforehand should guarantee that this doesn't fail. | 1427 | * can_attach beforehand should guarantee that this doesn't fail. |
| 1419 | * TODO: have a better way to handle failure here | 1428 | * TODO: have a better way to handle failure here |
| @@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, | |||
| 1421 | err = set_cpus_allowed_ptr(tsk, cpus_attach); | 1430 | err = set_cpus_allowed_ptr(tsk, cpus_attach); |
| 1422 | WARN_ON_ONCE(err); | 1431 | WARN_ON_ONCE(err); |
| 1423 | 1432 | ||
| 1424 | cpuset_change_task_nodemask(tsk, to); | 1433 | cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); |
| 1425 | cpuset_update_task_spread_flag(cs, tsk); | 1434 | cpuset_update_task_spread_flag(cs, tsk); |
| 1426 | |||
| 1427 | } | 1435 | } |
| 1428 | 1436 | ||
| 1429 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, | 1437 | static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, |
| 1430 | struct cgroup *oldcont, struct task_struct *tsk, | 1438 | struct cgroup *oldcont, struct task_struct *tsk) |
| 1431 | bool threadgroup) | ||
| 1432 | { | 1439 | { |
| 1433 | struct mm_struct *mm; | 1440 | struct mm_struct *mm; |
| 1434 | struct cpuset *cs = cgroup_cs(cont); | 1441 | struct cpuset *cs = cgroup_cs(cont); |
| 1435 | struct cpuset *oldcs = cgroup_cs(oldcont); | 1442 | struct cpuset *oldcs = cgroup_cs(oldcont); |
| 1436 | static nodemask_t to; /* protected by cgroup_mutex */ | ||
| 1437 | |||
| 1438 | if (cs == &top_cpuset) { | ||
| 1439 | cpumask_copy(cpus_attach, cpu_possible_mask); | ||
| 1440 | } else { | ||
| 1441 | guarantee_online_cpus(cs, cpus_attach); | ||
| 1442 | } | ||
| 1443 | guarantee_online_mems(cs, &to); | ||
| 1444 | |||
| 1445 | /* do per-task migration stuff possibly for each in the threadgroup */ | ||
| 1446 | cpuset_attach_task(tsk, &to, cs); | ||
| 1447 | if (threadgroup) { | ||
| 1448 | struct task_struct *c; | ||
| 1449 | rcu_read_lock(); | ||
| 1450 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
| 1451 | cpuset_attach_task(c, &to, cs); | ||
| 1452 | } | ||
| 1453 | rcu_read_unlock(); | ||
| 1454 | } | ||
| 1455 | 1443 | ||
| 1456 | /* change mm; only needs to be done once even if threadgroup */ | 1444 | /* |
| 1457 | to = cs->mems_allowed; | 1445 | * Change mm, possibly for multiple threads in a threadgroup. This is |
| 1446 | * expensive and may sleep. | ||
| 1447 | */ | ||
| 1448 | cpuset_attach_nodemask_from = oldcs->mems_allowed; | ||
| 1449 | cpuset_attach_nodemask_to = cs->mems_allowed; | ||
| 1458 | mm = get_task_mm(tsk); | 1450 | mm = get_task_mm(tsk); |
| 1459 | if (mm) { | 1451 | if (mm) { |
| 1460 | mpol_rebind_mm(mm, &to); | 1452 | mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); |
| 1461 | if (is_memory_migrate(cs)) | 1453 | if (is_memory_migrate(cs)) |
| 1462 | cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); | 1454 | cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, |
| 1455 | &cpuset_attach_nodemask_to); | ||
| 1463 | mmput(mm); | 1456 | mmput(mm); |
| 1464 | } | 1457 | } |
| 1465 | } | 1458 | } |
| @@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1809 | } | 1802 | } |
| 1810 | 1803 | ||
| 1811 | /* | 1804 | /* |
| 1812 | * post_clone() is called at the end of cgroup_clone(). | 1805 | * post_clone() is called during cgroup_create() when the |
| 1813 | * 'cgroup' was just created automatically as a result of | 1806 | * clone_children mount argument was specified. The cgroup |
| 1814 | * a cgroup_clone(), and the current task is about to | 1807 | * can not yet have any tasks. |
| 1815 | * be moved into 'cgroup'. | ||
| 1816 | * | 1808 | * |
| 1817 | * Currently we refuse to set up the cgroup - thereby | 1809 | * Currently we refuse to set up the cgroup - thereby |
| 1818 | * refusing the task to be entered, and as a result refusing | 1810 | * refusing the task to be entered, and as a result refusing |
| @@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = { | |||
| 1911 | .create = cpuset_create, | 1903 | .create = cpuset_create, |
| 1912 | .destroy = cpuset_destroy, | 1904 | .destroy = cpuset_destroy, |
| 1913 | .can_attach = cpuset_can_attach, | 1905 | .can_attach = cpuset_can_attach, |
| 1906 | .can_attach_task = cpuset_can_attach_task, | ||
| 1907 | .pre_attach = cpuset_pre_attach, | ||
| 1908 | .attach_task = cpuset_attach_task, | ||
| 1914 | .attach = cpuset_attach, | 1909 | .attach = cpuset_attach, |
| 1915 | .populate = cpuset_populate, | 1910 | .populate = cpuset_populate, |
| 1916 | .post_clone = cpuset_post_clone, | 1911 | .post_clone = cpuset_post_clone, |
| @@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) | |||
| 2195 | rcu_read_lock(); | 2190 | rcu_read_lock(); |
| 2196 | cs = task_cs(tsk); | 2191 | cs = task_cs(tsk); |
| 2197 | if (cs) | 2192 | if (cs) |
| 2198 | cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed); | 2193 | do_set_cpus_allowed(tsk, cs->cpus_allowed); |
| 2199 | rcu_read_unlock(); | 2194 | rcu_read_unlock(); |
| 2200 | 2195 | ||
| 2201 | /* | 2196 | /* |
| @@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) | |||
| 2222 | * Like above we can temporary set any mask and rely on | 2217 | * Like above we can temporary set any mask and rely on |
| 2223 | * set_cpus_allowed_ptr() as synchronization point. | 2218 | * set_cpus_allowed_ptr() as synchronization point. |
| 2224 | */ | 2219 | */ |
| 2225 | cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask); | 2220 | do_set_cpus_allowed(tsk, cpu_possible_mask); |
| 2226 | cpu = cpumask_any(cpu_active_mask); | 2221 | cpu = cpumask_any(cpu_active_mask); |
| 2227 | } | 2222 | } |
| 2228 | 2223 | ||
| @@ -2465,11 +2460,19 @@ static int cpuset_spread_node(int *rotor) | |||
| 2465 | 2460 | ||
| 2466 | int cpuset_mem_spread_node(void) | 2461 | int cpuset_mem_spread_node(void) |
| 2467 | { | 2462 | { |
| 2463 | if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE) | ||
| 2464 | current->cpuset_mem_spread_rotor = | ||
| 2465 | node_random(¤t->mems_allowed); | ||
| 2466 | |||
| 2468 | return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); | 2467 | return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); |
| 2469 | } | 2468 | } |
| 2470 | 2469 | ||
| 2471 | int cpuset_slab_spread_node(void) | 2470 | int cpuset_slab_spread_node(void) |
| 2472 | { | 2471 | { |
| 2472 | if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE) | ||
| 2473 | current->cpuset_slab_spread_rotor = | ||
| 2474 | node_random(¤t->mems_allowed); | ||
| 2475 | |||
| 2473 | return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); | 2476 | return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); |
| 2474 | } | 2477 | } |
| 2475 | 2478 | ||
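Two things happen in the cpuset diff: the attach work is reorganized around the new per-operation and per-thread callbacks (the nodemasks become globals because pre_attach() may not allocate), and the mem/slab spread rotors are now seeded lazily with node_random(). A rough outline of the callback sequence, followed by the rotor seeding; the ordering comment is a summary under my reading of this series, not verbatim core code:

/*
 * Attach sequence the cgroup core drives per operation after this series
 * (cgroup_mutex held throughout):
 *
 *	->can_attach(ss, cgrp, leader)		once
 *	->can_attach_task(cgrp, thread)		per thread
 *	->pre_attach(cgrp)			once, may not allocate
 *	->attach_task(cgrp, thread)		per thread
 *	->attach(ss, cgrp, oldcgrp, leader)	once, may sleep (mm rebind)
 */

/* Lazy rotor seeding, as now done in cpuset_mem_spread_node(): */
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
	current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);
return cpuset_spread_node(&current->cpuset_mem_spread_rotor);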
diff --git a/kernel/cred.c b/kernel/cred.c index 5557b55048df..8ef31f53c44c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* Task credentials management - see Documentation/credentials.txt | 1 | /* Task credentials management - see Documentation/security/credentials.txt |
| 2 | * | 2 | * |
| 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. |
| 4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
| @@ -49,11 +49,12 @@ struct cred init_cred = { | |||
| 49 | .magic = CRED_MAGIC, | 49 | .magic = CRED_MAGIC, |
| 50 | #endif | 50 | #endif |
| 51 | .securebits = SECUREBITS_DEFAULT, | 51 | .securebits = SECUREBITS_DEFAULT, |
| 52 | .cap_inheritable = CAP_INIT_INH_SET, | 52 | .cap_inheritable = CAP_EMPTY_SET, |
| 53 | .cap_permitted = CAP_FULL_SET, | 53 | .cap_permitted = CAP_FULL_SET, |
| 54 | .cap_effective = CAP_INIT_EFF_SET, | 54 | .cap_effective = CAP_FULL_SET, |
| 55 | .cap_bset = CAP_INIT_BSET, | 55 | .cap_bset = CAP_FULL_SET, |
| 56 | .user = INIT_USER, | 56 | .user = INIT_USER, |
| 57 | .user_ns = &init_user_ns, | ||
| 57 | .group_info = &init_groups, | 58 | .group_info = &init_groups, |
| 58 | #ifdef CONFIG_KEYS | 59 | #ifdef CONFIG_KEYS |
| 59 | .tgcred = &init_tgcred, | 60 | .tgcred = &init_tgcred, |
| @@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
| 410 | goto error_put; | 411 | goto error_put; |
| 411 | } | 412 | } |
| 412 | 413 | ||
| 414 | /* cache user_ns in cred. Doesn't need a refcount because it will | ||
| 415 | * stay pinned by cred->user | ||
| 416 | */ | ||
| 417 | new->user_ns = new->user->user_ns; | ||
| 418 | |||
| 413 | #ifdef CONFIG_KEYS | 419 | #ifdef CONFIG_KEYS |
| 414 | /* new threads get their own thread keyrings if their parent already | 420 | /* new threads get their own thread keyrings if their parent already |
| 415 | * had one */ | 421 | * had one */ |
| @@ -502,10 +508,8 @@ int commit_creds(struct cred *new) | |||
| 502 | key_fsgid_changed(task); | 508 | key_fsgid_changed(task); |
| 503 | 509 | ||
| 504 | /* do it | 510 | /* do it |
| 505 | * - What if a process setreuid()'s and this brings the | 511 | * RLIMIT_NPROC limits on user->processes have already been checked |
| 506 | * new uid over his NPROC rlimit? We can check this now | 512 | * in set_user(). |
| 507 | * cheaply with the new uid cache, so if it matters | ||
| 508 | * we should be checking for it. -DaveM | ||
| 509 | */ | 513 | */ |
| 510 | alter_cred_subscribers(new, 2); | 514 | alter_cred_subscribers(new, 2); |
| 511 | if (new->user != old->user) | 515 | if (new->user != old->user) |
| @@ -741,12 +745,6 @@ int set_create_files_as(struct cred *new, struct inode *inode) | |||
| 741 | } | 745 | } |
| 742 | EXPORT_SYMBOL(set_create_files_as); | 746 | EXPORT_SYMBOL(set_create_files_as); |
| 743 | 747 | ||
| 744 | struct user_namespace *current_user_ns(void) | ||
| 745 | { | ||
| 746 | return _current_user_ns(); | ||
| 747 | } | ||
| 748 | EXPORT_SYMBOL(current_user_ns); | ||
| 749 | |||
| 750 | #ifdef CONFIG_DEBUG_CREDENTIALS | 748 | #ifdef CONFIG_DEBUG_CREDENTIALS |
| 751 | 749 | ||
| 752 | bool creds_are_invalid(const struct cred *cred) | 750 | bool creds_are_invalid(const struct cred *cred) |
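With the user namespace cached in struct cred (pinned by cred->user, so no extra reference is taken), callers can reach the current namespace straight from the credential, and the out-of-line current_user_ns() wrapper is dropped along the way. A two-line sketch, in-kernel context assumed:

const struct cred *cred = current_cred();
struct user_namespace *ns = cred->user_ns;	/* kept alive by cred->user */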
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index bad6786dee88..0d7c08784efb 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -51,7 +51,7 @@ | |||
| 51 | 51 | ||
| 52 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
| 53 | #include <asm/byteorder.h> | 53 | #include <asm/byteorder.h> |
| 54 | #include <asm/atomic.h> | 54 | #include <linux/atomic.h> |
| 55 | #include <asm/system.h> | 55 | #include <asm/system.h> |
| 56 | 56 | ||
| 57 | #include "debug_core.h" | 57 | #include "debug_core.h" |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index a11db956dd62..34872482315e 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
| @@ -42,6 +42,8 @@ | |||
| 42 | /* Our I/O buffers. */ | 42 | /* Our I/O buffers. */ |
| 43 | static char remcom_in_buffer[BUFMAX]; | 43 | static char remcom_in_buffer[BUFMAX]; |
| 44 | static char remcom_out_buffer[BUFMAX]; | 44 | static char remcom_out_buffer[BUFMAX]; |
| 45 | static int gdbstub_use_prev_in_buf; | ||
| 46 | static int gdbstub_prev_in_buf_pos; | ||
| 45 | 47 | ||
| 46 | /* Storage for the registers, in GDB format. */ | 48 | /* Storage for the registers, in GDB format. */ |
| 47 | static unsigned long gdb_regs[(NUMREGBYTES + | 49 | static unsigned long gdb_regs[(NUMREGBYTES + |
| @@ -58,6 +60,13 @@ static int gdbstub_read_wait(void) | |||
| 58 | int ret = -1; | 60 | int ret = -1; |
| 59 | int i; | 61 | int i; |
| 60 | 62 | ||
| 63 | if (unlikely(gdbstub_use_prev_in_buf)) { | ||
| 64 | if (gdbstub_prev_in_buf_pos < gdbstub_use_prev_in_buf) | ||
| 65 | return remcom_in_buffer[gdbstub_prev_in_buf_pos++]; | ||
| 66 | else | ||
| 67 | gdbstub_use_prev_in_buf = 0; | ||
| 68 | } | ||
| 69 | |||
| 61 | /* poll any additional I/O interfaces that are defined */ | 70 | /* poll any additional I/O interfaces that are defined */ |
| 62 | while (ret < 0) | 71 | while (ret < 0) |
| 63 | for (i = 0; kdb_poll_funcs[i] != NULL; i++) { | 72 | for (i = 0; kdb_poll_funcs[i] != NULL; i++) { |
| @@ -109,7 +118,6 @@ static void get_packet(char *buffer) | |||
| 109 | buffer[count] = ch; | 118 | buffer[count] = ch; |
| 110 | count = count + 1; | 119 | count = count + 1; |
| 111 | } | 120 | } |
| 112 | buffer[count] = 0; | ||
| 113 | 121 | ||
| 114 | if (ch == '#') { | 122 | if (ch == '#') { |
| 115 | xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; | 123 | xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; |
| @@ -124,6 +132,7 @@ static void get_packet(char *buffer) | |||
| 124 | if (dbg_io_ops->flush) | 132 | if (dbg_io_ops->flush) |
| 125 | dbg_io_ops->flush(); | 133 | dbg_io_ops->flush(); |
| 126 | } | 134 | } |
| 135 | buffer[count] = 0; | ||
| 127 | } while (checksum != xmitcsum); | 136 | } while (checksum != xmitcsum); |
| 128 | } | 137 | } |
| 129 | 138 | ||
| @@ -1082,12 +1091,11 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) | |||
| 1082 | case 'c': | 1091 | case 'c': |
| 1083 | strcpy(remcom_in_buffer, cmd); | 1092 | strcpy(remcom_in_buffer, cmd); |
| 1084 | return 0; | 1093 | return 0; |
| 1085 | case '?': | 1094 | case '$': |
| 1086 | gdb_cmd_status(ks); | 1095 | strcpy(remcom_in_buffer, cmd); |
| 1087 | break; | 1096 | gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); |
| 1088 | case '\0': | 1097 | gdbstub_prev_in_buf_pos = 0; |
| 1089 | strcpy(remcom_out_buffer, ""); | 1098 | return 0; |
| 1090 | break; | ||
| 1091 | } | 1099 | } |
| 1092 | dbg_io_ops->write_char('+'); | 1100 | dbg_io_ops->write_char('+'); |
| 1093 | put_packet(remcom_out_buffer); | 1101 | put_packet(remcom_out_buffer); |
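The gdbstub changes let kdb hand an already-captured gdb packet over to the stub: the new '$' case stores it in remcom_in_buffer, and gdbstub_read_wait() replays those characters before polling the I/O ops again. A standalone userspace sketch of the replay logic; the names mirror the kernel statics but this is an illustration, not the kernel code:

#include <stdio.h>
#include <string.h>

static char remcom_in_buffer[1024];
static int use_prev_in_buf;	/* length of the buffered packet */
static int prev_in_buf_pos;	/* next character to replay */

static int read_char(void)
{
	if (use_prev_in_buf) {
		if (prev_in_buf_pos < use_prev_in_buf)
			return remcom_in_buffer[prev_in_buf_pos++];
		use_prev_in_buf = 0;	/* drained: fall back to real input */
	}
	return getchar();		/* stand-in for the dbg_io_ops poll loop */
}

int main(void)
{
	int i;

	strcpy(remcom_in_buffer, "$?#3f");	/* packet captured by kdb */
	use_prev_in_buf = strlen(remcom_in_buffer);
	prev_in_buf_pos = 0;

	for (i = 0; i < use_prev_in_buf; i++)	/* replay phase only */
		putchar(read_char());
	putchar('\n');				/* prints "$?#3f" */
	return 0;
}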
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 2f62fe85f16a..7179eac7b41c 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c | |||
| @@ -112,9 +112,8 @@ kdb_bt(int argc, const char **argv) | |||
| 112 | unsigned long addr; | 112 | unsigned long addr; |
| 113 | long offset; | 113 | long offset; |
| 114 | 114 | ||
| 115 | kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ | 115 | /* Prompt after each proc in bta */ |
| 116 | kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each | 116 | kdbgetintenv("BTAPROMPT", &btaprompt); |
| 117 | * proc in bta */ | ||
| 118 | 117 | ||
| 119 | if (strcmp(argv[0], "bta") == 0) { | 118 | if (strcmp(argv[0], "bta") == 0) { |
| 120 | struct task_struct *g, *p; | 119 | struct task_struct *g, *p; |
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds index 56c88e4db309..9834ad303ab6 100644 --- a/kernel/debug/kdb/kdb_cmds +++ b/kernel/debug/kdb/kdb_cmds | |||
| @@ -18,16 +18,12 @@ defcmd dumpcommon "" "Common kdb debugging" | |||
| 18 | endefcmd | 18 | endefcmd |
| 19 | 19 | ||
| 20 | defcmd dumpall "" "First line debugging" | 20 | defcmd dumpall "" "First line debugging" |
| 21 | set BTSYMARG 1 | ||
| 22 | set BTARGS 9 | ||
| 23 | pid R | 21 | pid R |
| 24 | -dumpcommon | 22 | -dumpcommon |
| 25 | -bta | 23 | -bta |
| 26 | endefcmd | 24 | endefcmd |
| 27 | 25 | ||
| 28 | defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" | 26 | defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" |
| 29 | set BTSYMARG 1 | ||
| 30 | set BTARGS 9 | ||
| 31 | pid R | 27 | pid R |
| 32 | -dumpcommon | 28 | -dumpcommon |
| 33 | -btc | 29 | -btc |
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index dd0b1b7dd02c..d9ca9aa481ec 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c | |||
| @@ -30,6 +30,8 @@ EXPORT_SYMBOL_GPL(kdb_poll_funcs); | |||
| 30 | int kdb_poll_idx = 1; | 30 | int kdb_poll_idx = 1; |
| 31 | EXPORT_SYMBOL_GPL(kdb_poll_idx); | 31 | EXPORT_SYMBOL_GPL(kdb_poll_idx); |
| 32 | 32 | ||
| 33 | static struct kgdb_state *kdb_ks; | ||
| 34 | |||
| 33 | int kdb_stub(struct kgdb_state *ks) | 35 | int kdb_stub(struct kgdb_state *ks) |
| 34 | { | 36 | { |
| 35 | int error = 0; | 37 | int error = 0; |
| @@ -39,6 +41,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 39 | kdb_dbtrap_t db_result = KDB_DB_NOBPT; | 41 | kdb_dbtrap_t db_result = KDB_DB_NOBPT; |
| 40 | int i; | 42 | int i; |
| 41 | 43 | ||
| 44 | kdb_ks = ks; | ||
| 42 | if (KDB_STATE(REENTRY)) { | 45 | if (KDB_STATE(REENTRY)) { |
| 43 | reason = KDB_REASON_SWITCH; | 46 | reason = KDB_REASON_SWITCH; |
| 44 | KDB_STATE_CLEAR(REENTRY); | 47 | KDB_STATE_CLEAR(REENTRY); |
| @@ -123,20 +126,8 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 123 | KDB_STATE_CLEAR(PAGER); | 126 | KDB_STATE_CLEAR(PAGER); |
| 124 | kdbnearsym_cleanup(); | 127 | kdbnearsym_cleanup(); |
| 125 | if (error == KDB_CMD_KGDB) { | 128 | if (error == KDB_CMD_KGDB) { |
| 126 | if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { | 129 | if (KDB_STATE(DOING_KGDB)) |
| 127 | /* | ||
| 128 | * This inteface glue which allows kdb to transition in into | ||
| 129 | * the gdb stub. In order to do this the '?' or '' gdb serial | ||
| 130 | * packet response is processed here. And then control is | ||
| 131 | * passed to the gdbstub. | ||
| 132 | */ | ||
| 133 | if (KDB_STATE(DOING_KGDB)) | ||
| 134 | gdbstub_state(ks, "?"); | ||
| 135 | else | ||
| 136 | gdbstub_state(ks, ""); | ||
| 137 | KDB_STATE_CLEAR(DOING_KGDB); | 130 | KDB_STATE_CLEAR(DOING_KGDB); |
| 138 | KDB_STATE_CLEAR(DOING_KGDB2); | ||
| 139 | } | ||
| 140 | return DBG_PASS_EVENT; | 131 | return DBG_PASS_EVENT; |
| 141 | } | 132 | } |
| 142 | kdb_bp_install(ks->linux_regs); | 133 | kdb_bp_install(ks->linux_regs); |
| @@ -166,3 +157,7 @@ int kdb_stub(struct kgdb_state *ks) | |||
| 166 | return kgdb_info[ks->cpu].ret_state; | 157 | return kgdb_info[ks->cpu].ret_state; |
| 167 | } | 158 | } |
| 168 | 159 | ||
| 160 | void kdb_gdb_state_pass(char *buf) | ||
| 161 | { | ||
| 162 | gdbstub_state(kdb_ks, buf); | ||
| 163 | } | ||
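Caching the kgdb_state pointer in kdb_ks is what makes the new helper possible: kdb's input layer can forward a captured gdb packet into the stub at any point instead of relying on the removed '?' and '' handover in kdb_stub(). The kdb_io.c hunks below use it roughly like this (sketch of the call sequence, not verbatim):

/* In kdb's line reader, once a gdb remote packet is recognised: */
kdb_gdb_state_pass(buffer);	/* replay the packet into the gdb stub */
strcpy(buffer, "kgdb");		/* make the main loop see the "kgdb" command */
KDB_STATE_SET(DOING_KGDB);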
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 96fdaac46a80..4802eb5840e1 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
| @@ -31,15 +31,21 @@ char kdb_prompt_str[CMD_BUFLEN]; | |||
| 31 | 31 | ||
| 32 | int kdb_trap_printk; | 32 | int kdb_trap_printk; |
| 33 | 33 | ||
| 34 | static void kgdb_transition_check(char *buffer) | 34 | static int kgdb_transition_check(char *buffer) |
| 35 | { | 35 | { |
| 36 | int slen = strlen(buffer); | 36 | if (buffer[0] != '+' && buffer[0] != '$') { |
| 37 | if (strncmp(buffer, "$?#3f", slen) != 0 && | ||
| 38 | strncmp(buffer, "$qSupported#37", slen) != 0 && | ||
| 39 | strncmp(buffer, "+$qSupported#37", slen) != 0) { | ||
| 40 | KDB_STATE_SET(KGDB_TRANS); | 37 | KDB_STATE_SET(KGDB_TRANS); |
| 41 | kdb_printf("%s", buffer); | 38 | kdb_printf("%s", buffer); |
| 39 | } else { | ||
| 40 | int slen = strlen(buffer); | ||
| 41 | if (slen > 3 && buffer[slen - 3] == '#') { | ||
| 42 | kdb_gdb_state_pass(buffer); | ||
| 43 | strcpy(buffer, "kgdb"); | ||
| 44 | KDB_STATE_SET(DOING_KGDB); | ||
| 45 | return 1; | ||
| 46 | } | ||
| 42 | } | 47 | } |
| 48 | return 0; | ||
| 43 | } | 49 | } |
| 44 | 50 | ||
| 45 | static int kdb_read_get_key(char *buffer, size_t bufsize) | 51 | static int kdb_read_get_key(char *buffer, size_t bufsize) |
| @@ -251,6 +257,10 @@ poll_again: | |||
| 251 | case 13: /* enter */ | 257 | case 13: /* enter */ |
| 252 | *lastchar++ = '\n'; | 258 | *lastchar++ = '\n'; |
| 253 | *lastchar++ = '\0'; | 259 | *lastchar++ = '\0'; |
| 260 | if (!KDB_STATE(KGDB_TRANS)) { | ||
| 261 | KDB_STATE_SET(KGDB_TRANS); | ||
| 262 | kdb_printf("%s", buffer); | ||
| 263 | } | ||
| 254 | kdb_printf("\n"); | 264 | kdb_printf("\n"); |
| 255 | return buffer; | 265 | return buffer; |
| 256 | case 4: /* Del */ | 266 | case 4: /* Del */ |
| @@ -382,22 +392,26 @@ poll_again: | |||
| 382 | * printed characters if we think that | 392 | * printed characters if we think that |
| 383 | * kgdb is connecting, until the check | 393 | * kgdb is connecting, until the check |
| 384 | * fails */ | 394 | * fails */ |
| 385 | if (!KDB_STATE(KGDB_TRANS)) | 395 | if (!KDB_STATE(KGDB_TRANS)) { |
| 386 | kgdb_transition_check(buffer); | 396 | if (kgdb_transition_check(buffer)) |
| 387 | else | 397 | return buffer; |
| 398 | } else { | ||
| 388 | kdb_printf("%c", key); | 399 | kdb_printf("%c", key); |
| 400 | } | ||
| 389 | } | 401 | } |
| 390 | /* Special escape to kgdb */ | 402 | /* Special escape to kgdb */ |
| 391 | if (lastchar - buffer >= 5 && | 403 | if (lastchar - buffer >= 5 && |
| 392 | strcmp(lastchar - 5, "$?#3f") == 0) { | 404 | strcmp(lastchar - 5, "$?#3f") == 0) { |
| 405 | kdb_gdb_state_pass(lastchar - 5); | ||
| 393 | strcpy(buffer, "kgdb"); | 406 | strcpy(buffer, "kgdb"); |
| 394 | KDB_STATE_SET(DOING_KGDB); | 407 | KDB_STATE_SET(DOING_KGDB); |
| 395 | return buffer; | 408 | return buffer; |
| 396 | } | 409 | } |
| 397 | if (lastchar - buffer >= 14 && | 410 | if (lastchar - buffer >= 11 && |
| 398 | strcmp(lastchar - 14, "$qSupported#37") == 0) { | 411 | strcmp(lastchar - 11, "$qSupported") == 0) { |
| 412 | kdb_gdb_state_pass(lastchar - 11); | ||
| 399 | strcpy(buffer, "kgdb"); | 413 | strcpy(buffer, "kgdb"); |
| 400 | KDB_STATE_SET(DOING_KGDB2); | 414 | KDB_STATE_SET(DOING_KGDB); |
| 401 | return buffer; | 415 | return buffer; |
| 402 | } | 416 | } |
| 403 | } | 417 | } |
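kgdb_transition_check() no longer string-matches specific packets such as $?#3f or $qSupported#37; anything that starts with '+' or '$' and carries the '#xx' checksum tail is treated as a gdb remote packet and passed through kdb_gdb_state_pass(). A standalone userspace sketch of that generic test (illustration only, not the kernel function):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool looks_like_gdb_packet(const char *buf)
{
	size_t len = strlen(buf);

	if (buf[0] != '+' && buf[0] != '$')
		return false;			/* ordinary kdb input */
	return len > 3 && buf[len - 3] == '#';	/* "$<payload>#xx" */
}

int main(void)
{
	printf("%d\n", looks_like_gdb_packet("$qSupported#37"));	/* 1 */
	printf("%d\n", looks_like_gdb_packet("md 0x1000"));		/* 0 */
	return 0;
}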
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index be14779bcef6..63786e71a3cd 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -145,7 +145,6 @@ static char *__env[] = { | |||
| 145 | #endif | 145 | #endif |
| 146 | "RADIX=16", | 146 | "RADIX=16", |
| 147 | "MDCOUNT=8", /* lines of md output */ | 147 | "MDCOUNT=8", /* lines of md output */ |
| 148 | "BTARGS=9", /* 9 possible args in bt */ | ||
| 149 | KDB_PLATFORM_ENV, | 148 | KDB_PLATFORM_ENV, |
| 150 | "DTABCOUNT=30", | 149 | "DTABCOUNT=30", |
| 151 | "NOSECT=1", | 150 | "NOSECT=1", |
| @@ -172,6 +171,7 @@ static char *__env[] = { | |||
| 172 | (char *)0, | 171 | (char *)0, |
| 173 | (char *)0, | 172 | (char *)0, |
| 174 | (char *)0, | 173 | (char *)0, |
| 174 | (char *)0, | ||
| 175 | }; | 175 | }; |
| 176 | 176 | ||
| 177 | static const int __nenv = (sizeof(__env) / sizeof(char *)); | 177 | static const int __nenv = (sizeof(__env) / sizeof(char *)); |
| @@ -1386,7 +1386,7 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, | |||
| 1386 | } | 1386 | } |
| 1387 | 1387 | ||
| 1388 | if (result == KDB_CMD_KGDB) { | 1388 | if (result == KDB_CMD_KGDB) { |
| 1389 | if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) | 1389 | if (!KDB_STATE(DOING_KGDB)) |
| 1390 | kdb_printf("Entering please attach debugger " | 1390 | kdb_printf("Entering please attach debugger " |
| 1391 | "or use $D#44+ or $3#33\n"); | 1391 | "or use $D#44+ or $3#33\n"); |
| 1392 | break; | 1392 | break; |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 35d69ed1dfb5..e381d105b40b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | #define KDB_CMD_SS (-1003) | 21 | #define KDB_CMD_SS (-1003) |
| 22 | #define KDB_CMD_SSB (-1004) | 22 | #define KDB_CMD_SSB (-1004) |
| 23 | #define KDB_CMD_KGDB (-1005) | 23 | #define KDB_CMD_KGDB (-1005) |
| 24 | #define KDB_CMD_KGDB2 (-1006) | ||
| 25 | 24 | ||
| 26 | /* Internal debug flags */ | 25 | /* Internal debug flags */ |
| 27 | #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ | 26 | #define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ |
| @@ -146,7 +145,6 @@ extern int kdb_state; | |||
| 146 | * keyboard on this cpu */ | 145 | * keyboard on this cpu */ |
| 147 | #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ | 146 | #define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ |
| 148 | #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ | 147 | #define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ |
| 149 | #define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ | ||
| 150 | #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ | 148 | #define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ |
| 151 | #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch | 149 | #define KDB_STATE_ARCH 0xff000000 /* Reserved for arch |
| 152 | * specific use */ | 150 | * specific use */ |
| @@ -218,6 +216,7 @@ extern void kdb_print_nameval(const char *name, unsigned long val); | |||
| 218 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); | 216 | extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); |
| 219 | extern void kdb_meminfo_proc_show(void); | 217 | extern void kdb_meminfo_proc_show(void); |
| 220 | extern char *kdb_getstr(char *, size_t, char *); | 218 | extern char *kdb_getstr(char *, size_t, char *); |
| 219 | extern void kdb_gdb_state_pass(char *buf); | ||
| 221 | 220 | ||
| 222 | /* Defines for kdb_symbol_print */ | 221 | /* Defines for kdb_symbol_print */ |
| 223 | #define KDB_SP_SPACEB 0x0001 /* Space before string */ | 222 | #define KDB_SP_SPACEB 0x0001 /* Space before string */ |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ead9b610aa71..418b3f7053aa 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
| @@ -19,8 +19,10 @@ | |||
| 19 | #include <linux/time.h> | 19 | #include <linux/time.h> |
| 20 | #include <linux/sysctl.h> | 20 | #include <linux/sysctl.h> |
| 21 | #include <linux/delayacct.h> | 21 | #include <linux/delayacct.h> |
| 22 | #include <linux/module.h> | ||
| 22 | 23 | ||
| 23 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ | 24 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ |
| 25 | EXPORT_SYMBOL_GPL(delayacct_on); | ||
| 24 | struct kmem_cache *delayacct_cache; | 26 | struct kmem_cache *delayacct_cache; |
| 25 | 27 | ||
| 26 | static int __init delayacct_setup_disable(char *str) | 28 | static int __init delayacct_setup_disable(char *str) |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 000000000000..89e5e8aa4c36 --- /dev/null +++ b/kernel/events/Makefile | |||
| @@ -0,0 +1,6 @@ | |||
| 1 | ifdef CONFIG_FUNCTION_TRACER | ||
| 2 | CFLAGS_REMOVE_core.o = -pg | ||
| 3 | endif | ||
| 4 | |||
| 5 | obj-y := core.o ring_buffer.o | ||
| 6 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o | ||
diff --git a/kernel/perf_event.c b/kernel/events/core.c index 8e81a9860a0d..b8785e26ee1c 100644 --- a/kernel/perf_event.c +++ b/kernel/events/core.c | |||
| @@ -2,8 +2,8 @@ | |||
| 2 | * Performance events core code: | 2 | * Performance events core code: |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> |
| 5 | * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar | 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar |
| 6 | * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
| 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> |
| 8 | * | 8 | * |
| 9 | * For licensing details see kernel-base/COPYING | 9 | * For licensing details see kernel-base/COPYING |
| @@ -36,13 +36,15 @@ | |||
| 36 | #include <linux/ftrace_event.h> | 36 | #include <linux/ftrace_event.h> |
| 37 | #include <linux/hw_breakpoint.h> | 37 | #include <linux/hw_breakpoint.h> |
| 38 | 38 | ||
| 39 | #include "internal.h" | ||
| 40 | |||
| 39 | #include <asm/irq_regs.h> | 41 | #include <asm/irq_regs.h> |
| 40 | 42 | ||
| 41 | struct remote_function_call { | 43 | struct remote_function_call { |
| 42 | struct task_struct *p; | 44 | struct task_struct *p; |
| 43 | int (*func)(void *info); | 45 | int (*func)(void *info); |
| 44 | void *info; | 46 | void *info; |
| 45 | int ret; | 47 | int ret; |
| 46 | }; | 48 | }; |
| 47 | 49 | ||
| 48 | static void remote_function(void *data) | 50 | static void remote_function(void *data) |
| @@ -76,10 +78,10 @@ static int | |||
| 76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | 78 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) |
| 77 | { | 79 | { |
| 78 | struct remote_function_call data = { | 80 | struct remote_function_call data = { |
| 79 | .p = p, | 81 | .p = p, |
| 80 | .func = func, | 82 | .func = func, |
| 81 | .info = info, | 83 | .info = info, |
| 82 | .ret = -ESRCH, /* No such (running) process */ | 84 | .ret = -ESRCH, /* No such (running) process */ |
| 83 | }; | 85 | }; |
| 84 | 86 | ||
| 85 | if (task_curr(p)) | 87 | if (task_curr(p)) |
| @@ -100,10 +102,10 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | |||
| 100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | 102 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) |
| 101 | { | 103 | { |
| 102 | struct remote_function_call data = { | 104 | struct remote_function_call data = { |
| 103 | .p = NULL, | 105 | .p = NULL, |
| 104 | .func = func, | 106 | .func = func, |
| 105 | .info = info, | 107 | .info = info, |
| 106 | .ret = -ENXIO, /* No such CPU */ | 108 | .ret = -ENXIO, /* No such CPU */ |
| 107 | }; | 109 | }; |
| 108 | 110 | ||
| 109 | smp_call_function_single(cpu, remote_function, &data, 1); | 111 | smp_call_function_single(cpu, remote_function, &data, 1); |
| @@ -125,7 +127,7 @@ enum event_type_t { | |||
| 125 | * perf_sched_events : >0 events exist | 127 | * perf_sched_events : >0 events exist |
| 126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | 128 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu |
| 127 | */ | 129 | */ |
| 128 | atomic_t perf_sched_events __read_mostly; | 130 | struct jump_label_key perf_sched_events __read_mostly; |
| 129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | 131 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); |
| 130 | 132 | ||
| 131 | static atomic_t nr_mmap_events __read_mostly; | 133 | static atomic_t nr_mmap_events __read_mostly; |
| @@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx) | |||
| 200 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | 202 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); |
| 201 | } | 203 | } |
| 202 | 204 | ||
| 205 | static void perf_ctx_lock(struct perf_cpu_context *cpuctx, | ||
| 206 | struct perf_event_context *ctx) | ||
| 207 | { | ||
| 208 | raw_spin_lock(&cpuctx->ctx.lock); | ||
| 209 | if (ctx) | ||
| 210 | raw_spin_lock(&ctx->lock); | ||
| 211 | } | ||
| 212 | |||
| 213 | static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, | ||
| 214 | struct perf_event_context *ctx) | ||
| 215 | { | ||
| 216 | if (ctx) | ||
| 217 | raw_spin_unlock(&ctx->lock); | ||
| 218 | raw_spin_unlock(&cpuctx->ctx.lock); | ||
| 219 | } | ||
| 220 | |||
| 203 | #ifdef CONFIG_CGROUP_PERF | 221 | #ifdef CONFIG_CGROUP_PERF |
| 204 | 222 | ||
| 205 | /* | 223 | /* |
| @@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
| 340 | rcu_read_lock(); | 358 | rcu_read_lock(); |
| 341 | 359 | ||
| 342 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 360 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
| 343 | |||
| 344 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | 361 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); |
| 345 | 362 | ||
| 346 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
| 347 | |||
| 348 | /* | 363 | /* |
| 349 | * perf_cgroup_events says at least one | 364 | * perf_cgroup_events says at least one |
| 350 | * context on this CPU has cgroup events. | 365 | * context on this CPU has cgroup events. |
| @@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
| 353 | * events for a context. | 368 | * events for a context. |
| 354 | */ | 369 | */ |
| 355 | if (cpuctx->ctx.nr_cgroups > 0) { | 370 | if (cpuctx->ctx.nr_cgroups > 0) { |
| 371 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
| 372 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
| 356 | 373 | ||
| 357 | if (mode & PERF_CGROUP_SWOUT) { | 374 | if (mode & PERF_CGROUP_SWOUT) { |
| 358 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | 375 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); |
| @@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode) | |||
| 372 | cpuctx->cgrp = perf_cgroup_from_task(task); | 389 | cpuctx->cgrp = perf_cgroup_from_task(task); |
| 373 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | 390 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); |
| 374 | } | 391 | } |
| 392 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
| 393 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
| 375 | } | 394 | } |
| 376 | |||
| 377 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
| 378 | } | 395 | } |
| 379 | 396 | ||
| 380 | rcu_read_unlock(); | 397 | rcu_read_unlock(); |
| @@ -586,14 +603,6 @@ static void get_ctx(struct perf_event_context *ctx) | |||
| 586 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); | 603 | WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); |
| 587 | } | 604 | } |
| 588 | 605 | ||
| 589 | static void free_ctx(struct rcu_head *head) | ||
| 590 | { | ||
| 591 | struct perf_event_context *ctx; | ||
| 592 | |||
| 593 | ctx = container_of(head, struct perf_event_context, rcu_head); | ||
| 594 | kfree(ctx); | ||
| 595 | } | ||
| 596 | |||
| 597 | static void put_ctx(struct perf_event_context *ctx) | 606 | static void put_ctx(struct perf_event_context *ctx) |
| 598 | { | 607 | { |
| 599 | if (atomic_dec_and_test(&ctx->refcount)) { | 608 | if (atomic_dec_and_test(&ctx->refcount)) { |
| @@ -601,7 +610,7 @@ static void put_ctx(struct perf_event_context *ctx) | |||
| 601 | put_ctx(ctx->parent_ctx); | 610 | put_ctx(ctx->parent_ctx); |
| 602 | if (ctx->task) | 611 | if (ctx->task) |
| 603 | put_task_struct(ctx->task); | 612 | put_task_struct(ctx->task); |
| 604 | call_rcu(&ctx->rcu_head, free_ctx); | 613 | kfree_rcu(ctx, rcu_head); |
| 605 | } | 614 | } |
| 606 | } | 615 | } |
| 607 | 616 | ||
| @@ -739,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event) | |||
| 739 | 748 | ||
| 740 | /* | 749 | /* |
| 741 | * Update the total_time_enabled and total_time_running fields for a event. | 750 | * Update the total_time_enabled and total_time_running fields for a event. |
| 751 | * The caller of this function needs to hold the ctx->lock. | ||
| 742 | */ | 752 | */ |
| 743 | static void update_event_times(struct perf_event *event) | 753 | static void update_event_times(struct perf_event *event) |
| 744 | { | 754 | { |
| @@ -1113,6 +1123,10 @@ static int __perf_remove_from_context(void *info) | |||
| 1113 | raw_spin_lock(&ctx->lock); | 1123 | raw_spin_lock(&ctx->lock); |
| 1114 | event_sched_out(event, cpuctx, ctx); | 1124 | event_sched_out(event, cpuctx, ctx); |
| 1115 | list_del_event(event, ctx); | 1125 | list_del_event(event, ctx); |
| 1126 | if (!ctx->nr_events && cpuctx->task_ctx == ctx) { | ||
| 1127 | ctx->is_active = 0; | ||
| 1128 | cpuctx->task_ctx = NULL; | ||
| 1129 | } | ||
| 1116 | raw_spin_unlock(&ctx->lock); | 1130 | raw_spin_unlock(&ctx->lock); |
| 1117 | 1131 | ||
| 1118 | return 0; | 1132 | return 0; |
| @@ -1462,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event, | |||
| 1462 | event->tstamp_stopped = tstamp; | 1476 | event->tstamp_stopped = tstamp; |
| 1463 | } | 1477 | } |
| 1464 | 1478 | ||
| 1465 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | 1479 | static void task_ctx_sched_out(struct perf_event_context *ctx); |
| 1466 | struct task_struct *tsk); | 1480 | static void |
| 1481 | ctx_sched_in(struct perf_event_context *ctx, | ||
| 1482 | struct perf_cpu_context *cpuctx, | ||
| 1483 | enum event_type_t event_type, | ||
| 1484 | struct task_struct *task); | ||
| 1485 | |||
| 1486 | static void perf_event_sched_in(struct perf_cpu_context *cpuctx, | ||
| 1487 | struct perf_event_context *ctx, | ||
| 1488 | struct task_struct *task) | ||
| 1489 | { | ||
| 1490 | cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); | ||
| 1491 | if (ctx) | ||
| 1492 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); | ||
| 1493 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); | ||
| 1494 | if (ctx) | ||
| 1495 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); | ||
| 1496 | } | ||
| 1467 | 1497 | ||
| 1468 | /* | 1498 | /* |
| 1469 | * Cross CPU call to install and enable a performance event | 1499 | * Cross CPU call to install and enable a performance event |
| @@ -1474,20 +1504,37 @@ static int __perf_install_in_context(void *info) | |||
| 1474 | { | 1504 | { |
| 1475 | struct perf_event *event = info; | 1505 | struct perf_event *event = info; |
| 1476 | struct perf_event_context *ctx = event->ctx; | 1506 | struct perf_event_context *ctx = event->ctx; |
| 1477 | struct perf_event *leader = event->group_leader; | ||
| 1478 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1507 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
| 1479 | int err; | 1508 | struct perf_event_context *task_ctx = cpuctx->task_ctx; |
| 1509 | struct task_struct *task = current; | ||
| 1510 | |||
| 1511 | perf_ctx_lock(cpuctx, task_ctx); | ||
| 1512 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
| 1480 | 1513 | ||
| 1481 | /* | 1514 | /* |
| 1482 | * In case we're installing a new context to an already running task, | 1515 | * If there was an active task_ctx schedule it out. |
| 1483 | * could also happen before perf_event_task_sched_in() on architectures | ||
| 1484 | * which do context switches with IRQs enabled. | ||
| 1485 | */ | 1516 | */ |
| 1486 | if (ctx->task && !cpuctx->task_ctx) | 1517 | if (task_ctx) |
| 1487 | perf_event_context_sched_in(ctx, ctx->task); | 1518 | task_ctx_sched_out(task_ctx); |
| 1519 | |||
| 1520 | /* | ||
| 1521 | * If the context we're installing events in is not the | ||
| 1522 | * active task_ctx, flip them. | ||
| 1523 | */ | ||
| 1524 | if (ctx->task && task_ctx != ctx) { | ||
| 1525 | if (task_ctx) | ||
| 1526 | raw_spin_unlock(&task_ctx->lock); | ||
| 1527 | raw_spin_lock(&ctx->lock); | ||
| 1528 | task_ctx = ctx; | ||
| 1529 | } | ||
| 1530 | |||
| 1531 | if (task_ctx) { | ||
| 1532 | cpuctx->task_ctx = task_ctx; | ||
| 1533 | task = task_ctx->task; | ||
| 1534 | } | ||
| 1535 | |||
| 1536 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
| 1488 | 1537 | ||
| 1489 | raw_spin_lock(&ctx->lock); | ||
| 1490 | ctx->is_active = 1; | ||
| 1491 | update_context_time(ctx); | 1538 | update_context_time(ctx); |
| 1492 | /* | 1539 | /* |
| 1493 | * update cgrp time only if current cgrp | 1540 | * update cgrp time only if current cgrp |
| @@ -1498,43 +1545,13 @@ static int __perf_install_in_context(void *info) | |||
| 1498 | 1545 | ||
| 1499 | add_event_to_ctx(event, ctx); | 1546 | add_event_to_ctx(event, ctx); |
| 1500 | 1547 | ||
| 1501 | if (!event_filter_match(event)) | ||
| 1502 | goto unlock; | ||
| 1503 | |||
| 1504 | /* | 1548 | /* |
| 1505 | * Don't put the event on if it is disabled or if | 1549 | * Schedule everything back in |
| 1506 | * it is in a group and the group isn't on. | ||
| 1507 | */ | 1550 | */ |
| 1508 | if (event->state != PERF_EVENT_STATE_INACTIVE || | 1551 | perf_event_sched_in(cpuctx, task_ctx, task); |
| 1509 | (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) | ||
| 1510 | goto unlock; | ||
| 1511 | |||
| 1512 | /* | ||
| 1513 | * An exclusive event can't go on if there are already active | ||
| 1514 | * hardware events, and no hardware event can go on if there | ||
| 1515 | * is already an exclusive event on. | ||
| 1516 | */ | ||
| 1517 | if (!group_can_go_on(event, cpuctx, 1)) | ||
| 1518 | err = -EEXIST; | ||
| 1519 | else | ||
| 1520 | err = event_sched_in(event, cpuctx, ctx); | ||
| 1521 | |||
| 1522 | if (err) { | ||
| 1523 | /* | ||
| 1524 | * This event couldn't go on. If it is in a group | ||
| 1525 | * then we have to pull the whole group off. | ||
| 1526 | * If the event group is pinned then put it in error state. | ||
| 1527 | */ | ||
| 1528 | if (leader != event) | ||
| 1529 | group_sched_out(leader, cpuctx, ctx); | ||
| 1530 | if (leader->attr.pinned) { | ||
| 1531 | update_group_times(leader); | ||
| 1532 | leader->state = PERF_EVENT_STATE_ERROR; | ||
| 1533 | } | ||
| 1534 | } | ||
| 1535 | 1552 | ||
| 1536 | unlock: | 1553 | perf_pmu_enable(cpuctx->ctx.pmu); |
| 1537 | raw_spin_unlock(&ctx->lock); | 1554 | perf_ctx_unlock(cpuctx, task_ctx); |
| 1538 | 1555 | ||
| 1539 | return 0; | 1556 | return 0; |
| 1540 | } | 1557 | } |
| @@ -1747,7 +1764,7 @@ out: | |||
| 1747 | raw_spin_unlock_irq(&ctx->lock); | 1764 | raw_spin_unlock_irq(&ctx->lock); |
| 1748 | } | 1765 | } |
| 1749 | 1766 | ||
| 1750 | static int perf_event_refresh(struct perf_event *event, int refresh) | 1767 | int perf_event_refresh(struct perf_event *event, int refresh) |
| 1751 | { | 1768 | { |
| 1752 | /* | 1769 | /* |
| 1753 | * not supported on inherited events | 1770 | * not supported on inherited events |
| @@ -1760,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh) | |||
| 1760 | 1777 | ||
| 1761 | return 0; | 1778 | return 0; |
| 1762 | } | 1779 | } |
| 1780 | EXPORT_SYMBOL_GPL(perf_event_refresh); | ||
| 1763 | 1781 | ||
| 1764 | static void ctx_sched_out(struct perf_event_context *ctx, | 1782 | static void ctx_sched_out(struct perf_event_context *ctx, |
| 1765 | struct perf_cpu_context *cpuctx, | 1783 | struct perf_cpu_context *cpuctx, |
| 1766 | enum event_type_t event_type) | 1784 | enum event_type_t event_type) |
| 1767 | { | 1785 | { |
| 1768 | struct perf_event *event; | 1786 | struct perf_event *event; |
| 1787 | int is_active = ctx->is_active; | ||
| 1769 | 1788 | ||
| 1770 | raw_spin_lock(&ctx->lock); | 1789 | ctx->is_active &= ~event_type; |
| 1771 | perf_pmu_disable(ctx->pmu); | ||
| 1772 | ctx->is_active = 0; | ||
| 1773 | if (likely(!ctx->nr_events)) | 1790 | if (likely(!ctx->nr_events)) |
| 1774 | goto out; | 1791 | return; |
| 1792 | |||
| 1775 | update_context_time(ctx); | 1793 | update_context_time(ctx); |
| 1776 | update_cgrp_time_from_cpuctx(cpuctx); | 1794 | update_cgrp_time_from_cpuctx(cpuctx); |
| 1777 | |||
| 1778 | if (!ctx->nr_active) | 1795 | if (!ctx->nr_active) |
| 1779 | goto out; | 1796 | return; |
| 1780 | 1797 | ||
| 1781 | if (event_type & EVENT_PINNED) { | 1798 | perf_pmu_disable(ctx->pmu); |
| 1799 | if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { | ||
| 1782 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) | 1800 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) |
| 1783 | group_sched_out(event, cpuctx, ctx); | 1801 | group_sched_out(event, cpuctx, ctx); |
| 1784 | } | 1802 | } |
| 1785 | 1803 | ||
| 1786 | if (event_type & EVENT_FLEXIBLE) { | 1804 | if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { |
| 1787 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) | 1805 | list_for_each_entry(event, &ctx->flexible_groups, group_entry) |
| 1788 | group_sched_out(event, cpuctx, ctx); | 1806 | group_sched_out(event, cpuctx, ctx); |
| 1789 | } | 1807 | } |
| 1790 | out: | ||
| 1791 | perf_pmu_enable(ctx->pmu); | 1808 | perf_pmu_enable(ctx->pmu); |
| 1792 | raw_spin_unlock(&ctx->lock); | ||
| 1793 | } | 1809 | } |
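ctx_sched_out() now runs with ctx->lock already held by its callers and tracks which event types are on the PMU via ctx->is_active instead of a plain 0/1 flag. For reference, the event_type_t bits this masking relies on are defined earlier in the file (values shown as assumed, not visible in this excerpt):

enum event_type_t {
        EVENT_FLEXIBLE  = 0x1,
        EVENT_PINNED    = 0x2,
        EVENT_ALL       = EVENT_FLEXIBLE | EVENT_PINNED,
};

With that, ctx->is_active &= ~event_type clears only the bits actually being scheduled out, and the (is_active & ...) && (event_type & ...) tests skip group lists that were never scheduled in to begin with.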
| 1794 | 1810 | ||
| 1795 | /* | 1811 | /* |
| @@ -1937,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, | |||
| 1937 | rcu_read_unlock(); | 1953 | rcu_read_unlock(); |
| 1938 | 1954 | ||
| 1939 | if (do_switch) { | 1955 | if (do_switch) { |
| 1956 | raw_spin_lock(&ctx->lock); | ||
| 1940 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); | 1957 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
| 1941 | cpuctx->task_ctx = NULL; | 1958 | cpuctx->task_ctx = NULL; |
| 1959 | raw_spin_unlock(&ctx->lock); | ||
| 1942 | } | 1960 | } |
| 1943 | } | 1961 | } |
| 1944 | 1962 | ||
| @@ -1973,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
| 1973 | perf_cgroup_sched_out(task); | 1991 | perf_cgroup_sched_out(task); |
| 1974 | } | 1992 | } |
| 1975 | 1993 | ||
| 1976 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1994 | static void task_ctx_sched_out(struct perf_event_context *ctx) |
| 1977 | enum event_type_t event_type) | ||
| 1978 | { | 1995 | { |
| 1979 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1996 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
| 1980 | 1997 | ||
| @@ -1984,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, | |||
| 1984 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) | 2001 | if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) |
| 1985 | return; | 2002 | return; |
| 1986 | 2003 | ||
| 1987 | ctx_sched_out(ctx, cpuctx, event_type); | 2004 | ctx_sched_out(ctx, cpuctx, EVENT_ALL); |
| 1988 | cpuctx->task_ctx = NULL; | 2005 | cpuctx->task_ctx = NULL; |
| 1989 | } | 2006 | } |
| 1990 | 2007 | ||
| @@ -2063,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
| 2063 | struct task_struct *task) | 2080 | struct task_struct *task) |
| 2064 | { | 2081 | { |
| 2065 | u64 now; | 2082 | u64 now; |
| 2083 | int is_active = ctx->is_active; | ||
| 2066 | 2084 | ||
| 2067 | raw_spin_lock(&ctx->lock); | 2085 | ctx->is_active |= event_type; |
| 2068 | ctx->is_active = 1; | ||
| 2069 | if (likely(!ctx->nr_events)) | 2086 | if (likely(!ctx->nr_events)) |
| 2070 | goto out; | 2087 | return; |
| 2071 | 2088 | ||
| 2072 | now = perf_clock(); | 2089 | now = perf_clock(); |
| 2073 | ctx->timestamp = now; | 2090 | ctx->timestamp = now; |
| @@ -2076,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx, | |||
| 2076 | * First go through the list and put on any pinned groups | 2093 | * First go through the list and put on any pinned groups |
| 2077 | * in order to give them the best chance of going on. | 2094 | * in order to give them the best chance of going on. |
| 2078 | */ | 2095 | */ |
| 2079 | if (event_type & EVENT_PINNED) | 2096 | if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) |
| 2080 | ctx_pinned_sched_in(ctx, cpuctx); | 2097 | ctx_pinned_sched_in(ctx, cpuctx); |
| 2081 | 2098 | ||
| 2082 | /* Then walk through the lower prio flexible groups */ | 2099 | /* Then walk through the lower prio flexible groups */ |
| 2083 | if (event_type & EVENT_FLEXIBLE) | 2100 | if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) |
| 2084 | ctx_flexible_sched_in(ctx, cpuctx); | 2101 | ctx_flexible_sched_in(ctx, cpuctx); |
| 2085 | |||
| 2086 | out: | ||
| 2087 | raw_spin_unlock(&ctx->lock); | ||
| 2088 | } | 2102 | } |
| 2089 | 2103 | ||
| 2090 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2104 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
| @@ -2096,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
| 2096 | ctx_sched_in(ctx, cpuctx, event_type, task); | 2110 | ctx_sched_in(ctx, cpuctx, event_type, task); |
| 2097 | } | 2111 | } |
| 2098 | 2112 | ||
| 2099 | static void task_ctx_sched_in(struct perf_event_context *ctx, | ||
| 2100 | enum event_type_t event_type) | ||
| 2101 | { | ||
| 2102 | struct perf_cpu_context *cpuctx; | ||
| 2103 | |||
| 2104 | cpuctx = __get_cpu_context(ctx); | ||
| 2105 | if (cpuctx->task_ctx == ctx) | ||
| 2106 | return; | ||
| 2107 | |||
| 2108 | ctx_sched_in(ctx, cpuctx, event_type, NULL); | ||
| 2109 | cpuctx->task_ctx = ctx; | ||
| 2110 | } | ||
| 2111 | |||
| 2112 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | 2113 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
| 2113 | struct task_struct *task) | 2114 | struct task_struct *task) |
| 2114 | { | 2115 | { |
| @@ -2118,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 2118 | if (cpuctx->task_ctx == ctx) | 2119 | if (cpuctx->task_ctx == ctx) |
| 2119 | return; | 2120 | return; |
| 2120 | 2121 | ||
| 2122 | perf_ctx_lock(cpuctx, ctx); | ||
| 2121 | perf_pmu_disable(ctx->pmu); | 2123 | perf_pmu_disable(ctx->pmu); |
| 2122 | /* | 2124 | /* |
| 2123 | * We want to keep the following priority order: | 2125 | * We want to keep the following priority order: |
| @@ -2126,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, | |||
| 2126 | */ | 2128 | */ |
| 2127 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2129 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 2128 | 2130 | ||
| 2129 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); | 2131 | perf_event_sched_in(cpuctx, ctx, task); |
| 2130 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); | ||
| 2131 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); | ||
| 2132 | 2132 | ||
| 2133 | cpuctx->task_ctx = ctx; | 2133 | cpuctx->task_ctx = ctx; |
| 2134 | 2134 | ||
| 2135 | perf_pmu_enable(ctx->pmu); | ||
| 2136 | perf_ctx_unlock(cpuctx, ctx); | ||
| 2137 | |||
| 2135 | /* | 2138 | /* |
| 2136 | * Since these rotations are per-cpu, we need to ensure the | 2139 | * Since these rotations are per-cpu, we need to ensure the |
| 2137 | * cpu-context we got scheduled on is actually rotating. | 2140 | * cpu-context we got scheduled on is actually rotating. |
| 2138 | */ | 2141 | */ |
| 2139 | perf_pmu_rotate_start(ctx->pmu); | 2142 | perf_pmu_rotate_start(ctx->pmu); |
| 2140 | perf_pmu_enable(ctx->pmu); | ||
| 2141 | } | 2143 | } |
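perf_event_context_sched_in() now brackets the whole out/in sequence with perf_ctx_lock()/perf_ctx_unlock(), replacing the per-function raw_spin_lock(&ctx->lock) calls removed from ctx_sched_out()/ctx_sched_in() above. Those helpers are introduced elsewhere in this series; a sketch of the assumed pairing (cpu context lock first, the optional task context lock nested inside):

static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx)
                raw_spin_lock(&ctx->lock);
}

static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                raw_spin_unlock(&ctx->lock);
        raw_spin_unlock(&cpuctx->ctx.lock);
}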
| 2142 | 2144 | ||
| 2143 | /* | 2145 | /* |
| @@ -2277,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
| 2277 | u64 interrupts, now; | 2279 | u64 interrupts, now; |
| 2278 | s64 delta; | 2280 | s64 delta; |
| 2279 | 2281 | ||
| 2280 | raw_spin_lock(&ctx->lock); | ||
| 2281 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { | 2282 | list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
| 2282 | if (event->state != PERF_EVENT_STATE_ACTIVE) | 2283 | if (event->state != PERF_EVENT_STATE_ACTIVE) |
| 2283 | continue; | 2284 | continue; |
| @@ -2309,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
| 2309 | if (delta > 0) | 2310 | if (delta > 0) |
| 2310 | perf_adjust_period(event, period, delta); | 2311 | perf_adjust_period(event, period, delta); |
| 2311 | } | 2312 | } |
| 2312 | raw_spin_unlock(&ctx->lock); | ||
| 2313 | } | 2313 | } |
| 2314 | 2314 | ||
| 2315 | /* | 2315 | /* |
| @@ -2317,16 +2317,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) | |||
| 2317 | */ | 2317 | */ |
| 2318 | static void rotate_ctx(struct perf_event_context *ctx) | 2318 | static void rotate_ctx(struct perf_event_context *ctx) |
| 2319 | { | 2319 | { |
| 2320 | raw_spin_lock(&ctx->lock); | ||
| 2321 | |||
| 2322 | /* | 2320 | /* |
| 2323 | * Rotate the first entry last of non-pinned groups. Rotation might be | 2321 | * Rotate the first entry last of non-pinned groups. Rotation might be |
| 2324 | * disabled by the inheritance code. | 2322 | * disabled by the inheritance code. |
| 2325 | */ | 2323 | */ |
| 2326 | if (!ctx->rotate_disable) | 2324 | if (!ctx->rotate_disable) |
| 2327 | list_rotate_left(&ctx->flexible_groups); | 2325 | list_rotate_left(&ctx->flexible_groups); |
| 2328 | |||
| 2329 | raw_spin_unlock(&ctx->lock); | ||
| 2330 | } | 2326 | } |
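rotate_ctx() no longer takes ctx->lock itself; the caller (perf_rotate_context(), next hunk) now holds it via perf_ctx_lock(). The rotation itself is unchanged and relies on the generic list helper from include/linux/list.h, which simply moves the first flexible group to the tail so a different group gets first pick on the next tick:

static inline void list_rotate_left(struct list_head *head)
{
        struct list_head *first;

        if (!list_empty(head)) {
                first = head->next;
                list_move_tail(first, head);
        }
}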
| 2331 | 2327 | ||
| 2332 | /* | 2328 | /* |
| @@ -2353,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 2353 | rotate = 1; | 2349 | rotate = 1; |
| 2354 | } | 2350 | } |
| 2355 | 2351 | ||
| 2352 | perf_ctx_lock(cpuctx, cpuctx->task_ctx); | ||
| 2356 | perf_pmu_disable(cpuctx->ctx.pmu); | 2353 | perf_pmu_disable(cpuctx->ctx.pmu); |
| 2357 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); | 2354 | perf_ctx_adjust_freq(&cpuctx->ctx, interval); |
| 2358 | if (ctx) | 2355 | if (ctx) |
| @@ -2363,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 2363 | 2360 | ||
| 2364 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2361 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 2365 | if (ctx) | 2362 | if (ctx) |
| 2366 | task_ctx_sched_out(ctx, EVENT_FLEXIBLE); | 2363 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); |
| 2367 | 2364 | ||
| 2368 | rotate_ctx(&cpuctx->ctx); | 2365 | rotate_ctx(&cpuctx->ctx); |
| 2369 | if (ctx) | 2366 | if (ctx) |
| 2370 | rotate_ctx(ctx); | 2367 | rotate_ctx(ctx); |
| 2371 | 2368 | ||
| 2372 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); | 2369 | perf_event_sched_in(cpuctx, ctx, current); |
| 2373 | if (ctx) | ||
| 2374 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | ||
| 2375 | 2370 | ||
| 2376 | done: | 2371 | done: |
| 2377 | if (remove) | 2372 | if (remove) |
| 2378 | list_del_init(&cpuctx->rotation_list); | 2373 | list_del_init(&cpuctx->rotation_list); |
| 2379 | 2374 | ||
| 2380 | perf_pmu_enable(cpuctx->ctx.pmu); | 2375 | perf_pmu_enable(cpuctx->ctx.pmu); |
| 2376 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | ||
| 2381 | } | 2377 | } |
| 2382 | 2378 | ||
| 2383 | void perf_event_task_tick(void) | 2379 | void perf_event_task_tick(void) |
| @@ -2432,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
| 2432 | * in. | 2428 | * in. |
| 2433 | */ | 2429 | */ |
| 2434 | perf_cgroup_sched_out(current); | 2430 | perf_cgroup_sched_out(current); |
| 2435 | task_ctx_sched_out(ctx, EVENT_ALL); | ||
| 2436 | 2431 | ||
| 2437 | raw_spin_lock(&ctx->lock); | 2432 | raw_spin_lock(&ctx->lock); |
| 2433 | task_ctx_sched_out(ctx); | ||
| 2438 | 2434 | ||
| 2439 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { | 2435 | list_for_each_entry(event, &ctx->pinned_groups, group_entry) { |
| 2440 | ret = event_enable_on_exec(event, ctx); | 2436 | ret = event_enable_on_exec(event, ctx); |
| @@ -2843,16 +2839,12 @@ retry: | |||
| 2843 | unclone_ctx(ctx); | 2839 | unclone_ctx(ctx); |
| 2844 | ++ctx->pin_count; | 2840 | ++ctx->pin_count; |
| 2845 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2841 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
| 2846 | } | 2842 | } else { |
| 2847 | |||
| 2848 | if (!ctx) { | ||
| 2849 | ctx = alloc_perf_context(pmu, task); | 2843 | ctx = alloc_perf_context(pmu, task); |
| 2850 | err = -ENOMEM; | 2844 | err = -ENOMEM; |
| 2851 | if (!ctx) | 2845 | if (!ctx) |
| 2852 | goto errout; | 2846 | goto errout; |
| 2853 | 2847 | ||
| 2854 | get_ctx(ctx); | ||
| 2855 | |||
| 2856 | err = 0; | 2848 | err = 0; |
| 2857 | mutex_lock(&task->perf_event_mutex); | 2849 | mutex_lock(&task->perf_event_mutex); |
| 2858 | /* | 2850 | /* |
| @@ -2864,14 +2856,14 @@ retry: | |||
| 2864 | else if (task->perf_event_ctxp[ctxn]) | 2856 | else if (task->perf_event_ctxp[ctxn]) |
| 2865 | err = -EAGAIN; | 2857 | err = -EAGAIN; |
| 2866 | else { | 2858 | else { |
| 2859 | get_ctx(ctx); | ||
| 2867 | ++ctx->pin_count; | 2860 | ++ctx->pin_count; |
| 2868 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | 2861 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); |
| 2869 | } | 2862 | } |
| 2870 | mutex_unlock(&task->perf_event_mutex); | 2863 | mutex_unlock(&task->perf_event_mutex); |
| 2871 | 2864 | ||
| 2872 | if (unlikely(err)) { | 2865 | if (unlikely(err)) { |
| 2873 | put_task_struct(task); | 2866 | put_ctx(ctx); |
| 2874 | kfree(ctx); | ||
| 2875 | 2867 | ||
| 2876 | if (err == -EAGAIN) | 2868 | if (err == -EAGAIN) |
| 2877 | goto retry; | 2869 | goto retry; |
| @@ -2898,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
| 2898 | kfree(event); | 2890 | kfree(event); |
| 2899 | } | 2891 | } |
| 2900 | 2892 | ||
| 2901 | static void perf_buffer_put(struct perf_buffer *buffer); | 2893 | static void ring_buffer_put(struct ring_buffer *rb); |
| 2902 | 2894 | ||
| 2903 | static void free_event(struct perf_event *event) | 2895 | static void free_event(struct perf_event *event) |
| 2904 | { | 2896 | { |
| @@ -2921,9 +2913,9 @@ static void free_event(struct perf_event *event) | |||
| 2921 | } | 2913 | } |
| 2922 | } | 2914 | } |
| 2923 | 2915 | ||
| 2924 | if (event->buffer) { | 2916 | if (event->rb) { |
| 2925 | perf_buffer_put(event->buffer); | 2917 | ring_buffer_put(event->rb); |
| 2926 | event->buffer = NULL; | 2918 | event->rb = NULL; |
| 2927 | } | 2919 | } |
| 2928 | 2920 | ||
| 2929 | if (is_cgroup_event(event)) | 2921 | if (is_cgroup_event(event)) |
| @@ -2942,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event) | |||
| 2942 | { | 2934 | { |
| 2943 | struct perf_event_context *ctx = event->ctx; | 2935 | struct perf_event_context *ctx = event->ctx; |
| 2944 | 2936 | ||
| 2945 | /* | ||
| 2946 | * Remove from the PMU, can't get re-enabled since we got | ||
| 2947 | * here because the last ref went. | ||
| 2948 | */ | ||
| 2949 | perf_event_disable(event); | ||
| 2950 | |||
| 2951 | WARN_ON_ONCE(ctx->parent_ctx); | 2937 | WARN_ON_ONCE(ctx->parent_ctx); |
| 2952 | /* | 2938 | /* |
| 2953 | * There are two ways this annotation is useful: | 2939 | * There are two ways this annotation is useful: |
| @@ -2964,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event) | |||
| 2964 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); | 2950 | mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); |
| 2965 | raw_spin_lock_irq(&ctx->lock); | 2951 | raw_spin_lock_irq(&ctx->lock); |
| 2966 | perf_group_detach(event); | 2952 | perf_group_detach(event); |
| 2967 | list_del_event(event, ctx); | ||
| 2968 | raw_spin_unlock_irq(&ctx->lock); | 2953 | raw_spin_unlock_irq(&ctx->lock); |
| 2954 | perf_remove_from_context(event); | ||
| 2969 | mutex_unlock(&ctx->mutex); | 2955 | mutex_unlock(&ctx->mutex); |
| 2970 | 2956 | ||
| 2971 | free_event(event); | 2957 | free_event(event); |
| @@ -3157,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
| 3157 | static unsigned int perf_poll(struct file *file, poll_table *wait) | 3143 | static unsigned int perf_poll(struct file *file, poll_table *wait) |
| 3158 | { | 3144 | { |
| 3159 | struct perf_event *event = file->private_data; | 3145 | struct perf_event *event = file->private_data; |
| 3160 | struct perf_buffer *buffer; | 3146 | struct ring_buffer *rb; |
| 3161 | unsigned int events = POLL_HUP; | 3147 | unsigned int events = POLL_HUP; |
| 3162 | 3148 | ||
| 3163 | rcu_read_lock(); | 3149 | rcu_read_lock(); |
| 3164 | buffer = rcu_dereference(event->buffer); | 3150 | rb = rcu_dereference(event->rb); |
| 3165 | if (buffer) | 3151 | if (rb) |
| 3166 | events = atomic_xchg(&buffer->poll, 0); | 3152 | events = atomic_xchg(&rb->poll, 0); |
| 3167 | rcu_read_unlock(); | 3153 | rcu_read_unlock(); |
| 3168 | 3154 | ||
| 3169 | poll_wait(file, &event->waitq, wait); | 3155 | poll_wait(file, &event->waitq, wait); |
| @@ -3366,6 +3352,18 @@ static int perf_event_index(struct perf_event *event) | |||
| 3366 | return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; | 3352 | return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; |
| 3367 | } | 3353 | } |
| 3368 | 3354 | ||
| 3355 | static void calc_timer_values(struct perf_event *event, | ||
| 3356 | u64 *running, | ||
| 3357 | u64 *enabled) | ||
| 3358 | { | ||
| 3359 | u64 now, ctx_time; | ||
| 3360 | |||
| 3361 | now = perf_clock(); | ||
| 3362 | ctx_time = event->shadow_ctx_time + now; | ||
| 3363 | *enabled = ctx_time - event->tstamp_enabled; | ||
| 3364 | *running = ctx_time - event->tstamp_running; | ||
| 3365 | } | ||
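calc_timer_values() derives the enabled/running totals purely from the event's shadow_ctx_time snapshot so it can be used from NMI context, where taking ctx->lock for update_context_time() is not an option. A small worked example of the arithmetic (numbers are illustrative):

        /*
         * If tstamp_enabled = 1000, tstamp_running = 1500 and
         * shadow_ctx_time + perf_clock() evaluates to ctx_time = 4000:
         *
         *      enabled = 4000 - 1000 = 3000
         *      running = 4000 - 1500 = 2500
         */

Note that the prototype orders its out-parameters (running, enabled) while both call sites in this diff pass (&enabled, &running), so as written the two values land in swapped variables at the callers; a mismatch worth flagging for a follow-up.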
| 3366 | |||
| 3369 | /* | 3367 | /* |
| 3370 | * Callers need to ensure there can be no nesting of this function, otherwise | 3368 | * Callers need to ensure there can be no nesting of this function, otherwise |
| 3371 | * the seqlock logic goes bad. We can not serialize this because the arch | 3369 | * the seqlock logic goes bad. We can not serialize this because the arch |
| @@ -3374,14 +3372,25 @@ static int perf_event_index(struct perf_event *event) | |||
| 3374 | void perf_event_update_userpage(struct perf_event *event) | 3372 | void perf_event_update_userpage(struct perf_event *event) |
| 3375 | { | 3373 | { |
| 3376 | struct perf_event_mmap_page *userpg; | 3374 | struct perf_event_mmap_page *userpg; |
| 3377 | struct perf_buffer *buffer; | 3375 | struct ring_buffer *rb; |
| 3376 | u64 enabled, running; | ||
| 3378 | 3377 | ||
| 3379 | rcu_read_lock(); | 3378 | rcu_read_lock(); |
| 3380 | buffer = rcu_dereference(event->buffer); | 3379 | /* |
| 3381 | if (!buffer) | 3380 | * compute total_time_enabled, total_time_running |
| 3381 | * based on snapshot values taken when the event | ||
| 3382 | * was last scheduled in. | ||
| 3383 | * | ||
| 3384 | * we cannot simply call update_context_time() | ||
| 3385 | * because of locking issues, as we can be called in | ||
| 3386 | * NMI context | ||
| 3387 | */ | ||
| 3388 | calc_timer_values(event, &enabled, &running); | ||
| 3389 | rb = rcu_dereference(event->rb); | ||
| 3390 | if (!rb) | ||
| 3382 | goto unlock; | 3391 | goto unlock; |
| 3383 | 3392 | ||
| 3384 | userpg = buffer->user_page; | 3393 | userpg = rb->user_page; |
| 3385 | 3394 | ||
| 3386 | /* | 3395 | /* |
| 3387 | * Disable preemption so as to not let the corresponding user-space | 3396 | * Disable preemption so as to not let the corresponding user-space |
| @@ -3395,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event) | |||
| 3395 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 3404 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
| 3396 | userpg->offset -= local64_read(&event->hw.prev_count); | 3405 | userpg->offset -= local64_read(&event->hw.prev_count); |
| 3397 | 3406 | ||
| 3398 | userpg->time_enabled = event->total_time_enabled + | 3407 | userpg->time_enabled = enabled + |
| 3399 | atomic64_read(&event->child_total_time_enabled); | 3408 | atomic64_read(&event->child_total_time_enabled); |
| 3400 | 3409 | ||
| 3401 | userpg->time_running = event->total_time_running + | 3410 | userpg->time_running = running + |
| 3402 | atomic64_read(&event->child_total_time_running); | 3411 | atomic64_read(&event->child_total_time_running); |
| 3403 | 3412 | ||
| 3404 | barrier(); | 3413 | barrier(); |
| @@ -3408,220 +3417,10 @@ unlock: | |||
| 3408 | rcu_read_unlock(); | 3417 | rcu_read_unlock(); |
| 3409 | } | 3418 | } |
| 3410 | 3419 | ||
| 3411 | static unsigned long perf_data_size(struct perf_buffer *buffer); | ||
| 3412 | |||
| 3413 | static void | ||
| 3414 | perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) | ||
| 3415 | { | ||
| 3416 | long max_size = perf_data_size(buffer); | ||
| 3417 | |||
| 3418 | if (watermark) | ||
| 3419 | buffer->watermark = min(max_size, watermark); | ||
| 3420 | |||
| 3421 | if (!buffer->watermark) | ||
| 3422 | buffer->watermark = max_size / 2; | ||
| 3423 | |||
| 3424 | if (flags & PERF_BUFFER_WRITABLE) | ||
| 3425 | buffer->writable = 1; | ||
| 3426 | |||
| 3427 | atomic_set(&buffer->refcount, 1); | ||
| 3428 | } | ||
| 3429 | |||
| 3430 | #ifndef CONFIG_PERF_USE_VMALLOC | ||
| 3431 | |||
| 3432 | /* | ||
| 3433 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | ||
| 3434 | */ | ||
| 3435 | |||
| 3436 | static struct page * | ||
| 3437 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) | ||
| 3438 | { | ||
| 3439 | if (pgoff > buffer->nr_pages) | ||
| 3440 | return NULL; | ||
| 3441 | |||
| 3442 | if (pgoff == 0) | ||
| 3443 | return virt_to_page(buffer->user_page); | ||
| 3444 | |||
| 3445 | return virt_to_page(buffer->data_pages[pgoff - 1]); | ||
| 3446 | } | ||
| 3447 | |||
| 3448 | static void *perf_mmap_alloc_page(int cpu) | ||
| 3449 | { | ||
| 3450 | struct page *page; | ||
| 3451 | int node; | ||
| 3452 | |||
| 3453 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
| 3454 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
| 3455 | if (!page) | ||
| 3456 | return NULL; | ||
| 3457 | |||
| 3458 | return page_address(page); | ||
| 3459 | } | ||
| 3460 | |||
| 3461 | static struct perf_buffer * | ||
| 3462 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
| 3463 | { | ||
| 3464 | struct perf_buffer *buffer; | ||
| 3465 | unsigned long size; | ||
| 3466 | int i; | ||
| 3467 | |||
| 3468 | size = sizeof(struct perf_buffer); | ||
| 3469 | size += nr_pages * sizeof(void *); | ||
| 3470 | |||
| 3471 | buffer = kzalloc(size, GFP_KERNEL); | ||
| 3472 | if (!buffer) | ||
| 3473 | goto fail; | ||
| 3474 | |||
| 3475 | buffer->user_page = perf_mmap_alloc_page(cpu); | ||
| 3476 | if (!buffer->user_page) | ||
| 3477 | goto fail_user_page; | ||
| 3478 | |||
| 3479 | for (i = 0; i < nr_pages; i++) { | ||
| 3480 | buffer->data_pages[i] = perf_mmap_alloc_page(cpu); | ||
| 3481 | if (!buffer->data_pages[i]) | ||
| 3482 | goto fail_data_pages; | ||
| 3483 | } | ||
| 3484 | |||
| 3485 | buffer->nr_pages = nr_pages; | ||
| 3486 | |||
| 3487 | perf_buffer_init(buffer, watermark, flags); | ||
| 3488 | |||
| 3489 | return buffer; | ||
| 3490 | |||
| 3491 | fail_data_pages: | ||
| 3492 | for (i--; i >= 0; i--) | ||
| 3493 | free_page((unsigned long)buffer->data_pages[i]); | ||
| 3494 | |||
| 3495 | free_page((unsigned long)buffer->user_page); | ||
| 3496 | |||
| 3497 | fail_user_page: | ||
| 3498 | kfree(buffer); | ||
| 3499 | |||
| 3500 | fail: | ||
| 3501 | return NULL; | ||
| 3502 | } | ||
| 3503 | |||
| 3504 | static void perf_mmap_free_page(unsigned long addr) | ||
| 3505 | { | ||
| 3506 | struct page *page = virt_to_page((void *)addr); | ||
| 3507 | |||
| 3508 | page->mapping = NULL; | ||
| 3509 | __free_page(page); | ||
| 3510 | } | ||
| 3511 | |||
| 3512 | static void perf_buffer_free(struct perf_buffer *buffer) | ||
| 3513 | { | ||
| 3514 | int i; | ||
| 3515 | |||
| 3516 | perf_mmap_free_page((unsigned long)buffer->user_page); | ||
| 3517 | for (i = 0; i < buffer->nr_pages; i++) | ||
| 3518 | perf_mmap_free_page((unsigned long)buffer->data_pages[i]); | ||
| 3519 | kfree(buffer); | ||
| 3520 | } | ||
| 3521 | |||
| 3522 | static inline int page_order(struct perf_buffer *buffer) | ||
| 3523 | { | ||
| 3524 | return 0; | ||
| 3525 | } | ||
| 3526 | |||
| 3527 | #else | ||
| 3528 | |||
| 3529 | /* | ||
| 3530 | * Back perf_mmap() with vmalloc memory. | ||
| 3531 | * | ||
| 3532 | * Required for architectures that have d-cache aliasing issues. | ||
| 3533 | */ | ||
| 3534 | |||
| 3535 | static inline int page_order(struct perf_buffer *buffer) | ||
| 3536 | { | ||
| 3537 | return buffer->page_order; | ||
| 3538 | } | ||
| 3539 | |||
| 3540 | static struct page * | ||
| 3541 | perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) | ||
| 3542 | { | ||
| 3543 | if (pgoff > (1UL << page_order(buffer))) | ||
| 3544 | return NULL; | ||
| 3545 | |||
| 3546 | return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); | ||
| 3547 | } | ||
| 3548 | |||
| 3549 | static void perf_mmap_unmark_page(void *addr) | ||
| 3550 | { | ||
| 3551 | struct page *page = vmalloc_to_page(addr); | ||
| 3552 | |||
| 3553 | page->mapping = NULL; | ||
| 3554 | } | ||
| 3555 | |||
| 3556 | static void perf_buffer_free_work(struct work_struct *work) | ||
| 3557 | { | ||
| 3558 | struct perf_buffer *buffer; | ||
| 3559 | void *base; | ||
| 3560 | int i, nr; | ||
| 3561 | |||
| 3562 | buffer = container_of(work, struct perf_buffer, work); | ||
| 3563 | nr = 1 << page_order(buffer); | ||
| 3564 | |||
| 3565 | base = buffer->user_page; | ||
| 3566 | for (i = 0; i < nr + 1; i++) | ||
| 3567 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | ||
| 3568 | |||
| 3569 | vfree(base); | ||
| 3570 | kfree(buffer); | ||
| 3571 | } | ||
| 3572 | |||
| 3573 | static void perf_buffer_free(struct perf_buffer *buffer) | ||
| 3574 | { | ||
| 3575 | schedule_work(&buffer->work); | ||
| 3576 | } | ||
| 3577 | |||
| 3578 | static struct perf_buffer * | ||
| 3579 | perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
| 3580 | { | ||
| 3581 | struct perf_buffer *buffer; | ||
| 3582 | unsigned long size; | ||
| 3583 | void *all_buf; | ||
| 3584 | |||
| 3585 | size = sizeof(struct perf_buffer); | ||
| 3586 | size += sizeof(void *); | ||
| 3587 | |||
| 3588 | buffer = kzalloc(size, GFP_KERNEL); | ||
| 3589 | if (!buffer) | ||
| 3590 | goto fail; | ||
| 3591 | |||
| 3592 | INIT_WORK(&buffer->work, perf_buffer_free_work); | ||
| 3593 | |||
| 3594 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | ||
| 3595 | if (!all_buf) | ||
| 3596 | goto fail_all_buf; | ||
| 3597 | |||
| 3598 | buffer->user_page = all_buf; | ||
| 3599 | buffer->data_pages[0] = all_buf + PAGE_SIZE; | ||
| 3600 | buffer->page_order = ilog2(nr_pages); | ||
| 3601 | buffer->nr_pages = 1; | ||
| 3602 | |||
| 3603 | perf_buffer_init(buffer, watermark, flags); | ||
| 3604 | |||
| 3605 | return buffer; | ||
| 3606 | |||
| 3607 | fail_all_buf: | ||
| 3608 | kfree(buffer); | ||
| 3609 | |||
| 3610 | fail: | ||
| 3611 | return NULL; | ||
| 3612 | } | ||
| 3613 | |||
| 3614 | #endif | ||
| 3615 | |||
| 3616 | static unsigned long perf_data_size(struct perf_buffer *buffer) | ||
| 3617 | { | ||
| 3618 | return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); | ||
| 3619 | } | ||
| 3620 | |||
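The buffer allocation, sizing and page-mapping helpers removed in this block are not deleted outright: the surviving code below keeps calling them under the new struct ring_buffer naming (rb_alloc(), rb_free(), perf_data_size(), perf_mmap_to_page()), so the implementation has presumably been split out into a dedicated ring-buffer file elsewhere in this series. The interface the remaining code relies on, as inferred from its call sites (linkage and exact location are assumptions):

struct ring_buffer;

extern struct ring_buffer *rb_alloc(int nr_pages, long watermark,
                                    int cpu, int flags);
extern void rb_free(struct ring_buffer *rb);
extern unsigned long perf_data_size(struct ring_buffer *rb);
extern struct page *perf_mmap_to_page(struct ring_buffer *rb,
                                      unsigned long pgoff);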
| 3621 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 3420 | static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 3622 | { | 3421 | { |
| 3623 | struct perf_event *event = vma->vm_file->private_data; | 3422 | struct perf_event *event = vma->vm_file->private_data; |
| 3624 | struct perf_buffer *buffer; | 3423 | struct ring_buffer *rb; |
| 3625 | int ret = VM_FAULT_SIGBUS; | 3424 | int ret = VM_FAULT_SIGBUS; |
| 3626 | 3425 | ||
| 3627 | if (vmf->flags & FAULT_FLAG_MKWRITE) { | 3426 | if (vmf->flags & FAULT_FLAG_MKWRITE) { |
| @@ -3631,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 3631 | } | 3430 | } |
| 3632 | 3431 | ||
| 3633 | rcu_read_lock(); | 3432 | rcu_read_lock(); |
| 3634 | buffer = rcu_dereference(event->buffer); | 3433 | rb = rcu_dereference(event->rb); |
| 3635 | if (!buffer) | 3434 | if (!rb) |
| 3636 | goto unlock; | 3435 | goto unlock; |
| 3637 | 3436 | ||
| 3638 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) | 3437 | if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) |
| 3639 | goto unlock; | 3438 | goto unlock; |
| 3640 | 3439 | ||
| 3641 | vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); | 3440 | vmf->page = perf_mmap_to_page(rb, vmf->pgoff); |
| 3642 | if (!vmf->page) | 3441 | if (!vmf->page) |
| 3643 | goto unlock; | 3442 | goto unlock; |
| 3644 | 3443 | ||
| @@ -3653,35 +3452,35 @@ unlock: | |||
| 3653 | return ret; | 3452 | return ret; |
| 3654 | } | 3453 | } |
| 3655 | 3454 | ||
| 3656 | static void perf_buffer_free_rcu(struct rcu_head *rcu_head) | 3455 | static void rb_free_rcu(struct rcu_head *rcu_head) |
| 3657 | { | 3456 | { |
| 3658 | struct perf_buffer *buffer; | 3457 | struct ring_buffer *rb; |
| 3659 | 3458 | ||
| 3660 | buffer = container_of(rcu_head, struct perf_buffer, rcu_head); | 3459 | rb = container_of(rcu_head, struct ring_buffer, rcu_head); |
| 3661 | perf_buffer_free(buffer); | 3460 | rb_free(rb); |
| 3662 | } | 3461 | } |
| 3663 | 3462 | ||
| 3664 | static struct perf_buffer *perf_buffer_get(struct perf_event *event) | 3463 | static struct ring_buffer *ring_buffer_get(struct perf_event *event) |
| 3665 | { | 3464 | { |
| 3666 | struct perf_buffer *buffer; | 3465 | struct ring_buffer *rb; |
| 3667 | 3466 | ||
| 3668 | rcu_read_lock(); | 3467 | rcu_read_lock(); |
| 3669 | buffer = rcu_dereference(event->buffer); | 3468 | rb = rcu_dereference(event->rb); |
| 3670 | if (buffer) { | 3469 | if (rb) { |
| 3671 | if (!atomic_inc_not_zero(&buffer->refcount)) | 3470 | if (!atomic_inc_not_zero(&rb->refcount)) |
| 3672 | buffer = NULL; | 3471 | rb = NULL; |
| 3673 | } | 3472 | } |
| 3674 | rcu_read_unlock(); | 3473 | rcu_read_unlock(); |
| 3675 | 3474 | ||
| 3676 | return buffer; | 3475 | return rb; |
| 3677 | } | 3476 | } |
| 3678 | 3477 | ||
| 3679 | static void perf_buffer_put(struct perf_buffer *buffer) | 3478 | static void ring_buffer_put(struct ring_buffer *rb) |
| 3680 | { | 3479 | { |
| 3681 | if (!atomic_dec_and_test(&buffer->refcount)) | 3480 | if (!atomic_dec_and_test(&rb->refcount)) |
| 3682 | return; | 3481 | return; |
| 3683 | 3482 | ||
| 3684 | call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); | 3483 | call_rcu(&rb->rcu_head, rb_free_rcu); |
| 3685 | } | 3484 | } |
| 3686 | 3485 | ||
| 3687 | static void perf_mmap_open(struct vm_area_struct *vma) | 3486 | static void perf_mmap_open(struct vm_area_struct *vma) |
| @@ -3696,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) | |||
| 3696 | struct perf_event *event = vma->vm_file->private_data; | 3495 | struct perf_event *event = vma->vm_file->private_data; |
| 3697 | 3496 | ||
| 3698 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 3497 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { |
| 3699 | unsigned long size = perf_data_size(event->buffer); | 3498 | unsigned long size = perf_data_size(event->rb); |
| 3700 | struct user_struct *user = event->mmap_user; | 3499 | struct user_struct *user = event->mmap_user; |
| 3701 | struct perf_buffer *buffer = event->buffer; | 3500 | struct ring_buffer *rb = event->rb; |
| 3702 | 3501 | ||
| 3703 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3502 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); |
| 3704 | vma->vm_mm->locked_vm -= event->mmap_locked; | 3503 | vma->vm_mm->locked_vm -= event->mmap_locked; |
| 3705 | rcu_assign_pointer(event->buffer, NULL); | 3504 | rcu_assign_pointer(event->rb, NULL); |
| 3706 | mutex_unlock(&event->mmap_mutex); | 3505 | mutex_unlock(&event->mmap_mutex); |
| 3707 | 3506 | ||
| 3708 | perf_buffer_put(buffer); | 3507 | ring_buffer_put(rb); |
| 3709 | free_uid(user); | 3508 | free_uid(user); |
| 3710 | } | 3509 | } |
| 3711 | } | 3510 | } |
| @@ -3723,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 3723 | unsigned long user_locked, user_lock_limit; | 3522 | unsigned long user_locked, user_lock_limit; |
| 3724 | struct user_struct *user = current_user(); | 3523 | struct user_struct *user = current_user(); |
| 3725 | unsigned long locked, lock_limit; | 3524 | unsigned long locked, lock_limit; |
| 3726 | struct perf_buffer *buffer; | 3525 | struct ring_buffer *rb; |
| 3727 | unsigned long vma_size; | 3526 | unsigned long vma_size; |
| 3728 | unsigned long nr_pages; | 3527 | unsigned long nr_pages; |
| 3729 | long user_extra, extra; | 3528 | long user_extra, extra; |
| @@ -3732,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 3732 | /* | 3531 | /* |
| 3733 | * Don't allow mmap() of inherited per-task counters. This would | 3532 | * Don't allow mmap() of inherited per-task counters. This would |
| 3734 | * create a performance issue due to all children writing to the | 3533 | * create a performance issue due to all children writing to the |
| 3735 | * same buffer. | 3534 | * same rb. |
| 3736 | */ | 3535 | */ |
| 3737 | if (event->cpu == -1 && event->attr.inherit) | 3536 | if (event->cpu == -1 && event->attr.inherit) |
| 3738 | return -EINVAL; | 3537 | return -EINVAL; |
| @@ -3744,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 3744 | nr_pages = (vma_size / PAGE_SIZE) - 1; | 3543 | nr_pages = (vma_size / PAGE_SIZE) - 1; |
| 3745 | 3544 | ||
| 3746 | /* | 3545 | /* |
| 3747 | * If we have buffer pages ensure they're a power-of-two number, so we | 3546 | * If we have rb pages ensure they're a power-of-two number, so we |
| 3748 | * can do bitmasks instead of modulo. | 3547 | * can do bitmasks instead of modulo. |
| 3749 | */ | 3548 | */ |
| 3750 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) | 3549 | if (nr_pages != 0 && !is_power_of_2(nr_pages)) |
| @@ -3758,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 3758 | 3557 | ||
| 3759 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3558 | WARN_ON_ONCE(event->ctx->parent_ctx); |
| 3760 | mutex_lock(&event->mmap_mutex); | 3559 | mutex_lock(&event->mmap_mutex); |
| 3761 | if (event->buffer) { | 3560 | if (event->rb) { |
| 3762 | if (event->buffer->nr_pages == nr_pages) | 3561 | if (event->rb->nr_pages == nr_pages) |
| 3763 | atomic_inc(&event->buffer->refcount); | 3562 | atomic_inc(&event->rb->refcount); |
| 3764 | else | 3563 | else |
| 3765 | ret = -EINVAL; | 3564 | ret = -EINVAL; |
| 3766 | goto unlock; | 3565 | goto unlock; |
| @@ -3790,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 3790 | goto unlock; | 3589 | goto unlock; |
| 3791 | } | 3590 | } |
| 3792 | 3591 | ||
| 3793 | WARN_ON(event->buffer); | 3592 | WARN_ON(event->rb); |
| 3794 | 3593 | ||
| 3795 | if (vma->vm_flags & VM_WRITE) | 3594 | if (vma->vm_flags & VM_WRITE) |
| 3796 | flags |= PERF_BUFFER_WRITABLE; | 3595 | flags |= RING_BUFFER_WRITABLE; |
| 3596 | |||
| 3597 | rb = rb_alloc(nr_pages, | ||
| 3598 | event->attr.watermark ? event->attr.wakeup_watermark : 0, | ||
| 3599 | event->cpu, flags); | ||
| 3797 | 3600 | ||
| 3798 | buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, | 3601 | if (!rb) { |
| 3799 | event->cpu, flags); | ||
| 3800 | if (!buffer) { | ||
| 3801 | ret = -ENOMEM; | 3602 | ret = -ENOMEM; |
| 3802 | goto unlock; | 3603 | goto unlock; |
| 3803 | } | 3604 | } |
| 3804 | rcu_assign_pointer(event->buffer, buffer); | 3605 | rcu_assign_pointer(event->rb, rb); |
| 3805 | 3606 | ||
| 3806 | atomic_long_add(user_extra, &user->locked_vm); | 3607 | atomic_long_add(user_extra, &user->locked_vm); |
| 3807 | event->mmap_locked = extra; | 3608 | event->mmap_locked = extra; |
| @@ -3900,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) | |||
| 3900 | } | 3701 | } |
| 3901 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | 3702 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); |
| 3902 | 3703 | ||
| 3903 | /* | ||
| 3904 | * Output | ||
| 3905 | */ | ||
| 3906 | static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, | ||
| 3907 | unsigned long offset, unsigned long head) | ||
| 3908 | { | ||
| 3909 | unsigned long mask; | ||
| 3910 | |||
| 3911 | if (!buffer->writable) | ||
| 3912 | return true; | ||
| 3913 | |||
| 3914 | mask = perf_data_size(buffer) - 1; | ||
| 3915 | |||
| 3916 | offset = (offset - tail) & mask; | ||
| 3917 | head = (head - tail) & mask; | ||
| 3918 | |||
| 3919 | if ((int)(head - offset) < 0) | ||
| 3920 | return false; | ||
| 3921 | |||
| 3922 | return true; | ||
| 3923 | } | ||
| 3924 | |||
| 3925 | static void perf_output_wakeup(struct perf_output_handle *handle) | ||
| 3926 | { | ||
| 3927 | atomic_set(&handle->buffer->poll, POLL_IN); | ||
| 3928 | |||
| 3929 | if (handle->nmi) { | ||
| 3930 | handle->event->pending_wakeup = 1; | ||
| 3931 | irq_work_queue(&handle->event->pending); | ||
| 3932 | } else | ||
| 3933 | perf_event_wakeup(handle->event); | ||
| 3934 | } | ||
| 3935 | |||
| 3936 | /* | ||
| 3937 | * We need to ensure a later event_id doesn't publish a head when a former | ||
| 3938 | * event isn't done writing. However since we need to deal with NMIs we | ||
| 3939 | * cannot fully serialize things. | ||
| 3940 | * | ||
| 3941 | * We only publish the head (and generate a wakeup) when the outer-most | ||
| 3942 | * event completes. | ||
| 3943 | */ | ||
| 3944 | static void perf_output_get_handle(struct perf_output_handle *handle) | ||
| 3945 | { | ||
| 3946 | struct perf_buffer *buffer = handle->buffer; | ||
| 3947 | |||
| 3948 | preempt_disable(); | ||
| 3949 | local_inc(&buffer->nest); | ||
| 3950 | handle->wakeup = local_read(&buffer->wakeup); | ||
| 3951 | } | ||
| 3952 | |||
| 3953 | static void perf_output_put_handle(struct perf_output_handle *handle) | ||
| 3954 | { | ||
| 3955 | struct perf_buffer *buffer = handle->buffer; | ||
| 3956 | unsigned long head; | ||
| 3957 | |||
| 3958 | again: | ||
| 3959 | head = local_read(&buffer->head); | ||
| 3960 | |||
| 3961 | /* | ||
| 3962 | * IRQ/NMI can happen here, which means we can miss a head update. | ||
| 3963 | */ | ||
| 3964 | |||
| 3965 | if (!local_dec_and_test(&buffer->nest)) | ||
| 3966 | goto out; | ||
| 3967 | |||
| 3968 | /* | ||
| 3969 | * Publish the known good head. Rely on the full barrier implied | ||
| 3970 | * by atomic_dec_and_test() order the buffer->head read and this | ||
| 3971 | * write. | ||
| 3972 | */ | ||
| 3973 | buffer->user_page->data_head = head; | ||
| 3974 | |||
| 3975 | /* | ||
| 3976 | * Now check if we missed an update, rely on the (compiler) | ||
| 3977 | * barrier in atomic_dec_and_test() to re-read buffer->head. | ||
| 3978 | */ | ||
| 3979 | if (unlikely(head != local_read(&buffer->head))) { | ||
| 3980 | local_inc(&buffer->nest); | ||
| 3981 | goto again; | ||
| 3982 | } | ||
| 3983 | |||
| 3984 | if (handle->wakeup != local_read(&buffer->wakeup)) | ||
| 3985 | perf_output_wakeup(handle); | ||
| 3986 | |||
| 3987 | out: | ||
| 3988 | preempt_enable(); | ||
| 3989 | } | ||
| 3990 | |||
| 3991 | __always_inline void perf_output_copy(struct perf_output_handle *handle, | ||
| 3992 | const void *buf, unsigned int len) | ||
| 3993 | { | ||
| 3994 | do { | ||
| 3995 | unsigned long size = min_t(unsigned long, handle->size, len); | ||
| 3996 | |||
| 3997 | memcpy(handle->addr, buf, size); | ||
| 3998 | |||
| 3999 | len -= size; | ||
| 4000 | handle->addr += size; | ||
| 4001 | buf += size; | ||
| 4002 | handle->size -= size; | ||
| 4003 | if (!handle->size) { | ||
| 4004 | struct perf_buffer *buffer = handle->buffer; | ||
| 4005 | |||
| 4006 | handle->page++; | ||
| 4007 | handle->page &= buffer->nr_pages - 1; | ||
| 4008 | handle->addr = buffer->data_pages[handle->page]; | ||
| 4009 | handle->size = PAGE_SIZE << page_order(buffer); | ||
| 4010 | } | ||
| 4011 | } while (len); | ||
| 4012 | } | ||
| 4013 | |||
| 4014 | static void __perf_event_header__init_id(struct perf_event_header *header, | 3704 | static void __perf_event_header__init_id(struct perf_event_header *header, |
| 4015 | struct perf_sample_data *data, | 3705 | struct perf_sample_data *data, |
| 4016 | struct perf_event *event) | 3706 | struct perf_event *event) |
| @@ -4041,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header, | |||
| 4041 | } | 3731 | } |
| 4042 | } | 3732 | } |
| 4043 | 3733 | ||
| 4044 | static void perf_event_header__init_id(struct perf_event_header *header, | 3734 | void perf_event_header__init_id(struct perf_event_header *header, |
| 4045 | struct perf_sample_data *data, | 3735 | struct perf_sample_data *data, |
| 4046 | struct perf_event *event) | 3736 | struct perf_event *event) |
| 4047 | { | 3737 | { |
| 4048 | if (event->attr.sample_id_all) | 3738 | if (event->attr.sample_id_all) |
| 4049 | __perf_event_header__init_id(header, data, event); | 3739 | __perf_event_header__init_id(header, data, event); |
| @@ -4070,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, | |||
| 4070 | perf_output_put(handle, data->cpu_entry); | 3760 | perf_output_put(handle, data->cpu_entry); |
| 4071 | } | 3761 | } |
| 4072 | 3762 | ||
| 4073 | static void perf_event__output_id_sample(struct perf_event *event, | 3763 | void perf_event__output_id_sample(struct perf_event *event, |
| 4074 | struct perf_output_handle *handle, | 3764 | struct perf_output_handle *handle, |
| 4075 | struct perf_sample_data *sample) | 3765 | struct perf_sample_data *sample) |
| 4076 | { | 3766 | { |
| 4077 | if (event->attr.sample_id_all) | 3767 | if (event->attr.sample_id_all) |
| 4078 | __perf_event__output_id_sample(handle, sample); | 3768 | __perf_event__output_id_sample(handle, sample); |
| 4079 | } | 3769 | } |
| 4080 | 3770 | ||
| 4081 | int perf_output_begin(struct perf_output_handle *handle, | ||
| 4082 | struct perf_event *event, unsigned int size, | ||
| 4083 | int nmi, int sample) | ||
| 4084 | { | ||
| 4085 | struct perf_buffer *buffer; | ||
| 4086 | unsigned long tail, offset, head; | ||
| 4087 | int have_lost; | ||
| 4088 | struct perf_sample_data sample_data; | ||
| 4089 | struct { | ||
| 4090 | struct perf_event_header header; | ||
| 4091 | u64 id; | ||
| 4092 | u64 lost; | ||
| 4093 | } lost_event; | ||
| 4094 | |||
| 4095 | rcu_read_lock(); | ||
| 4096 | /* | ||
| 4097 | * For inherited events we send all the output towards the parent. | ||
| 4098 | */ | ||
| 4099 | if (event->parent) | ||
| 4100 | event = event->parent; | ||
| 4101 | |||
| 4102 | buffer = rcu_dereference(event->buffer); | ||
| 4103 | if (!buffer) | ||
| 4104 | goto out; | ||
| 4105 | |||
| 4106 | handle->buffer = buffer; | ||
| 4107 | handle->event = event; | ||
| 4108 | handle->nmi = nmi; | ||
| 4109 | handle->sample = sample; | ||
| 4110 | |||
| 4111 | if (!buffer->nr_pages) | ||
| 4112 | goto out; | ||
| 4113 | |||
| 4114 | have_lost = local_read(&buffer->lost); | ||
| 4115 | if (have_lost) { | ||
| 4116 | lost_event.header.size = sizeof(lost_event); | ||
| 4117 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
| 4118 | event); | ||
| 4119 | size += lost_event.header.size; | ||
| 4120 | } | ||
| 4121 | |||
| 4122 | perf_output_get_handle(handle); | ||
| 4123 | |||
| 4124 | do { | ||
| 4125 | /* | ||
| 4126 | * Userspace could choose to issue a mb() before updating the | ||
| 4127 | * tail pointer. So that all reads will be completed before the | ||
| 4128 | * write is issued. | ||
| 4129 | */ | ||
| 4130 | tail = ACCESS_ONCE(buffer->user_page->data_tail); | ||
| 4131 | smp_rmb(); | ||
| 4132 | offset = head = local_read(&buffer->head); | ||
| 4133 | head += size; | ||
| 4134 | if (unlikely(!perf_output_space(buffer, tail, offset, head))) | ||
| 4135 | goto fail; | ||
| 4136 | } while (local_cmpxchg(&buffer->head, offset, head) != offset); | ||
| 4137 | |||
| 4138 | if (head - local_read(&buffer->wakeup) > buffer->watermark) | ||
| 4139 | local_add(buffer->watermark, &buffer->wakeup); | ||
| 4140 | |||
| 4141 | handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); | ||
| 4142 | handle->page &= buffer->nr_pages - 1; | ||
| 4143 | handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); | ||
| 4144 | handle->addr = buffer->data_pages[handle->page]; | ||
| 4145 | handle->addr += handle->size; | ||
| 4146 | handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; | ||
| 4147 | |||
| 4148 | if (have_lost) { | ||
| 4149 | lost_event.header.type = PERF_RECORD_LOST; | ||
| 4150 | lost_event.header.misc = 0; | ||
| 4151 | lost_event.id = event->id; | ||
| 4152 | lost_event.lost = local_xchg(&buffer->lost, 0); | ||
| 4153 | |||
| 4154 | perf_output_put(handle, lost_event); | ||
| 4155 | perf_event__output_id_sample(event, handle, &sample_data); | ||
| 4156 | } | ||
| 4157 | |||
| 4158 | return 0; | ||
| 4159 | |||
| 4160 | fail: | ||
| 4161 | local_inc(&buffer->lost); | ||
| 4162 | perf_output_put_handle(handle); | ||
| 4163 | out: | ||
| 4164 | rcu_read_unlock(); | ||
| 4165 | |||
| 4166 | return -ENOSPC; | ||
| 4167 | } | ||
| 4168 | |||
| 4169 | void perf_output_end(struct perf_output_handle *handle) | ||
| 4170 | { | ||
| 4171 | struct perf_event *event = handle->event; | ||
| 4172 | struct perf_buffer *buffer = handle->buffer; | ||
| 4173 | |||
| 4174 | int wakeup_events = event->attr.wakeup_events; | ||
| 4175 | |||
| 4176 | if (handle->sample && wakeup_events) { | ||
| 4177 | int events = local_inc_return(&buffer->events); | ||
| 4178 | if (events >= wakeup_events) { | ||
| 4179 | local_sub(wakeup_events, &buffer->events); | ||
| 4180 | local_inc(&buffer->wakeup); | ||
| 4181 | } | ||
| 4182 | } | ||
| 4183 | |||
| 4184 | perf_output_put_handle(handle); | ||
| 4185 | rcu_read_unlock(); | ||
| 4186 | } | ||
| 4187 | |||
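perf_output_begin()/perf_output_end() and the copy helper move out with the rest of the ring-buffer code; every caller updated below drops the trailing nmi and sample arguments, and the perf_output_copy() call sites switch to __output_copy(). The prototypes assumed by those call sites (a sketch, inferred from usage rather than shown in this excerpt):

extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_event *event, unsigned int size);
extern void perf_output_end(struct perf_output_handle *handle);
extern void __output_copy(struct perf_output_handle *handle,
                          const void *buf, unsigned int len);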
| 4188 | static void perf_output_read_one(struct perf_output_handle *handle, | 3771 | static void perf_output_read_one(struct perf_output_handle *handle, |
| 4189 | struct perf_event *event, | 3772 | struct perf_event *event, |
| 4190 | u64 enabled, u64 running) | 3773 | u64 enabled, u64 running) |
| @@ -4205,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, | |||
| 4205 | if (read_format & PERF_FORMAT_ID) | 3788 | if (read_format & PERF_FORMAT_ID) |
| 4206 | values[n++] = primary_event_id(event); | 3789 | values[n++] = primary_event_id(event); |
| 4207 | 3790 | ||
| 4208 | perf_output_copy(handle, values, n * sizeof(u64)); | 3791 | __output_copy(handle, values, n * sizeof(u64)); |
| 4209 | } | 3792 | } |
| 4210 | 3793 | ||
| 4211 | /* | 3794 | /* |
| @@ -4235,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 4235 | if (read_format & PERF_FORMAT_ID) | 3818 | if (read_format & PERF_FORMAT_ID) |
| 4236 | values[n++] = primary_event_id(leader); | 3819 | values[n++] = primary_event_id(leader); |
| 4237 | 3820 | ||
| 4238 | perf_output_copy(handle, values, n * sizeof(u64)); | 3821 | __output_copy(handle, values, n * sizeof(u64)); |
| 4239 | 3822 | ||
| 4240 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { | 3823 | list_for_each_entry(sub, &leader->sibling_list, group_entry) { |
| 4241 | n = 0; | 3824 | n = 0; |
| @@ -4247,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 4247 | if (read_format & PERF_FORMAT_ID) | 3830 | if (read_format & PERF_FORMAT_ID) |
| 4248 | values[n++] = primary_event_id(sub); | 3831 | values[n++] = primary_event_id(sub); |
| 4249 | 3832 | ||
| 4250 | perf_output_copy(handle, values, n * sizeof(u64)); | 3833 | __output_copy(handle, values, n * sizeof(u64)); |
| 4251 | } | 3834 | } |
| 4252 | } | 3835 | } |
| 4253 | 3836 | ||
| @@ -4257,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, | |||
| 4257 | static void perf_output_read(struct perf_output_handle *handle, | 3840 | static void perf_output_read(struct perf_output_handle *handle, |
| 4258 | struct perf_event *event) | 3841 | struct perf_event *event) |
| 4259 | { | 3842 | { |
| 4260 | u64 enabled = 0, running = 0, now, ctx_time; | 3843 | u64 enabled = 0, running = 0; |
| 4261 | u64 read_format = event->attr.read_format; | 3844 | u64 read_format = event->attr.read_format; |
| 4262 | 3845 | ||
| 4263 | /* | 3846 | /* |
| @@ -4269,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle, | |||
| 4269 | * because of locking issue as we are called in | 3852 | * because of locking issue as we are called in |
| 4270 | * NMI context | 3853 | * NMI context |
| 4271 | */ | 3854 | */ |
| 4272 | if (read_format & PERF_FORMAT_TOTAL_TIMES) { | 3855 | if (read_format & PERF_FORMAT_TOTAL_TIMES) |
| 4273 | now = perf_clock(); | 3856 | calc_timer_values(event, &enabled, &running); |
| 4274 | ctx_time = event->shadow_ctx_time + now; | ||
| 4275 | enabled = ctx_time - event->tstamp_enabled; | ||
| 4276 | running = ctx_time - event->tstamp_running; | ||
| 4277 | } | ||
| 4278 | 3857 | ||
| 4279 | if (event->attr.read_format & PERF_FORMAT_GROUP) | 3858 | if (event->attr.read_format & PERF_FORMAT_GROUP) |
| 4280 | perf_output_read_group(handle, event, enabled, running); | 3859 | perf_output_read_group(handle, event, enabled, running); |
| @@ -4327,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4327 | 3906 | ||
| 4328 | size *= sizeof(u64); | 3907 | size *= sizeof(u64); |
| 4329 | 3908 | ||
| 4330 | perf_output_copy(handle, data->callchain, size); | 3909 | __output_copy(handle, data->callchain, size); |
| 4331 | } else { | 3910 | } else { |
| 4332 | u64 nr = 0; | 3911 | u64 nr = 0; |
| 4333 | perf_output_put(handle, nr); | 3912 | perf_output_put(handle, nr); |
| @@ -4337,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4337 | if (sample_type & PERF_SAMPLE_RAW) { | 3916 | if (sample_type & PERF_SAMPLE_RAW) { |
| 4338 | if (data->raw) { | 3917 | if (data->raw) { |
| 4339 | perf_output_put(handle, data->raw->size); | 3918 | perf_output_put(handle, data->raw->size); |
| 4340 | perf_output_copy(handle, data->raw->data, | 3919 | __output_copy(handle, data->raw->data, |
| 4341 | data->raw->size); | 3920 | data->raw->size); |
| 4342 | } else { | 3921 | } else { |
| 4343 | struct { | 3922 | struct { |
| 4344 | u32 size; | 3923 | u32 size; |
| @@ -4350,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4350 | perf_output_put(handle, raw); | 3929 | perf_output_put(handle, raw); |
| 4351 | } | 3930 | } |
| 4352 | } | 3931 | } |
| 3932 | |||
| 3933 | if (!event->attr.watermark) { | ||
| 3934 | int wakeup_events = event->attr.wakeup_events; | ||
| 3935 | |||
| 3936 | if (wakeup_events) { | ||
| 3937 | struct ring_buffer *rb = handle->rb; | ||
| 3938 | int events = local_inc_return(&rb->events); | ||
| 3939 | |||
| 3940 | if (events >= wakeup_events) { | ||
| 3941 | local_sub(wakeup_events, &rb->events); | ||
| 3942 | local_inc(&rb->wakeup); | ||
| 3943 | } | ||
| 3944 | } | ||
| 3945 | } | ||
| 4353 | } | 3946 | } |
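The wakeup_events accounting that used to live in the removed perf_output_end() now runs here, at the end of perf_output_sample(), and only when the event is not using a byte watermark. The user-visible knobs are unchanged; a sampling event that wants a reader wakeup every N records still sets, for example (illustrative values):

        struct perf_event_attr attr = {
                .type           = PERF_TYPE_SOFTWARE,
                .config         = PERF_COUNT_SW_TASK_CLOCK,
                .sample_period  = 100000,
                .watermark      = 0,    /* use wakeup_events, not wakeup_watermark */
                .wakeup_events  = 16,   /* wake up poll()/read() every 16 samples */
        };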
| 4354 | 3947 | ||
| 4355 | void perf_prepare_sample(struct perf_event_header *header, | 3948 | void perf_prepare_sample(struct perf_event_header *header, |
| @@ -4394,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4394 | } | 3987 | } |
| 4395 | } | 3988 | } |
| 4396 | 3989 | ||
| 4397 | static void perf_event_output(struct perf_event *event, int nmi, | 3990 | static void perf_event_output(struct perf_event *event, |
| 4398 | struct perf_sample_data *data, | 3991 | struct perf_sample_data *data, |
| 4399 | struct pt_regs *regs) | 3992 | struct pt_regs *regs) |
| 4400 | { | 3993 | { |
| @@ -4406,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi, | |||
| 4406 | 3999 | ||
| 4407 | perf_prepare_sample(&header, data, event, regs); | 4000 | perf_prepare_sample(&header, data, event, regs); |
| 4408 | 4001 | ||
| 4409 | if (perf_output_begin(&handle, event, header.size, nmi, 1)) | 4002 | if (perf_output_begin(&handle, event, header.size)) |
| 4410 | goto exit; | 4003 | goto exit; |
| 4411 | 4004 | ||
| 4412 | perf_output_sample(&handle, &header, data, event); | 4005 | perf_output_sample(&handle, &header, data, event); |
| @@ -4446,7 +4039,7 @@ perf_event_read_event(struct perf_event *event, | |||
| 4446 | int ret; | 4039 | int ret; |
| 4447 | 4040 | ||
| 4448 | perf_event_header__init_id(&read_event.header, &sample, event); | 4041 | perf_event_header__init_id(&read_event.header, &sample, event); |
| 4449 | ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); | 4042 | ret = perf_output_begin(&handle, event, read_event.header.size); |
| 4450 | if (ret) | 4043 | if (ret) |
| 4451 | return; | 4044 | return; |
| 4452 | 4045 | ||
| @@ -4489,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event, | |||
| 4489 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); | 4082 | perf_event_header__init_id(&task_event->event_id.header, &sample, event); |
| 4490 | 4083 | ||
| 4491 | ret = perf_output_begin(&handle, event, | 4084 | ret = perf_output_begin(&handle, event, |
| 4492 | task_event->event_id.header.size, 0, 0); | 4085 | task_event->event_id.header.size); |
| 4493 | if (ret) | 4086 | if (ret) |
| 4494 | goto out; | 4087 | goto out; |
| 4495 | 4088 | ||
| @@ -4626,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event, | |||
| 4626 | 4219 | ||
| 4627 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); | 4220 | perf_event_header__init_id(&comm_event->event_id.header, &sample, event); |
| 4628 | ret = perf_output_begin(&handle, event, | 4221 | ret = perf_output_begin(&handle, event, |
| 4629 | comm_event->event_id.header.size, 0, 0); | 4222 | comm_event->event_id.header.size); |
| 4630 | 4223 | ||
| 4631 | if (ret) | 4224 | if (ret) |
| 4632 | goto out; | 4225 | goto out; |
| @@ -4635,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event, | |||
| 4635 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); | 4228 | comm_event->event_id.tid = perf_event_tid(event, comm_event->task); |
| 4636 | 4229 | ||
| 4637 | perf_output_put(&handle, comm_event->event_id); | 4230 | perf_output_put(&handle, comm_event->event_id); |
| 4638 | perf_output_copy(&handle, comm_event->comm, | 4231 | __output_copy(&handle, comm_event->comm, |
| 4639 | comm_event->comm_size); | 4232 | comm_event->comm_size); |
| 4640 | 4233 | ||
| 4641 | perf_event__output_id_sample(event, &handle, &sample); | 4234 | perf_event__output_id_sample(event, &handle, &sample); |
| @@ -4773,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 4773 | 4366 | ||
| 4774 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); | 4367 | perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); |
| 4775 | ret = perf_output_begin(&handle, event, | 4368 | ret = perf_output_begin(&handle, event, |
| 4776 | mmap_event->event_id.header.size, 0, 0); | 4369 | mmap_event->event_id.header.size); |
| 4777 | if (ret) | 4370 | if (ret) |
| 4778 | goto out; | 4371 | goto out; |
| 4779 | 4372 | ||
| @@ -4781,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event, | |||
| 4781 | mmap_event->event_id.tid = perf_event_tid(event, current); | 4374 | mmap_event->event_id.tid = perf_event_tid(event, current); |
| 4782 | 4375 | ||
| 4783 | perf_output_put(&handle, mmap_event->event_id); | 4376 | perf_output_put(&handle, mmap_event->event_id); |
| 4784 | perf_output_copy(&handle, mmap_event->file_name, | 4377 | __output_copy(&handle, mmap_event->file_name, |
| 4785 | mmap_event->file_size); | 4378 | mmap_event->file_size); |
| 4786 | 4379 | ||
| 4787 | perf_event__output_id_sample(event, &handle, &sample); | 4380 | perf_event__output_id_sample(event, &handle, &sample); |
| @@ -4837,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) | |||
| 4837 | 4430 | ||
| 4838 | if (file) { | 4431 | if (file) { |
| 4839 | /* | 4432 | /* |
| 4840 | * d_path works from the end of the buffer backwards, so we | 4433 | * d_path works from the end of the rb backwards, so we |
| 4841 | * need to add enough zero bytes after the string to handle | 4434 | * need to add enough zero bytes after the string to handle |
| 4842 | * the 64bit alignment we do later. | 4435 | * the 64bit alignment we do later. |
| 4843 | */ | 4436 | */ |
| @@ -4968,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 4968 | perf_event_header__init_id(&throttle_event.header, &sample, event); | 4561 | perf_event_header__init_id(&throttle_event.header, &sample, event); |
| 4969 | 4562 | ||
| 4970 | ret = perf_output_begin(&handle, event, | 4563 | ret = perf_output_begin(&handle, event, |
| 4971 | throttle_event.header.size, 1, 0); | 4564 | throttle_event.header.size); |
| 4972 | if (ret) | 4565 | if (ret) |
| 4973 | return; | 4566 | return; |
| 4974 | 4567 | ||
| @@ -4981,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) | |||
| 4981 | * Generic event overflow handling, sampling. | 4574 | * Generic event overflow handling, sampling. |
| 4982 | */ | 4575 | */ |
| 4983 | 4576 | ||
| 4984 | static int __perf_event_overflow(struct perf_event *event, int nmi, | 4577 | static int __perf_event_overflow(struct perf_event *event, |
| 4985 | int throttle, struct perf_sample_data *data, | 4578 | int throttle, struct perf_sample_data *data, |
| 4986 | struct pt_regs *regs) | 4579 | struct pt_regs *regs) |
| 4987 | { | 4580 | { |
| @@ -5024,26 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
| 5024 | if (events && atomic_dec_and_test(&event->event_limit)) { | 4617 | if (events && atomic_dec_and_test(&event->event_limit)) { |
| 5025 | ret = 1; | 4618 | ret = 1; |
| 5026 | event->pending_kill = POLL_HUP; | 4619 | event->pending_kill = POLL_HUP; |
| 5027 | if (nmi) { | 4620 | event->pending_disable = 1; |
| 5028 | event->pending_disable = 1; | 4621 | irq_work_queue(&event->pending); |
| 5029 | irq_work_queue(&event->pending); | ||
| 5030 | } else | ||
| 5031 | perf_event_disable(event); | ||
| 5032 | } | 4622 | } |
| 5033 | 4623 | ||
| 5034 | if (event->overflow_handler) | 4624 | if (event->overflow_handler) |
| 5035 | event->overflow_handler(event, nmi, data, regs); | 4625 | event->overflow_handler(event, data, regs); |
| 5036 | else | 4626 | else |
| 5037 | perf_event_output(event, nmi, data, regs); | 4627 | perf_event_output(event, data, regs); |
| 4628 | |||
| 4629 | if (event->fasync && event->pending_kill) { | ||
| 4630 | event->pending_wakeup = 1; | ||
| 4631 | irq_work_queue(&event->pending); | ||
| 4632 | } | ||
| 5038 | 4633 | ||
| 5039 | return ret; | 4634 | return ret; |
| 5040 | } | 4635 | } |
| 5041 | 4636 | ||
| 5042 | int perf_event_overflow(struct perf_event *event, int nmi, | 4637 | int perf_event_overflow(struct perf_event *event, |
| 5043 | struct perf_sample_data *data, | 4638 | struct perf_sample_data *data, |
| 5044 | struct pt_regs *regs) | 4639 | struct pt_regs *regs) |
| 5045 | { | 4640 | { |
| 5046 | return __perf_event_overflow(event, nmi, 1, data, regs); | 4641 | return __perf_event_overflow(event, 1, data, regs); |
| 5047 | } | 4642 | } |
| 5048 | 4643 | ||
| 5049 | /* | 4644 | /* |
| @@ -5092,7 +4687,7 @@ again: | |||
| 5092 | } | 4687 | } |
| 5093 | 4688 | ||
| 5094 | static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | 4689 | static void perf_swevent_overflow(struct perf_event *event, u64 overflow, |
| 5095 | int nmi, struct perf_sample_data *data, | 4690 | struct perf_sample_data *data, |
| 5096 | struct pt_regs *regs) | 4691 | struct pt_regs *regs) |
| 5097 | { | 4692 | { |
| 5098 | struct hw_perf_event *hwc = &event->hw; | 4693 | struct hw_perf_event *hwc = &event->hw; |
| @@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
| 5106 | return; | 4701 | return; |
| 5107 | 4702 | ||
| 5108 | for (; overflow; overflow--) { | 4703 | for (; overflow; overflow--) { |
| 5109 | if (__perf_event_overflow(event, nmi, throttle, | 4704 | if (__perf_event_overflow(event, throttle, |
| 5110 | data, regs)) { | 4705 | data, regs)) { |
| 5111 | /* | 4706 | /* |
| 5112 | * We inhibit the overflow from happening when | 4707 | * We inhibit the overflow from happening when |
| @@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, | |||
| 5119 | } | 4714 | } |
| 5120 | 4715 | ||
| 5121 | static void perf_swevent_event(struct perf_event *event, u64 nr, | 4716 | static void perf_swevent_event(struct perf_event *event, u64 nr, |
| 5122 | int nmi, struct perf_sample_data *data, | 4717 | struct perf_sample_data *data, |
| 5123 | struct pt_regs *regs) | 4718 | struct pt_regs *regs) |
| 5124 | { | 4719 | { |
| 5125 | struct hw_perf_event *hwc = &event->hw; | 4720 | struct hw_perf_event *hwc = &event->hw; |
| @@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, | |||
| 5133 | return; | 4728 | return; |
| 5134 | 4729 | ||
| 5135 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) | 4730 | if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) |
| 5136 | return perf_swevent_overflow(event, 1, nmi, data, regs); | 4731 | return perf_swevent_overflow(event, 1, data, regs); |
| 5137 | 4732 | ||
| 5138 | if (local64_add_negative(nr, &hwc->period_left)) | 4733 | if (local64_add_negative(nr, &hwc->period_left)) |
| 5139 | return; | 4734 | return; |
| 5140 | 4735 | ||
| 5141 | perf_swevent_overflow(event, 0, nmi, data, regs); | 4736 | perf_swevent_overflow(event, 0, data, regs); |
| 5142 | } | 4737 | } |
| 5143 | 4738 | ||
| 5144 | static int perf_exclude_event(struct perf_event *event, | 4739 | static int perf_exclude_event(struct perf_event *event, |
| @@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) | |||
| 5226 | } | 4821 | } |
| 5227 | 4822 | ||
| 5228 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | 4823 | static void do_perf_sw_event(enum perf_type_id type, u32 event_id, |
| 5229 | u64 nr, int nmi, | 4824 | u64 nr, |
| 5230 | struct perf_sample_data *data, | 4825 | struct perf_sample_data *data, |
| 5231 | struct pt_regs *regs) | 4826 | struct pt_regs *regs) |
| 5232 | { | 4827 | { |
| @@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, | |||
| 5242 | 4837 | ||
| 5243 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 4838 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
| 5244 | if (perf_swevent_match(event, type, event_id, data, regs)) | 4839 | if (perf_swevent_match(event, type, event_id, data, regs)) |
| 5245 | perf_swevent_event(event, nr, nmi, data, regs); | 4840 | perf_swevent_event(event, nr, data, regs); |
| 5246 | } | 4841 | } |
| 5247 | end: | 4842 | end: |
| 5248 | rcu_read_unlock(); | 4843 | rcu_read_unlock(); |
| @@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx) | |||
| 5263 | put_recursion_context(swhash->recursion, rctx); | 4858 | put_recursion_context(swhash->recursion, rctx); |
| 5264 | } | 4859 | } |
| 5265 | 4860 | ||
| 5266 | void __perf_sw_event(u32 event_id, u64 nr, int nmi, | 4861 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
| 5267 | struct pt_regs *regs, u64 addr) | ||
| 5268 | { | 4862 | { |
| 5269 | struct perf_sample_data data; | 4863 | struct perf_sample_data data; |
| 5270 | int rctx; | 4864 | int rctx; |
| @@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, | |||
| 5276 | 4870 | ||
| 5277 | perf_sample_data_init(&data, addr); | 4871 | perf_sample_data_init(&data, addr); |
| 5278 | 4872 | ||
| 5279 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); | 4873 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
| 5280 | 4874 | ||
| 5281 | perf_swevent_put_recursion_context(rctx); | 4875 | perf_swevent_put_recursion_context(rctx); |
| 5282 | preempt_enable_notrace(); | 4876 | preempt_enable_notrace(); |
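Editor's note: the hunks above drop the old "nmi" flag from the software-event path; __perf_sw_event() and its helpers no longer distinguish NMI from normal context because disable/wakeup work is always deferred through irq_work. As a hedged sketch (the perf_sw_event() wrapper lives in include/linux/perf_event.h and is not part of this hunk), a call site such as a page-fault handler goes roughly from:

    /* before: explicit nmi argument (0 = not in NMI context) */
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

    /* after this change: one argument shorter */
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);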
| @@ -5331,14 +4925,6 @@ swevent_hlist_deref(struct swevent_htable *swhash) | |||
| 5331 | lockdep_is_held(&swhash->hlist_mutex)); | 4925 | lockdep_is_held(&swhash->hlist_mutex)); |
| 5332 | } | 4926 | } |
| 5333 | 4927 | ||
| 5334 | static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) | ||
| 5335 | { | ||
| 5336 | struct swevent_hlist *hlist; | ||
| 5337 | |||
| 5338 | hlist = container_of(rcu_head, struct swevent_hlist, rcu_head); | ||
| 5339 | kfree(hlist); | ||
| 5340 | } | ||
| 5341 | |||
| 5342 | static void swevent_hlist_release(struct swevent_htable *swhash) | 4928 | static void swevent_hlist_release(struct swevent_htable *swhash) |
| 5343 | { | 4929 | { |
| 5344 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); | 4930 | struct swevent_hlist *hlist = swevent_hlist_deref(swhash); |
| @@ -5347,7 +4933,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) | |||
| 5347 | return; | 4933 | return; |
| 5348 | 4934 | ||
| 5349 | rcu_assign_pointer(swhash->swevent_hlist, NULL); | 4935 | rcu_assign_pointer(swhash->swevent_hlist, NULL); |
| 5350 | call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); | 4936 | kfree_rcu(hlist, rcu_head); |
| 5351 | } | 4937 | } |
| 5352 | 4938 | ||
| 5353 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) | 4939 | static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) |
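Editor's note: the removed swevent_hlist_release_rcu() existed only to kfree() the hlist after a grace period, which is exactly what kfree_rcu() expresses. A minimal, generic sketch of the pattern (struct foo and release_foo() are illustrative, not from this patch):

    struct foo {
            int             data;
            struct rcu_head rcu_head;
    };

    static void release_foo(struct foo *p)
    {
            /*
             * The old style needed a dedicated callback:
             *     call_rcu(&p->rcu_head, foo_free_rcu);  /* callback only kfree()s */
             * kfree_rcu() takes the name of the rcu_head member and frees the
             * enclosing object once a grace period has elapsed, no callback needed.
             */
            kfree_rcu(p, rcu_head);
    }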
| @@ -5429,7 +5015,7 @@ fail: | |||
| 5429 | return err; | 5015 | return err; |
| 5430 | } | 5016 | } |
| 5431 | 5017 | ||
| 5432 | atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 5018 | struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
| 5433 | 5019 | ||
| 5434 | static void sw_perf_event_destroy(struct perf_event *event) | 5020 | static void sw_perf_event_destroy(struct perf_event *event) |
| 5435 | { | 5021 | { |
| @@ -5532,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, | |||
| 5532 | 5118 | ||
| 5533 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { | 5119 | hlist_for_each_entry_rcu(event, node, head, hlist_entry) { |
| 5534 | if (perf_tp_event_match(event, &data, regs)) | 5120 | if (perf_tp_event_match(event, &data, regs)) |
| 5535 | perf_swevent_event(event, count, 1, &data, regs); | 5121 | perf_swevent_event(event, count, &data, regs); |
| 5536 | } | 5122 | } |
| 5537 | 5123 | ||
| 5538 | perf_swevent_put_recursion_context(rctx); | 5124 | perf_swevent_put_recursion_context(rctx); |
| @@ -5625,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data) | |||
| 5625 | perf_sample_data_init(&sample, bp->attr.bp_addr); | 5211 | perf_sample_data_init(&sample, bp->attr.bp_addr); |
| 5626 | 5212 | ||
| 5627 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) | 5213 | if (!bp->hw.state && !perf_exclude_event(bp, regs)) |
| 5628 | perf_swevent_event(bp, 1, 1, &sample, regs); | 5214 | perf_swevent_event(bp, 1, &sample, regs); |
| 5629 | } | 5215 | } |
| 5630 | #endif | 5216 | #endif |
| 5631 | 5217 | ||
| @@ -5654,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
| 5654 | 5240 | ||
| 5655 | if (regs && !perf_exclude_event(event, regs)) { | 5241 | if (regs && !perf_exclude_event(event, regs)) { |
| 5656 | if (!(event->attr.exclude_idle && current->pid == 0)) | 5242 | if (!(event->attr.exclude_idle && current->pid == 0)) |
| 5657 | if (perf_event_overflow(event, 0, &data, regs)) | 5243 | if (perf_event_overflow(event, &data, regs)) |
| 5658 | ret = HRTIMER_NORESTART; | 5244 | ret = HRTIMER_NORESTART; |
| 5659 | } | 5245 | } |
| 5660 | 5246 | ||
| @@ -5994,6 +5580,7 @@ free_dev: | |||
| 5994 | } | 5580 | } |
| 5995 | 5581 | ||
| 5996 | static struct lock_class_key cpuctx_mutex; | 5582 | static struct lock_class_key cpuctx_mutex; |
| 5583 | static struct lock_class_key cpuctx_lock; | ||
| 5997 | 5584 | ||
| 5998 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | 5585 | int perf_pmu_register(struct pmu *pmu, char *name, int type) |
| 5999 | { | 5586 | { |
| @@ -6044,6 +5631,7 @@ skip_type: | |||
| 6044 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 5631 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
| 6045 | __perf_event_init_context(&cpuctx->ctx); | 5632 | __perf_event_init_context(&cpuctx->ctx); |
| 6046 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); | 5633 | lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); |
| 5634 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | ||
| 6047 | cpuctx->ctx.type = cpu_context; | 5635 | cpuctx->ctx.type = cpu_context; |
| 6048 | cpuctx->ctx.pmu = pmu; | 5636 | cpuctx->ctx.pmu = pmu; |
| 6049 | cpuctx->jiffies_interval = 1; | 5637 | cpuctx->jiffies_interval = 1; |
| @@ -6158,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6158 | struct task_struct *task, | 5746 | struct task_struct *task, |
| 6159 | struct perf_event *group_leader, | 5747 | struct perf_event *group_leader, |
| 6160 | struct perf_event *parent_event, | 5748 | struct perf_event *parent_event, |
| 6161 | perf_overflow_handler_t overflow_handler) | 5749 | perf_overflow_handler_t overflow_handler, |
| 5750 | void *context) | ||
| 6162 | { | 5751 | { |
| 6163 | struct pmu *pmu; | 5752 | struct pmu *pmu; |
| 6164 | struct perf_event *event; | 5753 | struct perf_event *event; |
| @@ -6216,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, | |||
| 6216 | #endif | 5805 | #endif |
| 6217 | } | 5806 | } |
| 6218 | 5807 | ||
| 6219 | if (!overflow_handler && parent_event) | 5808 | if (!overflow_handler && parent_event) { |
| 6220 | overflow_handler = parent_event->overflow_handler; | 5809 | overflow_handler = parent_event->overflow_handler; |
| 5810 | context = parent_event->overflow_handler_context; | ||
| 5811 | } | ||
| 6221 | 5812 | ||
| 6222 | event->overflow_handler = overflow_handler; | 5813 | event->overflow_handler = overflow_handler; |
| 5814 | event->overflow_handler_context = context; | ||
| 6223 | 5815 | ||
| 6224 | if (attr->disabled) | 5816 | if (attr->disabled) |
| 6225 | event->state = PERF_EVENT_STATE_OFF; | 5817 | event->state = PERF_EVENT_STATE_OFF; |
| @@ -6334,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 6334 | if (ret) | 5926 | if (ret) |
| 6335 | return -EFAULT; | 5927 | return -EFAULT; |
| 6336 | 5928 | ||
| 6337 | /* | ||
| 6338 | * If the type exists, the corresponding creation will verify | ||
| 6339 | * the attr->config. | ||
| 6340 | */ | ||
| 6341 | if (attr->type >= PERF_TYPE_MAX) | ||
| 6342 | return -EINVAL; | ||
| 6343 | |||
| 6344 | if (attr->__reserved_1) | 5929 | if (attr->__reserved_1) |
| 6345 | return -EINVAL; | 5930 | return -EINVAL; |
| 6346 | 5931 | ||
| @@ -6362,7 +5947,7 @@ err_size: | |||
| 6362 | static int | 5947 | static int |
| 6363 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | 5948 | perf_event_set_output(struct perf_event *event, struct perf_event *output_event) |
| 6364 | { | 5949 | { |
| 6365 | struct perf_buffer *buffer = NULL, *old_buffer = NULL; | 5950 | struct ring_buffer *rb = NULL, *old_rb = NULL; |
| 6366 | int ret = -EINVAL; | 5951 | int ret = -EINVAL; |
| 6367 | 5952 | ||
| 6368 | if (!output_event) | 5953 | if (!output_event) |
| @@ -6379,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) | |||
| 6379 | goto out; | 5964 | goto out; |
| 6380 | 5965 | ||
| 6381 | /* | 5966 | /* |
| 6382 | * If its not a per-cpu buffer, it must be the same task. | 5967 | * If its not a per-cpu rb, it must be the same task. |
| 6383 | */ | 5968 | */ |
| 6384 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) | 5969 | if (output_event->cpu == -1 && output_event->ctx != event->ctx) |
| 6385 | goto out; | 5970 | goto out; |
| @@ -6391,20 +5976,20 @@ set: | |||
| 6391 | goto unlock; | 5976 | goto unlock; |
| 6392 | 5977 | ||
| 6393 | if (output_event) { | 5978 | if (output_event) { |
| 6394 | /* get the buffer we want to redirect to */ | 5979 | /* get the rb we want to redirect to */ |
| 6395 | buffer = perf_buffer_get(output_event); | 5980 | rb = ring_buffer_get(output_event); |
| 6396 | if (!buffer) | 5981 | if (!rb) |
| 6397 | goto unlock; | 5982 | goto unlock; |
| 6398 | } | 5983 | } |
| 6399 | 5984 | ||
| 6400 | old_buffer = event->buffer; | 5985 | old_rb = event->rb; |
| 6401 | rcu_assign_pointer(event->buffer, buffer); | 5986 | rcu_assign_pointer(event->rb, rb); |
| 6402 | ret = 0; | 5987 | ret = 0; |
| 6403 | unlock: | 5988 | unlock: |
| 6404 | mutex_unlock(&event->mmap_mutex); | 5989 | mutex_unlock(&event->mmap_mutex); |
| 6405 | 5990 | ||
| 6406 | if (old_buffer) | 5991 | if (old_rb) |
| 6407 | perf_buffer_put(old_buffer); | 5992 | ring_buffer_put(old_rb); |
| 6408 | out: | 5993 | out: |
| 6409 | return ret; | 5994 | return ret; |
| 6410 | } | 5995 | } |
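Editor's note: perf_event_set_output() is the kernel side of the PERF_EVENT_IOC_SET_OUTPUT ioctl; it points one event's event->rb at another event's ring buffer, subject to the same-task / same-CPU checks in the hunk above. A hedged user-space sketch of that ioctl (error handling omitted; the chosen events and mmap size are arbitrary for illustration):

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr;
            long psz = sysconf(_SC_PAGESIZE);

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
            attr.sample_period = 1;
            attr.sample_type = PERF_SAMPLE_TID | PERF_SAMPLE_TIME;

            int fd1 = perf_event_open(&attr, 0, -1, -1, 0);  /* owns the buffer */
            attr.config = PERF_COUNT_SW_PAGE_FAULTS;
            int fd2 = perf_event_open(&attr, 0, -1, -1, 0);  /* same task (pid 0) */

            /* 1 metadata page + 8 data pages (data pages must be a power of two) */
            mmap(NULL, 9 * psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);

            /* route fd2's records into fd1's ring buffer */
            ioctl(fd2, PERF_EVENT_IOC_SET_OUTPUT, fd1);
            return 0;
    }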
| @@ -6486,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6486 | } | 6071 | } |
| 6487 | } | 6072 | } |
| 6488 | 6073 | ||
| 6489 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); | 6074 | event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, |
| 6075 | NULL, NULL); | ||
| 6490 | if (IS_ERR(event)) { | 6076 | if (IS_ERR(event)) { |
| 6491 | err = PTR_ERR(event); | 6077 | err = PTR_ERR(event); |
| 6492 | goto err_task; | 6078 | goto err_task; |
| @@ -6671,7 +6257,8 @@ err_fd: | |||
| 6671 | struct perf_event * | 6257 | struct perf_event * |
| 6672 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | 6258 | perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, |
| 6673 | struct task_struct *task, | 6259 | struct task_struct *task, |
| 6674 | perf_overflow_handler_t overflow_handler) | 6260 | perf_overflow_handler_t overflow_handler, |
| 6261 | void *context) | ||
| 6675 | { | 6262 | { |
| 6676 | struct perf_event_context *ctx; | 6263 | struct perf_event_context *ctx; |
| 6677 | struct perf_event *event; | 6264 | struct perf_event *event; |
| @@ -6681,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
| 6681 | * Get the target context (task or percpu): | 6268 | * Get the target context (task or percpu): |
| 6682 | */ | 6269 | */ |
| 6683 | 6270 | ||
| 6684 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); | 6271 | event = perf_event_alloc(attr, cpu, task, NULL, NULL, |
| 6272 | overflow_handler, context); | ||
| 6685 | if (IS_ERR(event)) { | 6273 | if (IS_ERR(event)) { |
| 6686 | err = PTR_ERR(event); | 6274 | err = PTR_ERR(event); |
| 6687 | goto err; | 6275 | goto err; |
| @@ -6788,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
| 6788 | * our context. | 6376 | * our context. |
| 6789 | */ | 6377 | */ |
| 6790 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); | 6378 | child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); |
| 6791 | task_ctx_sched_out(child_ctx, EVENT_ALL); | ||
| 6792 | 6379 | ||
| 6793 | /* | 6380 | /* |
| 6794 | * Take the context lock here so that if find_get_context is | 6381 | * Take the context lock here so that if find_get_context is |
| @@ -6796,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) | |||
| 6796 | * incremented the context's refcount before we do put_ctx below. | 6383 | * incremented the context's refcount before we do put_ctx below. |
| 6797 | */ | 6384 | */ |
| 6798 | raw_spin_lock(&child_ctx->lock); | 6385 | raw_spin_lock(&child_ctx->lock); |
| 6386 | task_ctx_sched_out(child_ctx); | ||
| 6799 | child->perf_event_ctxp[ctxn] = NULL; | 6387 | child->perf_event_ctxp[ctxn] = NULL; |
| 6800 | /* | 6388 | /* |
| 6801 | * If this context is a clone; unclone it so it can't get | 6389 | * If this context is a clone; unclone it so it can't get |
| @@ -6965,7 +6553,7 @@ inherit_event(struct perf_event *parent_event, | |||
| 6965 | parent_event->cpu, | 6553 | parent_event->cpu, |
| 6966 | child, | 6554 | child, |
| 6967 | group_leader, parent_event, | 6555 | group_leader, parent_event, |
| 6968 | NULL); | 6556 | NULL, NULL); |
| 6969 | if (IS_ERR(child_event)) | 6557 | if (IS_ERR(child_event)) |
| 6970 | return child_event; | 6558 | return child_event; |
| 6971 | get_ctx(child_ctx); | 6559 | get_ctx(child_ctx); |
| @@ -6992,6 +6580,8 @@ inherit_event(struct perf_event *parent_event, | |||
| 6992 | 6580 | ||
| 6993 | child_event->ctx = child_ctx; | 6581 | child_event->ctx = child_ctx; |
| 6994 | child_event->overflow_handler = parent_event->overflow_handler; | 6582 | child_event->overflow_handler = parent_event->overflow_handler; |
| 6583 | child_event->overflow_handler_context | ||
| 6584 | = parent_event->overflow_handler_context; | ||
| 6995 | 6585 | ||
| 6996 | /* | 6586 | /* |
| 6997 | * Precalculate sample_data sizes | 6587 | * Precalculate sample_data sizes |
| @@ -7410,26 +7000,12 @@ static int __perf_cgroup_move(void *info) | |||
| 7410 | return 0; | 7000 | return 0; |
| 7411 | } | 7001 | } |
| 7412 | 7002 | ||
| 7413 | static void perf_cgroup_move(struct task_struct *task) | 7003 | static void |
| 7004 | perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) | ||
| 7414 | { | 7005 | { |
| 7415 | task_function_call(task, __perf_cgroup_move, task); | 7006 | task_function_call(task, __perf_cgroup_move, task); |
| 7416 | } | 7007 | } |
| 7417 | 7008 | ||
| 7418 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
| 7419 | struct cgroup *old_cgrp, struct task_struct *task, | ||
| 7420 | bool threadgroup) | ||
| 7421 | { | ||
| 7422 | perf_cgroup_move(task); | ||
| 7423 | if (threadgroup) { | ||
| 7424 | struct task_struct *c; | ||
| 7425 | rcu_read_lock(); | ||
| 7426 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
| 7427 | perf_cgroup_move(c); | ||
| 7428 | } | ||
| 7429 | rcu_read_unlock(); | ||
| 7430 | } | ||
| 7431 | } | ||
| 7432 | |||
| 7433 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | 7009 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, |
| 7434 | struct cgroup *old_cgrp, struct task_struct *task) | 7010 | struct cgroup *old_cgrp, struct task_struct *task) |
| 7435 | { | 7011 | { |
| @@ -7441,15 +7017,15 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 7441 | if (!(task->flags & PF_EXITING)) | 7017 | if (!(task->flags & PF_EXITING)) |
| 7442 | return; | 7018 | return; |
| 7443 | 7019 | ||
| 7444 | perf_cgroup_move(task); | 7020 | perf_cgroup_attach_task(cgrp, task); |
| 7445 | } | 7021 | } |
| 7446 | 7022 | ||
| 7447 | struct cgroup_subsys perf_subsys = { | 7023 | struct cgroup_subsys perf_subsys = { |
| 7448 | .name = "perf_event", | 7024 | .name = "perf_event", |
| 7449 | .subsys_id = perf_subsys_id, | 7025 | .subsys_id = perf_subsys_id, |
| 7450 | .create = perf_cgroup_create, | 7026 | .create = perf_cgroup_create, |
| 7451 | .destroy = perf_cgroup_destroy, | 7027 | .destroy = perf_cgroup_destroy, |
| 7452 | .exit = perf_cgroup_exit, | 7028 | .exit = perf_cgroup_exit, |
| 7453 | .attach = perf_cgroup_attach, | 7029 | .attach_task = perf_cgroup_attach_task, |
| 7454 | }; | 7030 | }; |
| 7455 | #endif /* CONFIG_CGROUP_PERF */ | 7031 | #endif /* CONFIG_CGROUP_PERF */ |
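Editor's note: taken together, the kernel/events/core.c hunks above remove the nmi argument from the whole output/overflow chain and rename the perf_buffer machinery to ring_buffer. A hedged in-kernel sketch (not buildable on its own; "event" and the record layout are placeholders) of what a record emitter looks like after the perf_output_begin() signature change, modeled on the call sites above:

    struct perf_output_handle handle;
    struct {
            struct perf_event_header header;
            u64 value;
    } rec;

    rec.header.type = PERF_RECORD_SAMPLE;
    rec.header.misc = 0;
    rec.header.size = sizeof(rec);
    rec.value       = 42;

    if (perf_output_begin(&handle, event, rec.header.size))
            return;                      /* no rb, or record lost for lack of space */
    perf_output_put(&handle, rec);       /* copies via __output_copy() */
    perf_output_end(&handle);            /* publishes data_head, may wake readers */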
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 086adf25a55e..b7971d6f38bf 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c | |||
| @@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp) | |||
| 431 | struct perf_event * | 431 | struct perf_event * |
| 432 | register_user_hw_breakpoint(struct perf_event_attr *attr, | 432 | register_user_hw_breakpoint(struct perf_event_attr *attr, |
| 433 | perf_overflow_handler_t triggered, | 433 | perf_overflow_handler_t triggered, |
| 434 | void *context, | ||
| 434 | struct task_struct *tsk) | 435 | struct task_struct *tsk) |
| 435 | { | 436 | { |
| 436 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered); | 437 | return perf_event_create_kernel_counter(attr, -1, tsk, triggered, |
| 438 | context); | ||
| 437 | } | 439 | } |
| 438 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); | 440 | EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); |
| 439 | 441 | ||
| @@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); | |||
| 502 | */ | 504 | */ |
| 503 | struct perf_event * __percpu * | 505 | struct perf_event * __percpu * |
| 504 | register_wide_hw_breakpoint(struct perf_event_attr *attr, | 506 | register_wide_hw_breakpoint(struct perf_event_attr *attr, |
| 505 | perf_overflow_handler_t triggered) | 507 | perf_overflow_handler_t triggered, |
| 508 | void *context) | ||
| 506 | { | 509 | { |
| 507 | struct perf_event * __percpu *cpu_events, **pevent, *bp; | 510 | struct perf_event * __percpu *cpu_events, **pevent, *bp; |
| 508 | long err; | 511 | long err; |
| @@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, | |||
| 515 | get_online_cpus(); | 518 | get_online_cpus(); |
| 516 | for_each_online_cpu(cpu) { | 519 | for_each_online_cpu(cpu) { |
| 517 | pevent = per_cpu_ptr(cpu_events, cpu); | 520 | pevent = per_cpu_ptr(cpu_events, cpu); |
| 518 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); | 521 | bp = perf_event_create_kernel_counter(attr, cpu, NULL, |
| 522 | triggered, context); | ||
| 519 | 523 | ||
| 520 | *pevent = bp; | 524 | *pevent = bp; |
| 521 | 525 | ||
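Editor's note: both breakpoint registration helpers now thread a caller-supplied context pointer down to perf_event_alloc(), where it lands in event->overflow_handler_context. A hedged, non-buildable sketch of a kernel-side watchpoint after this change (my_state, my_handler and the watched symbol are illustrative; compare samples/hw_breakpoint/; error handling omitted):

    struct my_state { int hits; };
    static struct my_state state;
    static struct perf_event * __percpu *wp;

    static void my_handler(struct perf_event *bp,
                           struct perf_sample_data *data,
                           struct pt_regs *regs)
    {
            struct my_state *st = bp->overflow_handler_context;  /* the @context below */

            st->hits++;
    }

    static int __init watch_init(void)
    {
            struct perf_event_attr attr;

            hw_breakpoint_init(&attr);
            attr.bp_addr = kallsyms_lookup_name("jiffies");  /* illustrative target */
            attr.bp_len  = HW_BREAKPOINT_LEN_4;
            attr.bp_type = HW_BREAKPOINT_W;

            wp = register_wide_hw_breakpoint(&attr, my_handler, &state);
            return 0;
    }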
diff --git a/kernel/events/internal.h b/kernel/events/internal.h new file mode 100644 index 000000000000..09097dd8116c --- /dev/null +++ b/kernel/events/internal.h | |||
| @@ -0,0 +1,96 @@ | |||
| 1 | #ifndef _KERNEL_EVENTS_INTERNAL_H | ||
| 2 | #define _KERNEL_EVENTS_INTERNAL_H | ||
| 3 | |||
| 4 | #define RING_BUFFER_WRITABLE 0x01 | ||
| 5 | |||
| 6 | struct ring_buffer { | ||
| 7 | atomic_t refcount; | ||
| 8 | struct rcu_head rcu_head; | ||
| 9 | #ifdef CONFIG_PERF_USE_VMALLOC | ||
| 10 | struct work_struct work; | ||
| 11 | int page_order; /* allocation order */ | ||
| 12 | #endif | ||
| 13 | int nr_pages; /* nr of data pages */ | ||
| 14 | int writable; /* are we writable */ | ||
| 15 | |||
| 16 | atomic_t poll; /* POLL_ for wakeups */ | ||
| 17 | |||
| 18 | local_t head; /* write position */ | ||
| 19 | local_t nest; /* nested writers */ | ||
| 20 | local_t events; /* event limit */ | ||
| 21 | local_t wakeup; /* wakeup stamp */ | ||
| 22 | local_t lost; /* nr records lost */ | ||
| 23 | |||
| 24 | long watermark; /* wakeup watermark */ | ||
| 25 | |||
| 26 | struct perf_event_mmap_page *user_page; | ||
| 27 | void *data_pages[0]; | ||
| 28 | }; | ||
| 29 | |||
| 30 | extern void rb_free(struct ring_buffer *rb); | ||
| 31 | extern struct ring_buffer * | ||
| 32 | rb_alloc(int nr_pages, long watermark, int cpu, int flags); | ||
| 33 | extern void perf_event_wakeup(struct perf_event *event); | ||
| 34 | |||
| 35 | extern void | ||
| 36 | perf_event_header__init_id(struct perf_event_header *header, | ||
| 37 | struct perf_sample_data *data, | ||
| 38 | struct perf_event *event); | ||
| 39 | extern void | ||
| 40 | perf_event__output_id_sample(struct perf_event *event, | ||
| 41 | struct perf_output_handle *handle, | ||
| 42 | struct perf_sample_data *sample); | ||
| 43 | |||
| 44 | extern struct page * | ||
| 45 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff); | ||
| 46 | |||
| 47 | #ifdef CONFIG_PERF_USE_VMALLOC | ||
| 48 | /* | ||
| 49 | * Back perf_mmap() with vmalloc memory. | ||
| 50 | * | ||
| 51 | * Required for architectures that have d-cache aliasing issues. | ||
| 52 | */ | ||
| 53 | |||
| 54 | static inline int page_order(struct ring_buffer *rb) | ||
| 55 | { | ||
| 56 | return rb->page_order; | ||
| 57 | } | ||
| 58 | |||
| 59 | #else | ||
| 60 | |||
| 61 | static inline int page_order(struct ring_buffer *rb) | ||
| 62 | { | ||
| 63 | return 0; | ||
| 64 | } | ||
| 65 | #endif | ||
| 66 | |||
| 67 | static unsigned long perf_data_size(struct ring_buffer *rb) | ||
| 68 | { | ||
| 69 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | ||
| 70 | } | ||
| 71 | |||
| 72 | static inline void | ||
| 73 | __output_copy(struct perf_output_handle *handle, | ||
| 74 | const void *buf, unsigned int len) | ||
| 75 | { | ||
| 76 | do { | ||
| 77 | unsigned long size = min_t(unsigned long, handle->size, len); | ||
| 78 | |||
| 79 | memcpy(handle->addr, buf, size); | ||
| 80 | |||
| 81 | len -= size; | ||
| 82 | handle->addr += size; | ||
| 83 | buf += size; | ||
| 84 | handle->size -= size; | ||
| 85 | if (!handle->size) { | ||
| 86 | struct ring_buffer *rb = handle->rb; | ||
| 87 | |||
| 88 | handle->page++; | ||
| 89 | handle->page &= rb->nr_pages - 1; | ||
| 90 | handle->addr = rb->data_pages[handle->page]; | ||
| 91 | handle->size = PAGE_SIZE << page_order(rb); | ||
| 92 | } | ||
| 93 | } while (len); | ||
| 94 | } | ||
| 95 | |||
| 96 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | ||
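Editor's note: __output_copy() above is the only non-trivial piece of internal.h; it writes a record across the power-of-two array of data pages, advancing handle->page/addr/size as each page fills. A self-contained user-space model of that bookkeeping (page size and count are shrunk for the demo; the kernel version works on real pages through a live output handle):

    #include <stdio.h>
    #include <string.h>

    #define NR_PAGES  4              /* must be a power of two */
    #define PAGE_SZ   16             /* tiny "pages" for the demo */

    static char data_pages[NR_PAGES][PAGE_SZ];

    struct handle {
            int    page;             /* current data page */
            char  *addr;             /* write cursor inside that page */
            size_t size;             /* bytes left in that page */
    };

    /* Mirror of __output_copy(): copy len bytes, hopping to the next
     * (power-of-two masked) page whenever the current one fills up. */
    static void output_copy(struct handle *h, const void *buf, size_t len)
    {
            do {
                    size_t chunk = h->size < len ? h->size : len;

                    memcpy(h->addr, buf, chunk);
                    len     -= chunk;
                    buf      = (const char *)buf + chunk;
                    h->addr += chunk;
                    h->size -= chunk;

                    if (!h->size) {                  /* page exhausted: wrap */
                            h->page = (h->page + 1) & (NR_PAGES - 1);
                            h->addr = data_pages[h->page];
                            h->size = PAGE_SZ;
                    }
            } while (len);
    }

    int main(void)
    {
            struct handle h = { 0, data_pages[0], PAGE_SZ };
            const char msg[] = "spans-more-than-one-sixteen-byte-page";

            output_copy(&h, msg, sizeof(msg));
            printf("ended on page %d with %zu bytes free\n", h.page, h.size);
            return 0;
    }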
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c new file mode 100644 index 000000000000..a2a29205cc0f --- /dev/null +++ b/kernel/events/ring_buffer.c | |||
| @@ -0,0 +1,380 @@ | |||
| 1 | /* | ||
| 2 | * Performance events ring-buffer code: | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
| 5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | ||
| 6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
| 7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
| 8 | * | ||
| 9 | * For licensing details see kernel-base/COPYING | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/perf_event.h> | ||
| 13 | #include <linux/vmalloc.h> | ||
| 14 | #include <linux/slab.h> | ||
| 15 | |||
| 16 | #include "internal.h" | ||
| 17 | |||
| 18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
| 19 | unsigned long offset, unsigned long head) | ||
| 20 | { | ||
| 21 | unsigned long mask; | ||
| 22 | |||
| 23 | if (!rb->writable) | ||
| 24 | return true; | ||
| 25 | |||
| 26 | mask = perf_data_size(rb) - 1; | ||
| 27 | |||
| 28 | offset = (offset - tail) & mask; | ||
| 29 | head = (head - tail) & mask; | ||
| 30 | |||
| 31 | if ((int)(head - offset) < 0) | ||
| 32 | return false; | ||
| 33 | |||
| 34 | return true; | ||
| 35 | } | ||
| 36 | |||
| 37 | static void perf_output_wakeup(struct perf_output_handle *handle) | ||
| 38 | { | ||
| 39 | atomic_set(&handle->rb->poll, POLL_IN); | ||
| 40 | |||
| 41 | handle->event->pending_wakeup = 1; | ||
| 42 | irq_work_queue(&handle->event->pending); | ||
| 43 | } | ||
| 44 | |||
| 45 | /* | ||
| 46 | * We need to ensure a later event_id doesn't publish a head when a former | ||
| 47 | * event isn't done writing. However since we need to deal with NMIs we | ||
| 48 | * cannot fully serialize things. | ||
| 49 | * | ||
| 50 | * We only publish the head (and generate a wakeup) when the outer-most | ||
| 51 | * event completes. | ||
| 52 | */ | ||
| 53 | static void perf_output_get_handle(struct perf_output_handle *handle) | ||
| 54 | { | ||
| 55 | struct ring_buffer *rb = handle->rb; | ||
| 56 | |||
| 57 | preempt_disable(); | ||
| 58 | local_inc(&rb->nest); | ||
| 59 | handle->wakeup = local_read(&rb->wakeup); | ||
| 60 | } | ||
| 61 | |||
| 62 | static void perf_output_put_handle(struct perf_output_handle *handle) | ||
| 63 | { | ||
| 64 | struct ring_buffer *rb = handle->rb; | ||
| 65 | unsigned long head; | ||
| 66 | |||
| 67 | again: | ||
| 68 | head = local_read(&rb->head); | ||
| 69 | |||
| 70 | /* | ||
| 71 | * IRQ/NMI can happen here, which means we can miss a head update. | ||
| 72 | */ | ||
| 73 | |||
| 74 | if (!local_dec_and_test(&rb->nest)) | ||
| 75 | goto out; | ||
| 76 | |||
| 77 | /* | ||
| 78 | * Publish the known good head. Rely on the full barrier implied | ||
| 79 | * by atomic_dec_and_test() order the rb->head read and this | ||
| 80 | * write. | ||
| 81 | */ | ||
| 82 | rb->user_page->data_head = head; | ||
| 83 | |||
| 84 | /* | ||
| 85 | * Now check if we missed an update, rely on the (compiler) | ||
| 86 | * barrier in atomic_dec_and_test() to re-read rb->head. | ||
| 87 | */ | ||
| 88 | if (unlikely(head != local_read(&rb->head))) { | ||
| 89 | local_inc(&rb->nest); | ||
| 90 | goto again; | ||
| 91 | } | ||
| 92 | |||
| 93 | if (handle->wakeup != local_read(&rb->wakeup)) | ||
| 94 | perf_output_wakeup(handle); | ||
| 95 | |||
| 96 | out: | ||
| 97 | preempt_enable(); | ||
| 98 | } | ||
| 99 | |||
| 100 | int perf_output_begin(struct perf_output_handle *handle, | ||
| 101 | struct perf_event *event, unsigned int size) | ||
| 102 | { | ||
| 103 | struct ring_buffer *rb; | ||
| 104 | unsigned long tail, offset, head; | ||
| 105 | int have_lost; | ||
| 106 | struct perf_sample_data sample_data; | ||
| 107 | struct { | ||
| 108 | struct perf_event_header header; | ||
| 109 | u64 id; | ||
| 110 | u64 lost; | ||
| 111 | } lost_event; | ||
| 112 | |||
| 113 | rcu_read_lock(); | ||
| 114 | /* | ||
| 115 | * For inherited events we send all the output towards the parent. | ||
| 116 | */ | ||
| 117 | if (event->parent) | ||
| 118 | event = event->parent; | ||
| 119 | |||
| 120 | rb = rcu_dereference(event->rb); | ||
| 121 | if (!rb) | ||
| 122 | goto out; | ||
| 123 | |||
| 124 | handle->rb = rb; | ||
| 125 | handle->event = event; | ||
| 126 | |||
| 127 | if (!rb->nr_pages) | ||
| 128 | goto out; | ||
| 129 | |||
| 130 | have_lost = local_read(&rb->lost); | ||
| 131 | if (have_lost) { | ||
| 132 | lost_event.header.size = sizeof(lost_event); | ||
| 133 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
| 134 | event); | ||
| 135 | size += lost_event.header.size; | ||
| 136 | } | ||
| 137 | |||
| 138 | perf_output_get_handle(handle); | ||
| 139 | |||
| 140 | do { | ||
| 141 | /* | ||
| 142 | * Userspace could choose to issue a mb() before updating the | ||
| 143 | * tail pointer. So that all reads will be completed before the | ||
| 144 | * write is issued. | ||
| 145 | */ | ||
| 146 | tail = ACCESS_ONCE(rb->user_page->data_tail); | ||
| 147 | smp_rmb(); | ||
| 148 | offset = head = local_read(&rb->head); | ||
| 149 | head += size; | ||
| 150 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | ||
| 151 | goto fail; | ||
| 152 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | ||
| 153 | |||
| 154 | if (head - local_read(&rb->wakeup) > rb->watermark) | ||
| 155 | local_add(rb->watermark, &rb->wakeup); | ||
| 156 | |||
| 157 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | ||
| 158 | handle->page &= rb->nr_pages - 1; | ||
| 159 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
| 160 | handle->addr = rb->data_pages[handle->page]; | ||
| 161 | handle->addr += handle->size; | ||
| 162 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
| 163 | |||
| 164 | if (have_lost) { | ||
| 165 | lost_event.header.type = PERF_RECORD_LOST; | ||
| 166 | lost_event.header.misc = 0; | ||
| 167 | lost_event.id = event->id; | ||
| 168 | lost_event.lost = local_xchg(&rb->lost, 0); | ||
| 169 | |||
| 170 | perf_output_put(handle, lost_event); | ||
| 171 | perf_event__output_id_sample(event, handle, &sample_data); | ||
| 172 | } | ||
| 173 | |||
| 174 | return 0; | ||
| 175 | |||
| 176 | fail: | ||
| 177 | local_inc(&rb->lost); | ||
| 178 | perf_output_put_handle(handle); | ||
| 179 | out: | ||
| 180 | rcu_read_unlock(); | ||
| 181 | |||
| 182 | return -ENOSPC; | ||
| 183 | } | ||
| 184 | |||
| 185 | void perf_output_copy(struct perf_output_handle *handle, | ||
| 186 | const void *buf, unsigned int len) | ||
| 187 | { | ||
| 188 | __output_copy(handle, buf, len); | ||
| 189 | } | ||
| 190 | |||
| 191 | void perf_output_end(struct perf_output_handle *handle) | ||
| 192 | { | ||
| 193 | perf_output_put_handle(handle); | ||
| 194 | rcu_read_unlock(); | ||
| 195 | } | ||
| 196 | |||
| 197 | static void | ||
| 198 | ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | ||
| 199 | { | ||
| 200 | long max_size = perf_data_size(rb); | ||
| 201 | |||
| 202 | if (watermark) | ||
| 203 | rb->watermark = min(max_size, watermark); | ||
| 204 | |||
| 205 | if (!rb->watermark) | ||
| 206 | rb->watermark = max_size / 2; | ||
| 207 | |||
| 208 | if (flags & RING_BUFFER_WRITABLE) | ||
| 209 | rb->writable = 1; | ||
| 210 | |||
| 211 | atomic_set(&rb->refcount, 1); | ||
| 212 | } | ||
| 213 | |||
| 214 | #ifndef CONFIG_PERF_USE_VMALLOC | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | ||
| 218 | */ | ||
| 219 | |||
| 220 | struct page * | ||
| 221 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
| 222 | { | ||
| 223 | if (pgoff > rb->nr_pages) | ||
| 224 | return NULL; | ||
| 225 | |||
| 226 | if (pgoff == 0) | ||
| 227 | return virt_to_page(rb->user_page); | ||
| 228 | |||
| 229 | return virt_to_page(rb->data_pages[pgoff - 1]); | ||
| 230 | } | ||
| 231 | |||
| 232 | static void *perf_mmap_alloc_page(int cpu) | ||
| 233 | { | ||
| 234 | struct page *page; | ||
| 235 | int node; | ||
| 236 | |||
| 237 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
| 238 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
| 239 | if (!page) | ||
| 240 | return NULL; | ||
| 241 | |||
| 242 | return page_address(page); | ||
| 243 | } | ||
| 244 | |||
| 245 | struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
| 246 | { | ||
| 247 | struct ring_buffer *rb; | ||
| 248 | unsigned long size; | ||
| 249 | int i; | ||
| 250 | |||
| 251 | size = sizeof(struct ring_buffer); | ||
| 252 | size += nr_pages * sizeof(void *); | ||
| 253 | |||
| 254 | rb = kzalloc(size, GFP_KERNEL); | ||
| 255 | if (!rb) | ||
| 256 | goto fail; | ||
| 257 | |||
| 258 | rb->user_page = perf_mmap_alloc_page(cpu); | ||
| 259 | if (!rb->user_page) | ||
| 260 | goto fail_user_page; | ||
| 261 | |||
| 262 | for (i = 0; i < nr_pages; i++) { | ||
| 263 | rb->data_pages[i] = perf_mmap_alloc_page(cpu); | ||
| 264 | if (!rb->data_pages[i]) | ||
| 265 | goto fail_data_pages; | ||
| 266 | } | ||
| 267 | |||
| 268 | rb->nr_pages = nr_pages; | ||
| 269 | |||
| 270 | ring_buffer_init(rb, watermark, flags); | ||
| 271 | |||
| 272 | return rb; | ||
| 273 | |||
| 274 | fail_data_pages: | ||
| 275 | for (i--; i >= 0; i--) | ||
| 276 | free_page((unsigned long)rb->data_pages[i]); | ||
| 277 | |||
| 278 | free_page((unsigned long)rb->user_page); | ||
| 279 | |||
| 280 | fail_user_page: | ||
| 281 | kfree(rb); | ||
| 282 | |||
| 283 | fail: | ||
| 284 | return NULL; | ||
| 285 | } | ||
| 286 | |||
| 287 | static void perf_mmap_free_page(unsigned long addr) | ||
| 288 | { | ||
| 289 | struct page *page = virt_to_page((void *)addr); | ||
| 290 | |||
| 291 | page->mapping = NULL; | ||
| 292 | __free_page(page); | ||
| 293 | } | ||
| 294 | |||
| 295 | void rb_free(struct ring_buffer *rb) | ||
| 296 | { | ||
| 297 | int i; | ||
| 298 | |||
| 299 | perf_mmap_free_page((unsigned long)rb->user_page); | ||
| 300 | for (i = 0; i < rb->nr_pages; i++) | ||
| 301 | perf_mmap_free_page((unsigned long)rb->data_pages[i]); | ||
| 302 | kfree(rb); | ||
| 303 | } | ||
| 304 | |||
| 305 | #else | ||
| 306 | |||
| 307 | struct page * | ||
| 308 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
| 309 | { | ||
| 310 | if (pgoff > (1UL << page_order(rb))) | ||
| 311 | return NULL; | ||
| 312 | |||
| 313 | return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); | ||
| 314 | } | ||
| 315 | |||
| 316 | static void perf_mmap_unmark_page(void *addr) | ||
| 317 | { | ||
| 318 | struct page *page = vmalloc_to_page(addr); | ||
| 319 | |||
| 320 | page->mapping = NULL; | ||
| 321 | } | ||
| 322 | |||
| 323 | static void rb_free_work(struct work_struct *work) | ||
| 324 | { | ||
| 325 | struct ring_buffer *rb; | ||
| 326 | void *base; | ||
| 327 | int i, nr; | ||
| 328 | |||
| 329 | rb = container_of(work, struct ring_buffer, work); | ||
| 330 | nr = 1 << page_order(rb); | ||
| 331 | |||
| 332 | base = rb->user_page; | ||
| 333 | for (i = 0; i < nr + 1; i++) | ||
| 334 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | ||
| 335 | |||
| 336 | vfree(base); | ||
| 337 | kfree(rb); | ||
| 338 | } | ||
| 339 | |||
| 340 | void rb_free(struct ring_buffer *rb) | ||
| 341 | { | ||
| 342 | schedule_work(&rb->work); | ||
| 343 | } | ||
| 344 | |||
| 345 | struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
| 346 | { | ||
| 347 | struct ring_buffer *rb; | ||
| 348 | unsigned long size; | ||
| 349 | void *all_buf; | ||
| 350 | |||
| 351 | size = sizeof(struct ring_buffer); | ||
| 352 | size += sizeof(void *); | ||
| 353 | |||
| 354 | rb = kzalloc(size, GFP_KERNEL); | ||
| 355 | if (!rb) | ||
| 356 | goto fail; | ||
| 357 | |||
| 358 | INIT_WORK(&rb->work, rb_free_work); | ||
| 359 | |||
| 360 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | ||
| 361 | if (!all_buf) | ||
| 362 | goto fail_all_buf; | ||
| 363 | |||
| 364 | rb->user_page = all_buf; | ||
| 365 | rb->data_pages[0] = all_buf + PAGE_SIZE; | ||
| 366 | rb->page_order = ilog2(nr_pages); | ||
| 367 | rb->nr_pages = 1; | ||
| 368 | |||
| 369 | ring_buffer_init(rb, watermark, flags); | ||
| 370 | |||
| 371 | return rb; | ||
| 372 | |||
| 373 | fail_all_buf: | ||
| 374 | kfree(rb); | ||
| 375 | |||
| 376 | fail: | ||
| 377 | return NULL; | ||
| 378 | } | ||
| 379 | |||
| 380 | #endif | ||
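Editor's note: the heart of perf_output_begin() above is the lock-free reservation: head is a free-running counter, and perf_output_space() refuses any record that would wrap past the reader's data_tail. A self-contained user-space model of that arithmetic (single-threaded, so a plain assignment stands in for the local_cmpxchg() retry loop, and BUF_SIZE plays the role of perf_data_size()):

    #include <stdio.h>
    #include <stdbool.h>

    #define BUF_SIZE 64              /* power of two, like nr_pages * PAGE_SIZE */

    /* Mirror of perf_output_space(): the record [offset, head) must not
     * overwrite data the reader (data_tail) has not consumed yet. */
    static bool output_space(unsigned long tail, unsigned long offset,
                             unsigned long head)
    {
            unsigned long mask = BUF_SIZE - 1;

            offset = (offset - tail) & mask;
            head   = (head   - tail) & mask;

            return (int)(head - offset) >= 0;
    }

    /* Single-threaded stand-in for the reservation step of perf_output_begin():
     * advance head by size bytes, or report the record as lost. */
    static bool reserve(unsigned long *head, unsigned long tail,
                        unsigned long size, unsigned long *offset)
    {
            unsigned long old = *head, new = old + size;

            if (!output_space(tail, old, new))
                    return false;    /* would overwrite unread data */

            *offset = old;           /* record starts here (masked by BUF_SIZE-1) */
            *head   = new;
            return true;
    }

    int main(void)
    {
            unsigned long head = 0, tail = 0, off;

            for (int i = 0; i < 10; i++) {
                    if (reserve(&head, tail, 24, &off))
                            printf("record %d at offset %lu\n", i, off & (BUF_SIZE - 1));
                    else
                            printf("record %d lost (reader stuck at tail=%lu)\n", i, tail);
            }
            return 0;
    }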
diff --git a/kernel/exit.c b/kernel/exit.c index 8dd874181542..2913b3509d42 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -85,7 +85,6 @@ static void __exit_signal(struct task_struct *tsk) | |||
| 85 | struct tty_struct *uninitialized_var(tty); | 85 | struct tty_struct *uninitialized_var(tty); |
| 86 | 86 | ||
| 87 | sighand = rcu_dereference_check(tsk->sighand, | 87 | sighand = rcu_dereference_check(tsk->sighand, |
| 88 | rcu_read_lock_held() || | ||
| 89 | lockdep_tasklist_lock_is_held()); | 88 | lockdep_tasklist_lock_is_held()); |
| 90 | spin_lock(&sighand->siglock); | 89 | spin_lock(&sighand->siglock); |
| 91 | 90 | ||
| @@ -169,7 +168,6 @@ void release_task(struct task_struct * p) | |||
| 169 | struct task_struct *leader; | 168 | struct task_struct *leader; |
| 170 | int zap_leader; | 169 | int zap_leader; |
| 171 | repeat: | 170 | repeat: |
| 172 | tracehook_prepare_release_task(p); | ||
| 173 | /* don't need to get the RCU readlock here - the process is dead and | 171 | /* don't need to get the RCU readlock here - the process is dead and |
| 174 | * can't be modifying its own credentials. But shut RCU-lockdep up */ | 172 | * can't be modifying its own credentials. But shut RCU-lockdep up */ |
| 175 | rcu_read_lock(); | 173 | rcu_read_lock(); |
| @@ -179,7 +177,7 @@ repeat: | |||
| 179 | proc_flush_task(p); | 177 | proc_flush_task(p); |
| 180 | 178 | ||
| 181 | write_lock_irq(&tasklist_lock); | 179 | write_lock_irq(&tasklist_lock); |
| 182 | tracehook_finish_release_task(p); | 180 | ptrace_release_task(p); |
| 183 | __exit_signal(p); | 181 | __exit_signal(p); |
| 184 | 182 | ||
| 185 | /* | 183 | /* |
| @@ -190,22 +188,12 @@ repeat: | |||
| 190 | zap_leader = 0; | 188 | zap_leader = 0; |
| 191 | leader = p->group_leader; | 189 | leader = p->group_leader; |
| 192 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { | 190 | if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { |
| 193 | BUG_ON(task_detached(leader)); | ||
| 194 | do_notify_parent(leader, leader->exit_signal); | ||
| 195 | /* | 191 | /* |
| 196 | * If we were the last child thread and the leader has | 192 | * If we were the last child thread and the leader has |
| 197 | * exited already, and the leader's parent ignores SIGCHLD, | 193 | * exited already, and the leader's parent ignores SIGCHLD, |
| 198 | * then we are the one who should release the leader. | 194 | * then we are the one who should release the leader. |
| 199 | * | ||
| 200 | * do_notify_parent() will have marked it self-reaping in | ||
| 201 | * that case. | ||
| 202 | */ | ||
| 203 | zap_leader = task_detached(leader); | ||
| 204 | |||
| 205 | /* | ||
| 206 | * This maintains the invariant that release_task() | ||
| 207 | * only runs on a task in EXIT_DEAD, just for sanity. | ||
| 208 | */ | 195 | */ |
| 196 | zap_leader = do_notify_parent(leader, leader->exit_signal); | ||
| 209 | if (zap_leader) | 197 | if (zap_leader) |
| 210 | leader->exit_state = EXIT_DEAD; | 198 | leader->exit_state = EXIT_DEAD; |
| 211 | } | 199 | } |
| @@ -277,18 +265,16 @@ int is_current_pgrp_orphaned(void) | |||
| 277 | return retval; | 265 | return retval; |
| 278 | } | 266 | } |
| 279 | 267 | ||
| 280 | static int has_stopped_jobs(struct pid *pgrp) | 268 | static bool has_stopped_jobs(struct pid *pgrp) |
| 281 | { | 269 | { |
| 282 | int retval = 0; | ||
| 283 | struct task_struct *p; | 270 | struct task_struct *p; |
| 284 | 271 | ||
| 285 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 272 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
| 286 | if (!task_is_stopped(p)) | 273 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
| 287 | continue; | 274 | return true; |
| 288 | retval = 1; | ||
| 289 | break; | ||
| 290 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); | 275 | } while_each_pid_task(pgrp, PIDTYPE_PGID, p); |
| 291 | return retval; | 276 | |
| 277 | return false; | ||
| 292 | } | 278 | } |
| 293 | 279 | ||
| 294 | /* | 280 | /* |
| @@ -561,29 +547,28 @@ void exit_files(struct task_struct *tsk) | |||
| 561 | 547 | ||
| 562 | #ifdef CONFIG_MM_OWNER | 548 | #ifdef CONFIG_MM_OWNER |
| 563 | /* | 549 | /* |
| 564 | * Task p is exiting and it owned mm, lets find a new owner for it | 550 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
| 565 | */ | 551 | */ |
| 566 | static inline int | ||
| 567 | mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) | ||
| 568 | { | ||
| 569 | /* | ||
| 570 | * If there are other users of the mm and the owner (us) is exiting | ||
| 571 | * we need to find a new owner to take on the responsibility. | ||
| 572 | */ | ||
| 573 | if (atomic_read(&mm->mm_users) <= 1) | ||
| 574 | return 0; | ||
| 575 | if (mm->owner != p) | ||
| 576 | return 0; | ||
| 577 | return 1; | ||
| 578 | } | ||
| 579 | |||
| 580 | void mm_update_next_owner(struct mm_struct *mm) | 552 | void mm_update_next_owner(struct mm_struct *mm) |
| 581 | { | 553 | { |
| 582 | struct task_struct *c, *g, *p = current; | 554 | struct task_struct *c, *g, *p = current; |
| 583 | 555 | ||
| 584 | retry: | 556 | retry: |
| 585 | if (!mm_need_new_owner(mm, p)) | 557 | /* |
| 558 | * If the exiting or execing task is not the owner, it's | ||
| 559 | * someone else's problem. | ||
| 560 | */ | ||
| 561 | if (mm->owner != p) | ||
| 586 | return; | 562 | return; |
| 563 | /* | ||
| 564 | * The current owner is exiting/execing and there are no other | ||
| 565 | * candidates. Do not leave the mm pointing to a possibly | ||
| 566 | * freed task structure. | ||
| 567 | */ | ||
| 568 | if (atomic_read(&mm->mm_users) <= 1) { | ||
| 569 | mm->owner = NULL; | ||
| 570 | return; | ||
| 571 | } | ||
| 587 | 572 | ||
| 588 | read_lock(&tasklist_lock); | 573 | read_lock(&tasklist_lock); |
| 589 | /* | 574 | /* |
| @@ -752,7 +737,7 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
| 752 | { | 737 | { |
| 753 | list_move_tail(&p->sibling, &p->real_parent->children); | 738 | list_move_tail(&p->sibling, &p->real_parent->children); |
| 754 | 739 | ||
| 755 | if (task_detached(p)) | 740 | if (p->exit_state == EXIT_DEAD) |
| 756 | return; | 741 | return; |
| 757 | /* | 742 | /* |
| 758 | * If this is a threaded reparent there is no need to | 743 | * If this is a threaded reparent there is no need to |
| @@ -765,10 +750,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, | |||
| 765 | p->exit_signal = SIGCHLD; | 750 | p->exit_signal = SIGCHLD; |
| 766 | 751 | ||
| 767 | /* If it has exited notify the new parent about this child's death. */ | 752 | /* If it has exited notify the new parent about this child's death. */ |
| 768 | if (!task_ptrace(p) && | 753 | if (!p->ptrace && |
| 769 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { | 754 | p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { |
| 770 | do_notify_parent(p, p->exit_signal); | 755 | if (do_notify_parent(p, p->exit_signal)) { |
| 771 | if (task_detached(p)) { | ||
| 772 | p->exit_state = EXIT_DEAD; | 756 | p->exit_state = EXIT_DEAD; |
| 773 | list_move_tail(&p->sibling, dead); | 757 | list_move_tail(&p->sibling, dead); |
| 774 | } | 758 | } |
| @@ -795,7 +779,7 @@ static void forget_original_parent(struct task_struct *father) | |||
| 795 | do { | 779 | do { |
| 796 | t->real_parent = reaper; | 780 | t->real_parent = reaper; |
| 797 | if (t->parent == father) { | 781 | if (t->parent == father) { |
| 798 | BUG_ON(task_ptrace(t)); | 782 | BUG_ON(t->ptrace); |
| 799 | t->parent = t->real_parent; | 783 | t->parent = t->real_parent; |
| 800 | } | 784 | } |
| 801 | if (t->pdeath_signal) | 785 | if (t->pdeath_signal) |
| @@ -820,8 +804,7 @@ static void forget_original_parent(struct task_struct *father) | |||
| 820 | */ | 804 | */ |
| 821 | static void exit_notify(struct task_struct *tsk, int group_dead) | 805 | static void exit_notify(struct task_struct *tsk, int group_dead) |
| 822 | { | 806 | { |
| 823 | int signal; | 807 | bool autoreap; |
| 824 | void *cookie; | ||
| 825 | 808 | ||
| 826 | /* | 809 | /* |
| 827 | * This does two things: | 810 | * This does two things: |
| @@ -852,26 +835,33 @@ static void exit_notify(struct task_struct *tsk, int group_dead) | |||
| 852 | * we have changed execution domain as these two values started | 835 | * we have changed execution domain as these two values started |
| 853 | * the same after a fork. | 836 | * the same after a fork. |
| 854 | */ | 837 | */ |
| 855 | if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && | 838 | if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && |
| 856 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || | 839 | (tsk->parent_exec_id != tsk->real_parent->self_exec_id || |
| 857 | tsk->self_exec_id != tsk->parent_exec_id)) | 840 | tsk->self_exec_id != tsk->parent_exec_id)) |
| 858 | tsk->exit_signal = SIGCHLD; | 841 | tsk->exit_signal = SIGCHLD; |
| 859 | 842 | ||
| 860 | signal = tracehook_notify_death(tsk, &cookie, group_dead); | 843 | if (unlikely(tsk->ptrace)) { |
| 861 | if (signal >= 0) | 844 | int sig = thread_group_leader(tsk) && |
| 862 | signal = do_notify_parent(tsk, signal); | 845 | thread_group_empty(tsk) && |
| 846 | !ptrace_reparented(tsk) ? | ||
| 847 | tsk->exit_signal : SIGCHLD; | ||
| 848 | autoreap = do_notify_parent(tsk, sig); | ||
| 849 | } else if (thread_group_leader(tsk)) { | ||
| 850 | autoreap = thread_group_empty(tsk) && | ||
| 851 | do_notify_parent(tsk, tsk->exit_signal); | ||
| 852 | } else { | ||
| 853 | autoreap = true; | ||
| 854 | } | ||
| 863 | 855 | ||
| 864 | tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; | 856 | tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; |
| 865 | 857 | ||
| 866 | /* mt-exec, de_thread() is waiting for group leader */ | 858 | /* mt-exec, de_thread() is waiting for group leader */ |
| 867 | if (unlikely(tsk->signal->notify_count < 0)) | 859 | if (unlikely(tsk->signal->notify_count < 0)) |
| 868 | wake_up_process(tsk->signal->group_exit_task); | 860 | wake_up_process(tsk->signal->group_exit_task); |
| 869 | write_unlock_irq(&tasklist_lock); | 861 | write_unlock_irq(&tasklist_lock); |
| 870 | 862 | ||
| 871 | tracehook_report_death(tsk, signal, cookie, group_dead); | ||
| 872 | |||
| 873 | /* If the process is dead, release it - nobody will wait for it */ | 863 | /* If the process is dead, release it - nobody will wait for it */ |
| 874 | if (signal == DEATH_REAP) | 864 | if (autoreap) |
| 875 | release_task(tsk); | 865 | release_task(tsk); |
| 876 | } | 866 | } |
| 877 | 867 | ||
| @@ -907,7 +897,6 @@ NORET_TYPE void do_exit(long code) | |||
| 907 | 897 | ||
| 908 | profile_task_exit(tsk); | 898 | profile_task_exit(tsk); |
| 909 | 899 | ||
| 910 | WARN_ON(atomic_read(&tsk->fs_excl)); | ||
| 911 | WARN_ON(blk_needs_flush_plug(tsk)); | 900 | WARN_ON(blk_needs_flush_plug(tsk)); |
| 912 | 901 | ||
| 913 | if (unlikely(in_interrupt())) | 902 | if (unlikely(in_interrupt())) |
| @@ -924,7 +913,7 @@ NORET_TYPE void do_exit(long code) | |||
| 924 | */ | 913 | */ |
| 925 | set_fs(USER_DS); | 914 | set_fs(USER_DS); |
| 926 | 915 | ||
| 927 | tracehook_report_exit(&code); | 916 | ptrace_event(PTRACE_EVENT_EXIT, code); |
| 928 | 917 | ||
| 929 | validate_creds_for_do_exit(tsk); | 918 | validate_creds_for_do_exit(tsk); |
| 930 | 919 | ||
| @@ -991,6 +980,7 @@ NORET_TYPE void do_exit(long code) | |||
| 991 | trace_sched_process_exit(tsk); | 980 | trace_sched_process_exit(tsk); |
| 992 | 981 | ||
| 993 | exit_sem(tsk); | 982 | exit_sem(tsk); |
| 983 | exit_shm(tsk); | ||
| 994 | exit_files(tsk); | 984 | exit_files(tsk); |
| 995 | exit_fs(tsk); | 985 | exit_fs(tsk); |
| 996 | check_stack_usage(); | 986 | check_stack_usage(); |
| @@ -1236,9 +1226,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1236 | traced = ptrace_reparented(p); | 1226 | traced = ptrace_reparented(p); |
| 1237 | /* | 1227 | /* |
| 1238 | * It can be ptraced but not reparented, check | 1228 | * It can be ptraced but not reparented, check |
| 1239 | * !task_detached() to filter out sub-threads. | 1229 | * thread_group_leader() to filter out sub-threads. |
| 1240 | */ | 1230 | */ |
| 1241 | if (likely(!traced) && likely(!task_detached(p))) { | 1231 | if (likely(!traced) && thread_group_leader(p)) { |
| 1242 | struct signal_struct *psig; | 1232 | struct signal_struct *psig; |
| 1243 | struct signal_struct *sig; | 1233 | struct signal_struct *sig; |
| 1244 | unsigned long maxrss; | 1234 | unsigned long maxrss; |
| @@ -1346,16 +1336,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1346 | /* We dropped tasklist, ptracer could die and untrace */ | 1336 | /* We dropped tasklist, ptracer could die and untrace */ |
| 1347 | ptrace_unlink(p); | 1337 | ptrace_unlink(p); |
| 1348 | /* | 1338 | /* |
| 1349 | * If this is not a detached task, notify the parent. | 1339 | * If this is not a sub-thread, notify the parent. |
| 1350 | * If it's still not detached after that, don't release | 1340 | * If parent wants a zombie, don't release it now. |
| 1351 | * it now. | ||
| 1352 | */ | 1341 | */ |
| 1353 | if (!task_detached(p)) { | 1342 | if (thread_group_leader(p) && |
| 1354 | do_notify_parent(p, p->exit_signal); | 1343 | !do_notify_parent(p, p->exit_signal)) { |
| 1355 | if (!task_detached(p)) { | 1344 | p->exit_state = EXIT_ZOMBIE; |
| 1356 | p->exit_state = EXIT_ZOMBIE; | 1345 | p = NULL; |
| 1357 | p = NULL; | ||
| 1358 | } | ||
| 1359 | } | 1346 | } |
| 1360 | write_unlock_irq(&tasklist_lock); | 1347 | write_unlock_irq(&tasklist_lock); |
| 1361 | } | 1348 | } |
| @@ -1368,7 +1355,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
| 1368 | static int *task_stopped_code(struct task_struct *p, bool ptrace) | 1355 | static int *task_stopped_code(struct task_struct *p, bool ptrace) |
| 1369 | { | 1356 | { |
| 1370 | if (ptrace) { | 1357 | if (ptrace) { |
| 1371 | if (task_is_stopped_or_traced(p)) | 1358 | if (task_is_stopped_or_traced(p) && |
| 1359 | !(p->jobctl & JOBCTL_LISTENING)) | ||
| 1372 | return &p->exit_code; | 1360 | return &p->exit_code; |
| 1373 | } else { | 1361 | } else { |
| 1374 | if (p->signal->flags & SIGNAL_STOP_STOPPED) | 1362 | if (p->signal->flags & SIGNAL_STOP_STOPPED) |
| @@ -1377,11 +1365,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) | |||
| 1377 | return NULL; | 1365 | return NULL; |
| 1378 | } | 1366 | } |
| 1379 | 1367 | ||
| 1380 | /* | 1368 | /** |
| 1381 | * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold | 1369 | * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED |
| 1382 | * read_lock(&tasklist_lock) on entry. If we return zero, we still hold | 1370 | * @wo: wait options |
| 1383 | * the lock and this task is uninteresting. If we return nonzero, we have | 1371 | * @ptrace: is the wait for ptrace |
| 1384 | * released the lock and the system call should return. | 1372 | * @p: task to wait for |
| 1373 | * | ||
| 1374 | * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. | ||
| 1375 | * | ||
| 1376 | * CONTEXT: | ||
| 1377 | * read_lock(&tasklist_lock), which is released if return value is | ||
| 1378 | * non-zero. Also, grabs and releases @p->sighand->siglock. | ||
| 1379 | * | ||
| 1380 | * RETURNS: | ||
| 1381 | * 0 if wait condition didn't exist and search for other wait conditions | ||
| 1382 | * should continue. Non-zero return, -errno on failure and @p's pid on | ||
| 1383 | * success, implies that tasklist_lock is released and wait condition | ||
| 1384 | * search should terminate. | ||
| 1385 | */ | 1385 | */ |
| 1386 | static int wait_task_stopped(struct wait_opts *wo, | 1386 | static int wait_task_stopped(struct wait_opts *wo, |
| 1387 | int ptrace, struct task_struct *p) | 1387 | int ptrace, struct task_struct *p) |
| @@ -1397,6 +1397,9 @@ static int wait_task_stopped(struct wait_opts *wo, | |||
| 1397 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) | 1397 | if (!ptrace && !(wo->wo_flags & WUNTRACED)) |
| 1398 | return 0; | 1398 | return 0; |
| 1399 | 1399 | ||
| 1400 | if (!task_stopped_code(p, ptrace)) | ||
| 1401 | return 0; | ||
| 1402 | |||
| 1400 | exit_code = 0; | 1403 | exit_code = 0; |
| 1401 | spin_lock_irq(&p->sighand->siglock); | 1404 | spin_lock_irq(&p->sighand->siglock); |
| 1402 | 1405 | ||
| @@ -1538,33 +1541,83 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, | |||
| 1538 | return 0; | 1541 | return 0; |
| 1539 | } | 1542 | } |
| 1540 | 1543 | ||
| 1541 | if (likely(!ptrace) && unlikely(task_ptrace(p))) { | 1544 | /* dead body doesn't have much to contribute */ |
| 1545 | if (p->exit_state == EXIT_DEAD) | ||
| 1546 | return 0; | ||
| 1547 | |||
| 1548 | /* slay zombie? */ | ||
| 1549 | if (p->exit_state == EXIT_ZOMBIE) { | ||
| 1550 | /* | ||
| 1551 | * A zombie ptracee is only visible to its ptracer. | ||
| 1552 | * Notification and reaping will be cascaded to the real | ||
| 1553 | * parent when the ptracer detaches. | ||
| 1554 | */ | ||
| 1555 | if (likely(!ptrace) && unlikely(p->ptrace)) { | ||
| 1556 | /* it will become visible, clear notask_error */ | ||
| 1557 | wo->notask_error = 0; | ||
| 1558 | return 0; | ||
| 1559 | } | ||
| 1560 | |||
| 1561 | /* we don't reap group leaders with subthreads */ | ||
| 1562 | if (!delay_group_leader(p)) | ||
| 1563 | return wait_task_zombie(wo, p); | ||
| 1564 | |||
| 1565 | /* | ||
| 1566 | * Allow access to stopped/continued state via zombie by | ||
| 1567 | * falling through. Clearing of notask_error is complex. | ||
| 1568 | * | ||
| 1569 | * When !@ptrace: | ||
| 1570 | * | ||
| 1571 | * If WEXITED is set, notask_error should naturally be | ||
| 1572 | * cleared. If not, subset of WSTOPPED|WCONTINUED is set, | ||
| 1573 | * so, if there are live subthreads, there are events to | ||
| 1574 | * wait for. If all subthreads are dead, it's still safe | ||
| 1575 | * to clear - this function will be called again in a finite | ||
| 1576 | * amount of time once all the subthreads are released and | ||
| 1577 | * will then return without clearing. | ||
| 1578 | * | ||
| 1579 | * When @ptrace: | ||
| 1580 | * | ||
| 1581 | * Stopped state is per-task and thus can't change once the | ||
| 1582 | * target task dies. Only continued and exited can happen. | ||
| 1583 | * Clear notask_error if WCONTINUED | WEXITED. | ||
| 1584 | */ | ||
| 1585 | if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) | ||
| 1586 | wo->notask_error = 0; | ||
| 1587 | } else { | ||
| 1588 | /* | ||
| 1589 | * If @p is ptraced by a task in its real parent's group, | ||
| 1590 | * hide group stop/continued state when looking at @p as | ||
| 1591 | * the real parent; otherwise, a single stop can be | ||
| 1592 | * reported twice as group and ptrace stops. | ||
| 1593 | * | ||
| 1594 | * If a ptracer wants to distinguish the two events for its | ||
| 1595 | * own children, it should create a separate process which | ||
| 1596 | * takes the role of real parent. | ||
| 1597 | */ | ||
| 1598 | if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) | ||
| 1599 | return 0; | ||
| 1600 | |||
| 1542 | /* | 1601 | /* |
| 1543 | * This child is hidden by ptrace. | 1602 | * @p is alive and it's gonna stop, continue or exit, so |
| 1544 | * We aren't allowed to see it now, but eventually we will. | 1603 | * there always is something to wait for. |
| 1545 | */ | 1604 | */ |
| 1546 | wo->notask_error = 0; | 1605 | wo->notask_error = 0; |
| 1547 | return 0; | ||
| 1548 | } | 1606 | } |
| 1549 | 1607 | ||
| 1550 | if (p->exit_state == EXIT_DEAD) | ||
| 1551 | return 0; | ||
| 1552 | |||
| 1553 | /* | 1608 | /* |
| 1554 | * We don't reap group leaders with subthreads. | 1609 | * Wait for stopped. Depending on @ptrace, different stopped state |
| 1610 | * is used and the two don't interact with each other. | ||
| 1555 | */ | 1611 | */ |
| 1556 | if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) | 1612 | ret = wait_task_stopped(wo, ptrace, p); |
| 1557 | return wait_task_zombie(wo, p); | 1613 | if (ret) |
| 1614 | return ret; | ||
| 1558 | 1615 | ||
| 1559 | /* | 1616 | /* |
| 1560 | * It's stopped or running now, so it might | 1617 | * Wait for continued. There's only one continued state and the |
| 1561 | * later continue, exit, or stop again. | 1618 | * ptracer can consume it which can confuse the real parent. Don't |
| 1619 | * use WCONTINUED from ptracer. You don't need or want it. | ||
| 1562 | */ | 1620 | */ |
| 1563 | wo->notask_error = 0; | ||
| 1564 | |||
| 1565 | if (task_stopped_code(p, ptrace)) | ||
| 1566 | return wait_task_stopped(wo, ptrace, p); | ||
| 1567 | |||
| 1568 | return wait_task_continued(wo, p); | 1621 | return wait_task_continued(wo, p); |
| 1569 | } | 1622 | } |
| 1570 | 1623 | ||
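
For illustration only (not part of this patch): the reworked wait_consider_task() above classifies a child into the three wait conditions — exited, stopped, or continued — in that order. A rough userspace sketch of the same three conditions, observed with waitpid() and the WUNTRACED/WCONTINUED flags; the SIGSTOP/SIGCONT choreography below is made up for the demo and error handling is omitted:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t pid = fork();
            int status;

            if (pid == 0) {                     /* child just sleeps until signalled */
                    pause();
                    _exit(0);
            }

            kill(pid, SIGSTOP);                 /* child enters TASK_STOPPED */
            waitpid(pid, &status, WUNTRACED);
            if (WIFSTOPPED(status))
                    printf("stopped by signal %d\n", WSTOPSIG(status));

            kill(pid, SIGCONT);                 /* SIGNAL_STOP_CONTINUED */
            waitpid(pid, &status, WCONTINUED);
            if (WIFCONTINUED(status))
                    printf("continued\n");

            kill(pid, SIGTERM);                 /* child exits, zombie until reaped */
            waitpid(pid, &status, 0);
            if (WIFSIGNALED(status))
                    printf("exited on signal %d\n", WTERMSIG(status));
            return 0;
    }
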
diff --git a/kernel/extable.c b/kernel/extable.c index 7f8f263f8524..5339705b8241 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr) | |||
| 72 | return 0; | 72 | return 0; |
| 73 | } | 73 | } |
| 74 | 74 | ||
| 75 | /** | ||
| 76 | * core_kernel_data - tell if addr points to kernel data | ||
| 77 | * @addr: address to test | ||
| 78 | * | ||
| 79 | * Returns true if @addr passed in is from the core kernel data | ||
| 80 | * section. | ||
| 81 | * | ||
| 82 | * Note: On some archs this may return true for core RODATA and | ||
| 83 | * false on others, but it will always be true for core RW data. | ||
| 84 | */ | ||
| 85 | int core_kernel_data(unsigned long addr) | ||
| 86 | { | ||
| 87 | if (addr >= (unsigned long)_sdata && | ||
| 88 | addr < (unsigned long)_edata) | ||
| 89 | return 1; | ||
| 90 | return 0; | ||
| 91 | } | ||
| 92 | |||
| 75 | int __kernel_text_address(unsigned long addr) | 93 | int __kernel_text_address(unsigned long addr) |
| 76 | { | 94 | { |
| 77 | if (core_kernel_text(addr)) | 95 | if (core_kernel_text(addr)) |
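
For illustration only: core_kernel_data() is just a bounds check against the _sdata/_edata section markers. A loose userspace analogue, assuming a typical Linux toolchain, can be built on the etext/edata/end symbols documented in end(3); the helper and variable names below are invented for the example:

    #include <stdio.h>

    extern char etext, edata, end;      /* provided by the linker, see end(3) */

    static int some_global = 42;        /* lives in the initialized data section */

    static int in_data_or_bss(const void *addr)
    {
            return (const char *)addr >= &etext && (const char *)addr < &end;
    }

    int main(void)
    {
            int on_stack = 0;

            printf("some_global: %d\n", in_data_or_bss(&some_global));  /* 1 */
            printf("on_stack:    %d\n", in_data_or_bss(&on_stack));     /* 0 */
            return 0;
    }
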
diff --git a/kernel/fork.c b/kernel/fork.c index e7548dee636b..8e6b6f4fb272 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -37,7 +37,6 @@ | |||
| 37 | #include <linux/swap.h> | 37 | #include <linux/swap.h> |
| 38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
| 39 | #include <linux/jiffies.h> | 39 | #include <linux/jiffies.h> |
| 40 | #include <linux/tracehook.h> | ||
| 41 | #include <linux/futex.h> | 40 | #include <linux/futex.h> |
| 42 | #include <linux/compat.h> | 41 | #include <linux/compat.h> |
| 43 | #include <linux/kthread.h> | 42 | #include <linux/kthread.h> |
| @@ -59,7 +58,6 @@ | |||
| 59 | #include <linux/taskstats_kern.h> | 58 | #include <linux/taskstats_kern.h> |
| 60 | #include <linux/random.h> | 59 | #include <linux/random.h> |
| 61 | #include <linux/tty.h> | 60 | #include <linux/tty.h> |
| 62 | #include <linux/proc_fs.h> | ||
| 63 | #include <linux/blkdev.h> | 61 | #include <linux/blkdev.h> |
| 64 | #include <linux/fs_struct.h> | 62 | #include <linux/fs_struct.h> |
| 65 | #include <linux/magic.h> | 63 | #include <linux/magic.h> |
| @@ -82,7 +80,7 @@ | |||
| 82 | * Protected counters by write_lock_irq(&tasklist_lock) | 80 | * Protected counters by write_lock_irq(&tasklist_lock) |
| 83 | */ | 81 | */ |
| 84 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | 82 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
| 85 | int nr_threads; /* The idle threads do not count.. */ | 83 | int nr_threads; /* The idle threads do not count.. */ |
| 86 | 84 | ||
| 87 | int max_threads; /* tunable limit on nr_threads */ | 85 | int max_threads; /* tunable limit on nr_threads */ |
| 88 | 86 | ||
| @@ -234,7 +232,7 @@ void __init fork_init(unsigned long mempages) | |||
| 234 | /* | 232 | /* |
| 235 | * we need to allow at least 20 threads to boot a system | 233 | * we need to allow at least 20 threads to boot a system |
| 236 | */ | 234 | */ |
| 237 | if(max_threads < 20) | 235 | if (max_threads < 20) |
| 238 | max_threads = 20; | 236 | max_threads = 20; |
| 239 | 237 | ||
| 240 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | 238 | init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; |
| @@ -270,7 +268,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 270 | return NULL; | 268 | return NULL; |
| 271 | } | 269 | } |
| 272 | 270 | ||
| 273 | err = arch_dup_task_struct(tsk, orig); | 271 | err = arch_dup_task_struct(tsk, orig); |
| 274 | if (err) | 272 | if (err) |
| 275 | goto out; | 273 | goto out; |
| 276 | 274 | ||
| @@ -290,9 +288,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 290 | tsk->stack_canary = get_random_int(); | 288 | tsk->stack_canary = get_random_int(); |
| 291 | #endif | 289 | #endif |
| 292 | 290 | ||
| 293 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 291 | /* |
| 294 | atomic_set(&tsk->usage,2); | 292 | * One for us, one for whoever does the "release_task()" (usually |
| 295 | atomic_set(&tsk->fs_excl, 0); | 293 | * parent) |
| 294 | */ | ||
| 295 | atomic_set(&tsk->usage, 2); | ||
| 296 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 296 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
| 297 | tsk->btrace_seq = 0; | 297 | tsk->btrace_seq = 0; |
| 298 | #endif | 298 | #endif |
| @@ -383,15 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 383 | get_file(file); | 383 | get_file(file); |
| 384 | if (tmp->vm_flags & VM_DENYWRITE) | 384 | if (tmp->vm_flags & VM_DENYWRITE) |
| 385 | atomic_dec(&inode->i_writecount); | 385 | atomic_dec(&inode->i_writecount); |
| 386 | spin_lock(&mapping->i_mmap_lock); | 386 | mutex_lock(&mapping->i_mmap_mutex); |
| 387 | if (tmp->vm_flags & VM_SHARED) | 387 | if (tmp->vm_flags & VM_SHARED) |
| 388 | mapping->i_mmap_writable++; | 388 | mapping->i_mmap_writable++; |
| 389 | tmp->vm_truncate_count = mpnt->vm_truncate_count; | ||
| 390 | flush_dcache_mmap_lock(mapping); | 389 | flush_dcache_mmap_lock(mapping); |
| 391 | /* insert tmp into the share list, just after mpnt */ | 390 | /* insert tmp into the share list, just after mpnt */ |
| 392 | vma_prio_tree_add(tmp, mpnt); | 391 | vma_prio_tree_add(tmp, mpnt); |
| 393 | flush_dcache_mmap_unlock(mapping); | 392 | flush_dcache_mmap_unlock(mapping); |
| 394 | spin_unlock(&mapping->i_mmap_lock); | 393 | mutex_unlock(&mapping->i_mmap_mutex); |
| 395 | } | 394 | } |
| 396 | 395 | ||
| 397 | /* | 396 | /* |
| @@ -441,7 +440,7 @@ fail_nomem: | |||
| 441 | goto out; | 440 | goto out; |
| 442 | } | 441 | } |
| 443 | 442 | ||
| 444 | static inline int mm_alloc_pgd(struct mm_struct * mm) | 443 | static inline int mm_alloc_pgd(struct mm_struct *mm) |
| 445 | { | 444 | { |
| 446 | mm->pgd = pgd_alloc(mm); | 445 | mm->pgd = pgd_alloc(mm); |
| 447 | if (unlikely(!mm->pgd)) | 446 | if (unlikely(!mm->pgd)) |
| @@ -449,7 +448,7 @@ static inline int mm_alloc_pgd(struct mm_struct * mm) | |||
| 449 | return 0; | 448 | return 0; |
| 450 | } | 449 | } |
| 451 | 450 | ||
| 452 | static inline void mm_free_pgd(struct mm_struct * mm) | 451 | static inline void mm_free_pgd(struct mm_struct *mm) |
| 453 | { | 452 | { |
| 454 | pgd_free(mm, mm->pgd); | 453 | pgd_free(mm, mm->pgd); |
| 455 | } | 454 | } |
| @@ -486,7 +485,7 @@ static void mm_init_aio(struct mm_struct *mm) | |||
| 486 | #endif | 485 | #endif |
| 487 | } | 486 | } |
| 488 | 487 | ||
| 489 | static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | 488 | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) |
| 490 | { | 489 | { |
| 491 | atomic_set(&mm->mm_users, 1); | 490 | atomic_set(&mm->mm_users, 1); |
| 492 | atomic_set(&mm->mm_count, 1); | 491 | atomic_set(&mm->mm_count, 1); |
| @@ -517,16 +516,17 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
| 517 | /* | 516 | /* |
| 518 | * Allocate and initialize an mm_struct. | 517 | * Allocate and initialize an mm_struct. |
| 519 | */ | 518 | */ |
| 520 | struct mm_struct * mm_alloc(void) | 519 | struct mm_struct *mm_alloc(void) |
| 521 | { | 520 | { |
| 522 | struct mm_struct * mm; | 521 | struct mm_struct *mm; |
| 523 | 522 | ||
| 524 | mm = allocate_mm(); | 523 | mm = allocate_mm(); |
| 525 | if (mm) { | 524 | if (!mm) |
| 526 | memset(mm, 0, sizeof(*mm)); | 525 | return NULL; |
| 527 | mm = mm_init(mm, current); | 526 | |
| 528 | } | 527 | memset(mm, 0, sizeof(*mm)); |
| 529 | return mm; | 528 | mm_init_cpumask(mm); |
| 529 | return mm_init(mm, current); | ||
| 530 | } | 530 | } |
| 531 | 531 | ||
| 532 | /* | 532 | /* |
| @@ -573,6 +573,57 @@ void mmput(struct mm_struct *mm) | |||
| 573 | } | 573 | } |
| 574 | EXPORT_SYMBOL_GPL(mmput); | 574 | EXPORT_SYMBOL_GPL(mmput); |
| 575 | 575 | ||
| 576 | /* | ||
| 577 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
| 578 | * during exec and are not mapped with the mmap system call. | ||
| 579 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
| 580 | */ | ||
| 581 | void added_exe_file_vma(struct mm_struct *mm) | ||
| 582 | { | ||
| 583 | mm->num_exe_file_vmas++; | ||
| 584 | } | ||
| 585 | |||
| 586 | void removed_exe_file_vma(struct mm_struct *mm) | ||
| 587 | { | ||
| 588 | mm->num_exe_file_vmas--; | ||
| 589 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { | ||
| 590 | fput(mm->exe_file); | ||
| 591 | mm->exe_file = NULL; | ||
| 592 | } | ||
| 593 | |||
| 594 | } | ||
| 595 | |||
| 596 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | ||
| 597 | { | ||
| 598 | if (new_exe_file) | ||
| 599 | get_file(new_exe_file); | ||
| 600 | if (mm->exe_file) | ||
| 601 | fput(mm->exe_file); | ||
| 602 | mm->exe_file = new_exe_file; | ||
| 603 | mm->num_exe_file_vmas = 0; | ||
| 604 | } | ||
| 605 | |||
| 606 | struct file *get_mm_exe_file(struct mm_struct *mm) | ||
| 607 | { | ||
| 608 | struct file *exe_file; | ||
| 609 | |||
| 610 | /* We need mmap_sem to protect against races with removal of | ||
| 611 | * VM_EXECUTABLE vmas */ | ||
| 612 | down_read(&mm->mmap_sem); | ||
| 613 | exe_file = mm->exe_file; | ||
| 614 | if (exe_file) | ||
| 615 | get_file(exe_file); | ||
| 616 | up_read(&mm->mmap_sem); | ||
| 617 | return exe_file; | ||
| 618 | } | ||
| 619 | |||
| 620 | static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) | ||
| 621 | { | ||
| 622 | /* It's safe to write the exe_file pointer without exe_file_lock because | ||
| 623 | * this is called during fork when the task is not yet in /proc */ | ||
| 624 | newmm->exe_file = get_mm_exe_file(oldmm); | ||
| 625 | } | ||
| 626 | |||
| 576 | /** | 627 | /** |
| 577 | * get_task_mm - acquire a reference to the task's mm | 628 | * get_task_mm - acquire a reference to the task's mm |
| 578 | * | 629 | * |
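
For illustration only: the exe_file reference maintained by the helpers above is what /proc/<pid>/exe resolves to. Assuming /proc is mounted, a quick userspace check looks like this:

    #include <limits.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char path[PATH_MAX];
            ssize_t n = readlink("/proc/self/exe", path, sizeof(path) - 1);

            if (n > 0) {
                    path[n] = '\0';
                    printf("mm->exe_file for this task resolves to: %s\n", path);
            }
            return 0;
    }
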
| @@ -679,6 +730,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 679 | goto fail_nomem; | 730 | goto fail_nomem; |
| 680 | 731 | ||
| 681 | memcpy(mm, oldmm, sizeof(*mm)); | 732 | memcpy(mm, oldmm, sizeof(*mm)); |
| 733 | mm_init_cpumask(mm); | ||
| 682 | 734 | ||
| 683 | /* Initializing for Swap token stuff */ | 735 | /* Initializing for Swap token stuff */ |
| 684 | mm->token_priority = 0; | 736 | mm->token_priority = 0; |
| @@ -726,9 +778,9 @@ fail_nocontext: | |||
| 726 | return NULL; | 778 | return NULL; |
| 727 | } | 779 | } |
| 728 | 780 | ||
| 729 | static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | 781 | static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) |
| 730 | { | 782 | { |
| 731 | struct mm_struct * mm, *oldmm; | 783 | struct mm_struct *mm, *oldmm; |
| 732 | int retval; | 784 | int retval; |
| 733 | 785 | ||
| 734 | tsk->min_flt = tsk->maj_flt = 0; | 786 | tsk->min_flt = tsk->maj_flt = 0; |
| @@ -795,7 +847,7 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) | |||
| 795 | return 0; | 847 | return 0; |
| 796 | } | 848 | } |
| 797 | 849 | ||
| 798 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | 850 | static int copy_files(unsigned long clone_flags, struct task_struct *tsk) |
| 799 | { | 851 | { |
| 800 | struct files_struct *oldf, *newf; | 852 | struct files_struct *oldf, *newf; |
| 801 | int error = 0; | 853 | int error = 0; |
| @@ -927,6 +979,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 927 | tty_audit_fork(sig); | 979 | tty_audit_fork(sig); |
| 928 | sched_autogroup_fork(sig); | 980 | sched_autogroup_fork(sig); |
| 929 | 981 | ||
| 982 | #ifdef CONFIG_CGROUPS | ||
| 983 | init_rwsem(&sig->threadgroup_fork_lock); | ||
| 984 | #endif | ||
| 985 | |||
| 930 | sig->oom_adj = current->signal->oom_adj; | 986 | sig->oom_adj = current->signal->oom_adj; |
| 931 | sig->oom_score_adj = current->signal->oom_score_adj; | 987 | sig->oom_score_adj = current->signal->oom_score_adj; |
| 932 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 988 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
| @@ -958,7 +1014,7 @@ static void rt_mutex_init_task(struct task_struct *p) | |||
| 958 | { | 1014 | { |
| 959 | raw_spin_lock_init(&p->pi_lock); | 1015 | raw_spin_lock_init(&p->pi_lock); |
| 960 | #ifdef CONFIG_RT_MUTEXES | 1016 | #ifdef CONFIG_RT_MUTEXES |
| 961 | plist_head_init_raw(&p->pi_waiters, &p->pi_lock); | 1017 | plist_head_init(&p->pi_waiters); |
| 962 | p->pi_blocked_on = NULL; | 1018 | p->pi_blocked_on = NULL; |
| 963 | #endif | 1019 | #endif |
| 964 | } | 1020 | } |
| @@ -1055,6 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1055 | p->real_cred->user != INIT_USER) | 1111 | p->real_cred->user != INIT_USER) |
| 1056 | goto bad_fork_free; | 1112 | goto bad_fork_free; |
| 1057 | } | 1113 | } |
| 1114 | current->flags &= ~PF_NPROC_EXCEEDED; | ||
| 1058 | 1115 | ||
| 1059 | retval = copy_creds(p, clone_flags); | 1116 | retval = copy_creds(p, clone_flags); |
| 1060 | if (retval < 0) | 1117 | if (retval < 0) |
| @@ -1103,22 +1160,27 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1103 | 1160 | ||
| 1104 | posix_cpu_timers_init(p); | 1161 | posix_cpu_timers_init(p); |
| 1105 | 1162 | ||
| 1106 | p->lock_depth = -1; /* -1 = no lock */ | ||
| 1107 | do_posix_clock_monotonic_gettime(&p->start_time); | 1163 | do_posix_clock_monotonic_gettime(&p->start_time); |
| 1108 | p->real_start_time = p->start_time; | 1164 | p->real_start_time = p->start_time; |
| 1109 | monotonic_to_bootbased(&p->real_start_time); | 1165 | monotonic_to_bootbased(&p->real_start_time); |
| 1110 | p->io_context = NULL; | 1166 | p->io_context = NULL; |
| 1111 | p->audit_context = NULL; | 1167 | p->audit_context = NULL; |
| 1168 | if (clone_flags & CLONE_THREAD) | ||
| 1169 | threadgroup_fork_read_lock(current); | ||
| 1112 | cgroup_fork(p); | 1170 | cgroup_fork(p); |
| 1113 | #ifdef CONFIG_NUMA | 1171 | #ifdef CONFIG_NUMA |
| 1114 | p->mempolicy = mpol_dup(p->mempolicy); | 1172 | p->mempolicy = mpol_dup(p->mempolicy); |
| 1115 | if (IS_ERR(p->mempolicy)) { | 1173 | if (IS_ERR(p->mempolicy)) { |
| 1116 | retval = PTR_ERR(p->mempolicy); | 1174 | retval = PTR_ERR(p->mempolicy); |
| 1117 | p->mempolicy = NULL; | 1175 | p->mempolicy = NULL; |
| 1118 | goto bad_fork_cleanup_cgroup; | 1176 | goto bad_fork_cleanup_cgroup; |
| 1119 | } | 1177 | } |
| 1120 | mpol_fix_fork_child_flag(p); | 1178 | mpol_fix_fork_child_flag(p); |
| 1121 | #endif | 1179 | #endif |
| 1180 | #ifdef CONFIG_CPUSETS | ||
| 1181 | p->cpuset_mem_spread_rotor = NUMA_NO_NODE; | ||
| 1182 | p->cpuset_slab_spread_rotor = NUMA_NO_NODE; | ||
| 1183 | #endif | ||
| 1122 | #ifdef CONFIG_TRACE_IRQFLAGS | 1184 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 1123 | p->irq_events = 0; | 1185 | p->irq_events = 0; |
| 1124 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 1186 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
| @@ -1153,30 +1215,38 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1153 | #endif | 1215 | #endif |
| 1154 | 1216 | ||
| 1155 | /* Perform scheduler related setup. Assign this task to a CPU. */ | 1217 | /* Perform scheduler related setup. Assign this task to a CPU. */ |
| 1156 | sched_fork(p, clone_flags); | 1218 | sched_fork(p); |
| 1157 | 1219 | ||
| 1158 | retval = perf_event_init_task(p); | 1220 | retval = perf_event_init_task(p); |
| 1159 | if (retval) | 1221 | if (retval) |
| 1160 | goto bad_fork_cleanup_policy; | 1222 | goto bad_fork_cleanup_policy; |
| 1161 | 1223 | retval = audit_alloc(p); | |
| 1162 | if ((retval = audit_alloc(p))) | 1224 | if (retval) |
| 1163 | goto bad_fork_cleanup_policy; | 1225 | goto bad_fork_cleanup_policy; |
| 1164 | /* copy all the process information */ | 1226 | /* copy all the process information */ |
| 1165 | if ((retval = copy_semundo(clone_flags, p))) | 1227 | retval = copy_semundo(clone_flags, p); |
| 1228 | if (retval) | ||
| 1166 | goto bad_fork_cleanup_audit; | 1229 | goto bad_fork_cleanup_audit; |
| 1167 | if ((retval = copy_files(clone_flags, p))) | 1230 | retval = copy_files(clone_flags, p); |
| 1231 | if (retval) | ||
| 1168 | goto bad_fork_cleanup_semundo; | 1232 | goto bad_fork_cleanup_semundo; |
| 1169 | if ((retval = copy_fs(clone_flags, p))) | 1233 | retval = copy_fs(clone_flags, p); |
| 1234 | if (retval) | ||
| 1170 | goto bad_fork_cleanup_files; | 1235 | goto bad_fork_cleanup_files; |
| 1171 | if ((retval = copy_sighand(clone_flags, p))) | 1236 | retval = copy_sighand(clone_flags, p); |
| 1237 | if (retval) | ||
| 1172 | goto bad_fork_cleanup_fs; | 1238 | goto bad_fork_cleanup_fs; |
| 1173 | if ((retval = copy_signal(clone_flags, p))) | 1239 | retval = copy_signal(clone_flags, p); |
| 1240 | if (retval) | ||
| 1174 | goto bad_fork_cleanup_sighand; | 1241 | goto bad_fork_cleanup_sighand; |
| 1175 | if ((retval = copy_mm(clone_flags, p))) | 1242 | retval = copy_mm(clone_flags, p); |
| 1243 | if (retval) | ||
| 1176 | goto bad_fork_cleanup_signal; | 1244 | goto bad_fork_cleanup_signal; |
| 1177 | if ((retval = copy_namespaces(clone_flags, p))) | 1245 | retval = copy_namespaces(clone_flags, p); |
| 1246 | if (retval) | ||
| 1178 | goto bad_fork_cleanup_mm; | 1247 | goto bad_fork_cleanup_mm; |
| 1179 | if ((retval = copy_io(clone_flags, p))) | 1248 | retval = copy_io(clone_flags, p); |
| 1249 | if (retval) | ||
| 1180 | goto bad_fork_cleanup_namespaces; | 1250 | goto bad_fork_cleanup_namespaces; |
| 1181 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); | 1251 | retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); |
| 1182 | if (retval) | 1252 | if (retval) |
| @@ -1194,17 +1264,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1194 | if (clone_flags & CLONE_THREAD) | 1264 | if (clone_flags & CLONE_THREAD) |
| 1195 | p->tgid = current->tgid; | 1265 | p->tgid = current->tgid; |
| 1196 | 1266 | ||
| 1197 | if (current->nsproxy != p->nsproxy) { | ||
| 1198 | retval = ns_cgroup_clone(p, pid); | ||
| 1199 | if (retval) | ||
| 1200 | goto bad_fork_free_pid; | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1267 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; |
| 1204 | /* | 1268 | /* |
| 1205 | * Clear TID on mm_release()? | 1269 | * Clear TID on mm_release()? |
| 1206 | */ | 1270 | */ |
| 1207 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; | 1271 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; |
| 1208 | #ifdef CONFIG_BLOCK | 1272 | #ifdef CONFIG_BLOCK |
| 1209 | p->plug = NULL; | 1273 | p->plug = NULL; |
| 1210 | #endif | 1274 | #endif |
| @@ -1272,7 +1336,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1272 | * it's process group. | 1336 | * it's process group. |
| 1273 | * A fatal signal pending means that current will exit, so the new | 1337 | * A fatal signal pending means that current will exit, so the new |
| 1274 | * thread can't slip out of an OOM kill (or normal SIGKILL). | 1338 | * thread can't slip out of an OOM kill (or normal SIGKILL). |
| 1275 | */ | 1339 | */ |
| 1276 | recalc_sigpending(); | 1340 | recalc_sigpending(); |
| 1277 | if (signal_pending(current)) { | 1341 | if (signal_pending(current)) { |
| 1278 | spin_unlock(¤t->sighand->siglock); | 1342 | spin_unlock(¤t->sighand->siglock); |
| @@ -1290,7 +1354,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1290 | } | 1354 | } |
| 1291 | 1355 | ||
| 1292 | if (likely(p->pid)) { | 1356 | if (likely(p->pid)) { |
| 1293 | tracehook_finish_clone(p, clone_flags, trace); | 1357 | ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); |
| 1294 | 1358 | ||
| 1295 | if (thread_group_leader(p)) { | 1359 | if (thread_group_leader(p)) { |
| 1296 | if (is_child_reaper(pid)) | 1360 | if (is_child_reaper(pid)) |
| @@ -1313,6 +1377,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1313 | write_unlock_irq(&tasklist_lock); | 1377 | write_unlock_irq(&tasklist_lock); |
| 1314 | proc_fork_connector(p); | 1378 | proc_fork_connector(p); |
| 1315 | cgroup_post_fork(p); | 1379 | cgroup_post_fork(p); |
| 1380 | if (clone_flags & CLONE_THREAD) | ||
| 1381 | threadgroup_fork_read_unlock(current); | ||
| 1316 | perf_event_fork(p); | 1382 | perf_event_fork(p); |
| 1317 | return p; | 1383 | return p; |
| 1318 | 1384 | ||
| @@ -1351,6 +1417,8 @@ bad_fork_cleanup_policy: | |||
| 1351 | mpol_put(p->mempolicy); | 1417 | mpol_put(p->mempolicy); |
| 1352 | bad_fork_cleanup_cgroup: | 1418 | bad_fork_cleanup_cgroup: |
| 1353 | #endif | 1419 | #endif |
| 1420 | if (clone_flags & CLONE_THREAD) | ||
| 1421 | threadgroup_fork_read_unlock(current); | ||
| 1354 | cgroup_exit(p, cgroup_callbacks_done); | 1422 | cgroup_exit(p, cgroup_callbacks_done); |
| 1355 | delayacct_tsk_free(p); | 1423 | delayacct_tsk_free(p); |
| 1356 | module_put(task_thread_info(p)->exec_domain->module); | 1424 | module_put(task_thread_info(p)->exec_domain->module); |
| @@ -1427,10 +1495,22 @@ long do_fork(unsigned long clone_flags, | |||
| 1427 | } | 1495 | } |
| 1428 | 1496 | ||
| 1429 | /* | 1497 | /* |
| 1430 | * When called from kernel_thread, don't do user tracing stuff. | 1498 | * Determine whether and which event to report to ptracer. When |
| 1499 | * called from kernel_thread or CLONE_UNTRACED is explicitly | ||
| 1500 | * requested, no event is reported; otherwise, report if the event | ||
| 1501 | * for the type of forking is enabled. | ||
| 1431 | */ | 1502 | */ |
| 1432 | if (likely(user_mode(regs))) | 1503 | if (likely(user_mode(regs)) && !(clone_flags & CLONE_UNTRACED)) { |
| 1433 | trace = tracehook_prepare_clone(clone_flags); | 1504 | if (clone_flags & CLONE_VFORK) |
| 1505 | trace = PTRACE_EVENT_VFORK; | ||
| 1506 | else if ((clone_flags & CSIGNAL) != SIGCHLD) | ||
| 1507 | trace = PTRACE_EVENT_CLONE; | ||
| 1508 | else | ||
| 1509 | trace = PTRACE_EVENT_FORK; | ||
| 1510 | |||
| 1511 | if (likely(!ptrace_event_enabled(current, trace))) | ||
| 1512 | trace = 0; | ||
| 1513 | } | ||
| 1434 | 1514 | ||
| 1435 | p = copy_process(clone_flags, stack_start, regs, stack_size, | 1515 | p = copy_process(clone_flags, stack_start, regs, stack_size, |
| 1436 | child_tidptr, NULL, trace); | 1516 | child_tidptr, NULL, trace); |
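
For illustration only: the events selected above are reported only when the tracer opted in via PTRACE_SETOPTIONS. A minimal userspace sketch, assuming the PTRACE_O_TRACEFORK / PTRACE_EVENT_FORK definitions from <sys/ptrace.h> and omitting error handling:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            pid_t child = fork();
            int status;

            if (child == 0) {
                    ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                    raise(SIGSTOP);                     /* let the parent set options */
                    if (fork() == 0)                    /* this fork triggers the event */
                            _exit(0);
                    _exit(0);
            }

            waitpid(child, &status, 0);                 /* initial SIGSTOP */
            ptrace(PTRACE_SETOPTIONS, child, NULL,
                   (void *)(unsigned long)PTRACE_O_TRACEFORK);
            ptrace(PTRACE_CONT, child, NULL, NULL);

            waitpid(child, &status, 0);
            if (WIFSTOPPED(status) &&
                status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8)))
                    puts("PTRACE_EVENT_FORK reported");

            /* the new grandchild is auto-attached and left stopped here;
               it is detached implicitly when this tracer exits */
            ptrace(PTRACE_CONT, child, NULL, NULL);
            waitpid(child, &status, 0);
            return 0;
    }
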
| @@ -1454,26 +1534,26 @@ long do_fork(unsigned long clone_flags, | |||
| 1454 | } | 1534 | } |
| 1455 | 1535 | ||
| 1456 | audit_finish_fork(p); | 1536 | audit_finish_fork(p); |
| 1457 | tracehook_report_clone(regs, clone_flags, nr, p); | ||
| 1458 | 1537 | ||
| 1459 | /* | 1538 | /* |
| 1460 | * We set PF_STARTING at creation in case tracing wants to | 1539 | * We set PF_STARTING at creation in case tracing wants to |
| 1461 | * use this to distinguish a fully live task from one that | 1540 | * use this to distinguish a fully live task from one that |
| 1462 | * hasn't gotten to tracehook_report_clone() yet. Now we | 1541 | * hasn't finished SIGSTOP raising yet. Now we clear it |
| 1463 | * clear it and set the child going. | 1542 | * and set the child going. |
| 1464 | */ | 1543 | */ |
| 1465 | p->flags &= ~PF_STARTING; | 1544 | p->flags &= ~PF_STARTING; |
| 1466 | 1545 | ||
| 1467 | wake_up_new_task(p, clone_flags); | 1546 | wake_up_new_task(p); |
| 1468 | 1547 | ||
| 1469 | tracehook_report_clone_complete(trace, regs, | 1548 | /* forking complete and child started to run, tell ptracer */ |
| 1470 | clone_flags, nr, p); | 1549 | if (unlikely(trace)) |
| 1550 | ptrace_event(trace, nr); | ||
| 1471 | 1551 | ||
| 1472 | if (clone_flags & CLONE_VFORK) { | 1552 | if (clone_flags & CLONE_VFORK) { |
| 1473 | freezer_do_not_count(); | 1553 | freezer_do_not_count(); |
| 1474 | wait_for_completion(&vfork); | 1554 | wait_for_completion(&vfork); |
| 1475 | freezer_count(); | 1555 | freezer_count(); |
| 1476 | tracehook_report_vfork_done(p, nr); | 1556 | ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); |
| 1477 | } | 1557 | } |
| 1478 | } else { | 1558 | } else { |
| 1479 | nr = PTR_ERR(p); | 1559 | nr = PTR_ERR(p); |
| @@ -1508,11 +1588,19 @@ void __init proc_caches_init(void) | |||
| 1508 | fs_cachep = kmem_cache_create("fs_cache", | 1588 | fs_cachep = kmem_cache_create("fs_cache", |
| 1509 | sizeof(struct fs_struct), 0, | 1589 | sizeof(struct fs_struct), 0, |
| 1510 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1590 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
| 1591 | /* | ||
| 1592 | * FIXME! The "sizeof(struct mm_struct)" currently includes the | ||
| 1593 | * whole struct cpumask for the OFFSTACK case. We could change | ||
| 1594 | * this to *only* allocate as much of it as required by the | ||
| 1595 | * maximum number of CPU's we can ever have. The cpumask_allocation | ||
| 1596 | * is at the end of the structure, exactly for that reason. | ||
| 1597 | */ | ||
| 1511 | mm_cachep = kmem_cache_create("mm_struct", | 1598 | mm_cachep = kmem_cache_create("mm_struct", |
| 1512 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | 1599 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, |
| 1513 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); | 1600 | SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); |
| 1514 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); | 1601 | vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); |
| 1515 | mmap_init(); | 1602 | mmap_init(); |
| 1603 | nsproxy_cache_init(); | ||
| 1516 | } | 1604 | } |
| 1517 | 1605 | ||
| 1518 | /* | 1606 | /* |
| @@ -1609,12 +1697,14 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) | |||
| 1609 | */ | 1697 | */ |
| 1610 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) | 1698 | if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) |
| 1611 | do_sysvsem = 1; | 1699 | do_sysvsem = 1; |
| 1612 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1700 | err = unshare_fs(unshare_flags, &new_fs); |
| 1701 | if (err) | ||
| 1613 | goto bad_unshare_out; | 1702 | goto bad_unshare_out; |
| 1614 | if ((err = unshare_fd(unshare_flags, &new_fd))) | 1703 | err = unshare_fd(unshare_flags, &new_fd); |
| 1704 | if (err) | ||
| 1615 | goto bad_unshare_cleanup_fs; | 1705 | goto bad_unshare_cleanup_fs; |
| 1616 | if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, | 1706 | err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_fs); |
| 1617 | new_fs))) | 1707 | if (err) |
| 1618 | goto bad_unshare_cleanup_fd; | 1708 | goto bad_unshare_cleanup_fd; |
| 1619 | 1709 | ||
| 1620 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { | 1710 | if (new_fs || new_fd || do_sysvsem || new_nsproxy) { |
diff --git a/kernel/freezer.c b/kernel/freezer.c index 66ecd2ead215..7b01de98bb6a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
| @@ -17,7 +17,7 @@ static inline void frozen_process(void) | |||
| 17 | { | 17 | { |
| 18 | if (!unlikely(current->flags & PF_NOFREEZE)) { | 18 | if (!unlikely(current->flags & PF_NOFREEZE)) { |
| 19 | current->flags |= PF_FROZEN; | 19 | current->flags |= PF_FROZEN; |
| 20 | wmb(); | 20 | smp_wmb(); |
| 21 | } | 21 | } |
| 22 | clear_freeze_flag(current); | 22 | clear_freeze_flag(current); |
| 23 | } | 23 | } |
| @@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only) | |||
| 93 | * the task as frozen and next clears its TIF_FREEZE. | 93 | * the task as frozen and next clears its TIF_FREEZE. |
| 94 | */ | 94 | */ |
| 95 | if (!freezing(p)) { | 95 | if (!freezing(p)) { |
| 96 | rmb(); | 96 | smp_rmb(); |
| 97 | if (frozen(p)) | 97 | if (frozen(p)) |
| 98 | return false; | 98 | return false; |
| 99 | 99 | ||
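
For illustration only: the change above pairs an smp_wmb() in the writer with an smp_rmb() in the reader. A userspace analogue of that store/store vs. load/load pairing, sketched with C11 fences (build with -pthread; the variable names are invented for the example):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int payload;                 /* plays the role of PF_FROZEN */
    static atomic_int flag;             /* plays the role of TIF_FREEZE being cleared */

    static void *writer(void *arg)
    {
            payload = 1;
            atomic_thread_fence(memory_order_release);   /* ~ smp_wmb() */
            atomic_store_explicit(&flag, 1, memory_order_relaxed);
            return NULL;
    }

    static void *reader(void *arg)
    {
            while (!atomic_load_explicit(&flag, memory_order_relaxed))
                    ;                                    /* spin until flag is set */
            atomic_thread_fence(memory_order_acquire);   /* ~ smp_rmb() */
            printf("payload = %d\n", payload);           /* guaranteed to print 1 */
            return NULL;
    }

    int main(void)
    {
            pthread_t w, r;

            pthread_create(&r, NULL, reader, NULL);
            pthread_create(&w, NULL, writer, NULL);
            pthread_join(w, NULL);
            pthread_join(r, NULL);
            return 0;
    }
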
diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282eae..11cbe052b2e8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -218,6 +218,8 @@ static void drop_futex_key_refs(union futex_key *key) | |||
| 218 | * @uaddr: virtual address of the futex | 218 | * @uaddr: virtual address of the futex |
| 219 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED | 219 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
| 220 | * @key: address where result is stored. | 220 | * @key: address where result is stored. |
| 221 | * @rw: mapping needs to be read/write (values: VERIFY_READ, | ||
| 222 | * VERIFY_WRITE) | ||
| 221 | * | 223 | * |
| 222 | * Returns a negative error code or 0 | 224 | * Returns a negative error code or 0 |
| 223 | * The key words are stored in *key on success. | 225 | * The key words are stored in *key on success. |
| @@ -229,12 +231,12 @@ static void drop_futex_key_refs(union futex_key *key) | |||
| 229 | * lock_page() might sleep, the caller should not hold a spinlock. | 231 | * lock_page() might sleep, the caller should not hold a spinlock. |
| 230 | */ | 232 | */ |
| 231 | static int | 233 | static int |
| 232 | get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | 234 | get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) |
| 233 | { | 235 | { |
| 234 | unsigned long address = (unsigned long)uaddr; | 236 | unsigned long address = (unsigned long)uaddr; |
| 235 | struct mm_struct *mm = current->mm; | 237 | struct mm_struct *mm = current->mm; |
| 236 | struct page *page, *page_head; | 238 | struct page *page, *page_head; |
| 237 | int err; | 239 | int err, ro = 0; |
| 238 | 240 | ||
| 239 | /* | 241 | /* |
| 240 | * The futex address must be "naturally" aligned. | 242 | * The futex address must be "naturally" aligned. |
| @@ -262,8 +264,18 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) | |||
| 262 | 264 | ||
| 263 | again: | 265 | again: |
| 264 | err = get_user_pages_fast(address, 1, 1, &page); | 266 | err = get_user_pages_fast(address, 1, 1, &page); |
| 267 | /* | ||
| 268 | * If write access is not required (eg. FUTEX_WAIT), try | ||
| 269 | * and get read-only access. | ||
| 270 | */ | ||
| 271 | if (err == -EFAULT && rw == VERIFY_READ) { | ||
| 272 | err = get_user_pages_fast(address, 1, 0, &page); | ||
| 273 | ro = 1; | ||
| 274 | } | ||
| 265 | if (err < 0) | 275 | if (err < 0) |
| 266 | return err; | 276 | return err; |
| 277 | else | ||
| 278 | err = 0; | ||
| 267 | 279 | ||
| 268 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 280 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 269 | page_head = page; | 281 | page_head = page; |
| @@ -305,6 +317,13 @@ again: | |||
| 305 | if (!page_head->mapping) { | 317 | if (!page_head->mapping) { |
| 306 | unlock_page(page_head); | 318 | unlock_page(page_head); |
| 307 | put_page(page_head); | 319 | put_page(page_head); |
| 320 | /* | ||
| 321 | * ZERO_PAGE pages don't have a mapping. Avoid a busy loop | ||
| 322 | * trying to find one. RW mapping would have COW'd (and thus | ||
| 323 | * have a mapping) so this page is RO and won't ever change. | ||
| 324 | */ | ||
| 325 | if ((page_head == ZERO_PAGE(address))) | ||
| 326 | return -EFAULT; | ||
| 308 | goto again; | 327 | goto again; |
| 309 | } | 328 | } |
| 310 | 329 | ||
| @@ -316,6 +335,15 @@ again: | |||
| 316 | * the object not the particular process. | 335 | * the object not the particular process. |
| 317 | */ | 336 | */ |
| 318 | if (PageAnon(page_head)) { | 337 | if (PageAnon(page_head)) { |
| 338 | /* | ||
| 339 | * A RO anonymous page will never change and thus doesn't make | ||
| 340 | * sense for futex operations. | ||
| 341 | */ | ||
| 342 | if (ro) { | ||
| 343 | err = -EFAULT; | ||
| 344 | goto out; | ||
| 345 | } | ||
| 346 | |||
| 319 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ | 347 | key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ |
| 320 | key->private.mm = mm; | 348 | key->private.mm = mm; |
| 321 | key->private.address = address; | 349 | key->private.address = address; |
| @@ -327,9 +355,10 @@ again: | |||
| 327 | 355 | ||
| 328 | get_futex_key_refs(key); | 356 | get_futex_key_refs(key); |
| 329 | 357 | ||
| 358 | out: | ||
| 330 | unlock_page(page_head); | 359 | unlock_page(page_head); |
| 331 | put_page(page_head); | 360 | put_page(page_head); |
| 332 | return 0; | 361 | return err; |
| 333 | } | 362 | } |
| 334 | 363 | ||
| 335 | static inline void put_futex_key(union futex_key *key) | 364 | static inline void put_futex_key(union futex_key *key) |
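
For illustration only: with the VERIFY_READ path added above, FUTEX_WAIT can work on a mapping the caller can only read. A rough userspace demonstration, assuming the raw syscall interface from <sys/syscall.h> and <linux/futex.h>; the file path is a placeholder and error handling is omitted. The expectation is that the wait on the read-only mapping now times out instead of failing with -EFAULT:

    #include <errno.h>
    #include <fcntl.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/tmp/futex-page", O_RDWR | O_CREAT, 0600);
            uint32_t zero = 0;

            ftruncate(fd, sizeof(zero));
            pwrite(fd, &zero, sizeof(zero), 0);

            /* read-only shared mapping: FUTEX_WAIT only needs read access now */
            uint32_t *uaddr = mmap(NULL, sizeof(zero), PROT_READ,
                                   MAP_SHARED, fd, 0);

            struct timespec ts = { .tv_sec = 0, .tv_nsec = 10 * 1000 * 1000 };
            long ret = syscall(SYS_futex, uaddr, FUTEX_WAIT, 0, &ts, NULL, 0);

            printf("FUTEX_WAIT: ret=%ld errno=%d\n", ret, errno);
            return 0;
    }
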
| @@ -355,8 +384,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) | |||
| 355 | int ret; | 384 | int ret; |
| 356 | 385 | ||
| 357 | down_read(&mm->mmap_sem); | 386 | down_read(&mm->mmap_sem); |
| 358 | ret = get_user_pages(current, mm, (unsigned long)uaddr, | 387 | ret = fixup_user_fault(current, mm, (unsigned long)uaddr, |
| 359 | 1, 1, 0, NULL, NULL); | 388 | FAULT_FLAG_WRITE); |
| 360 | up_read(&mm->mmap_sem); | 389 | up_read(&mm->mmap_sem); |
| 361 | 390 | ||
| 362 | return ret < 0 ? ret : 0; | 391 | return ret < 0 ? ret : 0; |
| @@ -940,7 +969,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) | |||
| 940 | if (!bitset) | 969 | if (!bitset) |
| 941 | return -EINVAL; | 970 | return -EINVAL; |
| 942 | 971 | ||
| 943 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); | 972 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ); |
| 944 | if (unlikely(ret != 0)) | 973 | if (unlikely(ret != 0)) |
| 945 | goto out; | 974 | goto out; |
| 946 | 975 | ||
| @@ -986,10 +1015,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, | |||
| 986 | int ret, op_ret; | 1015 | int ret, op_ret; |
| 987 | 1016 | ||
| 988 | retry: | 1017 | retry: |
| 989 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); | 1018 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
| 990 | if (unlikely(ret != 0)) | 1019 | if (unlikely(ret != 0)) |
| 991 | goto out; | 1020 | goto out; |
| 992 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); | 1021 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
| 993 | if (unlikely(ret != 0)) | 1022 | if (unlikely(ret != 0)) |
| 994 | goto out_put_key1; | 1023 | goto out_put_key1; |
| 995 | 1024 | ||
| @@ -1243,10 +1272,11 @@ retry: | |||
| 1243 | pi_state = NULL; | 1272 | pi_state = NULL; |
| 1244 | } | 1273 | } |
| 1245 | 1274 | ||
| 1246 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1); | 1275 | ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
| 1247 | if (unlikely(ret != 0)) | 1276 | if (unlikely(ret != 0)) |
| 1248 | goto out; | 1277 | goto out; |
| 1249 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); | 1278 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, |
| 1279 | requeue_pi ? VERIFY_WRITE : VERIFY_READ); | ||
| 1250 | if (unlikely(ret != 0)) | 1280 | if (unlikely(ret != 0)) |
| 1251 | goto out_put_key1; | 1281 | goto out_put_key1; |
| 1252 | 1282 | ||
| @@ -1790,7 +1820,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, | |||
| 1790 | * while the syscall executes. | 1820 | * while the syscall executes. |
| 1791 | */ | 1821 | */ |
| 1792 | retry: | 1822 | retry: |
| 1793 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key); | 1823 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); |
| 1794 | if (unlikely(ret != 0)) | 1824 | if (unlikely(ret != 0)) |
| 1795 | return ret; | 1825 | return ret; |
| 1796 | 1826 | ||
| @@ -1941,7 +1971,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect, | |||
| 1941 | } | 1971 | } |
| 1942 | 1972 | ||
| 1943 | retry: | 1973 | retry: |
| 1944 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key); | 1974 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE); |
| 1945 | if (unlikely(ret != 0)) | 1975 | if (unlikely(ret != 0)) |
| 1946 | goto out; | 1976 | goto out; |
| 1947 | 1977 | ||
| @@ -2060,7 +2090,7 @@ retry: | |||
| 2060 | if ((uval & FUTEX_TID_MASK) != vpid) | 2090 | if ((uval & FUTEX_TID_MASK) != vpid) |
| 2061 | return -EPERM; | 2091 | return -EPERM; |
| 2062 | 2092 | ||
| 2063 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key); | 2093 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); |
| 2064 | if (unlikely(ret != 0)) | 2094 | if (unlikely(ret != 0)) |
| 2065 | goto out; | 2095 | goto out; |
| 2066 | 2096 | ||
| @@ -2249,7 +2279,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 2249 | debug_rt_mutex_init_waiter(&rt_waiter); | 2279 | debug_rt_mutex_init_waiter(&rt_waiter); |
| 2250 | rt_waiter.task = NULL; | 2280 | rt_waiter.task = NULL; |
| 2251 | 2281 | ||
| 2252 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2); | 2282 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
| 2253 | if (unlikely(ret != 0)) | 2283 | if (unlikely(ret != 0)) |
| 2254 | goto out; | 2284 | goto out; |
| 2255 | 2285 | ||
| @@ -2697,7 +2727,7 @@ static int __init futex_init(void) | |||
| 2697 | futex_cmpxchg_enabled = 1; | 2727 | futex_cmpxchg_enabled = 1; |
| 2698 | 2728 | ||
| 2699 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 2729 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
| 2700 | plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); | 2730 | plist_head_init(&futex_queues[i].chain); |
| 2701 | spin_lock_init(&futex_queues[i].lock); | 2731 | spin_lock_init(&futex_queues[i].lock); |
| 2702 | } | 2732 | } |
| 2703 | 2733 | ||
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index b8cadf70b1fb..a92028196cc1 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -2,7 +2,8 @@ menu "GCOV-based kernel profiling" | |||
| 2 | 2 | ||
| 3 | config GCOV_KERNEL | 3 | config GCOV_KERNEL |
| 4 | bool "Enable gcov-based kernel profiling" | 4 | bool "Enable gcov-based kernel profiling" |
| 5 | depends on DEBUG_FS && CONSTRUCTORS | 5 | depends on DEBUG_FS |
| 6 | select CONSTRUCTORS if !UML | ||
| 6 | default n | 7 | default n |
| 7 | ---help--- | 8 | ---help--- |
| 8 | This option enables gcov-based code profiling (e.g. for code coverage | 9 | This option enables gcov-based code profiling (e.g. for code coverage |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 87fdb3f8db14..a9205e32a059 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
| @@ -64,24 +64,27 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |||
| 64 | .clock_base = | 64 | .clock_base = |
| 65 | { | 65 | { |
| 66 | { | 66 | { |
| 67 | .index = CLOCK_REALTIME, | 67 | .index = HRTIMER_BASE_MONOTONIC, |
| 68 | .get_time = &ktime_get_real, | 68 | .clockid = CLOCK_MONOTONIC, |
| 69 | .get_time = &ktime_get, | ||
| 69 | .resolution = KTIME_LOW_RES, | 70 | .resolution = KTIME_LOW_RES, |
| 70 | }, | 71 | }, |
| 71 | { | 72 | { |
| 72 | .index = CLOCK_MONOTONIC, | 73 | .index = HRTIMER_BASE_REALTIME, |
| 73 | .get_time = &ktime_get, | 74 | .clockid = CLOCK_REALTIME, |
| 75 | .get_time = &ktime_get_real, | ||
| 74 | .resolution = KTIME_LOW_RES, | 76 | .resolution = KTIME_LOW_RES, |
| 75 | }, | 77 | }, |
| 76 | { | 78 | { |
| 77 | .index = CLOCK_BOOTTIME, | 79 | .index = HRTIMER_BASE_BOOTTIME, |
| 80 | .clockid = CLOCK_BOOTTIME, | ||
| 78 | .get_time = &ktime_get_boottime, | 81 | .get_time = &ktime_get_boottime, |
| 79 | .resolution = KTIME_LOW_RES, | 82 | .resolution = KTIME_LOW_RES, |
| 80 | }, | 83 | }, |
| 81 | } | 84 | } |
| 82 | }; | 85 | }; |
| 83 | 86 | ||
| 84 | static int hrtimer_clock_to_base_table[MAX_CLOCKS] = { | 87 | static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { |
| 85 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, | 88 | [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, |
| 86 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, | 89 | [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, |
| 87 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, | 90 | [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, |
| @@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, | |||
| 196 | struct hrtimer_cpu_base *new_cpu_base; | 199 | struct hrtimer_cpu_base *new_cpu_base; |
| 197 | int this_cpu = smp_processor_id(); | 200 | int this_cpu = smp_processor_id(); |
| 198 | int cpu = hrtimer_get_target(this_cpu, pinned); | 201 | int cpu = hrtimer_get_target(this_cpu, pinned); |
| 199 | int basenum = hrtimer_clockid_to_base(base->index); | 202 | int basenum = base->index; |
| 200 | 203 | ||
| 201 | again: | 204 | again: |
| 202 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); | 205 | new_cpu_base = &per_cpu(hrtimer_bases, cpu); |
| @@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, | |||
| 621 | return res; | 624 | return res; |
| 622 | } | 625 | } |
| 623 | 626 | ||
| 624 | |||
| 625 | /* | ||
| 626 | * Retrigger next event is called after clock was set | ||
| 627 | * | ||
| 628 | * Called with interrupts disabled via on_each_cpu() | ||
| 629 | */ | ||
| 630 | static void retrigger_next_event(void *arg) | ||
| 631 | { | ||
| 632 | struct hrtimer_cpu_base *base; | ||
| 633 | struct timespec realtime_offset, wtm, sleep; | ||
| 634 | |||
| 635 | if (!hrtimer_hres_active()) | ||
| 636 | return; | ||
| 637 | |||
| 638 | get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, | ||
| 639 | &sleep); | ||
| 640 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
| 641 | |||
| 642 | base = &__get_cpu_var(hrtimer_bases); | ||
| 643 | |||
| 644 | /* Adjust CLOCK_REALTIME offset */ | ||
| 645 | raw_spin_lock(&base->lock); | ||
| 646 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | ||
| 647 | timespec_to_ktime(realtime_offset); | ||
| 648 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
| 649 | timespec_to_ktime(sleep); | ||
| 650 | |||
| 651 | hrtimer_force_reprogram(base, 0); | ||
| 652 | raw_spin_unlock(&base->lock); | ||
| 653 | } | ||
| 654 | |||
| 655 | /* | ||
| 656 | * Clock realtime was set | ||
| 657 | * | ||
| 658 | * Change the offset of the realtime clock vs. the monotonic | ||
| 659 | * clock. | ||
| 660 | * | ||
| 661 | * We might have to reprogram the high resolution timer interrupt. On | ||
| 662 | * SMP we call the architecture specific code to retrigger _all_ high | ||
| 663 | * resolution timer interrupts. On UP we just disable interrupts and | ||
| 664 | * call the high resolution interrupt code. | ||
| 665 | */ | ||
| 666 | void clock_was_set(void) | ||
| 667 | { | ||
| 668 | /* Retrigger the CPU local events everywhere */ | ||
| 669 | on_each_cpu(retrigger_next_event, NULL, 1); | ||
| 670 | } | ||
| 671 | |||
| 672 | /* | ||
| 673 | * During resume we might have to reprogram the high resolution timer | ||
| 674 | * interrupt (on the local CPU): | ||
| 675 | */ | ||
| 676 | void hres_timers_resume(void) | ||
| 677 | { | ||
| 678 | WARN_ONCE(!irqs_disabled(), | ||
| 679 | KERN_INFO "hres_timers_resume() called with IRQs enabled!"); | ||
| 680 | |||
| 681 | retrigger_next_event(NULL); | ||
| 682 | } | ||
| 683 | |||
| 684 | /* | 627 | /* |
| 685 | * Initialize the high resolution related parts of cpu_base | 628 | * Initialize the high resolution related parts of cpu_base |
| 686 | */ | 629 | */ |
| @@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 715 | } | 658 | } |
| 716 | 659 | ||
| 717 | /* | 660 | /* |
| 661 | * Retrigger next event is called after clock was set | ||
| 662 | * | ||
| 663 | * Called with interrupts disabled via on_each_cpu() | ||
| 664 | */ | ||
| 665 | static void retrigger_next_event(void *arg) | ||
| 666 | { | ||
| 667 | struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); | ||
| 668 | struct timespec realtime_offset, xtim, wtm, sleep; | ||
| 669 | |||
| 670 | if (!hrtimer_hres_active()) | ||
| 671 | return; | ||
| 672 | |||
| 673 | /* Optimized out for !HIGH_RES */ | ||
| 674 | get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); | ||
| 675 | set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); | ||
| 676 | |||
| 677 | /* Adjust CLOCK_REALTIME offset */ | ||
| 678 | raw_spin_lock(&base->lock); | ||
| 679 | base->clock_base[HRTIMER_BASE_REALTIME].offset = | ||
| 680 | timespec_to_ktime(realtime_offset); | ||
| 681 | base->clock_base[HRTIMER_BASE_BOOTTIME].offset = | ||
| 682 | timespec_to_ktime(sleep); | ||
| 683 | |||
| 684 | hrtimer_force_reprogram(base, 0); | ||
| 685 | raw_spin_unlock(&base->lock); | ||
| 686 | } | ||
| 687 | |||
| 688 | /* | ||
| 718 | * Switch to high resolution mode | 689 | * Switch to high resolution mode |
| 719 | */ | 690 | */ |
| 720 | static int hrtimer_switch_to_hres(void) | 691 | static int hrtimer_switch_to_hres(void) |
| 721 | { | 692 | { |
| 722 | int cpu = smp_processor_id(); | 693 | int i, cpu = smp_processor_id(); |
| 723 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); | 694 | struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); |
| 724 | unsigned long flags; | 695 | unsigned long flags; |
| 725 | 696 | ||
| @@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void) | |||
| 735 | return 0; | 706 | return 0; |
| 736 | } | 707 | } |
| 737 | base->hres_active = 1; | 708 | base->hres_active = 1; |
| 738 | base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; | 709 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
| 739 | base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; | 710 | base->clock_base[i].resolution = KTIME_HIGH_RES; |
| 740 | base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; | ||
| 741 | 711 | ||
| 742 | tick_setup_sched_timer(); | 712 | tick_setup_sched_timer(); |
| 743 | 713 | ||
| @@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
| 761 | return 0; | 731 | return 0; |
| 762 | } | 732 | } |
| 763 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 733 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
| 734 | static inline void retrigger_next_event(void *arg) { } | ||
| 764 | 735 | ||
| 765 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 736 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
| 766 | 737 | ||
| 738 | /* | ||
| 739 | * Clock realtime was set | ||
| 740 | * | ||
| 741 | * Change the offset of the realtime clock vs. the monotonic | ||
| 742 | * clock. | ||
| 743 | * | ||
| 744 | * We might have to reprogram the high resolution timer interrupt. On | ||
| 745 | * SMP we call the architecture specific code to retrigger _all_ high | ||
| 746 | * resolution timer interrupts. On UP we just disable interrupts and | ||
| 747 | * call the high resolution interrupt code. | ||
| 748 | */ | ||
| 749 | void clock_was_set(void) | ||
| 750 | { | ||
| 751 | #ifdef CONFIG_HIGH_RES_TIMERS | ||
| 752 | /* Retrigger the CPU local events everywhere */ | ||
| 753 | on_each_cpu(retrigger_next_event, NULL, 1); | ||
| 754 | #endif | ||
| 755 | timerfd_clock_was_set(); | ||
| 756 | } | ||
| 757 | |||
| 758 | /* | ||
| 759 | * During resume we might have to reprogram the high resolution timer | ||
| 760 | * interrupt (on the local CPU): | ||
| 761 | */ | ||
| 762 | void hrtimers_resume(void) | ||
| 763 | { | ||
| 764 | WARN_ONCE(!irqs_disabled(), | ||
| 765 | KERN_INFO "hrtimers_resume() called with IRQs enabled!"); | ||
| 766 | |||
| 767 | retrigger_next_event(NULL); | ||
| 768 | timerfd_clock_was_set(); | ||
| 769 | } | ||
| 770 | |||
| 767 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) | 771 | static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) |
| 768 | { | 772 | { |
| 769 | #ifdef CONFIG_TIMER_STATS | 773 | #ifdef CONFIG_TIMER_STATS |
| @@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, | |||
| 856 | debug_activate(timer); | 860 | debug_activate(timer); |
| 857 | 861 | ||
| 858 | timerqueue_add(&base->active, &timer->node); | 862 | timerqueue_add(&base->active, &timer->node); |
| 863 | base->cpu_base->active_bases |= 1 << base->index; | ||
| 859 | 864 | ||
| 860 | /* | 865 | /* |
| 861 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the | 866 | * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the |
| @@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
| 897 | #endif | 902 | #endif |
| 898 | } | 903 | } |
| 899 | timerqueue_del(&base->active, &timer->node); | 904 | timerqueue_del(&base->active, &timer->node); |
| 905 | if (!timerqueue_getnext(&base->active)) | ||
| 906 | base->cpu_base->active_bases &= ~(1 << base->index); | ||
| 900 | out: | 907 | out: |
| 901 | timer->state = newstate; | 908 | timer->state = newstate; |
| 902 | } | 909 | } |
| @@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) | |||
| 1234 | void hrtimer_interrupt(struct clock_event_device *dev) | 1241 | void hrtimer_interrupt(struct clock_event_device *dev) |
| 1235 | { | 1242 | { |
| 1236 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1243 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
| 1237 | struct hrtimer_clock_base *base; | ||
| 1238 | ktime_t expires_next, now, entry_time, delta; | 1244 | ktime_t expires_next, now, entry_time, delta; |
| 1239 | int i, retries = 0; | 1245 | int i, retries = 0; |
| 1240 | 1246 | ||
| @@ -1256,12 +1262,15 @@ retry: | |||
| 1256 | */ | 1262 | */ |
| 1257 | cpu_base->expires_next.tv64 = KTIME_MAX; | 1263 | cpu_base->expires_next.tv64 = KTIME_MAX; |
| 1258 | 1264 | ||
| 1259 | base = cpu_base->clock_base; | ||
| 1260 | |||
| 1261 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 1265 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
| 1262 | ktime_t basenow; | 1266 | struct hrtimer_clock_base *base; |
| 1263 | struct timerqueue_node *node; | 1267 | struct timerqueue_node *node; |
| 1268 | ktime_t basenow; | ||
| 1269 | |||
| 1270 | if (!(cpu_base->active_bases & (1 << i))) | ||
| 1271 | continue; | ||
| 1264 | 1272 | ||
| 1273 | base = cpu_base->clock_base + i; | ||
| 1265 | basenow = ktime_add(now, base->offset); | 1274 | basenow = ktime_add(now, base->offset); |
| 1266 | 1275 | ||
| 1267 | while ((node = timerqueue_getnext(&base->active))) { | 1276 | while ((node = timerqueue_getnext(&base->active))) { |
| @@ -1294,7 +1303,6 @@ retry: | |||
| 1294 | 1303 | ||
| 1295 | __run_hrtimer(timer, &basenow); | 1304 | __run_hrtimer(timer, &basenow); |
| 1296 | } | 1305 | } |
| 1297 | base++; | ||
| 1298 | } | 1306 | } |
| 1299 | 1307 | ||
| 1300 | /* | 1308 | /* |
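
For illustration only: the interrupt loop above now consults cpu_base->active_bases so clock bases with no armed timers are skipped. A standalone sketch of the same bitmask bookkeeping, with invented names:

    #include <stdio.h>

    #define NR_BASES 3

    struct demo_base { int nr_timers; };

    struct demo_cpu {
            unsigned int active_bases;          /* bit i set <=> base i non-empty */
            struct demo_base base[NR_BASES];
    };

    static void add_timer(struct demo_cpu *cpu, int i)
    {
            cpu->base[i].nr_timers++;
            cpu->active_bases |= 1U << i;
    }

    static void remove_timer(struct demo_cpu *cpu, int i)
    {
            if (--cpu->base[i].nr_timers == 0)
                    cpu->active_bases &= ~(1U << i);
    }

    static void expire(struct demo_cpu *cpu)
    {
            for (int i = 0; i < NR_BASES; i++) {
                    if (!(cpu->active_bases & (1U << i)))
                            continue;           /* nothing queued, skip this base */
                    printf("scanning base %d (%d timers)\n", i, cpu->base[i].nr_timers);
            }
    }

    int main(void)
    {
            struct demo_cpu cpu = { 0 };

            add_timer(&cpu, 1);
            add_timer(&cpu, 1);
            remove_timer(&cpu, 1);
            add_timer(&cpu, 2);
            expire(&cpu);                       /* visits bases 1 and 2, skips base 0 */
            return 0;
    }
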
| @@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) | |||
| 1525 | struct timespec __user *rmtp; | 1533 | struct timespec __user *rmtp; |
| 1526 | int ret = 0; | 1534 | int ret = 0; |
| 1527 | 1535 | ||
| 1528 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, | 1536 | hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, |
| 1529 | HRTIMER_MODE_ABS); | 1537 | HRTIMER_MODE_ABS); |
| 1530 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); | 1538 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); |
| 1531 | 1539 | ||
| @@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, | |||
| 1577 | 1585 | ||
| 1578 | restart = ¤t_thread_info()->restart_block; | 1586 | restart = ¤t_thread_info()->restart_block; |
| 1579 | restart->fn = hrtimer_nanosleep_restart; | 1587 | restart->fn = hrtimer_nanosleep_restart; |
| 1580 | restart->nanosleep.index = t.timer.base->index; | 1588 | restart->nanosleep.clockid = t.timer.base->clockid; |
| 1581 | restart->nanosleep.rmtp = rmtp; | 1589 | restart->nanosleep.rmtp = rmtp; |
| 1582 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); | 1590 | restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); |
| 1583 | 1591 | ||
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 53ead174da2f..ea640120ab86 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; | |||
| 33 | /* | 33 | /* |
| 34 | * Zero means infinite timeout - no checking done: | 34 | * Zero means infinite timeout - no checking done: |
| 35 | */ | 35 | */ |
| 36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | 36 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; |
| 37 | 37 | ||
| 38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | 38 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; |
| 39 | 39 | ||
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index c574f9a12c48..5a38bf4de641 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -48,6 +48,14 @@ config IRQ_PREFLOW_FASTEOI | |||
| 48 | config IRQ_EDGE_EOI_HANDLER | 48 | config IRQ_EDGE_EOI_HANDLER |
| 49 | bool | 49 | bool |
| 50 | 50 | ||
| 51 | # Generic configurable interrupt chip implementation | ||
| 52 | config GENERIC_IRQ_CHIP | ||
| 53 | bool | ||
| 54 | |||
| 55 | # Generic irq_domain hw <--> linux irq number translation | ||
| 56 | config IRQ_DOMAIN | ||
| 57 | bool | ||
| 58 | |||
| 51 | # Support forced irq threading | 59 | # Support forced irq threading |
| 52 | config IRQ_FORCED_THREADING | 60 | config IRQ_FORCED_THREADING |
| 53 | bool | 61 | bool |
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 54329cd7b3ee..fff17381f0af 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -1,6 +1,8 @@ | |||
| 1 | 1 | ||
| 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o | 2 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
| 3 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | ||
| 3 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 4 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
| 5 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | ||
| 4 | obj-$(CONFIG_PROC_FS) += proc.o | 6 | obj-$(CONFIG_PROC_FS) += proc.o |
| 5 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o | 7 | obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o |
| 6 | obj-$(CONFIG_PM_SLEEP) += pm.o | 8 | obj-$(CONFIG_PM_SLEEP) += pm.o |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 4af1e2b244cb..d5a3009da71a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -310,6 +310,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
| 310 | out_unlock: | 310 | out_unlock: |
| 311 | raw_spin_unlock(&desc->lock); | 311 | raw_spin_unlock(&desc->lock); |
| 312 | } | 312 | } |
| 313 | EXPORT_SYMBOL_GPL(handle_simple_irq); | ||
| 313 | 314 | ||
| 314 | /** | 315 | /** |
| 315 | * handle_level_irq - Level type irq handler | 316 | * handle_level_irq - Level type irq handler |
| @@ -573,6 +574,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
| 573 | if (handle != handle_bad_irq && is_chained) { | 574 | if (handle != handle_bad_irq && is_chained) { |
| 574 | irq_settings_set_noprobe(desc); | 575 | irq_settings_set_noprobe(desc); |
| 575 | irq_settings_set_norequest(desc); | 576 | irq_settings_set_norequest(desc); |
| 577 | irq_settings_set_nothread(desc); | ||
| 576 | irq_startup(desc); | 578 | irq_startup(desc); |
| 577 | } | 579 | } |
| 578 | out: | 580 | out: |
| @@ -612,6 +614,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | |||
| 612 | 614 | ||
| 613 | irq_put_desc_unlock(desc, flags); | 615 | irq_put_desc_unlock(desc, flags); |
| 614 | } | 616 | } |
| 617 | EXPORT_SYMBOL_GPL(irq_modify_status); | ||
| 615 | 618 | ||
| 616 | /** | 619 | /** |
| 617 | * irq_cpu_online - Invoke all irq_cpu_online functions. | 620 | * irq_cpu_online - Invoke all irq_cpu_online functions. |
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 306cba37e9a5..97a8bfadc88a 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h | |||
| @@ -27,6 +27,7 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) | |||
| 27 | P(IRQ_PER_CPU); | 27 | P(IRQ_PER_CPU); |
| 28 | P(IRQ_NOPROBE); | 28 | P(IRQ_NOPROBE); |
| 29 | P(IRQ_NOREQUEST); | 29 | P(IRQ_NOREQUEST); |
| 30 | P(IRQ_NOTHREAD); | ||
| 30 | P(IRQ_NOAUTOEN); | 31 | P(IRQ_NOAUTOEN); |
| 31 | 32 | ||
| 32 | PS(IRQS_AUTODETECT); | 33 | PS(IRQS_AUTODETECT); |
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 1ef4ffcdfa55..bd8e788d71e0 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c | |||
| @@ -87,8 +87,8 @@ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) | |||
| 87 | { | 87 | { |
| 88 | struct irq_devres match_data = { irq, dev_id }; | 88 | struct irq_devres match_data = { irq, dev_id }; |
| 89 | 89 | ||
| 90 | free_irq(irq, dev_id); | ||
| 91 | WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, | 90 | WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, |
| 92 | &match_data)); | 91 | &match_data)); |
| 92 | free_irq(irq, dev_id); | ||
| 93 | } | 93 | } |
| 94 | EXPORT_SYMBOL(devm_free_irq); | 94 | EXPORT_SYMBOL(devm_free_irq); |
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c new file mode 100644 index 000000000000..3a2cab407b93 --- /dev/null +++ b/kernel/irq/generic-chip.c | |||
| @@ -0,0 +1,368 @@ | |||
| 1 | /* | ||
| 2 | * Library implementing the most common irq chip callback functions | ||
| 3 | * | ||
| 4 | * Copyright (C) 2011, Thomas Gleixner | ||
| 5 | */ | ||
| 6 | #include <linux/io.h> | ||
| 7 | #include <linux/irq.h> | ||
| 8 | #include <linux/slab.h> | ||
| 9 | #include <linux/interrupt.h> | ||
| 10 | #include <linux/kernel_stat.h> | ||
| 11 | #include <linux/syscore_ops.h> | ||
| 12 | |||
| 13 | #include "internals.h" | ||
| 14 | |||
| 15 | static LIST_HEAD(gc_list); | ||
| 16 | static DEFINE_RAW_SPINLOCK(gc_lock); | ||
| 17 | |||
| 18 | static inline struct irq_chip_regs *cur_regs(struct irq_data *d) | ||
| 19 | { | ||
| 20 | return &container_of(d->chip, struct irq_chip_type, chip)->regs; | ||
| 21 | } | ||
| 22 | |||
| 23 | /** | ||
| 24 | * irq_gc_noop - NOOP function | ||
| 25 | * @d: irq_data | ||
| 26 | */ | ||
| 27 | void irq_gc_noop(struct irq_data *d) | ||
| 28 | { | ||
| 29 | } | ||
| 30 | |||
| 31 | /** | ||
| 32 | * irq_gc_mask_disable_reg - Mask chip via disable register | ||
| 33 | * @d: irq_data | ||
| 34 | * | ||
| 35 | * Chip has separate enable/disable registers instead of a single mask | ||
| 36 | * register. | ||
| 37 | */ | ||
| 38 | void irq_gc_mask_disable_reg(struct irq_data *d) | ||
| 39 | { | ||
| 40 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 41 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 42 | |||
| 43 | irq_gc_lock(gc); | ||
| 44 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); | ||
| 45 | gc->mask_cache &= ~mask; | ||
| 46 | irq_gc_unlock(gc); | ||
| 47 | } | ||
| 48 | |||
| 49 | /** | ||
| 50 | * irq_gc_mask_set_bit - Mask chip via setting bit in mask register | ||
| 51 | * @d: irq_data | ||
| 52 | * | ||
| 53 | * Chip has a single mask register. Values of this register are cached | ||
| 54 | * and protected by gc->lock | ||
| 55 | */ | ||
| 56 | void irq_gc_mask_set_bit(struct irq_data *d) | ||
| 57 | { | ||
| 58 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 59 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 60 | |||
| 61 | irq_gc_lock(gc); | ||
| 62 | gc->mask_cache |= mask; | ||
| 63 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
| 64 | irq_gc_unlock(gc); | ||
| 65 | } | ||
| 66 | |||
| 67 | /** | ||
| 68 | * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register | ||
| 69 | * @d: irq_data | ||
| 70 | * | ||
| 71 | * Chip has a single mask register. Values of this register are cached | ||
| 72 | * and protected by gc->lock | ||
| 73 | */ | ||
| 74 | void irq_gc_mask_clr_bit(struct irq_data *d) | ||
| 75 | { | ||
| 76 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 77 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 78 | |||
| 79 | irq_gc_lock(gc); | ||
| 80 | gc->mask_cache &= ~mask; | ||
| 81 | irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); | ||
| 82 | irq_gc_unlock(gc); | ||
| 83 | } | ||
| 84 | |||
| 85 | /** | ||
| 86 | * irq_gc_unmask_enable_reg - Unmask chip via enable register | ||
| 87 | * @d: irq_data | ||
| 88 | * | ||
| 89 | * Chip has separate enable/disable registers instead of a single mask | ||
| 90 | * register. | ||
| 91 | */ | ||
| 92 | void irq_gc_unmask_enable_reg(struct irq_data *d) | ||
| 93 | { | ||
| 94 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 95 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 96 | |||
| 97 | irq_gc_lock(gc); | ||
| 98 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); | ||
| 99 | gc->mask_cache |= mask; | ||
| 100 | irq_gc_unlock(gc); | ||
| 101 | } | ||
| 102 | |||
| 103 | /** | ||
| 104 | * irq_gc_ack_set_bit - Ack pending interrupt via setting bit | ||
| 105 | * @d: irq_data | ||
| 106 | */ | ||
| 107 | void irq_gc_ack_set_bit(struct irq_data *d) | ||
| 108 | { | ||
| 109 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 110 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 111 | |||
| 112 | irq_gc_lock(gc); | ||
| 113 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
| 114 | irq_gc_unlock(gc); | ||
| 115 | } | ||
| 116 | |||
| 117 | /** | ||
| 118 | * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit | ||
| 119 | * @d: irq_data | ||
| 120 | */ | ||
| 121 | void irq_gc_ack_clr_bit(struct irq_data *d) | ||
| 122 | { | ||
| 123 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 124 | u32 mask = ~(1 << (d->irq - gc->irq_base)); | ||
| 125 | |||
| 126 | irq_gc_lock(gc); | ||
| 127 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
| 128 | irq_gc_unlock(gc); | ||
| 129 | } | ||
| 130 | |||
| 131 | /** | ||
| 132 | * irq_gc_mask_disable_reg_and_ack - Mask and ack pending interrupt | ||
| 133 | * @d: irq_data | ||
| 134 | */ | ||
| 135 | void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) | ||
| 136 | { | ||
| 137 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 138 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 139 | |||
| 140 | irq_gc_lock(gc); | ||
| 141 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask); | ||
| 142 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); | ||
| 143 | irq_gc_unlock(gc); | ||
| 144 | } | ||
| 145 | |||
| 146 | /** | ||
| 147 | * irq_gc_eoi - EOI interrupt | ||
| 148 | * @d: irq_data | ||
| 149 | */ | ||
| 150 | void irq_gc_eoi(struct irq_data *d) | ||
| 151 | { | ||
| 152 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 153 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 154 | |||
| 155 | irq_gc_lock(gc); | ||
| 156 | irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); | ||
| 157 | irq_gc_unlock(gc); | ||
| 158 | } | ||
| 159 | |||
| 160 | /** | ||
| 161 | * irq_gc_set_wake - Set/clr wake bit for an interrupt | ||
| 162 | * @d: irq_data | ||
| 163 | * | ||
| 164 | * For chips where the wake from suspend functionality is not | ||
| 165 | * configured in a separate register and the wakeup active state is | ||
| 166 | * just stored in a bitmask. | ||
| 167 | */ | ||
| 168 | int irq_gc_set_wake(struct irq_data *d, unsigned int on) | ||
| 169 | { | ||
| 170 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 171 | u32 mask = 1 << (d->irq - gc->irq_base); | ||
| 172 | |||
| 173 | if (!(mask & gc->wake_enabled)) | ||
| 174 | return -EINVAL; | ||
| 175 | |||
| 176 | irq_gc_lock(gc); | ||
| 177 | if (on) | ||
| 178 | gc->wake_active |= mask; | ||
| 179 | else | ||
| 180 | gc->wake_active &= ~mask; | ||
| 181 | irq_gc_unlock(gc); | ||
| 182 | return 0; | ||
| 183 | } | ||
| 184 | |||
| 185 | /** | ||
| 186 | * irq_alloc_generic_chip - Allocate a generic chip and initialize it | ||
| 187 | * @name: Name of the irq chip | ||
| 188 | * @num_ct: Number of irq_chip_type instances associated with this chip | ||
| 189 | * @irq_base: Interrupt base nr for this chip | ||
| 190 | * @reg_base: Register base address (virtual) | ||
| 191 | * @handler: Default flow handler associated with this chip | ||
| 192 | * | ||
| 193 | * Returns an initialized irq_chip_generic structure. The chip defaults | ||
| 194 | * to the primary (index 0) irq_chip_type and @handler | ||
| 195 | */ | ||
| 196 | struct irq_chip_generic * | ||
| 197 | irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, | ||
| 198 | void __iomem *reg_base, irq_flow_handler_t handler) | ||
| 199 | { | ||
| 200 | struct irq_chip_generic *gc; | ||
| 201 | unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); | ||
| 202 | |||
| 203 | gc = kzalloc(sz, GFP_KERNEL); | ||
| 204 | if (gc) { | ||
| 205 | raw_spin_lock_init(&gc->lock); | ||
| 206 | gc->num_ct = num_ct; | ||
| 207 | gc->irq_base = irq_base; | ||
| 208 | gc->reg_base = reg_base; | ||
| 209 | gc->chip_types->chip.name = name; | ||
| 210 | gc->chip_types->handler = handler; | ||
| 211 | } | ||
| 212 | return gc; | ||
| 213 | } | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Separate lockdep class for interrupt chip which can nest irq_desc | ||
| 217 | * lock. | ||
| 218 | */ | ||
| 219 | static struct lock_class_key irq_nested_lock_class; | ||
| 220 | |||
| 221 | /** | ||
| 222 | * irq_setup_generic_chip - Setup a range of interrupts with a generic chip | ||
| 223 | * @gc: Generic irq chip holding all data | ||
| 224 | * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base | ||
| 225 | * @flags: Flags for initialization | ||
| 226 | * @clr: IRQ_* bits to clear | ||
| 227 | * @set: IRQ_* bits to set | ||
| 228 | * | ||
| 229 | * Set up max. 32 interrupts starting from gc->irq_base. Note, this | ||
| 230 | * initializes all interrupts to the primary irq_chip_type and its | ||
| 231 | * associated handler. | ||
| 232 | */ | ||
| 233 | void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
| 234 | enum irq_gc_flags flags, unsigned int clr, | ||
| 235 | unsigned int set) | ||
| 236 | { | ||
| 237 | struct irq_chip_type *ct = gc->chip_types; | ||
| 238 | unsigned int i; | ||
| 239 | |||
| 240 | raw_spin_lock(&gc_lock); | ||
| 241 | list_add_tail(&gc->list, &gc_list); | ||
| 242 | raw_spin_unlock(&gc_lock); | ||
| 243 | |||
| 244 | /* Init mask cache ? */ | ||
| 245 | if (flags & IRQ_GC_INIT_MASK_CACHE) | ||
| 246 | gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); | ||
| 247 | |||
| 248 | for (i = gc->irq_base; msk; msk >>= 1, i++) { | ||
| 249 | if (!(msk & 0x01)) | ||
| 250 | continue; | ||
| 251 | |||
| 252 | if (flags & IRQ_GC_INIT_NESTED_LOCK) | ||
| 253 | irq_set_lockdep_class(i, &irq_nested_lock_class); | ||
| 254 | |||
| 255 | irq_set_chip_and_handler(i, &ct->chip, ct->handler); | ||
| 256 | irq_set_chip_data(i, gc); | ||
| 257 | irq_modify_status(i, clr, set); | ||
| 258 | } | ||
| 259 | gc->irq_cnt = i - gc->irq_base; | ||
| 260 | } | ||
| 261 | |||
| 262 | /** | ||
| 263 | * irq_setup_alt_chip - Switch to alternative chip | ||
| 264 | * @d: irq_data for this interrupt | ||
| 265 | * @type: Flow type to be initialized | ||
| 266 | * | ||
| 267 | * Only to be called from chip->irq_set_type() callbacks. | ||
| 268 | */ | ||
| 269 | int irq_setup_alt_chip(struct irq_data *d, unsigned int type) | ||
| 270 | { | ||
| 271 | struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); | ||
| 272 | struct irq_chip_type *ct = gc->chip_types; | ||
| 273 | unsigned int i; | ||
| 274 | |||
| 275 | for (i = 0; i < gc->num_ct; i++, ct++) { | ||
| 276 | if (ct->type & type) { | ||
| 277 | d->chip = &ct->chip; | ||
| 278 | irq_data_to_desc(d)->handle_irq = ct->handler; | ||
| 279 | return 0; | ||
| 280 | } | ||
| 281 | } | ||
| 282 | return -EINVAL; | ||
| 283 | } | ||
| 284 | |||
| 285 | /** | ||
| 286 | * irq_remove_generic_chip - Remove a chip | ||
| 287 | * @gc: Generic irq chip holding all data | ||
| 288 | * @msk: Bitmask holding the irqs to remove relative to gc->irq_base | ||
| 289 | * @clr: IRQ_* bits to clear | ||
| 290 | * @set: IRQ_* bits to set | ||
| 291 | * | ||
| 292 | * Remove up to 32 interrupts starting from gc->irq_base. | ||
| 293 | */ | ||
| 294 | void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, | ||
| 295 | unsigned int clr, unsigned int set) | ||
| 296 | { | ||
| 297 | unsigned int i = gc->irq_base; | ||
| 298 | |||
| 299 | raw_spin_lock(&gc_lock); | ||
| 300 | list_del(&gc->list); | ||
| 301 | raw_spin_unlock(&gc_lock); | ||
| 302 | |||
| 303 | for (; msk; msk >>= 1, i++) { | ||
| 304 | if (!(msk & 0x01)) | ||
| 305 | continue; | ||
| 306 | |||
| 307 | /* Remove handler first. That will mask the irq line */ | ||
| 308 | irq_set_handler(i, NULL); | ||
| 309 | irq_set_chip(i, &no_irq_chip); | ||
| 310 | irq_set_chip_data(i, NULL); | ||
| 311 | irq_modify_status(i, clr, set); | ||
| 312 | } | ||
| 313 | } | ||
| 314 | |||
| 315 | #ifdef CONFIG_PM | ||
| 316 | static int irq_gc_suspend(void) | ||
| 317 | { | ||
| 318 | struct irq_chip_generic *gc; | ||
| 319 | |||
| 320 | list_for_each_entry(gc, &gc_list, list) { | ||
| 321 | struct irq_chip_type *ct = gc->chip_types; | ||
| 322 | |||
| 323 | if (ct->chip.irq_suspend) | ||
| 324 | ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); | ||
| 325 | } | ||
| 326 | return 0; | ||
| 327 | } | ||
| 328 | |||
| 329 | static void irq_gc_resume(void) | ||
| 330 | { | ||
| 331 | struct irq_chip_generic *gc; | ||
| 332 | |||
| 333 | list_for_each_entry(gc, &gc_list, list) { | ||
| 334 | struct irq_chip_type *ct = gc->chip_types; | ||
| 335 | |||
| 336 | if (ct->chip.irq_resume) | ||
| 337 | ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); | ||
| 338 | } | ||
| 339 | } | ||
| 340 | #else | ||
| 341 | #define irq_gc_suspend NULL | ||
| 342 | #define irq_gc_resume NULL | ||
| 343 | #endif | ||
| 344 | |||
| 345 | static void irq_gc_shutdown(void) | ||
| 346 | { | ||
| 347 | struct irq_chip_generic *gc; | ||
| 348 | |||
| 349 | list_for_each_entry(gc, &gc_list, list) { | ||
| 350 | struct irq_chip_type *ct = gc->chip_types; | ||
| 351 | |||
| 352 | if (ct->chip.irq_pm_shutdown) | ||
| 353 | ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); | ||
| 354 | } | ||
| 355 | } | ||
| 356 | |||
| 357 | static struct syscore_ops irq_gc_syscore_ops = { | ||
| 358 | .suspend = irq_gc_suspend, | ||
| 359 | .resume = irq_gc_resume, | ||
| 360 | .shutdown = irq_gc_shutdown, | ||
| 361 | }; | ||
| 362 | |||
| 363 | static int __init irq_gc_init_ops(void) | ||
| 364 | { | ||
| 365 | register_syscore_ops(&irq_gc_syscore_ops); | ||
| 366 | return 0; | ||
| 367 | } | ||
| 368 | device_initcall(irq_gc_init_ops); | ||
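The kernel-doc above describes the new generic-chip helpers individually; the sketch below shows one plausible driver-side calling sequence. The "foo" controller, its register offsets, IRQ base and flag choices are invented for illustration and are not taken from this patch:

```c
#include <linux/init.h>
#include <linux/io.h>
#include <linux/irq.h>

#define FOO_IRQ_BASE	64
#define FOO_NR_IRQS	32

static void __init foo_init_irq(void __iomem *regs)
{
	struct irq_chip_generic *gc;
	struct irq_chip_type *ct;

	gc = irq_alloc_generic_chip("foo-intc", 1, FOO_IRQ_BASE,
				    regs, handle_level_irq);
	if (!gc)
		return;

	ct = gc->chip_types;
	ct->regs.mask	    = 0x04;	/* assumed mask register offset */
	ct->regs.ack	    = 0x08;	/* assumed ack register offset */
	ct->chip.irq_ack    = irq_gc_ack_set_bit;
	ct->chip.irq_mask   = irq_gc_mask_set_bit;
	ct->chip.irq_unmask = irq_gc_mask_clr_bit;

	/* Wire up all 32 lines and seed the mask cache from the hardware. */
	irq_setup_generic_chip(gc, IRQ_MSK(FOO_NR_IRQS),
			       IRQ_GC_INIT_MASK_CACHE, IRQ_NOREQUEST, 0);
}
```

With num_ct greater than one, the additional irq_chip_type entries can describe alternative flow handling (for example level vs. edge), and irq_setup_alt_chip() switches between them from the irq_set_type() callback.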
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 90cb55f6d7eb..470d08c82bbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) | |||
| 133 | switch (res) { | 133 | switch (res) { |
| 134 | case IRQ_WAKE_THREAD: | 134 | case IRQ_WAKE_THREAD: |
| 135 | /* | 135 | /* |
| 136 | * Set result to handled so the spurious check | ||
| 137 | * does not trigger. | ||
| 138 | */ | ||
| 139 | res = IRQ_HANDLED; | ||
| 140 | |||
| 141 | /* | ||
| 142 | * Catch drivers which return WAKE_THREAD but | 136 | * Catch drivers which return WAKE_THREAD but |
| 143 | * did not set up a thread function | 137 | * did not set up a thread function |
| 144 | */ | 138 | */ |
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2c039c9b9383..4c60a50e66b2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | */ | 22 | */ |
| 23 | static struct lock_class_key irq_desc_lock_class; | 23 | static struct lock_class_key irq_desc_lock_class; |
| 24 | 24 | ||
| 25 | #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) | 25 | #if defined(CONFIG_SMP) |
| 26 | static void __init init_irq_default_affinity(void) | 26 | static void __init init_irq_default_affinity(void) |
| 27 | { | 27 | { |
| 28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); | 28 | alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); |
| @@ -257,13 +257,11 @@ int __init early_irq_init(void) | |||
| 257 | count = ARRAY_SIZE(irq_desc); | 257 | count = ARRAY_SIZE(irq_desc); |
| 258 | 258 | ||
| 259 | for (i = 0; i < count; i++) { | 259 | for (i = 0; i < count; i++) { |
| 260 | desc[i].irq_data.irq = i; | ||
| 261 | desc[i].irq_data.chip = &no_irq_chip; | ||
| 262 | desc[i].kstat_irqs = alloc_percpu(unsigned int); | 260 | desc[i].kstat_irqs = alloc_percpu(unsigned int); |
| 263 | irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); | 261 | alloc_masks(&desc[i], GFP_KERNEL, node); |
| 264 | alloc_masks(desc + i, GFP_KERNEL, node); | 262 | raw_spin_lock_init(&desc[i].lock); |
| 265 | desc_smp_init(desc + i, node); | ||
| 266 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); | 263 | lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); |
| 264 | desc_set_defaults(i, &desc[i], node); | ||
| 267 | } | 265 | } |
| 268 | return arch_early_irq_init(); | 266 | return arch_early_irq_init(); |
| 269 | } | 267 | } |
| @@ -290,6 +288,22 @@ static int irq_expand_nr_irqs(unsigned int nr) | |||
| 290 | 288 | ||
| 291 | #endif /* !CONFIG_SPARSE_IRQ */ | 289 | #endif /* !CONFIG_SPARSE_IRQ */ |
| 292 | 290 | ||
| 291 | /** | ||
| 292 | * generic_handle_irq - Invoke the handler for a particular irq | ||
| 293 | * @irq: The irq number to handle | ||
| 294 | * | ||
| 295 | */ | ||
| 296 | int generic_handle_irq(unsigned int irq) | ||
| 297 | { | ||
| 298 | struct irq_desc *desc = irq_to_desc(irq); | ||
| 299 | |||
| 300 | if (!desc) | ||
| 301 | return -EINVAL; | ||
| 302 | generic_handle_irq_desc(irq, desc); | ||
| 303 | return 0; | ||
| 304 | } | ||
| 305 | EXPORT_SYMBOL_GPL(generic_handle_irq); | ||
| 306 | |||
| 293 | /* Dynamic interrupt handling */ | 307 | /* Dynamic interrupt handling */ |
| 294 | 308 | ||
| 295 | /** | 309 | /** |
| @@ -311,6 +325,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) | |||
| 311 | bitmap_clear(allocated_irqs, from, cnt); | 325 | bitmap_clear(allocated_irqs, from, cnt); |
| 312 | mutex_unlock(&sparse_irq_lock); | 326 | mutex_unlock(&sparse_irq_lock); |
| 313 | } | 327 | } |
| 328 | EXPORT_SYMBOL_GPL(irq_free_descs); | ||
| 314 | 329 | ||
| 315 | /** | 330 | /** |
| 316 | * irq_alloc_descs - allocate and initialize a range of irq descriptors | 331 | * irq_alloc_descs - allocate and initialize a range of irq descriptors |
| @@ -329,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) | |||
| 329 | if (!cnt) | 344 | if (!cnt) |
| 330 | return -EINVAL; | 345 | return -EINVAL; |
| 331 | 346 | ||
| 347 | if (irq >= 0) { | ||
| 348 | if (from > irq) | ||
| 349 | return -EINVAL; | ||
| 350 | from = irq; | ||
| 351 | } | ||
| 352 | |||
| 332 | mutex_lock(&sparse_irq_lock); | 353 | mutex_lock(&sparse_irq_lock); |
| 333 | 354 | ||
| 334 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, | 355 | start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, |
| @@ -351,6 +372,7 @@ err: | |||
| 351 | mutex_unlock(&sparse_irq_lock); | 372 | mutex_unlock(&sparse_irq_lock); |
| 352 | return ret; | 373 | return ret; |
| 353 | } | 374 | } |
| 375 | EXPORT_SYMBOL_GPL(irq_alloc_descs); | ||
| 354 | 376 | ||
| 355 | /** | 377 | /** |
| 356 | * irq_reserve_irqs - mark irqs allocated | 378 | * irq_reserve_irqs - mark irqs allocated |
| @@ -430,7 +452,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | |||
| 430 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | 452 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; |
| 431 | } | 453 | } |
| 432 | 454 | ||
| 433 | #ifdef CONFIG_GENERIC_HARDIRQS | ||
| 434 | unsigned int kstat_irqs(unsigned int irq) | 455 | unsigned int kstat_irqs(unsigned int irq) |
| 435 | { | 456 | { |
| 436 | struct irq_desc *desc = irq_to_desc(irq); | 457 | struct irq_desc *desc = irq_to_desc(irq); |
| @@ -443,4 +464,3 @@ unsigned int kstat_irqs(unsigned int irq) | |||
| 443 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); | 464 | sum += *per_cpu_ptr(desc->kstat_irqs, cpu); |
| 444 | return sum; | 465 | return sum; |
| 445 | } | 466 | } |
| 446 | #endif /* CONFIG_GENERIC_HARDIRQS */ | ||
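With generic_handle_irq(), irq_alloc_descs() and irq_free_descs() now exported, a modular interrupt-controller driver can manage its own descriptor range. A hedged sketch (the block size of 8, node 0 and the "foo" names are illustrative):

```c
#include <linux/irq.h>

static int foo_irq_base = -1;

static int foo_alloc_irq_block(void)
{
	/* Any free contiguous range of 8 descriptors, allocated on node 0. */
	foo_irq_base = irq_alloc_descs(-1, 0, 8, 0);
	return foo_irq_base < 0 ? foo_irq_base : 0;
}

static void foo_free_irq_block(void)
{
	if (foo_irq_base >= 0)
		irq_free_descs(foo_irq_base, 8);
}
```

Passing a non-negative first argument instead of -1 requests that exact starting irq; the new sanity check in irq_alloc_descs() rejects a "from" value above it.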
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c new file mode 100644 index 000000000000..d5828da3fd38 --- /dev/null +++ b/kernel/irq/irqdomain.c | |||
| @@ -0,0 +1,180 @@ | |||
| 1 | #include <linux/irq.h> | ||
| 2 | #include <linux/irqdomain.h> | ||
| 3 | #include <linux/module.h> | ||
| 4 | #include <linux/mutex.h> | ||
| 5 | #include <linux/of.h> | ||
| 6 | #include <linux/of_address.h> | ||
| 7 | #include <linux/slab.h> | ||
| 8 | |||
| 9 | static LIST_HEAD(irq_domain_list); | ||
| 10 | static DEFINE_MUTEX(irq_domain_mutex); | ||
| 11 | |||
| 12 | /** | ||
| 13 | * irq_domain_add() - Register an irq_domain | ||
| 14 | * @domain: ptr to initialized irq_domain structure | ||
| 15 | * | ||
| 16 | * Registers an irq_domain structure. The irq_domain must at a minimum be | ||
| 17 | * initialized with an ops structure pointer, and either a ->to_irq hook or | ||
| 18 | * a valid irq_base value. Everything else is optional. | ||
| 19 | */ | ||
| 20 | void irq_domain_add(struct irq_domain *domain) | ||
| 21 | { | ||
| 22 | struct irq_data *d; | ||
| 23 | int hwirq; | ||
| 24 | |||
| 25 | /* | ||
| 26 | * This assumes that the irq_domain owner has already allocated | ||
| 27 | * the irq_descs. This block will be removed when support for dynamic | ||
| 28 | * allocation of irq_descs is added to irq_domain. | ||
| 29 | */ | ||
| 30 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | ||
| 31 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | ||
| 32 | if (!d || d->domain) { | ||
| 33 | /* things are broken; just report, don't clean up */ | ||
| 34 | WARN(1, "error: irq_desc already assigned to a domain"); | ||
| 35 | return; | ||
| 36 | } | ||
| 37 | d->domain = domain; | ||
| 38 | d->hwirq = hwirq; | ||
| 39 | } | ||
| 40 | |||
| 41 | mutex_lock(&irq_domain_mutex); | ||
| 42 | list_add(&domain->list, &irq_domain_list); | ||
| 43 | mutex_unlock(&irq_domain_mutex); | ||
| 44 | } | ||
| 45 | |||
| 46 | /** | ||
| 47 | * irq_domain_del() - Unregister an irq_domain | ||
| 48 | * @domain: ptr to registered irq_domain. | ||
| 49 | */ | ||
| 50 | void irq_domain_del(struct irq_domain *domain) | ||
| 51 | { | ||
| 52 | struct irq_data *d; | ||
| 53 | int hwirq; | ||
| 54 | |||
| 55 | mutex_lock(&irq_domain_mutex); | ||
| 56 | list_del(&domain->list); | ||
| 57 | mutex_unlock(&irq_domain_mutex); | ||
| 58 | |||
| 59 | /* Clear the irq_domain assignments */ | ||
| 60 | for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { | ||
| 61 | d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); | ||
| 62 | d->domain = NULL; | ||
| 63 | } | ||
| 64 | } | ||
| 65 | |||
| 66 | #if defined(CONFIG_OF_IRQ) | ||
| 67 | /** | ||
| 68 | * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec | ||
| 69 | * | ||
| 70 | * Used by the device tree interrupt mapping code to translate a device tree | ||
| 71 | * interrupt specifier to a valid linux irq number. Returns either a valid | ||
| 72 | * linux IRQ number or 0. | ||
| 73 | * | ||
| 74 | * When the caller no longer needs the irq number returned by this function it | ||
| 75 | * should arrange to call irq_dispose_mapping(). | ||
| 76 | */ | ||
| 77 | unsigned int irq_create_of_mapping(struct device_node *controller, | ||
| 78 | const u32 *intspec, unsigned int intsize) | ||
| 79 | { | ||
| 80 | struct irq_domain *domain; | ||
| 81 | unsigned long hwirq; | ||
| 82 | unsigned int irq, type; | ||
| 83 | int rc = -EINVAL; | ||
| 84 | |||
| 85 | /* Find a domain which can translate the irq spec */ | ||
| 86 | mutex_lock(&irq_domain_mutex); | ||
| 87 | list_for_each_entry(domain, &irq_domain_list, list) { | ||
| 88 | if (!domain->ops->dt_translate) | ||
| 89 | continue; | ||
| 90 | rc = domain->ops->dt_translate(domain, controller, | ||
| 91 | intspec, intsize, &hwirq, &type); | ||
| 92 | if (rc == 0) | ||
| 93 | break; | ||
| 94 | } | ||
| 95 | mutex_unlock(&irq_domain_mutex); | ||
| 96 | |||
| 97 | if (rc != 0) | ||
| 98 | return 0; | ||
| 99 | |||
| 100 | irq = irq_domain_to_irq(domain, hwirq); | ||
| 101 | if (type != IRQ_TYPE_NONE) | ||
| 102 | irq_set_irq_type(irq, type); | ||
| 103 | pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", | ||
| 104 | controller->full_name, (int)hwirq, irq, type); | ||
| 105 | return irq; | ||
| 106 | } | ||
| 107 | EXPORT_SYMBOL_GPL(irq_create_of_mapping); | ||
| 108 | |||
| 109 | /** | ||
| 110 | * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() | ||
| 111 | * @irq: linux irq number to be discarded | ||
| 112 | * | ||
| 113 | * Calling this function indicates the caller no longer needs a reference to | ||
| 114 | * the linux irq number returned by a prior call to irq_create_of_mapping(). | ||
| 115 | */ | ||
| 116 | void irq_dispose_mapping(unsigned int irq) | ||
| 117 | { | ||
| 118 | /* | ||
| 119 | * nothing yet; will be filled when support for dynamic allocation of | ||
| 120 | * irq_descs is added to irq_domain | ||
| 121 | */ | ||
| 122 | } | ||
| 123 | EXPORT_SYMBOL_GPL(irq_dispose_mapping); | ||
| 124 | |||
| 125 | int irq_domain_simple_dt_translate(struct irq_domain *d, | ||
| 126 | struct device_node *controller, | ||
| 127 | const u32 *intspec, unsigned int intsize, | ||
| 128 | unsigned long *out_hwirq, unsigned int *out_type) | ||
| 129 | { | ||
| 130 | if (d->of_node != controller) | ||
| 131 | return -EINVAL; | ||
| 132 | if (intsize < 1) | ||
| 133 | return -EINVAL; | ||
| 134 | |||
| 135 | *out_hwirq = intspec[0]; | ||
| 136 | *out_type = IRQ_TYPE_NONE; | ||
| 137 | if (intsize > 1) | ||
| 138 | *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; | ||
| 139 | return 0; | ||
| 140 | } | ||
| 141 | |||
| 142 | struct irq_domain_ops irq_domain_simple_ops = { | ||
| 143 | .dt_translate = irq_domain_simple_dt_translate, | ||
| 144 | }; | ||
| 145 | EXPORT_SYMBOL_GPL(irq_domain_simple_ops); | ||
| 146 | |||
| 147 | /** | ||
| 148 | * irq_domain_create_simple() - Set up a 'simple' translation range | ||
| 149 | */ | ||
| 150 | void irq_domain_add_simple(struct device_node *controller, int irq_base) | ||
| 151 | { | ||
| 152 | struct irq_domain *domain; | ||
| 153 | |||
| 154 | domain = kzalloc(sizeof(*domain), GFP_KERNEL); | ||
| 155 | if (!domain) { | ||
| 156 | WARN_ON(1); | ||
| 157 | return; | ||
| 158 | } | ||
| 159 | |||
| 160 | domain->irq_base = irq_base; | ||
| 161 | domain->of_node = of_node_get(controller); | ||
| 162 | domain->ops = &irq_domain_simple_ops; | ||
| 163 | irq_domain_add(domain); | ||
| 164 | } | ||
| 165 | EXPORT_SYMBOL_GPL(irq_domain_add_simple); | ||
| 166 | |||
| 167 | void irq_domain_generate_simple(const struct of_device_id *match, | ||
| 168 | u64 phys_base, unsigned int irq_start) | ||
| 169 | { | ||
| 170 | struct device_node *node; | ||
| 171 | pr_info("looking for phys_base=%llx, irq_start=%i\n", | ||
| 172 | (unsigned long long) phys_base, (int) irq_start); | ||
| 173 | node = of_find_matching_node_by_address(NULL, match, phys_base); | ||
| 174 | if (node) | ||
| 175 | irq_domain_add_simple(node, irq_start); | ||
| 176 | else | ||
| 177 | pr_info("no node found\n"); | ||
| 178 | } | ||
| 179 | EXPORT_SYMBOL_GPL(irq_domain_generate_simple); | ||
| 180 | #endif /* CONFIG_OF_IRQ */ | ||
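At this stage irqdomain.c only attaches hwirq numbers to Linux irqs that already have descriptors; dynamic allocation is explicitly left for later. A hedged sketch of how a platform might register a simple domain against its device-tree node — the compatible string and irq base are invented:

```c
#include <linux/init.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define FOO_IRQ_BASE	96

static void __init foo_register_irq_domain(void)
{
	struct device_node *np;

	np = of_find_compatible_node(NULL, NULL, "acme,foo-intc");
	if (!np)
		return;

	/* Descriptors for FOO_IRQ_BASE onward are assumed to exist already. */
	irq_domain_add_simple(np, FOO_IRQ_BASE);
	of_node_put(np);
}
```

After this, a device-tree interrupt specifier pointing at the controller resolves through irq_create_of_mapping() to FOO_IRQ_BASE + hwirq, with the optional second specifier cell supplying the trigger type.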
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 07c1611f3899..0a7840aeb0fb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -491,6 +491,9 @@ int irq_set_irq_wake(unsigned int irq, unsigned int on) | |||
| 491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); | 491 | struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); |
| 492 | int ret = 0; | 492 | int ret = 0; |
| 493 | 493 | ||
| 494 | if (!desc) | ||
| 495 | return -EINVAL; | ||
| 496 | |||
| 494 | /* wakeup-capable irqs can be shared between drivers that | 497 | /* wakeup-capable irqs can be shared between drivers that |
| 495 | * don't need to have the same sleep mode behaviors. | 498 | * don't need to have the same sleep mode behaviors. |
| 496 | */ | 499 | */ |
| @@ -723,13 +726,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } | |||
| 723 | * context. So we need to disable bh here to avoid deadlocks and other | 726 | * context. So we need to disable bh here to avoid deadlocks and other |
| 724 | * side effects. | 727 | * side effects. |
| 725 | */ | 728 | */ |
| 726 | static void | 729 | static irqreturn_t |
| 727 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | 730 | irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) |
| 728 | { | 731 | { |
| 732 | irqreturn_t ret; | ||
| 733 | |||
| 729 | local_bh_disable(); | 734 | local_bh_disable(); |
| 730 | action->thread_fn(action->irq, action->dev_id); | 735 | ret = action->thread_fn(action->irq, action->dev_id); |
| 731 | irq_finalize_oneshot(desc, action, false); | 736 | irq_finalize_oneshot(desc, action, false); |
| 732 | local_bh_enable(); | 737 | local_bh_enable(); |
| 738 | return ret; | ||
| 733 | } | 739 | } |
| 734 | 740 | ||
| 735 | /* | 741 | /* |
| @@ -737,10 +743,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) | |||
| 737 | * preemptible - many of them need to sleep and wait for slow busses to | 743 | * preemptible - many of them need to sleep and wait for slow busses to |
| 738 | * complete. | 744 | * complete. |
| 739 | */ | 745 | */ |
| 740 | static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action) | 746 | static irqreturn_t irq_thread_fn(struct irq_desc *desc, |
| 747 | struct irqaction *action) | ||
| 741 | { | 748 | { |
| 742 | action->thread_fn(action->irq, action->dev_id); | 749 | irqreturn_t ret; |
| 750 | |||
| 751 | ret = action->thread_fn(action->irq, action->dev_id); | ||
| 743 | irq_finalize_oneshot(desc, action, false); | 752 | irq_finalize_oneshot(desc, action, false); |
| 753 | return ret; | ||
| 744 | } | 754 | } |
| 745 | 755 | ||
| 746 | /* | 756 | /* |
| @@ -753,7 +763,8 @@ static int irq_thread(void *data) | |||
| 753 | }; | 763 | }; |
| 754 | struct irqaction *action = data; | 764 | struct irqaction *action = data; |
| 755 | struct irq_desc *desc = irq_to_desc(action->irq); | 765 | struct irq_desc *desc = irq_to_desc(action->irq); |
| 756 | void (*handler_fn)(struct irq_desc *desc, struct irqaction *action); | 766 | irqreturn_t (*handler_fn)(struct irq_desc *desc, |
| 767 | struct irqaction *action); | ||
| 757 | int wake; | 768 | int wake; |
| 758 | 769 | ||
| 759 | if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, | 770 | if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, |
| @@ -783,8 +794,12 @@ static int irq_thread(void *data) | |||
| 783 | desc->istate |= IRQS_PENDING; | 794 | desc->istate |= IRQS_PENDING; |
| 784 | raw_spin_unlock_irq(&desc->lock); | 795 | raw_spin_unlock_irq(&desc->lock); |
| 785 | } else { | 796 | } else { |
| 797 | irqreturn_t action_ret; | ||
| 798 | |||
| 786 | raw_spin_unlock_irq(&desc->lock); | 799 | raw_spin_unlock_irq(&desc->lock); |
| 787 | handler_fn(desc, action); | 800 | action_ret = handler_fn(desc, action); |
| 801 | if (!noirqdebug) | ||
| 802 | note_interrupt(action->irq, desc, action_ret); | ||
| 788 | } | 803 | } |
| 789 | 804 | ||
| 790 | wake = atomic_dec_and_test(&desc->threads_active); | 805 | wake = atomic_dec_and_test(&desc->threads_active); |
| @@ -900,7 +915,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) | |||
| 900 | */ | 915 | */ |
| 901 | new->handler = irq_nested_primary_handler; | 916 | new->handler = irq_nested_primary_handler; |
| 902 | } else { | 917 | } else { |
| 903 | irq_setup_forced_threading(new); | 918 | if (irq_settings_can_thread(desc)) |
| 919 | irq_setup_forced_threading(new); | ||
| 904 | } | 920 | } |
| 905 | 921 | ||
| 906 | /* | 922 | /* |
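Because threaded handlers now return irqreturn_t and that value is fed to note_interrupt(), a thread function that reports IRQ_NONE counts toward spurious-interrupt detection just like a primary handler. A hedged driver sketch (the "foo" device, its status register and flags are invented):

```c
#include <linux/interrupt.h>
#include <linux/io.h>

struct foo_dev {
	void __iomem *regs;
	int irq;
};

static irqreturn_t foo_hardirq(int irq, void *dev_id)
{
	struct foo_dev *foo = dev_id;

	if (!(readl(foo->regs + 0x00) & 0x1))	/* assumed status register */
		return IRQ_NONE;		/* not our interrupt */
	return IRQ_WAKE_THREAD;			/* defer the real work */
}

static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
	/* Slow, sleepable work goes here; the return value is accounted. */
	return IRQ_HANDLED;
}

static int foo_request_irq(struct foo_dev *foo)
{
	return request_threaded_irq(foo->irq, foo_hardirq, foo_thread_fn,
				    IRQF_SHARED, "foo", foo);
}
```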
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 834899f2500f..4bd4faa6323a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; | |||
| 19 | 19 | ||
| 20 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
| 21 | 21 | ||
| 22 | static int irq_affinity_proc_show(struct seq_file *m, void *v) | 22 | static int show_irq_affinity(int type, struct seq_file *m, void *v) |
| 23 | { | 23 | { |
| 24 | struct irq_desc *desc = irq_to_desc((long)m->private); | 24 | struct irq_desc *desc = irq_to_desc((long)m->private); |
| 25 | const struct cpumask *mask = desc->irq_data.affinity; | 25 | const struct cpumask *mask = desc->irq_data.affinity; |
| @@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) | |||
| 28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) | 28 | if (irqd_is_setaffinity_pending(&desc->irq_data)) |
| 29 | mask = desc->pending_mask; | 29 | mask = desc->pending_mask; |
| 30 | #endif | 30 | #endif |
| 31 | seq_cpumask(m, mask); | 31 | if (type) |
| 32 | seq_cpumask_list(m, mask); | ||
| 33 | else | ||
| 34 | seq_cpumask(m, mask); | ||
| 32 | seq_putc(m, '\n'); | 35 | seq_putc(m, '\n'); |
| 33 | return 0; | 36 | return 0; |
| 34 | } | 37 | } |
| @@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) | |||
| 59 | #endif | 62 | #endif |
| 60 | 63 | ||
| 61 | int no_irq_affinity; | 64 | int no_irq_affinity; |
| 62 | static ssize_t irq_affinity_proc_write(struct file *file, | 65 | static int irq_affinity_proc_show(struct seq_file *m, void *v) |
| 66 | { | ||
| 67 | return show_irq_affinity(0, m, v); | ||
| 68 | } | ||
| 69 | |||
| 70 | static int irq_affinity_list_proc_show(struct seq_file *m, void *v) | ||
| 71 | { | ||
| 72 | return show_irq_affinity(1, m, v); | ||
| 73 | } | ||
| 74 | |||
| 75 | |||
| 76 | static ssize_t write_irq_affinity(int type, struct file *file, | ||
| 63 | const char __user *buffer, size_t count, loff_t *pos) | 77 | const char __user *buffer, size_t count, loff_t *pos) |
| 64 | { | 78 | { |
| 65 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; | 79 | unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; |
| @@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file, | |||
| 72 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) | 86 | if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) |
| 73 | return -ENOMEM; | 87 | return -ENOMEM; |
| 74 | 88 | ||
| 75 | err = cpumask_parse_user(buffer, count, new_value); | 89 | if (type) |
| 90 | err = cpumask_parselist_user(buffer, count, new_value); | ||
| 91 | else | ||
| 92 | err = cpumask_parse_user(buffer, count, new_value); | ||
| 76 | if (err) | 93 | if (err) |
| 77 | goto free_cpumask; | 94 | goto free_cpumask; |
| 78 | 95 | ||
| @@ -100,11 +117,28 @@ free_cpumask: | |||
| 100 | return err; | 117 | return err; |
| 101 | } | 118 | } |
| 102 | 119 | ||
| 120 | static ssize_t irq_affinity_proc_write(struct file *file, | ||
| 121 | const char __user *buffer, size_t count, loff_t *pos) | ||
| 122 | { | ||
| 123 | return write_irq_affinity(0, file, buffer, count, pos); | ||
| 124 | } | ||
| 125 | |||
| 126 | static ssize_t irq_affinity_list_proc_write(struct file *file, | ||
| 127 | const char __user *buffer, size_t count, loff_t *pos) | ||
| 128 | { | ||
| 129 | return write_irq_affinity(1, file, buffer, count, pos); | ||
| 130 | } | ||
| 131 | |||
| 103 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) | 132 | static int irq_affinity_proc_open(struct inode *inode, struct file *file) |
| 104 | { | 133 | { |
| 105 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); | 134 | return single_open(file, irq_affinity_proc_show, PDE(inode)->data); |
| 106 | } | 135 | } |
| 107 | 136 | ||
| 137 | static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) | ||
| 138 | { | ||
| 139 | return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); | ||
| 140 | } | ||
| 141 | |||
| 108 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) | 142 | static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) |
| 109 | { | 143 | { |
| 110 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); | 144 | return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); |
| @@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = { | |||
| 125 | .release = single_release, | 159 | .release = single_release, |
| 126 | }; | 160 | }; |
| 127 | 161 | ||
| 162 | static const struct file_operations irq_affinity_list_proc_fops = { | ||
| 163 | .open = irq_affinity_list_proc_open, | ||
| 164 | .read = seq_read, | ||
| 165 | .llseek = seq_lseek, | ||
| 166 | .release = single_release, | ||
| 167 | .write = irq_affinity_list_proc_write, | ||
| 168 | }; | ||
| 169 | |||
| 128 | static int default_affinity_show(struct seq_file *m, void *v) | 170 | static int default_affinity_show(struct seq_file *m, void *v) |
| 129 | { | 171 | { |
| 130 | seq_cpumask(m, irq_default_affinity); | 172 | seq_cpumask(m, irq_default_affinity); |
| @@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
| 289 | proc_create_data("affinity_hint", 0400, desc->dir, | 331 | proc_create_data("affinity_hint", 0400, desc->dir, |
| 290 | &irq_affinity_hint_proc_fops, (void *)(long)irq); | 332 | &irq_affinity_hint_proc_fops, (void *)(long)irq); |
| 291 | 333 | ||
| 334 | /* create /proc/irq/<irq>/smp_affinity_list */ | ||
| 335 | proc_create_data("smp_affinity_list", 0600, desc->dir, | ||
| 336 | &irq_affinity_list_proc_fops, (void *)(long)irq); | ||
| 337 | |||
| 292 | proc_create_data("node", 0444, desc->dir, | 338 | proc_create_data("node", 0444, desc->dir, |
| 293 | &irq_node_proc_fops, (void *)(long)irq); | 339 | &irq_node_proc_fops, (void *)(long)irq); |
| 294 | #endif | 340 | #endif |
| @@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) | |||
| 306 | #ifdef CONFIG_SMP | 352 | #ifdef CONFIG_SMP |
| 307 | remove_proc_entry("smp_affinity", desc->dir); | 353 | remove_proc_entry("smp_affinity", desc->dir); |
| 308 | remove_proc_entry("affinity_hint", desc->dir); | 354 | remove_proc_entry("affinity_hint", desc->dir); |
| 355 | remove_proc_entry("smp_affinity_list", desc->dir); | ||
| 309 | remove_proc_entry("node", desc->dir); | 356 | remove_proc_entry("node", desc->dir); |
| 310 | #endif | 357 | #endif |
| 311 | remove_proc_entry("spurious", desc->dir); | 358 | remove_proc_entry("spurious", desc->dir); |
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index 0d91730b6330..f1667833d444 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h | |||
| @@ -8,6 +8,7 @@ enum { | |||
| 8 | _IRQ_LEVEL = IRQ_LEVEL, | 8 | _IRQ_LEVEL = IRQ_LEVEL, |
| 9 | _IRQ_NOPROBE = IRQ_NOPROBE, | 9 | _IRQ_NOPROBE = IRQ_NOPROBE, |
| 10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, | 10 | _IRQ_NOREQUEST = IRQ_NOREQUEST, |
| 11 | _IRQ_NOTHREAD = IRQ_NOTHREAD, | ||
| 11 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, | 12 | _IRQ_NOAUTOEN = IRQ_NOAUTOEN, |
| 12 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, | 13 | _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, |
| 13 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, | 14 | _IRQ_NO_BALANCING = IRQ_NO_BALANCING, |
| @@ -20,6 +21,7 @@ enum { | |||
| 20 | #define IRQ_LEVEL GOT_YOU_MORON | 21 | #define IRQ_LEVEL GOT_YOU_MORON |
| 21 | #define IRQ_NOPROBE GOT_YOU_MORON | 22 | #define IRQ_NOPROBE GOT_YOU_MORON |
| 22 | #define IRQ_NOREQUEST GOT_YOU_MORON | 23 | #define IRQ_NOREQUEST GOT_YOU_MORON |
| 24 | #define IRQ_NOTHREAD GOT_YOU_MORON | ||
| 23 | #define IRQ_NOAUTOEN GOT_YOU_MORON | 25 | #define IRQ_NOAUTOEN GOT_YOU_MORON |
| 24 | #define IRQ_NESTED_THREAD GOT_YOU_MORON | 26 | #define IRQ_NESTED_THREAD GOT_YOU_MORON |
| 25 | #undef IRQF_MODIFY_MASK | 27 | #undef IRQF_MODIFY_MASK |
| @@ -94,6 +96,21 @@ static inline void irq_settings_set_norequest(struct irq_desc *desc) | |||
| 94 | desc->status_use_accessors |= _IRQ_NOREQUEST; | 96 | desc->status_use_accessors |= _IRQ_NOREQUEST; |
| 95 | } | 97 | } |
| 96 | 98 | ||
| 99 | static inline bool irq_settings_can_thread(struct irq_desc *desc) | ||
| 100 | { | ||
| 101 | return !(desc->status_use_accessors & _IRQ_NOTHREAD); | ||
| 102 | } | ||
| 103 | |||
| 104 | static inline void irq_settings_clr_nothread(struct irq_desc *desc) | ||
| 105 | { | ||
| 106 | desc->status_use_accessors &= ~_IRQ_NOTHREAD; | ||
| 107 | } | ||
| 108 | |||
| 109 | static inline void irq_settings_set_nothread(struct irq_desc *desc) | ||
| 110 | { | ||
| 111 | desc->status_use_accessors |= _IRQ_NOTHREAD; | ||
| 112 | } | ||
| 113 | |||
| 97 | static inline bool irq_settings_can_probe(struct irq_desc *desc) | 114 | static inline bool irq_settings_can_probe(struct irq_desc *desc) |
| 98 | { | 115 | { |
| 99 | return !(desc->status_use_accessors & _IRQ_NOPROBE); | 116 | return !(desc->status_use_accessors & _IRQ_NOPROBE); |
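The new IRQ_NOTHREAD setting lets a line opt out of forced interrupt threading ("threadirqs"); chained interrupts receive it automatically in __irq_set_handler(), and other code can set it explicitly. A hedged sketch, assuming a platform marks an illustrative timer interrupt via irq_set_status_flags():

```c
#include <linux/init.h>
#include <linux/irq.h>

#define FOO_TIMER_IRQ	16	/* illustrative irq number */

static void __init foo_mark_timer_irq(void)
{
	/*
	 * Keep the timer handler in hard interrupt context even when
	 * forced threading is enabled on the command line.
	 */
	irq_set_status_flags(FOO_TIMER_IRQ, IRQ_NOTHREAD);
}
```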
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dfbd550401b2..aa57d5da18c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -167,6 +167,13 @@ out: | |||
| 167 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); | 167 | jiffies + POLL_SPURIOUS_IRQ_INTERVAL); |
| 168 | } | 168 | } |
| 169 | 169 | ||
| 170 | static inline int bad_action_ret(irqreturn_t action_ret) | ||
| 171 | { | ||
| 172 | if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) | ||
| 173 | return 0; | ||
| 174 | return 1; | ||
| 175 | } | ||
| 176 | |||
| 170 | /* | 177 | /* |
| 171 | * If 99,900 of the previous 100,000 interrupts have not been handled | 178 | * If 99,900 of the previous 100,000 interrupts have not been handled |
| 172 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic | 179 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic |
| @@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
| 182 | struct irqaction *action; | 189 | struct irqaction *action; |
| 183 | unsigned long flags; | 190 | unsigned long flags; |
| 184 | 191 | ||
| 185 | if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) { | 192 | if (bad_action_ret(action_ret)) { |
| 186 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | 193 | printk(KERN_ERR "irq event %d: bogus return value %x\n", |
| 187 | irq, action_ret); | 194 | irq, action_ret); |
| 188 | } else { | 195 | } else { |
| @@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc, | |||
| 201 | raw_spin_lock_irqsave(&desc->lock, flags); | 208 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 202 | action = desc->action; | 209 | action = desc->action; |
| 203 | while (action) { | 210 | while (action) { |
| 204 | printk(KERN_ERR "[<%p>]", action->handler); | 211 | printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); |
| 205 | print_symbol(" (%s)", | 212 | if (action->thread_fn) |
| 206 | (unsigned long)action->handler); | 213 | printk(KERN_CONT " threaded [<%p>] %pf", |
| 207 | printk("\n"); | 214 | action->thread_fn, action->thread_fn); |
| 215 | printk(KERN_CONT "\n"); | ||
| 208 | action = action->next; | 216 | action = action->next; |
| 209 | } | 217 | } |
| 210 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 218 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| @@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
| 262 | if (desc->istate & IRQS_POLL_INPROGRESS) | 270 | if (desc->istate & IRQS_POLL_INPROGRESS) |
| 263 | return; | 271 | return; |
| 264 | 272 | ||
| 265 | if (unlikely(action_ret != IRQ_HANDLED)) { | 273 | /* we get here again via the threaded handler */ |
| 274 | if (action_ret == IRQ_WAKE_THREAD) | ||
| 275 | return; | ||
| 276 | |||
| 277 | if (bad_action_ret(action_ret)) { | ||
| 278 | report_bad_irq(irq, desc, action_ret); | ||
| 279 | return; | ||
| 280 | } | ||
| 281 | |||
| 282 | if (unlikely(action_ret == IRQ_NONE)) { | ||
| 266 | /* | 283 | /* |
| 267 | * If we are seeing only the odd spurious IRQ caused by | 284 | * If we are seeing only the odd spurious IRQ caused by |
| 268 | * bus asynchronicity then don't eventually trigger an error, | 285 | * bus asynchronicity then don't eventually trigger an error, |
| @@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
| 274 | else | 291 | else |
| 275 | desc->irqs_unhandled++; | 292 | desc->irqs_unhandled++; |
| 276 | desc->last_unhandled = jiffies; | 293 | desc->last_unhandled = jiffies; |
| 277 | if (unlikely(action_ret != IRQ_NONE)) | ||
| 278 | report_bad_irq(irq, desc, action_ret); | ||
| 279 | } | 294 | } |
| 280 | 295 | ||
| 281 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { | 296 | if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { |
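The new bad_action_ret() treats anything other than IRQ_NONE, IRQ_HANDLED, IRQ_WAKE_THREAD or their OR as a bogus return value. A small standalone mirror of that check, for illustration only (the constants follow include/linux/irqreturn.h):

```c
#include <stdio.h>

enum { IRQ_NONE = 0, IRQ_HANDLED = 1 << 0, IRQ_WAKE_THREAD = 1 << 1 };

/* Mirrors the kernel helper: values 0..3 are valid, everything else is bad. */
static int bad_action_ret(unsigned int action_ret)
{
	return action_ret > (IRQ_HANDLED | IRQ_WAKE_THREAD);
}

int main(void)
{
	printf("%d %d %d %d\n",
	       bad_action_ret(IRQ_NONE),	/* 0: valid */
	       bad_action_ret(IRQ_HANDLED),	/* 0: valid */
	       bad_action_ret(IRQ_WAKE_THREAD),	/* 0: valid */
	       bad_action_ret(0x10));		/* 1: bogus */
	return 0;
}
```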
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd938330..a8ce45097f3d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
| @@ -2,43 +2,23 @@ | |||
| 2 | * jump label support | 2 | * jump label support |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> | 4 | * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> |
| 5 | * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> | ||
| 5 | * | 6 | * |
| 6 | */ | 7 | */ |
| 7 | #include <linux/jump_label.h> | ||
| 8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
| 9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
| 10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 11 | #include <linux/list.h> | 11 | #include <linux/list.h> |
| 12 | #include <linux/jhash.h> | ||
| 13 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 14 | #include <linux/sort.h> | 13 | #include <linux/sort.h> |
| 15 | #include <linux/err.h> | 14 | #include <linux/err.h> |
| 15 | #include <linux/jump_label.h> | ||
| 16 | 16 | ||
| 17 | #ifdef HAVE_JUMP_LABEL | 17 | #ifdef HAVE_JUMP_LABEL |
| 18 | 18 | ||
| 19 | #define JUMP_LABEL_HASH_BITS 6 | ||
| 20 | #define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) | ||
| 21 | static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; | ||
| 22 | |||
| 23 | /* mutex to protect coming/going of the jump_label table */ | 19 | /* mutex to protect coming/going of the jump_label table */ |
| 24 | static DEFINE_MUTEX(jump_label_mutex); | 20 | static DEFINE_MUTEX(jump_label_mutex); |
| 25 | 21 | ||
| 26 | struct jump_label_entry { | ||
| 27 | struct hlist_node hlist; | ||
| 28 | struct jump_entry *table; | ||
| 29 | int nr_entries; | ||
| 30 | /* hang modules off here */ | ||
| 31 | struct hlist_head modules; | ||
| 32 | unsigned long key; | ||
| 33 | }; | ||
| 34 | |||
| 35 | struct jump_label_module_entry { | ||
| 36 | struct hlist_node hlist; | ||
| 37 | struct jump_entry *table; | ||
| 38 | int nr_entries; | ||
| 39 | struct module *mod; | ||
| 40 | }; | ||
| 41 | |||
| 42 | void jump_label_lock(void) | 22 | void jump_label_lock(void) |
| 43 | { | 23 | { |
| 44 | mutex_lock(&jump_label_mutex); | 24 | mutex_lock(&jump_label_mutex); |
| @@ -49,6 +29,11 @@ void jump_label_unlock(void) | |||
| 49 | mutex_unlock(&jump_label_mutex); | 29 | mutex_unlock(&jump_label_mutex); |
| 50 | } | 30 | } |
| 51 | 31 | ||
| 32 | bool jump_label_enabled(struct jump_label_key *key) | ||
| 33 | { | ||
| 34 | return !!atomic_read(&key->enabled); | ||
| 35 | } | ||
| 36 | |||
| 52 | static int jump_label_cmp(const void *a, const void *b) | 37 | static int jump_label_cmp(const void *a, const void *b) |
| 53 | { | 38 | { |
| 54 | const struct jump_entry *jea = a; | 39 | const struct jump_entry *jea = a; |
| @@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) | |||
| 64 | } | 49 | } |
| 65 | 50 | ||
| 66 | static void | 51 | static void |
| 67 | sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | 52 | jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) |
| 68 | { | 53 | { |
| 69 | unsigned long size; | 54 | unsigned long size; |
| 70 | 55 | ||
| @@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) | |||
| 73 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); | 58 | sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); |
| 74 | } | 59 | } |
| 75 | 60 | ||
| 76 | static struct jump_label_entry *get_jump_label_entry(jump_label_t key) | 61 | static void jump_label_update(struct jump_label_key *key, int enable); |
| 77 | { | ||
| 78 | struct hlist_head *head; | ||
| 79 | struct hlist_node *node; | ||
| 80 | struct jump_label_entry *e; | ||
| 81 | u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
| 82 | |||
| 83 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
| 84 | hlist_for_each_entry(e, node, head, hlist) { | ||
| 85 | if (key == e->key) | ||
| 86 | return e; | ||
| 87 | } | ||
| 88 | return NULL; | ||
| 89 | } | ||
| 90 | 62 | ||
| 91 | static struct jump_label_entry * | 63 | void jump_label_inc(struct jump_label_key *key) |
| 92 | add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) | ||
| 93 | { | 64 | { |
| 94 | struct hlist_head *head; | 65 | if (atomic_inc_not_zero(&key->enabled)) |
| 95 | struct jump_label_entry *e; | 66 | return; |
| 96 | u32 hash; | ||
| 97 | |||
| 98 | e = get_jump_label_entry(key); | ||
| 99 | if (e) | ||
| 100 | return ERR_PTR(-EEXIST); | ||
| 101 | |||
| 102 | e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); | ||
| 103 | if (!e) | ||
| 104 | return ERR_PTR(-ENOMEM); | ||
| 105 | |||
| 106 | hash = jhash((void *)&key, sizeof(jump_label_t), 0); | ||
| 107 | head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; | ||
| 108 | e->key = key; | ||
| 109 | e->table = table; | ||
| 110 | e->nr_entries = nr_entries; | ||
| 111 | INIT_HLIST_HEAD(&(e->modules)); | ||
| 112 | hlist_add_head(&e->hlist, head); | ||
| 113 | return e; | ||
| 114 | } | ||
| 115 | 67 | ||
| 116 | static int | 68 | jump_label_lock(); |
| 117 | build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) | 69 | if (atomic_add_return(1, &key->enabled) == 1) |
| 118 | { | 70 | jump_label_update(key, JUMP_LABEL_ENABLE); |
| 119 | struct jump_entry *iter, *iter_begin; | 71 | jump_label_unlock(); |
| 120 | struct jump_label_entry *entry; | ||
| 121 | int count; | ||
| 122 | |||
| 123 | sort_jump_label_entries(start, stop); | ||
| 124 | iter = start; | ||
| 125 | while (iter < stop) { | ||
| 126 | entry = get_jump_label_entry(iter->key); | ||
| 127 | if (!entry) { | ||
| 128 | iter_begin = iter; | ||
| 129 | count = 0; | ||
| 130 | while ((iter < stop) && | ||
| 131 | (iter->key == iter_begin->key)) { | ||
| 132 | iter++; | ||
| 133 | count++; | ||
| 134 | } | ||
| 135 | entry = add_jump_label_entry(iter_begin->key, | ||
| 136 | count, iter_begin); | ||
| 137 | if (IS_ERR(entry)) | ||
| 138 | return PTR_ERR(entry); | ||
| 139 | } else { | ||
| 140 | WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); | ||
| 141 | return -1; | ||
| 142 | } | ||
| 143 | } | ||
| 144 | return 0; | ||
| 145 | } | 72 | } |
| 146 | 73 | ||
| 147 | /*** | 74 | void jump_label_dec(struct jump_label_key *key) |
| 148 | * jump_label_update - update jump label text | ||
| 149 | * @key - key value associated with a a jump label | ||
| 150 | * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE | ||
| 151 | * | ||
| 152 | * Will enable/disable the jump for jump label @key, depending on the | ||
| 153 | * value of @type. | ||
| 154 | * | ||
| 155 | */ | ||
| 156 | |||
| 157 | void jump_label_update(unsigned long key, enum jump_label_type type) | ||
| 158 | { | 75 | { |
| 159 | struct jump_entry *iter; | 76 | if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) |
| 160 | struct jump_label_entry *entry; | 77 | return; |
| 161 | struct hlist_node *module_node; | ||
| 162 | struct jump_label_module_entry *e_module; | ||
| 163 | int count; | ||
| 164 | 78 | ||
| 165 | jump_label_lock(); | 79 | jump_label_update(key, JUMP_LABEL_DISABLE); |
| 166 | entry = get_jump_label_entry((jump_label_t)key); | ||
| 167 | if (entry) { | ||
| 168 | count = entry->nr_entries; | ||
| 169 | iter = entry->table; | ||
| 170 | while (count--) { | ||
| 171 | if (kernel_text_address(iter->code)) | ||
| 172 | arch_jump_label_transform(iter, type); | ||
| 173 | iter++; | ||
| 174 | } | ||
| 175 | /* eanble/disable jump labels in modules */ | ||
| 176 | hlist_for_each_entry(e_module, module_node, &(entry->modules), | ||
| 177 | hlist) { | ||
| 178 | count = e_module->nr_entries; | ||
| 179 | iter = e_module->table; | ||
| 180 | while (count--) { | ||
| 181 | if (iter->key && | ||
| 182 | kernel_text_address(iter->code)) | ||
| 183 | arch_jump_label_transform(iter, type); | ||
| 184 | iter++; | ||
| 185 | } | ||
| 186 | } | ||
| 187 | } | ||
| 188 | jump_label_unlock(); | 80 | jump_label_unlock(); |
| 189 | } | 81 | } |
| 190 | 82 | ||
| @@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) | |||
| 197 | return 0; | 89 | return 0; |
| 198 | } | 90 | } |
| 199 | 91 | ||
| 200 | #ifdef CONFIG_MODULES | 92 | static int __jump_label_text_reserved(struct jump_entry *iter_start, |
| 201 | 93 | struct jump_entry *iter_stop, void *start, void *end) | |
| 202 | static int module_conflict(void *start, void *end) | ||
| 203 | { | ||
| 204 | struct hlist_head *head; | ||
| 205 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | ||
| 206 | struct jump_label_entry *e; | ||
| 207 | struct jump_label_module_entry *e_module; | ||
| 208 | struct jump_entry *iter; | ||
| 209 | int i, count; | ||
| 210 | int conflict = 0; | ||
| 211 | |||
| 212 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | ||
| 213 | head = &jump_label_table[i]; | ||
| 214 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | ||
| 215 | hlist_for_each_entry_safe(e_module, module_node, | ||
| 216 | module_node_next, | ||
| 217 | &(e->modules), hlist) { | ||
| 218 | count = e_module->nr_entries; | ||
| 219 | iter = e_module->table; | ||
| 220 | while (count--) { | ||
| 221 | if (addr_conflict(iter, start, end)) { | ||
| 222 | conflict = 1; | ||
| 223 | goto out; | ||
| 224 | } | ||
| 225 | iter++; | ||
| 226 | } | ||
| 227 | } | ||
| 228 | } | ||
| 229 | } | ||
| 230 | out: | ||
| 231 | return conflict; | ||
| 232 | } | ||
| 233 | |||
| 234 | #endif | ||
| 235 | |||
| 236 | /*** | ||
| 237 | * jump_label_text_reserved - check if addr range is reserved | ||
| 238 | * @start: start text addr | ||
| 239 | * @end: end text addr | ||
| 240 | * | ||
| 241 | * checks if the text addr located between @start and @end | ||
| 242 | * overlaps with any of the jump label patch addresses. Code | ||
| 243 | * that wants to modify kernel text should first verify that | ||
| 244 | * it does not overlap with any of the jump label addresses. | ||
| 245 | * Caller must hold jump_label_mutex. | ||
| 246 | * | ||
| 247 | * returns 1 if there is an overlap, 0 otherwise | ||
| 248 | */ | ||
| 249 | int jump_label_text_reserved(void *start, void *end) | ||
| 250 | { | 94 | { |
| 251 | struct jump_entry *iter; | 95 | struct jump_entry *iter; |
| 252 | struct jump_entry *iter_start = __start___jump_table; | ||
| 253 | struct jump_entry *iter_stop = __start___jump_table; | ||
| 254 | int conflict = 0; | ||
| 255 | 96 | ||
| 256 | iter = iter_start; | 97 | iter = iter_start; |
| 257 | while (iter < iter_stop) { | 98 | while (iter < iter_stop) { |
| 258 | if (addr_conflict(iter, start, end)) { | 99 | if (addr_conflict(iter, start, end)) |
| 259 | conflict = 1; | 100 | return 1; |
| 260 | goto out; | ||
| 261 | } | ||
| 262 | iter++; | 101 | iter++; |
| 263 | } | 102 | } |
| 264 | 103 | ||
| 265 | /* now check modules */ | 104 | return 0; |
| 266 | #ifdef CONFIG_MODULES | 105 | } |
| 267 | conflict = module_conflict(start, end); | 106 | |
| 268 | #endif | 107 | static void __jump_label_update(struct jump_label_key *key, |
| 269 | out: | 108 | struct jump_entry *entry, |
| 270 | return conflict; | 109 | struct jump_entry *stop, int enable) |
| 110 | { | ||
| 111 | for (; (entry < stop) && | ||
| 112 | (entry->key == (jump_label_t)(unsigned long)key); | ||
| 113 | entry++) { | ||
| 114 | /* | ||
| 115 | * entry->code set to 0 invalidates module init text sections | ||
| 116 | * kernel_text_address() verifies we are not in core kernel | ||
| 117 | * init code, see jump_label_invalidate_module_init(). | ||
| 118 | */ | ||
| 119 | if (entry->code && kernel_text_address(entry->code)) | ||
| 120 | arch_jump_label_transform(entry, enable); | ||
| 121 | } | ||
| 271 | } | 122 | } |
| 272 | 123 | ||
| 273 | /* | 124 | /* |
| @@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) | |||
| 277 | { | 128 | { |
| 278 | } | 129 | } |
| 279 | 130 | ||
| 280 | static __init int init_jump_label(void) | 131 | static __init int jump_label_init(void) |
| 281 | { | 132 | { |
| 282 | int ret; | ||
| 283 | struct jump_entry *iter_start = __start___jump_table; | 133 | struct jump_entry *iter_start = __start___jump_table; |
| 284 | struct jump_entry *iter_stop = __stop___jump_table; | 134 | struct jump_entry *iter_stop = __stop___jump_table; |
| 135 | struct jump_label_key *key = NULL; | ||
| 285 | struct jump_entry *iter; | 136 | struct jump_entry *iter; |
| 286 | 137 | ||
| 287 | jump_label_lock(); | 138 | jump_label_lock(); |
| 288 | ret = build_jump_label_hashtable(__start___jump_table, | 139 | jump_label_sort_entries(iter_start, iter_stop); |
| 289 | __stop___jump_table); | 140 | |
| 290 | iter = iter_start; | 141 | for (iter = iter_start; iter < iter_stop; iter++) { |
| 291 | while (iter < iter_stop) { | ||
| 292 | arch_jump_label_text_poke_early(iter->code); | 142 | arch_jump_label_text_poke_early(iter->code); |
| 293 | iter++; | 143 | if (iter->key == (jump_label_t)(unsigned long)key) |
| 144 | continue; | ||
| 145 | |||
| 146 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
| 147 | atomic_set(&key->enabled, 0); | ||
| 148 | key->entries = iter; | ||
| 149 | #ifdef CONFIG_MODULES | ||
| 150 | key->next = NULL; | ||
| 151 | #endif | ||
| 294 | } | 152 | } |
| 295 | jump_label_unlock(); | 153 | jump_label_unlock(); |
| 296 | return ret; | 154 | |
| 155 | return 0; | ||
| 297 | } | 156 | } |
| 298 | early_initcall(init_jump_label); | 157 | early_initcall(jump_label_init); |
| 299 | 158 | ||
| 300 | #ifdef CONFIG_MODULES | 159 | #ifdef CONFIG_MODULES |
| 301 | 160 | ||
| 302 | static struct jump_label_module_entry * | 161 | struct jump_label_mod { |
| 303 | add_jump_label_module_entry(struct jump_label_entry *entry, | 162 | struct jump_label_mod *next; |
| 304 | struct jump_entry *iter_begin, | 163 | struct jump_entry *entries; |
| 305 | int count, struct module *mod) | 164 | struct module *mod; |
| 165 | }; | ||
| 166 | |||
| 167 | static int __jump_label_mod_text_reserved(void *start, void *end) | ||
| 306 | { | 168 | { |
| 307 | struct jump_label_module_entry *e; | 169 | struct module *mod; |
| 308 | 170 | ||
| 309 | e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); | 171 | mod = __module_text_address((unsigned long)start); |
| 310 | if (!e) | 172 | if (!mod) |
| 311 | return ERR_PTR(-ENOMEM); | 173 | return 0; |
| 312 | e->mod = mod; | 174 | |
| 313 | e->nr_entries = count; | 175 | WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); |
| 314 | e->table = iter_begin; | 176 | |
| 315 | hlist_add_head(&e->hlist, &entry->modules); | 177 | return __jump_label_text_reserved(mod->jump_entries, |
| 316 | return e; | 178 | mod->jump_entries + mod->num_jump_entries, |
| 179 | start, end); | ||
| 317 | } | 180 | } |
| 318 | 181 | ||
| 319 | static int add_jump_label_module(struct module *mod) | 182 | static void __jump_label_mod_update(struct jump_label_key *key, int enable) |
| 320 | { | 183 | { |
| 321 | struct jump_entry *iter, *iter_begin; | 184 | struct jump_label_mod *mod = key->next; |
| 322 | struct jump_label_entry *entry; | ||
| 323 | struct jump_label_module_entry *module_entry; | ||
| 324 | int count; | ||
| 325 | 185 | ||
| 326 | /* if the module doesn't have jump label entries, just return */ | 186 | while (mod) { |
| 327 | if (!mod->num_jump_entries) | 187 | struct module *m = mod->mod; |
| 328 | return 0; | ||
| 329 | 188 | ||
| 330 | sort_jump_label_entries(mod->jump_entries, | 189 | __jump_label_update(key, mod->entries, |
| 331 | mod->jump_entries + mod->num_jump_entries); | 190 | m->jump_entries + m->num_jump_entries, |
| 332 | iter = mod->jump_entries; | 191 | enable); |
| 333 | while (iter < mod->jump_entries + mod->num_jump_entries) { | 192 | mod = mod->next; |
| 334 | entry = get_jump_label_entry(iter->key); | ||
| 335 | iter_begin = iter; | ||
| 336 | count = 0; | ||
| 337 | while ((iter < mod->jump_entries + mod->num_jump_entries) && | ||
| 338 | (iter->key == iter_begin->key)) { | ||
| 339 | iter++; | ||
| 340 | count++; | ||
| 341 | } | ||
| 342 | if (!entry) { | ||
| 343 | entry = add_jump_label_entry(iter_begin->key, 0, NULL); | ||
| 344 | if (IS_ERR(entry)) | ||
| 345 | return PTR_ERR(entry); | ||
| 346 | } | ||
| 347 | module_entry = add_jump_label_module_entry(entry, iter_begin, | ||
| 348 | count, mod); | ||
| 349 | if (IS_ERR(module_entry)) | ||
| 350 | return PTR_ERR(module_entry); | ||
| 351 | } | 193 | } |
| 352 | return 0; | ||
| 353 | } | 194 | } |
| 354 | 195 | ||
| 355 | static void remove_jump_label_module(struct module *mod) | 196 | /*** |
| 197 | * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() | ||
| 198 | * @mod: module to patch | ||
| 199 | * | ||
| 200 | * Allow for run-time selection of the optimal nops. Before the module | ||
| 201 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
| 202 | * the arch specific jump label code. | ||
| 203 | */ | ||
| 204 | void jump_label_apply_nops(struct module *mod) | ||
| 356 | { | 205 | { |
| 357 | struct hlist_head *head; | 206 | struct jump_entry *iter_start = mod->jump_entries; |
| 358 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | 207 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
| 359 | struct jump_label_entry *e; | 208 | struct jump_entry *iter; |
| 360 | struct jump_label_module_entry *e_module; | ||
| 361 | int i; | ||
| 362 | 209 | ||
| 363 | /* if the module doesn't have jump label entries, just return */ | 210 | /* if the module doesn't have jump label entries, just return */ |
| 364 | if (!mod->num_jump_entries) | 211 | if (iter_start == iter_stop) |
| 365 | return; | 212 | return; |
| 366 | 213 | ||
| 367 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | 214 | for (iter = iter_start; iter < iter_stop; iter++) |
| 368 | head = &jump_label_table[i]; | 215 | arch_jump_label_text_poke_early(iter->code); |
| 369 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | 216 | } |
| 370 | hlist_for_each_entry_safe(e_module, module_node, | 217 | |
| 371 | module_node_next, | 218 | static int jump_label_add_module(struct module *mod) |
| 372 | &(e->modules), hlist) { | 219 | { |
| 373 | if (e_module->mod == mod) { | 220 | struct jump_entry *iter_start = mod->jump_entries; |
| 374 | hlist_del(&e_module->hlist); | 221 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
| 375 | kfree(e_module); | 222 | struct jump_entry *iter; |
| 376 | } | 223 | struct jump_label_key *key = NULL; |
| 377 | } | 224 | struct jump_label_mod *jlm; |
| 378 | if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { | 225 | |
| 379 | hlist_del(&e->hlist); | 226 | /* if the module doesn't have jump label entries, just return */ |
| 380 | kfree(e); | 227 | if (iter_start == iter_stop) |
| 381 | } | 228 | return 0; |
| 229 | |||
| 230 | jump_label_sort_entries(iter_start, iter_stop); | ||
| 231 | |||
| 232 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
| 233 | if (iter->key == (jump_label_t)(unsigned long)key) | ||
| 234 | continue; | ||
| 235 | |||
| 236 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
| 237 | |||
| 238 | if (__module_address(iter->key) == mod) { | ||
| 239 | atomic_set(&key->enabled, 0); | ||
| 240 | key->entries = iter; | ||
| 241 | key->next = NULL; | ||
| 242 | continue; | ||
| 382 | } | 243 | } |
| 244 | |||
| 245 | jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); | ||
| 246 | if (!jlm) | ||
| 247 | return -ENOMEM; | ||
| 248 | |||
| 249 | jlm->mod = mod; | ||
| 250 | jlm->entries = iter; | ||
| 251 | jlm->next = key->next; | ||
| 252 | key->next = jlm; | ||
| 253 | |||
| 254 | if (jump_label_enabled(key)) | ||
| 255 | __jump_label_update(key, iter, iter_stop, | ||
| 256 | JUMP_LABEL_ENABLE); | ||
| 383 | } | 257 | } |
| 258 | |||
| 259 | return 0; | ||
| 384 | } | 260 | } |
| 385 | 261 | ||
| 386 | static void remove_jump_label_module_init(struct module *mod) | 262 | static void jump_label_del_module(struct module *mod) |
| 387 | { | 263 | { |
| 388 | struct hlist_head *head; | 264 | struct jump_entry *iter_start = mod->jump_entries; |
| 389 | struct hlist_node *node, *node_next, *module_node, *module_node_next; | 265 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; |
| 390 | struct jump_label_entry *e; | ||
| 391 | struct jump_label_module_entry *e_module; | ||
| 392 | struct jump_entry *iter; | 266 | struct jump_entry *iter; |
| 393 | int i, count; | 267 | struct jump_label_key *key = NULL; |
| 268 | struct jump_label_mod *jlm, **prev; | ||
| 394 | 269 | ||
| 395 | /* if the module doesn't have jump label entries, just return */ | 270 | for (iter = iter_start; iter < iter_stop; iter++) { |
| 396 | if (!mod->num_jump_entries) | 271 | if (iter->key == (jump_label_t)(unsigned long)key) |
| 397 | return; | 272 | continue; |
| 273 | |||
| 274 | key = (struct jump_label_key *)(unsigned long)iter->key; | ||
| 275 | |||
| 276 | if (__module_address(iter->key) == mod) | ||
| 277 | continue; | ||
| 278 | |||
| 279 | prev = &key->next; | ||
| 280 | jlm = key->next; | ||
| 281 | |||
| 282 | while (jlm && jlm->mod != mod) { | ||
| 283 | prev = &jlm->next; | ||
| 284 | jlm = jlm->next; | ||
| 285 | } | ||
| 398 | 286 | ||
| 399 | for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { | 287 | if (jlm) { |
| 400 | head = &jump_label_table[i]; | 288 | *prev = jlm->next; |
| 401 | hlist_for_each_entry_safe(e, node, node_next, head, hlist) { | 289 | kfree(jlm); |
| 402 | hlist_for_each_entry_safe(e_module, module_node, | ||
| 403 | module_node_next, | ||
| 404 | &(e->modules), hlist) { | ||
| 405 | if (e_module->mod != mod) | ||
| 406 | continue; | ||
| 407 | count = e_module->nr_entries; | ||
| 408 | iter = e_module->table; | ||
| 409 | while (count--) { | ||
| 410 | if (within_module_init(iter->code, mod)) | ||
| 411 | iter->key = 0; | ||
| 412 | iter++; | ||
| 413 | } | ||
| 414 | } | ||
| 415 | } | 290 | } |
| 416 | } | 291 | } |
| 417 | } | 292 | } |
| 418 | 293 | ||
| 294 | static void jump_label_invalidate_module_init(struct module *mod) | ||
| 295 | { | ||
| 296 | struct jump_entry *iter_start = mod->jump_entries; | ||
| 297 | struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; | ||
| 298 | struct jump_entry *iter; | ||
| 299 | |||
| 300 | for (iter = iter_start; iter < iter_stop; iter++) { | ||
| 301 | if (within_module_init(iter->code, mod)) | ||
| 302 | iter->code = 0; | ||
| 303 | } | ||
| 304 | } | ||
| 305 | |||
| 419 | static int | 306 | static int |
| 420 | jump_label_module_notify(struct notifier_block *self, unsigned long val, | 307 | jump_label_module_notify(struct notifier_block *self, unsigned long val, |
| 421 | void *data) | 308 | void *data) |
| @@ -426,59 +313,81 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, | |||
| 426 | switch (val) { | 313 | switch (val) { |
| 427 | case MODULE_STATE_COMING: | 314 | case MODULE_STATE_COMING: |
| 428 | jump_label_lock(); | 315 | jump_label_lock(); |
| 429 | ret = add_jump_label_module(mod); | 316 | ret = jump_label_add_module(mod); |
| 430 | if (ret) | 317 | if (ret) |
| 431 | remove_jump_label_module(mod); | 318 | jump_label_del_module(mod); |
| 432 | jump_label_unlock(); | 319 | jump_label_unlock(); |
| 433 | break; | 320 | break; |
| 434 | case MODULE_STATE_GOING: | 321 | case MODULE_STATE_GOING: |
| 435 | jump_label_lock(); | 322 | jump_label_lock(); |
| 436 | remove_jump_label_module(mod); | 323 | jump_label_del_module(mod); |
| 437 | jump_label_unlock(); | 324 | jump_label_unlock(); |
| 438 | break; | 325 | break; |
| 439 | case MODULE_STATE_LIVE: | 326 | case MODULE_STATE_LIVE: |
| 440 | jump_label_lock(); | 327 | jump_label_lock(); |
| 441 | remove_jump_label_module_init(mod); | 328 | jump_label_invalidate_module_init(mod); |
| 442 | jump_label_unlock(); | 329 | jump_label_unlock(); |
| 443 | break; | 330 | break; |
| 444 | } | 331 | } |
| 445 | return ret; | ||
| 446 | } | ||
| 447 | |||
| 448 | /*** | ||
| 449 | * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() | ||
| 450 | * @mod: module to patch | ||
| 451 | * | ||
| 452 | * Allow for run-time selection of the optimal nops. Before the module | ||
| 453 | * loads patch these with arch_get_jump_label_nop(), which is specified by | ||
| 454 | * the arch specific jump label code. | ||
| 455 | */ | ||
| 456 | void jump_label_apply_nops(struct module *mod) | ||
| 457 | { | ||
| 458 | struct jump_entry *iter; | ||
| 459 | |||
| 460 | /* if the module doesn't have jump label entries, just return */ | ||
| 461 | if (!mod->num_jump_entries) | ||
| 462 | return; | ||
| 463 | 332 | ||
| 464 | iter = mod->jump_entries; | 333 | return notifier_from_errno(ret); |
| 465 | while (iter < mod->jump_entries + mod->num_jump_entries) { | ||
| 466 | arch_jump_label_text_poke_early(iter->code); | ||
| 467 | iter++; | ||
| 468 | } | ||
| 469 | } | 334 | } |
| 470 | 335 | ||
| 471 | struct notifier_block jump_label_module_nb = { | 336 | struct notifier_block jump_label_module_nb = { |
| 472 | .notifier_call = jump_label_module_notify, | 337 | .notifier_call = jump_label_module_notify, |
| 473 | .priority = 0, | 338 | .priority = 1, /* higher than tracepoints */ |
| 474 | }; | 339 | }; |
| 475 | 340 | ||
| 476 | static __init int init_jump_label_module(void) | 341 | static __init int jump_label_init_module(void) |
| 477 | { | 342 | { |
| 478 | return register_module_notifier(&jump_label_module_nb); | 343 | return register_module_notifier(&jump_label_module_nb); |
| 479 | } | 344 | } |
| 480 | early_initcall(init_jump_label_module); | 345 | early_initcall(jump_label_init_module); |
| 481 | 346 | ||
| 482 | #endif /* CONFIG_MODULES */ | 347 | #endif /* CONFIG_MODULES */ |
| 483 | 348 | ||
| 349 | /*** | ||
| 350 | * jump_label_text_reserved - check if addr range is reserved | ||
| 351 | * @start: start text addr | ||
| 352 | * @end: end text addr | ||
| 353 | * | ||
| 354 | * checks if the text addr located between @start and @end | ||
| 355 | * overlaps with any of the jump label patch addresses. Code | ||
| 356 | * that wants to modify kernel text should first verify that | ||
| 357 | * it does not overlap with any of the jump label addresses. | ||
| 358 | * Caller must hold jump_label_mutex. | ||
| 359 | * | ||
| 360 | * returns 1 if there is an overlap, 0 otherwise | ||
| 361 | */ | ||
| 362 | int jump_label_text_reserved(void *start, void *end) | ||
| 363 | { | ||
| 364 | int ret = __jump_label_text_reserved(__start___jump_table, | ||
| 365 | __stop___jump_table, start, end); | ||
| 366 | |||
| 367 | if (ret) | ||
| 368 | return ret; | ||
| 369 | |||
| 370 | #ifdef CONFIG_MODULES | ||
| 371 | ret = __jump_label_mod_text_reserved(start, end); | ||
| 372 | #endif | ||
| 373 | return ret; | ||
| 374 | } | ||
| 375 | |||
| 376 | static void jump_label_update(struct jump_label_key *key, int enable) | ||
| 377 | { | ||
| 378 | struct jump_entry *entry = key->entries, *stop = __stop___jump_table; | ||
| 379 | |||
| 380 | #ifdef CONFIG_MODULES | ||
| 381 | struct module *mod = __module_address((jump_label_t)key); | ||
| 382 | |||
| 383 | __jump_label_mod_update(key, enable); | ||
| 384 | |||
| 385 | if (mod) | ||
| 386 | stop = mod->jump_entries + mod->num_jump_entries; | ||
| 387 | #endif | ||
| 388 | /* if there are no users, entry can be NULL */ | ||
| 389 | if (entry) | ||
| 390 | __jump_label_update(key, entry, stop, enable); | ||
| 391 | } | ||
| 392 | |||
| 484 | #endif | 393 | #endif |
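The rework above replaces the per-key hash table with a sorted entry array plus an atomic enabled count, toggled through jump_label_inc()/jump_label_dec(). A minimal usage sketch follows, assuming the static_branch() test helper that accompanies this series; my_key and do_extra_accounting() are hypothetical names, not part of the patch.

/* Sketch only: my_key and do_extra_accounting() are hypothetical. */
#include <linux/jump_label.h>

static struct jump_label_key my_key;		/* branch starts disabled */

static void hot_path(void)
{
	if (static_branch(&my_key))		/* patched between nop and jmp at runtime */
		do_extra_accounting();
}

static void feature_enable(void)
{
	jump_label_inc(&my_key);		/* first increment patches all sites in */
}

static void feature_disable(void)
{
	jump_label_dec(&my_key);		/* last decrement patches them back out */
}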
diff --git a/kernel/kexec.c b/kernel/kexec.c index 87b77de03dd3..296fbc84d659 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -1095,7 +1095,7 @@ size_t crash_get_memory_size(void) | |||
| 1095 | size_t size = 0; | 1095 | size_t size = 0; |
| 1096 | mutex_lock(&kexec_mutex); | 1096 | mutex_lock(&kexec_mutex); |
| 1097 | if (crashk_res.end != crashk_res.start) | 1097 | if (crashk_res.end != crashk_res.start) |
| 1098 | size = crashk_res.end - crashk_res.start + 1; | 1098 | size = resource_size(&crashk_res); |
| 1099 | mutex_unlock(&kexec_mutex); | 1099 | mutex_unlock(&kexec_mutex); |
| 1100 | return size; | 1100 | return size; |
| 1101 | } | 1101 | } |
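For reference, resource_size() in <linux/ioport.h> is essentially the inline below, so the replacement is equivalent to the open-coded end - start + 1 it removes:

static inline resource_size_t resource_size(const struct resource *res)
{
	return res->end - res->start + 1;
}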
| @@ -1531,13 +1531,7 @@ int kernel_kexec(void) | |||
| 1531 | if (error) | 1531 | if (error) |
| 1532 | goto Enable_cpus; | 1532 | goto Enable_cpus; |
| 1533 | local_irq_disable(); | 1533 | local_irq_disable(); |
| 1534 | /* Suspend system devices */ | 1534 | error = syscore_suspend(); |
| 1535 | error = sysdev_suspend(PMSG_FREEZE); | ||
| 1536 | if (!error) { | ||
| 1537 | error = syscore_suspend(); | ||
| 1538 | if (error) | ||
| 1539 | sysdev_resume(); | ||
| 1540 | } | ||
| 1541 | if (error) | 1535 | if (error) |
| 1542 | goto Enable_irqs; | 1536 | goto Enable_irqs; |
| 1543 | } else | 1537 | } else |
| @@ -1553,7 +1547,6 @@ int kernel_kexec(void) | |||
| 1553 | #ifdef CONFIG_KEXEC_JUMP | 1547 | #ifdef CONFIG_KEXEC_JUMP |
| 1554 | if (kexec_image->preserve_context) { | 1548 | if (kexec_image->preserve_context) { |
| 1555 | syscore_resume(); | 1549 | syscore_resume(); |
| 1556 | sysdev_resume(); | ||
| 1557 | Enable_irqs: | 1550 | Enable_irqs: |
| 1558 | local_irq_enable(); | 1551 | local_irq_enable(); |
| 1559 | Enable_cpus: | 1552 | Enable_cpus: |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 9cd0591c96a2..ddc7644c1305 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
| 26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
| 27 | #include <linux/completion.h> | 27 | #include <linux/completion.h> |
| 28 | #include <linux/cred.h> | ||
| 28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
| 29 | #include <linux/fdtable.h> | 30 | #include <linux/fdtable.h> |
| 30 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
| @@ -43,6 +44,13 @@ extern int max_threads; | |||
| 43 | 44 | ||
| 44 | static struct workqueue_struct *khelper_wq; | 45 | static struct workqueue_struct *khelper_wq; |
| 45 | 46 | ||
| 47 | #define CAP_BSET (void *)1 | ||
| 48 | #define CAP_PI (void *)2 | ||
| 49 | |||
| 50 | static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; | ||
| 51 | static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; | ||
| 52 | static DEFINE_SPINLOCK(umh_sysctl_lock); | ||
| 53 | |||
| 46 | #ifdef CONFIG_MODULES | 54 | #ifdef CONFIG_MODULES |
| 47 | 55 | ||
| 48 | /* | 56 | /* |
| @@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); | |||
| 132 | static int ____call_usermodehelper(void *data) | 140 | static int ____call_usermodehelper(void *data) |
| 133 | { | 141 | { |
| 134 | struct subprocess_info *sub_info = data; | 142 | struct subprocess_info *sub_info = data; |
| 143 | struct cred *new; | ||
| 135 | int retval; | 144 | int retval; |
| 136 | 145 | ||
| 137 | spin_lock_irq(¤t->sighand->siglock); | 146 | spin_lock_irq(¤t->sighand->siglock); |
| @@ -147,12 +156,27 @@ static int ____call_usermodehelper(void *data) | |||
| 147 | */ | 156 | */ |
| 148 | set_user_nice(current, 0); | 157 | set_user_nice(current, 0); |
| 149 | 158 | ||
| 159 | retval = -ENOMEM; | ||
| 160 | new = prepare_kernel_cred(current); | ||
| 161 | if (!new) | ||
| 162 | goto fail; | ||
| 163 | |||
| 164 | spin_lock(&umh_sysctl_lock); | ||
| 165 | new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); | ||
| 166 | new->cap_inheritable = cap_intersect(usermodehelper_inheritable, | ||
| 167 | new->cap_inheritable); | ||
| 168 | spin_unlock(&umh_sysctl_lock); | ||
| 169 | |||
| 150 | if (sub_info->init) { | 170 | if (sub_info->init) { |
| 151 | retval = sub_info->init(sub_info); | 171 | retval = sub_info->init(sub_info, new); |
| 152 | if (retval) | 172 | if (retval) { |
| 173 | abort_creds(new); | ||
| 153 | goto fail; | 174 | goto fail; |
| 175 | } | ||
| 154 | } | 176 | } |
| 155 | 177 | ||
| 178 | commit_creds(new); | ||
| 179 | |||
| 156 | retval = kernel_execve(sub_info->path, | 180 | retval = kernel_execve(sub_info->path, |
| 157 | (const char *const *)sub_info->argv, | 181 | (const char *const *)sub_info->argv, |
| 158 | (const char *const *)sub_info->envp); | 182 | (const char *const *)sub_info->envp); |
| @@ -245,13 +269,12 @@ static void __call_usermodehelper(struct work_struct *work) | |||
| 245 | } | 269 | } |
| 246 | } | 270 | } |
| 247 | 271 | ||
| 248 | #ifdef CONFIG_PM_SLEEP | ||
| 249 | /* | 272 | /* |
| 250 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY | 273 | * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY |
| 251 | * (used for preventing user land processes from being created after the user | 274 | * (used for preventing user land processes from being created after the user |
| 252 | * land has been frozen during a system-wide hibernation or suspend operation). | 275 | * land has been frozen during a system-wide hibernation or suspend operation). |
| 253 | */ | 276 | */ |
| 254 | static int usermodehelper_disabled; | 277 | static int usermodehelper_disabled = 1; |
| 255 | 278 | ||
| 256 | /* Number of helpers running */ | 279 | /* Number of helpers running */ |
| 257 | static atomic_t running_helpers = ATOMIC_INIT(0); | 280 | static atomic_t running_helpers = ATOMIC_INIT(0); |
| @@ -301,6 +324,15 @@ void usermodehelper_enable(void) | |||
| 301 | usermodehelper_disabled = 0; | 324 | usermodehelper_disabled = 0; |
| 302 | } | 325 | } |
| 303 | 326 | ||
| 327 | /** | ||
| 328 | * usermodehelper_is_disabled - check if new helpers are allowed to be started | ||
| 329 | */ | ||
| 330 | bool usermodehelper_is_disabled(void) | ||
| 331 | { | ||
| 332 | return usermodehelper_disabled; | ||
| 333 | } | ||
| 334 | EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); | ||
| 335 | |||
| 304 | static void helper_lock(void) | 336 | static void helper_lock(void) |
| 305 | { | 337 | { |
| 306 | atomic_inc(&running_helpers); | 338 | atomic_inc(&running_helpers); |
| @@ -312,12 +344,6 @@ static void helper_unlock(void) | |||
| 312 | if (atomic_dec_and_test(&running_helpers)) | 344 | if (atomic_dec_and_test(&running_helpers)) |
| 313 | wake_up(&running_helpers_waitq); | 345 | wake_up(&running_helpers_waitq); |
| 314 | } | 346 | } |
| 315 | #else /* CONFIG_PM_SLEEP */ | ||
| 316 | #define usermodehelper_disabled 0 | ||
| 317 | |||
| 318 | static inline void helper_lock(void) {} | ||
| 319 | static inline void helper_unlock(void) {} | ||
| 320 | #endif /* CONFIG_PM_SLEEP */ | ||
| 321 | 347 | ||
| 322 | /** | 348 | /** |
| 323 | * call_usermodehelper_setup - prepare to call a usermode helper | 349 | * call_usermodehelper_setup - prepare to call a usermode helper |
| @@ -364,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); | |||
| 364 | * context in which call_usermodehelper_exec is called. | 390 | * context in which call_usermodehelper_exec is called. |
| 365 | */ | 391 | */ |
| 366 | void call_usermodehelper_setfns(struct subprocess_info *info, | 392 | void call_usermodehelper_setfns(struct subprocess_info *info, |
| 367 | int (*init)(struct subprocess_info *info), | 393 | int (*init)(struct subprocess_info *info, struct cred *new), |
| 368 | void (*cleanup)(struct subprocess_info *info), | 394 | void (*cleanup)(struct subprocess_info *info), |
| 369 | void *data) | 395 | void *data) |
| 370 | { | 396 | { |
| @@ -418,6 +444,84 @@ unlock: | |||
| 418 | } | 444 | } |
| 419 | EXPORT_SYMBOL(call_usermodehelper_exec); | 445 | EXPORT_SYMBOL(call_usermodehelper_exec); |
| 420 | 446 | ||
| 447 | static int proc_cap_handler(struct ctl_table *table, int write, | ||
| 448 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 449 | { | ||
| 450 | struct ctl_table t; | ||
| 451 | unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; | ||
| 452 | kernel_cap_t new_cap; | ||
| 453 | int err, i; | ||
| 454 | |||
| 455 | if (write && (!capable(CAP_SETPCAP) || | ||
| 456 | !capable(CAP_SYS_MODULE))) | ||
| 457 | return -EPERM; | ||
| 458 | |||
| 459 | /* | ||
| 460 | * convert from the global kernel_cap_t to the ulong array to print to | ||
| 461 | * userspace if this is a read. | ||
| 462 | */ | ||
| 463 | spin_lock(&umh_sysctl_lock); | ||
| 464 | for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { | ||
| 465 | if (table->data == CAP_BSET) | ||
| 466 | cap_array[i] = usermodehelper_bset.cap[i]; | ||
| 467 | else if (table->data == CAP_PI) | ||
| 468 | cap_array[i] = usermodehelper_inheritable.cap[i]; | ||
| 469 | else | ||
| 470 | BUG(); | ||
| 471 | } | ||
| 472 | spin_unlock(&umh_sysctl_lock); | ||
| 473 | |||
| 474 | t = *table; | ||
| 475 | t.data = &cap_array; | ||
| 476 | |||
| 477 | /* | ||
| 478 | * actually read or write an array of ulongs from userspace. Remember | ||
| 479 | * these are least significant 32 bits first | ||
| 480 | */ | ||
| 481 | err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); | ||
| 482 | if (err < 0) | ||
| 483 | return err; | ||
| 484 | |||
| 485 | /* | ||
| 486 | * convert from the sysctl array of ulongs to the kernel_cap_t | ||
| 487 | * internal representation | ||
| 488 | */ | ||
| 489 | for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) | ||
| 490 | new_cap.cap[i] = cap_array[i]; | ||
| 491 | |||
| 492 | /* | ||
| 493 | * Drop everything not in the new_cap (but don't add things) | ||
| 494 | */ | ||
| 495 | spin_lock(&umh_sysctl_lock); | ||
| 496 | if (write) { | ||
| 497 | if (table->data == CAP_BSET) | ||
| 498 | usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); | ||
| 499 | if (table->data == CAP_PI) | ||
| 500 | usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); | ||
| 501 | } | ||
| 502 | spin_unlock(&umh_sysctl_lock); | ||
| 503 | |||
| 504 | return 0; | ||
| 505 | } | ||
| 506 | |||
| 507 | struct ctl_table usermodehelper_table[] = { | ||
| 508 | { | ||
| 509 | .procname = "bset", | ||
| 510 | .data = CAP_BSET, | ||
| 511 | .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), | ||
| 512 | .mode = 0600, | ||
| 513 | .proc_handler = proc_cap_handler, | ||
| 514 | }, | ||
| 515 | { | ||
| 516 | .procname = "inheritable", | ||
| 517 | .data = CAP_PI, | ||
| 518 | .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), | ||
| 519 | .mode = 0600, | ||
| 520 | .proc_handler = proc_cap_handler, | ||
| 521 | }, | ||
| 522 | { } | ||
| 523 | }; | ||
| 524 | |||
| 421 | void __init usermodehelper_init(void) | 525 | void __init usermodehelper_init(void) |
| 422 | { | 526 | { |
| 423 | khelper_wq = create_singlethread_workqueue("khelper"); | 527 | khelper_wq = create_singlethread_workqueue("khelper"); |
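With the signature change above, the init callback now receives the prepared credentials before they are committed, so callers can restrict the helper further. A hedged sketch of a caller adapting to the two-argument callback; the helper path, environment and the cap_lower() tweak are illustrative assumptions, not part of the patch.

/* Sketch: /sbin/myhelper, the env and the CAP_SYS_ADMIN drop are assumptions. */
#include <linux/kmod.h>
#include <linux/cred.h>
#include <linux/capability.h>

static int my_umh_init(struct subprocess_info *info, struct cred *new)
{
	/* runs in the helper task, just before commit_creds(new) */
	cap_lower(new->cap_bset, CAP_SYS_ADMIN);
	return 0;
}

static int run_my_helper(void)
{
	char *argv[] = { "/sbin/myhelper", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
	if (!info)
		return -ENOMEM;
	call_usermodehelper_setfns(info, my_umh_init, NULL, NULL);
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}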
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 77981813a1e7..b30fd54eb985 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -1255,19 +1255,29 @@ static int __kprobes in_kprobes_functions(unsigned long addr) | |||
| 1255 | /* | 1255 | /* |
| 1256 | * If we have a symbol_name argument, look it up and add the offset field | 1256 | * If we have a symbol_name argument, look it up and add the offset field |
| 1257 | * to it. This way, we can specify a relative address to a symbol. | 1257 | * to it. This way, we can specify a relative address to a symbol. |
| 1258 | * This returns encoded errors if it fails to look up symbol or invalid | ||
| 1259 | * combination of parameters. | ||
| 1258 | */ | 1260 | */ |
| 1259 | static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) | 1261 | static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) |
| 1260 | { | 1262 | { |
| 1261 | kprobe_opcode_t *addr = p->addr; | 1263 | kprobe_opcode_t *addr = p->addr; |
| 1264 | |||
| 1265 | if ((p->symbol_name && p->addr) || | ||
| 1266 | (!p->symbol_name && !p->addr)) | ||
| 1267 | goto invalid; | ||
| 1268 | |||
| 1262 | if (p->symbol_name) { | 1269 | if (p->symbol_name) { |
| 1263 | if (addr) | ||
| 1264 | return NULL; | ||
| 1265 | kprobe_lookup_name(p->symbol_name, addr); | 1270 | kprobe_lookup_name(p->symbol_name, addr); |
| 1271 | if (!addr) | ||
| 1272 | return ERR_PTR(-ENOENT); | ||
| 1266 | } | 1273 | } |
| 1267 | 1274 | ||
| 1268 | if (!addr) | 1275 | addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); |
| 1269 | return NULL; | 1276 | if (addr) |
| 1270 | return (kprobe_opcode_t *)(((char *)addr) + p->offset); | 1277 | return addr; |
| 1278 | |||
| 1279 | invalid: | ||
| 1280 | return ERR_PTR(-EINVAL); | ||
| 1271 | } | 1281 | } |
| 1272 | 1282 | ||
| 1273 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ | 1283 | /* Check passed kprobe is valid and return kprobe in kprobe_table. */ |
| @@ -1311,8 +1321,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 1311 | kprobe_opcode_t *addr; | 1321 | kprobe_opcode_t *addr; |
| 1312 | 1322 | ||
| 1313 | addr = kprobe_addr(p); | 1323 | addr = kprobe_addr(p); |
| 1314 | if (!addr) | 1324 | if (IS_ERR(addr)) |
| 1315 | return -EINVAL; | 1325 | return PTR_ERR(addr); |
| 1316 | p->addr = addr; | 1326 | p->addr = addr; |
| 1317 | 1327 | ||
| 1318 | ret = check_kprobe_rereg(p); | 1328 | ret = check_kprobe_rereg(p); |
| @@ -1335,6 +1345,8 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 1335 | */ | 1345 | */ |
| 1336 | probed_mod = __module_text_address((unsigned long) p->addr); | 1346 | probed_mod = __module_text_address((unsigned long) p->addr); |
| 1337 | if (probed_mod) { | 1347 | if (probed_mod) { |
| 1348 | /* Return -ENOENT if fail. */ | ||
| 1349 | ret = -ENOENT; | ||
| 1338 | /* | 1350 | /* |
| 1339 | * We must hold a refcount of the probed module while updating | 1351 | * We must hold a refcount of the probed module while updating |
| 1340 | * its code to prohibit unexpected unloading. | 1352 | * its code to prohibit unexpected unloading. |
| @@ -1351,6 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 1351 | module_put(probed_mod); | 1363 | module_put(probed_mod); |
| 1352 | goto fail_with_jump_label; | 1364 | goto fail_with_jump_label; |
| 1353 | } | 1365 | } |
| 1366 | /* ret will be updated by following code */ | ||
| 1354 | } | 1367 | } |
| 1355 | preempt_enable(); | 1368 | preempt_enable(); |
| 1356 | jump_label_unlock(); | 1369 | jump_label_unlock(); |
| @@ -1399,7 +1412,7 @@ out: | |||
| 1399 | fail_with_jump_label: | 1412 | fail_with_jump_label: |
| 1400 | preempt_enable(); | 1413 | preempt_enable(); |
| 1401 | jump_label_unlock(); | 1414 | jump_label_unlock(); |
| 1402 | return -EINVAL; | 1415 | return ret; |
| 1403 | } | 1416 | } |
| 1404 | EXPORT_SYMBOL_GPL(register_kprobe); | 1417 | EXPORT_SYMBOL_GPL(register_kprobe); |
| 1405 | 1418 | ||
| @@ -1686,8 +1699,8 @@ int __kprobes register_kretprobe(struct kretprobe *rp) | |||
| 1686 | 1699 | ||
| 1687 | if (kretprobe_blacklist_size) { | 1700 | if (kretprobe_blacklist_size) { |
| 1688 | addr = kprobe_addr(&rp->kp); | 1701 | addr = kprobe_addr(&rp->kp); |
| 1689 | if (!addr) | 1702 | if (IS_ERR(addr)) |
| 1690 | return -EINVAL; | 1703 | return PTR_ERR(addr); |
| 1691 | 1704 | ||
| 1692 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { | 1705 | for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { |
| 1693 | if (kretprobe_blacklist[i].addr == addr) | 1706 | if (kretprobe_blacklist[i].addr == addr) |
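The error-path rework is visible to kprobe users: register_kprobe() and register_kretprobe() now report -ENOENT when a symbol cannot be resolved and -EINVAL when symbol_name and addr are combined incorrectly, instead of a blanket -EINVAL. A small sketch; the probed symbol and handler below are assumptions for illustration.

#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;			/* let the probed instruction run */
}

static struct kprobe kp = {
	.symbol_name = "do_fork",	/* assumed symbol */
	.pre_handler = my_pre,
};

static int __init my_probe_init(void)
{
	int ret = register_kprobe(&kp);

	/* ret == -ENOENT: symbol not found; ret == -EINVAL: both or
	 * neither of .symbol_name and .addr were supplied */
	return ret;
}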
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0b624e791805..3b053c04dd86 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/kexec.h> | 16 | #include <linux/kexec.h> |
| 17 | #include <linux/profile.h> | 17 | #include <linux/profile.h> |
| 18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
| 19 | #include <linux/capability.h> | ||
| 19 | 20 | ||
| 20 | #define KERNEL_ATTR_RO(_name) \ | 21 | #define KERNEL_ATTR_RO(_name) \ |
| 21 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | 22 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
| @@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo); | |||
| 131 | 132 | ||
| 132 | #endif /* CONFIG_KEXEC */ | 133 | #endif /* CONFIG_KEXEC */ |
| 133 | 134 | ||
| 135 | /* whether file capabilities are enabled */ | ||
| 136 | static ssize_t fscaps_show(struct kobject *kobj, | ||
| 137 | struct kobj_attribute *attr, char *buf) | ||
| 138 | { | ||
| 139 | return sprintf(buf, "%d\n", file_caps_enabled); | ||
| 140 | } | ||
| 141 | KERNEL_ATTR_RO(fscaps); | ||
| 142 | |||
| 134 | /* | 143 | /* |
| 135 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. | 144 | * Make /sys/kernel/notes give the raw contents of our kernel .notes section. |
| 136 | */ | 145 | */ |
| @@ -158,6 +167,7 @@ struct kobject *kernel_kobj; | |||
| 158 | EXPORT_SYMBOL_GPL(kernel_kobj); | 167 | EXPORT_SYMBOL_GPL(kernel_kobj); |
| 159 | 168 | ||
| 160 | static struct attribute * kernel_attrs[] = { | 169 | static struct attribute * kernel_attrs[] = { |
| 170 | &fscaps_attr.attr, | ||
| 161 | #if defined(CONFIG_HOTPLUG) | 171 | #if defined(CONFIG_HOTPLUG) |
| 162 | &uevent_seqnum_attr.attr, | 172 | &uevent_seqnum_attr.attr, |
| 163 | &uevent_helper_attr.attr, | 173 | &uevent_helper_attr.attr, |
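The new attribute shows up as a read-only sysfs file. A userspace sketch of reading it (the path follows directly from the patch; error handling is minimal):

#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/sys/kernel/fscaps", "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("file capabilities enabled: %s", buf);
	if (f)
		fclose(f);
	return 0;
}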
diff --git a/kernel/kthread.c b/kernel/kthread.c index 3b34d2732bce..4ba7cccb4994 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) | |||
| 202 | return; | 202 | return; |
| 203 | } | 203 | } |
| 204 | 204 | ||
| 205 | p->cpus_allowed = cpumask_of_cpu(cpu); | 205 | /* It's safe because the task is inactive. */ |
| 206 | p->rt.nr_cpus_allowed = 1; | 206 | do_set_cpus_allowed(p, cpumask_of(cpu)); |
| 207 | p->flags |= PF_THREAD_BOUND; | 207 | p->flags |= PF_THREAD_BOUND; |
| 208 | } | 208 | } |
| 209 | EXPORT_SYMBOL(kthread_bind); | 209 | EXPORT_SYMBOL(kthread_bind); |
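kthread_bind() keeps its contract (only valid on a task that has not started running); it simply routes the cpumask update through do_set_cpus_allowed() now. The usual create/bind/wake pattern, with a hypothetical thread function:

#include <linux/kthread.h>
#include <linux/err.h>

/* my_thread_fn(void *data) is a hypothetical int-returning thread function */

static int start_worker_on(unsigned int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, NULL, "my_worker/%u", cpu);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	kthread_bind(tsk, cpu);		/* safe: task is still inactive */
	wake_up_process(tsk);
	return 0;
}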
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 53a68956f131..8c24294e477f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) | |||
| 490 | usage[i] = '\0'; | 490 | usage[i] = '\0'; |
| 491 | } | 491 | } |
| 492 | 492 | ||
| 493 | static int __print_lock_name(struct lock_class *class) | ||
| 494 | { | ||
| 495 | char str[KSYM_NAME_LEN]; | ||
| 496 | const char *name; | ||
| 497 | |||
| 498 | name = class->name; | ||
| 499 | if (!name) | ||
| 500 | name = __get_key_name(class->key, str); | ||
| 501 | |||
| 502 | return printk("%s", name); | ||
| 503 | } | ||
| 504 | |||
| 493 | static void print_lock_name(struct lock_class *class) | 505 | static void print_lock_name(struct lock_class *class) |
| 494 | { | 506 | { |
| 495 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; | 507 | char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; |
| @@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth) | |||
| 1053 | return 0; | 1065 | return 0; |
| 1054 | } | 1066 | } |
| 1055 | 1067 | ||
| 1068 | static void | ||
| 1069 | print_circular_lock_scenario(struct held_lock *src, | ||
| 1070 | struct held_lock *tgt, | ||
| 1071 | struct lock_list *prt) | ||
| 1072 | { | ||
| 1073 | struct lock_class *source = hlock_class(src); | ||
| 1074 | struct lock_class *target = hlock_class(tgt); | ||
| 1075 | struct lock_class *parent = prt->class; | ||
| 1076 | |||
| 1077 | /* | ||
| 1078 | * A direct locking problem where unsafe_class lock is taken | ||
| 1079 | * directly by safe_class lock, then all we need to show | ||
| 1080 | * is the deadlock scenario, as it is obvious that the | ||
| 1081 | * unsafe lock is taken under the safe lock. | ||
| 1082 | * | ||
| 1083 | * But if there is a chain instead, where the safe lock takes | ||
| 1084 | * an intermediate lock (middle_class) where this lock is | ||
| 1085 | * not the same as the safe lock, then the lock chain is | ||
| 1086 | * used to describe the problem. Otherwise we would need | ||
| 1087 | * to show a different CPU case for each link in the chain | ||
| 1088 | * from the safe_class lock to the unsafe_class lock. | ||
| 1089 | */ | ||
| 1090 | if (parent != source) { | ||
| 1091 | printk("Chain exists of:\n "); | ||
| 1092 | __print_lock_name(source); | ||
| 1093 | printk(" --> "); | ||
| 1094 | __print_lock_name(parent); | ||
| 1095 | printk(" --> "); | ||
| 1096 | __print_lock_name(target); | ||
| 1097 | printk("\n\n"); | ||
| 1098 | } | ||
| 1099 | |||
| 1100 | printk(" Possible unsafe locking scenario:\n\n"); | ||
| 1101 | printk(" CPU0 CPU1\n"); | ||
| 1102 | printk(" ---- ----\n"); | ||
| 1103 | printk(" lock("); | ||
| 1104 | __print_lock_name(target); | ||
| 1105 | printk(");\n"); | ||
| 1106 | printk(" lock("); | ||
| 1107 | __print_lock_name(parent); | ||
| 1108 | printk(");\n"); | ||
| 1109 | printk(" lock("); | ||
| 1110 | __print_lock_name(target); | ||
| 1111 | printk(");\n"); | ||
| 1112 | printk(" lock("); | ||
| 1113 | __print_lock_name(source); | ||
| 1114 | printk(");\n"); | ||
| 1115 | printk("\n *** DEADLOCK ***\n\n"); | ||
| 1116 | } | ||
| 1117 | |||
| 1056 | /* | 1118 | /* |
| 1057 | * When a circular dependency is detected, print the | 1119 | * When a circular dependency is detected, print the |
| 1058 | * header first: | 1120 | * header first: |
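The scenario block produced by the new helper corresponds to the classic ABBA pattern; a hedged illustration with two hypothetical mutexes that would trigger the circular-dependency report:

#include <linux/mutex.h>

static DEFINE_MUTEX(lock_a);	/* hypothetical locks */
static DEFINE_MUTEX(lock_b);

static void cpu0_path(void)
{
	mutex_lock(&lock_a);
	mutex_lock(&lock_b);		/* records A -> B */
	mutex_unlock(&lock_b);
	mutex_unlock(&lock_a);
}

static void cpu1_path(void)
{
	mutex_lock(&lock_b);
	mutex_lock(&lock_a);		/* B -> A closes the cycle */
	mutex_unlock(&lock_a);
	mutex_unlock(&lock_b);
}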
| @@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
| 1096 | { | 1158 | { |
| 1097 | struct task_struct *curr = current; | 1159 | struct task_struct *curr = current; |
| 1098 | struct lock_list *parent; | 1160 | struct lock_list *parent; |
| 1161 | struct lock_list *first_parent; | ||
| 1099 | int depth; | 1162 | int depth; |
| 1100 | 1163 | ||
| 1101 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1164 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| @@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
| 1109 | print_circular_bug_header(target, depth, check_src, check_tgt); | 1172 | print_circular_bug_header(target, depth, check_src, check_tgt); |
| 1110 | 1173 | ||
| 1111 | parent = get_lock_parent(target); | 1174 | parent = get_lock_parent(target); |
| 1175 | first_parent = parent; | ||
| 1112 | 1176 | ||
| 1113 | while (parent) { | 1177 | while (parent) { |
| 1114 | print_circular_bug_entry(parent, --depth); | 1178 | print_circular_bug_entry(parent, --depth); |
| @@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
| 1116 | } | 1180 | } |
| 1117 | 1181 | ||
| 1118 | printk("\nother info that might help us debug this:\n\n"); | 1182 | printk("\nother info that might help us debug this:\n\n"); |
| 1183 | print_circular_lock_scenario(check_src, check_tgt, | ||
| 1184 | first_parent); | ||
| 1185 | |||
| 1119 | lockdep_print_held_locks(curr); | 1186 | lockdep_print_held_locks(curr); |
| 1120 | 1187 | ||
| 1121 | printk("\nstack backtrace:\n"); | 1188 | printk("\nstack backtrace:\n"); |
| @@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
| 1314 | printk("\n"); | 1381 | printk("\n"); |
| 1315 | 1382 | ||
| 1316 | if (depth == 0 && (entry != root)) { | 1383 | if (depth == 0 && (entry != root)) { |
| 1317 | printk("lockdep:%s bad BFS generated tree\n", __func__); | 1384 | printk("lockdep:%s bad path found in chain graph\n", __func__); |
| 1318 | break; | 1385 | break; |
| 1319 | } | 1386 | } |
| 1320 | 1387 | ||
| @@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
| 1325 | return; | 1392 | return; |
| 1326 | } | 1393 | } |
| 1327 | 1394 | ||
| 1395 | static void | ||
| 1396 | print_irq_lock_scenario(struct lock_list *safe_entry, | ||
| 1397 | struct lock_list *unsafe_entry, | ||
| 1398 | struct lock_class *prev_class, | ||
| 1399 | struct lock_class *next_class) | ||
| 1400 | { | ||
| 1401 | struct lock_class *safe_class = safe_entry->class; | ||
| 1402 | struct lock_class *unsafe_class = unsafe_entry->class; | ||
| 1403 | struct lock_class *middle_class = prev_class; | ||
| 1404 | |||
| 1405 | if (middle_class == safe_class) | ||
| 1406 | middle_class = next_class; | ||
| 1407 | |||
| 1408 | /* | ||
| 1409 | * A direct locking problem where unsafe_class lock is taken | ||
| 1410 | * directly by safe_class lock, then all we need to show | ||
| 1411 | * is the deadlock scenario, as it is obvious that the | ||
| 1412 | * unsafe lock is taken under the safe lock. | ||
| 1413 | * | ||
| 1414 | * But if there is a chain instead, where the safe lock takes | ||
| 1415 | * an intermediate lock (middle_class) where this lock is | ||
| 1416 | * not the same as the safe lock, then the lock chain is | ||
| 1417 | * used to describe the problem. Otherwise we would need | ||
| 1418 | * to show a different CPU case for each link in the chain | ||
| 1419 | * from the safe_class lock to the unsafe_class lock. | ||
| 1420 | */ | ||
| 1421 | if (middle_class != unsafe_class) { | ||
| 1422 | printk("Chain exists of:\n "); | ||
| 1423 | __print_lock_name(safe_class); | ||
| 1424 | printk(" --> "); | ||
| 1425 | __print_lock_name(middle_class); | ||
| 1426 | printk(" --> "); | ||
| 1427 | __print_lock_name(unsafe_class); | ||
| 1428 | printk("\n\n"); | ||
| 1429 | } | ||
| 1430 | |||
| 1431 | printk(" Possible interrupt unsafe locking scenario:\n\n"); | ||
| 1432 | printk(" CPU0 CPU1\n"); | ||
| 1433 | printk(" ---- ----\n"); | ||
| 1434 | printk(" lock("); | ||
| 1435 | __print_lock_name(unsafe_class); | ||
| 1436 | printk(");\n"); | ||
| 1437 | printk(" local_irq_disable();\n"); | ||
| 1438 | printk(" lock("); | ||
| 1439 | __print_lock_name(safe_class); | ||
| 1440 | printk(");\n"); | ||
| 1441 | printk(" lock("); | ||
| 1442 | __print_lock_name(middle_class); | ||
| 1443 | printk(");\n"); | ||
| 1444 | printk(" <Interrupt>\n"); | ||
| 1445 | printk(" lock("); | ||
| 1446 | __print_lock_name(safe_class); | ||
| 1447 | printk(");\n"); | ||
| 1448 | printk("\n *** DEADLOCK ***\n\n"); | ||
| 1449 | } | ||
| 1450 | |||
| 1328 | static int | 1451 | static int |
| 1329 | print_bad_irq_dependency(struct task_struct *curr, | 1452 | print_bad_irq_dependency(struct task_struct *curr, |
| 1330 | struct lock_list *prev_root, | 1453 | struct lock_list *prev_root, |
| @@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 1376 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); | 1499 | print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); |
| 1377 | 1500 | ||
| 1378 | printk("\nother info that might help us debug this:\n\n"); | 1501 | printk("\nother info that might help us debug this:\n\n"); |
| 1502 | print_irq_lock_scenario(backwards_entry, forwards_entry, | ||
| 1503 | hlock_class(prev), hlock_class(next)); | ||
| 1504 | |||
| 1379 | lockdep_print_held_locks(curr); | 1505 | lockdep_print_held_locks(curr); |
| 1380 | 1506 | ||
| 1381 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); | 1507 | printk("\nthe dependencies between %s-irq-safe lock", irqclass); |
| @@ -1539,6 +1665,26 @@ static inline void inc_chains(void) | |||
| 1539 | 1665 | ||
| 1540 | #endif | 1666 | #endif |
| 1541 | 1667 | ||
| 1668 | static void | ||
| 1669 | print_deadlock_scenario(struct held_lock *nxt, | ||
| 1670 | struct held_lock *prv) | ||
| 1671 | { | ||
| 1672 | struct lock_class *next = hlock_class(nxt); | ||
| 1673 | struct lock_class *prev = hlock_class(prv); | ||
| 1674 | |||
| 1675 | printk(" Possible unsafe locking scenario:\n\n"); | ||
| 1676 | printk(" CPU0\n"); | ||
| 1677 | printk(" ----\n"); | ||
| 1678 | printk(" lock("); | ||
| 1679 | __print_lock_name(prev); | ||
| 1680 | printk(");\n"); | ||
| 1681 | printk(" lock("); | ||
| 1682 | __print_lock_name(next); | ||
| 1683 | printk(");\n"); | ||
| 1684 | printk("\n *** DEADLOCK ***\n\n"); | ||
| 1685 | printk(" May be due to missing lock nesting notation\n\n"); | ||
| 1686 | } | ||
| 1687 | |||
| 1542 | static int | 1688 | static int |
| 1543 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 1689 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
| 1544 | struct held_lock *next) | 1690 | struct held_lock *next) |
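The "missing lock nesting notation" hint refers to taking two locks of the same lock class, which the conventional annotation resolves with an explicit subclass. A sketch under that assumption; struct foo and its lock are made up for illustration:

#include <linux/mutex.h>
#include <linux/lockdep.h>

struct foo {
	struct mutex lock;
};

static void lock_parent_then_child(struct foo *parent, struct foo *child)
{
	mutex_lock(&parent->lock);
	/* plain mutex_lock(&child->lock) would look like the scenario above */
	mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);

	mutex_unlock(&child->lock);
	mutex_unlock(&parent->lock);
}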
| @@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
| 1557 | print_lock(prev); | 1703 | print_lock(prev); |
| 1558 | 1704 | ||
| 1559 | printk("\nother info that might help us debug this:\n"); | 1705 | printk("\nother info that might help us debug this:\n"); |
| 1706 | print_deadlock_scenario(next, prev); | ||
| 1560 | lockdep_print_held_locks(curr); | 1707 | lockdep_print_held_locks(curr); |
| 1561 | 1708 | ||
| 1562 | printk("\nstack backtrace:\n"); | 1709 | printk("\nstack backtrace:\n"); |
| @@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, | |||
| 1826 | struct list_head *hash_head = chainhashentry(chain_key); | 1973 | struct list_head *hash_head = chainhashentry(chain_key); |
| 1827 | struct lock_chain *chain; | 1974 | struct lock_chain *chain; |
| 1828 | struct held_lock *hlock_curr, *hlock_next; | 1975 | struct held_lock *hlock_curr, *hlock_next; |
| 1829 | int i, j, n, cn; | 1976 | int i, j; |
| 1830 | 1977 | ||
| 1831 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 1978 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
| 1832 | return 0; | 1979 | return 0; |
| @@ -1886,15 +2033,9 @@ cache_hit: | |||
| 1886 | } | 2033 | } |
| 1887 | i++; | 2034 | i++; |
| 1888 | chain->depth = curr->lockdep_depth + 1 - i; | 2035 | chain->depth = curr->lockdep_depth + 1 - i; |
| 1889 | cn = nr_chain_hlocks; | 2036 | if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { |
| 1890 | while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) { | 2037 | chain->base = nr_chain_hlocks; |
| 1891 | n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth); | 2038 | nr_chain_hlocks += chain->depth; |
| 1892 | if (n == cn) | ||
| 1893 | break; | ||
| 1894 | cn = n; | ||
| 1895 | } | ||
| 1896 | if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { | ||
| 1897 | chain->base = cn; | ||
| 1898 | for (j = 0; j < chain->depth - 1; j++, i++) { | 2039 | for (j = 0; j < chain->depth - 1; j++, i++) { |
| 1899 | int lock_id = curr->held_locks[i].class_idx - 1; | 2040 | int lock_id = curr->held_locks[i].class_idx - 1; |
| 1900 | chain_hlocks[chain->base + j] = lock_id; | 2041 | chain_hlocks[chain->base + j] = lock_id; |
| @@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr) | |||
| 2011 | #endif | 2152 | #endif |
| 2012 | } | 2153 | } |
| 2013 | 2154 | ||
| 2155 | static void | ||
| 2156 | print_usage_bug_scenario(struct held_lock *lock) | ||
| 2157 | { | ||
| 2158 | struct lock_class *class = hlock_class(lock); | ||
| 2159 | |||
| 2160 | printk(" Possible unsafe locking scenario:\n\n"); | ||
| 2161 | printk(" CPU0\n"); | ||
| 2162 | printk(" ----\n"); | ||
| 2163 | printk(" lock("); | ||
| 2164 | __print_lock_name(class); | ||
| 2165 | printk(");\n"); | ||
| 2166 | printk(" <Interrupt>\n"); | ||
| 2167 | printk(" lock("); | ||
| 2168 | __print_lock_name(class); | ||
| 2169 | printk(");\n"); | ||
| 2170 | printk("\n *** DEADLOCK ***\n\n"); | ||
| 2171 | } | ||
| 2172 | |||
| 2014 | static int | 2173 | static int |
| 2015 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 2174 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
| 2016 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 2175 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
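The usage-bug scenario is the single-lock interrupt inversion: the same lock taken in process context without disabling interrupts and again from an interrupt handler. A sketch of the pattern and the conventional _irqsave fix; the lock and handler are hypothetical:

#include <linux/spinlock.h>
#include <linux/interrupt.h>

static DEFINE_SPINLOCK(stats_lock);	/* hypothetical */

static void process_context_update(void)
{
	unsigned long flags;

	/* a plain spin_lock() here is what the report above describes */
	spin_lock_irqsave(&stats_lock, flags);
	/* ... update stats ... */
	spin_unlock_irqrestore(&stats_lock, flags);
}

static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	spin_lock(&stats_lock);
	/* ... update stats ... */
	spin_unlock(&stats_lock);
	return IRQ_HANDLED;
}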
| @@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
| 2039 | 2198 | ||
| 2040 | print_irqtrace_events(curr); | 2199 | print_irqtrace_events(curr); |
| 2041 | printk("\nother info that might help us debug this:\n"); | 2200 | printk("\nother info that might help us debug this:\n"); |
| 2201 | print_usage_bug_scenario(this); | ||
| 2202 | |||
| 2042 | lockdep_print_held_locks(curr); | 2203 | lockdep_print_held_locks(curr); |
| 2043 | 2204 | ||
| 2044 | printk("\nstack backtrace:\n"); | 2205 | printk("\nstack backtrace:\n"); |
| @@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
| 2073 | struct held_lock *this, int forwards, | 2234 | struct held_lock *this, int forwards, |
| 2074 | const char *irqclass) | 2235 | const char *irqclass) |
| 2075 | { | 2236 | { |
| 2237 | struct lock_list *entry = other; | ||
| 2238 | struct lock_list *middle = NULL; | ||
| 2239 | int depth; | ||
| 2240 | |||
| 2076 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2241 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 2077 | return 0; | 2242 | return 0; |
| 2078 | 2243 | ||
| @@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
| 2091 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); | 2256 | printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); |
| 2092 | 2257 | ||
| 2093 | printk("\nother info that might help us debug this:\n"); | 2258 | printk("\nother info that might help us debug this:\n"); |
| 2259 | |||
| 2260 | /* Find a middle lock (if one exists) */ | ||
| 2261 | depth = get_lock_depth(other); | ||
| 2262 | do { | ||
| 2263 | if (depth == 0 && (entry != root)) { | ||
| 2264 | printk("lockdep:%s bad path found in chain graph\n", __func__); | ||
| 2265 | break; | ||
| 2266 | } | ||
| 2267 | middle = entry; | ||
| 2268 | entry = get_lock_parent(entry); | ||
| 2269 | depth--; | ||
| 2270 | } while (entry && entry != root && (depth >= 0)); | ||
| 2271 | if (forwards) | ||
| 2272 | print_irq_lock_scenario(root, other, | ||
| 2273 | middle ? middle->class : root->class, other->class); | ||
| 2274 | else | ||
| 2275 | print_irq_lock_scenario(other, root, | ||
| 2276 | middle ? middle->class : other->class, root->class); | ||
| 2277 | |||
| 2094 | lockdep_print_held_locks(curr); | 2278 | lockdep_print_held_locks(curr); |
| 2095 | 2279 | ||
| 2096 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); | 2280 | printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); |
| @@ -2284,6 +2468,9 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
| 2284 | 2468 | ||
| 2285 | BUG_ON(usage_bit >= LOCK_USAGE_STATES); | 2469 | BUG_ON(usage_bit >= LOCK_USAGE_STATES); |
| 2286 | 2470 | ||
| 2471 | if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) | ||
| 2472 | continue; | ||
| 2473 | |||
| 2287 | if (!mark_lock(curr, hlock, usage_bit)) | 2474 | if (!mark_lock(curr, hlock, usage_bit)) |
| 2288 | return 0; | 2475 | return 0; |
| 2289 | } | 2476 | } |
| @@ -2294,34 +2481,13 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) | |||
| 2294 | /* | 2481 | /* |
| 2295 | * Hardirqs will be enabled: | 2482 | * Hardirqs will be enabled: |
| 2296 | */ | 2483 | */ |
| 2297 | void trace_hardirqs_on_caller(unsigned long ip) | 2484 | static void __trace_hardirqs_on_caller(unsigned long ip) |
| 2298 | { | 2485 | { |
| 2299 | struct task_struct *curr = current; | 2486 | struct task_struct *curr = current; |
| 2300 | 2487 | ||
| 2301 | time_hardirqs_on(CALLER_ADDR0, ip); | ||
| 2302 | |||
| 2303 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
| 2304 | return; | ||
| 2305 | |||
| 2306 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | ||
| 2307 | return; | ||
| 2308 | |||
| 2309 | if (unlikely(curr->hardirqs_enabled)) { | ||
| 2310 | /* | ||
| 2311 | * Neither irq nor preemption are disabled here | ||
| 2312 | * so this is racy by nature but losing one hit | ||
| 2313 | * in a stat is not a big deal. | ||
| 2314 | */ | ||
| 2315 | __debug_atomic_inc(redundant_hardirqs_on); | ||
| 2316 | return; | ||
| 2317 | } | ||
| 2318 | /* we'll do an OFF -> ON transition: */ | 2488 | /* we'll do an OFF -> ON transition: */ |
| 2319 | curr->hardirqs_enabled = 1; | 2489 | curr->hardirqs_enabled = 1; |
| 2320 | 2490 | ||
| 2321 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 2322 | return; | ||
| 2323 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | ||
| 2324 | return; | ||
| 2325 | /* | 2491 | /* |
| 2326 | * We are going to turn hardirqs on, so set the | 2492 | * We are going to turn hardirqs on, so set the |
| 2327 | * usage bit for all held locks: | 2493 | * usage bit for all held locks: |
| @@ -2341,6 +2507,37 @@ void trace_hardirqs_on_caller(unsigned long ip) | |||
| 2341 | curr->hardirq_enable_event = ++curr->irq_events; | 2507 | curr->hardirq_enable_event = ++curr->irq_events; |
| 2342 | debug_atomic_inc(hardirqs_on_events); | 2508 | debug_atomic_inc(hardirqs_on_events); |
| 2343 | } | 2509 | } |
| 2510 | |||
| 2511 | void trace_hardirqs_on_caller(unsigned long ip) | ||
| 2512 | { | ||
| 2513 | time_hardirqs_on(CALLER_ADDR0, ip); | ||
| 2514 | |||
| 2515 | if (unlikely(!debug_locks || current->lockdep_recursion)) | ||
| 2516 | return; | ||
| 2517 | |||
| 2518 | if (unlikely(current->hardirqs_enabled)) { | ||
| 2519 | /* | ||
| 2520 | * Neither irq nor preemption are disabled here | ||
| 2521 | * so this is racy by nature but losing one hit | ||
| 2522 | * in a stat is not a big deal. | ||
| 2523 | */ | ||
| 2524 | __debug_atomic_inc(redundant_hardirqs_on); | ||
| 2525 | return; | ||
| 2526 | } | ||
| 2527 | |||
| 2528 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | ||
| 2529 | return; | ||
| 2530 | |||
| 2531 | if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) | ||
| 2532 | return; | ||
| 2533 | |||
| 2534 | if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) | ||
| 2535 | return; | ||
| 2536 | |||
| 2537 | current->lockdep_recursion = 1; | ||
| 2538 | __trace_hardirqs_on_caller(ip); | ||
| 2539 | current->lockdep_recursion = 0; | ||
| 2540 | } | ||
| 2344 | EXPORT_SYMBOL(trace_hardirqs_on_caller); | 2541 | EXPORT_SYMBOL(trace_hardirqs_on_caller); |
| 2345 | 2542 | ||
| 2346 | void trace_hardirqs_on(void) | 2543 | void trace_hardirqs_on(void) |
| @@ -2390,7 +2587,7 @@ void trace_softirqs_on(unsigned long ip) | |||
| 2390 | { | 2587 | { |
| 2391 | struct task_struct *curr = current; | 2588 | struct task_struct *curr = current; |
| 2392 | 2589 | ||
| 2393 | if (unlikely(!debug_locks)) | 2590 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
| 2394 | return; | 2591 | return; |
| 2395 | 2592 | ||
| 2396 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2593 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
| @@ -2401,6 +2598,7 @@ void trace_softirqs_on(unsigned long ip) | |||
| 2401 | return; | 2598 | return; |
| 2402 | } | 2599 | } |
| 2403 | 2600 | ||
| 2601 | current->lockdep_recursion = 1; | ||
| 2404 | /* | 2602 | /* |
| 2405 | * We'll do an OFF -> ON transition: | 2603 | * We'll do an OFF -> ON transition: |
| 2406 | */ | 2604 | */ |
| @@ -2415,6 +2613,7 @@ void trace_softirqs_on(unsigned long ip) | |||
| 2415 | */ | 2613 | */ |
| 2416 | if (curr->hardirqs_enabled) | 2614 | if (curr->hardirqs_enabled) |
| 2417 | mark_held_locks(curr, SOFTIRQ); | 2615 | mark_held_locks(curr, SOFTIRQ); |
| 2616 | current->lockdep_recursion = 0; | ||
| 2418 | } | 2617 | } |
| 2419 | 2618 | ||
| 2420 | /* | 2619 | /* |
| @@ -2424,7 +2623,7 @@ void trace_softirqs_off(unsigned long ip) | |||
| 2424 | { | 2623 | { |
| 2425 | struct task_struct *curr = current; | 2624 | struct task_struct *curr = current; |
| 2426 | 2625 | ||
| 2427 | if (unlikely(!debug_locks)) | 2626 | if (unlikely(!debug_locks || current->lockdep_recursion)) |
| 2428 | return; | 2627 | return; |
| 2429 | 2628 | ||
| 2430 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 2629 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
| @@ -2675,10 +2874,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 2675 | void lockdep_init_map(struct lockdep_map *lock, const char *name, | 2874 | void lockdep_init_map(struct lockdep_map *lock, const char *name, |
| 2676 | struct lock_class_key *key, int subclass) | 2875 | struct lock_class_key *key, int subclass) |
| 2677 | { | 2876 | { |
| 2678 | int i; | 2877 | memset(lock, 0, sizeof(*lock)); |
| 2679 | |||
| 2680 | for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) | ||
| 2681 | lock->class_cache[i] = NULL; | ||
| 2682 | 2878 | ||
| 2683 | #ifdef CONFIG_LOCK_STAT | 2879 | #ifdef CONFIG_LOCK_STAT |
| 2684 | lock->cpu = raw_smp_processor_id(); | 2880 | lock->cpu = raw_smp_processor_id(); |
| @@ -3242,7 +3438,7 @@ int lock_is_held(struct lockdep_map *lock) | |||
| 3242 | int ret = 0; | 3438 | int ret = 0; |
| 3243 | 3439 | ||
| 3244 | if (unlikely(current->lockdep_recursion)) | 3440 | if (unlikely(current->lockdep_recursion)) |
| 3245 | return ret; | 3441 | return 1; /* avoid false negative lockdep_assert_held() */ |
| 3246 | 3442 | ||
| 3247 | raw_local_irq_save(flags); | 3443 | raw_local_irq_save(flags); |
| 3248 | check_flags(flags); | 3444 | check_flags(flags); |
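The lockdep.c hunks above split trace_hardirqs_on_caller() so the held-lock marking runs with current->lockdep_recursion set, and make lock_is_held() report the lock as held while lockdep itself is recursing, so lockdep_assert_held() cannot warn spuriously from those paths. A minimal sketch of the kind of assertion this protects; the structure and function are made up for illustration:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

struct foo {
	spinlock_t lock;
	unsigned long stats;
};

/* Caller is expected to hold f->lock. */
static void foo_update_stats(struct foo *f)
{
	/*
	 * Warns unless lockdep believes f->lock is held. With the
	 * lock_is_held() change above, being reached from lockdep's
	 * own (recursing) code no longer makes this fire by mistake.
	 */
	lockdep_assert_held(&f->lock);
	f->stats++;
}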
diff --git a/kernel/module.c b/kernel/module.c index d5938a5c19c4..04379f92f843 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -57,6 +57,7 @@ | |||
| 57 | #include <linux/kmemleak.h> | 57 | #include <linux/kmemleak.h> |
| 58 | #include <linux/jump_label.h> | 58 | #include <linux/jump_label.h> |
| 59 | #include <linux/pfn.h> | 59 | #include <linux/pfn.h> |
| 60 | #include <linux/bsearch.h> | ||
| 60 | 61 | ||
| 61 | #define CREATE_TRACE_POINTS | 62 | #define CREATE_TRACE_POINTS |
| 62 | #include <trace/events/module.h> | 63 | #include <trace/events/module.h> |
| @@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr, | |||
| 240 | struct module *owner, | 241 | struct module *owner, |
| 241 | bool (*fn)(const struct symsearch *syms, | 242 | bool (*fn)(const struct symsearch *syms, |
| 242 | struct module *owner, | 243 | struct module *owner, |
| 243 | unsigned int symnum, void *data), | 244 | void *data), |
| 244 | void *data) | 245 | void *data) |
| 245 | { | 246 | { |
| 246 | unsigned int i, j; | 247 | unsigned int j; |
| 247 | 248 | ||
| 248 | for (j = 0; j < arrsize; j++) { | 249 | for (j = 0; j < arrsize; j++) { |
| 249 | for (i = 0; i < arr[j].stop - arr[j].start; i++) | 250 | if (fn(&arr[j], owner, data)) |
| 250 | if (fn(&arr[j], owner, i, data)) | 251 | return true; |
| 251 | return true; | ||
| 252 | } | 252 | } |
| 253 | 253 | ||
| 254 | return false; | 254 | return false; |
| 255 | } | 255 | } |
| 256 | 256 | ||
| 257 | /* Returns true as soon as fn returns true, otherwise false. */ | 257 | /* Returns true as soon as fn returns true, otherwise false. */ |
| 258 | bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | 258 | bool each_symbol_section(bool (*fn)(const struct symsearch *arr, |
| 259 | unsigned int symnum, void *data), void *data) | 259 | struct module *owner, |
| 260 | void *data), | ||
| 261 | void *data) | ||
| 260 | { | 262 | { |
| 261 | struct module *mod; | 263 | struct module *mod; |
| 262 | static const struct symsearch arr[] = { | 264 | static const struct symsearch arr[] = { |
| @@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, | |||
| 309 | } | 311 | } |
| 310 | return false; | 312 | return false; |
| 311 | } | 313 | } |
| 312 | EXPORT_SYMBOL_GPL(each_symbol); | 314 | EXPORT_SYMBOL_GPL(each_symbol_section); |
| 313 | 315 | ||
| 314 | struct find_symbol_arg { | 316 | struct find_symbol_arg { |
| 315 | /* Input */ | 317 | /* Input */ |
| @@ -323,15 +325,12 @@ struct find_symbol_arg { | |||
| 323 | const struct kernel_symbol *sym; | 325 | const struct kernel_symbol *sym; |
| 324 | }; | 326 | }; |
| 325 | 327 | ||
| 326 | static bool find_symbol_in_section(const struct symsearch *syms, | 328 | static bool check_symbol(const struct symsearch *syms, |
| 327 | struct module *owner, | 329 | struct module *owner, |
| 328 | unsigned int symnum, void *data) | 330 | unsigned int symnum, void *data) |
| 329 | { | 331 | { |
| 330 | struct find_symbol_arg *fsa = data; | 332 | struct find_symbol_arg *fsa = data; |
| 331 | 333 | ||
| 332 | if (strcmp(syms->start[symnum].name, fsa->name) != 0) | ||
| 333 | return false; | ||
| 334 | |||
| 335 | if (!fsa->gplok) { | 334 | if (!fsa->gplok) { |
| 336 | if (syms->licence == GPL_ONLY) | 335 | if (syms->licence == GPL_ONLY) |
| 337 | return false; | 336 | return false; |
| @@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms, | |||
| 365 | return true; | 364 | return true; |
| 366 | } | 365 | } |
| 367 | 366 | ||
| 367 | static int cmp_name(const void *va, const void *vb) | ||
| 368 | { | ||
| 369 | const char *a; | ||
| 370 | const struct kernel_symbol *b; | ||
| 371 | a = va; b = vb; | ||
| 372 | return strcmp(a, b->name); | ||
| 373 | } | ||
| 374 | |||
| 375 | static bool find_symbol_in_section(const struct symsearch *syms, | ||
| 376 | struct module *owner, | ||
| 377 | void *data) | ||
| 378 | { | ||
| 379 | struct find_symbol_arg *fsa = data; | ||
| 380 | struct kernel_symbol *sym; | ||
| 381 | |||
| 382 | sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, | ||
| 383 | sizeof(struct kernel_symbol), cmp_name); | ||
| 384 | |||
| 385 | if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) | ||
| 386 | return true; | ||
| 387 | |||
| 388 | return false; | ||
| 389 | } | ||
| 390 | |||
| 368 | /* Find a symbol and return it, along with, (optional) crc and | 391 | /* Find a symbol and return it, along with, (optional) crc and |
| 369 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ | 392 | * (optional) module which owns it. Needs preempt disabled or module_mutex. */ |
| 370 | const struct kernel_symbol *find_symbol(const char *name, | 393 | const struct kernel_symbol *find_symbol(const char *name, |
| @@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name, | |||
| 379 | fsa.gplok = gplok; | 402 | fsa.gplok = gplok; |
| 380 | fsa.warn = warn; | 403 | fsa.warn = warn; |
| 381 | 404 | ||
| 382 | if (each_symbol(find_symbol_in_section, &fsa)) { | 405 | if (each_symbol_section(find_symbol_in_section, &fsa)) { |
| 383 | if (owner) | 406 | if (owner) |
| 384 | *owner = fsa.owner; | 407 | *owner = fsa.owner; |
| 385 | if (crc) | 408 | if (crc) |
| @@ -522,9 +545,9 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ | |||
| 522 | mod->field = kstrdup(s, GFP_KERNEL); \ | 545 | mod->field = kstrdup(s, GFP_KERNEL); \ |
| 523 | } \ | 546 | } \ |
| 524 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ | 547 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ |
| 525 | struct module *mod, char *buffer) \ | 548 | struct module_kobject *mk, char *buffer) \ |
| 526 | { \ | 549 | { \ |
| 527 | return sprintf(buffer, "%s\n", mod->field); \ | 550 | return sprintf(buffer, "%s\n", mk->mod->field); \ |
| 528 | } \ | 551 | } \ |
| 529 | static int modinfo_##field##_exists(struct module *mod) \ | 552 | static int modinfo_##field##_exists(struct module *mod) \ |
| 530 | { \ | 553 | { \ |
| @@ -879,9 +902,9 @@ void symbol_put_addr(void *addr) | |||
| 879 | EXPORT_SYMBOL_GPL(symbol_put_addr); | 902 | EXPORT_SYMBOL_GPL(symbol_put_addr); |
| 880 | 903 | ||
| 881 | static ssize_t show_refcnt(struct module_attribute *mattr, | 904 | static ssize_t show_refcnt(struct module_attribute *mattr, |
| 882 | struct module *mod, char *buffer) | 905 | struct module_kobject *mk, char *buffer) |
| 883 | { | 906 | { |
| 884 | return sprintf(buffer, "%u\n", module_refcount(mod)); | 907 | return sprintf(buffer, "%u\n", module_refcount(mk->mod)); |
| 885 | } | 908 | } |
| 886 | 909 | ||
| 887 | static struct module_attribute refcnt = { | 910 | static struct module_attribute refcnt = { |
| @@ -929,11 +952,11 @@ static inline int module_unload_init(struct module *mod) | |||
| 929 | #endif /* CONFIG_MODULE_UNLOAD */ | 952 | #endif /* CONFIG_MODULE_UNLOAD */ |
| 930 | 953 | ||
| 931 | static ssize_t show_initstate(struct module_attribute *mattr, | 954 | static ssize_t show_initstate(struct module_attribute *mattr, |
| 932 | struct module *mod, char *buffer) | 955 | struct module_kobject *mk, char *buffer) |
| 933 | { | 956 | { |
| 934 | const char *state = "unknown"; | 957 | const char *state = "unknown"; |
| 935 | 958 | ||
| 936 | switch (mod->state) { | 959 | switch (mk->mod->state) { |
| 937 | case MODULE_STATE_LIVE: | 960 | case MODULE_STATE_LIVE: |
| 938 | state = "live"; | 961 | state = "live"; |
| 939 | break; | 962 | break; |
| @@ -952,10 +975,27 @@ static struct module_attribute initstate = { | |||
| 952 | .show = show_initstate, | 975 | .show = show_initstate, |
| 953 | }; | 976 | }; |
| 954 | 977 | ||
| 978 | static ssize_t store_uevent(struct module_attribute *mattr, | ||
| 979 | struct module_kobject *mk, | ||
| 980 | const char *buffer, size_t count) | ||
| 981 | { | ||
| 982 | enum kobject_action action; | ||
| 983 | |||
| 984 | if (kobject_action_type(buffer, count, &action) == 0) | ||
| 985 | kobject_uevent(&mk->kobj, action); | ||
| 986 | return count; | ||
| 987 | } | ||
| 988 | |||
| 989 | struct module_attribute module_uevent = { | ||
| 990 | .attr = { .name = "uevent", .mode = 0200 }, | ||
| 991 | .store = store_uevent, | ||
| 992 | }; | ||
| 993 | |||
| 955 | static struct module_attribute *modinfo_attrs[] = { | 994 | static struct module_attribute *modinfo_attrs[] = { |
| 956 | &modinfo_version, | 995 | &modinfo_version, |
| 957 | &modinfo_srcversion, | 996 | &modinfo_srcversion, |
| 958 | &initstate, | 997 | &initstate, |
| 998 | &module_uevent, | ||
| 959 | #ifdef CONFIG_MODULE_UNLOAD | 999 | #ifdef CONFIG_MODULE_UNLOAD |
| 960 | &refcnt, | 1000 | &refcnt, |
| 961 | #endif | 1001 | #endif |
| @@ -1164,7 +1204,7 @@ struct module_sect_attrs | |||
| 1164 | }; | 1204 | }; |
| 1165 | 1205 | ||
| 1166 | static ssize_t module_sect_show(struct module_attribute *mattr, | 1206 | static ssize_t module_sect_show(struct module_attribute *mattr, |
| 1167 | struct module *mod, char *buf) | 1207 | struct module_kobject *mk, char *buf) |
| 1168 | { | 1208 | { |
| 1169 | struct module_sect_attr *sattr = | 1209 | struct module_sect_attr *sattr = |
| 1170 | container_of(mattr, struct module_sect_attr, mattr); | 1210 | container_of(mattr, struct module_sect_attr, mattr); |
| @@ -1607,27 +1647,28 @@ static void set_section_ro_nx(void *base, | |||
| 1607 | } | 1647 | } |
| 1608 | } | 1648 | } |
| 1609 | 1649 | ||
| 1610 | /* Setting memory back to RW+NX before releasing it */ | 1650 | static void unset_module_core_ro_nx(struct module *mod) |
| 1611 | void unset_section_ro_nx(struct module *mod, void *module_region) | ||
| 1612 | { | 1651 | { |
| 1613 | unsigned long total_pages; | 1652 | set_page_attributes(mod->module_core + mod->core_text_size, |
| 1614 | 1653 | mod->module_core + mod->core_size, | |
| 1615 | if (mod->module_core == module_region) { | 1654 | set_memory_x); |
| 1616 | /* Set core as NX+RW */ | 1655 | set_page_attributes(mod->module_core, |
| 1617 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size); | 1656 | mod->module_core + mod->core_ro_size, |
| 1618 | set_memory_nx((unsigned long)mod->module_core, total_pages); | 1657 | set_memory_rw); |
| 1619 | set_memory_rw((unsigned long)mod->module_core, total_pages); | 1658 | } |
| 1620 | 1659 | ||
| 1621 | } else if (mod->module_init == module_region) { | 1660 | static void unset_module_init_ro_nx(struct module *mod) |
| 1622 | /* Set init as NX+RW */ | 1661 | { |
| 1623 | total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size); | 1662 | set_page_attributes(mod->module_init + mod->init_text_size, |
| 1624 | set_memory_nx((unsigned long)mod->module_init, total_pages); | 1663 | mod->module_init + mod->init_size, |
| 1625 | set_memory_rw((unsigned long)mod->module_init, total_pages); | 1664 | set_memory_x); |
| 1626 | } | 1665 | set_page_attributes(mod->module_init, |
| 1666 | mod->module_init + mod->init_ro_size, | ||
| 1667 | set_memory_rw); | ||
| 1627 | } | 1668 | } |
| 1628 | 1669 | ||
| 1629 | /* Iterate through all modules and set each module's text as RW */ | 1670 | /* Iterate through all modules and set each module's text as RW */ |
| 1630 | void set_all_modules_text_rw() | 1671 | void set_all_modules_text_rw(void) |
| 1631 | { | 1672 | { |
| 1632 | struct module *mod; | 1673 | struct module *mod; |
| 1633 | 1674 | ||
| @@ -1648,7 +1689,7 @@ void set_all_modules_text_rw() | |||
| 1648 | } | 1689 | } |
| 1649 | 1690 | ||
| 1650 | /* Iterate through all modules and set each module's text as RO */ | 1691 | /* Iterate through all modules and set each module's text as RO */ |
| 1651 | void set_all_modules_text_ro() | 1692 | void set_all_modules_text_ro(void) |
| 1652 | { | 1693 | { |
| 1653 | struct module *mod; | 1694 | struct module *mod; |
| 1654 | 1695 | ||
| @@ -1669,9 +1710,19 @@ void set_all_modules_text_ro() | |||
| 1669 | } | 1710 | } |
| 1670 | #else | 1711 | #else |
| 1671 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } | 1712 | static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } |
| 1672 | static inline void unset_section_ro_nx(struct module *mod, void *module_region) { } | 1713 | static void unset_module_core_ro_nx(struct module *mod) { } |
| 1714 | static void unset_module_init_ro_nx(struct module *mod) { } | ||
| 1673 | #endif | 1715 | #endif |
| 1674 | 1716 | ||
| 1717 | void __weak module_free(struct module *mod, void *module_region) | ||
| 1718 | { | ||
| 1719 | vfree(module_region); | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | void __weak module_arch_cleanup(struct module *mod) | ||
| 1723 | { | ||
| 1724 | } | ||
| 1725 | |||
| 1675 | /* Free a module, remove from lists, etc. */ | 1726 | /* Free a module, remove from lists, etc. */ |
| 1676 | static void free_module(struct module *mod) | 1727 | static void free_module(struct module *mod) |
| 1677 | { | 1728 | { |
| @@ -1696,7 +1747,7 @@ static void free_module(struct module *mod) | |||
| 1696 | destroy_params(mod->kp, mod->num_kp); | 1747 | destroy_params(mod->kp, mod->num_kp); |
| 1697 | 1748 | ||
| 1698 | /* This may be NULL, but that's OK */ | 1749 | /* This may be NULL, but that's OK */ |
| 1699 | unset_section_ro_nx(mod, mod->module_init); | 1750 | unset_module_init_ro_nx(mod); |
| 1700 | module_free(mod, mod->module_init); | 1751 | module_free(mod, mod->module_init); |
| 1701 | kfree(mod->args); | 1752 | kfree(mod->args); |
| 1702 | percpu_modfree(mod); | 1753 | percpu_modfree(mod); |
| @@ -1705,7 +1756,7 @@ static void free_module(struct module *mod) | |||
| 1705 | lockdep_free_key_range(mod->module_core, mod->core_size); | 1756 | lockdep_free_key_range(mod->module_core, mod->core_size); |
| 1706 | 1757 | ||
| 1707 | /* Finally, free the core (containing the module structure) */ | 1758 | /* Finally, free the core (containing the module structure) */ |
| 1708 | unset_section_ro_nx(mod, mod->module_core); | 1759 | unset_module_core_ro_nx(mod); |
| 1709 | module_free(mod, mod->module_core); | 1760 | module_free(mod, mod->module_core); |
| 1710 | 1761 | ||
| 1711 | #ifdef CONFIG_MPU | 1762 | #ifdef CONFIG_MPU |
| @@ -1826,6 +1877,26 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) | |||
| 1826 | return ret; | 1877 | return ret; |
| 1827 | } | 1878 | } |
| 1828 | 1879 | ||
| 1880 | int __weak apply_relocate(Elf_Shdr *sechdrs, | ||
| 1881 | const char *strtab, | ||
| 1882 | unsigned int symindex, | ||
| 1883 | unsigned int relsec, | ||
| 1884 | struct module *me) | ||
| 1885 | { | ||
| 1886 | pr_err("module %s: REL relocation unsupported\n", me->name); | ||
| 1887 | return -ENOEXEC; | ||
| 1888 | } | ||
| 1889 | |||
| 1890 | int __weak apply_relocate_add(Elf_Shdr *sechdrs, | ||
| 1891 | const char *strtab, | ||
| 1892 | unsigned int symindex, | ||
| 1893 | unsigned int relsec, | ||
| 1894 | struct module *me) | ||
| 1895 | { | ||
| 1896 | pr_err("module %s: RELA relocation unsupported\n", me->name); | ||
| 1897 | return -ENOEXEC; | ||
| 1898 | } | ||
| 1899 | |||
| 1829 | static int apply_relocations(struct module *mod, const struct load_info *info) | 1900 | static int apply_relocations(struct module *mod, const struct load_info *info) |
| 1830 | { | 1901 | { |
| 1831 | unsigned int i; | 1902 | unsigned int i; |
| @@ -2030,11 +2101,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name, | |||
| 2030 | const struct kernel_symbol *start, | 2101 | const struct kernel_symbol *start, |
| 2031 | const struct kernel_symbol *stop) | 2102 | const struct kernel_symbol *stop) |
| 2032 | { | 2103 | { |
| 2033 | const struct kernel_symbol *ks = start; | 2104 | return bsearch(name, start, stop - start, |
| 2034 | for (; ks < stop; ks++) | 2105 | sizeof(struct kernel_symbol), cmp_name); |
| 2035 | if (strcmp(ks->name, name) == 0) | ||
| 2036 | return ks; | ||
| 2037 | return NULL; | ||
| 2038 | } | 2106 | } |
| 2039 | 2107 | ||
| 2040 | static int is_exported(const char *name, unsigned long value, | 2108 | static int is_exported(const char *name, unsigned long value, |
| @@ -2213,6 +2281,11 @@ static void dynamic_debug_remove(struct _ddebug *debug) | |||
| 2213 | ddebug_remove_module(debug->modname); | 2281 | ddebug_remove_module(debug->modname); |
| 2214 | } | 2282 | } |
| 2215 | 2283 | ||
| 2284 | void * __weak module_alloc(unsigned long size) | ||
| 2285 | { | ||
| 2286 | return size == 0 ? NULL : vmalloc_exec(size); | ||
| 2287 | } | ||
| 2288 | |||
| 2216 | static void *module_alloc_update_bounds(unsigned long size) | 2289 | static void *module_alloc_update_bounds(unsigned long size) |
| 2217 | { | 2290 | { |
| 2218 | void *ret = module_alloc(size); | 2291 | void *ret = module_alloc(size); |
| @@ -2623,6 +2696,14 @@ static void flush_module_icache(const struct module *mod) | |||
| 2623 | set_fs(old_fs); | 2696 | set_fs(old_fs); |
| 2624 | } | 2697 | } |
| 2625 | 2698 | ||
| 2699 | int __weak module_frob_arch_sections(Elf_Ehdr *hdr, | ||
| 2700 | Elf_Shdr *sechdrs, | ||
| 2701 | char *secstrings, | ||
| 2702 | struct module *mod) | ||
| 2703 | { | ||
| 2704 | return 0; | ||
| 2705 | } | ||
| 2706 | |||
| 2626 | static struct module *layout_and_allocate(struct load_info *info) | 2707 | static struct module *layout_and_allocate(struct load_info *info) |
| 2627 | { | 2708 | { |
| 2628 | /* Module within temporary copy. */ | 2709 | /* Module within temporary copy. */ |
| @@ -2694,6 +2775,13 @@ static void module_deallocate(struct module *mod, struct load_info *info) | |||
| 2694 | module_free(mod, mod->module_core); | 2775 | module_free(mod, mod->module_core); |
| 2695 | } | 2776 | } |
| 2696 | 2777 | ||
| 2778 | int __weak module_finalize(const Elf_Ehdr *hdr, | ||
| 2779 | const Elf_Shdr *sechdrs, | ||
| 2780 | struct module *me) | ||
| 2781 | { | ||
| 2782 | return 0; | ||
| 2783 | } | ||
| 2784 | |||
| 2697 | static int post_relocation(struct module *mod, const struct load_info *info) | 2785 | static int post_relocation(struct module *mod, const struct load_info *info) |
| 2698 | { | 2786 | { |
| 2699 | /* Sort exception table now relocations are done. */ | 2787 | /* Sort exception table now relocations are done. */ |
| @@ -2790,7 +2878,7 @@ static struct module *load_module(void __user *umod, | |||
| 2790 | } | 2878 | } |
| 2791 | 2879 | ||
| 2792 | /* This has to be done once we're sure module name is unique. */ | 2880 | /* This has to be done once we're sure module name is unique. */ |
| 2793 | if (!mod->taints) | 2881 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) |
| 2794 | dynamic_debug_setup(info.debug, info.num_debug); | 2882 | dynamic_debug_setup(info.debug, info.num_debug); |
| 2795 | 2883 | ||
| 2796 | /* Find duplicate symbols */ | 2884 | /* Find duplicate symbols */ |
| @@ -2827,7 +2915,7 @@ static struct module *load_module(void __user *umod, | |||
| 2827 | module_bug_cleanup(mod); | 2915 | module_bug_cleanup(mod); |
| 2828 | 2916 | ||
| 2829 | ddebug: | 2917 | ddebug: |
| 2830 | if (!mod->taints) | 2918 | if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) |
| 2831 | dynamic_debug_remove(info.debug); | 2919 | dynamic_debug_remove(info.debug); |
| 2832 | unlock: | 2920 | unlock: |
| 2833 | mutex_unlock(&module_mutex); | 2921 | mutex_unlock(&module_mutex); |
| @@ -2931,10 +3019,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, | |||
| 2931 | mod->symtab = mod->core_symtab; | 3019 | mod->symtab = mod->core_symtab; |
| 2932 | mod->strtab = mod->core_strtab; | 3020 | mod->strtab = mod->core_strtab; |
| 2933 | #endif | 3021 | #endif |
| 2934 | unset_section_ro_nx(mod, mod->module_init); | 3022 | unset_module_init_ro_nx(mod); |
| 2935 | module_free(mod, mod->module_init); | 3023 | module_free(mod, mod->module_init); |
| 2936 | mod->module_init = NULL; | 3024 | mod->module_init = NULL; |
| 2937 | mod->init_size = 0; | 3025 | mod->init_size = 0; |
| 3026 | mod->init_ro_size = 0; | ||
| 2938 | mod->init_text_size = 0; | 3027 | mod->init_text_size = 0; |
| 2939 | mutex_unlock(&module_mutex); | 3028 | mutex_unlock(&module_mutex); |
| 2940 | 3029 | ||
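The module.c changes replace the per-symbol linear walks in find_symbol_in_section() and lookup_symbol() with bsearch() over the name-sorted exported-symbol tables, using cmp_name() as the comparator. A self-contained userspace sketch of the same pattern; the symbol table here is illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sym {
	const char *name;
	unsigned long value;
};

/* bsearch() requires the table to be sorted by name. */
static const struct sym symtab[] = {
	{ "bar", 0x1000 },
	{ "baz", 0x2000 },
	{ "foo", 0x3000 },
};

static int cmp_name(const void *va, const void *vb)
{
	const char *a = va;        /* the key: a bare name */
	const struct sym *b = vb;  /* an element of symtab */

	return strcmp(a, b->name);
}

int main(void)
{
	const struct sym *s;

	s = bsearch("foo", symtab, sizeof(symtab) / sizeof(symtab[0]),
		    sizeof(symtab[0]), cmp_name);
	printf("foo %s\n", s ? "found" : "not found");
	return 0;
}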
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5d..73da83aff418 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
| @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) | |||
| 75 | return; | 75 | return; |
| 76 | 76 | ||
| 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 77 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
| 78 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 78 | DEBUG_LOCKS_WARN_ON(lock->owner != current); |
| 79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 79 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
| 80 | mutex_clear_owner(lock); | 80 | mutex_clear_owner(lock); |
| 81 | } | 81 | } |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9d..0799fd3e4cfa 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
| @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, | |||
| 29 | 29 | ||
| 30 | static inline void mutex_set_owner(struct mutex *lock) | 30 | static inline void mutex_set_owner(struct mutex *lock) |
| 31 | { | 31 | { |
| 32 | lock->owner = current_thread_info(); | 32 | lock->owner = current; |
| 33 | } | 33 | } |
| 34 | 34 | ||
| 35 | static inline void mutex_clear_owner(struct mutex *lock) | 35 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa98900..d607ed5dd441 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock); | |||
| 131 | */ | 131 | */ |
| 132 | static inline int __sched | 132 | static inline int __sched |
| 133 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | 133 | __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, |
| 134 | unsigned long ip) | 134 | struct lockdep_map *nest_lock, unsigned long ip) |
| 135 | { | 135 | { |
| 136 | struct task_struct *task = current; | 136 | struct task_struct *task = current; |
| 137 | struct mutex_waiter waiter; | 137 | struct mutex_waiter waiter; |
| 138 | unsigned long flags; | 138 | unsigned long flags; |
| 139 | 139 | ||
| 140 | preempt_disable(); | 140 | preempt_disable(); |
| 141 | mutex_acquire(&lock->dep_map, subclass, 0, ip); | 141 | mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); |
| 142 | 142 | ||
| 143 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 143 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 144 | /* | 144 | /* |
| @@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
| 160 | */ | 160 | */ |
| 161 | 161 | ||
| 162 | for (;;) { | 162 | for (;;) { |
| 163 | struct thread_info *owner; | 163 | struct task_struct *owner; |
| 164 | |||
| 165 | /* | ||
| 166 | * If we own the BKL, then don't spin. The owner of | ||
| 167 | * the mutex might be waiting on us to release the BKL. | ||
| 168 | */ | ||
| 169 | if (unlikely(current->lock_depth >= 0)) | ||
| 170 | break; | ||
| 171 | 164 | ||
| 172 | /* | 165 | /* |
| 173 | * If there's an owner, wait for it to either | 166 | * If there's an owner, wait for it to either |
| @@ -276,16 +269,25 @@ void __sched | |||
| 276 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) | 269 | mutex_lock_nested(struct mutex *lock, unsigned int subclass) |
| 277 | { | 270 | { |
| 278 | might_sleep(); | 271 | might_sleep(); |
| 279 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); | 272 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); |
| 280 | } | 273 | } |
| 281 | 274 | ||
| 282 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 275 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
| 283 | 276 | ||
| 277 | void __sched | ||
| 278 | _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) | ||
| 279 | { | ||
| 280 | might_sleep(); | ||
| 281 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); | ||
| 282 | } | ||
| 283 | |||
| 284 | EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); | ||
| 285 | |||
| 284 | int __sched | 286 | int __sched |
| 285 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | 287 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) |
| 286 | { | 288 | { |
| 287 | might_sleep(); | 289 | might_sleep(); |
| 288 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); | 290 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); |
| 289 | } | 291 | } |
| 290 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | 292 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); |
| 291 | 293 | ||
| @@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
| 294 | { | 296 | { |
| 295 | might_sleep(); | 297 | might_sleep(); |
| 296 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, | 298 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, |
| 297 | subclass, _RET_IP_); | 299 | subclass, NULL, _RET_IP_); |
| 298 | } | 300 | } |
| 299 | 301 | ||
| 300 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 302 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
| @@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
| 400 | { | 402 | { |
| 401 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 403 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 402 | 404 | ||
| 403 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); | 405 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); |
| 404 | } | 406 | } |
| 405 | 407 | ||
| 406 | static noinline int __sched | 408 | static noinline int __sched |
| @@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) | |||
| 408 | { | 410 | { |
| 409 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 411 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 410 | 412 | ||
| 411 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); | 413 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); |
| 412 | } | 414 | } |
| 413 | 415 | ||
| 414 | static noinline int __sched | 416 | static noinline int __sched |
| @@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count) | |||
| 416 | { | 418 | { |
| 417 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 419 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
| 418 | 420 | ||
| 419 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); | 421 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); |
| 420 | } | 422 | } |
| 421 | #endif | 423 | #endif |
| 422 | 424 | ||
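__mutex_lock_common() now threads an optional nest_lock into mutex_acquire_nest(), exposed through _mutex_lock_nest_lock(). Assuming the matching mutex_lock_nest_lock(lock, nest_lock) wrapper from the header side of this series, the intended use is taking many same-class locks under one outer lock without lockdep tracking each acquisition separately. A sketch with made-up structures:

#include <linux/mutex.h>

#define NR_CHILDREN 16

struct child {
	struct mutex lock;
};

struct parent {
	struct mutex lock;
	struct child *children[NR_CHILDREN];
};

static void with_all_children_locked(struct parent *p)
{
	int i;

	mutex_lock(&p->lock);
	for (i = 0; i < NR_CHILDREN; i++)
		/* Every child lock is taken under p->lock, the nest lock. */
		mutex_lock_nest_lock(&p->children[i]->lock, &p->lock);

	/* ...all children are quiesced here... */

	for (i = NR_CHILDREN - 1; i >= 0; i--)
		mutex_unlock(&p->children[i]->lock);
	mutex_unlock(&p->lock);
}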
diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f94..4115fbf83b12 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | #ifdef CONFIG_SMP | 19 | #ifdef CONFIG_SMP |
| 20 | static inline void mutex_set_owner(struct mutex *lock) | 20 | static inline void mutex_set_owner(struct mutex *lock) |
| 21 | { | 21 | { |
| 22 | lock->owner = current_thread_info(); | 22 | lock->owner = current; |
| 23 | } | 23 | } |
| 24 | 24 | ||
| 25 | static inline void mutex_clear_owner(struct mutex *lock) | 25 | static inline void mutex_clear_owner(struct mutex *lock) |
diff --git a/kernel/notifier.c b/kernel/notifier.c index 2488ba7eb568..8d7b435806c9 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
| @@ -525,37 +525,6 @@ void srcu_init_notifier_head(struct srcu_notifier_head *nh) | |||
| 525 | } | 525 | } |
| 526 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); | 526 | EXPORT_SYMBOL_GPL(srcu_init_notifier_head); |
| 527 | 527 | ||
| 528 | /** | ||
| 529 | * register_reboot_notifier - Register function to be called at reboot time | ||
| 530 | * @nb: Info about notifier function to be called | ||
| 531 | * | ||
| 532 | * Registers a function with the list of functions | ||
| 533 | * to be called at reboot time. | ||
| 534 | * | ||
| 535 | * Currently always returns zero, as blocking_notifier_chain_register() | ||
| 536 | * always returns zero. | ||
| 537 | */ | ||
| 538 | int register_reboot_notifier(struct notifier_block *nb) | ||
| 539 | { | ||
| 540 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); | ||
| 541 | } | ||
| 542 | EXPORT_SYMBOL(register_reboot_notifier); | ||
| 543 | |||
| 544 | /** | ||
| 545 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
| 546 | * @nb: Hook to be unregistered | ||
| 547 | * | ||
| 548 | * Unregisters a previously registered reboot | ||
| 549 | * notifier function. | ||
| 550 | * | ||
| 551 | * Returns zero on success, or %-ENOENT on failure. | ||
| 552 | */ | ||
| 553 | int unregister_reboot_notifier(struct notifier_block *nb) | ||
| 554 | { | ||
| 555 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); | ||
| 556 | } | ||
| 557 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
| 558 | |||
| 559 | static ATOMIC_NOTIFIER_HEAD(die_chain); | 528 | static ATOMIC_NOTIFIER_HEAD(die_chain); |
| 560 | 529 | ||
| 561 | int notrace __kprobes notify_die(enum die_val val, const char *str, | 530 | int notrace __kprobes notify_die(enum die_val val, const char *str, |
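The notifier.c hunk only relocates register_reboot_notifier()/unregister_reboot_notifier(); the caller-side API is unchanged. A minimal sketch of a module using it; the callback and variable names are made up:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_reboot_notify(struct notifier_block *nb,
			    unsigned long action, void *data)
{
	/* action is SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
	pr_info("shutting down, action=%lu\n", action);
	return NOTIFY_DONE;
}

static struct notifier_block my_reboot_nb = {
	.notifier_call = my_reboot_notify,
};

static int __init my_init(void)
{
	return register_reboot_notifier(&my_reboot_nb);
}

static void __exit my_exit(void)
{
	unregister_reboot_notifier(&my_reboot_nb);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");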
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0e..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null | |||
| @@ -1,118 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * ns_cgroup.c - namespace cgroup subsystem | ||
| 3 | * | ||
| 4 | * Copyright 2006, 2007 IBM Corp | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include <linux/module.h> | ||
| 8 | #include <linux/cgroup.h> | ||
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/proc_fs.h> | ||
| 11 | #include <linux/slab.h> | ||
| 12 | #include <linux/nsproxy.h> | ||
| 13 | |||
| 14 | struct ns_cgroup { | ||
| 15 | struct cgroup_subsys_state css; | ||
| 16 | }; | ||
| 17 | |||
| 18 | struct cgroup_subsys ns_subsys; | ||
| 19 | |||
| 20 | static inline struct ns_cgroup *cgroup_to_ns( | ||
| 21 | struct cgroup *cgroup) | ||
| 22 | { | ||
| 23 | return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), | ||
| 24 | struct ns_cgroup, css); | ||
| 25 | } | ||
| 26 | |||
| 27 | int ns_cgroup_clone(struct task_struct *task, struct pid *pid) | ||
| 28 | { | ||
| 29 | char name[PROC_NUMBUF]; | ||
| 30 | |||
| 31 | snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); | ||
| 32 | return cgroup_clone(task, &ns_subsys, name); | ||
| 33 | } | ||
| 34 | |||
| 35 | /* | ||
| 36 | * Rules: | ||
| 37 | * 1. you can only enter a cgroup which is a descendant of your current | ||
| 38 | * cgroup | ||
| 39 | * 2. you can only place another process into a cgroup if | ||
| 40 | * a. you have CAP_SYS_ADMIN | ||
| 41 | * b. your cgroup is an ancestor of task's destination cgroup | ||
| 42 | * (hence either you are in the same cgroup as task, or in an | ||
| 43 | * ancestor cgroup thereof) | ||
| 44 | */ | ||
| 45 | static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, | ||
| 46 | struct task_struct *task, bool threadgroup) | ||
| 47 | { | ||
| 48 | if (current != task) { | ||
| 49 | if (!capable(CAP_SYS_ADMIN)) | ||
| 50 | return -EPERM; | ||
| 51 | |||
| 52 | if (!cgroup_is_descendant(new_cgroup, current)) | ||
| 53 | return -EPERM; | ||
| 54 | } | ||
| 55 | |||
| 56 | if (!cgroup_is_descendant(new_cgroup, task)) | ||
| 57 | return -EPERM; | ||
| 58 | |||
| 59 | if (threadgroup) { | ||
| 60 | struct task_struct *c; | ||
| 61 | rcu_read_lock(); | ||
| 62 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
| 63 | if (!cgroup_is_descendant(new_cgroup, c)) { | ||
| 64 | rcu_read_unlock(); | ||
| 65 | return -EPERM; | ||
| 66 | } | ||
| 67 | } | ||
| 68 | rcu_read_unlock(); | ||
| 69 | } | ||
| 70 | |||
| 71 | return 0; | ||
| 72 | } | ||
| 73 | |||
| 74 | /* | ||
| 75 | * Rules: you can only create a cgroup if | ||
| 76 | * 1. you are capable(CAP_SYS_ADMIN) | ||
| 77 | * 2. the target cgroup is a descendant of your own cgroup | ||
| 78 | */ | ||
| 79 | static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, | ||
| 80 | struct cgroup *cgroup) | ||
| 81 | { | ||
| 82 | struct ns_cgroup *ns_cgroup; | ||
| 83 | |||
| 84 | if (!capable(CAP_SYS_ADMIN)) | ||
| 85 | return ERR_PTR(-EPERM); | ||
| 86 | if (!cgroup_is_descendant(cgroup, current)) | ||
| 87 | return ERR_PTR(-EPERM); | ||
| 88 | if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { | ||
| 89 | printk("ns_cgroup can't be created with parent " | ||
| 90 | "'clone_children' set.\n"); | ||
| 91 | return ERR_PTR(-EINVAL); | ||
| 92 | } | ||
| 93 | |||
| 94 | printk_once("ns_cgroup deprecated: consider using the " | ||
| 95 | "'clone_children' flag without the ns_cgroup.\n"); | ||
| 96 | |||
| 97 | ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); | ||
| 98 | if (!ns_cgroup) | ||
| 99 | return ERR_PTR(-ENOMEM); | ||
| 100 | return &ns_cgroup->css; | ||
| 101 | } | ||
| 102 | |||
| 103 | static void ns_destroy(struct cgroup_subsys *ss, | ||
| 104 | struct cgroup *cgroup) | ||
| 105 | { | ||
| 106 | struct ns_cgroup *ns_cgroup; | ||
| 107 | |||
| 108 | ns_cgroup = cgroup_to_ns(cgroup); | ||
| 109 | kfree(ns_cgroup); | ||
| 110 | } | ||
| 111 | |||
| 112 | struct cgroup_subsys ns_subsys = { | ||
| 113 | .name = "ns", | ||
| 114 | .can_attach = ns_can_attach, | ||
| 115 | .create = ns_create, | ||
| 116 | .destroy = ns_destroy, | ||
| 117 | .subsys_id = ns_subsys_id, | ||
| 118 | }; | ||
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd9..9aeab4b98c64 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -22,6 +22,9 @@ | |||
| 22 | #include <linux/pid_namespace.h> | 22 | #include <linux/pid_namespace.h> |
| 23 | #include <net/net_namespace.h> | 23 | #include <net/net_namespace.h> |
| 24 | #include <linux/ipc_namespace.h> | 24 | #include <linux/ipc_namespace.h> |
| 25 | #include <linux/proc_fs.h> | ||
| 26 | #include <linux/file.h> | ||
| 27 | #include <linux/syscalls.h> | ||
| 25 | 28 | ||
| 26 | static struct kmem_cache *nsproxy_cachep; | 29 | static struct kmem_cache *nsproxy_cachep; |
| 27 | 30 | ||
| @@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, | |||
| 198 | goto out; | 201 | goto out; |
| 199 | } | 202 | } |
| 200 | 203 | ||
| 201 | err = ns_cgroup_clone(current, task_pid(current)); | ||
| 202 | if (err) | ||
| 203 | put_nsproxy(*new_nsp); | ||
| 204 | |||
| 205 | out: | 204 | out: |
| 206 | return err; | 205 | return err; |
| 207 | } | 206 | } |
| @@ -233,10 +232,47 @@ void exit_task_namespaces(struct task_struct *p) | |||
| 233 | switch_task_namespaces(p, NULL); | 232 | switch_task_namespaces(p, NULL); |
| 234 | } | 233 | } |
| 235 | 234 | ||
| 236 | static int __init nsproxy_cache_init(void) | 235 | SYSCALL_DEFINE2(setns, int, fd, int, nstype) |
| 236 | { | ||
| 237 | const struct proc_ns_operations *ops; | ||
| 238 | struct task_struct *tsk = current; | ||
| 239 | struct nsproxy *new_nsproxy; | ||
| 240 | struct proc_inode *ei; | ||
| 241 | struct file *file; | ||
| 242 | int err; | ||
| 243 | |||
| 244 | if (!capable(CAP_SYS_ADMIN)) | ||
| 245 | return -EPERM; | ||
| 246 | |||
| 247 | file = proc_ns_fget(fd); | ||
| 248 | if (IS_ERR(file)) | ||
| 249 | return PTR_ERR(file); | ||
| 250 | |||
| 251 | err = -EINVAL; | ||
| 252 | ei = PROC_I(file->f_dentry->d_inode); | ||
| 253 | ops = ei->ns_ops; | ||
| 254 | if (nstype && (ops->type != nstype)) | ||
| 255 | goto out; | ||
| 256 | |||
| 257 | new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); | ||
| 258 | if (IS_ERR(new_nsproxy)) { | ||
| 259 | err = PTR_ERR(new_nsproxy); | ||
| 260 | goto out; | ||
| 261 | } | ||
| 262 | |||
| 263 | err = ops->install(new_nsproxy, ei->ns); | ||
| 264 | if (err) { | ||
| 265 | free_nsproxy(new_nsproxy); | ||
| 266 | goto out; | ||
| 267 | } | ||
| 268 | switch_task_namespaces(tsk, new_nsproxy); | ||
| 269 | out: | ||
| 270 | fput(file); | ||
| 271 | return err; | ||
| 272 | } | ||
| 273 | |||
| 274 | int __init nsproxy_cache_init(void) | ||
| 237 | { | 275 | { |
| 238 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); | 276 | nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); |
| 239 | return 0; | 277 | return 0; |
| 240 | } | 278 | } |
| 241 | |||
| 242 | module_init(nsproxy_cache_init); | ||
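The nsproxy.c addition wires up the setns(2) system call: given an fd from /proc/<pid>/ns/, a CAP_SYS_ADMIN task can install that namespace into a freshly created nsproxy. A userspace sketch, assuming a libc that provides the setns() wrapper (otherwise syscall(__NR_setns, fd, nstype)); the PID in the path is illustrative:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Needs CAP_SYS_ADMIN; 1234 stands in for the target PID. */
	int fd = open("/proc/1234/ns/net", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* CLONE_NEWNET makes the kernel verify the fd is a net namespace. */
	if (setns(fd, CLONE_NEWNET) < 0) {
		perror("setns");
		close(fd);
		return 1;
	}
	close(fd);
	/* Sockets created from here on live in the joined namespace. */
	return 0;
}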
diff --git a/kernel/panic.c b/kernel/panic.c index 69231670eb95..d7bb6974efb5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -119,6 +119,8 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
| 119 | } | 119 | } |
| 120 | mdelay(PANIC_TIMER_STEP); | 120 | mdelay(PANIC_TIMER_STEP); |
| 121 | } | 121 | } |
| 122 | } | ||
| 123 | if (panic_timeout != 0) { | ||
| 122 | /* | 124 | /* |
| 123 | * This will not be a clean reboot, with everything | 125 | * This will not be a clean reboot, with everything |
| 124 | * shutting down. But if there is a chance of | 126 | * shutting down. But if there is a chance of |
diff --git a/kernel/params.c b/kernel/params.c index 7ab388a48a2e..22df3e0d142a 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -225,8 +225,8 @@ int parse_args(const char *name, | |||
| 225 | int ret; \ | 225 | int ret; \ |
| 226 | \ | 226 | \ |
| 227 | ret = strtolfn(val, 0, &l); \ | 227 | ret = strtolfn(val, 0, &l); \ |
| 228 | if (ret == -EINVAL || ((type)l != l)) \ | 228 | if (ret < 0 || ((type)l != l)) \ |
| 229 | return -EINVAL; \ | 229 | return ret < 0 ? ret : -EINVAL; \ |
| 230 | *((type *)kp->arg) = l; \ | 230 | *((type *)kp->arg) = l; \ |
| 231 | return 0; \ | 231 | return 0; \ |
| 232 | } \ | 232 | } \ |
| @@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp); | |||
| 297 | int param_set_bool(const char *val, const struct kernel_param *kp) | 297 | int param_set_bool(const char *val, const struct kernel_param *kp) |
| 298 | { | 298 | { |
| 299 | bool v; | 299 | bool v; |
| 300 | int ret; | ||
| 300 | 301 | ||
| 301 | /* No equals means "set"... */ | 302 | /* No equals means "set"... */ |
| 302 | if (!val) val = "1"; | 303 | if (!val) val = "1"; |
| 303 | 304 | ||
| 304 | /* One of =[yYnN01] */ | 305 | /* One of =[yYnN01] */ |
| 305 | switch (val[0]) { | 306 | ret = strtobool(val, &v); |
| 306 | case 'y': case 'Y': case '1': | 307 | if (ret) |
| 307 | v = true; | 308 | return ret; |
| 308 | break; | ||
| 309 | case 'n': case 'N': case '0': | ||
| 310 | v = false; | ||
| 311 | break; | ||
| 312 | default: | ||
| 313 | return -EINVAL; | ||
| 314 | } | ||
| 315 | 309 | ||
| 316 | if (kp->flags & KPARAM_ISBOOL) | 310 | if (kp->flags & KPARAM_ISBOOL) |
| 317 | *(bool *)kp->arg = v; | 311 | *(bool *)kp->arg = v; |
| @@ -517,7 +511,7 @@ struct module_param_attrs | |||
| 517 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr) | 511 | #define to_param_attr(n) container_of(n, struct param_attribute, mattr) |
| 518 | 512 | ||
| 519 | static ssize_t param_attr_show(struct module_attribute *mattr, | 513 | static ssize_t param_attr_show(struct module_attribute *mattr, |
| 520 | struct module *mod, char *buf) | 514 | struct module_kobject *mk, char *buf) |
| 521 | { | 515 | { |
| 522 | int count; | 516 | int count; |
| 523 | struct param_attribute *attribute = to_param_attr(mattr); | 517 | struct param_attribute *attribute = to_param_attr(mattr); |
| @@ -537,7 +531,7 @@ static ssize_t param_attr_show(struct module_attribute *mattr, | |||
| 537 | 531 | ||
| 538 | /* sysfs always hands a nul-terminated string in buf. We rely on that. */ | 532 | /* sysfs always hands a nul-terminated string in buf. We rely on that. */ |
| 539 | static ssize_t param_attr_store(struct module_attribute *mattr, | 533 | static ssize_t param_attr_store(struct module_attribute *mattr, |
| 540 | struct module *owner, | 534 | struct module_kobject *km, |
| 541 | const char *buf, size_t len) | 535 | const char *buf, size_t len) |
| 542 | { | 536 | { |
| 543 | int err; | 537 | int err; |
| @@ -736,6 +730,10 @@ static struct module_kobject * __init locate_module_kobject(const char *name) | |||
| 736 | mk->kobj.kset = module_kset; | 730 | mk->kobj.kset = module_kset; |
| 737 | err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, | 731 | err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, |
| 738 | "%s", name); | 732 | "%s", name); |
| 733 | #ifdef CONFIG_MODULES | ||
| 734 | if (!err) | ||
| 735 | err = sysfs_create_file(&mk->kobj, &module_uevent.attr); | ||
| 736 | #endif | ||
| 739 | if (err) { | 737 | if (err) { |
| 740 | kobject_put(&mk->kobj); | 738 | kobject_put(&mk->kobj); |
| 741 | printk(KERN_ERR | 739 | printk(KERN_ERR |
| @@ -813,7 +811,7 @@ static void __init param_sysfs_builtin(void) | |||
| 813 | } | 811 | } |
| 814 | 812 | ||
| 815 | ssize_t __modver_version_show(struct module_attribute *mattr, | 813 | ssize_t __modver_version_show(struct module_attribute *mattr, |
| 816 | struct module *mod, char *buf) | 814 | struct module_kobject *mk, char *buf) |
| 817 | { | 815 | { |
| 818 | struct module_version_attribute *vattr = | 816 | struct module_version_attribute *vattr = |
| 819 | container_of(mattr, struct module_version_attribute, mattr); | 817 | container_of(mattr, struct module_version_attribute, mattr); |
| @@ -821,15 +819,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr, | |||
| 821 | return sprintf(buf, "%s\n", vattr->version); | 819 | return sprintf(buf, "%s\n", vattr->version); |
| 822 | } | 820 | } |
| 823 | 821 | ||
| 824 | extern struct module_version_attribute __start___modver[], __stop___modver[]; | 822 | extern const struct module_version_attribute *__start___modver[]; |
| 823 | extern const struct module_version_attribute *__stop___modver[]; | ||
| 825 | 824 | ||
| 826 | static void __init version_sysfs_builtin(void) | 825 | static void __init version_sysfs_builtin(void) |
| 827 | { | 826 | { |
| 828 | const struct module_version_attribute *vattr; | 827 | const struct module_version_attribute **p; |
| 829 | struct module_kobject *mk; | 828 | struct module_kobject *mk; |
| 830 | int err; | 829 | int err; |
| 831 | 830 | ||
| 832 | for (vattr = __start___modver; vattr < __stop___modver; vattr++) { | 831 | for (p = __start___modver; p < __stop___modver; p++) { |
| 832 | const struct module_version_attribute *vattr = *p; | ||
| 833 | |||
| 833 | mk = locate_module_kobject(vattr->module_name); | 834 | mk = locate_module_kobject(vattr->module_name); |
| 834 | if (mk) { | 835 | if (mk) { |
| 835 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); | 836 | err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); |
| @@ -855,7 +856,7 @@ static ssize_t module_attr_show(struct kobject *kobj, | |||
| 855 | if (!attribute->show) | 856 | if (!attribute->show) |
| 856 | return -EIO; | 857 | return -EIO; |
| 857 | 858 | ||
| 858 | ret = attribute->show(attribute, mk->mod, buf); | 859 | ret = attribute->show(attribute, mk, buf); |
| 859 | 860 | ||
| 860 | return ret; | 861 | return ret; |
| 861 | } | 862 | } |
| @@ -874,7 +875,7 @@ static ssize_t module_attr_store(struct kobject *kobj, | |||
| 874 | if (!attribute->store) | 875 | if (!attribute->store) |
| 875 | return -EIO; | 876 | return -EIO; |
| 876 | 877 | ||
| 877 | ret = attribute->store(attribute, mk->mod, buf, len); | 878 | ret = attribute->store(attribute, mk, buf, len); |
| 878 | 879 | ||
| 879 | return ret; | 880 | return ret; |
| 880 | } | 881 | } |
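param_set_bool() now delegates to the shared strtobool() helper instead of open-coding the [yYnN01] switch, so a bad value propagates whatever error strtobool() returns. Nothing changes on the module side; a minimal sketch of a boolean parameter that goes through this path (module and parameter names are made up):

#include <linux/module.h>
#include <linux/moduleparam.h>

static bool verbose;
module_param(verbose, bool, 0644);
MODULE_PARM_DESC(verbose, "y/Y/1 enables, n/N/0 disables");

MODULE_LICENSE("GPL");

Loading with verbose=Y, verbose=1 or verbose=n parses; anything else is rejected with the value strtobool() returns.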
diff --git a/kernel/pid.c b/kernel/pid.c index 57a8346a270e..e432057f3b21 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -405,7 +405,6 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) | |||
| 405 | if (pid) { | 405 | if (pid) { |
| 406 | struct hlist_node *first; | 406 | struct hlist_node *first; |
| 407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), | 407 | first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), |
| 408 | rcu_read_lock_held() || | ||
| 409 | lockdep_tasklist_lock_is_held()); | 408 | lockdep_tasklist_lock_is_held()); |
| 410 | if (first) | 409 | if (first) |
| 411 | result = hlist_entry(first, struct task_struct, pids[(type)].node); | 410 | result = hlist_entry(first, struct task_struct, pids[(type)].node); |
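The pid.c hunk drops the explicit rcu_read_lock_held() term because rcu_dereference_check() already accepts being called inside an RCU read-side critical section; only the additional lockdep conditions need to be listed. A sketch of the idiom with a made-up pointer and lock:

#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct foo {
	int val;
};

static DEFINE_SPINLOCK(foo_lock);	/* protects updates of global_foo */
static struct foo __rcu *global_foo;

static int read_foo_val(void)
{
	struct foo *f;
	int val;

	rcu_read_lock();
	/*
	 * No need to spell out rcu_read_lock_held(); it is implied.
	 * The second argument only lists the updater-side condition.
	 */
	f = rcu_dereference_check(global_foo, lockdep_is_held(&foo_lock));
	val = f ? f->val : -1;
	rcu_read_unlock();

	return val;
}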
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 0da058bff8eb..37f05d0f0793 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #include <linux/string.h> | 40 | #include <linux/string.h> |
| 41 | #include <linux/platform_device.h> | 41 | #include <linux/platform_device.h> |
| 42 | #include <linux/init.h> | 42 | #include <linux/init.h> |
| 43 | #include <linux/kernel.h> | ||
| 43 | 44 | ||
| 44 | #include <linux/uaccess.h> | 45 | #include <linux/uaccess.h> |
| 45 | 46 | ||
| @@ -53,11 +54,17 @@ enum pm_qos_type { | |||
| 53 | PM_QOS_MIN /* return the smallest value */ | 54 | PM_QOS_MIN /* return the smallest value */ |
| 54 | }; | 55 | }; |
| 55 | 56 | ||
| 57 | /* | ||
| 58 | * Note: The lockless read path depends on the CPU accessing | ||
| 59 | * target_value atomically. Atomic access is only guaranteed on all CPU | ||
| 60 | * types linux supports for 32 bit quantites | ||
| 61 | */ | ||
| 56 | struct pm_qos_object { | 62 | struct pm_qos_object { |
| 57 | struct plist_head requests; | 63 | struct plist_head requests; |
| 58 | struct blocking_notifier_head *notifiers; | 64 | struct blocking_notifier_head *notifiers; |
| 59 | struct miscdevice pm_qos_power_miscdev; | 65 | struct miscdevice pm_qos_power_miscdev; |
| 60 | char *name; | 66 | char *name; |
| 67 | s32 target_value; /* Do not change to 64 bit */ | ||
| 61 | s32 default_value; | 68 | s32 default_value; |
| 62 | enum pm_qos_type type; | 69 | enum pm_qos_type type; |
| 63 | }; | 70 | }; |
| @@ -67,29 +74,32 @@ static DEFINE_SPINLOCK(pm_qos_lock); | |||
| 67 | static struct pm_qos_object null_pm_qos; | 74 | static struct pm_qos_object null_pm_qos; |
| 68 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); | 75 | static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); |
| 69 | static struct pm_qos_object cpu_dma_pm_qos = { | 76 | static struct pm_qos_object cpu_dma_pm_qos = { |
| 70 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), | 77 | .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), |
| 71 | .notifiers = &cpu_dma_lat_notifier, | 78 | .notifiers = &cpu_dma_lat_notifier, |
| 72 | .name = "cpu_dma_latency", | 79 | .name = "cpu_dma_latency", |
| 73 | .default_value = 2000 * USEC_PER_SEC, | 80 | .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, |
| 81 | .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, | ||
| 74 | .type = PM_QOS_MIN, | 82 | .type = PM_QOS_MIN, |
| 75 | }; | 83 | }; |
| 76 | 84 | ||
| 77 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); | 85 | static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); |
| 78 | static struct pm_qos_object network_lat_pm_qos = { | 86 | static struct pm_qos_object network_lat_pm_qos = { |
| 79 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), | 87 | .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), |
| 80 | .notifiers = &network_lat_notifier, | 88 | .notifiers = &network_lat_notifier, |
| 81 | .name = "network_latency", | 89 | .name = "network_latency", |
| 82 | .default_value = 2000 * USEC_PER_SEC, | 90 | .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, |
| 91 | .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, | ||
| 83 | .type = PM_QOS_MIN | 92 | .type = PM_QOS_MIN |
| 84 | }; | 93 | }; |
| 85 | 94 | ||
| 86 | 95 | ||
| 87 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); | 96 | static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); |
| 88 | static struct pm_qos_object network_throughput_pm_qos = { | 97 | static struct pm_qos_object network_throughput_pm_qos = { |
| 89 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), | 98 | .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), |
| 90 | .notifiers = &network_throughput_notifier, | 99 | .notifiers = &network_throughput_notifier, |
| 91 | .name = "network_throughput", | 100 | .name = "network_throughput", |
| 92 | .default_value = 0, | 101 | .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, |
| 102 | .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, | ||
| 93 | .type = PM_QOS_MAX, | 103 | .type = PM_QOS_MAX, |
| 94 | }; | 104 | }; |
| 95 | 105 | ||
| @@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) | |||
| 135 | } | 145 | } |
| 136 | } | 146 | } |
| 137 | 147 | ||
| 148 | static inline s32 pm_qos_read_value(struct pm_qos_object *o) | ||
| 149 | { | ||
| 150 | return o->target_value; | ||
| 151 | } | ||
| 152 | |||
| 153 | static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) | ||
| 154 | { | ||
| 155 | o->target_value = value; | ||
| 156 | } | ||
| 157 | |||
| 138 | static void update_target(struct pm_qos_object *o, struct plist_node *node, | 158 | static void update_target(struct pm_qos_object *o, struct plist_node *node, |
| 139 | int del, int value) | 159 | int del, int value) |
| 140 | { | 160 | { |
| @@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node, | |||
| 159 | plist_add(node, &o->requests); | 179 | plist_add(node, &o->requests); |
| 160 | } | 180 | } |
| 161 | curr_value = pm_qos_get_value(o); | 181 | curr_value = pm_qos_get_value(o); |
| 182 | pm_qos_set_value(o, curr_value); | ||
| 162 | spin_unlock_irqrestore(&pm_qos_lock, flags); | 183 | spin_unlock_irqrestore(&pm_qos_lock, flags); |
| 163 | 184 | ||
| 164 | if (prev_value != curr_value) | 185 | if (prev_value != curr_value) |
| @@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor) | |||
| 193 | * pm_qos_request - returns current system wide qos expectation | 214 | * pm_qos_request - returns current system wide qos expectation |
| 194 | * @pm_qos_class: identification of which qos value is requested | 215 | * @pm_qos_class: identification of which qos value is requested |
| 195 | * | 216 | * |
| 196 | * This function returns the current target value in an atomic manner. | 217 | * This function returns the current target value. |
| 197 | */ | 218 | */ |
| 198 | int pm_qos_request(int pm_qos_class) | 219 | int pm_qos_request(int pm_qos_class) |
| 199 | { | 220 | { |
| 200 | unsigned long flags; | 221 | return pm_qos_read_value(pm_qos_array[pm_qos_class]); |
| 201 | int value; | ||
| 202 | |||
| 203 | spin_lock_irqsave(&pm_qos_lock, flags); | ||
| 204 | value = pm_qos_get_value(pm_qos_array[pm_qos_class]); | ||
| 205 | spin_unlock_irqrestore(&pm_qos_lock, flags); | ||
| 206 | |||
| 207 | return value; | ||
| 208 | } | 222 | } |
| 209 | EXPORT_SYMBOL_GPL(pm_qos_request); | 223 | EXPORT_SYMBOL_GPL(pm_qos_request); |
| 210 | 224 | ||
| @@ -385,7 +399,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, | |||
| 385 | s32 value; | 399 | s32 value; |
| 386 | unsigned long flags; | 400 | unsigned long flags; |
| 387 | struct pm_qos_object *o; | 401 | struct pm_qos_object *o; |
| 388 | struct pm_qos_request_list *pm_qos_req = filp->private_data;; | 402 | struct pm_qos_request_list *pm_qos_req = filp->private_data; |
| 389 | 403 | ||
| 390 | if (!pm_qos_req) | 404 | if (!pm_qos_req) |
| 391 | return -EINVAL; | 405 | return -EINVAL; |
| @@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, | |||
| 404 | size_t count, loff_t *f_pos) | 418 | size_t count, loff_t *f_pos) |
| 405 | { | 419 | { |
| 406 | s32 value; | 420 | s32 value; |
| 407 | int x; | ||
| 408 | char ascii_value[11]; | ||
| 409 | struct pm_qos_request_list *pm_qos_req; | 421 | struct pm_qos_request_list *pm_qos_req; |
| 410 | 422 | ||
| 411 | if (count == sizeof(s32)) { | 423 | if (count == sizeof(s32)) { |
| 412 | if (copy_from_user(&value, buf, sizeof(s32))) | 424 | if (copy_from_user(&value, buf, sizeof(s32))) |
| 413 | return -EFAULT; | 425 | return -EFAULT; |
| 414 | } else if (count == 11) { /* len('0x12345678/0') */ | 426 | } else if (count <= 11) { /* ASCII perhaps? */ |
| 415 | if (copy_from_user(ascii_value, buf, 11)) | 427 | char ascii_value[11]; |
| 428 | unsigned long int ulval; | ||
| 429 | int ret; | ||
| 430 | |||
| 431 | if (copy_from_user(ascii_value, buf, count)) | ||
| 416 | return -EFAULT; | 432 | return -EFAULT; |
| 417 | if (strlen(ascii_value) != 10) | 433 | |
| 418 | return -EINVAL; | 434 | if (count > 10) { |
| 419 | x = sscanf(ascii_value, "%x", &value); | 435 | if (ascii_value[10] == '\n') |
| 420 | if (x != 1) | 436 | ascii_value[10] = '\0'; |
| 437 | else | ||
| 438 | return -EINVAL; | ||
| 439 | } else { | ||
| 440 | ascii_value[count] = '\0'; | ||
| 441 | } | ||
| 442 | ret = strict_strtoul(ascii_value, 16, &ulval); | ||
| 443 | if (ret) { | ||
| 444 | pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); | ||
| 421 | return -EINVAL; | 445 | return -EINVAL; |
| 422 | pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); | 446 | } |
| 423 | } else | 447 | value = (s32)lower_32_bits(ulval); |
| 448 | } else { | ||
| 424 | return -EINVAL; | 449 | return -EINVAL; |
| 450 | } | ||
| 425 | 451 | ||
| 426 | pm_qos_req = filp->private_data; | 452 | pm_qos_req = filp->private_data; |
| 427 | pm_qos_update_request(pm_qos_req, value); | 453 | pm_qos_update_request(pm_qos_req, value); |
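The pm_qos hunks cache the aggregated constraint in target_value so pm_qos_request() can return it without taking pm_qos_lock; writers still recompute and publish it under the spinlock, and the added comment relies on 32-bit loads and stores being atomic. A generic sketch of that update-under-lock, read-locklessly pattern; the names are not the pm_qos ones:

#include <linux/spinlock.h>
#include <linux/types.h>

struct constraint_set {
	spinlock_t lock;
	s32 target;	/* 32-bit on purpose: naturally aligned loads
			 * and stores of this size are not torn */
};

static void constraint_update(struct constraint_set *c, s32 new_target)
{
	unsigned long flags;

	spin_lock_irqsave(&c->lock, flags);
	/* ...recompute the aggregate from the request list here... */
	c->target = new_target;			/* publish */
	spin_unlock_irqrestore(&c->lock, flags);
}

static s32 constraint_read(const struct constraint_set *c)
{
	/* Lockless: a reader may see an old value, never a torn one. */
	return c->target;
}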
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0791b13df7bf..58f405b581e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
| @@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1514 | return -EFAULT; | 1514 | return -EFAULT; |
| 1515 | 1515 | ||
| 1516 | restart_block->fn = posix_cpu_nsleep_restart; | 1516 | restart_block->fn = posix_cpu_nsleep_restart; |
| 1517 | restart_block->nanosleep.index = which_clock; | 1517 | restart_block->nanosleep.clockid = which_clock; |
| 1518 | restart_block->nanosleep.rmtp = rmtp; | 1518 | restart_block->nanosleep.rmtp = rmtp; |
| 1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); | 1519 | restart_block->nanosleep.expires = timespec_to_ns(rqtp); |
| 1520 | } | 1520 | } |
| @@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, | |||
| 1523 | 1523 | ||
| 1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) | 1524 | static long posix_cpu_nsleep_restart(struct restart_block *restart_block) |
| 1525 | { | 1525 | { |
| 1526 | clockid_t which_clock = restart_block->nanosleep.index; | 1526 | clockid_t which_clock = restart_block->nanosleep.clockid; |
| 1527 | struct timespec t; | 1527 | struct timespec t; |
| 1528 | struct itimerspec it; | 1528 | struct itimerspec it; |
| 1529 | int error; | 1529 | int error; |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e5498d7405c3..4556182527f3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void) | |||
| 491 | return tmr; | 491 | return tmr; |
| 492 | } | 492 | } |
| 493 | 493 | ||
| 494 | static void k_itimer_rcu_free(struct rcu_head *head) | ||
| 495 | { | ||
| 496 | struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); | ||
| 497 | |||
| 498 | kmem_cache_free(posix_timers_cache, tmr); | ||
| 499 | } | ||
| 500 | |||
| 494 | #define IT_ID_SET 1 | 501 | #define IT_ID_SET 1 |
| 495 | #define IT_ID_NOT_SET 0 | 502 | #define IT_ID_NOT_SET 0 |
| 496 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | 503 | static void release_posix_timer(struct k_itimer *tmr, int it_id_set) |
| @@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) | |||
| 503 | } | 510 | } |
| 504 | put_pid(tmr->it_pid); | 511 | put_pid(tmr->it_pid); |
| 505 | sigqueue_free(tmr->sigq); | 512 | sigqueue_free(tmr->sigq); |
| 506 | kmem_cache_free(posix_timers_cache, tmr); | 513 | call_rcu(&tmr->it.rcu, k_itimer_rcu_free); |
| 507 | } | 514 | } |
| 508 | 515 | ||
| 509 | static struct k_clock *clockid_to_kclock(const clockid_t id) | 516 | static struct k_clock *clockid_to_kclock(const clockid_t id) |
| @@ -631,22 +638,18 @@ out: | |||
| 631 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) | 638 | static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) |
| 632 | { | 639 | { |
| 633 | struct k_itimer *timr; | 640 | struct k_itimer *timr; |
| 634 | /* | 641 | |
| 635 | * Watch out here. We do a irqsave on the idr_lock and pass the | 642 | rcu_read_lock(); |
| 636 | * flags part over to the timer lock. Must not let interrupts in | ||
| 637 | * while we are moving the lock. | ||
| 638 | */ | ||
| 639 | spin_lock_irqsave(&idr_lock, *flags); | ||
| 640 | timr = idr_find(&posix_timers_id, (int)timer_id); | 643 | timr = idr_find(&posix_timers_id, (int)timer_id); |
| 641 | if (timr) { | 644 | if (timr) { |
| 642 | spin_lock(&timr->it_lock); | 645 | spin_lock_irqsave(&timr->it_lock, *flags); |
| 643 | if (timr->it_signal == current->signal) { | 646 | if (timr->it_signal == current->signal) { |
| 644 | spin_unlock(&idr_lock); | 647 | rcu_read_unlock(); |
| 645 | return timr; | 648 | return timr; |
| 646 | } | 649 | } |
| 647 | spin_unlock(&timr->it_lock); | 650 | spin_unlock_irqrestore(&timr->it_lock, *flags); |
| 648 | } | 651 | } |
| 649 | spin_unlock_irqrestore(&idr_lock, *flags); | 652 | rcu_read_unlock(); |
| 650 | 653 | ||
| 651 | return NULL; | 654 | return NULL; |
| 652 | } | 655 | } |
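The hunk above replaces the global idr_lock with an RCU read-side critical section around the lookup. A rough userspace analogue of the resulting locking order (a rwlock and a fixed array stand in for RCU and the IDR, and every name here is illustrative): find the timer under read-side protection, take its per-timer lock, validate ownership, and only then drop the read-side protection. Freeing through call_rcu()/k_itimer_rcu_free() above is what makes it safe for the lookup to touch the object before its lock is taken.

```c
#include <pthread.h>
#include <stdio.h>

struct timer_sketch {
	pthread_mutex_t it_lock;
	const void *it_signal;		/* owner cookie */
};

static int owner_cookie;
static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct timer_sketch timers[4] = {
	[0] = { PTHREAD_MUTEX_INITIALIZER, &owner_cookie },
};

static struct timer_sketch *lock_timer(int id, const void *current_signal)
{
	struct timer_sketch *timr = NULL;

	pthread_rwlock_rdlock(&table_lock);	/* rcu_read_lock() analogue */
	if (id >= 0 && id < 4)			/* idr_find() analogue */
		timr = &timers[id];
	if (timr) {
		pthread_mutex_lock(&timr->it_lock);
		if (timr->it_signal == current_signal) {
			pthread_rwlock_unlock(&table_lock);
			return timr;		/* returned with it_lock held */
		}
		pthread_mutex_unlock(&timr->it_lock);
	}
	pthread_rwlock_unlock(&table_lock);
	return NULL;
}

int main(void)
{
	struct timer_sketch *t = lock_timer(0, &owner_cookie);

	if (t) {
		printf("timer locked\n");
		pthread_mutex_unlock(&t->it_lock);
	}
	return 0;
}
```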
| @@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, | |||
| 1056 | */ | 1059 | */ |
| 1057 | long clock_nanosleep_restart(struct restart_block *restart_block) | 1060 | long clock_nanosleep_restart(struct restart_block *restart_block) |
| 1058 | { | 1061 | { |
| 1059 | clockid_t which_clock = restart_block->nanosleep.index; | 1062 | clockid_t which_clock = restart_block->nanosleep.clockid; |
| 1060 | struct k_clock *kc = clockid_to_kclock(which_clock); | 1063 | struct k_clock *kc = clockid_to_kclock(which_clock); |
| 1061 | 1064 | ||
| 1062 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) | 1065 | if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 6de9a8fc3417..b1914cb9095c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -125,12 +125,6 @@ config PM_DEBUG | |||
| 125 | code. This is helpful when debugging and reporting PM bugs, like | 125 | code. This is helpful when debugging and reporting PM bugs, like |
| 126 | suspend support. | 126 | suspend support. |
| 127 | 127 | ||
| 128 | config PM_VERBOSE | ||
| 129 | bool "Verbose Power Management debugging" | ||
| 130 | depends on PM_DEBUG | ||
| 131 | ---help--- | ||
| 132 | This option enables verbose messages from the Power Management code. | ||
| 133 | |||
| 134 | config PM_ADVANCED_DEBUG | 128 | config PM_ADVANCED_DEBUG |
| 135 | bool "Extra PM attributes in sysfs for low-level debugging/testing" | 129 | bool "Extra PM attributes in sysfs for low-level debugging/testing" |
| 136 | depends on PM_DEBUG | 130 | depends on PM_DEBUG |
| @@ -199,8 +193,8 @@ config APM_EMULATION | |||
| 199 | notification of APM "events" (e.g. battery status change). | 193 | notification of APM "events" (e.g. battery status change). |
| 200 | 194 | ||
| 201 | In order to use APM, you will need supporting software. For location | 195 | In order to use APM, you will need supporting software. For location |
| 202 | and more information, read <file:Documentation/power/pm.txt> and the | 196 | and more information, read <file:Documentation/power/apm-acpi.txt> |
| 203 | Battery Powered Linux mini-HOWTO, available from | 197 | and the Battery Powered Linux mini-HOWTO, available from |
| 204 | <http://www.tldp.org/docs.html#howto>. | 198 | <http://www.tldp.org/docs.html#howto>. |
| 205 | 199 | ||
| 206 | This driver does not spin down disk drives (see the hdparm(8) | 200 | This driver does not spin down disk drives (see the hdparm(8) |
| @@ -229,3 +223,11 @@ config PM_OPP | |||
| 229 | representing individual voltage domains and provides SOC | 223 | representing individual voltage domains and provides SOC |
| 230 | implementations a ready to use framework to manage OPPs. | 224 | implementations a ready to use framework to manage OPPs. |
| 231 | For more information, read <file:Documentation/power/opp.txt> | 225 | For more information, read <file:Documentation/power/opp.txt> |
| 226 | |||
| 227 | config PM_CLK | ||
| 228 | def_bool y | ||
| 229 | depends on PM && HAVE_CLK | ||
| 230 | |||
| 231 | config PM_GENERIC_DOMAINS | ||
| 232 | bool | ||
| 233 | depends on PM | ||
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 50aae660174d..8f7b1db1ece1 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -25,7 +25,6 @@ | |||
| 25 | #include <linux/gfp.h> | 25 | #include <linux/gfp.h> |
| 26 | #include <linux/syscore_ops.h> | 26 | #include <linux/syscore_ops.h> |
| 27 | #include <scsi/scsi_scan.h> | 27 | #include <scsi/scsi_scan.h> |
| 28 | #include <asm/suspend.h> | ||
| 29 | 28 | ||
| 30 | #include "power.h" | 29 | #include "power.h" |
| 31 | 30 | ||
| @@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN; | |||
| 55 | static const struct platform_hibernation_ops *hibernation_ops; | 54 | static const struct platform_hibernation_ops *hibernation_ops; |
| 56 | 55 | ||
| 57 | /** | 56 | /** |
| 58 | * hibernation_set_ops - set the global hibernate operations | 57 | * hibernation_set_ops - Set the global hibernate operations. |
| 59 | * @ops: the hibernation operations to use in subsequent hibernation transitions | 58 | * @ops: Hibernation operations to use in subsequent hibernation transitions. |
| 60 | */ | 59 | */ |
| 61 | |||
| 62 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) | 60 | void hibernation_set_ops(const struct platform_hibernation_ops *ops) |
| 63 | { | 61 | { |
| 64 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot | 62 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
| @@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; } | |||
| 115 | #endif /* !CONFIG_PM_DEBUG */ | 113 | #endif /* !CONFIG_PM_DEBUG */ |
| 116 | 114 | ||
| 117 | /** | 115 | /** |
| 118 | * platform_begin - tell the platform driver that we're starting | 116 | * platform_begin - Call platform to start hibernation. |
| 119 | * hibernation | 117 | * @platform_mode: Whether or not to use the platform driver. |
| 120 | */ | 118 | */ |
| 121 | |||
| 122 | static int platform_begin(int platform_mode) | 119 | static int platform_begin(int platform_mode) |
| 123 | { | 120 | { |
| 124 | return (platform_mode && hibernation_ops) ? | 121 | return (platform_mode && hibernation_ops) ? |
| @@ -126,10 +123,9 @@ static int platform_begin(int platform_mode) | |||
| 126 | } | 123 | } |
| 127 | 124 | ||
| 128 | /** | 125 | /** |
| 129 | * platform_end - tell the platform driver that we've entered the | 126 | * platform_end - Call platform to finish transition to the working state. |
| 130 | * working state | 127 | * @platform_mode: Whether or not to use the platform driver. |
| 131 | */ | 128 | */ |
| 132 | |||
| 133 | static void platform_end(int platform_mode) | 129 | static void platform_end(int platform_mode) |
| 134 | { | 130 | { |
| 135 | if (platform_mode && hibernation_ops) | 131 | if (platform_mode && hibernation_ops) |
| @@ -137,8 +133,11 @@ static void platform_end(int platform_mode) | |||
| 137 | } | 133 | } |
| 138 | 134 | ||
| 139 | /** | 135 | /** |
| 140 | * platform_pre_snapshot - prepare the machine for hibernation using the | 136 | * platform_pre_snapshot - Call platform to prepare the machine for hibernation. |
| 141 | * platform driver if so configured and return an error code if it fails | 137 | * @platform_mode: Whether or not to use the platform driver. |
| 138 | * | ||
| 139 | * Use the platform driver to prepare the system for creating a hibernate image, | ||
| 140 | * if so configured, and return an error code if that fails. | ||
| 142 | */ | 141 | */ |
| 143 | 142 | ||
| 144 | static int platform_pre_snapshot(int platform_mode) | 143 | static int platform_pre_snapshot(int platform_mode) |
| @@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode) | |||
| 148 | } | 147 | } |
| 149 | 148 | ||
| 150 | /** | 149 | /** |
| 151 | * platform_leave - prepare the machine for switching to the normal mode | 150 | * platform_leave - Call platform to prepare a transition to the working state. |
| 152 | * of operation using the platform driver (called with interrupts disabled) | 151 | * @platform_mode: Whether or not to use the platform driver. |
| 152 | * | ||
| 153 | * Use the platform driver prepare to prepare the machine for switching to the | ||
| 154 | * normal mode of operation. | ||
| 155 | * | ||
| 156 | * This routine is called on one CPU with interrupts disabled. | ||
| 153 | */ | 157 | */ |
| 154 | |||
| 155 | static void platform_leave(int platform_mode) | 158 | static void platform_leave(int platform_mode) |
| 156 | { | 159 | { |
| 157 | if (platform_mode && hibernation_ops) | 160 | if (platform_mode && hibernation_ops) |
| @@ -159,10 +162,14 @@ static void platform_leave(int platform_mode) | |||
| 159 | } | 162 | } |
| 160 | 163 | ||
| 161 | /** | 164 | /** |
| 162 | * platform_finish - switch the machine to the normal mode of operation | 165 | * platform_finish - Call platform to switch the system to the working state. |
| 163 | * using the platform driver (must be called after platform_prepare()) | 166 | * @platform_mode: Whether or not to use the platform driver. |
| 167 | * | ||
| 168 | * Use the platform driver to switch the machine to the normal mode of | ||
| 169 | * operation. | ||
| 170 | * | ||
| 171 | * This routine must be called after platform_prepare(). | ||
| 164 | */ | 172 | */ |
| 165 | |||
| 166 | static void platform_finish(int platform_mode) | 173 | static void platform_finish(int platform_mode) |
| 167 | { | 174 | { |
| 168 | if (platform_mode && hibernation_ops) | 175 | if (platform_mode && hibernation_ops) |
| @@ -170,11 +177,15 @@ static void platform_finish(int platform_mode) | |||
| 170 | } | 177 | } |
| 171 | 178 | ||
| 172 | /** | 179 | /** |
| 173 | * platform_pre_restore - prepare the platform for the restoration from a | 180 | * platform_pre_restore - Prepare for hibernate image restoration. |
| 174 | * hibernation image. If the restore fails after this function has been | 181 | * @platform_mode: Whether or not to use the platform driver. |
| 175 | * called, platform_restore_cleanup() must be called. | 182 | * |
| 183 | * Use the platform driver to prepare the system for resume from a hibernation | ||
| 184 | * image. | ||
| 185 | * | ||
| 186 | * If the restore fails after this function has been called, | ||
| 187 | * platform_restore_cleanup() must be called. | ||
| 176 | */ | 188 | */ |
| 177 | |||
| 178 | static int platform_pre_restore(int platform_mode) | 189 | static int platform_pre_restore(int platform_mode) |
| 179 | { | 190 | { |
| 180 | return (platform_mode && hibernation_ops) ? | 191 | return (platform_mode && hibernation_ops) ? |
| @@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode) | |||
| 182 | } | 193 | } |
| 183 | 194 | ||
| 184 | /** | 195 | /** |
| 185 | * platform_restore_cleanup - switch the platform to the normal mode of | 196 | * platform_restore_cleanup - Switch to the working state after failing restore. |
| 186 | * operation after a failing restore. If platform_pre_restore() has been | 197 | * @platform_mode: Whether or not to use the platform driver. |
| 187 | * called before the failing restore, this function must be called too, | 198 | * |
| 188 | * regardless of the result of platform_pre_restore(). | 199 | * Use the platform driver to switch the system to the normal mode of operation |
| 200 | * after a failing restore. | ||
| 201 | * | ||
| 202 | * If platform_pre_restore() has been called before the failing restore, this | ||
| 203 | * function must be called too, regardless of the result of | ||
| 204 | * platform_pre_restore(). | ||
| 189 | */ | 205 | */ |
| 190 | |||
| 191 | static void platform_restore_cleanup(int platform_mode) | 206 | static void platform_restore_cleanup(int platform_mode) |
| 192 | { | 207 | { |
| 193 | if (platform_mode && hibernation_ops) | 208 | if (platform_mode && hibernation_ops) |
| @@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode) | |||
| 195 | } | 210 | } |
| 196 | 211 | ||
| 197 | /** | 212 | /** |
| 198 | * platform_recover - recover the platform from a failure to suspend | 213 | * platform_recover - Recover from a failure to suspend devices. |
| 199 | * devices. | 214 | * @platform_mode: Whether or not to use the platform driver. |
| 200 | */ | 215 | */ |
| 201 | |||
| 202 | static void platform_recover(int platform_mode) | 216 | static void platform_recover(int platform_mode) |
| 203 | { | 217 | { |
| 204 | if (platform_mode && hibernation_ops && hibernation_ops->recover) | 218 | if (platform_mode && hibernation_ops && hibernation_ops->recover) |
| @@ -206,13 +220,12 @@ static void platform_recover(int platform_mode) | |||
| 206 | } | 220 | } |
| 207 | 221 | ||
| 208 | /** | 222 | /** |
| 209 | * swsusp_show_speed - print the time elapsed between two events. | 223 | * swsusp_show_speed - Print time elapsed between two events during hibernation. |
| 210 | * @start: Starting event. | 224 | * @start: Starting event. |
| 211 | * @stop: Final event. | 225 | * @stop: Final event. |
| 212 | * @nr_pages - number of pages processed between @start and @stop | 226 | * @nr_pages: Number of memory pages processed between @start and @stop. |
| 213 | * @msg - introductory message to print | 227 | * @msg: Additional diagnostic message to print. |
| 214 | */ | 228 | */ |
| 215 | |||
| 216 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | 229 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, |
| 217 | unsigned nr_pages, char *msg) | 230 | unsigned nr_pages, char *msg) |
| 218 | { | 231 | { |
| @@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, | |||
| 235 | } | 248 | } |
| 236 | 249 | ||
| 237 | /** | 250 | /** |
| 238 | * create_image - freeze devices that need to be frozen with interrupts | 251 | * create_image - Create a hibernation image. |
| 239 | * off, create the hibernation image and thaw those devices. Control | 252 | * @platform_mode: Whether or not to use the platform driver. |
| 240 | * reappears in this routine after a restore. | 253 | * |
| 254 | * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image | ||
| 255 | * and execute the drivers' .thaw_noirq() callbacks. | ||
| 256 | * | ||
| 257 | * Control reappears in this routine after the subsequent restore. | ||
| 241 | */ | 258 | */ |
| 242 | |||
| 243 | static int create_image(int platform_mode) | 259 | static int create_image(int platform_mode) |
| 244 | { | 260 | { |
| 245 | int error; | 261 | int error; |
| 246 | 262 | ||
| 247 | error = arch_prepare_suspend(); | ||
| 248 | if (error) | ||
| 249 | return error; | ||
| 250 | |||
| 251 | /* At this point, dpm_suspend_start() has been called, but *not* | ||
| 252 | * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now. | ||
| 253 | * Otherwise, drivers for some devices (e.g. interrupt controllers) | ||
| 254 | * become desynchronized with the actual state of the hardware | ||
| 255 | * at resume time, and evil weirdness ensues. | ||
| 256 | */ | ||
| 257 | error = dpm_suspend_noirq(PMSG_FREEZE); | 263 | error = dpm_suspend_noirq(PMSG_FREEZE); |
| 258 | if (error) { | 264 | if (error) { |
| 259 | printk(KERN_ERR "PM: Some devices failed to power down, " | 265 | printk(KERN_ERR "PM: Some devices failed to power down, " |
| @@ -272,12 +278,7 @@ static int create_image(int platform_mode) | |||
| 272 | 278 | ||
| 273 | local_irq_disable(); | 279 | local_irq_disable(); |
| 274 | 280 | ||
| 275 | error = sysdev_suspend(PMSG_FREEZE); | 281 | error = syscore_suspend(); |
| 276 | if (!error) { | ||
| 277 | error = syscore_suspend(); | ||
| 278 | if (error) | ||
| 279 | sysdev_resume(); | ||
| 280 | } | ||
| 281 | if (error) { | 282 | if (error) { |
| 282 | printk(KERN_ERR "PM: Some system devices failed to power down, " | 283 | printk(KERN_ERR "PM: Some system devices failed to power down, " |
| 283 | "aborting hibernation\n"); | 284 | "aborting hibernation\n"); |
| @@ -302,10 +303,6 @@ static int create_image(int platform_mode) | |||
| 302 | 303 | ||
| 303 | Power_up: | 304 | Power_up: |
| 304 | syscore_resume(); | 305 | syscore_resume(); |
| 305 | sysdev_resume(); | ||
| 306 | /* NOTE: dpm_resume_noirq() is just a resume() for devices | ||
| 307 | * that suspended with irqs off ... no overall powerup. | ||
| 308 | */ | ||
| 309 | 306 | ||
| 310 | Enable_irqs: | 307 | Enable_irqs: |
| 311 | local_irq_enable(); | 308 | local_irq_enable(); |
| @@ -323,30 +320,32 @@ static int create_image(int platform_mode) | |||
| 323 | } | 320 | } |
| 324 | 321 | ||
| 325 | /** | 322 | /** |
| 326 | * hibernation_snapshot - quiesce devices and create the hibernation | 323 | * hibernation_snapshot - Quiesce devices and create a hibernation image. |
| 327 | * snapshot image. | 324 | * @platform_mode: If set, use platform driver to prepare for the transition. |
| 328 | * @platform_mode - if set, use the platform driver, if available, to | ||
| 329 | * prepare the platform firmware for the power transition. | ||
| 330 | * | 325 | * |
| 331 | * Must be called with pm_mutex held | 326 | * This routine must be called with pm_mutex held. |
| 332 | */ | 327 | */ |
| 333 | |||
| 334 | int hibernation_snapshot(int platform_mode) | 328 | int hibernation_snapshot(int platform_mode) |
| 335 | { | 329 | { |
| 330 | pm_message_t msg = PMSG_RECOVER; | ||
| 336 | int error; | 331 | int error; |
| 337 | 332 | ||
| 338 | error = platform_begin(platform_mode); | 333 | error = platform_begin(platform_mode); |
| 339 | if (error) | 334 | if (error) |
| 340 | goto Close; | 335 | goto Close; |
| 341 | 336 | ||
| 337 | error = dpm_prepare(PMSG_FREEZE); | ||
| 338 | if (error) | ||
| 339 | goto Complete_devices; | ||
| 340 | |||
| 342 | /* Preallocate image memory before shutting down devices. */ | 341 | /* Preallocate image memory before shutting down devices. */ |
| 343 | error = hibernate_preallocate_memory(); | 342 | error = hibernate_preallocate_memory(); |
| 344 | if (error) | 343 | if (error) |
| 345 | goto Close; | 344 | goto Complete_devices; |
| 346 | 345 | ||
| 347 | suspend_console(); | 346 | suspend_console(); |
| 348 | pm_restrict_gfp_mask(); | 347 | pm_restrict_gfp_mask(); |
| 349 | error = dpm_suspend_start(PMSG_FREEZE); | 348 | error = dpm_suspend(PMSG_FREEZE); |
| 350 | if (error) | 349 | if (error) |
| 351 | goto Recover_platform; | 350 | goto Recover_platform; |
| 352 | 351 | ||
| @@ -364,13 +363,17 @@ int hibernation_snapshot(int platform_mode) | |||
| 364 | if (error || !in_suspend) | 363 | if (error || !in_suspend) |
| 365 | swsusp_free(); | 364 | swsusp_free(); |
| 366 | 365 | ||
| 367 | dpm_resume_end(in_suspend ? | 366 | msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; |
| 368 | (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); | 367 | dpm_resume(msg); |
| 369 | 368 | ||
| 370 | if (error || !in_suspend) | 369 | if (error || !in_suspend) |
| 371 | pm_restore_gfp_mask(); | 370 | pm_restore_gfp_mask(); |
| 372 | 371 | ||
| 373 | resume_console(); | 372 | resume_console(); |
| 373 | |||
| 374 | Complete_devices: | ||
| 375 | dpm_complete(msg); | ||
| 376 | |||
| 374 | Close: | 377 | Close: |
| 375 | platform_end(platform_mode); | 378 | platform_end(platform_mode); |
| 376 | return error; | 379 | return error; |
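To make the new error unwinding above easier to follow, here is a condensed control-flow sketch with stub callbacks (the Recover_platform leg is omitted and all names are simplified stand-ins); the point is that dpm_prepare() now runs before image preallocation and every failure path reaches dpm_complete(msg), with msg chosen the same way the patch does:

```c
#include <stdio.h>

enum msg { MSG_FREEZE, MSG_THAW, MSG_RECOVER, MSG_RESTORE };

static int platform_begin(void) { return 0; }
static void platform_end(void) { }
static int dpm_prepare(enum msg m) { (void)m; return 0; }
static void dpm_complete(enum msg m) { printf("dpm_complete(%d)\n", m); }
static int preallocate_memory(void) { return 0; }
static int dpm_suspend(enum msg m) { (void)m; return 0; }
static int create_image(void) { return 0; }
static void dpm_resume(enum msg m) { (void)m; }

static int in_suspend = 1;

int snapshot_sketch(void)
{
	enum msg msg = MSG_RECOVER;
	int error;

	error = platform_begin();
	if (error)
		goto Close;

	error = dpm_prepare(MSG_FREEZE);
	if (error)
		goto Complete_devices;

	error = preallocate_memory();
	if (error)
		goto Complete_devices;	/* previously skipped device completion */

	error = dpm_suspend(MSG_FREEZE);
	if (!error)
		error = create_image();

	msg = in_suspend ? (error ? MSG_RECOVER : MSG_THAW) : MSG_RESTORE;
	dpm_resume(msg);

Complete_devices:
	dpm_complete(msg);
Close:
	platform_end();
	return error;
}

int main(void) { return snapshot_sketch(); }
```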
| @@ -381,13 +384,14 @@ int hibernation_snapshot(int platform_mode) | |||
| 381 | } | 384 | } |
| 382 | 385 | ||
| 383 | /** | 386 | /** |
| 384 | * resume_target_kernel - prepare devices that need to be suspended with | 387 | * resume_target_kernel - Restore system state from a hibernation image. |
| 385 | * interrupts off, restore the contents of highmem that have not been | 388 | * @platform_mode: Whether or not to use the platform driver. |
| 386 | * restored yet from the image and run the low level code that will restore | 389 | * |
| 387 | * the remaining contents of memory and switch to the just restored target | 390 | * Execute device drivers' .freeze_noirq() callbacks, restore the contents of |
| 388 | * kernel. | 391 | * highmem that have not been restored yet from the image and run the low-level |
| 392 | * code that will restore the remaining contents of memory and switch to the | ||
| 393 | * just restored target kernel. | ||
| 389 | */ | 394 | */ |
| 390 | |||
| 391 | static int resume_target_kernel(bool platform_mode) | 395 | static int resume_target_kernel(bool platform_mode) |
| 392 | { | 396 | { |
| 393 | int error; | 397 | int error; |
| @@ -409,40 +413,36 @@ static int resume_target_kernel(bool platform_mode) | |||
| 409 | 413 | ||
| 410 | local_irq_disable(); | 414 | local_irq_disable(); |
| 411 | 415 | ||
| 412 | error = sysdev_suspend(PMSG_QUIESCE); | 416 | error = syscore_suspend(); |
| 413 | if (!error) { | ||
| 414 | error = syscore_suspend(); | ||
| 415 | if (error) | ||
| 416 | sysdev_resume(); | ||
| 417 | } | ||
| 418 | if (error) | 417 | if (error) |
| 419 | goto Enable_irqs; | 418 | goto Enable_irqs; |
| 420 | 419 | ||
| 421 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | ||
| 422 | save_processor_state(); | 420 | save_processor_state(); |
| 423 | error = restore_highmem(); | 421 | error = restore_highmem(); |
| 424 | if (!error) { | 422 | if (!error) { |
| 425 | error = swsusp_arch_resume(); | 423 | error = swsusp_arch_resume(); |
| 426 | /* | 424 | /* |
| 427 | * The code below is only ever reached in case of a failure. | 425 | * The code below is only ever reached in case of a failure. |
| 428 | * Otherwise execution continues at place where | 426 | * Otherwise, execution continues at the place where |
| 429 | * swsusp_arch_suspend() was called | 427 | * swsusp_arch_suspend() was called. |
| 430 | */ | 428 | */ |
| 431 | BUG_ON(!error); | 429 | BUG_ON(!error); |
| 432 | /* This call to restore_highmem() undos the previous one */ | 430 | /* |
| 431 | * This call to restore_highmem() reverts the changes made by | ||
| 432 | * the previous one. | ||
| 433 | */ | ||
| 433 | restore_highmem(); | 434 | restore_highmem(); |
| 434 | } | 435 | } |
| 435 | /* | 436 | /* |
| 436 | * The only reason why swsusp_arch_resume() can fail is memory being | 437 | * The only reason why swsusp_arch_resume() can fail is memory being |
| 437 | * very tight, so we have to free it as soon as we can to avoid | 438 | * very tight, so we have to free it as soon as we can to avoid |
| 438 | * subsequent failures | 439 | * subsequent failures. |
| 439 | */ | 440 | */ |
| 440 | swsusp_free(); | 441 | swsusp_free(); |
| 441 | restore_processor_state(); | 442 | restore_processor_state(); |
| 442 | touch_softlockup_watchdog(); | 443 | touch_softlockup_watchdog(); |
| 443 | 444 | ||
| 444 | syscore_resume(); | 445 | syscore_resume(); |
| 445 | sysdev_resume(); | ||
| 446 | 446 | ||
| 447 | Enable_irqs: | 447 | Enable_irqs: |
| 448 | local_irq_enable(); | 448 | local_irq_enable(); |
| @@ -459,14 +459,12 @@ static int resume_target_kernel(bool platform_mode) | |||
| 459 | } | 459 | } |
| 460 | 460 | ||
| 461 | /** | 461 | /** |
| 462 | * hibernation_restore - quiesce devices and restore the hibernation | 462 | * hibernation_restore - Quiesce devices and restore from a hibernation image. |
| 463 | * snapshot image. If successful, control returns in hibernation_snaphot() | 463 | * @platform_mode: If set, use platform driver to prepare for the transition. |
| 464 | * @platform_mode - if set, use the platform driver, if available, to | ||
| 465 | * prepare the platform firmware for the transition. | ||
| 466 | * | 464 | * |
| 467 | * Must be called with pm_mutex held | 465 | * This routine must be called with pm_mutex held. If it is successful, control |
| 466 | * reappears in the restored target kernel in hibernation_snaphot(). | ||
| 468 | */ | 467 | */ |
| 469 | |||
| 470 | int hibernation_restore(int platform_mode) | 468 | int hibernation_restore(int platform_mode) |
| 471 | { | 469 | { |
| 472 | int error; | 470 | int error; |
| @@ -486,10 +484,8 @@ int hibernation_restore(int platform_mode) | |||
| 486 | } | 484 | } |
| 487 | 485 | ||
| 488 | /** | 486 | /** |
| 489 | * hibernation_platform_enter - enter the hibernation state using the | 487 | * hibernation_platform_enter - Power off the system using the platform driver. |
| 490 | * platform driver (if available) | ||
| 491 | */ | 488 | */ |
| 492 | |||
| 493 | int hibernation_platform_enter(void) | 489 | int hibernation_platform_enter(void) |
| 494 | { | 490 | { |
| 495 | int error; | 491 | int error; |
| @@ -528,7 +524,6 @@ int hibernation_platform_enter(void) | |||
| 528 | goto Platform_finish; | 524 | goto Platform_finish; |
| 529 | 525 | ||
| 530 | local_irq_disable(); | 526 | local_irq_disable(); |
| 531 | sysdev_suspend(PMSG_HIBERNATE); | ||
| 532 | syscore_suspend(); | 527 | syscore_suspend(); |
| 533 | if (pm_wakeup_pending()) { | 528 | if (pm_wakeup_pending()) { |
| 534 | error = -EAGAIN; | 529 | error = -EAGAIN; |
| @@ -541,7 +536,6 @@ int hibernation_platform_enter(void) | |||
| 541 | 536 | ||
| 542 | Power_up: | 537 | Power_up: |
| 543 | syscore_resume(); | 538 | syscore_resume(); |
| 544 | sysdev_resume(); | ||
| 545 | local_irq_enable(); | 539 | local_irq_enable(); |
| 546 | enable_nonboot_cpus(); | 540 | enable_nonboot_cpus(); |
| 547 | 541 | ||
| @@ -562,12 +556,12 @@ int hibernation_platform_enter(void) | |||
| 562 | } | 556 | } |
| 563 | 557 | ||
| 564 | /** | 558 | /** |
| 565 | * power_down - Shut the machine down for hibernation. | 559 | * power_down - Shut the machine down for hibernation. |
| 566 | * | 560 | * |
| 567 | * Use the platform driver, if configured so; otherwise try | 561 | * Use the platform driver, if configured, to put the system into the sleep |
| 568 | * to power off or reboot. | 562 | * state corresponding to hibernation, or try to power it off or reboot, |
| 563 | * depending on the value of hibernation_mode. | ||
| 569 | */ | 564 | */ |
| 570 | |||
| 571 | static void power_down(void) | 565 | static void power_down(void) |
| 572 | { | 566 | { |
| 573 | switch (hibernation_mode) { | 567 | switch (hibernation_mode) { |
| @@ -604,9 +598,8 @@ static int prepare_processes(void) | |||
| 604 | } | 598 | } |
| 605 | 599 | ||
| 606 | /** | 600 | /** |
| 607 | * hibernate - The granpappy of the built-in hibernation management | 601 | * hibernate - Carry out system hibernation, including saving the image. |
| 608 | */ | 602 | */ |
| 609 | |||
| 610 | int hibernate(void) | 603 | int hibernate(void) |
| 611 | { | 604 | { |
| 612 | int error; | 605 | int error; |
| @@ -684,17 +677,20 @@ int hibernate(void) | |||
| 684 | 677 | ||
| 685 | 678 | ||
| 686 | /** | 679 | /** |
| 687 | * software_resume - Resume from a saved image. | 680 | * software_resume - Resume from a saved hibernation image. |
| 681 | * | ||
| 682 | * This routine is called as a late initcall, when all devices have been | ||
| 683 | * discovered and initialized already. | ||
| 688 | * | 684 | * |
| 689 | * Called as a late_initcall (so all devices are discovered and | 685 | * The image reading code is called to see if there is a hibernation image |
| 690 | * initialized), we call swsusp to see if we have a saved image or not. | 686 | * available for reading. If that is the case, devices are quiesced and the |
| 691 | * If so, we quiesce devices, the restore the saved image. We will | 687 | * contents of memory is restored from the saved image. |
| 692 | * return above (in hibernate() ) if everything goes well. | ||
| 693 | * Otherwise, we fail gracefully and return to the normally | ||
| 694 | * scheduled program. | ||
| 695 | * | 688 | * |
| 689 | * If this is successful, control reappears in the restored target kernel in | ||
| 690 | * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine | ||
| 691 | * attempts to recover gracefully and make the kernel return to the normal mode | ||
| 692 | * of operation. | ||
| 696 | */ | 693 | */ |
| 697 | |||
| 698 | static int software_resume(void) | 694 | static int software_resume(void) |
| 699 | { | 695 | { |
| 700 | int error; | 696 | int error; |
| @@ -824,21 +820,17 @@ static const char * const hibernation_modes[] = { | |||
| 824 | [HIBERNATION_TESTPROC] = "testproc", | 820 | [HIBERNATION_TESTPROC] = "testproc", |
| 825 | }; | 821 | }; |
| 826 | 822 | ||
| 827 | /** | 823 | /* |
| 828 | * disk - Control hibernation mode | 824 | * /sys/power/disk - Control hibernation mode. |
| 829 | * | ||
| 830 | * Suspend-to-disk can be handled in several ways. We have a few options | ||
| 831 | * for putting the system to sleep - using the platform driver (e.g. ACPI | ||
| 832 | * or other hibernation_ops), powering off the system or rebooting the | ||
| 833 | * system (for testing) as well as the two test modes. | ||
| 834 | * | 825 | * |
| 835 | * The system can support 'platform', and that is known a priori (and | 826 | * Hibernation can be handled in several ways. There are a few different ways |
| 836 | * encoded by the presence of hibernation_ops). However, the user may | 827 | * to put the system into the sleep state: using the platform driver (e.g. ACPI |
| 837 | * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the | 828 | * or other hibernation_ops), powering it off or rebooting it (for testing |
| 838 | * test modes, 'test' or 'testproc'. | 829 | * mostly), or using one of the two available test modes. |
| 839 | * | 830 | * |
| 840 | * show() will display what the mode is currently set to. | 831 | * The sysfs file /sys/power/disk provides an interface for selecting the |
| 841 | * store() will accept one of | 832 | * hibernation mode to use. Reading from this file causes the available modes |
| 833 | * to be printed. There are 5 modes that can be supported: | ||
| 842 | * | 834 | * |
| 843 | * 'platform' | 835 | * 'platform' |
| 844 | * 'shutdown' | 836 | * 'shutdown' |
| @@ -846,8 +838,14 @@ static const char * const hibernation_modes[] = { | |||
| 846 | * 'test' | 838 | * 'test' |
| 847 | * 'testproc' | 839 | * 'testproc' |
| 848 | * | 840 | * |
| 849 | * It will only change to 'platform' if the system | 841 | * If a platform hibernation driver is in use, 'platform' will be supported |
| 850 | * supports it (as determined by having hibernation_ops). | 842 | * and will be used by default. Otherwise, 'shutdown' will be used by default. |
| 843 | * The selected option (i.e. the one corresponding to the current value of | ||
| 844 | * hibernation_mode) is enclosed by a square bracket. | ||
| 845 | * | ||
| 846 | * To select a given hibernation mode it is necessary to write the mode's | ||
| 847 | * string representation (as returned by reading from /sys/power/disk) back | ||
| 848 | * into /sys/power/disk. | ||
| 851 | */ | 849 | */ |
| 852 | 850 | ||
| 853 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | 851 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, |
| @@ -880,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 880 | return buf-start; | 878 | return buf-start; |
| 881 | } | 879 | } |
| 882 | 880 | ||
| 883 | |||
| 884 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, | 881 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, |
| 885 | const char *buf, size_t n) | 882 | const char *buf, size_t n) |
| 886 | { | 883 | { |
| @@ -982,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att | |||
| 982 | 979 | ||
| 983 | power_attr(image_size); | 980 | power_attr(image_size); |
| 984 | 981 | ||
| 982 | static ssize_t reserved_size_show(struct kobject *kobj, | ||
| 983 | struct kobj_attribute *attr, char *buf) | ||
| 984 | { | ||
| 985 | return sprintf(buf, "%lu\n", reserved_size); | ||
| 986 | } | ||
| 987 | |||
| 988 | static ssize_t reserved_size_store(struct kobject *kobj, | ||
| 989 | struct kobj_attribute *attr, | ||
| 990 | const char *buf, size_t n) | ||
| 991 | { | ||
| 992 | unsigned long size; | ||
| 993 | |||
| 994 | if (sscanf(buf, "%lu", &size) == 1) { | ||
| 995 | reserved_size = size; | ||
| 996 | return n; | ||
| 997 | } | ||
| 998 | |||
| 999 | return -EINVAL; | ||
| 1000 | } | ||
| 1001 | |||
| 1002 | power_attr(reserved_size); | ||
| 1003 | |||
| 985 | static struct attribute * g[] = { | 1004 | static struct attribute * g[] = { |
| 986 | &disk_attr.attr, | 1005 | &disk_attr.attr, |
| 987 | &resume_attr.attr, | 1006 | &resume_attr.attr, |
| 988 | &image_size_attr.attr, | 1007 | &image_size_attr.attr, |
| 1008 | &reserved_size_attr.attr, | ||
| 989 | NULL, | 1009 | NULL, |
| 990 | }; | 1010 | }; |
| 991 | 1011 | ||
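For context, the new attribute exposes snapshot.c's reserved_size in bytes. A small userspace sketch of how it might be queried and tuned, assuming a kernel that carries this patch and root privileges for the write:

```c
#include <stdio.h>

int main(void)
{
	unsigned long reserved = 0;
	FILE *f = fopen("/sys/power/reserved_size", "r");

	if (f) {
		if (fscanf(f, "%lu", &reserved) == 1)
			printf("reserved_size: %lu bytes\n", reserved);
		fclose(f);
	}

	f = fopen("/sys/power/reserved_size", "w");	/* needs root */
	if (f) {
		/* Ask for 2 MiB of slack for driver allocations during hibernation. */
		fprintf(f, "%lu\n", 2UL << 20);
		fclose(f);
	}
	return 0;
}
```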
diff --git a/kernel/power/main.c b/kernel/power/main.c index de9aef8742f4..6c601f871964 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -37,8 +37,9 @@ EXPORT_SYMBOL_GPL(unregister_pm_notifier); | |||
| 37 | 37 | ||
| 38 | int pm_notifier_call_chain(unsigned long val) | 38 | int pm_notifier_call_chain(unsigned long val) |
| 39 | { | 39 | { |
| 40 | return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) | 40 | int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL); |
| 41 | == NOTIFY_BAD) ? -EINVAL : 0; | 41 | |
| 42 | return notifier_to_errno(ret); | ||
| 42 | } | 43 | } |
| 43 | 44 | ||
| 44 | /* If set, devices may be suspended and resumed asynchronously. */ | 45 | /* If set, devices may be suspended and resumed asynchronously. */ |
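The change above stops collapsing every notifier failure into -EINVAL. A toy illustration of the idea with home-grown encode/decode helpers (these are not the kernel's notifier_from_errno()/notifier_to_errno() implementations, only the concept):

```c
#include <errno.h>
#include <stdio.h>

#define STOP_FLAG 0x8000

static int from_errno(int err) { return err ? (STOP_FLAG | -err) : 0; }
static int to_errno(int ret)   { return (ret & STOP_FLAG) ? -(ret & ~STOP_FLAG) : 0; }

static int notifier_cb(void)   { return from_errno(-EBUSY); }	/* refuses suspend */

int main(void)
{
	int ret = notifier_cb();

	/* The caller now recovers the callback's own errno (-EBUSY) instead of
	 * mapping the failure to a generic -EINVAL. */
	printf("pm_notifier_call_chain() would return %d\n", to_errno(ret));
	return 0;
}
```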
| @@ -337,6 +338,7 @@ static int __init pm_init(void) | |||
| 337 | if (error) | 338 | if (error) |
| 338 | return error; | 339 | return error; |
| 339 | hibernate_image_size_init(); | 340 | hibernate_image_size_init(); |
| 341 | hibernate_reserved_size_init(); | ||
| 340 | power_kobj = kobject_create_and_add("power", NULL); | 342 | power_kobj = kobject_create_and_add("power", NULL); |
| 341 | if (!power_kobj) | 343 | if (!power_kobj) |
| 342 | return -ENOMEM; | 344 | return -ENOMEM; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 03634be55f62..9a00a0a26280 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -15,6 +15,7 @@ struct swsusp_info { | |||
| 15 | 15 | ||
| 16 | #ifdef CONFIG_HIBERNATION | 16 | #ifdef CONFIG_HIBERNATION |
| 17 | /* kernel/power/snapshot.c */ | 17 | /* kernel/power/snapshot.c */ |
| 18 | extern void __init hibernate_reserved_size_init(void); | ||
| 18 | extern void __init hibernate_image_size_init(void); | 19 | extern void __init hibernate_image_size_init(void); |
| 19 | 20 | ||
| 20 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER | 21 | #ifdef CONFIG_ARCH_HIBERNATION_HEADER |
| @@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void); | |||
| 55 | 56 | ||
| 56 | #else /* !CONFIG_HIBERNATION */ | 57 | #else /* !CONFIG_HIBERNATION */ |
| 57 | 58 | ||
| 59 | static inline void hibernate_reserved_size_init(void) {} | ||
| 58 | static inline void hibernate_image_size_init(void) {} | 60 | static inline void hibernate_image_size_init(void) {} |
| 59 | #endif /* !CONFIG_HIBERNATION */ | 61 | #endif /* !CONFIG_HIBERNATION */ |
| 60 | 62 | ||
| @@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \ | |||
| 72 | 74 | ||
| 73 | /* Preferred image size in bytes (default 500 MB) */ | 75 | /* Preferred image size in bytes (default 500 MB) */ |
| 74 | extern unsigned long image_size; | 76 | extern unsigned long image_size; |
| 77 | /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ | ||
| 78 | extern unsigned long reserved_size; | ||
| 75 | extern int in_suspend; | 79 | extern int in_suspend; |
| 76 | extern dev_t swsusp_resume_device; | 80 | extern dev_t swsusp_resume_device; |
| 77 | extern sector_t swsusp_resume_block; | 81 | extern sector_t swsusp_resume_block; |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index ca0aacc24874..06efa54f93d6 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -41,16 +41,28 @@ static void swsusp_set_page_forbidden(struct page *); | |||
| 41 | static void swsusp_unset_page_forbidden(struct page *); | 41 | static void swsusp_unset_page_forbidden(struct page *); |
| 42 | 42 | ||
| 43 | /* | 43 | /* |
| 44 | * Number of bytes to reserve for memory allocations made by device drivers | ||
| 45 | * from their ->freeze() and ->freeze_noirq() callbacks so that they don't | ||
| 46 | * cause image creation to fail (tunable via /sys/power/reserved_size). | ||
| 47 | */ | ||
| 48 | unsigned long reserved_size; | ||
| 49 | |||
| 50 | void __init hibernate_reserved_size_init(void) | ||
| 51 | { | ||
| 52 | reserved_size = SPARE_PAGES * PAGE_SIZE; | ||
| 53 | } | ||
| 54 | |||
| 55 | /* | ||
| 44 | * Preferred image size in bytes (tunable via /sys/power/image_size). | 56 | * Preferred image size in bytes (tunable via /sys/power/image_size). |
| 45 | * When it is set to N, the image creating code will do its best to | 57 | * When it is set to N, swsusp will do its best to ensure the image |
| 46 | * ensure the image size will not exceed N bytes, but if that is | 58 | * size will not exceed N bytes, but if that is impossible, it will |
| 47 | * impossible, it will try to create the smallest image possible. | 59 | * try to create the smallest image possible. |
| 48 | */ | 60 | */ |
| 49 | unsigned long image_size; | 61 | unsigned long image_size; |
| 50 | 62 | ||
| 51 | void __init hibernate_image_size_init(void) | 63 | void __init hibernate_image_size_init(void) |
| 52 | { | 64 | { |
| 53 | image_size = (totalram_pages / 3) * PAGE_SIZE; | 65 | image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; |
| 54 | } | 66 | } |
| 55 | 67 | ||
| 56 | /* List of PBEs needed for restoring the pages that were allocated before | 68 | /* List of PBEs needed for restoring the pages that were allocated before |
| @@ -1199,7 +1211,11 @@ static void free_unnecessary_pages(void) | |||
| 1199 | to_free_highmem = alloc_highmem - save; | 1211 | to_free_highmem = alloc_highmem - save; |
| 1200 | } else { | 1212 | } else { |
| 1201 | to_free_highmem = 0; | 1213 | to_free_highmem = 0; |
| 1202 | to_free_normal -= save - alloc_highmem; | 1214 | save -= alloc_highmem; |
| 1215 | if (to_free_normal > save) | ||
| 1216 | to_free_normal -= save; | ||
| 1217 | else | ||
| 1218 | to_free_normal = 0; | ||
| 1203 | } | 1219 | } |
| 1204 | 1220 | ||
| 1205 | memory_bm_position_reset(©_bm); | 1221 | memory_bm_position_reset(©_bm); |
| @@ -1263,11 +1279,13 @@ static unsigned long minimum_image_size(unsigned long saveable) | |||
| 1263 | * frame in use. We also need a number of page frames to be free during | 1279 | * frame in use. We also need a number of page frames to be free during |
| 1264 | * hibernation for allocations made while saving the image and for device | 1280 | * hibernation for allocations made while saving the image and for device |
| 1265 | * drivers, in case they need to allocate memory from their hibernation | 1281 | * drivers, in case they need to allocate memory from their hibernation |
| 1266 | * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES, | 1282 | * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough |
| 1267 | * respectively, both of which are rough estimates). To make this happen, we | 1283 | * estimate) and reserved_size divided by PAGE_SIZE (which is tunable through |
| 1268 | * compute the total number of available page frames and allocate at least | 1284 | * /sys/power/reserved_size, respectively). To make this happen, we compute the |
| 1285 | * total number of available page frames and allocate at least | ||
| 1269 | * | 1286 | * |
| 1270 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES | 1287 | * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 |
| 1288 | * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) | ||
| 1271 | * | 1289 | * |
| 1272 | * of them, which corresponds to the maximum size of a hibernation image. | 1290 | * of them, which corresponds to the maximum size of a hibernation image. |
| 1273 | * | 1291 | * |
| @@ -1322,7 +1340,8 @@ int hibernate_preallocate_memory(void) | |||
| 1322 | count -= totalreserve_pages; | 1340 | count -= totalreserve_pages; |
| 1323 | 1341 | ||
| 1324 | /* Compute the maximum number of saveable pages to leave in memory. */ | 1342 | /* Compute the maximum number of saveable pages to leave in memory. */ |
| 1325 | max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; | 1343 | max_size = (count - (size + PAGES_FOR_IO)) / 2 |
| 1344 | - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); | ||
| 1326 | /* Compute the desired number of image pages specified by image_size. */ | 1345 | /* Compute the desired number of image pages specified by image_size. */ |
| 1327 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); | 1346 | size = DIV_ROUND_UP(image_size, PAGE_SIZE); |
| 1328 | if (size > max_size) | 1347 | if (size > max_size) |
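A quick worked example of the adjusted max_size formula, using made-up numbers for the page count, metadata pages and PAGES_FOR_IO; only the shape of the computation matches the kernel code:

```c
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGES_FOR_IO	1024UL			/* illustrative, not the real constant */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long count = 1UL << 20;	/* ~4 GB of page frames */
	unsigned long meta = 2048;		/* metadata pages, made up */
	unsigned long reserved_size = 1024 * PAGE_SIZE;
	unsigned long max_size;

	/* Old code subtracted a fixed 2 * SPARE_PAGES here. */
	max_size = (count - (meta + PAGES_FOR_IO)) / 2
			- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
	printf("max_size = %lu pages\n", max_size);
	return 0;
}
```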
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8935369d503a..b6b71ad2208f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -44,6 +44,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) | |||
| 44 | suspend_ops = ops; | 44 | suspend_ops = ops; |
| 45 | mutex_unlock(&pm_mutex); | 45 | mutex_unlock(&pm_mutex); |
| 46 | } | 46 | } |
| 47 | EXPORT_SYMBOL_GPL(suspend_set_ops); | ||
| 47 | 48 | ||
| 48 | bool valid_state(suspend_state_t state) | 49 | bool valid_state(suspend_state_t state) |
| 49 | { | 50 | { |
| @@ -65,6 +66,7 @@ int suspend_valid_only_mem(suspend_state_t state) | |||
| 65 | { | 66 | { |
| 66 | return state == PM_SUSPEND_MEM; | 67 | return state == PM_SUSPEND_MEM; |
| 67 | } | 68 | } |
| 69 | EXPORT_SYMBOL_GPL(suspend_valid_only_mem); | ||
| 68 | 70 | ||
| 69 | static int suspend_test(int level) | 71 | static int suspend_test(int level) |
| 70 | { | 72 | { |
| @@ -126,12 +128,13 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) | |||
| 126 | } | 128 | } |
| 127 | 129 | ||
| 128 | /** | 130 | /** |
| 129 | * suspend_enter - enter the desired system sleep state. | 131 | * suspend_enter - enter the desired system sleep state. |
| 130 | * @state: state to enter | 132 | * @state: State to enter |
| 133 | * @wakeup: Returns information that suspend should not be entered again. | ||
| 131 | * | 134 | * |
| 132 | * This function should be called after devices have been suspended. | 135 | * This function should be called after devices have been suspended. |
| 133 | */ | 136 | */ |
| 134 | static int suspend_enter(suspend_state_t state) | 137 | static int suspend_enter(suspend_state_t state, bool *wakeup) |
| 135 | { | 138 | { |
| 136 | int error; | 139 | int error; |
| 137 | 140 | ||
| @@ -163,19 +166,14 @@ static int suspend_enter(suspend_state_t state) | |||
| 163 | arch_suspend_disable_irqs(); | 166 | arch_suspend_disable_irqs(); |
| 164 | BUG_ON(!irqs_disabled()); | 167 | BUG_ON(!irqs_disabled()); |
| 165 | 168 | ||
| 166 | error = sysdev_suspend(PMSG_SUSPEND); | 169 | error = syscore_suspend(); |
| 167 | if (!error) { | 170 | if (!error) { |
| 168 | error = syscore_suspend(); | 171 | *wakeup = pm_wakeup_pending(); |
| 169 | if (error) | 172 | if (!(suspend_test(TEST_CORE) || *wakeup)) { |
| 170 | sysdev_resume(); | ||
| 171 | } | ||
| 172 | if (!error) { | ||
| 173 | if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { | ||
| 174 | error = suspend_ops->enter(state); | 173 | error = suspend_ops->enter(state); |
| 175 | events_check_enabled = false; | 174 | events_check_enabled = false; |
| 176 | } | 175 | } |
| 177 | syscore_resume(); | 176 | syscore_resume(); |
| 178 | sysdev_resume(); | ||
| 179 | } | 177 | } |
| 180 | 178 | ||
| 181 | arch_suspend_enable_irqs(); | 179 | arch_suspend_enable_irqs(); |
| @@ -205,6 +203,7 @@ static int suspend_enter(suspend_state_t state) | |||
| 205 | int suspend_devices_and_enter(suspend_state_t state) | 203 | int suspend_devices_and_enter(suspend_state_t state) |
| 206 | { | 204 | { |
| 207 | int error; | 205 | int error; |
| 206 | bool wakeup = false; | ||
| 208 | 207 | ||
| 209 | if (!suspend_ops) | 208 | if (!suspend_ops) |
| 210 | return -ENOSYS; | 209 | return -ENOSYS; |
| @@ -216,7 +215,6 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 216 | goto Close; | 215 | goto Close; |
| 217 | } | 216 | } |
| 218 | suspend_console(); | 217 | suspend_console(); |
| 219 | pm_restrict_gfp_mask(); | ||
| 220 | suspend_test_start(); | 218 | suspend_test_start(); |
| 221 | error = dpm_suspend_start(PMSG_SUSPEND); | 219 | error = dpm_suspend_start(PMSG_SUSPEND); |
| 222 | if (error) { | 220 | if (error) { |
| @@ -227,13 +225,15 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 227 | if (suspend_test(TEST_DEVICES)) | 225 | if (suspend_test(TEST_DEVICES)) |
| 228 | goto Recover_platform; | 226 | goto Recover_platform; |
| 229 | 227 | ||
| 230 | suspend_enter(state); | 228 | do { |
| 229 | error = suspend_enter(state, &wakeup); | ||
| 230 | } while (!error && !wakeup | ||
| 231 | && suspend_ops->suspend_again && suspend_ops->suspend_again()); | ||
| 231 | 232 | ||
| 232 | Resume_devices: | 233 | Resume_devices: |
| 233 | suspend_test_start(); | 234 | suspend_test_start(); |
| 234 | dpm_resume_end(PMSG_RESUME); | 235 | dpm_resume_end(PMSG_RESUME); |
| 235 | suspend_test_finish("resume devices"); | 236 | suspend_test_finish("resume devices"); |
| 236 | pm_restore_gfp_mask(); | ||
| 237 | resume_console(); | 237 | resume_console(); |
| 238 | Close: | 238 | Close: |
| 239 | if (suspend_ops->end) | 239 | if (suspend_ops->end) |
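The retry loop above is the interesting part of this hunk. A standalone sketch of its termination conditions, with stub callbacks faking a platform whose suspend_again() keeps requesting cycles until a wakeup event arrives (all names and the round counter are invented for the example):

```c
#include <stdbool.h>
#include <stdio.h>

static int rounds;

static int suspend_enter_sketch(bool *wakeup)
{
	*wakeup = (++rounds >= 3);	/* pretend a wakeup arrives on round 3 */
	printf("suspend round %d, wakeup=%d\n", rounds, *wakeup);
	return 0;
}

static bool suspend_again(void)
{
	return true;			/* e.g. a charger driver wanting to poll */
}

int main(void)
{
	bool wakeup = false;
	int error;

	do {
		error = suspend_enter_sketch(&wakeup);
	} while (!error && !wakeup && suspend_again());

	return error;
}
```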
| @@ -294,7 +294,9 @@ int enter_state(suspend_state_t state) | |||
| 294 | goto Finish; | 294 | goto Finish; |
| 295 | 295 | ||
| 296 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 296 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
| 297 | pm_restrict_gfp_mask(); | ||
| 297 | error = suspend_devices_and_enter(state); | 298 | error = suspend_devices_and_enter(state); |
| 299 | pm_restore_gfp_mask(); | ||
| 298 | 300 | ||
| 299 | Finish: | 301 | Finish: |
| 300 | pr_debug("PM: Finishing wakeup.\n"); | 302 | pr_debug("PM: Finishing wakeup.\n"); |
diff --git a/kernel/power/user.c b/kernel/power/user.c index c36c3b9e8a84..42ddbc6f0de6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 113 | if (error) | 113 | if (error) |
| 114 | pm_notifier_call_chain(PM_POST_RESTORE); | 114 | pm_notifier_call_chain(PM_POST_RESTORE); |
| 115 | } | 115 | } |
| 116 | if (error) | 116 | if (error) { |
| 117 | free_basic_memory_bitmaps(); | ||
| 117 | atomic_inc(&snapshot_device_available); | 118 | atomic_inc(&snapshot_device_available); |
| 119 | } | ||
| 118 | data->frozen = 0; | 120 | data->frozen = 0; |
| 119 | data->ready = 0; | 121 | data->ready = 0; |
| 120 | data->platform_support = 0; | 122 | data->platform_support = 0; |
| @@ -135,8 +137,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
| 135 | free_basic_memory_bitmaps(); | 137 | free_basic_memory_bitmaps(); |
| 136 | data = filp->private_data; | 138 | data = filp->private_data; |
| 137 | free_all_swap_pages(data->swap); | 139 | free_all_swap_pages(data->swap); |
| 138 | if (data->frozen) | 140 | if (data->frozen) { |
| 141 | pm_restore_gfp_mask(); | ||
| 139 | thaw_processes(); | 142 | thaw_processes(); |
| 143 | } | ||
| 140 | pm_notifier_call_chain(data->mode == O_RDONLY ? | 144 | pm_notifier_call_chain(data->mode == O_RDONLY ? |
| 141 | PM_POST_HIBERNATION : PM_POST_RESTORE); | 145 | PM_POST_HIBERNATION : PM_POST_RESTORE); |
| 142 | atomic_inc(&snapshot_device_available); | 146 | atomic_inc(&snapshot_device_available); |
| @@ -379,6 +383,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, | |||
| 379 | * PM_HIBERNATION_PREPARE | 383 | * PM_HIBERNATION_PREPARE |
| 380 | */ | 384 | */ |
| 381 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); | 385 | error = suspend_devices_and_enter(PM_SUSPEND_MEM); |
| 386 | data->ready = 0; | ||
| 382 | break; | 387 | break; |
| 383 | 388 | ||
| 384 | case SNAPSHOT_PLATFORM_SUPPORT: | 389 | case SNAPSHOT_PLATFORM_SUPPORT: |
diff --git a/kernel/printk.c b/kernel/printk.c index da8ca817eae3..836a2ae0ac31 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/smp.h> | 31 | #include <linux/smp.h> |
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
| 34 | #include <linux/memblock.h> | ||
| 34 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
| 35 | #include <linux/kexec.h> | 36 | #include <linux/kexec.h> |
| 36 | #include <linux/kdb.h> | 37 | #include <linux/kdb.h> |
| @@ -167,46 +168,74 @@ void log_buf_kexec_setup(void) | |||
| 167 | } | 168 | } |
| 168 | #endif | 169 | #endif |
| 169 | 170 | ||
| 171 | /* requested log_buf_len from kernel cmdline */ | ||
| 172 | static unsigned long __initdata new_log_buf_len; | ||
| 173 | |||
| 174 | /* save requested log_buf_len since it's too early to process it */ | ||
| 170 | static int __init log_buf_len_setup(char *str) | 175 | static int __init log_buf_len_setup(char *str) |
| 171 | { | 176 | { |
| 172 | unsigned size = memparse(str, &str); | 177 | unsigned size = memparse(str, &str); |
| 173 | unsigned long flags; | ||
| 174 | 178 | ||
| 175 | if (size) | 179 | if (size) |
| 176 | size = roundup_pow_of_two(size); | 180 | size = roundup_pow_of_two(size); |
| 177 | if (size > log_buf_len) { | 181 | if (size > log_buf_len) |
| 178 | unsigned start, dest_idx, offset; | 182 | new_log_buf_len = size; |
| 179 | char *new_log_buf; | ||
| 180 | 183 | ||
| 181 | new_log_buf = alloc_bootmem(size); | 184 | return 0; |
| 182 | if (!new_log_buf) { | 185 | } |
| 183 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); | 186 | early_param("log_buf_len", log_buf_len_setup); |
| 184 | goto out; | ||
| 185 | } | ||
| 186 | 187 | ||
| 187 | spin_lock_irqsave(&logbuf_lock, flags); | 188 | void __init setup_log_buf(int early) |
| 188 | log_buf_len = size; | 189 | { |
| 189 | log_buf = new_log_buf; | 190 | unsigned long flags; |
| 190 | 191 | unsigned start, dest_idx, offset; | |
| 191 | offset = start = min(con_start, log_start); | 192 | char *new_log_buf; |
| 192 | dest_idx = 0; | 193 | int free; |
| 193 | while (start != log_end) { | 194 | |
| 194 | log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; | 195 | if (!new_log_buf_len) |
| 195 | start++; | 196 | return; |
| 196 | dest_idx++; | 197 | |
| 197 | } | 198 | if (early) { |
| 198 | log_start -= offset; | 199 | unsigned long mem; |
| 199 | con_start -= offset; | ||
| 200 | log_end -= offset; | ||
| 201 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 202 | 200 | ||
| 203 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); | 201 | mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); |
| 202 | if (mem == MEMBLOCK_ERROR) | ||
| 203 | return; | ||
| 204 | new_log_buf = __va(mem); | ||
| 205 | } else { | ||
| 206 | new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); | ||
| 204 | } | 207 | } |
| 205 | out: | ||
| 206 | return 1; | ||
| 207 | } | ||
| 208 | 208 | ||
| 209 | __setup("log_buf_len=", log_buf_len_setup); | 209 | if (unlikely(!new_log_buf)) { |
| 210 | pr_err("log_buf_len: %ld bytes not available\n", | ||
| 211 | new_log_buf_len); | ||
| 212 | return; | ||
| 213 | } | ||
| 214 | |||
| 215 | spin_lock_irqsave(&logbuf_lock, flags); | ||
| 216 | log_buf_len = new_log_buf_len; | ||
| 217 | log_buf = new_log_buf; | ||
| 218 | new_log_buf_len = 0; | ||
| 219 | free = __LOG_BUF_LEN - log_end; | ||
| 220 | |||
| 221 | offset = start = min(con_start, log_start); | ||
| 222 | dest_idx = 0; | ||
| 223 | while (start != log_end) { | ||
| 224 | unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); | ||
| 225 | |||
| 226 | log_buf[dest_idx] = __log_buf[log_idx_mask]; | ||
| 227 | start++; | ||
| 228 | dest_idx++; | ||
| 229 | } | ||
| 230 | log_start -= offset; | ||
| 231 | con_start -= offset; | ||
| 232 | log_end -= offset; | ||
| 233 | spin_unlock_irqrestore(&logbuf_lock, flags); | ||
| 234 | |||
| 235 | pr_info("log_buf_len: %d\n", log_buf_len); | ||
| 236 | pr_info("early log buf free: %d(%d%%)\n", | ||
| 237 | free, (free * 100) / __LOG_BUF_LEN); | ||
| 238 | } | ||
| 210 | 239 | ||
| 211 | #ifdef CONFIG_BOOT_PRINTK_DELAY | 240 | #ifdef CONFIG_BOOT_PRINTK_DELAY |
| 212 | 241 | ||
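setup_log_buf() now defers the allocation (memblock when called early, bootmem otherwise) and then migrates the existing messages. The index-rebasing copy is the subtle bit; here is a tiny standalone sketch of it, with a 16-byte buffer standing in for __LOG_BUF_LEN and made-up wrapped indices:

```c
#include <stdio.h>

#define OLD_LEN 16	/* stands in for __LOG_BUF_LEN (a power of two) */

static char old_buf[OLD_LEN];
static unsigned log_start = 20, con_start = 18, log_end = 30; /* wrapped */

int main(void)
{
	char new_buf[64];
	unsigned offset, start, dest_idx = 0;

	/* Start from the oldest index still needed by either reader. */
	offset = start = (con_start < log_start) ? con_start : log_start;
	while (start != log_end) {
		new_buf[dest_idx] = old_buf[start & (OLD_LEN - 1)];
		start++;
		dest_idx++;
	}
	/* Rebase all three indices so they stay valid in the new buffer. */
	log_start -= offset;
	con_start -= offset;
	log_end -= offset;

	printf("copied %u bytes, log_end now %u\n", dest_idx, log_end);
	return 0;
}
```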
| @@ -289,8 +318,10 @@ static int check_syslog_permissions(int type, bool from_file) | |||
| 289 | return 0; | 318 | return 0; |
| 290 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ | 319 | /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ |
| 291 | if (capable(CAP_SYS_ADMIN)) { | 320 | if (capable(CAP_SYS_ADMIN)) { |
| 292 | WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " | 321 | printk_once(KERN_WARNING "%s (%d): " |
| 293 | "but no CAP_SYSLOG (deprecated).\n"); | 322 | "Attempt to access syslog with CAP_SYS_ADMIN " |
| 323 | "but no CAP_SYSLOG (deprecated).\n", | ||
| 324 | current->comm, task_pid_nr(current)); | ||
| 294 | return 0; | 325 | return 0; |
| 295 | } | 326 | } |
| 296 | return -EPERM; | 327 | return -EPERM; |
| @@ -753,7 +784,7 @@ static inline int can_use_console(unsigned int cpu) | |||
| 753 | static int console_trylock_for_printk(unsigned int cpu) | 784 | static int console_trylock_for_printk(unsigned int cpu) |
| 754 | __releases(&logbuf_lock) | 785 | __releases(&logbuf_lock) |
| 755 | { | 786 | { |
| 756 | int retval = 0; | 787 | int retval = 0, wake = 0; |
| 757 | 788 | ||
| 758 | if (console_trylock()) { | 789 | if (console_trylock()) { |
| 759 | retval = 1; | 790 | retval = 1; |
| @@ -766,12 +797,14 @@ static int console_trylock_for_printk(unsigned int cpu) | |||
| 766 | */ | 797 | */ |
| 767 | if (!can_use_console(cpu)) { | 798 | if (!can_use_console(cpu)) { |
| 768 | console_locked = 0; | 799 | console_locked = 0; |
| 769 | up(&console_sem); | 800 | wake = 1; |
| 770 | retval = 0; | 801 | retval = 0; |
| 771 | } | 802 | } |
| 772 | } | 803 | } |
| 773 | printk_cpu = UINT_MAX; | 804 | printk_cpu = UINT_MAX; |
| 774 | spin_unlock(&logbuf_lock); | 805 | spin_unlock(&logbuf_lock); |
| 806 | if (wake) | ||
| 807 | up(&console_sem); | ||
| 775 | return retval; | 808 | return retval; |
| 776 | } | 809 | } |
| 777 | static const char recursion_bug_msg [] = | 810 | static const char recursion_bug_msg [] = |
| @@ -1213,7 +1246,7 @@ void console_unlock(void) | |||
| 1213 | { | 1246 | { |
| 1214 | unsigned long flags; | 1247 | unsigned long flags; |
| 1215 | unsigned _con_start, _log_end; | 1248 | unsigned _con_start, _log_end; |
| 1216 | unsigned wake_klogd = 0; | 1249 | unsigned wake_klogd = 0, retry = 0; |
| 1217 | 1250 | ||
| 1218 | if (console_suspended) { | 1251 | if (console_suspended) { |
| 1219 | up(&console_sem); | 1252 | up(&console_sem); |
| @@ -1222,6 +1255,7 @@ void console_unlock(void) | |||
| 1222 | 1255 | ||
| 1223 | console_may_schedule = 0; | 1256 | console_may_schedule = 0; |
| 1224 | 1257 | ||
| 1258 | again: | ||
| 1225 | for ( ; ; ) { | 1259 | for ( ; ; ) { |
| 1226 | spin_lock_irqsave(&logbuf_lock, flags); | 1260 | spin_lock_irqsave(&logbuf_lock, flags); |
| 1227 | wake_klogd |= log_start - log_end; | 1261 | wake_klogd |= log_start - log_end; |
| @@ -1242,8 +1276,23 @@ void console_unlock(void) | |||
| 1242 | if (unlikely(exclusive_console)) | 1276 | if (unlikely(exclusive_console)) |
| 1243 | exclusive_console = NULL; | 1277 | exclusive_console = NULL; |
| 1244 | 1278 | ||
| 1279 | spin_unlock(&logbuf_lock); | ||
| 1280 | |||
| 1245 | up(&console_sem); | 1281 | up(&console_sem); |
| 1282 | |||
| 1283 | /* | ||
| 1284 | * Someone could have filled up the buffer again, so re-check if there's | ||
| 1285 | * something to flush. In case we cannot trylock the console_sem again, | ||
| 1286 | * there's a new owner and the console_unlock() from them will do the | ||
| 1287 | * flush, no worries. | ||
| 1288 | */ | ||
| 1289 | spin_lock(&logbuf_lock); | ||
| 1290 | if (con_start != log_end) | ||
| 1291 | retry = 1; | ||
| 1246 | spin_unlock_irqrestore(&logbuf_lock, flags); | 1292 | spin_unlock_irqrestore(&logbuf_lock, flags); |
| 1293 | if (retry && console_trylock()) | ||
| 1294 | goto again; | ||
| 1295 | |||
| 1247 | if (wake_klogd) | 1296 | if (wake_klogd) |
| 1248 | wake_up_klogd(); | 1297 | wake_up_klogd(); |
| 1249 | } | 1298 | } |
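
The printk.c hunks above grow the log buffer at boot via the log_buf_len= parameter (allocating the new buffer from memblock or bootmem and copying the early messages over) and tighten syslog access toward CAP_SYSLOG, with CAP_SYS_ADMIN still accepted under a one-time deprecation warning. As a minimal userspace sketch, not part of the patch, here is how the (possibly resized) buffer can be sized and read through syslog(2)/klogctl(); the SYSLOG_ACTION_* numbers are the documented syslog(2) command values, defined locally only for illustration, and depending on dmesg_restrict the read may require CAP_SYSLOG.

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/klog.h>

    #define SYSLOG_ACTION_READ_ALL     3   /* read everything currently in the ring buffer */
    #define SYSLOG_ACTION_SIZE_BUFFER 10   /* query the size of the kernel log buffer */

    int main(void)
    {
            int len = klogctl(SYSLOG_ACTION_SIZE_BUFFER, NULL, 0);
            char *buf;

            if (len <= 0) {
                    perror("klogctl(SIZE_BUFFER)");
                    return 1;
            }
            buf = malloc(len);
            if (!buf)
                    return 1;
            len = klogctl(SYSLOG_ACTION_READ_ALL, buf, len);
            if (len > 0)
                    fwrite(buf, 1, len, stdout);    /* dump the dmesg contents */
            free(buf);
            return 0;
    }
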
diff --git a/kernel/profile.c b/kernel/profile.c index 66f841b7fbd3..961b389fe52f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -126,11 +126,9 @@ int __ref profile_init(void) | |||
| 126 | if (prof_buffer) | 126 | if (prof_buffer) |
| 127 | return 0; | 127 | return 0; |
| 128 | 128 | ||
| 129 | prof_buffer = vmalloc(buffer_bytes); | 129 | prof_buffer = vzalloc(buffer_bytes); |
| 130 | if (prof_buffer) { | 130 | if (prof_buffer) |
| 131 | memset(prof_buffer, 0, buffer_bytes); | ||
| 132 | return 0; | 131 | return 0; |
| 133 | } | ||
| 134 | 132 | ||
| 135 | free_cpumask_var(prof_cpu_mask); | 133 | free_cpumask_var(prof_cpu_mask); |
| 136 | return -ENOMEM; | 134 | return -ENOMEM; |
| @@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void) | |||
| 305 | mutex_unlock(&profile_flip_mutex); | 303 | mutex_unlock(&profile_flip_mutex); |
| 306 | } | 304 | } |
| 307 | 305 | ||
| 308 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | 306 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 309 | { | 307 | { |
| 310 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 308 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
| 311 | int i, j, cpu; | 309 | int i, j, cpu; |
| 312 | struct profile_hit *hits; | 310 | struct profile_hit *hits; |
| 313 | 311 | ||
| 314 | if (prof_on != type || !prof_buffer) | ||
| 315 | return; | ||
| 316 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); | 312 | pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); |
| 317 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | 313 | i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; |
| 318 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; | 314 | secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; |
| @@ -419,16 +415,20 @@ out_free: | |||
| 419 | #define profile_discard_flip_buffers() do { } while (0) | 415 | #define profile_discard_flip_buffers() do { } while (0) |
| 420 | #define profile_cpu_callback NULL | 416 | #define profile_cpu_callback NULL |
| 421 | 417 | ||
| 422 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | 418 | static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 423 | { | 419 | { |
| 424 | unsigned long pc; | 420 | unsigned long pc; |
| 425 | |||
| 426 | if (prof_on != type || !prof_buffer) | ||
| 427 | return; | ||
| 428 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 421 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
| 429 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 422 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
| 430 | } | 423 | } |
| 431 | #endif /* !CONFIG_SMP */ | 424 | #endif /* !CONFIG_SMP */ |
| 425 | |||
| 426 | void profile_hits(int type, void *__pc, unsigned int nr_hits) | ||
| 427 | { | ||
| 428 | if (prof_on != type || !prof_buffer) | ||
| 429 | return; | ||
| 430 | do_profile_hits(type, __pc, nr_hits); | ||
| 431 | } | ||
| 432 | EXPORT_SYMBOL_GPL(profile_hits); | 432 | EXPORT_SYMBOL_GPL(profile_hits); |
| 433 | 433 | ||
| 434 | void profile_tick(int type) | 434 | void profile_tick(int type) |
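
The profile.c hunks fold the duplicated prof_on/prof_buffer guard into a single exported profile_hits() wrapper that dispatches to an SMP or UP do_profile_hits(). A standalone sketch of that wrapper-plus-variant pattern, with made-up names and assuming nothing beyond standard C:

    #include <stdio.h>

    static int prof_on = 1;            /* stands in for the profiler-enabled state */

    #ifdef EXAMPLE_SMP
    static void do_hits(void *pc, unsigned int nr_hits)
    {
            printf("SMP path: %u hit(s) at %p\n", nr_hits, pc);
    }
    #else
    static void do_hits(void *pc, unsigned int nr_hits)
    {
            printf("UP path: %u hit(s) at %p\n", nr_hits, pc);
    }
    #endif

    /* Single exported entry point: the precondition check lives here once. */
    static void record_hits(void *pc, unsigned int nr_hits)
    {
            if (!prof_on)
                    return;
            do_hits(pc, nr_hits);
    }

    int main(void)
    {
            record_hits((void *)main, 3);
            return 0;
    }
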
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dc7ab65f3b36..9de3ecfd20f9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -23,8 +23,15 @@ | |||
| 23 | #include <linux/uaccess.h> | 23 | #include <linux/uaccess.h> |
| 24 | #include <linux/regset.h> | 24 | #include <linux/regset.h> |
| 25 | #include <linux/hw_breakpoint.h> | 25 | #include <linux/hw_breakpoint.h> |
| 26 | #include <linux/cn_proc.h> | ||
| 26 | 27 | ||
| 27 | 28 | ||
| 29 | static int ptrace_trapping_sleep_fn(void *flags) | ||
| 30 | { | ||
| 31 | schedule(); | ||
| 32 | return 0; | ||
| 33 | } | ||
| 34 | |||
| 28 | /* | 35 | /* |
| 29 | * ptrace a task: make the debugger its new parent and | 36 | * ptrace a task: make the debugger its new parent and |
| 30 | * move it to the ptrace list. | 37 | * move it to the ptrace list. |
| @@ -38,35 +45,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) | |||
| 38 | child->parent = new_parent; | 45 | child->parent = new_parent; |
| 39 | } | 46 | } |
| 40 | 47 | ||
| 41 | /* | 48 | /** |
| 42 | * Turn a tracing stop into a normal stop now, since with no tracer there | 49 | * __ptrace_unlink - unlink ptracee and restore its execution state |
| 43 | * would be no way to wake it up with SIGCONT or SIGKILL. If there was a | 50 | * @child: ptracee to be unlinked |
| 44 | * signal sent that would resume the child, but didn't because it was in | ||
| 45 | * TASK_TRACED, resume it now. | ||
| 46 | * Requires that irqs be disabled. | ||
| 47 | */ | ||
| 48 | static void ptrace_untrace(struct task_struct *child) | ||
| 49 | { | ||
| 50 | spin_lock(&child->sighand->siglock); | ||
| 51 | if (task_is_traced(child)) { | ||
| 52 | /* | ||
| 53 | * If the group stop is completed or in progress, | ||
| 54 | * this thread was already counted as stopped. | ||
| 55 | */ | ||
| 56 | if (child->signal->flags & SIGNAL_STOP_STOPPED || | ||
| 57 | child->signal->group_stop_count) | ||
| 58 | __set_task_state(child, TASK_STOPPED); | ||
| 59 | else | ||
| 60 | signal_wake_up(child, 1); | ||
| 61 | } | ||
| 62 | spin_unlock(&child->sighand->siglock); | ||
| 63 | } | ||
| 64 | |||
| 65 | /* | ||
| 66 | * unptrace a task: move it back to its original parent and | ||
| 67 | * remove it from the ptrace list. | ||
| 68 | * | 51 | * |
| 69 | * Must be called with the tasklist lock write-held. | 52 | * Remove @child from the ptrace list, move it back to the original parent, |
| 53 | * and restore the execution state so that it conforms to the group stop | ||
| 54 | * state. | ||
| 55 | * | ||
| 56 | * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer | ||
| 57 | * exiting. For PTRACE_DETACH, unless the ptracee has been killed between | ||
| 58 | * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. | ||
| 59 | * If the ptracer is exiting, the ptracee can be in any state. | ||
| 60 | * | ||
| 61 | * After detach, the ptracee should be in a state which conforms to the | ||
| 62 | * group stop. If the group is stopped or in the process of stopping, the | ||
| 63 | * ptracee should be put into TASK_STOPPED; otherwise, it should be woken | ||
| 64 | * up from TASK_TRACED. | ||
| 65 | * | ||
| 66 | * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, | ||
| 67 | * it goes through TRACED -> RUNNING -> STOPPED transition which is similar | ||
| 68 | * to but in the opposite direction of what happens while attaching to a | ||
| 69 | * stopped task. However, in this direction, the intermediate RUNNING | ||
| 70 | * state is not hidden even from the current ptracer and if it immediately | ||
| 71 | * re-attaches and performs a WNOHANG wait(2), it may fail. | ||
| 72 | * | ||
| 73 | * CONTEXT: | ||
| 74 | * write_lock_irq(tasklist_lock) | ||
| 70 | */ | 75 | */ |
| 71 | void __ptrace_unlink(struct task_struct *child) | 76 | void __ptrace_unlink(struct task_struct *child) |
| 72 | { | 77 | { |
| @@ -76,14 +81,54 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 76 | child->parent = child->real_parent; | 81 | child->parent = child->real_parent; |
| 77 | list_del_init(&child->ptrace_entry); | 82 | list_del_init(&child->ptrace_entry); |
| 78 | 83 | ||
| 79 | if (task_is_traced(child)) | 84 | spin_lock(&child->sighand->siglock); |
| 80 | ptrace_untrace(child); | 85 | |
| 86 | /* | ||
| 87 | * Clear all pending traps and TRAPPING. TRAPPING should be | ||
| 88 | * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. | ||
| 89 | */ | ||
| 90 | task_clear_jobctl_pending(child, JOBCTL_TRAP_MASK); | ||
| 91 | task_clear_jobctl_trapping(child); | ||
| 92 | |||
| 93 | /* | ||
| 94 | * Reinstate JOBCTL_STOP_PENDING if group stop is in effect and | ||
| 95 | * @child isn't dead. | ||
| 96 | */ | ||
| 97 | if (!(child->flags & PF_EXITING) && | ||
| 98 | (child->signal->flags & SIGNAL_STOP_STOPPED || | ||
| 99 | child->signal->group_stop_count)) | ||
| 100 | child->jobctl |= JOBCTL_STOP_PENDING; | ||
| 101 | |||
| 102 | /* | ||
| 103 | * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick | ||
| 104 | * @child in the butt. Note that @resume should be used iff @child | ||
| 105 | * is in TASK_TRACED; otherwise, we might unduly disrupt | ||
| 106 | * TASK_KILLABLE sleeps. | ||
| 107 | */ | ||
| 108 | if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) | ||
| 109 | signal_wake_up(child, task_is_traced(child)); | ||
| 110 | |||
| 111 | spin_unlock(&child->sighand->siglock); | ||
| 81 | } | 112 | } |
| 82 | 113 | ||
| 83 | /* | 114 | /** |
| 84 | * Check that we have indeed attached to the thing.. | 115 | * ptrace_check_attach - check whether ptracee is ready for ptrace operation |
| 116 | * @child: ptracee to check for | ||
| 117 | * @ignore_state: don't check whether @child is currently %TASK_TRACED | ||
| 118 | * | ||
| 119 | * Check whether @child is being ptraced by %current and ready for further | ||
| 120 | * ptrace operations. If @ignore_state is %false, @child also should be in | ||
| 121 | * %TASK_TRACED state and on return the child is guaranteed to be traced | ||
| 122 | * and not executing. If @ignore_state is %true, @child can be in any | ||
| 123 | * state. | ||
| 124 | * | ||
| 125 | * CONTEXT: | ||
| 126 | * Grabs and releases tasklist_lock and @child->sighand->siglock. | ||
| 127 | * | ||
| 128 | * RETURNS: | ||
| 129 | * 0 on success, -ESRCH if %child is not ready. | ||
| 85 | */ | 130 | */ |
| 86 | int ptrace_check_attach(struct task_struct *child, int kill) | 131 | int ptrace_check_attach(struct task_struct *child, bool ignore_state) |
| 87 | { | 132 | { |
| 88 | int ret = -ESRCH; | 133 | int ret = -ESRCH; |
| 89 | 134 | ||
| @@ -96,21 +141,20 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
| 96 | */ | 141 | */ |
| 97 | read_lock(&tasklist_lock); | 142 | read_lock(&tasklist_lock); |
| 98 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { | 143 | if ((child->ptrace & PT_PTRACED) && child->parent == current) { |
| 99 | ret = 0; | ||
| 100 | /* | 144 | /* |
| 101 | * child->sighand can't be NULL, release_task() | 145 | * child->sighand can't be NULL, release_task() |
| 102 | * does ptrace_unlink() before __exit_signal(). | 146 | * does ptrace_unlink() before __exit_signal(). |
| 103 | */ | 147 | */ |
| 104 | spin_lock_irq(&child->sighand->siglock); | 148 | spin_lock_irq(&child->sighand->siglock); |
| 105 | if (task_is_stopped(child)) | 149 | WARN_ON_ONCE(task_is_stopped(child)); |
| 106 | child->state = TASK_TRACED; | 150 | if (ignore_state || (task_is_traced(child) && |
| 107 | else if (!task_is_traced(child) && !kill) | 151 | !(child->jobctl & JOBCTL_LISTENING))) |
| 108 | ret = -ESRCH; | 152 | ret = 0; |
| 109 | spin_unlock_irq(&child->sighand->siglock); | 153 | spin_unlock_irq(&child->sighand->siglock); |
| 110 | } | 154 | } |
| 111 | read_unlock(&tasklist_lock); | 155 | read_unlock(&tasklist_lock); |
| 112 | 156 | ||
| 113 | if (!ret && !kill) | 157 | if (!ret && !ignore_state) |
| 114 | ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; | 158 | ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; |
| 115 | 159 | ||
| 116 | /* All systems go.. */ | 160 | /* All systems go.. */ |
| @@ -167,10 +211,28 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
| 167 | return !err; | 211 | return !err; |
| 168 | } | 212 | } |
| 169 | 213 | ||
| 170 | static int ptrace_attach(struct task_struct *task) | 214 | static int ptrace_attach(struct task_struct *task, long request, |
| 215 | unsigned long flags) | ||
| 171 | { | 216 | { |
| 217 | bool seize = (request == PTRACE_SEIZE); | ||
| 172 | int retval; | 218 | int retval; |
| 173 | 219 | ||
| 220 | /* | ||
| 221 | * SEIZE will enable new ptrace behaviors which will be implemented | ||
| 222 | * gradually. SEIZE_DEVEL is used to prevent applications | ||
| 223 | * expecting full SEIZE behaviors trapping on kernel commits which | ||
| 224 | * are still in the process of implementing them. | ||
| 225 | * | ||
| 226 | * Only test programs for new ptrace behaviors being implemented | ||
| 227 | * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. | ||
| 228 | * | ||
| 229 | * Once SEIZE behaviors are completely implemented, this flag and | ||
| 230 | * the following test will be removed. | ||
| 231 | */ | ||
| 232 | retval = -EIO; | ||
| 233 | if (seize && !(flags & PTRACE_SEIZE_DEVEL)) | ||
| 234 | goto out; | ||
| 235 | |||
| 174 | audit_ptrace(task); | 236 | audit_ptrace(task); |
| 175 | 237 | ||
| 176 | retval = -EPERM; | 238 | retval = -EPERM; |
| @@ -202,11 +264,41 @@ static int ptrace_attach(struct task_struct *task) | |||
| 202 | goto unlock_tasklist; | 264 | goto unlock_tasklist; |
| 203 | 265 | ||
| 204 | task->ptrace = PT_PTRACED; | 266 | task->ptrace = PT_PTRACED; |
| 267 | if (seize) | ||
| 268 | task->ptrace |= PT_SEIZED; | ||
| 205 | if (task_ns_capable(task, CAP_SYS_PTRACE)) | 269 | if (task_ns_capable(task, CAP_SYS_PTRACE)) |
| 206 | task->ptrace |= PT_PTRACE_CAP; | 270 | task->ptrace |= PT_PTRACE_CAP; |
| 207 | 271 | ||
| 208 | __ptrace_link(task, current); | 272 | __ptrace_link(task, current); |
| 209 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); | 273 | |
| 274 | /* SEIZE doesn't trap tracee on attach */ | ||
| 275 | if (!seize) | ||
| 276 | send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); | ||
| 277 | |||
| 278 | spin_lock(&task->sighand->siglock); | ||
| 279 | |||
| 280 | /* | ||
| 281 | * If the task is already STOPPED, set JOBCTL_TRAP_STOP and | ||
| 282 | * TRAPPING, and kick it so that it transits to TRACED. TRAPPING | ||
| 283 | * will be cleared if the child completes the transition or any | ||
| 284 | * event which clears the group stop states happens. We'll wait | ||
| 285 | * for the transition to complete before returning from this | ||
| 286 | * function. | ||
| 287 | * | ||
| 288 | * This hides STOPPED -> RUNNING -> TRACED transition from the | ||
| 289 | * attaching thread but a different thread in the same group can | ||
| 290 | * still observe the transient RUNNING state. IOW, if another | ||
| 291 | * thread's WNOHANG wait(2) on the stopped tracee races against | ||
| 292 | * ATTACH, the wait(2) may fail due to the transient RUNNING. | ||
| 293 | * | ||
| 294 | * The following task_is_stopped() test is safe as both transitions | ||
| 295 | * in and out of STOPPED are protected by siglock. | ||
| 296 | */ | ||
| 297 | if (task_is_stopped(task) && | ||
| 298 | task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) | ||
| 299 | signal_wake_up(task, 1); | ||
| 300 | |||
| 301 | spin_unlock(&task->sighand->siglock); | ||
| 210 | 302 | ||
| 211 | retval = 0; | 303 | retval = 0; |
| 212 | unlock_tasklist: | 304 | unlock_tasklist: |
| @@ -214,6 +306,12 @@ unlock_tasklist: | |||
| 214 | unlock_creds: | 306 | unlock_creds: |
| 215 | mutex_unlock(&task->signal->cred_guard_mutex); | 307 | mutex_unlock(&task->signal->cred_guard_mutex); |
| 216 | out: | 308 | out: |
| 309 | if (!retval) { | ||
| 310 | wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, | ||
| 311 | ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); | ||
| 312 | proc_ptrace_connector(task, PTRACE_ATTACH); | ||
| 313 | } | ||
| 314 | |||
| 217 | return retval; | 315 | return retval; |
| 218 | } | 316 | } |
| 219 | 317 | ||
| @@ -276,25 +374,27 @@ static int ignoring_children(struct sighand_struct *sigh) | |||
| 276 | */ | 374 | */ |
| 277 | static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) | 375 | static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) |
| 278 | { | 376 | { |
| 377 | bool dead; | ||
| 378 | |||
| 279 | __ptrace_unlink(p); | 379 | __ptrace_unlink(p); |
| 280 | 380 | ||
| 281 | if (p->exit_state == EXIT_ZOMBIE) { | 381 | if (p->exit_state != EXIT_ZOMBIE) |
| 282 | if (!task_detached(p) && thread_group_empty(p)) { | 382 | return false; |
| 283 | if (!same_thread_group(p->real_parent, tracer)) | 383 | |
| 284 | do_notify_parent(p, p->exit_signal); | 384 | dead = !thread_group_leader(p); |
| 285 | else if (ignoring_children(tracer->sighand)) { | 385 | |
| 286 | __wake_up_parent(p, tracer); | 386 | if (!dead && thread_group_empty(p)) { |
| 287 | p->exit_signal = -1; | 387 | if (!same_thread_group(p->real_parent, tracer)) |
| 288 | } | 388 | dead = do_notify_parent(p, p->exit_signal); |
| 289 | } | 389 | else if (ignoring_children(tracer->sighand)) { |
| 290 | if (task_detached(p)) { | 390 | __wake_up_parent(p, tracer); |
| 291 | /* Mark it as in the process of being reaped. */ | 391 | dead = true; |
| 292 | p->exit_state = EXIT_DEAD; | ||
| 293 | return true; | ||
| 294 | } | 392 | } |
| 295 | } | 393 | } |
| 296 | 394 | /* Mark it as in the process of being reaped. */ | |
| 297 | return false; | 395 | if (dead) |
| 396 | p->exit_state = EXIT_DEAD; | ||
| 397 | return dead; | ||
| 298 | } | 398 | } |
| 299 | 399 | ||
| 300 | static int ptrace_detach(struct task_struct *child, unsigned int data) | 400 | static int ptrace_detach(struct task_struct *child, unsigned int data) |
| @@ -316,11 +416,10 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) | |||
| 316 | if (child->ptrace) { | 416 | if (child->ptrace) { |
| 317 | child->exit_code = data; | 417 | child->exit_code = data; |
| 318 | dead = __ptrace_detach(current, child); | 418 | dead = __ptrace_detach(current, child); |
| 319 | if (!child->exit_state) | ||
| 320 | wake_up_state(child, TASK_TRACED | TASK_STOPPED); | ||
| 321 | } | 419 | } |
| 322 | write_unlock_irq(&tasklist_lock); | 420 | write_unlock_irq(&tasklist_lock); |
| 323 | 421 | ||
| 422 | proc_ptrace_connector(child, PTRACE_DETACH); | ||
| 324 | if (unlikely(dead)) | 423 | if (unlikely(dead)) |
| 325 | release_task(child); | 424 | release_task(child); |
| 326 | 425 | ||
| @@ -518,7 +617,7 @@ static int ptrace_resume(struct task_struct *child, long request, | |||
| 518 | } | 617 | } |
| 519 | 618 | ||
| 520 | child->exit_code = data; | 619 | child->exit_code = data; |
| 521 | wake_up_process(child); | 620 | wake_up_state(child, __TASK_TRACED); |
| 522 | 621 | ||
| 523 | return 0; | 622 | return 0; |
| 524 | } | 623 | } |
| @@ -567,10 +666,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, | |||
| 567 | int ptrace_request(struct task_struct *child, long request, | 666 | int ptrace_request(struct task_struct *child, long request, |
| 568 | unsigned long addr, unsigned long data) | 667 | unsigned long addr, unsigned long data) |
| 569 | { | 668 | { |
| 669 | bool seized = child->ptrace & PT_SEIZED; | ||
| 570 | int ret = -EIO; | 670 | int ret = -EIO; |
| 571 | siginfo_t siginfo; | 671 | siginfo_t siginfo, *si; |
| 572 | void __user *datavp = (void __user *) data; | 672 | void __user *datavp = (void __user *) data; |
| 573 | unsigned long __user *datalp = datavp; | 673 | unsigned long __user *datalp = datavp; |
| 674 | unsigned long flags; | ||
| 574 | 675 | ||
| 575 | switch (request) { | 676 | switch (request) { |
| 576 | case PTRACE_PEEKTEXT: | 677 | case PTRACE_PEEKTEXT: |
| @@ -603,6 +704,62 @@ int ptrace_request(struct task_struct *child, long request, | |||
| 603 | ret = ptrace_setsiginfo(child, &siginfo); | 704 | ret = ptrace_setsiginfo(child, &siginfo); |
| 604 | break; | 705 | break; |
| 605 | 706 | ||
| 707 | case PTRACE_INTERRUPT: | ||
| 708 | /* | ||
| 709 | * Stop tracee without any side-effect on signal or job | ||
| 710 | * control. At least one trap is guaranteed to happen | ||
| 711 | * after this request. If @child is already trapped, the | ||
| 712 | * current trap is not disturbed and another trap will | ||
| 713 | * happen after the current trap is ended with PTRACE_CONT. | ||
| 714 | * | ||
| 715 | * The actual trap might not be PTRACE_EVENT_STOP trap but | ||
| 716 | * the pending condition is cleared regardless. | ||
| 717 | */ | ||
| 718 | if (unlikely(!seized || !lock_task_sighand(child, &flags))) | ||
| 719 | break; | ||
| 720 | |||
| 721 | /* | ||
| 722 | * INTERRUPT doesn't disturb existing trap sans one | ||
| 723 | * exception. If ptracer issued LISTEN for the current | ||
| 724 | * STOP, this INTERRUPT should clear LISTEN and re-trap | ||
| 725 | * tracee into STOP. | ||
| 726 | */ | ||
| 727 | if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) | ||
| 728 | signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); | ||
| 729 | |||
| 730 | unlock_task_sighand(child, &flags); | ||
| 731 | ret = 0; | ||
| 732 | break; | ||
| 733 | |||
| 734 | case PTRACE_LISTEN: | ||
| 735 | /* | ||
| 736 | * Listen for events. Tracee must be in STOP. It's not | ||
| 737 | * resumed per-se but is not considered to be in TRACED by | ||
| 738 | * wait(2) or ptrace(2). If an async event (e.g. group | ||
| 739 | * stop state change) happens, tracee will enter STOP trap | ||
| 740 | * again. Alternatively, ptracer can issue INTERRUPT to | ||
| 741 | * finish listening and re-trap tracee into STOP. | ||
| 742 | */ | ||
| 743 | if (unlikely(!seized || !lock_task_sighand(child, &flags))) | ||
| 744 | break; | ||
| 745 | |||
| 746 | si = child->last_siginfo; | ||
| 747 | if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) | ||
| 748 | break; | ||
| 749 | |||
| 750 | child->jobctl |= JOBCTL_LISTENING; | ||
| 751 | |||
| 752 | /* | ||
| 753 | * If NOTIFY is set, it means event happened between start | ||
| 754 | * of this trap and now. Trigger re-trap immediately. | ||
| 755 | */ | ||
| 756 | if (child->jobctl & JOBCTL_TRAP_NOTIFY) | ||
| 757 | signal_wake_up(child, true); | ||
| 758 | |||
| 759 | unlock_task_sighand(child, &flags); | ||
| 760 | ret = 0; | ||
| 761 | break; | ||
| 762 | |||
| 606 | case PTRACE_DETACH: /* detach a process that was attached. */ | 763 | case PTRACE_DETACH: /* detach a process that was attached. */ |
| 607 | ret = ptrace_detach(child, data); | 764 | ret = ptrace_detach(child, data); |
| 608 | break; | 765 | break; |
| @@ -717,8 +874,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, | |||
| 717 | goto out; | 874 | goto out; |
| 718 | } | 875 | } |
| 719 | 876 | ||
| 720 | if (request == PTRACE_ATTACH) { | 877 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { |
| 721 | ret = ptrace_attach(child); | 878 | ret = ptrace_attach(child, request, data); |
| 722 | /* | 879 | /* |
| 723 | * Some architectures need to do book-keeping after | 880 | * Some architectures need to do book-keeping after |
| 724 | * a ptrace attach. | 881 | * a ptrace attach. |
| @@ -728,7 +885,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, | |||
| 728 | goto out_put_task_struct; | 885 | goto out_put_task_struct; |
| 729 | } | 886 | } |
| 730 | 887 | ||
| 731 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | 888 | ret = ptrace_check_attach(child, request == PTRACE_KILL || |
| 889 | request == PTRACE_INTERRUPT); | ||
| 732 | if (ret < 0) | 890 | if (ret < 0) |
| 733 | goto out_put_task_struct; | 891 | goto out_put_task_struct; |
| 734 | 892 | ||
| @@ -859,8 +1017,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
| 859 | goto out; | 1017 | goto out; |
| 860 | } | 1018 | } |
| 861 | 1019 | ||
| 862 | if (request == PTRACE_ATTACH) { | 1020 | if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { |
| 863 | ret = ptrace_attach(child); | 1021 | ret = ptrace_attach(child, request, data); |
| 864 | /* | 1022 | /* |
| 865 | * Some architectures need to do book-keeping after | 1023 | * Some architectures need to do book-keeping after |
| 866 | * a ptrace attach. | 1024 | * a ptrace attach. |
| @@ -870,7 +1028,8 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | |||
| 870 | goto out_put_task_struct; | 1028 | goto out_put_task_struct; |
| 871 | } | 1029 | } |
| 872 | 1030 | ||
| 873 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | 1031 | ret = ptrace_check_attach(child, request == PTRACE_KILL || |
| 1032 | request == PTRACE_INTERRUPT); | ||
| 874 | if (!ret) | 1033 | if (!ret) |
| 875 | ret = compat_arch_ptrace(child, request, addr, data); | 1034 | ret = compat_arch_ptrace(child, request, addr, data); |
| 876 | 1035 | ||
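
The ptrace.c changes introduce PTRACE_SEIZE, PTRACE_INTERRUPT and PTRACE_LISTEN and route them through ptrace_attach()/ptrace_check_attach(). A hedged userspace sketch of the tracer side follows; the request numbers are assumptions matching this series and should really come from the updated <linux/ptrace.h>, and PTRACE_SEIZE_DEVEL is required here exactly because the ptrace_attach() hunk above rejects SEIZE without it.

    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    /* Values as introduced by this series; assumptions for illustration only. */
    #ifndef PTRACE_SEIZE
    #define PTRACE_SEIZE        0x4206
    #define PTRACE_INTERRUPT    0x4207
    #define PTRACE_SEIZE_DEVEL  0x80000000UL
    #endif

    int seize_and_interrupt(pid_t pid)
    {
            /* SEIZE attaches without trapping the tracee (no forced SIGSTOP). */
            if (ptrace(PTRACE_SEIZE, pid, NULL, (void *)PTRACE_SEIZE_DEVEL) < 0) {
                    perror("PTRACE_SEIZE");
                    return -1;
            }

            /* INTERRUPT schedules a trap without touching signals or job control. */
            if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL) < 0) {
                    perror("PTRACE_INTERRUPT");
                    return -1;
            }

            /* The resulting stop is reported like any other ptrace stop. */
            if (waitpid(pid, NULL, 0) < 0) {
                    perror("waitpid");
                    return -1;
            }
            return 0;
    }
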
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f3240e987928..ddddb320be61 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -37,7 +37,7 @@ | |||
| 37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
| 38 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
| 39 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
| 40 | #include <asm/atomic.h> | 40 | #include <linux/atomic.h> |
| 41 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
| 42 | #include <linux/percpu.h> | 42 | #include <linux/percpu.h> |
| 43 | #include <linux/notifier.h> | 43 | #include <linux/notifier.h> |
| @@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) | |||
| 142 | * Ensure that queued callbacks are all executed. | 142 | * Ensure that queued callbacks are all executed. |
| 143 | * If we detect that we are nested in a RCU read-side critical | 143 | * If we detect that we are nested in a RCU read-side critical |
| 144 | * section, we should simply fail, otherwise we would deadlock. | 144 | * section, we should simply fail, otherwise we would deadlock. |
| 145 | * In !PREEMPT configurations, there is no way to tell if we are | ||
| 146 | * in a RCU read-side critical section or not, so we never | ||
| 147 | * attempt any fixup and just print a warning. | ||
| 145 | */ | 148 | */ |
| 149 | #ifndef CONFIG_PREEMPT | ||
| 150 | WARN_ON_ONCE(1); | ||
| 151 | return 0; | ||
| 152 | #endif | ||
| 146 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 153 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
| 147 | irqs_disabled()) { | 154 | irqs_disabled()) { |
| 148 | WARN_ON(1); | 155 | WARN_ON_ONCE(1); |
| 149 | return 0; | 156 | return 0; |
| 150 | } | 157 | } |
| 151 | rcu_barrier(); | 158 | rcu_barrier(); |
| @@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | |||
| 184 | * Ensure that queued callbacks are all executed. | 191 | * Ensure that queued callbacks are all executed. |
| 185 | * If we detect that we are nested in a RCU read-side critical | 192 | * If we detect that we are nested in a RCU read-side critical |
| 186 | * section, we should simply fail, otherwise we would deadlock. | 193 | * section, we should simply fail, otherwise we would deadlock. |
| 194 | * In !PREEMPT configurations, there is no way to tell if we are | ||
| 195 | * in a RCU read-side critical section or not, so we never | ||
| 196 | * attempt any fixup and just print a warning. | ||
| 187 | */ | 197 | */ |
| 198 | #ifndef CONFIG_PREEMPT | ||
| 199 | WARN_ON_ONCE(1); | ||
| 200 | return 0; | ||
| 201 | #endif | ||
| 188 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 202 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
| 189 | irqs_disabled()) { | 203 | irqs_disabled()) { |
| 190 | WARN_ON(1); | 204 | WARN_ON_ONCE(1); |
| 191 | return 0; | 205 | return 0; |
| 192 | } | 206 | } |
| 193 | rcu_barrier(); | 207 | rcu_barrier(); |
| @@ -214,15 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) | |||
| 214 | * Ensure that queued callbacks are all executed. | 228 | * Ensure that queued callbacks are all executed. |
| 215 | * If we detect that we are nested in a RCU read-side critical | 229 | * If we detect that we are nested in a RCU read-side critical |
| 216 | * section, we should simply fail, otherwise we would deadlock. | 230 | * section, we should simply fail, otherwise we would deadlock. |
| 217 | * Note that the machinery to reliably determine whether | 231 | * In !PREEMPT configurations, there is no way to tell if we are |
| 218 | * or not we are in an RCU read-side critical section | 232 | * in a RCU read-side critical section or not, so we never |
| 219 | * exists only in the preemptible RCU implementations | 233 | * attempt any fixup and just print a warning. |
| 220 | * (TINY_PREEMPT_RCU and TREE_PREEMPT_RCU), which is why | ||
| 221 | * DEBUG_OBJECTS_RCU_HEAD is disallowed if !PREEMPT. | ||
| 222 | */ | 234 | */ |
| 235 | #ifndef CONFIG_PREEMPT | ||
| 236 | WARN_ON_ONCE(1); | ||
| 237 | return 0; | ||
| 238 | #endif | ||
| 223 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || | 239 | if (rcu_preempt_depth() != 0 || preempt_count() != 0 || |
| 224 | irqs_disabled()) { | 240 | irqs_disabled()) { |
| 225 | WARN_ON(1); | 241 | WARN_ON_ONCE(1); |
| 226 | return 0; | 242 | return 0; |
| 227 | } | 243 | } |
| 228 | rcu_barrier(); | 244 | rcu_barrier(); |
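
The rcupdate.c hunks tighten the debug-objects fixup paths for rcu_head, which matters mostly to callers that place an rcu_head on the stack. A kernel-style sketch, not from this patch and using the standard helpers, of how such a caller pairs init_rcu_head_on_stack()/destroy_rcu_head_on_stack() around call_rcu() -- the same pairing the rcutorture.c hunks below add around rbi.rcu:

    #include <linux/rcupdate.h>
    #include <linux/completion.h>
    #include <linux/kernel.h>

    struct stack_waiter {
            struct rcu_head rcu;
            struct completion done;
    };

    static void stack_waiter_cb(struct rcu_head *head)
    {
            struct stack_waiter *w = container_of(head, struct stack_waiter, rcu);

            complete(&w->done);
    }

    static void wait_one_grace_period(void)
    {
            struct stack_waiter w;

            init_completion(&w.done);
            init_rcu_head_on_stack(&w.rcu);    /* keep DEBUG_OBJECTS_RCU_HEAD happy */
            call_rcu(&w.rcu, stack_waiter_cb);
            wait_for_completion(&w.done);
            destroy_rcu_head_on_stack(&w.rcu); /* the head leaves scope with this frame */
    }
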
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 0c343b9a46d5..7bbac7d0f5ab 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -35,15 +35,16 @@ | |||
| 35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
| 36 | #include <linux/time.h> | 36 | #include <linux/time.h> |
| 37 | #include <linux/cpu.h> | 37 | #include <linux/cpu.h> |
| 38 | #include <linux/prefetch.h> | ||
| 38 | 39 | ||
| 39 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | 40 | /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ |
| 40 | static struct task_struct *rcu_kthread_task; | 41 | static struct task_struct *rcu_kthread_task; |
| 41 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | 42 | static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); |
| 42 | static unsigned long have_rcu_kthread_work; | 43 | static unsigned long have_rcu_kthread_work; |
| 43 | static void invoke_rcu_kthread(void); | ||
| 44 | 44 | ||
| 45 | /* Forward declarations for rcutiny_plugin.h. */ | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
| 46 | struct rcu_ctrlblk; | 46 | struct rcu_ctrlblk; |
| 47 | static void invoke_rcu_kthread(void); | ||
| 47 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | 48 | static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); |
| 48 | static int rcu_kthread(void *arg); | 49 | static int rcu_kthread(void *arg); |
| 49 | static void __call_rcu(struct rcu_head *head, | 50 | static void __call_rcu(struct rcu_head *head, |
| @@ -79,36 +80,45 @@ void rcu_exit_nohz(void) | |||
| 79 | #endif /* #ifdef CONFIG_NO_HZ */ | 80 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 80 | 81 | ||
| 81 | /* | 82 | /* |
| 82 | * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc(). | 83 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). |
| 83 | * Also disable irqs to avoid confusion due to interrupt handlers | 84 | * Also irqs are disabled to avoid confusion due to interrupt handlers |
| 84 | * invoking call_rcu(). | 85 | * invoking call_rcu(). |
| 85 | */ | 86 | */ |
| 86 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | 87 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) |
| 87 | { | 88 | { |
| 88 | unsigned long flags; | ||
| 89 | |||
| 90 | local_irq_save(flags); | ||
| 91 | if (rcp->rcucblist != NULL && | 89 | if (rcp->rcucblist != NULL && |
| 92 | rcp->donetail != rcp->curtail) { | 90 | rcp->donetail != rcp->curtail) { |
| 93 | rcp->donetail = rcp->curtail; | 91 | rcp->donetail = rcp->curtail; |
| 94 | local_irq_restore(flags); | ||
| 95 | return 1; | 92 | return 1; |
| 96 | } | 93 | } |
| 97 | local_irq_restore(flags); | ||
| 98 | 94 | ||
| 99 | return 0; | 95 | return 0; |
| 100 | } | 96 | } |
| 101 | 97 | ||
| 102 | /* | 98 | /* |
| 99 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
| 100 | * or to boost readers. | ||
| 101 | */ | ||
| 102 | static void invoke_rcu_kthread(void) | ||
| 103 | { | ||
| 104 | have_rcu_kthread_work = 1; | ||
| 105 | wake_up(&rcu_kthread_wq); | ||
| 106 | } | ||
| 107 | |||
| 108 | /* | ||
| 103 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | 109 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we |
| 104 | * are at it, given that any rcu quiescent state is also an rcu_bh | 110 | * are at it, given that any rcu quiescent state is also an rcu_bh |
| 105 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | 111 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. |
| 106 | */ | 112 | */ |
| 107 | void rcu_sched_qs(int cpu) | 113 | void rcu_sched_qs(int cpu) |
| 108 | { | 114 | { |
| 115 | unsigned long flags; | ||
| 116 | |||
| 117 | local_irq_save(flags); | ||
| 109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | 118 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
| 110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | 119 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
| 111 | invoke_rcu_kthread(); | 120 | invoke_rcu_kthread(); |
| 121 | local_irq_restore(flags); | ||
| 112 | } | 122 | } |
| 113 | 123 | ||
| 114 | /* | 124 | /* |
| @@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu) | |||
| 116 | */ | 126 | */ |
| 117 | void rcu_bh_qs(int cpu) | 127 | void rcu_bh_qs(int cpu) |
| 118 | { | 128 | { |
| 129 | unsigned long flags; | ||
| 130 | |||
| 131 | local_irq_save(flags); | ||
| 119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | 132 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
| 120 | invoke_rcu_kthread(); | 133 | invoke_rcu_kthread(); |
| 134 | local_irq_restore(flags); | ||
| 121 | } | 135 | } |
| 122 | 136 | ||
| 123 | /* | 137 | /* |
| @@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |||
| 167 | prefetch(next); | 181 | prefetch(next); |
| 168 | debug_rcu_head_unqueue(list); | 182 | debug_rcu_head_unqueue(list); |
| 169 | local_bh_disable(); | 183 | local_bh_disable(); |
| 170 | list->func(list); | 184 | __rcu_reclaim(list); |
| 171 | local_bh_enable(); | 185 | local_bh_enable(); |
| 172 | list = next; | 186 | list = next; |
| 173 | RCU_TRACE(cb_count++); | 187 | RCU_TRACE(cb_count++); |
| @@ -208,20 +222,6 @@ static int rcu_kthread(void *arg) | |||
| 208 | } | 222 | } |
| 209 | 223 | ||
| 210 | /* | 224 | /* |
| 211 | * Wake up rcu_kthread() to process callbacks now eligible for invocation | ||
| 212 | * or to boost readers. | ||
| 213 | */ | ||
| 214 | static void invoke_rcu_kthread(void) | ||
| 215 | { | ||
| 216 | unsigned long flags; | ||
| 217 | |||
| 218 | local_irq_save(flags); | ||
| 219 | have_rcu_kthread_work = 1; | ||
| 220 | wake_up(&rcu_kthread_wq); | ||
| 221 | local_irq_restore(flags); | ||
| 222 | } | ||
| 223 | |||
| 224 | /* | ||
| 225 | * Wait for a grace period to elapse. But it is illegal to invoke | 225 | * Wait for a grace period to elapse. But it is illegal to invoke |
| 226 | * synchronize_sched() from within an RCU read-side critical section. | 226 | * synchronize_sched() from within an RCU read-side critical section. |
| 227 | * Therefore, any legal call to synchronize_sched() is a quiescent | 227 | * Therefore, any legal call to synchronize_sched() is a quiescent |
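
invoke_rcu_kthread() above is the usual flag-plus-waitqueue handshake with rcu_kthread(); the patch drops its private irq save/restore now that its callers already run with irqs disabled. A generic kernel-style sketch of that handshake, with made-up names:

    #include <linux/kthread.h>
    #include <linux/wait.h>
    #include <linux/sched.h>

    static DECLARE_WAIT_QUEUE_HEAD(example_wq);
    static unsigned long example_have_work;

    /* Producer side: set the flag, then wake the kthread (cheap if it is awake). */
    static void example_kick_worker(void)
    {
            example_have_work = 1;
            wake_up(&example_wq);
    }

    /* Consumer side: sleep until there is work, clear the flag, process it. */
    static int example_worker(void *unused)
    {
            while (!kthread_should_stop()) {
                    wait_event_interruptible(example_wq,
                                             example_have_work || kthread_should_stop());
                    example_have_work = 0;
                    /* ... drain whatever became ready ... */
            }
            return 0;
    }
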
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 3cb8e362e883..f259c676195f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk { | |||
| 100 | u8 completed; /* Last grace period completed. */ | 100 | u8 completed; /* Last grace period completed. */ |
| 101 | /* If all three are equal, RCU is idle. */ | 101 | /* If all three are equal, RCU is idle. */ |
| 102 | #ifdef CONFIG_RCU_BOOST | 102 | #ifdef CONFIG_RCU_BOOST |
| 103 | s8 boosted_this_gp; /* Has boosting already happened? */ | ||
| 104 | unsigned long boost_time; /* When to start boosting (jiffies) */ | 103 | unsigned long boost_time; /* When to start boosting (jiffies) */ |
| 105 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 104 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 106 | #ifdef CONFIG_RCU_TRACE | 105 | #ifdef CONFIG_RCU_TRACE |
| 107 | unsigned long n_grace_periods; | 106 | unsigned long n_grace_periods; |
| 108 | #ifdef CONFIG_RCU_BOOST | 107 | #ifdef CONFIG_RCU_BOOST |
| 109 | unsigned long n_tasks_boosted; | 108 | unsigned long n_tasks_boosted; |
| 109 | /* Total number of tasks boosted. */ | ||
| 110 | unsigned long n_exp_boosts; | 110 | unsigned long n_exp_boosts; |
| 111 | /* Number of tasks boosted for expedited GP. */ | ||
| 111 | unsigned long n_normal_boosts; | 112 | unsigned long n_normal_boosts; |
| 112 | unsigned long n_normal_balk_blkd_tasks; | 113 | /* Number of tasks boosted for normal GP. */ |
| 113 | unsigned long n_normal_balk_gp_tasks; | 114 | unsigned long n_balk_blkd_tasks; |
| 114 | unsigned long n_normal_balk_boost_tasks; | 115 | /* Refused to boost: no blocked tasks. */ |
| 115 | unsigned long n_normal_balk_boosted; | 116 | unsigned long n_balk_exp_gp_tasks; |
| 116 | unsigned long n_normal_balk_notyet; | 117 | /* Refused to boost: nothing blocking GP. */ |
| 117 | unsigned long n_normal_balk_nos; | 118 | unsigned long n_balk_boost_tasks; |
| 118 | unsigned long n_exp_balk_blkd_tasks; | 119 | /* Refused to boost: already boosting. */ |
| 119 | unsigned long n_exp_balk_nos; | 120 | unsigned long n_balk_notyet; |
| 121 | /* Refused to boost: not yet time. */ | ||
| 122 | unsigned long n_balk_nos; | ||
| 123 | /* Refused to boost: not sure why, though. */ | ||
| 124 | /* This can happen due to race conditions. */ | ||
| 120 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 125 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 121 | #endif /* #ifdef CONFIG_RCU_TRACE */ | 126 | #endif /* #ifdef CONFIG_RCU_TRACE */ |
| 122 | }; | 127 | }; |
| @@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t) | |||
| 201 | 206 | ||
| 202 | #ifdef CONFIG_RCU_BOOST | 207 | #ifdef CONFIG_RCU_BOOST |
| 203 | static void rcu_initiate_boost_trace(void); | 208 | static void rcu_initiate_boost_trace(void); |
| 204 | static void rcu_initiate_exp_boost_trace(void); | ||
| 205 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 209 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 206 | 210 | ||
| 207 | /* | 211 | /* |
| @@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m) | |||
| 219 | "N."[!rcu_preempt_ctrlblk.gp_tasks], | 223 | "N."[!rcu_preempt_ctrlblk.gp_tasks], |
| 220 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); | 224 | "E."[!rcu_preempt_ctrlblk.exp_tasks]); |
| 221 | #ifdef CONFIG_RCU_BOOST | 225 | #ifdef CONFIG_RCU_BOOST |
| 222 | seq_printf(m, " ttb=%c btg=", | 226 | seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", |
| 223 | "B."[!rcu_preempt_ctrlblk.boost_tasks]); | 227 | " ", |
| 224 | switch (rcu_preempt_ctrlblk.boosted_this_gp) { | 228 | "B."[!rcu_preempt_ctrlblk.boost_tasks], |
| 225 | case -1: | ||
| 226 | seq_puts(m, "exp"); | ||
| 227 | break; | ||
| 228 | case 0: | ||
| 229 | seq_puts(m, "no"); | ||
| 230 | break; | ||
| 231 | case 1: | ||
| 232 | seq_puts(m, "begun"); | ||
| 233 | break; | ||
| 234 | case 2: | ||
| 235 | seq_puts(m, "done"); | ||
| 236 | break; | ||
| 237 | default: | ||
| 238 | seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | ||
| 239 | } | ||
| 240 | seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | ||
| 241 | rcu_preempt_ctrlblk.n_tasks_boosted, | 229 | rcu_preempt_ctrlblk.n_tasks_boosted, |
| 242 | rcu_preempt_ctrlblk.n_exp_boosts, | 230 | rcu_preempt_ctrlblk.n_exp_boosts, |
| 243 | rcu_preempt_ctrlblk.n_normal_boosts, | 231 | rcu_preempt_ctrlblk.n_normal_boosts, |
| 244 | (int)(jiffies & 0xffff), | 232 | (int)(jiffies & 0xffff), |
| 245 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | 233 | (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); |
| 246 | seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | 234 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", |
| 247 | "normal balk", | 235 | " balk", |
| 248 | rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | 236 | rcu_preempt_ctrlblk.n_balk_blkd_tasks, |
| 249 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | 237 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, |
| 250 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | 238 | rcu_preempt_ctrlblk.n_balk_boost_tasks, |
| 251 | rcu_preempt_ctrlblk.n_normal_balk_boosted, | 239 | rcu_preempt_ctrlblk.n_balk_notyet, |
| 252 | rcu_preempt_ctrlblk.n_normal_balk_notyet, | 240 | rcu_preempt_ctrlblk.n_balk_nos); |
| 253 | rcu_preempt_ctrlblk.n_normal_balk_nos); | ||
| 254 | seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | ||
| 255 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | ||
| 256 | rcu_preempt_ctrlblk.n_exp_balk_nos); | ||
| 257 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 241 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 258 | } | 242 | } |
| 259 | 243 | ||
| @@ -271,25 +255,59 @@ static int rcu_boost(void) | |||
| 271 | { | 255 | { |
| 272 | unsigned long flags; | 256 | unsigned long flags; |
| 273 | struct rt_mutex mtx; | 257 | struct rt_mutex mtx; |
| 274 | struct list_head *np; | ||
| 275 | struct task_struct *t; | 258 | struct task_struct *t; |
| 259 | struct list_head *tb; | ||
| 276 | 260 | ||
| 277 | if (rcu_preempt_ctrlblk.boost_tasks == NULL) | 261 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && |
| 262 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
| 278 | return 0; /* Nothing to boost. */ | 263 | return 0; /* Nothing to boost. */ |
| 264 | |||
| 279 | raw_local_irq_save(flags); | 265 | raw_local_irq_save(flags); |
| 280 | rcu_preempt_ctrlblk.boosted_this_gp++; | 266 | |
| 281 | t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | 267 | /* |
| 282 | rcu_node_entry); | 268 | * Recheck with irqs disabled: all tasks in need of boosting |
| 283 | np = rcu_next_node_entry(t); | 269 | * might exit their RCU read-side critical sections on their own |
| 270 | * if we are preempted just before disabling irqs. | ||
| 271 | */ | ||
| 272 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | ||
| 273 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | ||
| 274 | raw_local_irq_restore(flags); | ||
| 275 | return 0; | ||
| 276 | } | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Preferentially boost tasks blocking expedited grace periods. | ||
| 280 | * This cannot starve the normal grace periods because a second | ||
| 281 | * expedited grace period must boost all blocked tasks, including | ||
| 282 | * those blocking the pre-existing normal grace period. | ||
| 283 | */ | ||
| 284 | if (rcu_preempt_ctrlblk.exp_tasks != NULL) { | ||
| 285 | tb = rcu_preempt_ctrlblk.exp_tasks; | ||
| 286 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
| 287 | } else { | ||
| 288 | tb = rcu_preempt_ctrlblk.boost_tasks; | ||
| 289 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
| 290 | } | ||
| 291 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | ||
| 292 | |||
| 293 | /* | ||
| 294 | * We boost task t by manufacturing an rt_mutex that appears to | ||
| 295 | * be held by task t. We leave a pointer to that rt_mutex where | ||
| 296 | * task t can find it, and task t will release the mutex when it | ||
| 297 | * exits its outermost RCU read-side critical section. Then | ||
| 298 | * simply acquiring this artificial rt_mutex will boost task | ||
| 299 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
| 300 | */ | ||
| 301 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
| 284 | rt_mutex_init_proxy_locked(&mtx, t); | 302 | rt_mutex_init_proxy_locked(&mtx, t); |
| 285 | t->rcu_boost_mutex = &mtx; | 303 | t->rcu_boost_mutex = &mtx; |
| 286 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | 304 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; |
| 287 | raw_local_irq_restore(flags); | 305 | raw_local_irq_restore(flags); |
| 288 | rt_mutex_lock(&mtx); | 306 | rt_mutex_lock(&mtx); |
| 289 | RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | 307 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
| 290 | rcu_preempt_ctrlblk.boosted_this_gp++; | 308 | |
| 291 | rt_mutex_unlock(&mtx); | 309 | return rcu_preempt_ctrlblk.boost_tasks != NULL || |
| 292 | return rcu_preempt_ctrlblk.boost_tasks != NULL; | 310 | rcu_preempt_ctrlblk.exp_tasks != NULL; |
| 293 | } | 311 | } |
| 294 | 312 | ||
| 295 | /* | 313 | /* |
| @@ -304,42 +322,25 @@ static int rcu_boost(void) | |||
| 304 | */ | 322 | */ |
| 305 | static int rcu_initiate_boost(void) | 323 | static int rcu_initiate_boost(void) |
| 306 | { | 324 | { |
| 307 | if (!rcu_preempt_blocked_readers_cgp()) { | 325 | if (!rcu_preempt_blocked_readers_cgp() && |
| 308 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | 326 | rcu_preempt_ctrlblk.exp_tasks == NULL) { |
| 327 | RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); | ||
| 309 | return 0; | 328 | return 0; |
| 310 | } | 329 | } |
| 311 | if (rcu_preempt_ctrlblk.gp_tasks != NULL && | 330 | if (rcu_preempt_ctrlblk.exp_tasks != NULL || |
| 312 | rcu_preempt_ctrlblk.boost_tasks == NULL && | 331 | (rcu_preempt_ctrlblk.gp_tasks != NULL && |
| 313 | rcu_preempt_ctrlblk.boosted_this_gp == 0 && | 332 | rcu_preempt_ctrlblk.boost_tasks == NULL && |
| 314 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | 333 | ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { |
| 315 | rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | 334 | if (rcu_preempt_ctrlblk.exp_tasks == NULL) |
| 335 | rcu_preempt_ctrlblk.boost_tasks = | ||
| 336 | rcu_preempt_ctrlblk.gp_tasks; | ||
| 316 | invoke_rcu_kthread(); | 337 | invoke_rcu_kthread(); |
| 317 | RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | ||
| 318 | } else | 338 | } else |
| 319 | RCU_TRACE(rcu_initiate_boost_trace()); | 339 | RCU_TRACE(rcu_initiate_boost_trace()); |
| 320 | return 1; | 340 | return 1; |
| 321 | } | 341 | } |
| 322 | 342 | ||
| 323 | /* | 343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
| 324 | * Initiate boosting for an expedited grace period. | ||
| 325 | */ | ||
| 326 | static void rcu_initiate_expedited_boost(void) | ||
| 327 | { | ||
| 328 | unsigned long flags; | ||
| 329 | |||
| 330 | raw_local_irq_save(flags); | ||
| 331 | if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | ||
| 332 | rcu_preempt_ctrlblk.boost_tasks = | ||
| 333 | rcu_preempt_ctrlblk.blkd_tasks.next; | ||
| 334 | rcu_preempt_ctrlblk.boosted_this_gp = -1; | ||
| 335 | invoke_rcu_kthread(); | ||
| 336 | RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | ||
| 337 | } else | ||
| 338 | RCU_TRACE(rcu_initiate_exp_boost_trace()); | ||
| 339 | raw_local_irq_restore(flags); | ||
| 340 | } | ||
| 341 | |||
| 342 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); | ||
| 343 | 344 | ||
| 344 | /* | 345 | /* |
| 345 | * Do priority-boost accounting for the start of a new grace period. | 346 | * Do priority-boost accounting for the start of a new grace period. |
| @@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void) | |||
| 347 | static void rcu_preempt_boost_start_gp(void) | 348 | static void rcu_preempt_boost_start_gp(void) |
| 348 | { | 349 | { |
| 349 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | 350 | rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; |
| 350 | if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | ||
| 351 | rcu_preempt_ctrlblk.boosted_this_gp = 0; | ||
| 352 | } | 351 | } |
| 353 | 352 | ||
| 354 | #else /* #ifdef CONFIG_RCU_BOOST */ | 353 | #else /* #ifdef CONFIG_RCU_BOOST */ |
| @@ -372,13 +371,6 @@ static int rcu_initiate_boost(void) | |||
| 372 | } | 371 | } |
| 373 | 372 | ||
| 374 | /* | 373 | /* |
| 375 | * If there is no RCU priority boosting, we don't initiate expedited boosting. | ||
| 376 | */ | ||
| 377 | static void rcu_initiate_expedited_boost(void) | ||
| 378 | { | ||
| 379 | } | ||
| 380 | |||
| 381 | /* | ||
| 382 | * If there is no RCU priority boosting, nothing to do at grace-period start. | 374 | * If there is no RCU priority boosting, nothing to do at grace-period start. |
| 383 | */ | 375 | */ |
| 384 | static void rcu_preempt_boost_start_gp(void) | 376 | static void rcu_preempt_boost_start_gp(void) |
| @@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void) | |||
| 418 | if (!rcu_preempt_gp_in_progress()) | 410 | if (!rcu_preempt_gp_in_progress()) |
| 419 | return; | 411 | return; |
| 420 | /* | 412 | /* |
| 421 | * Check up on boosting. If there are no readers blocking the | 413 | * Check up on boosting. If there are readers blocking the |
| 422 | * current grace period, leave. | 414 | * current grace period, leave. |
| 423 | */ | 415 | */ |
| 424 | if (rcu_initiate_boost()) | 416 | if (rcu_initiate_boost()) |
| @@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 578 | empty = !rcu_preempt_blocked_readers_cgp(); | 570 | empty = !rcu_preempt_blocked_readers_cgp(); |
| 579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; | 571 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
| 580 | np = rcu_next_node_entry(t); | 572 | np = rcu_next_node_entry(t); |
| 581 | list_del(&t->rcu_node_entry); | 573 | list_del_init(&t->rcu_node_entry); |
| 582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) | 574 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
| 583 | rcu_preempt_ctrlblk.gp_tasks = np; | 575 | rcu_preempt_ctrlblk.gp_tasks = np; |
| 584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) | 576 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
| @@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 587 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | 579 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) |
| 588 | rcu_preempt_ctrlblk.boost_tasks = np; | 580 | rcu_preempt_ctrlblk.boost_tasks = np; |
| 589 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 581 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 590 | INIT_LIST_HEAD(&t->rcu_node_entry); | ||
| 591 | 582 | ||
| 592 | /* | 583 | /* |
| 593 | * If this was the last task on the current list, and if | 584 | * If this was the last task on the current list, and if |
| @@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void) | |||
| 812 | rpcp->exp_tasks = rpcp->blkd_tasks.next; | 803 | rpcp->exp_tasks = rpcp->blkd_tasks.next; |
| 813 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) | 804 | if (rpcp->exp_tasks == &rpcp->blkd_tasks) |
| 814 | rpcp->exp_tasks = NULL; | 805 | rpcp->exp_tasks = NULL; |
| 815 | local_irq_restore(flags); | ||
| 816 | 806 | ||
| 817 | /* Wait for tail of ->blkd_tasks list to drain. */ | 807 | /* Wait for tail of ->blkd_tasks list to drain. */ |
| 818 | if (rcu_preempted_readers_exp()) | 808 | if (!rcu_preempted_readers_exp()) |
| 819 | rcu_initiate_expedited_boost(); | 809 | local_irq_restore(flags); |
| 810 | else { | ||
| 811 | rcu_initiate_boost(); | ||
| 812 | local_irq_restore(flags); | ||
| 820 | wait_event(sync_rcu_preempt_exp_wq, | 813 | wait_event(sync_rcu_preempt_exp_wq, |
| 821 | !rcu_preempted_readers_exp()); | 814 | !rcu_preempted_readers_exp()); |
| 815 | } | ||
| 822 | 816 | ||
| 823 | /* Clean up and exit. */ | 817 | /* Clean up and exit. */ |
| 824 | barrier(); /* ensure expedited GP seen before counter increment. */ | 818 | barrier(); /* ensure expedited GP seen before counter increment. */ |
| @@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void) | |||
| 931 | 925 | ||
| 932 | static void rcu_initiate_boost_trace(void) | 926 | static void rcu_initiate_boost_trace(void) |
| 933 | { | 927 | { |
| 934 | if (rcu_preempt_ctrlblk.gp_tasks == NULL) | 928 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) |
| 935 | rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | 929 | rcu_preempt_ctrlblk.n_balk_blkd_tasks++; |
| 930 | else if (rcu_preempt_ctrlblk.gp_tasks == NULL && | ||
| 931 | rcu_preempt_ctrlblk.exp_tasks == NULL) | ||
| 932 | rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; | ||
| 936 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | 933 | else if (rcu_preempt_ctrlblk.boost_tasks != NULL) |
| 937 | rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | 934 | rcu_preempt_ctrlblk.n_balk_boost_tasks++; |
| 938 | else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | ||
| 939 | rcu_preempt_ctrlblk.n_normal_balk_boosted++; | ||
| 940 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | 935 | else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) |
| 941 | rcu_preempt_ctrlblk.n_normal_balk_notyet++; | 936 | rcu_preempt_ctrlblk.n_balk_notyet++; |
| 942 | else | ||
| 943 | rcu_preempt_ctrlblk.n_normal_balk_nos++; | ||
| 944 | } | ||
| 945 | |||
| 946 | static void rcu_initiate_exp_boost_trace(void) | ||
| 947 | { | ||
| 948 | if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | ||
| 949 | rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | ||
| 950 | else | 937 | else |
| 951 | rcu_preempt_ctrlblk.n_exp_balk_nos++; | 938 | rcu_preempt_ctrlblk.n_balk_nos++; |
| 952 | } | 939 | } |
| 953 | 940 | ||
| 954 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 941 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
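
RCU_BOOST_DELAY_JIFFIES above converts the millisecond CONFIG_RCU_BOOST_DELAY knob into jiffies with DIV_ROUND_UP, and boost_time is then compared against jiffies with the wrap-safe ULONG_CMP_GE(). A small sketch of the same conversion and deadline check, using the generic time_after_eq() helper in the wrap-safe role; the names and the 500 ms value are illustrative only:

    #include <linux/jiffies.h>
    #include <linux/kernel.h>
    #include <linux/types.h>

    #define EXAMPLE_BOOST_DELAY_MS       500    /* stands in for CONFIG_RCU_BOOST_DELAY */
    #define EXAMPLE_BOOST_DELAY_JIFFIES  DIV_ROUND_UP(EXAMPLE_BOOST_DELAY_MS * HZ, 1000)

    static unsigned long example_boost_time;

    /* At grace-period start: arm the deadline, delay milliseconds from now. */
    static void example_start_gp(void)
    {
            example_boost_time = jiffies + EXAMPLE_BOOST_DELAY_JIFFIES;
    }

    /* Later: has the deadline passed?  time_after_eq() handles jiffies
     * wraparound, playing the same role as ULONG_CMP_GE() in the patch. */
    static bool example_boost_due(void)
    {
            return time_after_eq(jiffies, example_boost_time);
    }
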
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c224da41890c..98f51b13bb7e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -33,7 +33,7 @@ | |||
| 33 | #include <linux/rcupdate.h> | 33 | #include <linux/rcupdate.h> |
| 34 | #include <linux/interrupt.h> | 34 | #include <linux/interrupt.h> |
| 35 | #include <linux/sched.h> | 35 | #include <linux/sched.h> |
| 36 | #include <asm/atomic.h> | 36 | #include <linux/atomic.h> |
| 37 | #include <linux/bitops.h> | 37 | #include <linux/bitops.h> |
| 38 | #include <linux/completion.h> | 38 | #include <linux/completion.h> |
| 39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
| @@ -131,7 +131,7 @@ struct rcu_torture { | |||
| 131 | 131 | ||
| 132 | static LIST_HEAD(rcu_torture_freelist); | 132 | static LIST_HEAD(rcu_torture_freelist); |
| 133 | static struct rcu_torture __rcu *rcu_torture_current; | 133 | static struct rcu_torture __rcu *rcu_torture_current; |
| 134 | static long rcu_torture_current_version; | 134 | static unsigned long rcu_torture_current_version; |
| 135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | 135 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; |
| 136 | static DEFINE_SPINLOCK(rcu_torture_lock); | 136 | static DEFINE_SPINLOCK(rcu_torture_lock); |
| 137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | 137 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = |
| @@ -146,8 +146,6 @@ static atomic_t n_rcu_torture_mberror; | |||
| 146 | static atomic_t n_rcu_torture_error; | 146 | static atomic_t n_rcu_torture_error; |
| 147 | static long n_rcu_torture_boost_ktrerror; | 147 | static long n_rcu_torture_boost_ktrerror; |
| 148 | static long n_rcu_torture_boost_rterror; | 148 | static long n_rcu_torture_boost_rterror; |
| 149 | static long n_rcu_torture_boost_allocerror; | ||
| 150 | static long n_rcu_torture_boost_afferror; | ||
| 151 | static long n_rcu_torture_boost_failure; | 149 | static long n_rcu_torture_boost_failure; |
| 152 | static long n_rcu_torture_boosts; | 150 | static long n_rcu_torture_boosts; |
| 153 | static long n_rcu_torture_timers; | 151 | static long n_rcu_torture_timers; |
| @@ -163,11 +161,11 @@ static int stutter_pause_test; | |||
| 163 | #endif | 161 | #endif |
| 164 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | 162 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
| 165 | 163 | ||
| 166 | #ifdef CONFIG_RCU_BOOST | 164 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) |
| 167 | #define rcu_can_boost() 1 | 165 | #define rcu_can_boost() 1 |
| 168 | #else /* #ifdef CONFIG_RCU_BOOST */ | 166 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
| 169 | #define rcu_can_boost() 0 | 167 | #define rcu_can_boost() 0 |
| 170 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | 168 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ |
| 171 | 169 | ||
| 172 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | 170 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ |
| 173 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | 171 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ |
| @@ -751,6 +749,7 @@ static int rcu_torture_boost(void *arg) | |||
| 751 | n_rcu_torture_boost_rterror++; | 749 | n_rcu_torture_boost_rterror++; |
| 752 | } | 750 | } |
| 753 | 751 | ||
| 752 | init_rcu_head_on_stack(&rbi.rcu); | ||
| 754 | /* Each pass through the following loop does one boost-test cycle. */ | 753 | /* Each pass through the following loop does one boost-test cycle. */ |
| 755 | do { | 754 | do { |
| 756 | /* Wait for the next test interval. */ | 755 | /* Wait for the next test interval. */ |
| @@ -810,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); | |||
| 810 | 809 | ||
| 811 | /* Clean up and exit. */ | 810 | /* Clean up and exit. */ |
| 812 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | 811 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); |
| 812 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
| 813 | rcutorture_shutdown_absorb("rcu_torture_boost"); | 813 | rcutorture_shutdown_absorb("rcu_torture_boost"); |
| 814 | while (!kthread_should_stop() || rbi.inflight) | 814 | while (!kthread_should_stop() || rbi.inflight) |
| 815 | schedule_timeout_uninterruptible(1); | 815 | schedule_timeout_uninterruptible(1); |
| @@ -886,7 +886,7 @@ rcu_torture_writer(void *arg) | |||
| 886 | old_rp->rtort_pipe_count++; | 886 | old_rp->rtort_pipe_count++; |
| 887 | cur_ops->deferred_free(old_rp); | 887 | cur_ops->deferred_free(old_rp); |
| 888 | } | 888 | } |
| 889 | rcu_torture_current_version++; | 889 | rcutorture_record_progress(++rcu_torture_current_version); |
| 890 | oldbatch = cur_ops->completed(); | 890 | oldbatch = cur_ops->completed(); |
| 891 | rcu_stutter_wait("rcu_torture_writer"); | 891 | rcu_stutter_wait("rcu_torture_writer"); |
| 892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | 892 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); |
| @@ -941,7 +941,6 @@ static void rcu_torture_timer(unsigned long unused) | |||
| 941 | idx = cur_ops->readlock(); | 941 | idx = cur_ops->readlock(); |
| 942 | completed = cur_ops->completed(); | 942 | completed = cur_ops->completed(); |
| 943 | p = rcu_dereference_check(rcu_torture_current, | 943 | p = rcu_dereference_check(rcu_torture_current, |
| 944 | rcu_read_lock_held() || | ||
| 945 | rcu_read_lock_bh_held() || | 944 | rcu_read_lock_bh_held() || |
| 946 | rcu_read_lock_sched_held() || | 945 | rcu_read_lock_sched_held() || |
| 947 | srcu_read_lock_held(&srcu_ctl)); | 946 | srcu_read_lock_held(&srcu_ctl)); |
| @@ -1002,7 +1001,6 @@ rcu_torture_reader(void *arg) | |||
| 1002 | idx = cur_ops->readlock(); | 1001 | idx = cur_ops->readlock(); |
| 1003 | completed = cur_ops->completed(); | 1002 | completed = cur_ops->completed(); |
| 1004 | p = rcu_dereference_check(rcu_torture_current, | 1003 | p = rcu_dereference_check(rcu_torture_current, |
| 1005 | rcu_read_lock_held() || | ||
| 1006 | rcu_read_lock_bh_held() || | 1004 | rcu_read_lock_bh_held() || |
| 1007 | rcu_read_lock_sched_held() || | 1005 | rcu_read_lock_sched_held() || |
| 1008 | srcu_read_lock_held(&srcu_ctl)); | 1006 | srcu_read_lock_held(&srcu_ctl)); |
| @@ -1066,8 +1064,8 @@ rcu_torture_printk(char *page) | |||
| 1066 | } | 1064 | } |
| 1067 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | 1065 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
| 1068 | cnt += sprintf(&page[cnt], | 1066 | cnt += sprintf(&page[cnt], |
| 1069 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 1067 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " |
| 1070 | "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " | 1068 | "rtmbe: %d rtbke: %ld rtbre: %ld " |
| 1071 | "rtbf: %ld rtb: %ld nt: %ld", | 1069 | "rtbf: %ld rtb: %ld nt: %ld", |
| 1072 | rcu_torture_current, | 1070 | rcu_torture_current, |
| 1073 | rcu_torture_current_version, | 1071 | rcu_torture_current_version, |
| @@ -1078,16 +1076,12 @@ rcu_torture_printk(char *page) | |||
| 1078 | atomic_read(&n_rcu_torture_mberror), | 1076 | atomic_read(&n_rcu_torture_mberror), |
| 1079 | n_rcu_torture_boost_ktrerror, | 1077 | n_rcu_torture_boost_ktrerror, |
| 1080 | n_rcu_torture_boost_rterror, | 1078 | n_rcu_torture_boost_rterror, |
| 1081 | n_rcu_torture_boost_allocerror, | ||
| 1082 | n_rcu_torture_boost_afferror, | ||
| 1083 | n_rcu_torture_boost_failure, | 1079 | n_rcu_torture_boost_failure, |
| 1084 | n_rcu_torture_boosts, | 1080 | n_rcu_torture_boosts, |
| 1085 | n_rcu_torture_timers); | 1081 | n_rcu_torture_timers); |
| 1086 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | 1082 | if (atomic_read(&n_rcu_torture_mberror) != 0 || |
| 1087 | n_rcu_torture_boost_ktrerror != 0 || | 1083 | n_rcu_torture_boost_ktrerror != 0 || |
| 1088 | n_rcu_torture_boost_rterror != 0 || | 1084 | n_rcu_torture_boost_rterror != 0 || |
| 1089 | n_rcu_torture_boost_allocerror != 0 || | ||
| 1090 | n_rcu_torture_boost_afferror != 0 || | ||
| 1091 | n_rcu_torture_boost_failure != 0) | 1085 | n_rcu_torture_boost_failure != 0) |
| 1092 | cnt += sprintf(&page[cnt], " !!!"); | 1086 | cnt += sprintf(&page[cnt], " !!!"); |
| 1093 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | 1087 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
| @@ -1331,6 +1325,7 @@ rcu_torture_cleanup(void) | |||
| 1331 | int i; | 1325 | int i; |
| 1332 | 1326 | ||
| 1333 | mutex_lock(&fullstop_mutex); | 1327 | mutex_lock(&fullstop_mutex); |
| 1328 | rcutorture_record_test_transition(); | ||
| 1334 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1329 | if (fullstop == FULLSTOP_SHUTDOWN) { |
| 1335 | printk(KERN_WARNING /* but going down anyway, so... */ | 1330 | printk(KERN_WARNING /* but going down anyway, so... */ |
| 1336 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1331 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
| @@ -1486,8 +1481,6 @@ rcu_torture_init(void) | |||
| 1486 | atomic_set(&n_rcu_torture_error, 0); | 1481 | atomic_set(&n_rcu_torture_error, 0); |
| 1487 | n_rcu_torture_boost_ktrerror = 0; | 1482 | n_rcu_torture_boost_ktrerror = 0; |
| 1488 | n_rcu_torture_boost_rterror = 0; | 1483 | n_rcu_torture_boost_rterror = 0; |
| 1489 | n_rcu_torture_boost_allocerror = 0; | ||
| 1490 | n_rcu_torture_boost_afferror = 0; | ||
| 1491 | n_rcu_torture_boost_failure = 0; | 1484 | n_rcu_torture_boost_failure = 0; |
| 1492 | n_rcu_torture_boosts = 0; | 1485 | n_rcu_torture_boosts = 0; |
| 1493 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 1486 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
| @@ -1624,6 +1617,7 @@ rcu_torture_init(void) | |||
| 1624 | } | 1617 | } |
| 1625 | } | 1618 | } |
| 1626 | register_reboot_notifier(&rcutorture_shutdown_nb); | 1619 | register_reboot_notifier(&rcutorture_shutdown_nb); |
| 1620 | rcutorture_record_test_transition(); | ||
| 1627 | mutex_unlock(&fullstop_mutex); | 1621 | mutex_unlock(&fullstop_mutex); |
| 1628 | return 0; | 1622 | return 0; |
| 1629 | 1623 | ||
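A minimal illustration (not part of the patch) of how the rcutorture_record_*() hooks defined in rcutree.c below drive the two correlation counters across one load/run/unload cycle; the helper name is hypothetical and the extern declarations are repeated only to keep the sketch self-contained.

extern void rcutorture_record_test_transition(void);
extern void rcutorture_record_progress(unsigned long vernum);

static void rcutorture_counter_example(void)
{
        rcutorture_record_test_transition();  /* load:   rcutorture_testseq 0 -> 1 (odd: test running), vernum reset to 0 */
        rcutorture_record_progress(1);        /* writer pass: rcutorture_vernum 0 -> 1 */
        rcutorture_record_progress(2);        /* writer pass: rcutorture_vernum 1 -> 2 */
        rcutorture_record_test_transition();  /* unload: rcutorture_testseq 1 -> 2 (even: no test running) */
}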
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8e..ba06207b1dd3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -36,7 +36,7 @@ | |||
| 36 | #include <linux/interrupt.h> | 36 | #include <linux/interrupt.h> |
| 37 | #include <linux/sched.h> | 37 | #include <linux/sched.h> |
| 38 | #include <linux/nmi.h> | 38 | #include <linux/nmi.h> |
| 39 | #include <asm/atomic.h> | 39 | #include <linux/atomic.h> |
| 40 | #include <linux/bitops.h> | 40 | #include <linux/bitops.h> |
| 41 | #include <linux/module.h> | 41 | #include <linux/module.h> |
| 42 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
| @@ -47,6 +47,9 @@ | |||
| 47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
| 48 | #include <linux/time.h> | 48 | #include <linux/time.h> |
| 49 | #include <linux/kernel_stat.h> | 49 | #include <linux/kernel_stat.h> |
| 50 | #include <linux/wait.h> | ||
| 51 | #include <linux/kthread.h> | ||
| 52 | #include <linux/prefetch.h> | ||
| 50 | 53 | ||
| 51 | #include "rcutree.h" | 54 | #include "rcutree.h" |
| 52 | 55 | ||
| @@ -79,10 +82,67 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | |||
| 79 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); | 82 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); |
| 80 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 83 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
| 81 | 84 | ||
| 85 | static struct rcu_state *rcu_state; | ||
| 86 | |||
| 87 | /* | ||
| 88 | * The rcu_scheduler_active variable transitions from zero to one just | ||
| 89 | * before the first task is spawned. So when this variable is zero, RCU | ||
| 90 | * can assume that there is but one task, allowing RCU to (for example) | ||
| 91 | * optimize synchronize_sched() to a simple barrier(). When this variable | ||
| 92 | * is one, RCU must actually do all the hard work required to detect real | ||
| 93 | * grace periods. This variable is also used to suppress boot-time false | ||
| 94 | * positives from lockdep-RCU error checking. | ||
| 95 | */ | ||
| 82 | int rcu_scheduler_active __read_mostly; | 96 | int rcu_scheduler_active __read_mostly; |
| 83 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | 97 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); |
| 84 | 98 | ||
| 85 | /* | 99 | /* |
| 100 | * The rcu_scheduler_fully_active variable transitions from zero to one | ||
| 101 | * during the early_initcall() processing, which is after the scheduler | ||
| 102 | * is capable of creating new tasks. So RCU processing (for example, | ||
| 103 | * creating tasks for RCU priority boosting) must be delayed until after | ||
| 104 | * rcu_scheduler_fully_active transitions from zero to one. We also | ||
| 105 | * currently delay invocation of any RCU callbacks until after this point. | ||
| 106 | * | ||
| 107 | * It might later prove better for people registering RCU callbacks during | ||
| 108 | * early boot to take responsibility for these callbacks, but one step at | ||
| 109 | * a time. | ||
| 110 | */ | ||
| 111 | static int rcu_scheduler_fully_active __read_mostly; | ||
| 112 | |||
| 113 | #ifdef CONFIG_RCU_BOOST | ||
| 114 | |||
| 115 | /* | ||
| 116 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
| 117 | * handle all flavors of RCU. | ||
| 118 | */ | ||
| 119 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
| 120 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 121 | DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
| 122 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 123 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
| 124 | |||
| 125 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 126 | |||
| 127 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | ||
| 128 | static void invoke_rcu_core(void); | ||
| 129 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | ||
| 130 | |||
| 131 | #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ | ||
| 132 | |||
| 133 | /* | ||
| 134 | * Track the rcutorture test sequence number and the update version | ||
| 135 | * number within a given test. The rcutorture_testseq is incremented | ||
| 136 | * on every rcutorture module load and unload, so has an odd value | ||
| 137 | * when a test is running. The rcutorture_vernum is set to zero | ||
| 138 | * when rcutorture starts and is incremented on each rcutorture update. | ||
| 139 | * These variables enable correlating rcutorture output with the | ||
| 140 | * RCU tracing information. | ||
| 141 | */ | ||
| 142 | unsigned long rcutorture_testseq; | ||
| 143 | unsigned long rcutorture_vernum; | ||
| 144 | |||
| 145 | /* | ||
| 86 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 146 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
| 87 | * permit this function to be invoked without holding the root rcu_node | 147 | * permit this function to be invoked without holding the root rcu_node |
| 88 | * structure's ->lock, but of course results can be subject to change. | 148 | * structure's ->lock, but of course results can be subject to change. |
| @@ -124,11 +184,12 @@ void rcu_note_context_switch(int cpu) | |||
| 124 | rcu_sched_qs(cpu); | 184 | rcu_sched_qs(cpu); |
| 125 | rcu_preempt_note_context_switch(cpu); | 185 | rcu_preempt_note_context_switch(cpu); |
| 126 | } | 186 | } |
| 187 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | ||
| 127 | 188 | ||
| 128 | #ifdef CONFIG_NO_HZ | 189 | #ifdef CONFIG_NO_HZ |
| 129 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 190 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
| 130 | .dynticks_nesting = 1, | 191 | .dynticks_nesting = 1, |
| 131 | .dynticks = 1, | 192 | .dynticks = ATOMIC_INIT(1), |
| 132 | }; | 193 | }; |
| 133 | #endif /* #ifdef CONFIG_NO_HZ */ | 194 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 134 | 195 | ||
| @@ -140,10 +201,8 @@ module_param(blimit, int, 0); | |||
| 140 | module_param(qhimark, int, 0); | 201 | module_param(qhimark, int, 0); |
| 141 | module_param(qlowmark, int, 0); | 202 | module_param(qlowmark, int, 0); |
| 142 | 203 | ||
| 143 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | 204 | int rcu_cpu_stall_suppress __read_mostly; |
| 144 | int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; | ||
| 145 | module_param(rcu_cpu_stall_suppress, int, 0644); | 205 | module_param(rcu_cpu_stall_suppress, int, 0644); |
| 146 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 147 | 206 | ||
| 148 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 207 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); |
| 149 | static int rcu_pending(int cpu); | 208 | static int rcu_pending(int cpu); |
| @@ -176,6 +235,31 @@ void rcu_bh_force_quiescent_state(void) | |||
| 176 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 235 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
| 177 | 236 | ||
| 178 | /* | 237 | /* |
| 238 | * Record the number of times rcutorture tests have been initiated and | ||
| 239 | * terminated. This information allows the debugfs tracing stats to be | ||
| 240 | * correlated to the rcutorture messages, even when the rcutorture module | ||
| 241 | * is being repeatedly loaded and unloaded. In other words, we cannot | ||
| 242 | * store this state in rcutorture itself. | ||
| 243 | */ | ||
| 244 | void rcutorture_record_test_transition(void) | ||
| 245 | { | ||
| 246 | rcutorture_testseq++; | ||
| 247 | rcutorture_vernum = 0; | ||
| 248 | } | ||
| 249 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | ||
| 250 | |||
| 251 | /* | ||
| 252 | * Record the number of writer passes through the current rcutorture test. | ||
| 253 | * This is also used to correlate debugfs tracing stats with the rcutorture | ||
| 254 | * messages. | ||
| 255 | */ | ||
| 256 | void rcutorture_record_progress(unsigned long vernum) | ||
| 257 | { | ||
| 258 | rcutorture_vernum++; | ||
| 259 | } | ||
| 260 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | ||
| 261 | |||
| 262 | /* | ||
| 179 | * Force a quiescent state for RCU-sched. | 263 | * Force a quiescent state for RCU-sched. |
| 180 | */ | 264 | */ |
| 181 | void rcu_sched_force_quiescent_state(void) | 265 | void rcu_sched_force_quiescent_state(void) |
| @@ -234,8 +318,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) | |||
| 234 | return 1; | 318 | return 1; |
| 235 | } | 319 | } |
| 236 | 320 | ||
| 237 | /* If preemptable RCU, no point in sending reschedule IPI. */ | 321 | /* If preemptible RCU, no point in sending reschedule IPI. */ |
| 238 | if (rdp->preemptable) | 322 | if (rdp->preemptible) |
| 239 | return 0; | 323 | return 0; |
| 240 | 324 | ||
| 241 | /* The CPU is online, so send it a reschedule IPI. */ | 325 | /* The CPU is online, so send it a reschedule IPI. */ |
| @@ -264,13 +348,25 @@ void rcu_enter_nohz(void) | |||
| 264 | unsigned long flags; | 348 | unsigned long flags; |
| 265 | struct rcu_dynticks *rdtp; | 349 | struct rcu_dynticks *rdtp; |
| 266 | 350 | ||
| 267 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
| 268 | local_irq_save(flags); | 351 | local_irq_save(flags); |
| 269 | rdtp = &__get_cpu_var(rcu_dynticks); | 352 | rdtp = &__get_cpu_var(rcu_dynticks); |
| 270 | rdtp->dynticks++; | 353 | if (--rdtp->dynticks_nesting) { |
| 271 | rdtp->dynticks_nesting--; | 354 | local_irq_restore(flags); |
| 272 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | 355 | return; |
| 356 | } | ||
| 357 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
| 358 | smp_mb__before_atomic_inc(); /* See above. */ | ||
| 359 | atomic_inc(&rdtp->dynticks); | ||
| 360 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
| 361 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
| 273 | local_irq_restore(flags); | 362 | local_irq_restore(flags); |
| 363 | |||
| 364 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
| 365 | if (in_irq() && | ||
| 366 | (__get_cpu_var(rcu_sched_data).nxtlist || | ||
| 367 | __get_cpu_var(rcu_bh_data).nxtlist || | ||
| 368 | rcu_preempt_needs_cpu(smp_processor_id()))) | ||
| 369 | set_need_resched(); | ||
| 274 | } | 370 | } |
| 275 | 371 | ||
| 276 | /* | 372 | /* |
| @@ -286,11 +382,16 @@ void rcu_exit_nohz(void) | |||
| 286 | 382 | ||
| 287 | local_irq_save(flags); | 383 | local_irq_save(flags); |
| 288 | rdtp = &__get_cpu_var(rcu_dynticks); | 384 | rdtp = &__get_cpu_var(rcu_dynticks); |
| 289 | rdtp->dynticks++; | 385 | if (rdtp->dynticks_nesting++) { |
| 290 | rdtp->dynticks_nesting++; | 386 | local_irq_restore(flags); |
| 291 | WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); | 387 | return; |
| 388 | } | ||
| 389 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | ||
| 390 | atomic_inc(&rdtp->dynticks); | ||
| 391 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
| 392 | smp_mb__after_atomic_inc(); /* See above. */ | ||
| 393 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 292 | local_irq_restore(flags); | 394 | local_irq_restore(flags); |
| 293 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
| 294 | } | 395 | } |
| 295 | 396 | ||
| 296 | /** | 397 | /** |
| @@ -304,11 +405,15 @@ void rcu_nmi_enter(void) | |||
| 304 | { | 405 | { |
| 305 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 406 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); |
| 306 | 407 | ||
| 307 | if (rdtp->dynticks & 0x1) | 408 | if (rdtp->dynticks_nmi_nesting == 0 && |
| 409 | (atomic_read(&rdtp->dynticks) & 0x1)) | ||
| 308 | return; | 410 | return; |
| 309 | rdtp->dynticks_nmi++; | 411 | rdtp->dynticks_nmi_nesting++; |
| 310 | WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); | 412 | smp_mb__before_atomic_inc(); /* Force delay from prior write. */ |
| 311 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | 413 | atomic_inc(&rdtp->dynticks); |
| 414 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
| 415 | smp_mb__after_atomic_inc(); /* See above. */ | ||
| 416 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 312 | } | 417 | } |
| 313 | 418 | ||
| 314 | /** | 419 | /** |
| @@ -322,11 +427,14 @@ void rcu_nmi_exit(void) | |||
| 322 | { | 427 | { |
| 323 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 428 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); |
| 324 | 429 | ||
| 325 | if (rdtp->dynticks & 0x1) | 430 | if (rdtp->dynticks_nmi_nesting == 0 || |
| 431 | --rdtp->dynticks_nmi_nesting != 0) | ||
| 326 | return; | 432 | return; |
| 327 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | 433 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ |
| 328 | rdtp->dynticks_nmi++; | 434 | smp_mb__before_atomic_inc(); /* See above. */ |
| 329 | WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); | 435 | atomic_inc(&rdtp->dynticks); |
| 436 | smp_mb__after_atomic_inc(); /* Force delay to next write. */ | ||
| 437 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
| 330 | } | 438 | } |
| 331 | 439 | ||
| 332 | /** | 440 | /** |
| @@ -337,13 +445,7 @@ void rcu_nmi_exit(void) | |||
| 337 | */ | 445 | */ |
| 338 | void rcu_irq_enter(void) | 446 | void rcu_irq_enter(void) |
| 339 | { | 447 | { |
| 340 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 448 | rcu_exit_nohz(); |
| 341 | |||
| 342 | if (rdtp->dynticks_nesting++) | ||
| 343 | return; | ||
| 344 | rdtp->dynticks++; | ||
| 345 | WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); | ||
| 346 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
| 347 | } | 449 | } |
| 348 | 450 | ||
| 349 | /** | 451 | /** |
| @@ -355,18 +457,7 @@ void rcu_irq_enter(void) | |||
| 355 | */ | 457 | */ |
| 356 | void rcu_irq_exit(void) | 458 | void rcu_irq_exit(void) |
| 357 | { | 459 | { |
| 358 | struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); | 460 | rcu_enter_nohz(); |
| 359 | |||
| 360 | if (--rdtp->dynticks_nesting) | ||
| 361 | return; | ||
| 362 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
| 363 | rdtp->dynticks++; | ||
| 364 | WARN_ON_ONCE(rdtp->dynticks & 0x1); | ||
| 365 | |||
| 366 | /* If the interrupt queued a callback, get out of dyntick mode. */ | ||
| 367 | if (__this_cpu_read(rcu_sched_data.nxtlist) || | ||
| 368 | __this_cpu_read(rcu_bh_data.nxtlist)) | ||
| 369 | set_need_resched(); | ||
| 370 | } | 461 | } |
| 371 | 462 | ||
| 372 | #ifdef CONFIG_SMP | 463 | #ifdef CONFIG_SMP |
| @@ -378,19 +469,8 @@ void rcu_irq_exit(void) | |||
| 378 | */ | 469 | */ |
| 379 | static int dyntick_save_progress_counter(struct rcu_data *rdp) | 470 | static int dyntick_save_progress_counter(struct rcu_data *rdp) |
| 380 | { | 471 | { |
| 381 | int ret; | 472 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); |
| 382 | int snap; | 473 | return 0; |
| 383 | int snap_nmi; | ||
| 384 | |||
| 385 | snap = rdp->dynticks->dynticks; | ||
| 386 | snap_nmi = rdp->dynticks->dynticks_nmi; | ||
| 387 | smp_mb(); /* Order sampling of snap with end of grace period. */ | ||
| 388 | rdp->dynticks_snap = snap; | ||
| 389 | rdp->dynticks_nmi_snap = snap_nmi; | ||
| 390 | ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0); | ||
| 391 | if (ret) | ||
| 392 | rdp->dynticks_fqs++; | ||
| 393 | return ret; | ||
| 394 | } | 474 | } |
| 395 | 475 | ||
| 396 | /* | 476 | /* |
| @@ -401,16 +481,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
| 401 | */ | 481 | */ |
| 402 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 482 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
| 403 | { | 483 | { |
| 404 | long curr; | 484 | unsigned long curr; |
| 405 | long curr_nmi; | 485 | unsigned long snap; |
| 406 | long snap; | ||
| 407 | long snap_nmi; | ||
| 408 | 486 | ||
| 409 | curr = rdp->dynticks->dynticks; | 487 | curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); |
| 410 | snap = rdp->dynticks_snap; | 488 | snap = (unsigned long)rdp->dynticks_snap; |
| 411 | curr_nmi = rdp->dynticks->dynticks_nmi; | ||
| 412 | snap_nmi = rdp->dynticks_nmi_snap; | ||
| 413 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
| 414 | 489 | ||
| 415 | /* | 490 | /* |
| 416 | * If the CPU passed through or entered a dynticks idle phase with | 491 | * If the CPU passed through or entered a dynticks idle phase with |
| @@ -420,8 +495,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 420 | * read-side critical section that started before the beginning | 495 | * read-side critical section that started before the beginning |
| 421 | * of the current RCU grace period. | 496 | * of the current RCU grace period. |
| 422 | */ | 497 | */ |
| 423 | if ((curr != snap || (curr & 0x1) == 0) && | 498 | if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { |
| 424 | (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) { | ||
| 425 | rdp->dynticks_fqs++; | 499 | rdp->dynticks_fqs++; |
| 426 | return 1; | 500 | return 1; |
| 427 | } | 501 | } |
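A minimal sketch (not part of the patch; the helper name is hypothetical) of the quiescent-state test that replaces the old dynticks/dynticks_nmi snapshot pair: an even counter value means the CPU is in dyntick-idle right now, and an advance of at least two since the force_quiescent_state() snapshot means it passed through dyntick-idle in the meantime. ULONG_CMP_GE() is the wrap-safe comparison already used elsewhere in this patch.

static int dynticks_snapshot_implies_qs(unsigned long curr, unsigned long snap)
{
        return (curr & 0x1) == 0 ||             /* in dyntick-idle now, or ...      */
               ULONG_CMP_GE(curr, snap + 2);    /* ... was idle since the snapshot. */
}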
| @@ -450,8 +524,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 450 | 524 | ||
| 451 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 525 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
| 452 | 526 | ||
| 453 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 454 | |||
| 455 | int rcu_cpu_stall_suppress __read_mostly; | 527 | int rcu_cpu_stall_suppress __read_mostly; |
| 456 | 528 | ||
| 457 | static void record_gp_stall_check_time(struct rcu_state *rsp) | 529 | static void record_gp_stall_check_time(struct rcu_state *rsp) |
| @@ -537,21 +609,24 @@ static void print_cpu_stall(struct rcu_state *rsp) | |||
| 537 | 609 | ||
| 538 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | 610 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) |
| 539 | { | 611 | { |
| 540 | long delta; | 612 | unsigned long j; |
| 613 | unsigned long js; | ||
| 541 | struct rcu_node *rnp; | 614 | struct rcu_node *rnp; |
| 542 | 615 | ||
| 543 | if (rcu_cpu_stall_suppress) | 616 | if (rcu_cpu_stall_suppress) |
| 544 | return; | 617 | return; |
| 545 | delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); | 618 | j = ACCESS_ONCE(jiffies); |
| 619 | js = ACCESS_ONCE(rsp->jiffies_stall); | ||
| 546 | rnp = rdp->mynode; | 620 | rnp = rdp->mynode; |
| 547 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { | 621 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { |
| 548 | 622 | ||
| 549 | /* We haven't checked in, so go dump stack. */ | 623 | /* We haven't checked in, so go dump stack. */ |
| 550 | print_cpu_stall(rsp); | 624 | print_cpu_stall(rsp); |
| 551 | 625 | ||
| 552 | } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) { | 626 | } else if (rcu_gp_in_progress(rsp) && |
| 627 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | ||
| 553 | 628 | ||
| 554 | /* They had two time units to dump stack, so complain. */ | 629 | /* They had a few time units to dump stack, so complain. */ |
| 555 | print_other_cpu_stall(rsp); | 630 | print_other_cpu_stall(rsp); |
| 556 | } | 631 | } |
| 557 | } | 632 | } |
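A minimal worked example (not part of the patch; the function name and values are illustrative only) of why the stall check now compares raw jiffies values with ULONG_CMP_GE() instead of computing a signed delta: the unsigned, wrap-safe comparison gives the right answer even when jiffies wraps between recording the deadline and checking it.

static int stall_deadline_reached_example(void)
{
        unsigned long js = ULONG_MAX - 1;       /* stall deadline recorded just before jiffies wraps */
        unsigned long j = 3;                    /* jiffies sampled just after the wrap */

        return ULONG_CMP_GE(j, js);             /* true: j is five ticks "after" js */
}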
| @@ -587,26 +662,6 @@ static void __init check_cpu_stall_init(void) | |||
| 587 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | 662 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); |
| 588 | } | 663 | } |
| 589 | 664 | ||
| 590 | #else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 591 | |||
| 592 | static void record_gp_stall_check_time(struct rcu_state *rsp) | ||
| 593 | { | ||
| 594 | } | ||
| 595 | |||
| 596 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 597 | { | ||
| 598 | } | ||
| 599 | |||
| 600 | void rcu_cpu_stall_reset(void) | ||
| 601 | { | ||
| 602 | } | ||
| 603 | |||
| 604 | static void __init check_cpu_stall_init(void) | ||
| 605 | { | ||
| 606 | } | ||
| 607 | |||
| 608 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 609 | |||
| 610 | /* | 665 | /* |
| 611 | * Update CPU-local rcu_data state to record the newly noticed grace period. | 666 | * Update CPU-local rcu_data state to record the newly noticed grace period. |
| 612 | * This is used both when we started the grace period and when we notice | 667 | * This is used both when we started the grace period and when we notice |
| @@ -809,6 +864,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 809 | rnp->completed = rsp->completed; | 864 | rnp->completed = rsp->completed; |
| 810 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ | 865 | rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ |
| 811 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 866 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
| 867 | rcu_preempt_boost_start_gp(rnp); | ||
| 812 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 868 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 813 | return; | 869 | return; |
| 814 | } | 870 | } |
| @@ -844,6 +900,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 844 | rnp->completed = rsp->completed; | 900 | rnp->completed = rsp->completed; |
| 845 | if (rnp == rdp->mynode) | 901 | if (rnp == rdp->mynode) |
| 846 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 902 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
| 903 | rcu_preempt_boost_start_gp(rnp); | ||
| 847 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 904 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 848 | } | 905 | } |
| 849 | 906 | ||
| @@ -864,7 +921,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 864 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 921 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
| 865 | __releases(rcu_get_root(rsp)->lock) | 922 | __releases(rcu_get_root(rsp)->lock) |
| 866 | { | 923 | { |
| 924 | unsigned long gp_duration; | ||
| 925 | |||
| 867 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 926 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); |
| 927 | |||
| 928 | /* | ||
| 929 | * Ensure that all grace-period and pre-grace-period activity | ||
| 930 | * is seen before the assignment to rsp->completed. | ||
| 931 | */ | ||
| 932 | smp_mb(); /* See above block comment. */ | ||
| 933 | gp_duration = jiffies - rsp->gp_start; | ||
| 934 | if (gp_duration > rsp->gp_max) | ||
| 935 | rsp->gp_max = gp_duration; | ||
| 868 | rsp->completed = rsp->gpnum; | 936 | rsp->completed = rsp->gpnum; |
| 869 | rsp->signaled = RCU_GP_IDLE; | 937 | rsp->signaled = RCU_GP_IDLE; |
| 870 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 938 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ |
| @@ -894,7 +962,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 894 | return; | 962 | return; |
| 895 | } | 963 | } |
| 896 | rnp->qsmask &= ~mask; | 964 | rnp->qsmask &= ~mask; |
| 897 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 965 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
| 898 | 966 | ||
| 899 | /* Other bits still set at this level, so done. */ | 967 | /* Other bits still set at this level, so done. */ |
| 900 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 968 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| @@ -1037,6 +1105,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) | |||
| 1037 | /* | 1105 | /* |
| 1038 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy | 1106 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy |
| 1039 | * and move all callbacks from the outgoing CPU to the current one. | 1107 | * and move all callbacks from the outgoing CPU to the current one. |
| 1108 | * There can only be one CPU hotplug operation at a time, so no other | ||
| 1109 | * CPU can be attempting to update rcu_cpu_kthread_task. | ||
| 1040 | */ | 1110 | */ |
| 1041 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | 1111 | static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) |
| 1042 | { | 1112 | { |
| @@ -1046,6 +1116,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
| 1046 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1116 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1047 | struct rcu_node *rnp; | 1117 | struct rcu_node *rnp; |
| 1048 | 1118 | ||
| 1119 | rcu_stop_cpu_kthread(cpu); | ||
| 1120 | |||
| 1049 | /* Exclude any attempts to start a new grace period. */ | 1121 | /* Exclude any attempts to start a new grace period. */ |
| 1050 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1122 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 1051 | 1123 | ||
| @@ -1082,6 +1154,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) | |||
| 1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1154 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) | 1155 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
| 1084 | rcu_report_exp_rnp(rsp, rnp); | 1156 | rcu_report_exp_rnp(rsp, rnp); |
| 1157 | rcu_node_kthread_setaffinity(rnp, -1); | ||
| 1085 | } | 1158 | } |
| 1086 | 1159 | ||
| 1087 | /* | 1160 | /* |
| @@ -1143,7 +1216,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1143 | next = list->next; | 1216 | next = list->next; |
| 1144 | prefetch(next); | 1217 | prefetch(next); |
| 1145 | debug_rcu_head_unqueue(list); | 1218 | debug_rcu_head_unqueue(list); |
| 1146 | list->func(list); | 1219 | __rcu_reclaim(list); |
| 1147 | list = next; | 1220 | list = next; |
| 1148 | if (++count >= rdp->blimit) | 1221 | if (++count >= rdp->blimit) |
| 1149 | break; | 1222 | break; |
| @@ -1179,7 +1252,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1179 | 1252 | ||
| 1180 | /* Re-raise the RCU softirq if there are callbacks remaining. */ | 1253 | /* Re-raise the RCU softirq if there are callbacks remaining. */ |
| 1181 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | 1254 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
| 1182 | raise_softirq(RCU_SOFTIRQ); | 1255 | invoke_rcu_core(); |
| 1183 | } | 1256 | } |
| 1184 | 1257 | ||
| 1185 | /* | 1258 | /* |
| @@ -1225,7 +1298,7 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 1225 | } | 1298 | } |
| 1226 | rcu_preempt_check_callbacks(cpu); | 1299 | rcu_preempt_check_callbacks(cpu); |
| 1227 | if (rcu_pending(cpu)) | 1300 | if (rcu_pending(cpu)) |
| 1228 | raise_softirq(RCU_SOFTIRQ); | 1301 | invoke_rcu_core(); |
| 1229 | } | 1302 | } |
| 1230 | 1303 | ||
| 1231 | #ifdef CONFIG_SMP | 1304 | #ifdef CONFIG_SMP |
| @@ -1233,6 +1306,8 @@ void rcu_check_callbacks(int cpu, int user) | |||
| 1233 | /* | 1306 | /* |
| 1234 | * Scan the leaf rcu_node structures, processing dyntick state for any that | 1307 | * Scan the leaf rcu_node structures, processing dyntick state for any that |
| 1235 | * have not yet encountered a quiescent state, using the function specified. | 1308 | * have not yet encountered a quiescent state, using the function specified. |
| 1309 | * Also initiate boosting for any threads blocked on the root rcu_node. | ||
| 1310 | * | ||
| 1236 | * The caller must have suppressed start of new grace periods. | 1311 | * The caller must have suppressed start of new grace periods. |
| 1237 | */ | 1312 | */ |
| 1238 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | 1313 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) |
| @@ -1251,7 +1326,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
| 1251 | return; | 1326 | return; |
| 1252 | } | 1327 | } |
| 1253 | if (rnp->qsmask == 0) { | 1328 | if (rnp->qsmask == 0) { |
| 1254 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1329 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ |
| 1255 | continue; | 1330 | continue; |
| 1256 | } | 1331 | } |
| 1257 | cpu = rnp->grplo; | 1332 | cpu = rnp->grplo; |
| @@ -1269,6 +1344,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
| 1269 | } | 1344 | } |
| 1270 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1345 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1271 | } | 1346 | } |
| 1347 | rnp = rcu_get_root(rsp); | ||
| 1348 | if (rnp->qsmask == 0) { | ||
| 1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1350 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
| 1351 | } | ||
| 1272 | } | 1352 | } |
| 1273 | 1353 | ||
| 1274 | /* | 1354 | /* |
| @@ -1383,7 +1463,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1383 | } | 1463 | } |
| 1384 | 1464 | ||
| 1385 | /* If there are callbacks ready, invoke them. */ | 1465 | /* If there are callbacks ready, invoke them. */ |
| 1386 | rcu_do_batch(rsp, rdp); | 1466 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
| 1467 | invoke_rcu_callbacks(rsp, rdp); | ||
| 1387 | } | 1468 | } |
| 1388 | 1469 | ||
| 1389 | /* | 1470 | /* |
| @@ -1391,29 +1472,37 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1391 | */ | 1472 | */ |
| 1392 | static void rcu_process_callbacks(struct softirq_action *unused) | 1473 | static void rcu_process_callbacks(struct softirq_action *unused) |
| 1393 | { | 1474 | { |
| 1394 | /* | ||
| 1395 | * Memory references from any prior RCU read-side critical sections | ||
| 1396 | * executed by the interrupted code must be seen before any RCU | ||
| 1397 | * grace-period manipulations below. | ||
| 1398 | */ | ||
| 1399 | smp_mb(); /* See above block comment. */ | ||
| 1400 | |||
| 1401 | __rcu_process_callbacks(&rcu_sched_state, | 1475 | __rcu_process_callbacks(&rcu_sched_state, |
| 1402 | &__get_cpu_var(rcu_sched_data)); | 1476 | &__get_cpu_var(rcu_sched_data)); |
| 1403 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | 1477 | __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); |
| 1404 | rcu_preempt_process_callbacks(); | 1478 | rcu_preempt_process_callbacks(); |
| 1405 | 1479 | ||
| 1406 | /* | ||
| 1407 | * Memory references from any later RCU read-side critical sections | ||
| 1408 | * executed by the interrupted code must be seen after any RCU | ||
| 1409 | * grace-period manipulations above. | ||
| 1410 | */ | ||
| 1411 | smp_mb(); /* See above block comment. */ | ||
| 1412 | |||
| 1413 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ | 1480 | /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ |
| 1414 | rcu_needs_cpu_flush(); | 1481 | rcu_needs_cpu_flush(); |
| 1415 | } | 1482 | } |
| 1416 | 1483 | ||
| 1484 | /* | ||
| 1485 | * Wake up the current CPU's kthread. This replaces raise_softirq() | ||
| 1486 | * in earlier versions of RCU. Note that because we are running on | ||
| 1487 | * the current CPU with interrupts disabled, the rcu_cpu_kthread_task | ||
| 1488 | * cannot disappear out from under us. | ||
| 1489 | */ | ||
| 1490 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 1491 | { | ||
| 1492 | if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) | ||
| 1493 | return; | ||
| 1494 | if (likely(!rsp->boost)) { | ||
| 1495 | rcu_do_batch(rsp, rdp); | ||
| 1496 | return; | ||
| 1497 | } | ||
| 1498 | invoke_rcu_callbacks_kthread(); | ||
| 1499 | } | ||
| 1500 | |||
| 1501 | static void invoke_rcu_core(void) | ||
| 1502 | { | ||
| 1503 | raise_softirq(RCU_SOFTIRQ); | ||
| 1504 | } | ||
| 1505 | |||
| 1417 | static void | 1506 | static void |
| 1418 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | 1507 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), |
| 1419 | struct rcu_state *rsp) | 1508 | struct rcu_state *rsp) |
| @@ -1439,6 +1528,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1439 | /* Add the callback to our list. */ | 1528 | /* Add the callback to our list. */ |
| 1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | 1529 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
| 1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | 1530 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
| 1531 | rdp->qlen++; | ||
| 1532 | |||
| 1533 | /* If interrupts were disabled, don't dive into RCU core. */ | ||
| 1534 | if (irqs_disabled_flags(flags)) { | ||
| 1535 | local_irq_restore(flags); | ||
| 1536 | return; | ||
| 1537 | } | ||
| 1442 | 1538 | ||
| 1443 | /* | 1539 | /* |
| 1444 | * Force the grace period if too many callbacks or too long waiting. | 1540 | * Force the grace period if too many callbacks or too long waiting. |
| @@ -1447,7 +1543,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1447 | * invoking force_quiescent_state() if the newly enqueued callback | 1543 | * invoking force_quiescent_state() if the newly enqueued callback |
| 1448 | * is the only one waiting for a grace period to complete. | 1544 | * is the only one waiting for a grace period to complete. |
| 1449 | */ | 1545 | */ |
| 1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | 1546 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
| 1451 | 1547 | ||
| 1452 | /* Are we ignoring a completed grace period? */ | 1548 | /* Are we ignoring a completed grace period? */ |
| 1453 | rcu_process_gp_end(rsp, rdp); | 1549 | rcu_process_gp_end(rsp, rdp); |
| @@ -1583,7 +1679,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1583 | * or RCU-bh, force a local reschedule. | 1679 | * or RCU-bh, force a local reschedule. |
| 1584 | */ | 1680 | */ |
| 1585 | rdp->n_rp_qs_pending++; | 1681 | rdp->n_rp_qs_pending++; |
| 1586 | if (!rdp->preemptable && | 1682 | if (!rdp->preemptible && |
| 1587 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | 1683 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, |
| 1588 | jiffies)) | 1684 | jiffies)) |
| 1589 | set_need_resched(); | 1685 | set_need_resched(); |
| @@ -1760,7 +1856,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 1760 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | 1856 | * that this CPU cannot possibly have any RCU callbacks in flight yet. |
| 1761 | */ | 1857 | */ |
| 1762 | static void __cpuinit | 1858 | static void __cpuinit |
| 1763 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | 1859 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) |
| 1764 | { | 1860 | { |
| 1765 | unsigned long flags; | 1861 | unsigned long flags; |
| 1766 | unsigned long mask; | 1862 | unsigned long mask; |
| @@ -1772,7 +1868,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1772 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ | 1868 | rdp->passed_quiesc = 0; /* We could be racing with new GP, */ |
| 1773 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ | 1869 | rdp->qs_pending = 1; /* so set up to respond to current GP. */ |
| 1774 | rdp->beenonline = 1; /* We have now been online. */ | 1870 | rdp->beenonline = 1; /* We have now been online. */ |
| 1775 | rdp->preemptable = preemptable; | 1871 | rdp->preemptible = preemptible; |
| 1776 | rdp->qlen_last_fqs_check = 0; | 1872 | rdp->qlen_last_fqs_check = 0; |
| 1777 | rdp->n_force_qs_snap = rsp->n_force_qs; | 1873 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 1778 | rdp->blimit = blimit; | 1874 | rdp->blimit = blimit; |
| @@ -1806,7 +1902,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) | |||
| 1806 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 1902 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 1807 | } | 1903 | } |
| 1808 | 1904 | ||
| 1809 | static void __cpuinit rcu_online_cpu(int cpu) | 1905 | static void __cpuinit rcu_prepare_cpu(int cpu) |
| 1810 | { | 1906 | { |
| 1811 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); | 1907 | rcu_init_percpu_data(cpu, &rcu_sched_state, 0); |
| 1812 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); | 1908 | rcu_init_percpu_data(cpu, &rcu_bh_state, 0); |
| @@ -1820,11 +1916,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
| 1820 | unsigned long action, void *hcpu) | 1916 | unsigned long action, void *hcpu) |
| 1821 | { | 1917 | { |
| 1822 | long cpu = (long)hcpu; | 1918 | long cpu = (long)hcpu; |
| 1919 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
| 1920 | struct rcu_node *rnp = rdp->mynode; | ||
| 1823 | 1921 | ||
| 1824 | switch (action) { | 1922 | switch (action) { |
| 1825 | case CPU_UP_PREPARE: | 1923 | case CPU_UP_PREPARE: |
| 1826 | case CPU_UP_PREPARE_FROZEN: | 1924 | case CPU_UP_PREPARE_FROZEN: |
| 1827 | rcu_online_cpu(cpu); | 1925 | rcu_prepare_cpu(cpu); |
| 1926 | rcu_prepare_kthreads(cpu); | ||
| 1927 | break; | ||
| 1928 | case CPU_ONLINE: | ||
| 1929 | case CPU_DOWN_FAILED: | ||
| 1930 | rcu_node_kthread_setaffinity(rnp, -1); | ||
| 1931 | rcu_cpu_kthread_setrt(cpu, 1); | ||
| 1932 | break; | ||
| 1933 | case CPU_DOWN_PREPARE: | ||
| 1934 | rcu_node_kthread_setaffinity(rnp, cpu); | ||
| 1935 | rcu_cpu_kthread_setrt(cpu, 0); | ||
| 1828 | break; | 1936 | break; |
| 1829 | case CPU_DYING: | 1937 | case CPU_DYING: |
| 1830 | case CPU_DYING_FROZEN: | 1938 | case CPU_DYING_FROZEN: |
| @@ -1943,10 +2051,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 1943 | j / rsp->levelspread[i - 1]; | 2051 | j / rsp->levelspread[i - 1]; |
| 1944 | } | 2052 | } |
| 1945 | rnp->level = i; | 2053 | rnp->level = i; |
| 1946 | INIT_LIST_HEAD(&rnp->blocked_tasks[0]); | 2054 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
| 1947 | INIT_LIST_HEAD(&rnp->blocked_tasks[1]); | ||
| 1948 | INIT_LIST_HEAD(&rnp->blocked_tasks[2]); | ||
| 1949 | INIT_LIST_HEAD(&rnp->blocked_tasks[3]); | ||
| 1950 | } | 2055 | } |
| 1951 | } | 2056 | } |
| 1952 | 2057 | ||
| @@ -1968,7 +2073,7 @@ void __init rcu_init(void) | |||
| 1968 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 2073 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
| 1969 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 2074 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
| 1970 | __rcu_init_preempt(); | 2075 | __rcu_init_preempt(); |
| 1971 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | 2076 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
| 1972 | 2077 | ||
| 1973 | /* | 2078 | /* |
| 1974 | * We don't need protection against CPU-hotplug here because | 2079 | * We don't need protection against CPU-hotplug here because |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e8f057e44e3e..01b2ccda26fb 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -84,13 +84,19 @@ | |||
| 84 | * Dynticks per-CPU state. | 84 | * Dynticks per-CPU state. |
| 85 | */ | 85 | */ |
| 86 | struct rcu_dynticks { | 86 | struct rcu_dynticks { |
| 87 | int dynticks_nesting; /* Track nesting level, sort of. */ | 87 | int dynticks_nesting; /* Track irq/process nesting level. */ |
| 88 | int dynticks; /* Even value for dynticks-idle, else odd. */ | 88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
| 89 | int dynticks_nmi; /* Even value for either dynticks-idle or */ | 89 | atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ |
| 90 | /* not in nmi handler, else odd. So this */ | ||
| 91 | /* remains even for nmi from irq handler. */ | ||
| 92 | }; | 90 | }; |
| 93 | 91 | ||
| 92 | /* RCU's kthread states for tracing. */ | ||
| 93 | #define RCU_KTHREAD_STOPPED 0 | ||
| 94 | #define RCU_KTHREAD_RUNNING 1 | ||
| 95 | #define RCU_KTHREAD_WAITING 2 | ||
| 96 | #define RCU_KTHREAD_OFFCPU 3 | ||
| 97 | #define RCU_KTHREAD_YIELDING 4 | ||
| 98 | #define RCU_KTHREAD_MAX 4 | ||
| 99 | |||
| 94 | /* | 100 | /* |
| 95 | * Definition for node within the RCU grace-period-detection hierarchy. | 101 | * Definition for node within the RCU grace-period-detection hierarchy. |
| 96 | */ | 102 | */ |
| @@ -109,10 +115,13 @@ struct rcu_node { | |||
| 109 | /* an rcu_data structure, otherwise, each */ | 115 | /* an rcu_data structure, otherwise, each */ |
| 110 | /* bit corresponds to a child rcu_node */ | 116 | /* bit corresponds to a child rcu_node */ |
| 111 | /* structure. */ | 117 | /* structure. */ |
| 112 | unsigned long expmask; /* Groups that have ->blocked_tasks[] */ | 118 | unsigned long expmask; /* Groups that have ->blkd_tasks */ |
| 113 | /* elements that need to drain to allow the */ | 119 | /* elements that need to drain to allow the */ |
| 114 | /* current expedited grace period to */ | 120 | /* current expedited grace period to */ |
| 115 | /* complete (only for TREE_PREEMPT_RCU). */ | 121 | /* complete (only for TREE_PREEMPT_RCU). */ |
| 122 | atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ | ||
| 123 | /* Since this has meaning only for leaf */ | ||
| 124 | /* rcu_node structures, 32 bits suffices. */ | ||
| 116 | unsigned long qsmaskinit; | 125 | unsigned long qsmaskinit; |
| 117 | /* Per-GP initial value for qsmask & expmask. */ | 126 | /* Per-GP initial value for qsmask & expmask. */ |
| 118 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 127 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
| @@ -122,11 +131,62 @@ struct rcu_node { | |||
| 122 | u8 grpnum; /* CPU/group number for next level up. */ | 131 | u8 grpnum; /* CPU/group number for next level up. */ |
| 123 | u8 level; /* root is at level 0. */ | 132 | u8 level; /* root is at level 0. */ |
| 124 | struct rcu_node *parent; | 133 | struct rcu_node *parent; |
| 125 | struct list_head blocked_tasks[4]; | 134 | struct list_head blkd_tasks; |
| 126 | /* Tasks blocked in RCU read-side critsect. */ | 135 | /* Tasks blocked in RCU read-side critical */ |
| 127 | /* Grace period number (->gpnum) x blocked */ | 136 | /* section. Tasks are placed at the head */ |
| 128 | /* by tasks on the (x & 0x1) element of the */ | 137 | /* of this list and age towards the tail. */ |
| 129 | /* blocked_tasks[] array. */ | 138 | struct list_head *gp_tasks; |
| 139 | /* Pointer to the first task blocking the */ | ||
| 140 | /* current grace period, or NULL if there */ | ||
| 141 | /* is no such task. */ | ||
| 142 | struct list_head *exp_tasks; | ||
| 143 | /* Pointer to the first task blocking the */ | ||
| 144 | /* current expedited grace period, or NULL */ | ||
| 145 | /* if there is no such task. If there */ | ||
| 146 | /* is no current expedited grace period, */ | ||
| 147 | /* then there cannot be any such task. */ | ||
| 148 | #ifdef CONFIG_RCU_BOOST | ||
| 149 | struct list_head *boost_tasks; | ||
| 150 | /* Pointer to first task that needs to be */ | ||
| 151 | /* priority boosted, or NULL if no priority */ | ||
| 152 | /* boosting is needed for this rcu_node */ | ||
| 153 | /* structure. If there are no tasks */ | ||
| 154 | /* queued on this rcu_node structure that */ | ||
| 155 | /* are blocking the current grace period, */ | ||
| 156 | /* there can be no such task. */ | ||
| 157 | unsigned long boost_time; | ||
| 158 | /* When to start boosting (jiffies). */ | ||
| 159 | struct task_struct *boost_kthread_task; | ||
| 160 | /* kthread that takes care of priority */ | ||
| 161 | /* boosting for this rcu_node structure. */ | ||
| 162 | unsigned int boost_kthread_status; | ||
| 163 | /* State of boost_kthread_task for tracing. */ | ||
| 164 | unsigned long n_tasks_boosted; | ||
| 165 | /* Total number of tasks boosted. */ | ||
| 166 | unsigned long n_exp_boosts; | ||
| 167 | /* Number of tasks boosted for expedited GP. */ | ||
| 168 | unsigned long n_normal_boosts; | ||
| 169 | /* Number of tasks boosted for normal GP. */ | ||
| 170 | unsigned long n_balk_blkd_tasks; | ||
| 171 | /* Refused to boost: no blocked tasks. */ | ||
| 172 | unsigned long n_balk_exp_gp_tasks; | ||
| 173 | /* Refused to boost: nothing blocking GP. */ | ||
| 174 | unsigned long n_balk_boost_tasks; | ||
| 175 | /* Refused to boost: already boosting. */ | ||
| 176 | unsigned long n_balk_notblocked; | ||
| 177 | /* Refused to boost: RCU RS CS still running. */ | ||
| 178 | unsigned long n_balk_notyet; | ||
| 179 | /* Refused to boost: not yet time. */ | ||
| 180 | unsigned long n_balk_nos; | ||
| 181 | /* Refused to boost: not sure why, though. */ | ||
| 182 | /* This can happen due to race conditions. */ | ||
| 183 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 184 | struct task_struct *node_kthread_task; | ||
| 185 | /* kthread that takes care of this rcu_node */ | ||
| 186 | /* structure, for example, awakening the */ | ||
| 187 | /* per-CPU kthreads as needed. */ | ||
| 188 | unsigned int node_kthread_status; | ||
| 189 | /* State of node_kthread_task for tracing. */ | ||
| 130 | } ____cacheline_internodealigned_in_smp; | 190 | } ____cacheline_internodealigned_in_smp; |
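With the single ->blkd_tasks list and the ->gp_tasks/->exp_tasks/->boost_tasks pointers into it, asking whether any reader still blocks the current grace period reduces to a NULL test. A minimal sketch (not part of the patch; the helper name is hypothetical, the real check is rcu_preempt_blocked_readers_cgp(), declared later in this header):

static int example_blocked_readers_cgp(struct rcu_node *rnp)
{
        return rnp->gp_tasks != NULL;   /* first task blocking the current GP, if any */
}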
| 131 | 191 | ||
| 132 | /* | 192 | /* |
| @@ -175,7 +235,7 @@ struct rcu_data { | |||
| 175 | bool passed_quiesc; /* User-mode/idle loop etc. */ | 235 | bool passed_quiesc; /* User-mode/idle loop etc. */ |
| 176 | bool qs_pending; /* Core waits for quiesc state. */ | 236 | bool qs_pending; /* Core waits for quiesc state. */ |
| 177 | bool beenonline; /* CPU online at least once. */ | 237 | bool beenonline; /* CPU online at least once. */ |
| 178 | bool preemptable; /* Preemptable RCU? */ | 238 | bool preemptible; /* Preemptible RCU? */ |
| 179 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 239 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
| 180 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 240 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
| 181 | 241 | ||
| @@ -218,7 +278,6 @@ struct rcu_data { | |||
| 218 | /* 3) dynticks interface. */ | 278 | /* 3) dynticks interface. */ |
| 219 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | 279 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ |
| 220 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | 280 | int dynticks_snap; /* Per-GP tracking for dynticks. */ |
| 221 | int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ | ||
| 222 | #endif /* #ifdef CONFIG_NO_HZ */ | 281 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 223 | 282 | ||
| 224 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | 283 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ |
| @@ -254,7 +313,6 @@ struct rcu_data { | |||
| 254 | #endif /* #else #ifdef CONFIG_NO_HZ */ | 313 | #endif /* #else #ifdef CONFIG_NO_HZ */ |
| 255 | 314 | ||
| 256 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ | 315 | #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ |
| 257 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 258 | 316 | ||
| 259 | #ifdef CONFIG_PROVE_RCU | 317 | #ifdef CONFIG_PROVE_RCU |
| 260 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | 318 | #define RCU_STALL_DELAY_DELTA (5 * HZ) |
| @@ -272,13 +330,16 @@ struct rcu_data { | |||
| 272 | /* scheduling clock irq */ | 330 | /* scheduling clock irq */ |
| 273 | /* before ratting on them. */ | 331 | /* before ratting on them. */ |
| 274 | 332 | ||
| 275 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE | 333 | #define rcu_wait(cond) \ |
| 276 | #define RCU_CPU_STALL_SUPPRESS_INIT 0 | 334 | do { \ |
| 277 | #else | 335 | for (;;) { \ |
| 278 | #define RCU_CPU_STALL_SUPPRESS_INIT 1 | 336 | set_current_state(TASK_INTERRUPTIBLE); \ |
| 279 | #endif | 337 | if (cond) \ |
| 280 | 338 | break; \ | |
| 281 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 339 | schedule(); \ |
| 340 | } \ | ||
| 341 | __set_current_state(TASK_RUNNING); \ | ||
| 342 | } while (0) | ||
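A minimal usage sketch of the new rcu_wait() helper (not part of the patch; the kthread and its have_work flag are hypothetical, and kthread_should_stop() comes from linux/kthread.h): the caller sleeps interruptibly until the condition becomes true.

static int example_rcu_kthread(void *arg)
{
        int *have_work = arg;

        while (!kthread_should_stop()) {
                rcu_wait(*have_work || kthread_should_stop());
                *have_work = 0;         /* consume the (hypothetical) unit of work */
        }
        return 0;
}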
| 282 | 343 | ||
| 283 | /* | 344 | /* |
| 284 | * RCU global state, including node hierarchy. This hierarchy is | 345 | * RCU global state, including node hierarchy. This hierarchy is |
| @@ -308,6 +369,7 @@ struct rcu_state { | |||
| 308 | /* period because */ | 369 | /* period because */ |
| 309 | /* force_quiescent_state() */ | 370 | /* force_quiescent_state() */ |
| 310 | /* was running. */ | 371 | /* was running. */ |
| 372 | u8 boost; /* Subject to priority boost. */ | ||
| 311 | unsigned long gpnum; /* Current gp number. */ | 373 | unsigned long gpnum; /* Current gp number. */ |
| 312 | unsigned long completed; /* # of last completed gp. */ | 374 | unsigned long completed; /* # of last completed gp. */ |
| 313 | 375 | ||
| @@ -325,12 +387,12 @@ struct rcu_state { | |||
| 325 | /* due to lock unavailable. */ | 387 | /* due to lock unavailable. */ |
| 326 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | 388 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ |
| 327 | /* due to no GP active. */ | 389 | /* due to no GP active. */ |
| 328 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 329 | unsigned long gp_start; /* Time at which GP started, */ | 390 | unsigned long gp_start; /* Time at which GP started, */ |
| 330 | /* but in jiffies. */ | 391 | /* but in jiffies. */ |
| 331 | unsigned long jiffies_stall; /* Time at which to check */ | 392 | unsigned long jiffies_stall; /* Time at which to check */ |
| 332 | /* for CPU stalls. */ | 393 | /* for CPU stalls. */ |
| 333 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | 394 | unsigned long gp_max; /* Maximum GP duration in */ |
| 395 | /* jiffies. */ | ||
| 334 | char *name; /* Name of structure. */ | 396 | char *name; /* Name of structure. */ |
| 335 | }; | 397 | }; |
| 336 | 398 | ||
| @@ -361,16 +423,15 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | |||
| 361 | static void rcu_bootup_announce(void); | 423 | static void rcu_bootup_announce(void); |
| 362 | long rcu_batches_completed(void); | 424 | long rcu_batches_completed(void); |
| 363 | static void rcu_preempt_note_context_switch(int cpu); | 425 | static void rcu_preempt_note_context_switch(int cpu); |
| 364 | static int rcu_preempted_readers(struct rcu_node *rnp); | 426 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); |
| 365 | #ifdef CONFIG_HOTPLUG_CPU | 427 | #ifdef CONFIG_HOTPLUG_CPU |
| 366 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 428 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
| 367 | unsigned long flags); | 429 | unsigned long flags); |
| 430 | static void rcu_stop_cpu_kthread(int cpu); | ||
| 368 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 431 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 369 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 370 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 432 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
| 371 | static void rcu_print_task_stall(struct rcu_node *rnp); | 433 | static void rcu_print_task_stall(struct rcu_node *rnp); |
| 372 | static void rcu_preempt_stall_reset(void); | 434 | static void rcu_preempt_stall_reset(void); |
| 373 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 374 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | 435 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); |
| 375 | #ifdef CONFIG_HOTPLUG_CPU | 436 | #ifdef CONFIG_HOTPLUG_CPU |
| 376 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | 437 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, |
| @@ -390,5 +451,20 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu); | |||
| 390 | static void rcu_preempt_send_cbs_to_online(void); | 451 | static void rcu_preempt_send_cbs_to_online(void); |
| 391 | static void __init __rcu_init_preempt(void); | 452 | static void __init __rcu_init_preempt(void); |
| 392 | static void rcu_needs_cpu_flush(void); | 453 | static void rcu_needs_cpu_flush(void); |
| 454 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | ||
| 455 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | ||
| 456 | static void invoke_rcu_callbacks_kthread(void); | ||
| 457 | #ifdef CONFIG_RCU_BOOST | ||
| 458 | static void rcu_preempt_do_callbacks(void); | ||
| 459 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
| 460 | cpumask_var_t cm); | ||
| 461 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
| 462 | struct rcu_node *rnp, | ||
| 463 | int rnp_index); | ||
| 464 | static void invoke_rcu_node_kthread(struct rcu_node *rnp); | ||
| 465 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | ||
| 466 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 467 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | ||
| 468 | static void __cpuinit rcu_prepare_kthreads(int cpu); | ||
| 393 | 469 | ||
| 394 | #endif /* #ifndef RCU_TREE_NONCORE */ | 470 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a3638710dc67..8aafbb80b8b0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) |
| 3 | * Internal non-public definitions that provide either classic | 3 | * Internal non-public definitions that provide either classic |
| 4 | * or preemptable semantics. | 4 | * or preemptible semantics. |
| 5 | * | 5 | * |
| 6 | * This program is free software; you can redistribute it and/or modify | 6 | * This program is free software; you can redistribute it and/or modify |
| 7 | * it under the terms of the GNU General Public License as published by | 7 | * it under the terms of the GNU General Public License as published by |
| @@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | 54 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE |
| 55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); | 55 | printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); |
| 56 | #endif | 56 | #endif |
| 57 | #ifndef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 58 | printk(KERN_INFO | ||
| 59 | "\tRCU-based detection of stalled CPUs is disabled.\n"); | ||
| 60 | #endif | ||
| 61 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | 57 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) |
| 62 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); | 58 | printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); |
| 63 | #endif | 59 | #endif |
| @@ -70,7 +66,9 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 70 | 66 | ||
| 71 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); | 67 | struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); |
| 72 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 68 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
| 69 | static struct rcu_state *rcu_state = &rcu_preempt_state; | ||
| 73 | 70 | ||
| 71 | static void rcu_read_unlock_special(struct task_struct *t); | ||
| 74 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | 72 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); |
| 75 | 73 | ||
| 76 | /* | 74 | /* |
| @@ -78,7 +76,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp); | |||
| 78 | */ | 76 | */ |
| 79 | static void __init rcu_bootup_announce(void) | 77 | static void __init rcu_bootup_announce(void) |
| 80 | { | 78 | { |
| 81 | printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n"); | 79 | printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); |
| 82 | rcu_bootup_announce_oddness(); | 80 | rcu_bootup_announce_oddness(); |
| 83 | } | 81 | } |
| 84 | 82 | ||
| @@ -111,7 +109,7 @@ void rcu_force_quiescent_state(void) | |||
| 111 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 109 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
| 112 | 110 | ||
| 113 | /* | 111 | /* |
| 114 | * Record a preemptable-RCU quiescent state for the specified CPU. Note | 112 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
| 115 | * that this just means that the task currently running on the CPU is | 113 | * that this just means that the task currently running on the CPU is |
| 116 | * not in a quiescent state. There might be any number of tasks blocked | 114 | * not in a quiescent state. There might be any number of tasks blocked |
| 117 | * while in an RCU read-side critical section. | 115 | * while in an RCU read-side critical section. |
| @@ -134,12 +132,12 @@ static void rcu_preempt_qs(int cpu) | |||
| 134 | * We have entered the scheduler, and the current task might soon be | 132 | * We have entered the scheduler, and the current task might soon be |
| 135 | * context-switched away from. If this task is in an RCU read-side | 133 | * context-switched away from. If this task is in an RCU read-side |
| 136 | * critical section, we will no longer be able to rely on the CPU to | 134 | * critical section, we will no longer be able to rely on the CPU to |
| 137 | * record that fact, so we enqueue the task on the appropriate entry | 135 | * record that fact, so we enqueue the task on the blkd_tasks list. |
| 138 | * of the blocked_tasks[] array. The task will dequeue itself when | 136 | * The task will dequeue itself when it exits the outermost enclosing |
| 139 | * it exits the outermost enclosing RCU read-side critical section. | 137 | * RCU read-side critical section. Therefore, the current grace period |
| 140 | * Therefore, the current grace period cannot be permitted to complete | 138 | * cannot be permitted to complete until the blkd_tasks list entries |
| 141 | * until the blocked_tasks[] entry indexed by the low-order bit of | 139 | * predating the current grace period drain, in other words, until |
| 142 | * rnp->gpnum empties. | 140 | * rnp->gp_tasks becomes NULL. |
| 143 | * | 141 | * |
| 144 | * Caller must disable preemption. | 142 | * Caller must disable preemption. |
| 145 | */ | 143 | */ |
| @@ -147,11 +145,10 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 147 | { | 145 | { |
| 148 | struct task_struct *t = current; | 146 | struct task_struct *t = current; |
| 149 | unsigned long flags; | 147 | unsigned long flags; |
| 150 | int phase; | ||
| 151 | struct rcu_data *rdp; | 148 | struct rcu_data *rdp; |
| 152 | struct rcu_node *rnp; | 149 | struct rcu_node *rnp; |
| 153 | 150 | ||
| 154 | if (t->rcu_read_lock_nesting && | 151 | if (t->rcu_read_lock_nesting > 0 && |
| 155 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | 152 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { |
| 156 | 153 | ||
| 157 | /* Possibly blocking in an RCU read-side critical section. */ | 154 | /* Possibly blocking in an RCU read-side critical section. */ |
| @@ -169,16 +166,39 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 169 | * (i.e., this CPU has not yet passed through a quiescent | 166 | * (i.e., this CPU has not yet passed through a quiescent |
| 170 | * state for the current grace period), then as long | 167 | * state for the current grace period), then as long |
| 171 | * as that task remains queued, the current grace period | 168 | * as that task remains queued, the current grace period |
| 172 | * cannot end. | 169 | * cannot end. Note that there is some uncertainty as |
| 170 | * to exactly when the current grace period started. | ||
| 171 | * We take a conservative approach, which can result | ||
| 172 | * in unnecessarily waiting on tasks that started very | ||
| 173 | * slightly after the current grace period began. C'est | ||
| 174 | * la vie!!! | ||
| 173 | * | 175 | * |
| 174 | * But first, note that the current CPU must still be | 176 | * But first, note that the current CPU must still be |
| 175 | * on line! | 177 | * on line! |
| 176 | */ | 178 | */ |
| 177 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | 179 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); |
| 178 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | 180 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); |
| 179 | phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; | 181 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { |
| 180 | list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); | 182 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); |
| 183 | rnp->gp_tasks = &t->rcu_node_entry; | ||
| 184 | #ifdef CONFIG_RCU_BOOST | ||
| 185 | if (rnp->boost_tasks != NULL) | ||
| 186 | rnp->boost_tasks = rnp->gp_tasks; | ||
| 187 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 188 | } else { | ||
| 189 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
| 190 | if (rnp->qsmask & rdp->grpmask) | ||
| 191 | rnp->gp_tasks = &t->rcu_node_entry; | ||
| 192 | } | ||
| 181 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 193 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 194 | } else if (t->rcu_read_lock_nesting < 0 && | ||
| 195 | t->rcu_read_unlock_special) { | ||
| 196 | |||
| 197 | /* | ||
| 198 | * Complete exit from RCU read-side critical section on | ||
| 199 | * behalf of preempted instance of __rcu_read_unlock(). | ||
| 200 | */ | ||
| 201 | rcu_read_unlock_special(t); | ||
| 182 | } | 202 | } |
| 183 | 203 | ||
| 184 | /* | 204 | /* |
| @@ -196,7 +216,7 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 196 | } | 216 | } |
| 197 | 217 | ||
| 198 | /* | 218 | /* |
| 199 | * Tree-preemptable RCU implementation for rcu_read_lock(). | 219 | * Tree-preemptible RCU implementation for rcu_read_lock(). |
| 200 | * Just increment ->rcu_read_lock_nesting, shared state will be updated | 220 | * Just increment ->rcu_read_lock_nesting, shared state will be updated |
| 201 | * if we block. | 221 | * if we block. |
| 202 | */ | 222 | */ |
| @@ -212,12 +232,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); | |||
| 212 | * for the specified rcu_node structure. If the caller needs a reliable | 232 | * for the specified rcu_node structure. If the caller needs a reliable |
| 213 | * answer, it must hold the rcu_node's ->lock. | 233 | * answer, it must hold the rcu_node's ->lock. |
| 214 | */ | 234 | */ |
| 215 | static int rcu_preempted_readers(struct rcu_node *rnp) | 235 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
| 216 | { | 236 | { |
| 217 | int phase = rnp->gpnum & 0x1; | 237 | return rnp->gp_tasks != NULL; |
| 218 | |||
| 219 | return !list_empty(&rnp->blocked_tasks[phase]) || | ||
| 220 | !list_empty(&rnp->blocked_tasks[phase + 2]); | ||
| 221 | } | 238 | } |
| 222 | 239 | ||
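With the single ->blkd_tasks list, rcu_preempt_blocked_readers_cgp() above reduces to a NULL check because the readers blocking the current grace period are exactly the entries from *->gp_tasks to the tail of the list; entries nearer the head were queued too late to block it. A minimal sketch, not part of the patch and with an invented helper name, of walking that segment (the reworked stall-warning helpers later in this file perform the same traversal):

        /*
         * Illustrative sketch only: count the readers still blocking the
         * current grace period, i.e. the entries from rnp->gp_tasks to the
         * tail of rnp->blkd_tasks.  The caller would need to hold rnp->lock
         * for a stable answer.
         */
        static int example_count_gp_blockers(struct rcu_node *rnp)
        {
                struct list_head *p;
                int n = 0;

                for (p = rnp->gp_tasks; p != NULL && p != &rnp->blkd_tasks;
                     p = p->next)
                        n++;
                return n;
        }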
| 223 | /* | 240 | /* |
| @@ -233,7 +250,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
| 233 | unsigned long mask; | 250 | unsigned long mask; |
| 234 | struct rcu_node *rnp_p; | 251 | struct rcu_node *rnp_p; |
| 235 | 252 | ||
| 236 | if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { | 253 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { |
| 237 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 254 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 238 | return; /* Still need more quiescent states! */ | 255 | return; /* Still need more quiescent states! */ |
| 239 | } | 256 | } |
| @@ -257,15 +274,31 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
| 257 | } | 274 | } |
| 258 | 275 | ||
| 259 | /* | 276 | /* |
| 277 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | ||
| 278 | * returning NULL if at the end of the list. | ||
| 279 | */ | ||
| 280 | static struct list_head *rcu_next_node_entry(struct task_struct *t, | ||
| 281 | struct rcu_node *rnp) | ||
| 282 | { | ||
| 283 | struct list_head *np; | ||
| 284 | |||
| 285 | np = t->rcu_node_entry.next; | ||
| 286 | if (np == &rnp->blkd_tasks) | ||
| 287 | np = NULL; | ||
| 288 | return np; | ||
| 289 | } | ||
| 290 | |||
| 291 | /* | ||
| 260 | * Handle special cases during rcu_read_unlock(), such as needing to | 292 | * Handle special cases during rcu_read_unlock(), such as needing to |
| 261 | * notify RCU core processing or task having blocked during the RCU | 293 | * notify RCU core processing or task having blocked during the RCU |
| 262 | * read-side critical section. | 294 | * read-side critical section. |
| 263 | */ | 295 | */ |
| 264 | static void rcu_read_unlock_special(struct task_struct *t) | 296 | static noinline void rcu_read_unlock_special(struct task_struct *t) |
| 265 | { | 297 | { |
| 266 | int empty; | 298 | int empty; |
| 267 | int empty_exp; | 299 | int empty_exp; |
| 268 | unsigned long flags; | 300 | unsigned long flags; |
| 301 | struct list_head *np; | ||
| 269 | struct rcu_node *rnp; | 302 | struct rcu_node *rnp; |
| 270 | int special; | 303 | int special; |
| 271 | 304 | ||
| @@ -285,7 +318,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 285 | } | 318 | } |
| 286 | 319 | ||
| 287 | /* Hardware IRQ handlers cannot block. */ | 320 | /* Hardware IRQ handlers cannot block. */ |
| 288 | if (in_irq()) { | 321 | if (in_irq() || in_serving_softirq()) { |
| 289 | local_irq_restore(flags); | 322 | local_irq_restore(flags); |
| 290 | return; | 323 | return; |
| 291 | } | 324 | } |
| @@ -306,10 +339,24 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 306 | break; | 339 | break; |
| 307 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 340 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 308 | } | 341 | } |
| 309 | empty = !rcu_preempted_readers(rnp); | 342 | empty = !rcu_preempt_blocked_readers_cgp(rnp); |
| 310 | empty_exp = !rcu_preempted_readers_exp(rnp); | 343 | empty_exp = !rcu_preempted_readers_exp(rnp); |
| 311 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | 344 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ |
| 345 | np = rcu_next_node_entry(t, rnp); | ||
| 312 | list_del_init(&t->rcu_node_entry); | 346 | list_del_init(&t->rcu_node_entry); |
| 347 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
| 348 | rnp->gp_tasks = np; | ||
| 349 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
| 350 | rnp->exp_tasks = np; | ||
| 351 | #ifdef CONFIG_RCU_BOOST | ||
| 352 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
| 353 | rnp->boost_tasks = np; | ||
| 354 | /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ | ||
| 355 | if (t->rcu_boosted) { | ||
| 356 | special |= RCU_READ_UNLOCK_BOOSTED; | ||
| 357 | t->rcu_boosted = 0; | ||
| 358 | } | ||
| 359 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 313 | t->rcu_blocked_node = NULL; | 360 | t->rcu_blocked_node = NULL; |
| 314 | 361 | ||
| 315 | /* | 362 | /* |
| @@ -322,6 +369,14 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 322 | else | 369 | else |
| 323 | rcu_report_unblock_qs_rnp(rnp, flags); | 370 | rcu_report_unblock_qs_rnp(rnp, flags); |
| 324 | 371 | ||
| 372 | #ifdef CONFIG_RCU_BOOST | ||
| 373 | /* Unboost if we were boosted. */ | ||
| 374 | if (special & RCU_READ_UNLOCK_BOOSTED) { | ||
| 375 | rt_mutex_unlock(t->rcu_boost_mutex); | ||
| 376 | t->rcu_boost_mutex = NULL; | ||
| 377 | } | ||
| 378 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 379 | |||
| 325 | /* | 380 | /* |
| 326 | * If this was the last task on the expedited lists, | 381 | * If this was the last task on the expedited lists, |
| 327 | * then we need to report up the rcu_node hierarchy. | 382 | * then we need to report up the rcu_node hierarchy. |
| @@ -334,7 +389,7 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 334 | } | 389 | } |
| 335 | 390 | ||
| 336 | /* | 391 | /* |
| 337 | * Tree-preemptable RCU implementation for rcu_read_unlock(). | 392 | * Tree-preemptible RCU implementation for rcu_read_unlock(). |
| 338 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | 393 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost |
| 339 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | 394 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then |
| 340 | * invoke rcu_read_unlock_special() to clean up after a context switch | 395 | * invoke rcu_read_unlock_special() to clean up after a context switch |
| @@ -345,19 +400,26 @@ void __rcu_read_unlock(void) | |||
| 345 | struct task_struct *t = current; | 400 | struct task_struct *t = current; |
| 346 | 401 | ||
| 347 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ | 402 | barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ |
| 348 | --t->rcu_read_lock_nesting; | 403 | if (t->rcu_read_lock_nesting != 1) |
| 349 | barrier(); /* decrement before load of ->rcu_read_unlock_special */ | 404 | --t->rcu_read_lock_nesting; |
| 350 | if (t->rcu_read_lock_nesting == 0 && | 405 | else { |
| 351 | unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 406 | t->rcu_read_lock_nesting = INT_MIN; |
| 352 | rcu_read_unlock_special(t); | 407 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
| 408 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
| 409 | rcu_read_unlock_special(t); | ||
| 410 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
| 411 | t->rcu_read_lock_nesting = 0; | ||
| 412 | } | ||
| 353 | #ifdef CONFIG_PROVE_LOCKING | 413 | #ifdef CONFIG_PROVE_LOCKING |
| 354 | WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); | 414 | { |
| 415 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 416 | |||
| 417 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
| 418 | } | ||
| 355 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | 419 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ |
| 356 | } | 420 | } |
| 357 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | 421 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); |
| 358 | 422 | ||
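Parking the nesting count at INT_MIN while rcu_read_unlock_special() runs gives any interrupt or preemption landing in that window an unambiguous signal: a positive count means "inside a reader", a large negative count means "outermost unlock already in progress". The new rcu_read_lock_nesting < 0 branch added to rcu_preempt_note_context_switch() earlier in this file relies on exactly that to finish the special processing on behalf of a preempted unlock, and the reworked PROVE_LOCKING check now warns only on small negative counts (genuine underflow from unbalanced unlocks, e.g. -1), not on the transient INT_MIN marker. A sketch, not part of the patch, of the values the counter takes:

        /*
         * Illustrative sketch only: t->rcu_read_lock_nesting around the
         * outermost rcu_read_unlock(), as rewritten above.
         */
        static void example_nested_reader(void)
        {
                rcu_read_lock();        /* nesting: 0 -> 1                         */
                rcu_read_lock();        /* nesting: 1 -> 2                         */
                rcu_read_unlock();      /* inner unlock: plain decrement, 2 -> 1   */
                rcu_read_unlock();      /* outermost unlock:                       */
                                        /*   nesting = INT_MIN  ("in progress")    */
                                        /*   rcu_read_unlock_special() if needed   */
                                        /*   nesting = 0        (done)             */
        }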
| 359 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 360 | |||
| 361 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | 423 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE |
| 362 | 424 | ||
| 363 | /* | 425 | /* |
| @@ -367,18 +429,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock); | |||
| 367 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | 429 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) |
| 368 | { | 430 | { |
| 369 | unsigned long flags; | 431 | unsigned long flags; |
| 370 | struct list_head *lp; | ||
| 371 | int phase; | ||
| 372 | struct task_struct *t; | 432 | struct task_struct *t; |
| 373 | 433 | ||
| 374 | if (rcu_preempted_readers(rnp)) { | 434 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
| 375 | raw_spin_lock_irqsave(&rnp->lock, flags); | 435 | return; |
| 376 | phase = rnp->gpnum & 0x1; | 436 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 377 | lp = &rnp->blocked_tasks[phase]; | 437 | t = list_entry(rnp->gp_tasks, |
| 378 | list_for_each_entry(t, lp, rcu_node_entry) | 438 | struct task_struct, rcu_node_entry); |
| 379 | sched_show_task(t); | 439 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
| 380 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 440 | sched_show_task(t); |
| 381 | } | 441 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 382 | } | 442 | } |
| 383 | 443 | ||
| 384 | /* | 444 | /* |
| @@ -408,16 +468,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
| 408 | */ | 468 | */ |
| 409 | static void rcu_print_task_stall(struct rcu_node *rnp) | 469 | static void rcu_print_task_stall(struct rcu_node *rnp) |
| 410 | { | 470 | { |
| 411 | struct list_head *lp; | ||
| 412 | int phase; | ||
| 413 | struct task_struct *t; | 471 | struct task_struct *t; |
| 414 | 472 | ||
| 415 | if (rcu_preempted_readers(rnp)) { | 473 | if (!rcu_preempt_blocked_readers_cgp(rnp)) |
| 416 | phase = rnp->gpnum & 0x1; | 474 | return; |
| 417 | lp = &rnp->blocked_tasks[phase]; | 475 | t = list_entry(rnp->gp_tasks, |
| 418 | list_for_each_entry(t, lp, rcu_node_entry) | 476 | struct task_struct, rcu_node_entry); |
| 419 | printk(" P%d", t->pid); | 477 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
| 420 | } | 478 | printk(" P%d", t->pid); |
| 421 | } | 479 | } |
| 422 | 480 | ||
| 423 | /* | 481 | /* |
| @@ -430,18 +488,21 @@ static void rcu_preempt_stall_reset(void) | |||
| 430 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; | 488 | rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; |
| 431 | } | 489 | } |
| 432 | 490 | ||
| 433 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 434 | |||
| 435 | /* | 491 | /* |
| 436 | * Check that the list of blocked tasks for the newly completed grace | 492 | * Check that the list of blocked tasks for the newly completed grace |
| 437 | * period is in fact empty. It is a serious bug to complete a grace | 493 | * period is in fact empty. It is a serious bug to complete a grace |
| 438 | * period that still has RCU readers blocked! This function must be | 494 | * period that still has RCU readers blocked! This function must be |
| 439 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | 495 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock |
| 440 | * must be held by the caller. | 496 | * must be held by the caller. |
| 497 | * | ||
| 498 | * Also, if there are blocked tasks on the list, they automatically | ||
| 499 | * block the newly created grace period, so set up ->gp_tasks accordingly. | ||
| 441 | */ | 500 | */ |
| 442 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | 501 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) |
| 443 | { | 502 | { |
| 444 | WARN_ON_ONCE(rcu_preempted_readers(rnp)); | 503 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); |
| 504 | if (!list_empty(&rnp->blkd_tasks)) | ||
| 505 | rnp->gp_tasks = rnp->blkd_tasks.next; | ||
| 445 | WARN_ON_ONCE(rnp->qsmask); | 506 | WARN_ON_ONCE(rnp->qsmask); |
| 446 | } | 507 | } |
| 447 | 508 | ||
| @@ -465,50 +526,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 465 | struct rcu_node *rnp, | 526 | struct rcu_node *rnp, |
| 466 | struct rcu_data *rdp) | 527 | struct rcu_data *rdp) |
| 467 | { | 528 | { |
| 468 | int i; | ||
| 469 | struct list_head *lp; | 529 | struct list_head *lp; |
| 470 | struct list_head *lp_root; | 530 | struct list_head *lp_root; |
| 471 | int retval = 0; | 531 | int retval = 0; |
| 472 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 532 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
| 473 | struct task_struct *tp; | 533 | struct task_struct *t; |
| 474 | 534 | ||
| 475 | if (rnp == rnp_root) { | 535 | if (rnp == rnp_root) { |
| 476 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | 536 | WARN_ONCE(1, "Last CPU thought to be offlined?"); |
| 477 | return 0; /* Shouldn't happen: at least one CPU online. */ | 537 | return 0; /* Shouldn't happen: at least one CPU online. */ |
| 478 | } | 538 | } |
| 479 | WARN_ON_ONCE(rnp != rdp->mynode && | 539 | |
| 480 | (!list_empty(&rnp->blocked_tasks[0]) || | 540 | /* If we are on an internal node, complain bitterly. */ |
| 481 | !list_empty(&rnp->blocked_tasks[1]) || | 541 | WARN_ON_ONCE(rnp != rdp->mynode); |
| 482 | !list_empty(&rnp->blocked_tasks[2]) || | ||
| 483 | !list_empty(&rnp->blocked_tasks[3]))); | ||
| 484 | 542 | ||
| 485 | /* | 543 | /* |
| 486 | * Move tasks up to root rcu_node. Rely on the fact that the | 544 | * Move tasks up to root rcu_node. Don't try to get fancy for |
| 487 | * root rcu_node can be at most one ahead of the rest of the | 545 | * this corner-case operation -- just put this node's tasks |
| 488 | * rcu_nodes in terms of gp_num value. This fact allows us to | 546 | * at the head of the root node's list, and update the root node's |
| 489 | * move the blocked_tasks[] array directly, element by element. | 547 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, |
| 548 | * if non-NULL. This might result in waiting for more tasks than | ||
| 549 | * absolutely necessary, but this is a good performance/complexity | ||
| 550 | * tradeoff. | ||
| 490 | */ | 551 | */ |
| 491 | if (rcu_preempted_readers(rnp)) | 552 | if (rcu_preempt_blocked_readers_cgp(rnp)) |
| 492 | retval |= RCU_OFL_TASKS_NORM_GP; | 553 | retval |= RCU_OFL_TASKS_NORM_GP; |
| 493 | if (rcu_preempted_readers_exp(rnp)) | 554 | if (rcu_preempted_readers_exp(rnp)) |
| 494 | retval |= RCU_OFL_TASKS_EXP_GP; | 555 | retval |= RCU_OFL_TASKS_EXP_GP; |
| 495 | for (i = 0; i < 4; i++) { | 556 | lp = &rnp->blkd_tasks; |
| 496 | lp = &rnp->blocked_tasks[i]; | 557 | lp_root = &rnp_root->blkd_tasks; |
| 497 | lp_root = &rnp_root->blocked_tasks[i]; | 558 | while (!list_empty(lp)) { |
| 498 | while (!list_empty(lp)) { | 559 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); |
| 499 | tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); | 560 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
| 500 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 561 | list_del(&t->rcu_node_entry); |
| 501 | list_del(&tp->rcu_node_entry); | 562 | t->rcu_blocked_node = rnp_root; |
| 502 | tp->rcu_blocked_node = rnp_root; | 563 | list_add(&t->rcu_node_entry, lp_root); |
| 503 | list_add(&tp->rcu_node_entry, lp_root); | 564 | if (&t->rcu_node_entry == rnp->gp_tasks) |
| 504 | raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ | 565 | rnp_root->gp_tasks = rnp->gp_tasks; |
| 505 | } | 566 | if (&t->rcu_node_entry == rnp->exp_tasks) |
| 567 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
| 568 | #ifdef CONFIG_RCU_BOOST | ||
| 569 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
| 570 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
| 571 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 572 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 506 | } | 573 | } |
| 574 | |||
| 575 | #ifdef CONFIG_RCU_BOOST | ||
| 576 | /* In case root is being boosted and leaf is not. */ | ||
| 577 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 578 | if (rnp_root->boost_tasks != NULL && | ||
| 579 | rnp_root->boost_tasks != rnp_root->gp_tasks) | ||
| 580 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
| 581 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 582 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 583 | |||
| 584 | rnp->gp_tasks = NULL; | ||
| 585 | rnp->exp_tasks = NULL; | ||
| 507 | return retval; | 586 | return retval; |
| 508 | } | 587 | } |
| 509 | 588 | ||
| 510 | /* | 589 | /* |
| 511 | * Do CPU-offline processing for preemptable RCU. | 590 | * Do CPU-offline processing for preemptible RCU. |
| 512 | */ | 591 | */ |
| 513 | static void rcu_preempt_offline_cpu(int cpu) | 592 | static void rcu_preempt_offline_cpu(int cpu) |
| 514 | { | 593 | { |
| @@ -532,12 +611,13 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 532 | rcu_preempt_qs(cpu); | 611 | rcu_preempt_qs(cpu); |
| 533 | return; | 612 | return; |
| 534 | } | 613 | } |
| 535 | if (per_cpu(rcu_preempt_data, cpu).qs_pending) | 614 | if (t->rcu_read_lock_nesting > 0 && |
| 615 | per_cpu(rcu_preempt_data, cpu).qs_pending) | ||
| 536 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | 616 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; |
| 537 | } | 617 | } |
| 538 | 618 | ||
| 539 | /* | 619 | /* |
| 540 | * Process callbacks for preemptable RCU. | 620 | * Process callbacks for preemptible RCU. |
| 541 | */ | 621 | */ |
| 542 | static void rcu_preempt_process_callbacks(void) | 622 | static void rcu_preempt_process_callbacks(void) |
| 543 | { | 623 | { |
| @@ -545,8 +625,17 @@ static void rcu_preempt_process_callbacks(void) | |||
| 545 | &__get_cpu_var(rcu_preempt_data)); | 625 | &__get_cpu_var(rcu_preempt_data)); |
| 546 | } | 626 | } |
| 547 | 627 | ||
| 628 | #ifdef CONFIG_RCU_BOOST | ||
| 629 | |||
| 630 | static void rcu_preempt_do_callbacks(void) | ||
| 631 | { | ||
| 632 | rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); | ||
| 633 | } | ||
| 634 | |||
| 635 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 636 | |||
| 548 | /* | 637 | /* |
| 549 | * Queue a preemptable-RCU callback for invocation after a grace period. | 638 | * Queue a preemptible-RCU callback for invocation after a grace period. |
| 550 | */ | 639 | */ |
| 551 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | 640 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) |
| 552 | { | 641 | { |
| @@ -594,8 +683,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | |||
| 594 | */ | 683 | */ |
| 595 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | 684 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) |
| 596 | { | 685 | { |
| 597 | return !list_empty(&rnp->blocked_tasks[2]) || | 686 | return rnp->exp_tasks != NULL; |
| 598 | !list_empty(&rnp->blocked_tasks[3]); | ||
| 599 | } | 687 | } |
| 600 | 688 | ||
| 601 | /* | 689 | /* |
| @@ -630,9 +718,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 630 | 718 | ||
| 631 | raw_spin_lock_irqsave(&rnp->lock, flags); | 719 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 632 | for (;;) { | 720 | for (;;) { |
| 633 | if (!sync_rcu_preempt_exp_done(rnp)) | 721 | if (!sync_rcu_preempt_exp_done(rnp)) { |
| 722 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 634 | break; | 723 | break; |
| 724 | } | ||
| 635 | if (rnp->parent == NULL) { | 725 | if (rnp->parent == NULL) { |
| 726 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 636 | wake_up(&sync_rcu_preempt_exp_wq); | 727 | wake_up(&sync_rcu_preempt_exp_wq); |
| 637 | break; | 728 | break; |
| 638 | } | 729 | } |
| @@ -642,7 +733,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 642 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 733 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ |
| 643 | rnp->expmask &= ~mask; | 734 | rnp->expmask &= ~mask; |
| 644 | } | 735 | } |
| 645 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 646 | } | 736 | } |
| 647 | 737 | ||
| 648 | /* | 738 | /* |
| @@ -655,13 +745,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 655 | static void | 745 | static void |
| 656 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | 746 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) |
| 657 | { | 747 | { |
| 658 | int must_wait; | 748 | unsigned long flags; |
| 749 | int must_wait = 0; | ||
| 659 | 750 | ||
| 660 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 751 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 661 | list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); | 752 | if (list_empty(&rnp->blkd_tasks)) |
| 662 | list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); | 753 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 663 | must_wait = rcu_preempted_readers_exp(rnp); | 754 | else { |
| 664 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 755 | rnp->exp_tasks = rnp->blkd_tasks.next; |
| 756 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
| 757 | must_wait = 1; | ||
| 758 | } | ||
| 665 | if (!must_wait) | 759 | if (!must_wait) |
| 666 | rcu_report_exp_rnp(rsp, rnp); | 760 | rcu_report_exp_rnp(rsp, rnp); |
| 667 | } | 761 | } |
| @@ -669,9 +763,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 669 | /* | 763 | /* |
| 670 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea | 764 | * Wait for an rcu-preempt grace period, but expedite it. The basic idea |
| 671 | * is to invoke synchronize_sched_expedited() to push all the tasks to | 765 | * is to invoke synchronize_sched_expedited() to push all the tasks to |
| 672 | * the ->blocked_tasks[] lists, move all entries from the first set of | 766 | * the ->blkd_tasks lists and wait for this list to drain. |
| 673 | * ->blocked_tasks[] lists to the second set, and finally wait for this | ||
| 674 | * second set to drain. | ||
| 675 | */ | 767 | */ |
| 676 | void synchronize_rcu_expedited(void) | 768 | void synchronize_rcu_expedited(void) |
| 677 | { | 769 | { |
| @@ -703,7 +795,7 @@ void synchronize_rcu_expedited(void) | |||
| 703 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | 795 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) |
| 704 | goto unlock_mb_ret; /* Others did our work for us. */ | 796 | goto unlock_mb_ret; /* Others did our work for us. */ |
| 705 | 797 | ||
| 706 | /* force all RCU readers onto blocked_tasks[]. */ | 798 | /* force all RCU readers onto ->blkd_tasks lists. */ |
| 707 | synchronize_sched_expedited(); | 799 | synchronize_sched_expedited(); |
| 708 | 800 | ||
| 709 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 801 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| @@ -715,7 +807,7 @@ void synchronize_rcu_expedited(void) | |||
| 715 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 807 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
| 716 | } | 808 | } |
| 717 | 809 | ||
| 718 | /* Snapshot current state of ->blocked_tasks[] lists. */ | 810 | /* Snapshot current state of ->blkd_tasks lists. */ |
| 719 | rcu_for_each_leaf_node(rsp, rnp) | 811 | rcu_for_each_leaf_node(rsp, rnp) |
| 720 | sync_rcu_preempt_exp_init(rsp, rnp); | 812 | sync_rcu_preempt_exp_init(rsp, rnp); |
| 721 | if (NUM_RCU_NODES > 1) | 813 | if (NUM_RCU_NODES > 1) |
| @@ -723,7 +815,7 @@ void synchronize_rcu_expedited(void) | |||
| 723 | 815 | ||
| 724 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 816 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); |
| 725 | 817 | ||
| 726 | /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ | 818 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
| 727 | rnp = rcu_get_root(rsp); | 819 | rnp = rcu_get_root(rsp); |
| 728 | wait_event(sync_rcu_preempt_exp_wq, | 820 | wait_event(sync_rcu_preempt_exp_wq, |
| 729 | sync_rcu_preempt_exp_done(rnp)); | 821 | sync_rcu_preempt_exp_done(rnp)); |
| @@ -739,7 +831,7 @@ mb_ret: | |||
| 739 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | 831 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); |
| 740 | 832 | ||
| 741 | /* | 833 | /* |
| 742 | * Check to see if there is any immediate preemptable-RCU-related work | 834 | * Check to see if there is any immediate preemptible-RCU-related work |
| 743 | * to be done. | 835 | * to be done. |
| 744 | */ | 836 | */ |
| 745 | static int rcu_preempt_pending(int cpu) | 837 | static int rcu_preempt_pending(int cpu) |
| @@ -749,7 +841,7 @@ static int rcu_preempt_pending(int cpu) | |||
| 749 | } | 841 | } |
| 750 | 842 | ||
| 751 | /* | 843 | /* |
| 752 | * Does preemptable RCU need the CPU to stay out of dynticks mode? | 844 | * Does preemptible RCU need the CPU to stay out of dynticks mode? |
| 753 | */ | 845 | */ |
| 754 | static int rcu_preempt_needs_cpu(int cpu) | 846 | static int rcu_preempt_needs_cpu(int cpu) |
| 755 | { | 847 | { |
| @@ -766,7 +858,7 @@ void rcu_barrier(void) | |||
| 766 | EXPORT_SYMBOL_GPL(rcu_barrier); | 858 | EXPORT_SYMBOL_GPL(rcu_barrier); |
| 767 | 859 | ||
| 768 | /* | 860 | /* |
| 769 | * Initialize preemptable RCU's per-CPU data. | 861 | * Initialize preemptible RCU's per-CPU data. |
| 770 | */ | 862 | */ |
| 771 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 863 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
| 772 | { | 864 | { |
| @@ -774,7 +866,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
| 774 | } | 866 | } |
| 775 | 867 | ||
| 776 | /* | 868 | /* |
| 777 | * Move preemptable RCU's callbacks from dying CPU to other online CPU. | 869 | * Move preemptible RCU's callbacks from dying CPU to other online CPU. |
| 778 | */ | 870 | */ |
| 779 | static void rcu_preempt_send_cbs_to_online(void) | 871 | static void rcu_preempt_send_cbs_to_online(void) |
| 780 | { | 872 | { |
| @@ -782,7 +874,7 @@ static void rcu_preempt_send_cbs_to_online(void) | |||
| 782 | } | 874 | } |
| 783 | 875 | ||
| 784 | /* | 876 | /* |
| 785 | * Initialize preemptable RCU's state structures. | 877 | * Initialize preemptible RCU's state structures. |
| 786 | */ | 878 | */ |
| 787 | static void __init __rcu_init_preempt(void) | 879 | static void __init __rcu_init_preempt(void) |
| 788 | { | 880 | { |
| @@ -790,7 +882,7 @@ static void __init __rcu_init_preempt(void) | |||
| 790 | } | 882 | } |
| 791 | 883 | ||
| 792 | /* | 884 | /* |
| 793 | * Check for a task exiting while in a preemptable-RCU read-side | 885 | * Check for a task exiting while in a preemptible-RCU read-side |
| 794 | * critical section, clean up if so. No need to issue warnings, | 886 | * critical section, clean up if so. No need to issue warnings, |
| 795 | * as debug_check_no_locks_held() already does this if lockdep | 887 | * as debug_check_no_locks_held() already does this if lockdep |
| 796 | * is enabled. | 888 | * is enabled. |
| @@ -802,11 +894,13 @@ void exit_rcu(void) | |||
| 802 | if (t->rcu_read_lock_nesting == 0) | 894 | if (t->rcu_read_lock_nesting == 0) |
| 803 | return; | 895 | return; |
| 804 | t->rcu_read_lock_nesting = 1; | 896 | t->rcu_read_lock_nesting = 1; |
| 805 | rcu_read_unlock(); | 897 | __rcu_read_unlock(); |
| 806 | } | 898 | } |
| 807 | 899 | ||
| 808 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 900 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 809 | 901 | ||
| 902 | static struct rcu_state *rcu_state = &rcu_sched_state; | ||
| 903 | |||
| 810 | /* | 904 | /* |
| 811 | * Tell them what RCU they are running. | 905 | * Tell them what RCU they are running. |
| 812 | */ | 906 | */ |
| @@ -836,7 +930,7 @@ void rcu_force_quiescent_state(void) | |||
| 836 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 930 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
| 837 | 931 | ||
| 838 | /* | 932 | /* |
| 839 | * Because preemptable RCU does not exist, we never have to check for | 933 | * Because preemptible RCU does not exist, we never have to check for |
| 840 | * CPUs being in quiescent states. | 934 | * CPUs being in quiescent states. |
| 841 | */ | 935 | */ |
| 842 | static void rcu_preempt_note_context_switch(int cpu) | 936 | static void rcu_preempt_note_context_switch(int cpu) |
| @@ -844,10 +938,10 @@ static void rcu_preempt_note_context_switch(int cpu) | |||
| 844 | } | 938 | } |
| 845 | 939 | ||
| 846 | /* | 940 | /* |
| 847 | * Because preemptable RCU does not exist, there are never any preempted | 941 | * Because preemptible RCU does not exist, there are never any preempted |
| 848 | * RCU readers. | 942 | * RCU readers. |
| 849 | */ | 943 | */ |
| 850 | static int rcu_preempted_readers(struct rcu_node *rnp) | 944 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) |
| 851 | { | 945 | { |
| 852 | return 0; | 946 | return 0; |
| 853 | } | 947 | } |
| @@ -862,10 +956,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | |||
| 862 | 956 | ||
| 863 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 957 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 864 | 958 | ||
| 865 | #ifdef CONFIG_RCU_CPU_STALL_DETECTOR | ||
| 866 | |||
| 867 | /* | 959 | /* |
| 868 | * Because preemptable RCU does not exist, we never have to check for | 960 | * Because preemptible RCU does not exist, we never have to check for |
| 869 | * tasks blocked within RCU read-side critical sections. | 961 | * tasks blocked within RCU read-side critical sections. |
| 870 | */ | 962 | */ |
| 871 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | 963 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) |
| @@ -873,7 +965,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) | |||
| 873 | } | 965 | } |
| 874 | 966 | ||
| 875 | /* | 967 | /* |
| 876 | * Because preemptable RCU does not exist, we never have to check for | 968 | * Because preemptible RCU does not exist, we never have to check for |
| 877 | * tasks blocked within RCU read-side critical sections. | 969 | * tasks blocked within RCU read-side critical sections. |
| 878 | */ | 970 | */ |
| 879 | static void rcu_print_task_stall(struct rcu_node *rnp) | 971 | static void rcu_print_task_stall(struct rcu_node *rnp) |
| @@ -888,10 +980,8 @@ static void rcu_preempt_stall_reset(void) | |||
| 888 | { | 980 | { |
| 889 | } | 981 | } |
| 890 | 982 | ||
| 891 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ | ||
| 892 | |||
| 893 | /* | 983 | /* |
| 894 | * Because there is no preemptable RCU, there can be no readers blocked, | 984 | * Because there is no preemptible RCU, there can be no readers blocked, |
| 895 | * so there is no need to check for blocked tasks. So check only for | 985 | * so there is no need to check for blocked tasks. So check only for |
| 896 | * bogus qsmask values. | 986 | * bogus qsmask values. |
| 897 | */ | 987 | */ |
| @@ -903,7 +993,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | |||
| 903 | #ifdef CONFIG_HOTPLUG_CPU | 993 | #ifdef CONFIG_HOTPLUG_CPU |
| 904 | 994 | ||
| 905 | /* | 995 | /* |
| 906 | * Because preemptable RCU does not exist, it never needs to migrate | 996 | * Because preemptible RCU does not exist, it never needs to migrate |
| 907 | * tasks that were blocked within RCU read-side critical sections, and | 997 | * tasks that were blocked within RCU read-side critical sections, and |
| 908 | * such non-existent tasks cannot possibly have been blocking the current | 998 | * such non-existent tasks cannot possibly have been blocking the current |
| 909 | * grace period. | 999 | * grace period. |
| @@ -916,7 +1006,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 916 | } | 1006 | } |
| 917 | 1007 | ||
| 918 | /* | 1008 | /* |
| 919 | * Because preemptable RCU does not exist, it never needs CPU-offline | 1009 | * Because preemptible RCU does not exist, it never needs CPU-offline |
| 920 | * processing. | 1010 | * processing. |
| 921 | */ | 1011 | */ |
| 922 | static void rcu_preempt_offline_cpu(int cpu) | 1012 | static void rcu_preempt_offline_cpu(int cpu) |
| @@ -926,7 +1016,7 @@ static void rcu_preempt_offline_cpu(int cpu) | |||
| 926 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1016 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 927 | 1017 | ||
| 928 | /* | 1018 | /* |
| 929 | * Because preemptable RCU does not exist, it never has any callbacks | 1019 | * Because preemptible RCU does not exist, it never has any callbacks |
| 930 | * to check. | 1020 | * to check. |
| 931 | */ | 1021 | */ |
| 932 | static void rcu_preempt_check_callbacks(int cpu) | 1022 | static void rcu_preempt_check_callbacks(int cpu) |
| @@ -934,7 +1024,7 @@ static void rcu_preempt_check_callbacks(int cpu) | |||
| 934 | } | 1024 | } |
| 935 | 1025 | ||
| 936 | /* | 1026 | /* |
| 937 | * Because preemptable RCU does not exist, it never has any callbacks | 1027 | * Because preemptible RCU does not exist, it never has any callbacks |
| 938 | * to process. | 1028 | * to process. |
| 939 | */ | 1029 | */ |
| 940 | static void rcu_preempt_process_callbacks(void) | 1030 | static void rcu_preempt_process_callbacks(void) |
| @@ -943,7 +1033,7 @@ static void rcu_preempt_process_callbacks(void) | |||
| 943 | 1033 | ||
| 944 | /* | 1034 | /* |
| 945 | * Wait for an rcu-preempt grace period, but make it happen quickly. | 1035 | * Wait for an rcu-preempt grace period, but make it happen quickly. |
| 946 | * But because preemptable RCU does not exist, map to rcu-sched. | 1036 | * But because preemptible RCU does not exist, map to rcu-sched. |
| 947 | */ | 1037 | */ |
| 948 | void synchronize_rcu_expedited(void) | 1038 | void synchronize_rcu_expedited(void) |
| 949 | { | 1039 | { |
| @@ -954,7 +1044,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | |||
| 954 | #ifdef CONFIG_HOTPLUG_CPU | 1044 | #ifdef CONFIG_HOTPLUG_CPU |
| 955 | 1045 | ||
| 956 | /* | 1046 | /* |
| 957 | * Because preemptable RCU does not exist, there is never any need to | 1047 | * Because preemptible RCU does not exist, there is never any need to |
| 958 | * report on tasks preempted in RCU read-side critical sections during | 1048 | * report on tasks preempted in RCU read-side critical sections during |
| 959 | * expedited RCU grace periods. | 1049 | * expedited RCU grace periods. |
| 960 | */ | 1050 | */ |
| @@ -966,7 +1056,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) | |||
| 966 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 1056 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 967 | 1057 | ||
| 968 | /* | 1058 | /* |
| 969 | * Because preemptable RCU does not exist, it never has any work to do. | 1059 | * Because preemptible RCU does not exist, it never has any work to do. |
| 970 | */ | 1060 | */ |
| 971 | static int rcu_preempt_pending(int cpu) | 1061 | static int rcu_preempt_pending(int cpu) |
| 972 | { | 1062 | { |
| @@ -974,7 +1064,7 @@ static int rcu_preempt_pending(int cpu) | |||
| 974 | } | 1064 | } |
| 975 | 1065 | ||
| 976 | /* | 1066 | /* |
| 977 | * Because preemptable RCU does not exist, it never needs any CPU. | 1067 | * Because preemptible RCU does not exist, it never needs any CPU. |
| 978 | */ | 1068 | */ |
| 979 | static int rcu_preempt_needs_cpu(int cpu) | 1069 | static int rcu_preempt_needs_cpu(int cpu) |
| 980 | { | 1070 | { |
| @@ -982,7 +1072,7 @@ static int rcu_preempt_needs_cpu(int cpu) | |||
| 982 | } | 1072 | } |
| 983 | 1073 | ||
| 984 | /* | 1074 | /* |
| 985 | * Because preemptable RCU does not exist, rcu_barrier() is just | 1075 | * Because preemptible RCU does not exist, rcu_barrier() is just |
| 986 | * another name for rcu_barrier_sched(). | 1076 | * another name for rcu_barrier_sched(). |
| 987 | */ | 1077 | */ |
| 988 | void rcu_barrier(void) | 1078 | void rcu_barrier(void) |
| @@ -992,7 +1082,7 @@ void rcu_barrier(void) | |||
| 992 | EXPORT_SYMBOL_GPL(rcu_barrier); | 1082 | EXPORT_SYMBOL_GPL(rcu_barrier); |
| 993 | 1083 | ||
| 994 | /* | 1084 | /* |
| 995 | * Because preemptable RCU does not exist, there is no per-CPU | 1085 | * Because preemptible RCU does not exist, there is no per-CPU |
| 996 | * data to initialize. | 1086 | * data to initialize. |
| 997 | */ | 1087 | */ |
| 998 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | 1088 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu) |
| @@ -1000,14 +1090,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) | |||
| 1000 | } | 1090 | } |
| 1001 | 1091 | ||
| 1002 | /* | 1092 | /* |
| 1003 | * Because there is no preemptable RCU, there are no callbacks to move. | 1093 | * Because there is no preemptible RCU, there are no callbacks to move. |
| 1004 | */ | 1094 | */ |
| 1005 | static void rcu_preempt_send_cbs_to_online(void) | 1095 | static void rcu_preempt_send_cbs_to_online(void) |
| 1006 | { | 1096 | { |
| 1007 | } | 1097 | } |
| 1008 | 1098 | ||
| 1009 | /* | 1099 | /* |
| 1010 | * Because preemptable RCU does not exist, it need not be initialized. | 1100 | * Because preemptible RCU does not exist, it need not be initialized. |
| 1011 | */ | 1101 | */ |
| 1012 | static void __init __rcu_init_preempt(void) | 1102 | static void __init __rcu_init_preempt(void) |
| 1013 | { | 1103 | { |
| @@ -1015,6 +1105,665 @@ static void __init __rcu_init_preempt(void) | |||
| 1015 | 1105 | ||
| 1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1106 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 1017 | 1107 | ||
| 1108 | #ifdef CONFIG_RCU_BOOST | ||
| 1109 | |||
| 1110 | #include "rtmutex_common.h" | ||
| 1111 | |||
| 1112 | #ifdef CONFIG_RCU_TRACE | ||
| 1113 | |||
| 1114 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
| 1115 | { | ||
| 1116 | if (list_empty(&rnp->blkd_tasks)) | ||
| 1117 | rnp->n_balk_blkd_tasks++; | ||
| 1118 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
| 1119 | rnp->n_balk_exp_gp_tasks++; | ||
| 1120 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
| 1121 | rnp->n_balk_boost_tasks++; | ||
| 1122 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
| 1123 | rnp->n_balk_notblocked++; | ||
| 1124 | else if (rnp->gp_tasks != NULL && | ||
| 1125 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
| 1126 | rnp->n_balk_notyet++; | ||
| 1127 | else | ||
| 1128 | rnp->n_balk_nos++; | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 1132 | |||
| 1133 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
| 1134 | { | ||
| 1135 | } | ||
| 1136 | |||
| 1137 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 1138 | |||
| 1139 | /* | ||
| 1140 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | ||
| 1141 | * or ->boost_tasks, advancing the pointer to the next task in the | ||
| 1142 | * ->blkd_tasks list. | ||
| 1143 | * | ||
| 1144 | * Note that irqs must be enabled: boosting the task can block. | ||
| 1145 | * Returns 1 if there are more tasks needing to be boosted. | ||
| 1146 | */ | ||
| 1147 | static int rcu_boost(struct rcu_node *rnp) | ||
| 1148 | { | ||
| 1149 | unsigned long flags; | ||
| 1150 | struct rt_mutex mtx; | ||
| 1151 | struct task_struct *t; | ||
| 1152 | struct list_head *tb; | ||
| 1153 | |||
| 1154 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | ||
| 1155 | return 0; /* Nothing left to boost. */ | ||
| 1156 | |||
| 1157 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1158 | |||
| 1159 | /* | ||
| 1160 | * Recheck under the lock: all tasks in need of boosting | ||
| 1161 | * might exit their RCU read-side critical sections on their own. | ||
| 1162 | */ | ||
| 1163 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { | ||
| 1164 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1165 | return 0; | ||
| 1166 | } | ||
| 1167 | |||
| 1168 | /* | ||
| 1169 | * Preferentially boost tasks blocking expedited grace periods. | ||
| 1170 | * This cannot starve the normal grace periods because a second | ||
| 1171 | * expedited grace period must boost all blocked tasks, including | ||
| 1172 | * those blocking the pre-existing normal grace period. | ||
| 1173 | */ | ||
| 1174 | if (rnp->exp_tasks != NULL) { | ||
| 1175 | tb = rnp->exp_tasks; | ||
| 1176 | rnp->n_exp_boosts++; | ||
| 1177 | } else { | ||
| 1178 | tb = rnp->boost_tasks; | ||
| 1179 | rnp->n_normal_boosts++; | ||
| 1180 | } | ||
| 1181 | rnp->n_tasks_boosted++; | ||
| 1182 | |||
| 1183 | /* | ||
| 1184 | * We boost task t by manufacturing an rt_mutex that appears to | ||
| 1185 | * be held by task t. We leave a pointer to that rt_mutex where | ||
| 1186 | * task t can find it, and task t will release the mutex when it | ||
| 1187 | * exits its outermost RCU read-side critical section. Then | ||
| 1188 | * simply acquiring this artificial rt_mutex will boost task | ||
| 1189 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
| 1190 | * | ||
| 1191 | * Note that task t must acquire rnp->lock to remove itself from | ||
| 1192 | * the ->blkd_tasks list, which it will do from exit() if from | ||
| 1193 | * nowhere else. We therefore are guaranteed that task t will | ||
| 1194 | * stay around at least until we drop rnp->lock. Note that | ||
| 1195 | * rnp->lock also resolves races between our priority boosting | ||
| 1196 | * and task t's exiting its outermost RCU read-side critical | ||
| 1197 | * section. | ||
| 1198 | */ | ||
| 1199 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
| 1200 | rt_mutex_init_proxy_locked(&mtx, t); | ||
| 1201 | t->rcu_boost_mutex = &mtx; | ||
| 1202 | t->rcu_boosted = 1; | ||
| 1203 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1204 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | ||
| 1205 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
| 1206 | |||
| 1207 | return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; | ||
| 1208 | } | ||
| 1209 | |||
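The boosting trick itself is small: rcu_boost() manufactures an on-stack rt_mutex that appears to be already held by the chosen reader (rt_mutex_init_proxy_locked()), publishes it in t->rcu_boost_mutex, and then blocks on it, so ordinary priority inheritance raises the reader's priority; the reader drops the mutex from rcu_read_unlock_special() when it leaves its outermost read-side critical section. Below is a compressed sketch of just that hand-off, with the rnp->lock handling, list bookkeeping, and trace counters of the real function elided and the helper name invented.

        /* Illustrative sketch only: the proxy-lock hand-off at the core of rcu_boost(). */
        static void example_boost_one_reader(struct task_struct *t)
        {
                struct rt_mutex mtx;                 /* on-stack is safe: we sleep below  */
                                                     /* until the reader is done with it  */

                rt_mutex_init_proxy_locked(&mtx, t); /* mtx now appears to be held by t   */
                t->rcu_boost_mutex = &mtx;           /* reader releases it from           */
                                                     /* rcu_read_unlock_special()         */
                rt_mutex_lock(&mtx);                 /* PI boosts t until it unlocks      */
                rt_mutex_unlock(&mtx);               /* keep lockdep happy                */
        }

The on-stack mutex works because the booster sleeps in rt_mutex_lock() until the boosted reader has released it, so the stack frame cannot go away underneath the reader.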
| 1210 | /* | ||
| 1211 | * Timer handler to initiate waking up of boost kthreads that | ||
| 1212 | * have yielded the CPU due to excessive numbers of tasks to | ||
| 1213 | * boost. We wake up the per-rcu_node kthread, which in turn | ||
| 1214 | * will wake up the booster kthread. | ||
| 1215 | */ | ||
| 1216 | static void rcu_boost_kthread_timer(unsigned long arg) | ||
| 1217 | { | ||
| 1218 | invoke_rcu_node_kthread((struct rcu_node *)arg); | ||
| 1219 | } | ||
| 1220 | |||
| 1221 | /* | ||
| 1222 | * Priority-boosting kthread. One per leaf rcu_node and one for the | ||
| 1223 | * root rcu_node. | ||
| 1224 | */ | ||
| 1225 | static int rcu_boost_kthread(void *arg) | ||
| 1226 | { | ||
| 1227 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
| 1228 | int spincnt = 0; | ||
| 1229 | int more2boost; | ||
| 1230 | |||
| 1231 | for (;;) { | ||
| 1232 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | ||
| 1233 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | ||
| 1234 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | ||
| 1235 | more2boost = rcu_boost(rnp); | ||
| 1236 | if (more2boost) | ||
| 1237 | spincnt++; | ||
| 1238 | else | ||
| 1239 | spincnt = 0; | ||
| 1240 | if (spincnt > 10) { | ||
| 1241 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | ||
| 1242 | spincnt = 0; | ||
| 1243 | } | ||
| 1244 | } | ||
| 1245 | /* NOTREACHED */ | ||
| 1246 | return 0; | ||
| 1247 | } | ||
| 1248 | |||
| 1249 | /* | ||
| 1250 | * Check to see if it is time to start boosting RCU readers that are | ||
| 1251 | * blocking the current grace period, and, if so, tell the per-rcu_node | ||
| 1252 | * kthread to start boosting them. If there is an expedited grace | ||
| 1253 | * period in progress, it is always time to boost. | ||
| 1254 | * | ||
| 1255 | * The caller must hold rnp->lock, which this function releases, | ||
| 1256 | * but irqs remain disabled. The ->boost_kthread_task is immortal, | ||
| 1257 | * so we don't need to worry about it going away. | ||
| 1258 | */ | ||
| 1259 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
| 1260 | { | ||
| 1261 | struct task_struct *t; | ||
| 1262 | |||
| 1263 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | ||
| 1264 | rnp->n_balk_exp_gp_tasks++; | ||
| 1265 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1266 | return; | ||
| 1267 | } | ||
| 1268 | if (rnp->exp_tasks != NULL || | ||
| 1269 | (rnp->gp_tasks != NULL && | ||
| 1270 | rnp->boost_tasks == NULL && | ||
| 1271 | rnp->qsmask == 0 && | ||
| 1272 | ULONG_CMP_GE(jiffies, rnp->boost_time))) { | ||
| 1273 | if (rnp->exp_tasks == NULL) | ||
| 1274 | rnp->boost_tasks = rnp->gp_tasks; | ||
| 1275 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1276 | t = rnp->boost_kthread_task; | ||
| 1277 | if (t != NULL) | ||
| 1278 | wake_up_process(t); | ||
| 1279 | } else { | ||
| 1280 | rcu_initiate_boost_trace(rnp); | ||
| 1281 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1282 | } | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | /* | ||
| 1286 | * Wake up the per-CPU kthread to invoke RCU callbacks. | ||
| 1287 | */ | ||
| 1288 | static void invoke_rcu_callbacks_kthread(void) | ||
| 1289 | { | ||
| 1290 | unsigned long flags; | ||
| 1291 | |||
| 1292 | local_irq_save(flags); | ||
| 1293 | __this_cpu_write(rcu_cpu_has_work, 1); | ||
| 1294 | if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { | ||
| 1295 | local_irq_restore(flags); | ||
| 1296 | return; | ||
| 1297 | } | ||
| 1298 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | ||
| 1299 | local_irq_restore(flags); | ||
| 1300 | } | ||
| 1301 | |||
| 1302 | /* | ||
| 1303 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | ||
| 1304 | * held, so no one should be messing with the existence of the boost | ||
| 1305 | * kthread. | ||
| 1306 | */ | ||
| 1307 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
| 1308 | cpumask_var_t cm) | ||
| 1309 | { | ||
| 1310 | struct task_struct *t; | ||
| 1311 | |||
| 1312 | t = rnp->boost_kthread_task; | ||
| 1313 | if (t != NULL) | ||
| 1314 | set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); | ||
| 1315 | } | ||
| 1316 | |||
| 1317 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
| 1318 | |||
| 1319 | /* | ||
| 1320 | * Do priority-boost accounting for the start of a new grace period. | ||
| 1321 | */ | ||
| 1322 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
| 1323 | { | ||
| 1324 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
| 1325 | } | ||
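
RCU_BOOST_DELAY_JIFFIES converts the millisecond Kconfig value into jiffies, rounding up so that integer division never shortens the configured delay. A small stand-alone illustration of the arithmetic (the macro body is reproduced from include/linux/kernel.h; the delay and HZ values are made-up examples):

    #include <stdio.h>

    /* DIV_ROUND_UP as defined in include/linux/kernel.h. */
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
            /* Example only: a 500 ms boost delay on a HZ=250 kernel. */
            unsigned long delay_ms = 500, hz = 250;

            /* 500 * 250 = 125000; 125000 / 1000 = 125 jiffies exactly. */
            printf("boost delay = %lu jiffies\n",
                   DIV_ROUND_UP(delay_ms * hz, 1000));

            /* A 1 ms delay on HZ=250 still rounds up to one full jiffy. */
            printf("minimum delay = %lu jiffies\n",
                   DIV_ROUND_UP(1UL * hz, 1000));
            return 0;
    }

With these numbers the first line prints 125 and the second prints 1, which is the point of rounding up: a non-zero configured delay never collapses to zero jiffies.
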
| 1326 | |||
| 1327 | /* | ||
| 1328 | * Create an RCU-boost kthread for the specified node if one does not | ||
| 1329 | * already exist. We only create this kthread for preemptible RCU. | ||
| 1330 | * Returns zero if all is well, a negated errno otherwise. | ||
| 1331 | */ | ||
| 1332 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
| 1333 | struct rcu_node *rnp, | ||
| 1334 | int rnp_index) | ||
| 1335 | { | ||
| 1336 | unsigned long flags; | ||
| 1337 | struct sched_param sp; | ||
| 1338 | struct task_struct *t; | ||
| 1339 | |||
| 1340 | if (&rcu_preempt_state != rsp) | ||
| 1341 | return 0; | ||
| 1342 | rsp->boost = 1; | ||
| 1343 | if (rnp->boost_kthread_task != NULL) | ||
| 1344 | return 0; | ||
| 1345 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | ||
| 1346 | "rcub%d", rnp_index); | ||
| 1347 | if (IS_ERR(t)) | ||
| 1348 | return PTR_ERR(t); | ||
| 1349 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1350 | rnp->boost_kthread_task = t; | ||
| 1351 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1352 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1353 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1354 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
| 1355 | return 0; | ||
| 1356 | } | ||
| 1357 | |||
| 1358 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1359 | |||
| 1360 | /* | ||
| 1361 | * Stop RCU's per-CPU kthread when its CPU goes offline. | ||
| 1362 | */ | ||
| 1363 | static void rcu_stop_cpu_kthread(int cpu) | ||
| 1364 | { | ||
| 1365 | struct task_struct *t; | ||
| 1366 | |||
| 1367 | /* Stop the CPU's kthread. */ | ||
| 1368 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
| 1369 | if (t != NULL) { | ||
| 1370 | per_cpu(rcu_cpu_kthread_task, cpu) = NULL; | ||
| 1371 | kthread_stop(t); | ||
| 1372 | } | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1376 | |||
| 1377 | static void rcu_kthread_do_work(void) | ||
| 1378 | { | ||
| 1379 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | ||
| 1380 | rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); | ||
| 1381 | rcu_preempt_do_callbacks(); | ||
| 1382 | } | ||
| 1383 | |||
| 1384 | /* | ||
| 1385 | * Wake up the specified per-rcu_node-structure kthread. | ||
| 1386 | * Because the per-rcu_node kthreads are immortal, we don't need | ||
| 1387 | * to do anything to keep them alive. | ||
| 1388 | */ | ||
| 1389 | static void invoke_rcu_node_kthread(struct rcu_node *rnp) | ||
| 1390 | { | ||
| 1391 | struct task_struct *t; | ||
| 1392 | |||
| 1393 | t = rnp->node_kthread_task; | ||
| 1394 | if (t != NULL) | ||
| 1395 | wake_up_process(t); | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | /* | ||
| 1399 | * Set the specified CPU's kthread to run RT or not, as specified by | ||
| 1400 | * the to_rt argument. The CPU-hotplug locks are held, so the task | ||
| 1401 | * is not going away. | ||
| 1402 | */ | ||
| 1403 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
| 1404 | { | ||
| 1405 | int policy; | ||
| 1406 | struct sched_param sp; | ||
| 1407 | struct task_struct *t; | ||
| 1408 | |||
| 1409 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
| 1410 | if (t == NULL) | ||
| 1411 | return; | ||
| 1412 | if (to_rt) { | ||
| 1413 | policy = SCHED_FIFO; | ||
| 1414 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1415 | } else { | ||
| 1416 | policy = SCHED_NORMAL; | ||
| 1417 | sp.sched_priority = 0; | ||
| 1418 | } | ||
| 1419 | sched_setscheduler_nocheck(t, policy, &sp); | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | /* | ||
| 1423 | * Timer handler to initiate the waking up of per-CPU kthreads that | ||
| 1424 | * have yielded the CPU due to excess numbers of RCU callbacks. | ||
| 1425 | * We wake up the per-rcu_node kthread, which in turn will wake up | ||
| 1426 | * the booster kthread. | ||
| 1427 | */ | ||
| 1428 | static void rcu_cpu_kthread_timer(unsigned long arg) | ||
| 1429 | { | ||
| 1430 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); | ||
| 1431 | struct rcu_node *rnp = rdp->mynode; | ||
| 1432 | |||
| 1433 | atomic_or(rdp->grpmask, &rnp->wakemask); | ||
| 1434 | invoke_rcu_node_kthread(rnp); | ||
| 1435 | } | ||
| 1436 | |||
| 1437 | /* | ||
| 1438 | * Drop to non-real-time priority and yield, but only after posting a | ||
| 1439 | * timer that will cause us to regain our real-time priority if we | ||
| 1440 | * remain preempted. Either way, we restore our real-time priority | ||
| 1441 | * before returning. | ||
| 1442 | */ | ||
| 1443 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | ||
| 1444 | { | ||
| 1445 | struct sched_param sp; | ||
| 1446 | struct timer_list yield_timer; | ||
| 1447 | |||
| 1448 | setup_timer_on_stack(&yield_timer, f, arg); | ||
| 1449 | mod_timer(&yield_timer, jiffies + 2); | ||
| 1450 | sp.sched_priority = 0; | ||
| 1451 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | ||
| 1452 | set_user_nice(current, 19); | ||
| 1453 | schedule(); | ||
| 1454 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1455 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
| 1456 | del_timer(&yield_timer); | ||
| 1457 | } | ||
| 1458 | |||
| 1459 | /* | ||
| 1460 | * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. | ||
| 1461 | * This can happen while the corresponding CPU is either coming online | ||
| 1462 | * or going offline. We cannot wait until the CPU is fully online | ||
| 1463 | * before starting the kthread, because the various notifier functions | ||
| 1464 | * can wait for RCU grace periods. So we park rcu_cpu_kthread() until | ||
| 1465 | * the corresponding CPU is online. | ||
| 1466 | * | ||
| 1467 | * Return 1 if the kthread needs to stop, 0 otherwise. | ||
| 1468 | * | ||
| 1469 | * Caller must disable bh. This function can momentarily enable it. | ||
| 1470 | */ | ||
| 1471 | static int rcu_cpu_kthread_should_stop(int cpu) | ||
| 1472 | { | ||
| 1473 | while (cpu_is_offline(cpu) || | ||
| 1474 | !cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu)) || | ||
| 1475 | smp_processor_id() != cpu) { | ||
| 1476 | if (kthread_should_stop()) | ||
| 1477 | return 1; | ||
| 1478 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
| 1479 | per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); | ||
| 1480 | local_bh_enable(); | ||
| 1481 | schedule_timeout_uninterruptible(1); | ||
| 1482 | if (!cpumask_equal(¤t->cpus_allowed, cpumask_of(cpu))) | ||
| 1483 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 1484 | local_bh_disable(); | ||
| 1485 | } | ||
| 1486 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
| 1487 | return 0; | ||
| 1488 | } | ||
| 1489 | |||
| 1490 | /* | ||
| 1491 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | ||
| 1492 | * earlier RCU softirq. | ||
| 1493 | */ | ||
| 1494 | static int rcu_cpu_kthread(void *arg) | ||
| 1495 | { | ||
| 1496 | int cpu = (int)(long)arg; | ||
| 1497 | unsigned long flags; | ||
| 1498 | int spincnt = 0; | ||
| 1499 | unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); | ||
| 1500 | char work; | ||
| 1501 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | ||
| 1502 | |||
| 1503 | for (;;) { | ||
| 1504 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1505 | rcu_wait(*workp != 0 || kthread_should_stop()); | ||
| 1506 | local_bh_disable(); | ||
| 1507 | if (rcu_cpu_kthread_should_stop(cpu)) { | ||
| 1508 | local_bh_enable(); | ||
| 1509 | break; | ||
| 1510 | } | ||
| 1511 | *statusp = RCU_KTHREAD_RUNNING; | ||
| 1512 | per_cpu(rcu_cpu_kthread_loops, cpu)++; | ||
| 1513 | local_irq_save(flags); | ||
| 1514 | work = *workp; | ||
| 1515 | *workp = 0; | ||
| 1516 | local_irq_restore(flags); | ||
| 1517 | if (work) | ||
| 1518 | rcu_kthread_do_work(); | ||
| 1519 | local_bh_enable(); | ||
| 1520 | if (*workp != 0) | ||
| 1521 | spincnt++; | ||
| 1522 | else | ||
| 1523 | spincnt = 0; | ||
| 1524 | if (spincnt > 10) { | ||
| 1525 | *statusp = RCU_KTHREAD_YIELDING; | ||
| 1526 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | ||
| 1527 | spincnt = 0; | ||
| 1528 | } | ||
| 1529 | } | ||
| 1530 | *statusp = RCU_KTHREAD_STOPPED; | ||
| 1531 | return 0; | ||
| 1532 | } | ||
| 1533 | |||
| 1534 | /* | ||
| 1535 | * Spawn a per-CPU kthread, setting up affinity and priority. | ||
| 1536 | * Because the CPU hotplug lock is held, no other CPU will be attempting | ||
| 1537 | * to manipulate rcu_cpu_kthread_task. There might be another CPU | ||
| 1538 | * attempting to access it during boot, but the locking in kthread_bind() | ||
| 1539 | * will enforce sufficient ordering. | ||
| 1540 | * | ||
| 1541 | * Please note that we cannot simply refuse to wake up the per-CPU | ||
| 1542 | * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, | ||
| 1543 | * which can result in softlockup complaints if the task ends up being | ||
| 1544 | * idle for more than a couple of minutes. | ||
| 1545 | * | ||
| 1546 | * However, please note also that we cannot bind the per-CPU kthread to its | ||
| 1547 | * CPU until that CPU is fully online. We also cannot wait until the | ||
| 1548 | * CPU is fully online before we create its per-CPU kthread, as this would | ||
| 1549 | * deadlock the system when CPU notifiers tried waiting for grace | ||
| 1550 | * periods. So we bind the per-CPU kthread to its CPU only if the CPU | ||
| 1551 | * is online. If its CPU is not yet fully online, then the code in | ||
| 1552 | * rcu_cpu_kthread() will wait until it is fully online, and then do | ||
| 1553 | * the binding. | ||
| 1554 | */ | ||
| 1555 | static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | ||
| 1556 | { | ||
| 1557 | struct sched_param sp; | ||
| 1558 | struct task_struct *t; | ||
| 1559 | |||
| 1560 | if (!rcu_scheduler_fully_active || | ||
| 1561 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | ||
| 1562 | return 0; | ||
| 1563 | t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); | ||
| 1564 | if (IS_ERR(t)) | ||
| 1565 | return PTR_ERR(t); | ||
| 1566 | if (cpu_online(cpu)) | ||
| 1567 | kthread_bind(t, cpu); | ||
| 1568 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
| 1569 | WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); | ||
| 1570 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1571 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1572 | per_cpu(rcu_cpu_kthread_task, cpu) = t; | ||
| 1573 | wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ | ||
| 1574 | return 0; | ||
| 1575 | } | ||
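
The spawn sequence above (create, bind only if the CPU is already online, publish the task pointer, switch to SCHED_FIFO, then one wake_up_process() so the thread can reach its own sleep loop) is a reusable recipe for per-CPU RT worker threads. A hedged, generic sketch of the same sequence using only the stock kthread and scheduler APIs; the function and priority names are placeholders, not part of this patch:

    #include <linux/kthread.h>
    #include <linux/cpumask.h>
    #include <linux/sched.h>
    #include <linux/err.h>

    /*
     * Illustrative only: spawn a per-CPU worker at RT priority and bind
     * it if the CPU is already online.  The caller is assumed to hold
     * the CPU-hotplug lock, as rcu_spawn_one_cpu_kthread() does.
     */
    static struct task_struct *spawn_bound_rt_worker(int cpu,
                                                     int (*workfn)(void *))
    {
            struct sched_param sp = { .sched_priority = 1 };  /* placeholder prio */
            struct task_struct *t;

            t = kthread_create(workfn, (void *)(long)cpu, "myworker/%d", cpu);
            if (IS_ERR(t))
                    return t;
            if (cpu_online(cpu))
                    kthread_bind(t, cpu);   /* otherwise the thread binds itself later */
            sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
            wake_up_process(t);             /* get to TASK_INTERRUPTIBLE quickly */
            return t;
    }
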
| 1576 | |||
| 1577 | /* | ||
| 1578 | * Per-rcu_node kthread, which is in charge of waking up the per-CPU | ||
| 1579 | * kthreads when needed. We ignore requests to wake up kthreads | ||
| 1580 | * for offline CPUs, which is OK because force_quiescent_state() | ||
| 1581 | * takes care of this case. | ||
| 1582 | */ | ||
| 1583 | static int rcu_node_kthread(void *arg) | ||
| 1584 | { | ||
| 1585 | int cpu; | ||
| 1586 | unsigned long flags; | ||
| 1587 | unsigned long mask; | ||
| 1588 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
| 1589 | struct sched_param sp; | ||
| 1590 | struct task_struct *t; | ||
| 1591 | |||
| 1592 | for (;;) { | ||
| 1593 | rnp->node_kthread_status = RCU_KTHREAD_WAITING; | ||
| 1594 | rcu_wait(atomic_read(&rnp->wakemask) != 0); | ||
| 1595 | rnp->node_kthread_status = RCU_KTHREAD_RUNNING; | ||
| 1596 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1597 | mask = atomic_xchg(&rnp->wakemask, 0); | ||
| 1598 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
| 1599 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { | ||
| 1600 | if ((mask & 0x1) == 0) | ||
| 1601 | continue; | ||
| 1602 | preempt_disable(); | ||
| 1603 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
| 1604 | if (!cpu_online(cpu) || t == NULL) { | ||
| 1605 | preempt_enable(); | ||
| 1606 | continue; | ||
| 1607 | } | ||
| 1608 | per_cpu(rcu_cpu_has_work, cpu) = 1; | ||
| 1609 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1610 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1611 | preempt_enable(); | ||
| 1612 | } | ||
| 1613 | } | ||
| 1614 | /* NOTREACHED */ | ||
| 1615 | rnp->node_kthread_status = RCU_KTHREAD_STOPPED; | ||
| 1616 | return 0; | ||
| 1617 | } | ||
| 1618 | |||
| 1619 | /* | ||
| 1620 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | ||
| 1621 | * served by the rcu_node in question. The CPU hotplug lock is still | ||
| 1622 | * held, so the value of rnp->qsmaskinit will be stable. | ||
| 1623 | * | ||
| 1624 | * We don't include outgoingcpu in the affinity set; use -1 if there is | ||
| 1625 | * no outgoing CPU. If there are no CPUs left in the affinity set, | ||
| 1626 | * this function allows the kthread to execute on any CPU. | ||
| 1627 | */ | ||
| 1628 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
| 1629 | { | ||
| 1630 | cpumask_var_t cm; | ||
| 1631 | int cpu; | ||
| 1632 | unsigned long mask = rnp->qsmaskinit; | ||
| 1633 | |||
| 1634 | if (rnp->node_kthread_task == NULL) | ||
| 1635 | return; | ||
| 1636 | if (!alloc_cpumask_var(&cm, GFP_KERNEL)) | ||
| 1637 | return; | ||
| 1638 | cpumask_clear(cm); | ||
| 1639 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | ||
| 1640 | if ((mask & 0x1) && cpu != outgoingcpu) | ||
| 1641 | cpumask_set_cpu(cpu, cm); | ||
| 1642 | if (cpumask_weight(cm) == 0) { | ||
| 1643 | cpumask_setall(cm); | ||
| 1644 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
| 1645 | cpumask_clear_cpu(cpu, cm); | ||
| 1646 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
| 1647 | } | ||
| 1648 | set_cpus_allowed_ptr(rnp->node_kthread_task, cm); | ||
| 1649 | rcu_boost_kthread_setaffinity(rnp, cm); | ||
| 1650 | free_cpumask_var(cm); | ||
| 1651 | } | ||
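
The loop above walks rnp->qsmaskinit one bit at a time: bit 0 corresponds to CPU rnp->grplo, bit 1 to grplo + 1, and so on up to grphi, with the outgoing CPU skipped. The same bit-walk in isolation, as a runnable user-space illustration with made-up values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long mask = 0x5;       /* hypothetical qsmaskinit: bits 0 and 2 set */
            int grplo = 4, grphi = 7;       /* node covers CPUs 4-7 */
            int outgoingcpu = 6;            /* CPU being offlined */
            int cpu;

            for (cpu = grplo; cpu <= grphi; cpu++, mask >>= 1)
                    if ((mask & 0x1) && cpu != outgoingcpu)
                            printf("CPU %d stays in the affinity set\n", cpu);
            return 0;       /* prints only CPU 4: bit 2 maps to CPU 6, which is outgoing */
    }
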
| 1652 | |||
| 1653 | /* | ||
| 1654 | * Spawn a per-rcu_node kthread, setting priority and affinity. | ||
| 1655 | * Called during boot before online/offline can happen, or, if | ||
| 1656 | * during runtime, with the main CPU-hotplug locks held. So only | ||
| 1657 | * one of these can be executing at a time. | ||
| 1658 | */ | ||
| 1659 | static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | ||
| 1660 | struct rcu_node *rnp) | ||
| 1661 | { | ||
| 1662 | unsigned long flags; | ||
| 1663 | int rnp_index = rnp - &rsp->node[0]; | ||
| 1664 | struct sched_param sp; | ||
| 1665 | struct task_struct *t; | ||
| 1666 | |||
| 1667 | if (!rcu_scheduler_fully_active || | ||
| 1668 | rnp->qsmaskinit == 0) | ||
| 1669 | return 0; | ||
| 1670 | if (rnp->node_kthread_task == NULL) { | ||
| 1671 | t = kthread_create(rcu_node_kthread, (void *)rnp, | ||
| 1672 | "rcun%d", rnp_index); | ||
| 1673 | if (IS_ERR(t)) | ||
| 1674 | return PTR_ERR(t); | ||
| 1675 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1676 | rnp->node_kthread_task = t; | ||
| 1677 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1678 | sp.sched_priority = 99; | ||
| 1679 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1680 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
| 1681 | } | ||
| 1682 | return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); | ||
| 1683 | } | ||
| 1684 | |||
| 1685 | /* | ||
| 1686 | * Spawn all kthreads -- called as soon as the scheduler is running. | ||
| 1687 | */ | ||
| 1688 | static int __init rcu_spawn_kthreads(void) | ||
| 1689 | { | ||
| 1690 | int cpu; | ||
| 1691 | struct rcu_node *rnp; | ||
| 1692 | |||
| 1693 | rcu_scheduler_fully_active = 1; | ||
| 1694 | for_each_possible_cpu(cpu) { | ||
| 1695 | per_cpu(rcu_cpu_has_work, cpu) = 0; | ||
| 1696 | if (cpu_online(cpu)) | ||
| 1697 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
| 1698 | } | ||
| 1699 | rnp = rcu_get_root(rcu_state); | ||
| 1700 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
| 1701 | if (NUM_RCU_NODES > 1) { | ||
| 1702 | rcu_for_each_leaf_node(rcu_state, rnp) | ||
| 1703 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
| 1704 | } | ||
| 1705 | return 0; | ||
| 1706 | } | ||
| 1707 | early_initcall(rcu_spawn_kthreads); | ||
| 1708 | |||
| 1709 | static void __cpuinit rcu_prepare_kthreads(int cpu) | ||
| 1710 | { | ||
| 1711 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
| 1712 | struct rcu_node *rnp = rdp->mynode; | ||
| 1713 | |||
| 1714 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | ||
| 1715 | if (rcu_scheduler_fully_active) { | ||
| 1716 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
| 1717 | if (rnp->node_kthread_task == NULL) | ||
| 1718 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
| 1719 | } | ||
| 1720 | } | ||
| 1721 | |||
| 1722 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 1723 | |||
| 1724 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
| 1725 | { | ||
| 1726 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1727 | } | ||
| 1728 | |||
| 1729 | static void invoke_rcu_callbacks_kthread(void) | ||
| 1730 | { | ||
| 1731 | WARN_ON_ONCE(1); | ||
| 1732 | } | ||
| 1733 | |||
| 1734 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
| 1735 | { | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1739 | |||
| 1740 | static void rcu_stop_cpu_kthread(int cpu) | ||
| 1741 | { | ||
| 1742 | } | ||
| 1743 | |||
| 1744 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1745 | |||
| 1746 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
| 1747 | { | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
| 1751 | { | ||
| 1752 | } | ||
| 1753 | |||
| 1754 | static int __init rcu_scheduler_really_started(void) | ||
| 1755 | { | ||
| 1756 | rcu_scheduler_fully_active = 1; | ||
| 1757 | return 0; | ||
| 1758 | } | ||
| 1759 | early_initcall(rcu_scheduler_really_started); | ||
| 1760 | |||
| 1761 | static void __cpuinit rcu_prepare_kthreads(int cpu) | ||
| 1762 | { | ||
| 1763 | } | ||
| 1764 | |||
| 1765 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
| 1766 | |||
| 1018 | #ifndef CONFIG_SMP | 1767 | #ifndef CONFIG_SMP |
| 1019 | 1768 | ||
| 1020 | void synchronize_sched_expedited(void) | 1769 | void synchronize_sched_expedited(void) |
| @@ -1187,14 +1936,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); | |||
| 1187 | * | 1936 | * |
| 1188 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | 1937 | * Because it is not legal to invoke rcu_process_callbacks() with irqs |
| 1189 | * disabled, we do one pass of force_quiescent_state(), then do a | 1938 | * disabled, we do one pass of force_quiescent_state(), then do a |
| 1190 | * raise_softirq() to cause rcu_process_callbacks() to be invoked later. | 1939 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked |
| 1191 | * The per-cpu rcu_dyntick_drain variable controls the sequencing. | 1940 | * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. |
| 1192 | */ | 1941 | */ |
| 1193 | int rcu_needs_cpu(int cpu) | 1942 | int rcu_needs_cpu(int cpu) |
| 1194 | { | 1943 | { |
| 1195 | int c = 0; | 1944 | int c = 0; |
| 1196 | int snap; | 1945 | int snap; |
| 1197 | int snap_nmi; | ||
| 1198 | int thatcpu; | 1946 | int thatcpu; |
| 1199 | 1947 | ||
| 1200 | /* Check for being in the holdoff period. */ | 1948 | /* Check for being in the holdoff period. */ |
| @@ -1205,10 +1953,10 @@ int rcu_needs_cpu(int cpu) | |||
| 1205 | for_each_online_cpu(thatcpu) { | 1953 | for_each_online_cpu(thatcpu) { |
| 1206 | if (thatcpu == cpu) | 1954 | if (thatcpu == cpu) |
| 1207 | continue; | 1955 | continue; |
| 1208 | snap = per_cpu(rcu_dynticks, thatcpu).dynticks; | 1956 | snap = atomic_add_return(0, &per_cpu(rcu_dynticks, |
| 1209 | snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; | 1957 | thatcpu).dynticks); |
| 1210 | smp_mb(); /* Order sampling of snap with end of grace period. */ | 1958 | smp_mb(); /* Order sampling of snap with end of grace period. */ |
| 1211 | if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { | 1959 | if ((snap & 0x1) != 0) { |
| 1212 | per_cpu(rcu_dyntick_drain, cpu) = 0; | 1960 | per_cpu(rcu_dyntick_drain, cpu) = 0; |
| 1213 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; | 1961 | per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; |
| 1214 | return rcu_needs_cpu_quick_check(cpu); | 1962 | return rcu_needs_cpu_quick_check(cpu); |
| @@ -1239,7 +1987,7 @@ int rcu_needs_cpu(int cpu) | |||
| 1239 | 1987 | ||
| 1240 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ | 1988 | /* If RCU callbacks are still pending, RCU still needs this CPU. */ |
| 1241 | if (c) | 1989 | if (c) |
| 1242 | raise_softirq(RCU_SOFTIRQ); | 1990 | invoke_rcu_core(); |
| 1243 | return c; | 1991 | return c; |
| 1244 | } | 1992 | } |
| 1245 | 1993 | ||
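
One hunk above deserves a closing note: the two separate dynticks/dynticks_nmi snapshots are replaced by a single atomic_add_return(0, ...) on the merged counter. Adding zero does not change the value, but unlike atomic_read() a value-returning atomic operation implies a full memory barrier on both sides of the access, which is what the odd/even test and the adjacent smp_mb() comment rely on. A hedged helper that captures the idiom:

    #include <linux/atomic.h>

    /*
     * Read an atomic_t with full ordering by adding zero.  atomic_read()
     * is a plain load; value-returning atomic ops are documented to act
     * as full memory barriers before and after the access.
     */
    static inline int atomic_read_ordered(atomic_t *v)
    {
            return atomic_add_return(0, v);
    }

    /* Mirroring the check above: an odd counter value means the remote
     * CPU is not in dyntick-idle mode, so this CPU cannot defer to it. */
    static inline int cpu_not_dyntick_idle(atomic_t *dynticks)
    {
            return (atomic_read_ordered(dynticks) & 0x1) != 0;
    }
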
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index c8e97853b970..3b0c0986afc0 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -31,7 +31,7 @@ | |||
| 31 | #include <linux/rcupdate.h> | 31 | #include <linux/rcupdate.h> |
| 32 | #include <linux/interrupt.h> | 32 | #include <linux/interrupt.h> |
| 33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
| 34 | #include <asm/atomic.h> | 34 | #include <linux/atomic.h> |
| 35 | #include <linux/bitops.h> | 35 | #include <linux/bitops.h> |
| 36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
| 37 | #include <linux/completion.h> | 37 | #include <linux/completion.h> |
| @@ -46,6 +46,22 @@ | |||
| 46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
| 47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
| 48 | 48 | ||
| 49 | #ifdef CONFIG_RCU_BOOST | ||
| 50 | |||
| 51 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 52 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); | ||
| 53 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 54 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
| 55 | |||
| 56 | static char convert_kthread_status(unsigned int kthread_status) | ||
| 57 | { | ||
| 58 | if (kthread_status > RCU_KTHREAD_MAX) | ||
| 59 | return '?'; | ||
| 60 | return "SRWOY"[kthread_status]; | ||
| 61 | } | ||
| 62 | |||
| 63 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 64 | |||
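
The "SRWOY"[kthread_status] expression indexes directly into a string literal, mapping what appear to be the STOPPED, RUNNING, WAITING, OFFCPU and YIELDING states used elsewhere in this patch to single characters, with '?' for anything out of range. The same trick in isolation, as a stand-alone C program (the enum order here is an assumption that mirrors the status values used above):

    #include <stdio.h>

    enum kstatus { STOPPED, RUNNING, WAITING, OFFCPU, YIELDING, KSTATUS_MAX = YIELDING };

    /* A string literal is an array, so it can be subscripted directly. */
    static char status_char(unsigned int s)
    {
            if (s > KSTATUS_MAX)
                    return '?';
            return "SRWOY"[s];
    }

    int main(void)
    {
            printf("%c %c %c\n", status_char(RUNNING), status_char(YIELDING),
                   status_char(42));       /* prints: R Y ? */
            return 0;
    }

The ".N"[cond] expressions later in this file use the identical trick with a two-character string to print either '.' or a flag letter for a boolean.
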
| 49 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | 65 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) |
| 50 | { | 66 | { |
| 51 | if (!rdp->beenonline) | 67 | if (!rdp->beenonline) |
| @@ -57,14 +73,31 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 57 | rdp->passed_quiesc, rdp->passed_quiesc_completed, | 73 | rdp->passed_quiesc, rdp->passed_quiesc_completed, |
| 58 | rdp->qs_pending); | 74 | rdp->qs_pending); |
| 59 | #ifdef CONFIG_NO_HZ | 75 | #ifdef CONFIG_NO_HZ |
| 60 | seq_printf(m, " dt=%d/%d dn=%d df=%lu", | 76 | seq_printf(m, " dt=%d/%d/%d df=%lu", |
| 61 | rdp->dynticks->dynticks, | 77 | atomic_read(&rdp->dynticks->dynticks), |
| 62 | rdp->dynticks->dynticks_nesting, | 78 | rdp->dynticks->dynticks_nesting, |
| 63 | rdp->dynticks->dynticks_nmi, | 79 | rdp->dynticks->dynticks_nmi_nesting, |
| 64 | rdp->dynticks_fqs); | 80 | rdp->dynticks_fqs); |
| 65 | #endif /* #ifdef CONFIG_NO_HZ */ | 81 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 66 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); | 82 | seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); |
| 67 | seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); | 83 | seq_printf(m, " ql=%ld qs=%c%c%c%c", |
| 84 | rdp->qlen, | ||
| 85 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
| 86 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
| 87 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
| 88 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
| 89 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
| 90 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
| 91 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
| 92 | #ifdef CONFIG_RCU_BOOST | ||
| 93 | seq_printf(m, " kt=%d/%c/%d ktl=%x", | ||
| 94 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
| 95 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
| 96 | rdp->cpu)), | ||
| 97 | per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), | ||
| 98 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | ||
| 99 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 100 | seq_printf(m, " b=%ld", rdp->blimit); | ||
| 68 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", | 101 | seq_printf(m, " ci=%lu co=%lu ca=%lu\n", |
| 69 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 102 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
| 70 | } | 103 | } |
| @@ -115,13 +148,27 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
| 115 | rdp->qs_pending); | 148 | rdp->qs_pending); |
| 116 | #ifdef CONFIG_NO_HZ | 149 | #ifdef CONFIG_NO_HZ |
| 117 | seq_printf(m, ",%d,%d,%d,%lu", | 150 | seq_printf(m, ",%d,%d,%d,%lu", |
| 118 | rdp->dynticks->dynticks, | 151 | atomic_read(&rdp->dynticks->dynticks), |
| 119 | rdp->dynticks->dynticks_nesting, | 152 | rdp->dynticks->dynticks_nesting, |
| 120 | rdp->dynticks->dynticks_nmi, | 153 | rdp->dynticks->dynticks_nmi_nesting, |
| 121 | rdp->dynticks_fqs); | 154 | rdp->dynticks_fqs); |
| 122 | #endif /* #ifdef CONFIG_NO_HZ */ | 155 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 123 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); | 156 | seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); |
| 124 | seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); | 157 | seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, |
| 158 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
| 159 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
| 160 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
| 161 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
| 162 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
| 163 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
| 164 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
| 165 | #ifdef CONFIG_RCU_BOOST | ||
| 166 | seq_printf(m, ",%d,\"%c\"", | ||
| 167 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
| 168 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
| 169 | rdp->cpu))); | ||
| 170 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 171 | seq_printf(m, ",%ld", rdp->blimit); | ||
| 125 | seq_printf(m, ",%lu,%lu,%lu\n", | 172 | seq_printf(m, ",%lu,%lu,%lu\n", |
| 126 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | 173 | rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); |
| 127 | } | 174 | } |
| @@ -130,9 +177,13 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
| 130 | { | 177 | { |
| 131 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); | 178 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); |
| 132 | #ifdef CONFIG_NO_HZ | 179 | #ifdef CONFIG_NO_HZ |
| 133 | seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); | 180 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
| 134 | #endif /* #ifdef CONFIG_NO_HZ */ | 181 | #endif /* #ifdef CONFIG_NO_HZ */ |
| 135 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); | 182 | seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); |
| 183 | #ifdef CONFIG_RCU_BOOST | ||
| 184 | seq_puts(m, "\"kt\",\"ktl\""); | ||
| 185 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 186 | seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); | ||
| 136 | #ifdef CONFIG_TREE_PREEMPT_RCU | 187 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 137 | seq_puts(m, "\"rcu_preempt:\"\n"); | 188 | seq_puts(m, "\"rcu_preempt:\"\n"); |
| 138 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); | 189 | PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); |
| @@ -157,11 +208,76 @@ static const struct file_operations rcudata_csv_fops = { | |||
| 157 | .release = single_release, | 208 | .release = single_release, |
| 158 | }; | 209 | }; |
| 159 | 210 | ||
| 211 | #ifdef CONFIG_RCU_BOOST | ||
| 212 | |||
| 213 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | ||
| 214 | { | ||
| 215 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " | ||
| 216 | "j=%04x bt=%04x\n", | ||
| 217 | rnp->grplo, rnp->grphi, | ||
| 218 | "T."[list_empty(&rnp->blkd_tasks)], | ||
| 219 | "N."[!rnp->gp_tasks], | ||
| 220 | "E."[!rnp->exp_tasks], | ||
| 221 | "B."[!rnp->boost_tasks], | ||
| 222 | convert_kthread_status(rnp->boost_kthread_status), | ||
| 223 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | ||
| 224 | rnp->n_normal_boosts, | ||
| 225 | (int)(jiffies & 0xffff), | ||
| 226 | (int)(rnp->boost_time & 0xffff)); | ||
| 227 | seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | ||
| 228 | " balk", | ||
| 229 | rnp->n_balk_blkd_tasks, | ||
| 230 | rnp->n_balk_exp_gp_tasks, | ||
| 231 | rnp->n_balk_boost_tasks, | ||
| 232 | rnp->n_balk_notblocked, | ||
| 233 | rnp->n_balk_notyet, | ||
| 234 | rnp->n_balk_nos); | ||
| 235 | } | ||
| 236 | |||
| 237 | static int show_rcu_node_boost(struct seq_file *m, void *unused) | ||
| 238 | { | ||
| 239 | struct rcu_node *rnp; | ||
| 240 | |||
| 241 | rcu_for_each_leaf_node(&rcu_preempt_state, rnp) | ||
| 242 | print_one_rcu_node_boost(m, rnp); | ||
| 243 | return 0; | ||
| 244 | } | ||
| 245 | |||
| 246 | static int rcu_node_boost_open(struct inode *inode, struct file *file) | ||
| 247 | { | ||
| 248 | return single_open(file, show_rcu_node_boost, NULL); | ||
| 249 | } | ||
| 250 | |||
| 251 | static const struct file_operations rcu_node_boost_fops = { | ||
| 252 | .owner = THIS_MODULE, | ||
| 253 | .open = rcu_node_boost_open, | ||
| 254 | .read = seq_read, | ||
| 255 | .llseek = seq_lseek, | ||
| 256 | .release = single_release, | ||
| 257 | }; | ||
| 258 | |||
| 259 | /* | ||
| 260 | * Create the rcuboost debugfs entry. Standard error return. | ||
| 261 | */ | ||
| 262 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
| 263 | { | ||
| 264 | return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, | ||
| 265 | &rcu_node_boost_fops); | ||
| 266 | } | ||
| 267 | |||
| 268 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 269 | |||
| 270 | static int rcu_boost_trace_create_file(struct dentry *rcudir) | ||
| 271 | { | ||
| 272 | return 0; /* There cannot be an error if we didn't create it! */ | ||
| 273 | } | ||
| 274 | |||
| 275 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
| 276 | |||
| 160 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | 277 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) |
| 161 | { | 278 | { |
| 162 | unsigned long gpnum; | 279 | unsigned long gpnum; |
| 163 | int level = 0; | 280 | int level = 0; |
| 164 | int phase; | ||
| 165 | struct rcu_node *rnp; | 281 | struct rcu_node *rnp; |
| 166 | 282 | ||
| 167 | gpnum = rsp->gpnum; | 283 | gpnum = rsp->gpnum; |
| @@ -178,13 +294,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 178 | seq_puts(m, "\n"); | 294 | seq_puts(m, "\n"); |
| 179 | level = rnp->level; | 295 | level = rnp->level; |
| 180 | } | 296 | } |
| 181 | phase = gpnum & 0x1; | 297 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", |
| 182 | seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ", | ||
| 183 | rnp->qsmask, rnp->qsmaskinit, | 298 | rnp->qsmask, rnp->qsmaskinit, |
| 184 | "T."[list_empty(&rnp->blocked_tasks[phase])], | 299 | ".G"[rnp->gp_tasks != NULL], |
| 185 | "E."[list_empty(&rnp->blocked_tasks[phase + 2])], | 300 | ".E"[rnp->exp_tasks != NULL], |
| 186 | "T."[list_empty(&rnp->blocked_tasks[!phase])], | 301 | ".T"[!list_empty(&rnp->blkd_tasks)], |
| 187 | "E."[list_empty(&rnp->blocked_tasks[!phase + 2])], | ||
| 188 | rnp->grplo, rnp->grphi, rnp->grpnum); | 302 | rnp->grplo, rnp->grphi, rnp->grpnum); |
| 189 | } | 303 | } |
| 190 | seq_puts(m, "\n"); | 304 | seq_puts(m, "\n"); |
| @@ -216,16 +330,35 @@ static const struct file_operations rcuhier_fops = { | |||
| 216 | .release = single_release, | 330 | .release = single_release, |
| 217 | }; | 331 | }; |
| 218 | 332 | ||
| 333 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | ||
| 334 | { | ||
| 335 | unsigned long flags; | ||
| 336 | unsigned long completed; | ||
| 337 | unsigned long gpnum; | ||
| 338 | unsigned long gpage; | ||
| 339 | unsigned long gpmax; | ||
| 340 | struct rcu_node *rnp = &rsp->node[0]; | ||
| 341 | |||
| 342 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 343 | completed = rsp->completed; | ||
| 344 | gpnum = rsp->gpnum; | ||
| 345 | if (rsp->completed == rsp->gpnum) | ||
| 346 | gpage = 0; | ||
| 347 | else | ||
| 348 | gpage = jiffies - rsp->gp_start; | ||
| 349 | gpmax = rsp->gp_max; | ||
| 350 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 351 | seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", | ||
| 352 | rsp->name, completed, gpnum, gpage, gpmax); | ||
| 353 | } | ||
| 354 | |||
| 219 | static int show_rcugp(struct seq_file *m, void *unused) | 355 | static int show_rcugp(struct seq_file *m, void *unused) |
| 220 | { | 356 | { |
| 221 | #ifdef CONFIG_TREE_PREEMPT_RCU | 357 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 222 | seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", | 358 | show_one_rcugp(m, &rcu_preempt_state); |
| 223 | rcu_preempt_state.completed, rcu_preempt_state.gpnum); | ||
| 224 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 359 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 225 | seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", | 360 | show_one_rcugp(m, &rcu_sched_state); |
| 226 | rcu_sched_state.completed, rcu_sched_state.gpnum); | 361 | show_one_rcugp(m, &rcu_bh_state); |
| 227 | seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", | ||
| 228 | rcu_bh_state.completed, rcu_bh_state.gpnum); | ||
| 229 | return 0; | 362 | return 0; |
| 230 | } | 363 | } |
| 231 | 364 | ||
| @@ -298,6 +431,29 @@ static const struct file_operations rcu_pending_fops = { | |||
| 298 | .release = single_release, | 431 | .release = single_release, |
| 299 | }; | 432 | }; |
| 300 | 433 | ||
| 434 | static int show_rcutorture(struct seq_file *m, void *unused) | ||
| 435 | { | ||
| 436 | seq_printf(m, "rcutorture test sequence: %lu %s\n", | ||
| 437 | rcutorture_testseq >> 1, | ||
| 438 | (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); | ||
| 439 | seq_printf(m, "rcutorture update version number: %lu\n", | ||
| 440 | rcutorture_vernum); | ||
| 441 | return 0; | ||
| 442 | } | ||
| 443 | |||
| 444 | static int rcutorture_open(struct inode *inode, struct file *file) | ||
| 445 | { | ||
| 446 | return single_open(file, show_rcutorture, NULL); | ||
| 447 | } | ||
| 448 | |||
| 449 | static const struct file_operations rcutorture_fops = { | ||
| 450 | .owner = THIS_MODULE, | ||
| 451 | .open = rcutorture_open, | ||
| 452 | .read = seq_read, | ||
| 453 | .llseek = seq_lseek, | ||
| 454 | .release = single_release, | ||
| 455 | }; | ||
| 456 | |||
| 301 | static struct dentry *rcudir; | 457 | static struct dentry *rcudir; |
| 302 | 458 | ||
| 303 | static int __init rcutree_trace_init(void) | 459 | static int __init rcutree_trace_init(void) |
| @@ -318,6 +474,9 @@ static int __init rcutree_trace_init(void) | |||
| 318 | if (!retval) | 474 | if (!retval) |
| 319 | goto free_out; | 475 | goto free_out; |
| 320 | 476 | ||
| 477 | if (rcu_boost_trace_create_file(rcudir)) | ||
| 478 | goto free_out; | ||
| 479 | |||
| 321 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | 480 | retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); |
| 322 | if (!retval) | 481 | if (!retval) |
| 323 | goto free_out; | 482 | goto free_out; |
| @@ -331,6 +490,11 @@ static int __init rcutree_trace_init(void) | |||
| 331 | NULL, &rcu_pending_fops); | 490 | NULL, &rcu_pending_fops); |
| 332 | if (!retval) | 491 | if (!retval) |
| 333 | goto free_out; | 492 | goto free_out; |
| 493 | |||
| 494 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | ||
| 495 | NULL, &rcutorture_fops); | ||
| 496 | if (!retval) | ||
| 497 | goto free_out; | ||
| 334 | return 0; | 498 | return 0; |
| 335 | free_out: | 499 | free_out: |
| 336 | debugfs_remove_recursive(rcudir); | 500 | debugfs_remove_recursive(rcudir); |
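
All of the tracing additions in this file follow the same seq_file-plus-debugfs boilerplate: a show function, a single_open() wrapper, a file_operations structure, registration from rcutree_trace_init(), and debugfs_remove_recursive() as the common error path. For reference, the minimum needed to expose one read-only file this way looks roughly as follows; every name here is illustrative and not part of the patch:

    #include <linux/debugfs.h>
    #include <linux/seq_file.h>
    #include <linux/module.h>

    static int show_my_stats(struct seq_file *m, void *unused)
    {
            seq_printf(m, "example counter: %d\n", 42);     /* placeholder output */
            return 0;
    }

    static int my_stats_open(struct inode *inode, struct file *file)
    {
            return single_open(file, show_my_stats, NULL);
    }

    static const struct file_operations my_stats_fops = {
            .owner   = THIS_MODULE,
            .open    = my_stats_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };

    static struct dentry *my_dir;

    static int __init my_trace_init(void)
    {
            my_dir = debugfs_create_dir("mytrace", NULL);
            if (!my_dir)
                    return -ENOMEM;
            if (!debugfs_create_file("stats", 0444, my_dir, NULL, &my_stats_fops)) {
                    debugfs_remove_recursive(my_dir);       /* same cleanup pattern */
                    return -ENOMEM;
            }
            return 0;
    }
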
diff --git a/kernel/resource.c b/kernel/resource.c index 798e2fae2a06..3b3cedc52592 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -38,6 +38,14 @@ struct resource iomem_resource = { | |||
| 38 | }; | 38 | }; |
| 39 | EXPORT_SYMBOL(iomem_resource); | 39 | EXPORT_SYMBOL(iomem_resource); |
| 40 | 40 | ||
| 41 | /* constraints to be met while allocating resources */ | ||
| 42 | struct resource_constraint { | ||
| 43 | resource_size_t min, max, align; | ||
| 44 | resource_size_t (*alignf)(void *, const struct resource *, | ||
| 45 | resource_size_t, resource_size_t); | ||
| 46 | void *alignf_data; | ||
| 47 | }; | ||
| 48 | |||
| 41 | static DEFINE_RWLOCK(resource_lock); | 49 | static DEFINE_RWLOCK(resource_lock); |
| 42 | 50 | ||
| 43 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | 51 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) |
| @@ -384,16 +392,13 @@ static bool resource_contains(struct resource *res1, struct resource *res2) | |||
| 384 | } | 392 | } |
| 385 | 393 | ||
| 386 | /* | 394 | /* |
| 387 | * Find empty slot in the resource tree given range and alignment. | 395 | * Find empty slot in the resource tree with the given range and |
| 396 | * alignment constraints | ||
| 388 | */ | 397 | */ |
| 389 | static int find_resource(struct resource *root, struct resource *new, | 398 | static int __find_resource(struct resource *root, struct resource *old, |
| 390 | resource_size_t size, resource_size_t min, | 399 | struct resource *new, |
| 391 | resource_size_t max, resource_size_t align, | 400 | resource_size_t size, |
| 392 | resource_size_t (*alignf)(void *, | 401 | struct resource_constraint *constraint) |
| 393 | const struct resource *, | ||
| 394 | resource_size_t, | ||
| 395 | resource_size_t), | ||
| 396 | void *alignf_data) | ||
| 397 | { | 402 | { |
| 398 | struct resource *this = root->child; | 403 | struct resource *this = root->child; |
| 399 | struct resource tmp = *new, avail, alloc; | 404 | struct resource tmp = *new, avail, alloc; |
| @@ -404,25 +409,26 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 404 | * Skip past an allocated resource that starts at 0, since the assignment | 409 | * Skip past an allocated resource that starts at 0, since the assignment |
| 405 | * of this->start - 1 to tmp->end below would cause an underflow. | 410 | * of this->start - 1 to tmp->end below would cause an underflow. |
| 406 | */ | 411 | */ |
| 407 | if (this && this->start == 0) { | 412 | if (this && this->start == root->start) { |
| 408 | tmp.start = this->end + 1; | 413 | tmp.start = (this == old) ? old->start : this->end + 1; |
| 409 | this = this->sibling; | 414 | this = this->sibling; |
| 410 | } | 415 | } |
| 411 | for(;;) { | 416 | for(;;) { |
| 412 | if (this) | 417 | if (this) |
| 413 | tmp.end = this->start - 1; | 418 | tmp.end = (this == old) ? this->end : this->start - 1; |
| 414 | else | 419 | else |
| 415 | tmp.end = root->end; | 420 | tmp.end = root->end; |
| 416 | 421 | ||
| 417 | resource_clip(&tmp, min, max); | 422 | resource_clip(&tmp, constraint->min, constraint->max); |
| 418 | arch_remove_reservations(&tmp); | 423 | arch_remove_reservations(&tmp); |
| 419 | 424 | ||
| 420 | /* Check for overflow after ALIGN() */ | 425 | /* Check for overflow after ALIGN() */ |
| 421 | avail = *new; | 426 | avail = *new; |
| 422 | avail.start = ALIGN(tmp.start, align); | 427 | avail.start = ALIGN(tmp.start, constraint->align); |
| 423 | avail.end = tmp.end; | 428 | avail.end = tmp.end; |
| 424 | if (avail.start >= tmp.start) { | 429 | if (avail.start >= tmp.start) { |
| 425 | alloc.start = alignf(alignf_data, &avail, size, align); | 430 | alloc.start = constraint->alignf(constraint->alignf_data, &avail, |
| 431 | size, constraint->align); | ||
| 426 | alloc.end = alloc.start + size - 1; | 432 | alloc.end = alloc.start + size - 1; |
| 427 | if (resource_contains(&avail, &alloc)) { | 433 | if (resource_contains(&avail, &alloc)) { |
| 428 | new->start = alloc.start; | 434 | new->start = alloc.start; |
| @@ -432,14 +438,75 @@ static int find_resource(struct resource *root, struct resource *new, | |||
| 432 | } | 438 | } |
| 433 | if (!this) | 439 | if (!this) |
| 434 | break; | 440 | break; |
| 435 | tmp.start = this->end + 1; | 441 | if (this != old) |
| 442 | tmp.start = this->end + 1; | ||
| 436 | this = this->sibling; | 443 | this = this->sibling; |
| 437 | } | 444 | } |
| 438 | return -EBUSY; | 445 | return -EBUSY; |
| 439 | } | 446 | } |
| 440 | 447 | ||
| 448 | /* | ||
| 449 | * Find empty slot in the resource tree given range and alignment. | ||
| 450 | */ | ||
| 451 | static int find_resource(struct resource *root, struct resource *new, | ||
| 452 | resource_size_t size, | ||
| 453 | struct resource_constraint *constraint) | ||
| 454 | { | ||
| 455 | return __find_resource(root, NULL, new, size, constraint); | ||
| 456 | } | ||
| 457 | |||
| 441 | /** | 458 | /** |
| 442 | * allocate_resource - allocate empty slot in the resource tree given range & alignment | 459 | * reallocate_resource - allocate a slot in the resource tree given range & alignment. |
| 460 | * The resource will be relocated if the new size cannot be reallocated in the | ||
| 461 | * current location. | ||
| 462 | * | ||
| 463 | * @root: root resource descriptor | ||
| 464 | * @old: resource descriptor desired by caller | ||
| 465 | * @newsize: new size of the resource descriptor | ||
| 466 | * @constraint: the size and alignment constraints to be met. | ||
| 467 | */ | ||
| 468 | int reallocate_resource(struct resource *root, struct resource *old, | ||
| 469 | resource_size_t newsize, | ||
| 470 | struct resource_constraint *constraint) | ||
| 471 | { | ||
| 472 | int err=0; | ||
| 473 | struct resource new = *old; | ||
| 474 | struct resource *conflict; | ||
| 475 | |||
| 476 | write_lock(&resource_lock); | ||
| 477 | |||
| 478 | if ((err = __find_resource(root, old, &new, newsize, constraint))) | ||
| 479 | goto out; | ||
| 480 | |||
| 481 | if (resource_contains(&new, old)) { | ||
| 482 | old->start = new.start; | ||
| 483 | old->end = new.end; | ||
| 484 | goto out; | ||
| 485 | } | ||
| 486 | |||
| 487 | if (old->child) { | ||
| 488 | err = -EBUSY; | ||
| 489 | goto out; | ||
| 490 | } | ||
| 491 | |||
| 492 | if (resource_contains(old, &new)) { | ||
| 493 | old->start = new.start; | ||
| 494 | old->end = new.end; | ||
| 495 | } else { | ||
| 496 | __release_resource(old); | ||
| 497 | *old = new; | ||
| 498 | conflict = __request_resource(root, old); | ||
| 499 | BUG_ON(conflict); | ||
| 500 | } | ||
| 501 | out: | ||
| 502 | write_unlock(&resource_lock); | ||
| 503 | return err; | ||
| 504 | } | ||
| 505 | |||
| 506 | |||
| 507 | /** | ||
| 508 | * allocate_resource - allocate empty slot in the resource tree given range & alignment. | ||
| 509 | * The resource will be reallocated with a new size if it was already allocated | ||
| 443 | * @root: root resource descriptor | 510 | * @root: root resource descriptor |
| 444 | * @new: resource descriptor desired by caller | 511 | * @new: resource descriptor desired by caller |
| 445 | * @size: requested resource region size | 512 | * @size: requested resource region size |
| @@ -459,12 +526,25 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
| 459 | void *alignf_data) | 526 | void *alignf_data) |
| 460 | { | 527 | { |
| 461 | int err; | 528 | int err; |
| 529 | struct resource_constraint constraint; | ||
| 462 | 530 | ||
| 463 | if (!alignf) | 531 | if (!alignf) |
| 464 | alignf = simple_align_resource; | 532 | alignf = simple_align_resource; |
| 465 | 533 | ||
| 534 | constraint.min = min; | ||
| 535 | constraint.max = max; | ||
| 536 | constraint.align = align; | ||
| 537 | constraint.alignf = alignf; | ||
| 538 | constraint.alignf_data = alignf_data; | ||
| 539 | |||
| 540 | if (new->parent) { | ||
| 541 | /* resource is already allocated, try reallocating with | ||
| 542 | the new constraints */ | ||
| 543 | return reallocate_resource(root, new, size, &constraint); | ||
| 544 | } | ||
| 545 | |||
| 466 | write_lock(&resource_lock); | 546 | write_lock(&resource_lock); |
| 467 | err = find_resource(root, new, size, min, max, align, alignf, alignf_data); | 547 | err = find_resource(root, new, size, &constraint); |
| 468 | if (err >= 0 && __request_resource(root, new)) | 548 | if (err >= 0 && __request_resource(root, new)) |
| 469 | err = -EBUSY; | 549 | err = -EBUSY; |
| 470 | write_unlock(&resource_lock); | 550 | write_unlock(&resource_lock); |
| @@ -473,6 +553,27 @@ int allocate_resource(struct resource *root, struct resource *new, | |||
| 473 | 553 | ||
| 474 | EXPORT_SYMBOL(allocate_resource); | 554 | EXPORT_SYMBOL(allocate_resource); |
| 475 | 555 | ||
| 556 | /** | ||
| 557 | * lookup_resource - find an existing resource by a resource start address | ||
| 558 | * @root: root resource descriptor | ||
| 559 | * @start: resource start address | ||
| 560 | * | ||
| 561 | * Returns a pointer to the resource if found, NULL otherwise | ||
| 562 | */ | ||
| 563 | struct resource *lookup_resource(struct resource *root, resource_size_t start) | ||
| 564 | { | ||
| 565 | struct resource *res; | ||
| 566 | |||
| 567 | read_lock(&resource_lock); | ||
| 568 | for (res = root->child; res; res = res->sibling) { | ||
| 569 | if (res->start == start) | ||
| 570 | break; | ||
| 571 | } | ||
| 572 | read_unlock(&resource_lock); | ||
| 573 | |||
| 574 | return res; | ||
| 575 | } | ||
| 576 | |||
| 476 | /* | 577 | /* |
| 477 | * Insert a resource into the resource tree. If successful, return NULL, | 578 | * Insert a resource into the resource tree. If successful, return NULL, |
| 478 | * otherwise return the conflicting resource (compare to __request_resource()) | 579 | * otherwise return the conflicting resource (compare to __request_resource()) |
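
Taken together, the resource.c changes fold the old six-argument parameter list into struct resource_constraint, teach __find_resource() to treat the resource being resized as free space, and make allocate_resource() transparently grow a resource that already has a parent by calling reallocate_resource(); lookup_resource() is a new helper that finds a child of @root by its start address. Existing callers keep the same signature. A hedged usage sketch with made-up addresses and sizes:

    #include <linux/ioport.h>

    static struct resource my_window = {
            .name  = "my-device-window",    /* illustrative resource */
            .flags = IORESOURCE_MEM,
    };

    static int claim_window(void)
    {
            int err;

            /* First call: my_window has no parent, so an empty slot is found. */
            err = allocate_resource(&iomem_resource, &my_window,
                                    0x1000,                 /* size */
                                    0x80000000, 0x8fffffff, /* min, max */
                                    0x1000,                 /* align */
                                    NULL, NULL);            /* default alignf */
            if (err)
                    return err;

            /*
             * With this patch, a second call on the same (now parented)
             * resource reallocates it to the larger size, relocating it
             * only if it cannot grow in place.
             */
            return allocate_resource(&iomem_resource, &my_window,
                                     0x2000, 0x80000000, 0x8fffffff,
                                     0x1000, NULL, NULL);
    }
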
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index ab449117aaf2..255e1662acdb 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c | |||
| @@ -890,7 +890,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name) | |||
| 890 | { | 890 | { |
| 891 | lock->owner = NULL; | 891 | lock->owner = NULL; |
| 892 | raw_spin_lock_init(&lock->wait_lock); | 892 | raw_spin_lock_init(&lock->wait_lock); |
| 893 | plist_head_init_raw(&lock->wait_list, &lock->wait_lock); | 893 | plist_head_init(&lock->wait_list); |
| 894 | 894 | ||
| 895 | debug_rt_mutex_init(lock, name); | 895 | debug_rt_mutex_init(lock, name); |
| 896 | } | 896 | } |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index cae050b05f5e..9f48f3d82e9b 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
| @@ -11,7 +11,7 @@ | |||
| 11 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
| 12 | 12 | ||
| 13 | #include <asm/system.h> | 13 | #include <asm/system.h> |
| 14 | #include <asm/atomic.h> | 14 | #include <linux/atomic.h> |
| 15 | 15 | ||
| 16 | /* | 16 | /* |
| 17 | * lock for reading | 17 | * lock for reading |
| @@ -117,15 +117,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
| 117 | 117 | ||
| 118 | EXPORT_SYMBOL(down_read_nested); | 118 | EXPORT_SYMBOL(down_read_nested); |
| 119 | 119 | ||
| 120 | void down_read_non_owner(struct rw_semaphore *sem) | ||
| 121 | { | ||
| 122 | might_sleep(); | ||
| 123 | |||
| 124 | __down_read(sem); | ||
| 125 | } | ||
| 126 | |||
| 127 | EXPORT_SYMBOL(down_read_non_owner); | ||
| 128 | |||
| 129 | void down_write_nested(struct rw_semaphore *sem, int subclass) | 120 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
| 130 | { | 121 | { |
| 131 | might_sleep(); | 122 | might_sleep(); |
| @@ -136,13 +127,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) | |||
| 136 | 127 | ||
| 137 | EXPORT_SYMBOL(down_write_nested); | 128 | EXPORT_SYMBOL(down_write_nested); |
| 138 | 129 | ||
| 139 | void up_read_non_owner(struct rw_semaphore *sem) | ||
| 140 | { | ||
| 141 | __up_read(sem); | ||
| 142 | } | ||
| 143 | |||
| 144 | EXPORT_SYMBOL(up_read_non_owner); | ||
| 145 | |||
| 146 | #endif | 130 | #endif |
| 147 | 131 | ||
| 148 | 132 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 312f8b95c2d4..ccacdbdecf45 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -75,6 +75,9 @@ | |||
| 75 | #include <asm/tlb.h> | 75 | #include <asm/tlb.h> |
| 76 | #include <asm/irq_regs.h> | 76 | #include <asm/irq_regs.h> |
| 77 | #include <asm/mutex.h> | 77 | #include <asm/mutex.h> |
| 78 | #ifdef CONFIG_PARAVIRT | ||
| 79 | #include <asm/paravirt.h> | ||
| 80 | #endif | ||
| 78 | 81 | ||
| 79 | #include "sched_cpupri.h" | 82 | #include "sched_cpupri.h" |
| 80 | #include "workqueue_sched.h" | 83 | #include "workqueue_sched.h" |
| @@ -124,7 +127,7 @@ | |||
| 124 | 127 | ||
| 125 | static inline int rt_policy(int policy) | 128 | static inline int rt_policy(int policy) |
| 126 | { | 129 | { |
| 127 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 130 | if (policy == SCHED_FIFO || policy == SCHED_RR) |
| 128 | return 1; | 131 | return 1; |
| 129 | return 0; | 132 | return 0; |
| 130 | } | 133 | } |
| @@ -231,7 +234,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) | |||
| 231 | #endif | 234 | #endif |
| 232 | 235 | ||
| 233 | /* | 236 | /* |
| 234 | * sched_domains_mutex serializes calls to arch_init_sched_domains, | 237 | * sched_domains_mutex serializes calls to init_sched_domains, |
| 235 | * detach_destroy_domains and partition_sched_domains. | 238 | * detach_destroy_domains and partition_sched_domains. |
| 236 | */ | 239 | */ |
| 237 | static DEFINE_MUTEX(sched_domains_mutex); | 240 | static DEFINE_MUTEX(sched_domains_mutex); |
| @@ -292,7 +295,7 @@ static DEFINE_SPINLOCK(task_group_lock); | |||
| 292 | * (The default weight is 1024 - so there's no practical | 295 | * (The default weight is 1024 - so there's no practical |
| 293 | * limitation from this.) | 296 | * limitation from this.) |
| 294 | */ | 297 | */ |
| 295 | #define MIN_SHARES 2 | 298 | #define MIN_SHARES (1UL << 1) |
| 296 | #define MAX_SHARES (1UL << 18) | 299 | #define MAX_SHARES (1UL << 18) |
| 297 | 300 | ||
| 298 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; | 301 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
| @@ -312,6 +315,9 @@ struct cfs_rq { | |||
| 312 | 315 | ||
| 313 | u64 exec_clock; | 316 | u64 exec_clock; |
| 314 | u64 min_vruntime; | 317 | u64 min_vruntime; |
| 318 | #ifndef CONFIG_64BIT | ||
| 319 | u64 min_vruntime_copy; | ||
| 320 | #endif | ||
| 315 | 321 | ||
| 316 | struct rb_root tasks_timeline; | 322 | struct rb_root tasks_timeline; |
| 317 | struct rb_node *rb_leftmost; | 323 | struct rb_node *rb_leftmost; |
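
The new min_vruntime_copy field exists only on 32-bit builds, where a 64-bit load can tear. The users of the field live in sched_fair.c rather than in this hunk, but the pattern it enables is worth spelling out: the writer stores the value, issues smp_wmb(), then stores the copy; a lockless reader reads the copy first, issues smp_rmb(), reads the value, and retries until the two agree. A hedged sketch of that double-copy scheme:

    #include <linux/types.h>
    #include <asm/system.h>         /* smp_wmb()/smp_rmb() in this kernel generation */

    struct sample {
            u64 val;
    #ifndef CONFIG_64BIT
            u64 val_copy;
    #endif
    };

    static void sample_write(struct sample *s, u64 v)
    {
            s->val = v;
    #ifndef CONFIG_64BIT
            smp_wmb();              /* order val before val_copy */
            s->val_copy = s->val;
    #endif
    }

    static u64 sample_read(struct sample *s)
    {
    #ifndef CONFIG_64BIT
            u64 v, copy;

            do {
                    copy = s->val_copy;
                    smp_rmb();      /* pairs with the writer's smp_wmb() */
                    v = s->val;
            } while (v != copy);
            return v;
    #else
            return s->val;
    #endif
    }
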
| @@ -325,7 +331,9 @@ struct cfs_rq { | |||
| 325 | */ | 331 | */ |
| 326 | struct sched_entity *curr, *next, *last, *skip; | 332 | struct sched_entity *curr, *next, *last, *skip; |
| 327 | 333 | ||
| 334 | #ifdef CONFIG_SCHED_DEBUG | ||
| 328 | unsigned int nr_spread_over; | 335 | unsigned int nr_spread_over; |
| 336 | #endif | ||
| 329 | 337 | ||
| 330 | #ifdef CONFIG_FAIR_GROUP_SCHED | 338 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 331 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 339 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
| @@ -417,6 +425,8 @@ struct rt_rq { | |||
| 417 | */ | 425 | */ |
| 418 | struct root_domain { | 426 | struct root_domain { |
| 419 | atomic_t refcount; | 427 | atomic_t refcount; |
| 428 | atomic_t rto_count; | ||
| 429 | struct rcu_head rcu; | ||
| 420 | cpumask_var_t span; | 430 | cpumask_var_t span; |
| 421 | cpumask_var_t online; | 431 | cpumask_var_t online; |
| 422 | 432 | ||
| @@ -425,7 +435,6 @@ struct root_domain { | |||
| 425 | * one runnable RT task. | 435 | * one runnable RT task. |
| 426 | */ | 436 | */ |
| 427 | cpumask_var_t rto_mask; | 437 | cpumask_var_t rto_mask; |
| 428 | atomic_t rto_count; | ||
| 429 | struct cpupri cpupri; | 438 | struct cpupri cpupri; |
| 430 | }; | 439 | }; |
| 431 | 440 | ||
| @@ -460,7 +469,7 @@ struct rq { | |||
| 460 | u64 nohz_stamp; | 469 | u64 nohz_stamp; |
| 461 | unsigned char nohz_balance_kick; | 470 | unsigned char nohz_balance_kick; |
| 462 | #endif | 471 | #endif |
| 463 | unsigned int skip_clock_update; | 472 | int skip_clock_update; |
| 464 | 473 | ||
| 465 | /* capture load from *all* tasks on this cpu: */ | 474 | /* capture load from *all* tasks on this cpu: */ |
| 466 | struct load_weight load; | 475 | struct load_weight load; |
| @@ -522,6 +531,12 @@ struct rq { | |||
| 522 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 531 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| 523 | u64 prev_irq_time; | 532 | u64 prev_irq_time; |
| 524 | #endif | 533 | #endif |
| 534 | #ifdef CONFIG_PARAVIRT | ||
| 535 | u64 prev_steal_time; | ||
| 536 | #endif | ||
| 537 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
| 538 | u64 prev_steal_time_rq; | ||
| 539 | #endif | ||
| 525 | 540 | ||
| 526 | /* calc_load related fields */ | 541 | /* calc_load related fields */ |
| 527 | unsigned long calc_load_update; | 542 | unsigned long calc_load_update; |
| @@ -553,6 +568,10 @@ struct rq { | |||
| 553 | unsigned int ttwu_count; | 568 | unsigned int ttwu_count; |
| 554 | unsigned int ttwu_local; | 569 | unsigned int ttwu_local; |
| 555 | #endif | 570 | #endif |
| 571 | |||
| 572 | #ifdef CONFIG_SMP | ||
| 573 | struct task_struct *wake_list; | ||
| 574 | #endif | ||
| 556 | }; | 575 | }; |
| 557 | 576 | ||
| 558 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 577 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| @@ -571,7 +590,6 @@ static inline int cpu_of(struct rq *rq) | |||
| 571 | 590 | ||
| 572 | #define rcu_dereference_check_sched_domain(p) \ | 591 | #define rcu_dereference_check_sched_domain(p) \ |
| 573 | rcu_dereference_check((p), \ | 592 | rcu_dereference_check((p), \ |
| 574 | rcu_read_lock_sched_held() || \ | ||
| 575 | lockdep_is_held(&sched_domains_mutex)) | 593 | lockdep_is_held(&sched_domains_mutex)) |
| 576 | 594 | ||
| 577 | /* | 595 | /* |
| @@ -595,10 +613,10 @@ static inline int cpu_of(struct rq *rq) | |||
| 595 | /* | 613 | /* |
| 596 | * Return the group to which this task belongs. | 614 | * Return the group to which this task belongs. |
| 597 | * | 615 | * |
| 598 | * We use task_subsys_state_check() and extend the RCU verification | 616 | * We use task_subsys_state_check() and extend the RCU verification with |
| 599 | * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() | 617 | * p->pi_lock and rq->lock because cpu_cgroup_attach() holds those locks for each |
| 600 | * holds that lock for each task it moves into the cgroup. Therefore | 618 | * task it moves into the cgroup. Therefore by holding either of those locks, |
| 601 | * by holding that lock, we pin the task to the current cgroup. | 619 | * we pin the task to the current cgroup. |
| 602 | */ | 620 | */ |
| 603 | static inline struct task_group *task_group(struct task_struct *p) | 621 | static inline struct task_group *task_group(struct task_struct *p) |
| 604 | { | 622 | { |
| @@ -606,6 +624,7 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
| 606 | struct cgroup_subsys_state *css; | 624 | struct cgroup_subsys_state *css; |
| 607 | 625 | ||
| 608 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 626 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
| 627 | lockdep_is_held(&p->pi_lock) || | ||
| 609 | lockdep_is_held(&task_rq(p)->lock)); | 628 | lockdep_is_held(&task_rq(p)->lock)); |
| 610 | tg = container_of(css, struct task_group, css); | 629 | tg = container_of(css, struct task_group, css); |
| 611 | 630 | ||
| @@ -642,7 +661,7 @@ static void update_rq_clock(struct rq *rq) | |||
| 642 | { | 661 | { |
| 643 | s64 delta; | 662 | s64 delta; |
| 644 | 663 | ||
| 645 | if (rq->skip_clock_update) | 664 | if (rq->skip_clock_update > 0) |
| 646 | return; | 665 | return; |
| 647 | 666 | ||
| 648 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | 667 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
| @@ -838,18 +857,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) | |||
| 838 | return rq->curr == p; | 857 | return rq->curr == p; |
| 839 | } | 858 | } |
| 840 | 859 | ||
| 841 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 842 | static inline int task_running(struct rq *rq, struct task_struct *p) | 860 | static inline int task_running(struct rq *rq, struct task_struct *p) |
| 843 | { | 861 | { |
| 862 | #ifdef CONFIG_SMP | ||
| 863 | return p->on_cpu; | ||
| 864 | #else | ||
| 844 | return task_current(rq, p); | 865 | return task_current(rq, p); |
| 866 | #endif | ||
| 845 | } | 867 | } |
| 846 | 868 | ||
| 869 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
| 847 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 870 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 848 | { | 871 | { |
| 872 | #ifdef CONFIG_SMP | ||
| 873 | /* | ||
| 874 | * We can optimise this out completely for !SMP, because the | ||
| 875 | * SMP rebalancing from interrupt is the only thing that cares | ||
| 876 | * here. | ||
| 877 | */ | ||
| 878 | next->on_cpu = 1; | ||
| 879 | #endif | ||
| 849 | } | 880 | } |
| 850 | 881 | ||
| 851 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 882 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| 852 | { | 883 | { |
| 884 | #ifdef CONFIG_SMP | ||
| 885 | /* | ||
| 886 | * After ->on_cpu is cleared, the task can be moved to a different CPU. | ||
| 887 | * We must ensure this doesn't happen until the switch is completely | ||
| 888 | * finished. | ||
| 889 | */ | ||
| 890 | smp_wmb(); | ||
| 891 | prev->on_cpu = 0; | ||
| 892 | #endif | ||
| 853 | #ifdef CONFIG_DEBUG_SPINLOCK | 893 | #ifdef CONFIG_DEBUG_SPINLOCK |
| 854 | /* this is a valid case when another task releases the spinlock */ | 894 | /* this is a valid case when another task releases the spinlock */ |
| 855 | rq->lock.owner = current; | 895 | rq->lock.owner = current; |
| @@ -865,15 +905,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 865 | } | 905 | } |
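The hunks above make task_running() read p->on_cpu and document the ordering rule: finish_lock_switch() issues smp_wmb() before clearing ->on_cpu, so a waker cannot migrate the task until the previous CPU is completely done with it. Below is a minimal userspace sketch of that publish/observe handoff, offered only as an illustration: C11 release/acquire atomics stand in for the kernel's smp_wmb()/smp_rmb(), and the demo_task type and function names are invented, not part of the patch.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Illustrative stand-ins for the scheduler's fields (not kernel code). */
    struct demo_task {
        atomic_bool on_cpu;   /* set while the old CPU still references us */
        int         state;    /* written by the CPU we are switching away from */
    };

    /* Roughly what finish_lock_switch() guarantees: every earlier store is
     * visible before ->on_cpu is observed clear (smp_wmb() + plain store in
     * the patch, a release store here). */
    static void finish_switch(struct demo_task *prev)
    {
        prev->state = 0;                               /* last update by the old CPU */
        atomic_store_explicit(&prev->on_cpu, false,
                              memory_order_release);   /* publish: switch is done */
    }

    /* Roughly what the wakeup path does: wait for the old CPU to let go
     * before touching or migrating the task. */
    static void wait_for_descheduled(struct demo_task *p)
    {
        while (atomic_load_explicit(&p->on_cpu, memory_order_acquire))
            ;   /* cpu_relax() in the kernel */
        /* From here it is safe to move @p to another runqueue. */
    }

The same pairing shows up later in this patch as the smp_rmb() in try_to_wake_up().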
| 866 | 906 | ||
| 867 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 907 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 868 | static inline int task_running(struct rq *rq, struct task_struct *p) | ||
| 869 | { | ||
| 870 | #ifdef CONFIG_SMP | ||
| 871 | return p->oncpu; | ||
| 872 | #else | ||
| 873 | return task_current(rq, p); | ||
| 874 | #endif | ||
| 875 | } | ||
| 876 | |||
| 877 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 908 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
| 878 | { | 909 | { |
| 879 | #ifdef CONFIG_SMP | 910 | #ifdef CONFIG_SMP |
| @@ -882,7 +913,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
| 882 | * SMP rebalancing from interrupt is the only thing that cares | 913 | * SMP rebalancing from interrupt is the only thing that cares |
| 883 | * here. | 914 | * here. |
| 884 | */ | 915 | */ |
| 885 | next->oncpu = 1; | 916 | next->on_cpu = 1; |
| 886 | #endif | 917 | #endif |
| 887 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 918 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
| 888 | raw_spin_unlock_irq(&rq->lock); | 919 | raw_spin_unlock_irq(&rq->lock); |
| @@ -895,12 +926,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 895 | { | 926 | { |
| 896 | #ifdef CONFIG_SMP | 927 | #ifdef CONFIG_SMP |
| 897 | /* | 928 | /* |
| 898 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 929 | * After ->on_cpu is cleared, the task can be moved to a different CPU. |
| 899 | * We must ensure this doesn't happen until the switch is completely | 930 | * We must ensure this doesn't happen until the switch is completely |
| 900 | * finished. | 931 | * finished. |
| 901 | */ | 932 | */ |
| 902 | smp_wmb(); | 933 | smp_wmb(); |
| 903 | prev->oncpu = 0; | 934 | prev->on_cpu = 0; |
| 904 | #endif | 935 | #endif |
| 905 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 936 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
| 906 | local_irq_enable(); | 937 | local_irq_enable(); |
| @@ -909,23 +940,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 909 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 940 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 910 | 941 | ||
| 911 | /* | 942 | /* |
| 912 | * Check whether the task is waking, we use this to synchronize ->cpus_allowed | 943 | * __task_rq_lock - lock the rq @p resides on. |
| 913 | * against ttwu(). | ||
| 914 | */ | ||
| 915 | static inline int task_is_waking(struct task_struct *p) | ||
| 916 | { | ||
| 917 | return unlikely(p->state == TASK_WAKING); | ||
| 918 | } | ||
| 919 | |||
| 920 | /* | ||
| 921 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
| 922 | * Must be called interrupts disabled. | ||
| 923 | */ | 944 | */ |
| 924 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 945 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
| 925 | __acquires(rq->lock) | 946 | __acquires(rq->lock) |
| 926 | { | 947 | { |
| 927 | struct rq *rq; | 948 | struct rq *rq; |
| 928 | 949 | ||
| 950 | lockdep_assert_held(&p->pi_lock); | ||
| 951 | |||
| 929 | for (;;) { | 952 | for (;;) { |
| 930 | rq = task_rq(p); | 953 | rq = task_rq(p); |
| 931 | raw_spin_lock(&rq->lock); | 954 | raw_spin_lock(&rq->lock); |
| @@ -936,22 +959,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
| 936 | } | 959 | } |
| 937 | 960 | ||
| 938 | /* | 961 | /* |
| 939 | * task_rq_lock - lock the runqueue a given task resides on and disable | 962 | * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. |
| 940 | * interrupts. Note the ordering: we can safely lookup the task_rq without | ||
| 941 | * explicitly disabling preemption. | ||
| 942 | */ | 963 | */ |
| 943 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 964 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
| 965 | __acquires(p->pi_lock) | ||
| 944 | __acquires(rq->lock) | 966 | __acquires(rq->lock) |
| 945 | { | 967 | { |
| 946 | struct rq *rq; | 968 | struct rq *rq; |
| 947 | 969 | ||
| 948 | for (;;) { | 970 | for (;;) { |
| 949 | local_irq_save(*flags); | 971 | raw_spin_lock_irqsave(&p->pi_lock, *flags); |
| 950 | rq = task_rq(p); | 972 | rq = task_rq(p); |
| 951 | raw_spin_lock(&rq->lock); | 973 | raw_spin_lock(&rq->lock); |
| 952 | if (likely(rq == task_rq(p))) | 974 | if (likely(rq == task_rq(p))) |
| 953 | return rq; | 975 | return rq; |
| 954 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 976 | raw_spin_unlock(&rq->lock); |
| 977 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
| 955 | } | 978 | } |
| 956 | } | 979 | } |
| 957 | 980 | ||
| @@ -961,10 +984,13 @@ static void __task_rq_unlock(struct rq *rq) | |||
| 961 | raw_spin_unlock(&rq->lock); | 984 | raw_spin_unlock(&rq->lock); |
| 962 | } | 985 | } |
| 963 | 986 | ||
| 964 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 987 | static inline void |
| 988 | task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) | ||
| 965 | __releases(rq->lock) | 989 | __releases(rq->lock) |
| 990 | __releases(p->pi_lock) | ||
| 966 | { | 991 | { |
| 967 | raw_spin_unlock_irqrestore(&rq->lock, *flags); | 992 | raw_spin_unlock(&rq->lock); |
| 993 | raw_spin_unlock_irqrestore(&p->pi_lock, *flags); | ||
| 968 | } | 994 | } |
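task_rq_lock() now nests p->pi_lock outside the runqueue lock and re-checks that the task did not migrate before returning; task_rq_unlock() drops the two locks in reverse order. The sketch below shows that lock, revalidate, retry shape in plain userspace C, offered as an illustration only: pthread mutexes stand in for the raw spinlocks and the struct layout is an assumption for the demo.

    #include <pthread.h>

    struct rq   { pthread_mutex_t lock; };
    struct task { pthread_mutex_t pi_lock; struct rq *rq; /* may change under us */ };

    /* Mirrors the patched task_rq_lock(): pin the task via pi_lock, then lock
     * the rq it currently sits on; if it moved before we got the rq lock,
     * drop both and try again. */
    static struct rq *task_rq_lock_sketch(struct task *p)
    {
        for (;;) {
            pthread_mutex_lock(&p->pi_lock);
            struct rq *rq = p->rq;
            pthread_mutex_lock(&rq->lock);
            if (rq == p->rq)        /* still on the same runqueue? */
                return rq;          /* both locks held, pi_lock -> rq->lock order */
            pthread_mutex_unlock(&rq->lock);
            pthread_mutex_unlock(&p->pi_lock);
        }
    }

    static void task_rq_unlock_sketch(struct task *p, struct rq *rq)
    {
        pthread_mutex_unlock(&rq->lock);    /* reverse order of acquisition */
        pthread_mutex_unlock(&p->pi_lock);
    }

Holding either lock is then enough to keep the task pinned, which is what the task_group() comment above relies on.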
| 969 | 995 | ||
| 970 | /* | 996 | /* |
| @@ -1193,11 +1219,17 @@ int get_nohz_timer_target(void) | |||
| 1193 | int i; | 1219 | int i; |
| 1194 | struct sched_domain *sd; | 1220 | struct sched_domain *sd; |
| 1195 | 1221 | ||
| 1222 | rcu_read_lock(); | ||
| 1196 | for_each_domain(cpu, sd) { | 1223 | for_each_domain(cpu, sd) { |
| 1197 | for_each_cpu(i, sched_domain_span(sd)) | 1224 | for_each_cpu(i, sched_domain_span(sd)) { |
| 1198 | if (!idle_cpu(i)) | 1225 | if (!idle_cpu(i)) { |
| 1199 | return i; | 1226 | cpu = i; |
| 1227 | goto unlock; | ||
| 1228 | } | ||
| 1229 | } | ||
| 1200 | } | 1230 | } |
| 1231 | unlock: | ||
| 1232 | rcu_read_unlock(); | ||
| 1201 | return cpu; | 1233 | return cpu; |
| 1202 | } | 1234 | } |
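Because get_nohz_timer_target() now walks the sched domains under rcu_read_lock(), it can no longer return from the middle of the loop; it records the candidate CPU and funnels every exit through a single unlock. A tiny hedged sketch of the same shape, with a pthread rwlock standing in for RCU and an invented busy[] array in place of idle_cpu():

    #include <pthread.h>

    static pthread_rwlock_t domains_lock = PTHREAD_RWLOCK_INITIALIZER;

    /* Once the walk happens under a lock, every exit path has to go through
     * the unlock, hence the goto instead of an early return. */
    static int pick_busy_cpu(const int *busy, int ncpus, int fallback)
    {
        int cpu = fallback;

        pthread_rwlock_rdlock(&domains_lock);   /* rcu_read_lock() in the patch */
        for (int i = 0; i < ncpus; i++) {
            if (busy[i]) {
                cpu = i;            /* remember, don't return with the lock held */
                goto unlock;
            }
        }
    unlock:
        pthread_rwlock_unlock(&domains_lock);   /* rcu_read_unlock() */
        return cpu;
    }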
| 1203 | /* | 1235 | /* |
| @@ -1307,15 +1339,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, | |||
| 1307 | { | 1339 | { |
| 1308 | u64 tmp; | 1340 | u64 tmp; |
| 1309 | 1341 | ||
| 1342 | /* | ||
| 1343 | * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched | ||
| 1344 | * entities since MIN_SHARES = 2. Treat weight as 1 if less than | ||
| 1345 | * 2^SCHED_LOAD_RESOLUTION. | ||
| 1346 | */ | ||
| 1347 | if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) | ||
| 1348 | tmp = (u64)delta_exec * scale_load_down(weight); | ||
| 1349 | else | ||
| 1350 | tmp = (u64)delta_exec; | ||
| 1351 | |||
| 1310 | if (!lw->inv_weight) { | 1352 | if (!lw->inv_weight) { |
| 1311 | if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) | 1353 | unsigned long w = scale_load_down(lw->weight); |
| 1354 | |||
| 1355 | if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) | ||
| 1312 | lw->inv_weight = 1; | 1356 | lw->inv_weight = 1; |
| 1357 | else if (unlikely(!w)) | ||
| 1358 | lw->inv_weight = WMULT_CONST; | ||
| 1313 | else | 1359 | else |
| 1314 | lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) | 1360 | lw->inv_weight = WMULT_CONST / w; |
| 1315 | / (lw->weight+1); | ||
| 1316 | } | 1361 | } |
| 1317 | 1362 | ||
| 1318 | tmp = (u64)delta_exec * weight; | ||
| 1319 | /* | 1363 | /* |
| 1320 | * Check whether we'd overflow the 64-bit multiplication: | 1364 | * Check whether we'd overflow the 64-bit multiplication: |
| 1321 | */ | 1365 | */ |
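The rewritten calc_delta_mine() still computes delta_exec * weight / lw->weight, but it replaces the per-call divide with a cached reciprocal (lw->inv_weight = WMULT_CONST / w) and a shift. A hedged standalone sketch of that fixed-point trick follows; the constants mirror the 64-bit scheduler values, the overflow guard referred to in the comment above is deliberately left out, and the example numbers are made up.

    #include <stdint.h>
    #include <stdio.h>

    #define WMULT_CONST (1ULL << 32)   /* mirrors the 64-bit kernel value */
    #define WMULT_SHIFT 32

    /* delta * weight / total_weight, with the divide replaced by a cached
     * reciprocal in 32.32 fixed point. The kernel additionally guards the
     * 64-bit multiply against overflow; that is omitted here. */
    static uint64_t calc_delta_sketch(uint64_t delta, uint32_t weight,
                                      uint32_t total_weight)
    {
        uint64_t inv = WMULT_CONST / total_weight;   /* lw->inv_weight */
        return (delta * weight * inv) >> WMULT_SHIFT;
    }

    int main(void)
    {
        /* A nice-0 task (weight 1024) on a runqueue of total weight 3072
         * gets roughly one third of a 6ms slice, i.e. about 2ms. */
        printf("%llu ns\n",
               (unsigned long long)calc_delta_sketch(6000000, 1024, 3072));
        return 0;
    }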
| @@ -1532,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1532 | return rq->avg_load_per_task; | 1576 | return rq->avg_load_per_task; |
| 1533 | } | 1577 | } |
| 1534 | 1578 | ||
| 1535 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 1536 | |||
| 1537 | /* | ||
| 1538 | * Compute the cpu's hierarchical load factor for each task group. | ||
| 1539 | * This needs to be done in a top-down fashion because the load of a child | ||
| 1540 | * group is a fraction of its parents load. | ||
| 1541 | */ | ||
| 1542 | static int tg_load_down(struct task_group *tg, void *data) | ||
| 1543 | { | ||
| 1544 | unsigned long load; | ||
| 1545 | long cpu = (long)data; | ||
| 1546 | |||
| 1547 | if (!tg->parent) { | ||
| 1548 | load = cpu_rq(cpu)->load.weight; | ||
| 1549 | } else { | ||
| 1550 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
| 1551 | load *= tg->se[cpu]->load.weight; | ||
| 1552 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
| 1553 | } | ||
| 1554 | |||
| 1555 | tg->cfs_rq[cpu]->h_load = load; | ||
| 1556 | |||
| 1557 | return 0; | ||
| 1558 | } | ||
| 1559 | |||
| 1560 | static void update_h_load(long cpu) | ||
| 1561 | { | ||
| 1562 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | #endif | ||
| 1566 | |||
| 1567 | #ifdef CONFIG_PREEMPT | 1579 | #ifdef CONFIG_PREEMPT |
| 1568 | 1580 | ||
| 1569 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | 1581 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); |
| @@ -1755,17 +1767,20 @@ static void dec_nr_running(struct rq *rq) | |||
| 1755 | 1767 | ||
| 1756 | static void set_load_weight(struct task_struct *p) | 1768 | static void set_load_weight(struct task_struct *p) |
| 1757 | { | 1769 | { |
| 1770 | int prio = p->static_prio - MAX_RT_PRIO; | ||
| 1771 | struct load_weight *load = &p->se.load; | ||
| 1772 | |||
| 1758 | /* | 1773 | /* |
| 1759 | * SCHED_IDLE tasks get minimal weight: | 1774 | * SCHED_IDLE tasks get minimal weight: |
| 1760 | */ | 1775 | */ |
| 1761 | if (p->policy == SCHED_IDLE) { | 1776 | if (p->policy == SCHED_IDLE) { |
| 1762 | p->se.load.weight = WEIGHT_IDLEPRIO; | 1777 | load->weight = scale_load(WEIGHT_IDLEPRIO); |
| 1763 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 1778 | load->inv_weight = WMULT_IDLEPRIO; |
| 1764 | return; | 1779 | return; |
| 1765 | } | 1780 | } |
| 1766 | 1781 | ||
| 1767 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 1782 | load->weight = scale_load(prio_to_weight[prio]); |
| 1768 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 1783 | load->inv_weight = prio_to_wmult[prio]; |
| 1769 | } | 1784 | } |
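set_load_weight() now stores scale_load(prio_to_weight[prio]), i.e. the nice-level weight shifted up by the load resolution, and calc_delta_mine() shifts it back down with scale_load_down(). The sketch below shows what that buys in isolation; the 10-bit resolution is purely an assumption for the demo, not necessarily what SCHED_LOAD_RESOLUTION is configured to.

    #include <stdio.h>

    /* Illustrative resolution; the patch keys this off SCHED_LOAD_RESOLUTION. */
    #define DEMO_LOAD_RESOLUTION 10

    #define scale_load(w)       ((unsigned long)(w) << DEMO_LOAD_RESOLUTION)
    #define scale_load_down(w)  ((unsigned long)(w) >> DEMO_LOAD_RESOLUTION)

    int main(void)
    {
        unsigned long nice0  = 1024;                 /* prio_to_weight[20] */
        unsigned long stored = scale_load(nice0);    /* what se.load.weight holds */

        /* Group shares divided across many CPUs keep extra low-order bits of
         * precision internally, while external math still sees 1024. */
        printf("stored=%lu, user-visible=%lu\n", stored, scale_load_down(stored));
        return 0;
    }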
| 1770 | 1785 | ||
| 1771 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1786 | static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
| @@ -1773,7 +1788,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1773 | update_rq_clock(rq); | 1788 | update_rq_clock(rq); |
| 1774 | sched_info_queued(p); | 1789 | sched_info_queued(p); |
| 1775 | p->sched_class->enqueue_task(rq, p, flags); | 1790 | p->sched_class->enqueue_task(rq, p, flags); |
| 1776 | p->se.on_rq = 1; | ||
| 1777 | } | 1791 | } |
| 1778 | 1792 | ||
| 1779 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | 1793 | static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) |
| @@ -1781,7 +1795,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 1781 | update_rq_clock(rq); | 1795 | update_rq_clock(rq); |
| 1782 | sched_info_dequeued(p); | 1796 | sched_info_dequeued(p); |
| 1783 | p->sched_class->dequeue_task(rq, p, flags); | 1797 | p->sched_class->dequeue_task(rq, p, flags); |
| 1784 | p->se.on_rq = 0; | ||
| 1785 | } | 1798 | } |
| 1786 | 1799 | ||
| 1787 | /* | 1800 | /* |
| @@ -1916,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr) | |||
| 1916 | } | 1929 | } |
| 1917 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1930 | EXPORT_SYMBOL_GPL(account_system_vtime); |
| 1918 | 1931 | ||
| 1919 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1932 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
| 1933 | |||
| 1934 | #ifdef CONFIG_PARAVIRT | ||
| 1935 | static inline u64 steal_ticks(u64 steal) | ||
| 1920 | { | 1936 | { |
| 1921 | s64 irq_delta; | 1937 | if (unlikely(steal > NSEC_PER_SEC)) |
| 1938 | return div_u64(steal, TICK_NSEC); | ||
| 1939 | |||
| 1940 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
| 1941 | } | ||
| 1942 | #endif | ||
| 1922 | 1943 | ||
| 1944 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
| 1945 | { | ||
| 1946 | /* | ||
| 1947 | * In theory, the compiler should just see 0 here, and optimize out the call | ||
| 1948 | * to sched_rt_avg_update. But I don't trust it... | ||
| 1949 | */ | ||
| 1950 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
| 1951 | s64 steal = 0, irq_delta = 0; | ||
| 1952 | #endif | ||
| 1953 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1923 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | 1954 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
| 1924 | 1955 | ||
| 1925 | /* | 1956 | /* |
| @@ -1942,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
| 1942 | 1973 | ||
| 1943 | rq->prev_irq_time += irq_delta; | 1974 | rq->prev_irq_time += irq_delta; |
| 1944 | delta -= irq_delta; | 1975 | delta -= irq_delta; |
| 1976 | #endif | ||
| 1977 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
| 1978 | if (static_branch((¶virt_steal_rq_enabled))) { | ||
| 1979 | u64 st; | ||
| 1980 | |||
| 1981 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
| 1982 | steal -= rq->prev_steal_time_rq; | ||
| 1983 | |||
| 1984 | if (unlikely(steal > delta)) | ||
| 1985 | steal = delta; | ||
| 1986 | |||
| 1987 | st = steal_ticks(steal); | ||
| 1988 | steal = st * TICK_NSEC; | ||
| 1989 | |||
| 1990 | rq->prev_steal_time_rq += steal; | ||
| 1991 | |||
| 1992 | delta -= steal; | ||
| 1993 | } | ||
| 1994 | #endif | ||
| 1995 | |||
| 1945 | rq->clock_task += delta; | 1996 | rq->clock_task += delta; |
| 1946 | 1997 | ||
| 1947 | if (irq_delta && sched_feat(NONIRQ_POWER)) | 1998 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) |
| 1948 | sched_rt_avg_update(rq, irq_delta); | 1999 | if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) |
| 2000 | sched_rt_avg_update(rq, irq_delta + steal); | ||
| 2001 | #endif | ||
| 1949 | } | 2002 | } |
| 1950 | 2003 | ||
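update_rq_clock_task() now subtracts both the IRQ time and, under CONFIG_PARAVIRT_TIME_ACCOUNTING, the stolen time from the per-task clock, clamping each to whatever delta is left. A hedged arithmetic sketch of that bookkeeping, with made-up numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* One update step: a wall-clock delta comes in, and clock_task only
     * advances by the part neither spent in IRQ context nor stolen by the
     * hypervisor, mirroring the clamps in the patched function. */
    static uint64_t clock_task_delta(uint64_t delta, uint64_t irq_delta,
                                     uint64_t steal)
    {
        if (irq_delta > delta)
            irq_delta = delta;
        delta -= irq_delta;

        if (steal > delta)          /* steal is clamped to what is left */
            steal = delta;
        delta -= steal;

        return delta;               /* what rq->clock_task advances by */
    }

    int main(void)
    {
        /* 4ms of wall time, 1ms spent in IRQs, 2.5ms stolen by the host:
         * the task clock moves only 0.5ms. */
        printf("%llu ns\n",
               (unsigned long long)clock_task_delta(4000000, 1000000, 2500000));
        return 0;
    }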
| 2004 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1951 | static int irqtime_account_hi_update(void) | 2005 | static int irqtime_account_hi_update(void) |
| 1952 | { | 2006 | { |
| 1953 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 2007 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| @@ -1982,12 +2036,7 @@ static int irqtime_account_si_update(void) | |||
| 1982 | 2036 | ||
| 1983 | #define sched_clock_irqtime (0) | 2037 | #define sched_clock_irqtime (0) |
| 1984 | 2038 | ||
| 1985 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 2039 | #endif |
| 1986 | { | ||
| 1987 | rq->clock_task += delta; | ||
| 1988 | } | ||
| 1989 | |||
| 1990 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 1991 | 2040 | ||
| 1992 | #include "sched_idletask.c" | 2041 | #include "sched_idletask.c" |
| 1993 | #include "sched_fair.c" | 2042 | #include "sched_fair.c" |
| @@ -2116,7 +2165,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | |||
| 2116 | * A queue event has occurred, and we're going to schedule. In | 2165 | * A queue event has occurred, and we're going to schedule. In |
| 2117 | * this case, we can save a useless back to back clock update. | 2166 | * this case, we can save a useless back to back clock update. |
| 2118 | */ | 2167 | */ |
| 2119 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | 2168 | if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) |
| 2120 | rq->skip_clock_update = 1; | 2169 | rq->skip_clock_update = 1; |
| 2121 | } | 2170 | } |
| 2122 | 2171 | ||
| @@ -2162,13 +2211,28 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
| 2162 | */ | 2211 | */ |
| 2163 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && | 2212 | WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && |
| 2164 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); | 2213 | !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); |
| 2214 | |||
| 2215 | #ifdef CONFIG_LOCKDEP | ||
| 2216 | /* | ||
| 2217 | * The caller should hold either p->pi_lock or rq->lock when changing | ||
| 2218 | * a task's CPU: ->pi_lock for waking tasks, rq->lock for runnable tasks. | ||
| 2219 | * | ||
| 2220 | * sched_move_task() holds both and thus holding either pins the cgroup, | ||
| 2221 | * see set_task_rq(). | ||
| 2222 | * | ||
| 2223 | * Furthermore, all task_rq users should acquire both locks, see | ||
| 2224 | * task_rq_lock(). | ||
| 2225 | */ | ||
| 2226 | WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || | ||
| 2227 | lockdep_is_held(&task_rq(p)->lock))); | ||
| 2228 | #endif | ||
| 2165 | #endif | 2229 | #endif |
| 2166 | 2230 | ||
| 2167 | trace_sched_migrate_task(p, new_cpu); | 2231 | trace_sched_migrate_task(p, new_cpu); |
| 2168 | 2232 | ||
| 2169 | if (task_cpu(p) != new_cpu) { | 2233 | if (task_cpu(p) != new_cpu) { |
| 2170 | p->se.nr_migrations++; | 2234 | p->se.nr_migrations++; |
| 2171 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); | 2235 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); |
| 2172 | } | 2236 | } |
| 2173 | 2237 | ||
| 2174 | __set_task_cpu(p, new_cpu); | 2238 | __set_task_cpu(p, new_cpu); |
| @@ -2182,19 +2246,6 @@ struct migration_arg { | |||
| 2182 | static int migration_cpu_stop(void *data); | 2246 | static int migration_cpu_stop(void *data); |
| 2183 | 2247 | ||
| 2184 | /* | 2248 | /* |
| 2185 | * The task's runqueue lock must be held. | ||
| 2186 | * Returns true if you have to wait for migration thread. | ||
| 2187 | */ | ||
| 2188 | static bool migrate_task(struct task_struct *p, struct rq *rq) | ||
| 2189 | { | ||
| 2190 | /* | ||
| 2191 | * If the task is not on a runqueue (and not running), then | ||
| 2192 | * the next wake-up will properly place the task. | ||
| 2193 | */ | ||
| 2194 | return p->se.on_rq || task_running(rq, p); | ||
| 2195 | } | ||
| 2196 | |||
| 2197 | /* | ||
| 2198 | * wait_task_inactive - wait for a thread to unschedule. | 2249 | * wait_task_inactive - wait for a thread to unschedule. |
| 2199 | * | 2250 | * |
| 2200 | * If @match_state is nonzero, it's the @p->state value just checked and | 2251 | * If @match_state is nonzero, it's the @p->state value just checked and |
| @@ -2251,11 +2302,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) | |||
| 2251 | rq = task_rq_lock(p, &flags); | 2302 | rq = task_rq_lock(p, &flags); |
| 2252 | trace_sched_wait_task(p); | 2303 | trace_sched_wait_task(p); |
| 2253 | running = task_running(rq, p); | 2304 | running = task_running(rq, p); |
| 2254 | on_rq = p->se.on_rq; | 2305 | on_rq = p->on_rq; |
| 2255 | ncsw = 0; | 2306 | ncsw = 0; |
| 2256 | if (!match_state || p->state == match_state) | 2307 | if (!match_state || p->state == match_state) |
| 2257 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | 2308 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ |
| 2258 | task_rq_unlock(rq, &flags); | 2309 | task_rq_unlock(rq, p, &flags); |
| 2259 | 2310 | ||
| 2260 | /* | 2311 | /* |
| 2261 | * If it changed from the expected state, bail out now. | 2312 | * If it changed from the expected state, bail out now. |
| @@ -2330,7 +2381,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
| 2330 | 2381 | ||
| 2331 | #ifdef CONFIG_SMP | 2382 | #ifdef CONFIG_SMP |
| 2332 | /* | 2383 | /* |
| 2333 | * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. | 2384 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock |
| 2334 | */ | 2385 | */ |
| 2335 | static int select_fallback_rq(int cpu, struct task_struct *p) | 2386 | static int select_fallback_rq(int cpu, struct task_struct *p) |
| 2336 | { | 2387 | { |
| @@ -2363,12 +2414,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
| 2363 | } | 2414 | } |
| 2364 | 2415 | ||
| 2365 | /* | 2416 | /* |
| 2366 | * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. | 2417 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. |
| 2367 | */ | 2418 | */ |
| 2368 | static inline | 2419 | static inline |
| 2369 | int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) | 2420 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
| 2370 | { | 2421 | { |
| 2371 | int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); | 2422 | int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
| 2372 | 2423 | ||
| 2373 | /* | 2424 | /* |
| 2374 | * In order not to call set_task_cpu() on a blocking task we need | 2425 | * In order not to call set_task_cpu() on a blocking task we need |
| @@ -2394,27 +2445,63 @@ static void update_avg(u64 *avg, u64 sample) | |||
| 2394 | } | 2445 | } |
| 2395 | #endif | 2446 | #endif |
| 2396 | 2447 | ||
| 2397 | static inline void ttwu_activate(struct task_struct *p, struct rq *rq, | 2448 | static void |
| 2398 | bool is_sync, bool is_migrate, bool is_local, | 2449 | ttwu_stat(struct task_struct *p, int cpu, int wake_flags) |
| 2399 | unsigned long en_flags) | ||
| 2400 | { | 2450 | { |
| 2401 | schedstat_inc(p, se.statistics.nr_wakeups); | 2451 | #ifdef CONFIG_SCHEDSTATS |
| 2402 | if (is_sync) | 2452 | struct rq *rq = this_rq(); |
| 2403 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | 2453 | |
| 2404 | if (is_migrate) | 2454 | #ifdef CONFIG_SMP |
| 2405 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | 2455 | int this_cpu = smp_processor_id(); |
| 2406 | if (is_local) | 2456 | |
| 2457 | if (cpu == this_cpu) { | ||
| 2458 | schedstat_inc(rq, ttwu_local); | ||
| 2407 | schedstat_inc(p, se.statistics.nr_wakeups_local); | 2459 | schedstat_inc(p, se.statistics.nr_wakeups_local); |
| 2408 | else | 2460 | } else { |
| 2461 | struct sched_domain *sd; | ||
| 2462 | |||
| 2409 | schedstat_inc(p, se.statistics.nr_wakeups_remote); | 2463 | schedstat_inc(p, se.statistics.nr_wakeups_remote); |
| 2464 | rcu_read_lock(); | ||
| 2465 | for_each_domain(this_cpu, sd) { | ||
| 2466 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
| 2467 | schedstat_inc(sd, ttwu_wake_remote); | ||
| 2468 | break; | ||
| 2469 | } | ||
| 2470 | } | ||
| 2471 | rcu_read_unlock(); | ||
| 2472 | } | ||
| 2473 | |||
| 2474 | if (wake_flags & WF_MIGRATED) | ||
| 2475 | schedstat_inc(p, se.statistics.nr_wakeups_migrate); | ||
| 2410 | 2476 | ||
| 2477 | #endif /* CONFIG_SMP */ | ||
| 2478 | |||
| 2479 | schedstat_inc(rq, ttwu_count); | ||
| 2480 | schedstat_inc(p, se.statistics.nr_wakeups); | ||
| 2481 | |||
| 2482 | if (wake_flags & WF_SYNC) | ||
| 2483 | schedstat_inc(p, se.statistics.nr_wakeups_sync); | ||
| 2484 | |||
| 2485 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 2486 | } | ||
| 2487 | |||
| 2488 | static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) | ||
| 2489 | { | ||
| 2411 | activate_task(rq, p, en_flags); | 2490 | activate_task(rq, p, en_flags); |
| 2491 | p->on_rq = 1; | ||
| 2492 | |||
| 2493 | /* if a worker is waking up, notify workqueue */ | ||
| 2494 | if (p->flags & PF_WQ_WORKER) | ||
| 2495 | wq_worker_waking_up(p, cpu_of(rq)); | ||
| 2412 | } | 2496 | } |
| 2413 | 2497 | ||
| 2414 | static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | 2498 | /* |
| 2415 | int wake_flags, bool success) | 2499 | * Mark the task runnable and perform wakeup-preemption. |
| 2500 | */ | ||
| 2501 | static void | ||
| 2502 | ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) | ||
| 2416 | { | 2503 | { |
| 2417 | trace_sched_wakeup(p, success); | 2504 | trace_sched_wakeup(p, true); |
| 2418 | check_preempt_curr(rq, p, wake_flags); | 2505 | check_preempt_curr(rq, p, wake_flags); |
| 2419 | 2506 | ||
| 2420 | p->state = TASK_RUNNING; | 2507 | p->state = TASK_RUNNING; |
| @@ -2422,7 +2509,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
| 2422 | if (p->sched_class->task_woken) | 2509 | if (p->sched_class->task_woken) |
| 2423 | p->sched_class->task_woken(rq, p); | 2510 | p->sched_class->task_woken(rq, p); |
| 2424 | 2511 | ||
| 2425 | if (unlikely(rq->idle_stamp)) { | 2512 | if (rq->idle_stamp) { |
| 2426 | u64 delta = rq->clock - rq->idle_stamp; | 2513 | u64 delta = rq->clock - rq->idle_stamp; |
| 2427 | u64 max = 2*sysctl_sched_migration_cost; | 2514 | u64 max = 2*sysctl_sched_migration_cost; |
| 2428 | 2515 | ||
| @@ -2433,9 +2520,151 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
| 2433 | rq->idle_stamp = 0; | 2520 | rq->idle_stamp = 0; |
| 2434 | } | 2521 | } |
| 2435 | #endif | 2522 | #endif |
| 2436 | /* if a worker is waking up, notify workqueue */ | 2523 | } |
| 2437 | if ((p->flags & PF_WQ_WORKER) && success) | 2524 | |
| 2438 | wq_worker_waking_up(p, cpu_of(rq)); | 2525 | static void |
| 2526 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) | ||
| 2527 | { | ||
| 2528 | #ifdef CONFIG_SMP | ||
| 2529 | if (p->sched_contributes_to_load) | ||
| 2530 | rq->nr_uninterruptible--; | ||
| 2531 | #endif | ||
| 2532 | |||
| 2533 | ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); | ||
| 2534 | ttwu_do_wakeup(rq, p, wake_flags); | ||
| 2535 | } | ||
| 2536 | |||
| 2537 | /* | ||
| 2538 | * Called in case the task @p isn't fully descheduled from its runqueue; | ||
| 2539 | * in that case we must do a remote wakeup. It's a 'light' wakeup though, | ||
| 2540 | * since all we need to do is flip p->state to TASK_RUNNING, as | ||
| 2541 | * the task is still ->on_rq. | ||
| 2542 | */ | ||
| 2543 | static int ttwu_remote(struct task_struct *p, int wake_flags) | ||
| 2544 | { | ||
| 2545 | struct rq *rq; | ||
| 2546 | int ret = 0; | ||
| 2547 | |||
| 2548 | rq = __task_rq_lock(p); | ||
| 2549 | if (p->on_rq) { | ||
| 2550 | ttwu_do_wakeup(rq, p, wake_flags); | ||
| 2551 | ret = 1; | ||
| 2552 | } | ||
| 2553 | __task_rq_unlock(rq); | ||
| 2554 | |||
| 2555 | return ret; | ||
| 2556 | } | ||
| 2557 | |||
| 2558 | #ifdef CONFIG_SMP | ||
| 2559 | static void sched_ttwu_do_pending(struct task_struct *list) | ||
| 2560 | { | ||
| 2561 | struct rq *rq = this_rq(); | ||
| 2562 | |||
| 2563 | raw_spin_lock(&rq->lock); | ||
| 2564 | |||
| 2565 | while (list) { | ||
| 2566 | struct task_struct *p = list; | ||
| 2567 | list = list->wake_entry; | ||
| 2568 | ttwu_do_activate(rq, p, 0); | ||
| 2569 | } | ||
| 2570 | |||
| 2571 | raw_spin_unlock(&rq->lock); | ||
| 2572 | } | ||
| 2573 | |||
| 2574 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 2575 | |||
| 2576 | static void sched_ttwu_pending(void) | ||
| 2577 | { | ||
| 2578 | struct rq *rq = this_rq(); | ||
| 2579 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
| 2580 | |||
| 2581 | if (!list) | ||
| 2582 | return; | ||
| 2583 | |||
| 2584 | sched_ttwu_do_pending(list); | ||
| 2585 | } | ||
| 2586 | |||
| 2587 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 2588 | |||
| 2589 | void scheduler_ipi(void) | ||
| 2590 | { | ||
| 2591 | struct rq *rq = this_rq(); | ||
| 2592 | struct task_struct *list = xchg(&rq->wake_list, NULL); | ||
| 2593 | |||
| 2594 | if (!list) | ||
| 2595 | return; | ||
| 2596 | |||
| 2597 | /* | ||
| 2598 | * Not all reschedule IPI handlers call irq_enter/irq_exit, since | ||
| 2599 | * traditionally all their work was done from the interrupt return | ||
| 2600 | * path. Now that we actually do some work, we need to make sure | ||
| 2601 | * we do call them. | ||
| 2602 | * | ||
| 2603 | * Some archs already do call them; luckily irq_enter/exit nest | ||
| 2604 | * properly. | ||
| 2605 | * | ||
| 2606 | * Arguably we should visit all archs and update all handlers, | ||
| 2607 | * however a fair share of IPIs are still resched-only, so this would | ||
| 2608 | * somewhat pessimize the simple resched case. | ||
| 2609 | */ | ||
| 2610 | irq_enter(); | ||
| 2611 | sched_ttwu_do_pending(list); | ||
| 2612 | irq_exit(); | ||
| 2613 | } | ||
| 2614 | |||
| 2615 | static void ttwu_queue_remote(struct task_struct *p, int cpu) | ||
| 2616 | { | ||
| 2617 | struct rq *rq = cpu_rq(cpu); | ||
| 2618 | struct task_struct *next = rq->wake_list; | ||
| 2619 | |||
| 2620 | for (;;) { | ||
| 2621 | struct task_struct *old = next; | ||
| 2622 | |||
| 2623 | p->wake_entry = next; | ||
| 2624 | next = cmpxchg(&rq->wake_list, old, p); | ||
| 2625 | if (next == old) | ||
| 2626 | break; | ||
| 2627 | } | ||
| 2628 | |||
| 2629 | if (!next) | ||
| 2630 | smp_send_reschedule(cpu); | ||
| 2631 | } | ||
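ttwu_queue_remote() pushes the task onto rq->wake_list with a cmpxchg() loop and only sends the reschedule IPI when the list was previously empty, while scheduler_ipi() drains the whole list with a single xchg(). A hedged userspace sketch of that multi-producer, single-consumer pattern with C11 atomics; struct node and the helper names are invented for the demo.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct node {
        struct node *next;      /* plays the role of p->wake_entry */
    };

    /* Producer side (any CPU): push one node and report whether the list was
     * empty, i.e. whether the caller should send the wakeup IPI. */
    static bool push(_Atomic(struct node *) *head, struct node *n)
    {
        struct node *old = atomic_load(head);

        do {
            n->next = old;      /* link before publishing */
        } while (!atomic_compare_exchange_weak(head, &old, n));

        return old == NULL;
    }

    /* Consumer side (the target CPU, from the IPI): take everything at once,
     * the xchg(&rq->wake_list, NULL) in the patch. */
    static struct node *drain(_Atomic(struct node *) *head)
    {
        return atomic_exchange(head, NULL);
    }

Sending the IPI only on the empty-to-non-empty transition is what keeps a burst of remote wakeups down to one interrupt.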
| 2632 | |||
| 2633 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 2634 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
| 2635 | { | ||
| 2636 | struct rq *rq; | ||
| 2637 | int ret = 0; | ||
| 2638 | |||
| 2639 | rq = __task_rq_lock(p); | ||
| 2640 | if (p->on_cpu) { | ||
| 2641 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
| 2642 | ttwu_do_wakeup(rq, p, wake_flags); | ||
| 2643 | ret = 1; | ||
| 2644 | } | ||
| 2645 | __task_rq_unlock(rq); | ||
| 2646 | |||
| 2647 | return ret; | ||
| 2648 | |||
| 2649 | } | ||
| 2650 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 2651 | #endif /* CONFIG_SMP */ | ||
| 2652 | |||
| 2653 | static void ttwu_queue(struct task_struct *p, int cpu) | ||
| 2654 | { | ||
| 2655 | struct rq *rq = cpu_rq(cpu); | ||
| 2656 | |||
| 2657 | #if defined(CONFIG_SMP) | ||
| 2658 | if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { | ||
| 2659 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | ||
| 2660 | ttwu_queue_remote(p, cpu); | ||
| 2661 | return; | ||
| 2662 | } | ||
| 2663 | #endif | ||
| 2664 | |||
| 2665 | raw_spin_lock(&rq->lock); | ||
| 2666 | ttwu_do_activate(rq, p, 0); | ||
| 2667 | raw_spin_unlock(&rq->lock); | ||
| 2439 | } | 2668 | } |
| 2440 | 2669 | ||
| 2441 | /** | 2670 | /** |
| @@ -2453,92 +2682,66 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, | |||
| 2453 | * Returns %true if @p was woken up, %false if it was already running | 2682 | * Returns %true if @p was woken up, %false if it was already running |
| 2454 | * or @state didn't match @p's state. | 2683 | * or @state didn't match @p's state. |
| 2455 | */ | 2684 | */ |
| 2456 | static int try_to_wake_up(struct task_struct *p, unsigned int state, | 2685 | static int |
| 2457 | int wake_flags) | 2686 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
| 2458 | { | 2687 | { |
| 2459 | int cpu, orig_cpu, this_cpu, success = 0; | ||
| 2460 | unsigned long flags; | 2688 | unsigned long flags; |
| 2461 | unsigned long en_flags = ENQUEUE_WAKEUP; | 2689 | int cpu, success = 0; |
| 2462 | struct rq *rq; | ||
| 2463 | |||
| 2464 | this_cpu = get_cpu(); | ||
| 2465 | 2690 | ||
| 2466 | smp_wmb(); | 2691 | smp_wmb(); |
| 2467 | rq = task_rq_lock(p, &flags); | 2692 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 2468 | if (!(p->state & state)) | 2693 | if (!(p->state & state)) |
| 2469 | goto out; | 2694 | goto out; |
| 2470 | 2695 | ||
| 2471 | if (p->se.on_rq) | 2696 | success = 1; /* we're going to change ->state */ |
| 2472 | goto out_running; | ||
| 2473 | |||
| 2474 | cpu = task_cpu(p); | 2697 | cpu = task_cpu(p); |
| 2475 | orig_cpu = cpu; | ||
| 2476 | 2698 | ||
| 2477 | #ifdef CONFIG_SMP | 2699 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
| 2478 | if (unlikely(task_running(rq, p))) | 2700 | goto stat; |
| 2479 | goto out_activate; | ||
| 2480 | 2701 | ||
| 2702 | #ifdef CONFIG_SMP | ||
| 2481 | /* | 2703 | /* |
| 2482 | * In order to handle concurrent wakeups and release the rq->lock | 2704 | * If the owning (remote) cpu is still in the middle of schedule() with |
| 2483 | * we put the task in TASK_WAKING state. | 2705 | * this task as prev, wait until it's done referencing the task. |
| 2484 | * | ||
| 2485 | * First fix up the nr_uninterruptible count: | ||
| 2486 | */ | 2706 | */ |
| 2487 | if (task_contributes_to_load(p)) { | 2707 | while (p->on_cpu) { |
| 2488 | if (likely(cpu_online(orig_cpu))) | 2708 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
| 2489 | rq->nr_uninterruptible--; | 2709 | /* |
| 2490 | else | 2710 | * In case the architecture enables interrupts in |
| 2491 | this_rq()->nr_uninterruptible--; | 2711 | * context_switch(), we cannot busy wait, since that |
| 2712 | * would lead to deadlocks when an interrupt hits and | ||
| 2713 | * tries to wake up @prev. So bail and do a complete | ||
| 2714 | * remote wakeup. | ||
| 2715 | */ | ||
| 2716 | if (ttwu_activate_remote(p, wake_flags)) | ||
| 2717 | goto stat; | ||
| 2718 | #else | ||
| 2719 | cpu_relax(); | ||
| 2720 | #endif | ||
| 2492 | } | 2721 | } |
| 2722 | /* | ||
| 2723 | * Pairs with the smp_wmb() in finish_lock_switch(). | ||
| 2724 | */ | ||
| 2725 | smp_rmb(); | ||
| 2726 | |||
| 2727 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | ||
| 2493 | p->state = TASK_WAKING; | 2728 | p->state = TASK_WAKING; |
| 2494 | 2729 | ||
| 2495 | if (p->sched_class->task_waking) { | 2730 | if (p->sched_class->task_waking) |
| 2496 | p->sched_class->task_waking(rq, p); | 2731 | p->sched_class->task_waking(p); |
| 2497 | en_flags |= ENQUEUE_WAKING; | ||
| 2498 | } | ||
| 2499 | 2732 | ||
| 2500 | cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); | 2733 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
| 2501 | if (cpu != orig_cpu) | 2734 | if (task_cpu(p) != cpu) { |
| 2735 | wake_flags |= WF_MIGRATED; | ||
| 2502 | set_task_cpu(p, cpu); | 2736 | set_task_cpu(p, cpu); |
| 2503 | __task_rq_unlock(rq); | ||
| 2504 | |||
| 2505 | rq = cpu_rq(cpu); | ||
| 2506 | raw_spin_lock(&rq->lock); | ||
| 2507 | |||
| 2508 | /* | ||
| 2509 | * We migrated the task without holding either rq->lock, however | ||
| 2510 | * since the task is not on the task list itself, nobody else | ||
| 2511 | * will try and migrate the task, hence the rq should match the | ||
| 2512 | * cpu we just moved it to. | ||
| 2513 | */ | ||
| 2514 | WARN_ON(task_cpu(p) != cpu); | ||
| 2515 | WARN_ON(p->state != TASK_WAKING); | ||
| 2516 | |||
| 2517 | #ifdef CONFIG_SCHEDSTATS | ||
| 2518 | schedstat_inc(rq, ttwu_count); | ||
| 2519 | if (cpu == this_cpu) | ||
| 2520 | schedstat_inc(rq, ttwu_local); | ||
| 2521 | else { | ||
| 2522 | struct sched_domain *sd; | ||
| 2523 | for_each_domain(this_cpu, sd) { | ||
| 2524 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
| 2525 | schedstat_inc(sd, ttwu_wake_remote); | ||
| 2526 | break; | ||
| 2527 | } | ||
| 2528 | } | ||
| 2529 | } | 2737 | } |
| 2530 | #endif /* CONFIG_SCHEDSTATS */ | ||
| 2531 | |||
| 2532 | out_activate: | ||
| 2533 | #endif /* CONFIG_SMP */ | 2738 | #endif /* CONFIG_SMP */ |
| 2534 | ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, | 2739 | |
| 2535 | cpu == this_cpu, en_flags); | 2740 | ttwu_queue(p, cpu); |
| 2536 | success = 1; | 2741 | stat: |
| 2537 | out_running: | 2742 | ttwu_stat(p, cpu, wake_flags); |
| 2538 | ttwu_post_activation(p, rq, wake_flags, success); | ||
| 2539 | out: | 2743 | out: |
| 2540 | task_rq_unlock(rq, &flags); | 2744 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2541 | put_cpu(); | ||
| 2542 | 2745 | ||
| 2543 | return success; | 2746 | return success; |
| 2544 | } | 2747 | } |
| @@ -2547,31 +2750,34 @@ out: | |||
| 2547 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2750 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
| 2548 | * @p: the thread to be awakened | 2751 | * @p: the thread to be awakened |
| 2549 | * | 2752 | * |
| 2550 | * Put @p on the run-queue if it's not already there. The caller must | 2753 | * Put @p on the run-queue if it's not already there. The caller must |
| 2551 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2754 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
| 2552 | * the current task. this_rq() stays locked over invocation. | 2755 | * the current task. |
| 2553 | */ | 2756 | */ |
| 2554 | static void try_to_wake_up_local(struct task_struct *p) | 2757 | static void try_to_wake_up_local(struct task_struct *p) |
| 2555 | { | 2758 | { |
| 2556 | struct rq *rq = task_rq(p); | 2759 | struct rq *rq = task_rq(p); |
| 2557 | bool success = false; | ||
| 2558 | 2760 | ||
| 2559 | BUG_ON(rq != this_rq()); | 2761 | BUG_ON(rq != this_rq()); |
| 2560 | BUG_ON(p == current); | 2762 | BUG_ON(p == current); |
| 2561 | lockdep_assert_held(&rq->lock); | 2763 | lockdep_assert_held(&rq->lock); |
| 2562 | 2764 | ||
| 2765 | if (!raw_spin_trylock(&p->pi_lock)) { | ||
| 2766 | raw_spin_unlock(&rq->lock); | ||
| 2767 | raw_spin_lock(&p->pi_lock); | ||
| 2768 | raw_spin_lock(&rq->lock); | ||
| 2769 | } | ||
| 2770 | |||
| 2563 | if (!(p->state & TASK_NORMAL)) | 2771 | if (!(p->state & TASK_NORMAL)) |
| 2564 | return; | 2772 | goto out; |
| 2565 | 2773 | ||
| 2566 | if (!p->se.on_rq) { | 2774 | if (!p->on_rq) |
| 2567 | if (likely(!task_running(rq, p))) { | 2775 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
| 2568 | schedstat_inc(rq, ttwu_count); | 2776 | |
| 2569 | schedstat_inc(rq, ttwu_local); | 2777 | ttwu_do_wakeup(rq, p, 0); |
| 2570 | } | 2778 | ttwu_stat(p, smp_processor_id(), 0); |
| 2571 | ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); | 2779 | out: |
| 2572 | success = true; | 2780 | raw_spin_unlock(&p->pi_lock); |
| 2573 | } | ||
| 2574 | ttwu_post_activation(p, rq, 0, success); | ||
| 2575 | } | 2781 | } |
| 2576 | 2782 | ||
| 2577 | /** | 2783 | /** |
| @@ -2604,19 +2810,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) | |||
| 2604 | */ | 2810 | */ |
| 2605 | static void __sched_fork(struct task_struct *p) | 2811 | static void __sched_fork(struct task_struct *p) |
| 2606 | { | 2812 | { |
| 2813 | p->on_rq = 0; | ||
| 2814 | |||
| 2815 | p->se.on_rq = 0; | ||
| 2607 | p->se.exec_start = 0; | 2816 | p->se.exec_start = 0; |
| 2608 | p->se.sum_exec_runtime = 0; | 2817 | p->se.sum_exec_runtime = 0; |
| 2609 | p->se.prev_sum_exec_runtime = 0; | 2818 | p->se.prev_sum_exec_runtime = 0; |
| 2610 | p->se.nr_migrations = 0; | 2819 | p->se.nr_migrations = 0; |
| 2611 | p->se.vruntime = 0; | 2820 | p->se.vruntime = 0; |
| 2821 | INIT_LIST_HEAD(&p->se.group_node); | ||
| 2612 | 2822 | ||
| 2613 | #ifdef CONFIG_SCHEDSTATS | 2823 | #ifdef CONFIG_SCHEDSTATS |
| 2614 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2824 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
| 2615 | #endif | 2825 | #endif |
| 2616 | 2826 | ||
| 2617 | INIT_LIST_HEAD(&p->rt.run_list); | 2827 | INIT_LIST_HEAD(&p->rt.run_list); |
| 2618 | p->se.on_rq = 0; | ||
| 2619 | INIT_LIST_HEAD(&p->se.group_node); | ||
| 2620 | 2828 | ||
| 2621 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2829 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| 2622 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2830 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
| @@ -2626,8 +2834,9 @@ static void __sched_fork(struct task_struct *p) | |||
| 2626 | /* | 2834 | /* |
| 2627 | * fork()/clone()-time setup: | 2835 | * fork()/clone()-time setup: |
| 2628 | */ | 2836 | */ |
| 2629 | void sched_fork(struct task_struct *p, int clone_flags) | 2837 | void sched_fork(struct task_struct *p) |
| 2630 | { | 2838 | { |
| 2839 | unsigned long flags; | ||
| 2631 | int cpu = get_cpu(); | 2840 | int cpu = get_cpu(); |
| 2632 | 2841 | ||
| 2633 | __sched_fork(p); | 2842 | __sched_fork(p); |
| @@ -2678,18 +2887,18 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2678 | * | 2887 | * |
| 2679 | * Silence PROVE_RCU. | 2888 | * Silence PROVE_RCU. |
| 2680 | */ | 2889 | */ |
| 2681 | rcu_read_lock(); | 2890 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 2682 | set_task_cpu(p, cpu); | 2891 | set_task_cpu(p, cpu); |
| 2683 | rcu_read_unlock(); | 2892 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2684 | 2893 | ||
| 2685 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2894 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
| 2686 | if (likely(sched_info_on())) | 2895 | if (likely(sched_info_on())) |
| 2687 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2896 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
| 2688 | #endif | 2897 | #endif |
| 2689 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 2898 | #if defined(CONFIG_SMP) |
| 2690 | p->oncpu = 0; | 2899 | p->on_cpu = 0; |
| 2691 | #endif | 2900 | #endif |
| 2692 | #ifdef CONFIG_PREEMPT | 2901 | #ifdef CONFIG_PREEMPT_COUNT |
| 2693 | /* Want to start with kernel preemption disabled. */ | 2902 | /* Want to start with kernel preemption disabled. */ |
| 2694 | task_thread_info(p)->preempt_count = 1; | 2903 | task_thread_info(p)->preempt_count = 1; |
| 2695 | #endif | 2904 | #endif |
| @@ -2707,41 +2916,31 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2707 | * that must be done for every newly created context, then puts the task | 2916 | * that must be done for every newly created context, then puts the task |
| 2708 | * on the runqueue and wakes it. | 2917 | * on the runqueue and wakes it. |
| 2709 | */ | 2918 | */ |
| 2710 | void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 2919 | void wake_up_new_task(struct task_struct *p) |
| 2711 | { | 2920 | { |
| 2712 | unsigned long flags; | 2921 | unsigned long flags; |
| 2713 | struct rq *rq; | 2922 | struct rq *rq; |
| 2714 | int cpu __maybe_unused = get_cpu(); | ||
| 2715 | 2923 | ||
| 2924 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
| 2716 | #ifdef CONFIG_SMP | 2925 | #ifdef CONFIG_SMP |
| 2717 | rq = task_rq_lock(p, &flags); | ||
| 2718 | p->state = TASK_WAKING; | ||
| 2719 | |||
| 2720 | /* | 2926 | /* |
| 2721 | * Fork balancing, do it here and not earlier because: | 2927 | * Fork balancing, do it here and not earlier because: |
| 2722 | * - cpus_allowed can change in the fork path | 2928 | * - cpus_allowed can change in the fork path |
| 2723 | * - any previously selected cpu might disappear through hotplug | 2929 | * - any previously selected cpu might disappear through hotplug |
| 2724 | * | ||
| 2725 | * We set TASK_WAKING so that select_task_rq() can drop rq->lock | ||
| 2726 | * without people poking at ->cpus_allowed. | ||
| 2727 | */ | 2930 | */ |
| 2728 | cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); | 2931 | set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); |
| 2729 | set_task_cpu(p, cpu); | ||
| 2730 | |||
| 2731 | p->state = TASK_RUNNING; | ||
| 2732 | task_rq_unlock(rq, &flags); | ||
| 2733 | #endif | 2932 | #endif |
| 2734 | 2933 | ||
| 2735 | rq = task_rq_lock(p, &flags); | 2934 | rq = __task_rq_lock(p); |
| 2736 | activate_task(rq, p, 0); | 2935 | activate_task(rq, p, 0); |
| 2737 | trace_sched_wakeup_new(p, 1); | 2936 | p->on_rq = 1; |
| 2937 | trace_sched_wakeup_new(p, true); | ||
| 2738 | check_preempt_curr(rq, p, WF_FORK); | 2938 | check_preempt_curr(rq, p, WF_FORK); |
| 2739 | #ifdef CONFIG_SMP | 2939 | #ifdef CONFIG_SMP |
| 2740 | if (p->sched_class->task_woken) | 2940 | if (p->sched_class->task_woken) |
| 2741 | p->sched_class->task_woken(rq, p); | 2941 | p->sched_class->task_woken(rq, p); |
| 2742 | #endif | 2942 | #endif |
| 2743 | task_rq_unlock(rq, &flags); | 2943 | task_rq_unlock(rq, p, &flags); |
| 2744 | put_cpu(); | ||
| 2745 | } | 2944 | } |
| 2746 | 2945 | ||
| 2747 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 2946 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
| @@ -3450,27 +3649,22 @@ void sched_exec(void) | |||
| 3450 | { | 3649 | { |
| 3451 | struct task_struct *p = current; | 3650 | struct task_struct *p = current; |
| 3452 | unsigned long flags; | 3651 | unsigned long flags; |
| 3453 | struct rq *rq; | ||
| 3454 | int dest_cpu; | 3652 | int dest_cpu; |
| 3455 | 3653 | ||
| 3456 | rq = task_rq_lock(p, &flags); | 3654 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 3457 | dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); | 3655 | dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
| 3458 | if (dest_cpu == smp_processor_id()) | 3656 | if (dest_cpu == smp_processor_id()) |
| 3459 | goto unlock; | 3657 | goto unlock; |
| 3460 | 3658 | ||
| 3461 | /* | 3659 | if (likely(cpu_active(dest_cpu))) { |
| 3462 | * select_task_rq() can race against ->cpus_allowed | ||
| 3463 | */ | ||
| 3464 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | ||
| 3465 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { | ||
| 3466 | struct migration_arg arg = { p, dest_cpu }; | 3660 | struct migration_arg arg = { p, dest_cpu }; |
| 3467 | 3661 | ||
| 3468 | task_rq_unlock(rq, &flags); | 3662 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 3469 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 3663 | stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); |
| 3470 | return; | 3664 | return; |
| 3471 | } | 3665 | } |
| 3472 | unlock: | 3666 | unlock: |
| 3473 | task_rq_unlock(rq, &flags); | 3667 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 3474 | } | 3668 | } |
| 3475 | 3669 | ||
| 3476 | #endif | 3670 | #endif |
| @@ -3507,7 +3701,7 @@ unsigned long long task_delta_exec(struct task_struct *p) | |||
| 3507 | 3701 | ||
| 3508 | rq = task_rq_lock(p, &flags); | 3702 | rq = task_rq_lock(p, &flags); |
| 3509 | ns = do_task_delta_exec(p, rq); | 3703 | ns = do_task_delta_exec(p, rq); |
| 3510 | task_rq_unlock(rq, &flags); | 3704 | task_rq_unlock(rq, p, &flags); |
| 3511 | 3705 | ||
| 3512 | return ns; | 3706 | return ns; |
| 3513 | } | 3707 | } |
| @@ -3525,7 +3719,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 3525 | 3719 | ||
| 3526 | rq = task_rq_lock(p, &flags); | 3720 | rq = task_rq_lock(p, &flags); |
| 3527 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); | 3721 | ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); |
| 3528 | task_rq_unlock(rq, &flags); | 3722 | task_rq_unlock(rq, p, &flags); |
| 3529 | 3723 | ||
| 3530 | return ns; | 3724 | return ns; |
| 3531 | } | 3725 | } |
| @@ -3549,7 +3743,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) | |||
| 3549 | rq = task_rq_lock(p, &flags); | 3743 | rq = task_rq_lock(p, &flags); |
| 3550 | thread_group_cputime(p, &totals); | 3744 | thread_group_cputime(p, &totals); |
| 3551 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); | 3745 | ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); |
| 3552 | task_rq_unlock(rq, &flags); | 3746 | task_rq_unlock(rq, p, &flags); |
| 3553 | 3747 | ||
| 3554 | return ns; | 3748 | return ns; |
| 3555 | } | 3749 | } |
| @@ -3695,6 +3889,25 @@ void account_idle_time(cputime_t cputime) | |||
| 3695 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); | 3889 | cpustat->idle = cputime64_add(cpustat->idle, cputime64); |
| 3696 | } | 3890 | } |
| 3697 | 3891 | ||
| 3892 | static __always_inline bool steal_account_process_tick(void) | ||
| 3893 | { | ||
| 3894 | #ifdef CONFIG_PARAVIRT | ||
| 3895 | if (static_branch(¶virt_steal_enabled)) { | ||
| 3896 | u64 steal, st = 0; | ||
| 3897 | |||
| 3898 | steal = paravirt_steal_clock(smp_processor_id()); | ||
| 3899 | steal -= this_rq()->prev_steal_time; | ||
| 3900 | |||
| 3901 | st = steal_ticks(steal); | ||
| 3902 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
| 3903 | |||
| 3904 | account_steal_time(st); | ||
| 3905 | return st; | ||
| 3906 | } | ||
| 3907 | #endif | ||
| 3908 | return false; | ||
| 3909 | } | ||
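steal_account_process_tick() charges only the steal time accrued since the last tick, converts it to whole ticks, and carries the sub-tick remainder forward in prev_steal_time. A hedged standalone sketch of that bookkeeping; TICK_NSEC below assumes HZ=1000 and the paravirt steal clock is faked, both purely for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL
    #define TICK_NSEC    (NSEC_PER_SEC / 1000)   /* assumes HZ=1000 for the demo */

    static uint64_t prev_steal_time;             /* rq->prev_steal_time in the patch */

    /* Stand-in for paravirt_steal_clock(): total stolen ns reported so far. */
    static uint64_t fake_steal_clock(void)
    {
        static uint64_t total;
        total += 2500000;       /* pretend 2.5ms were stolen since the last tick */
        return total;
    }

    /* One tick's worth of accounting: charge whole ticks only and keep the
     * remainder for next time. */
    static uint64_t account_steal_tick(void)
    {
        uint64_t steal = fake_steal_clock() - prev_steal_time;
        uint64_t ticks = steal / TICK_NSEC;

        prev_steal_time += ticks * TICK_NSEC;    /* leave the sub-tick remainder */
        return ticks;                            /* what account_steal_time() gets */
    }

    int main(void)
    {
        for (int i = 0; i < 4; i++)
            printf("tick %d: %llu stolen tick(s)\n", i,
                   (unsigned long long)account_steal_tick());
        return 0;
    }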
| 3910 | |||
| 3698 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3911 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
| 3699 | 3912 | ||
| 3700 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 3913 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
| @@ -3726,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
| 3726 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | 3939 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); |
| 3727 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3940 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
| 3728 | 3941 | ||
| 3942 | if (steal_account_process_tick()) | ||
| 3943 | return; | ||
| 3944 | |||
| 3729 | if (irqtime_account_hi_update()) { | 3945 | if (irqtime_account_hi_update()) { |
| 3730 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3946 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
| 3731 | } else if (irqtime_account_si_update()) { | 3947 | } else if (irqtime_account_si_update()) { |
| @@ -3779,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
| 3779 | return; | 3995 | return; |
| 3780 | } | 3996 | } |
| 3781 | 3997 | ||
| 3998 | if (steal_account_process_tick()) | ||
| 3999 | return; | ||
| 4000 | |||
| 3782 | if (user_tick) | 4001 | if (user_tick) |
| 3783 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 4002 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
| 3784 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 4003 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
| @@ -3903,9 +4122,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
| 3903 | /* | 4122 | /* |
| 3904 | * This function gets called by the timer code, with HZ frequency. | 4123 | * This function gets called by the timer code, with HZ frequency. |
| 3905 | * We call it with interrupts disabled. | 4124 | * We call it with interrupts disabled. |
| 3906 | * | ||
| 3907 | * It also gets called by the fork code, when changing the parent's | ||
| 3908 | * timeslices. | ||
| 3909 | */ | 4125 | */ |
| 3910 | void scheduler_tick(void) | 4126 | void scheduler_tick(void) |
| 3911 | { | 4127 | { |
| @@ -4025,17 +4241,11 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 4025 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 4241 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| 4026 | 4242 | ||
| 4027 | schedstat_inc(this_rq(), sched_count); | 4243 | schedstat_inc(this_rq(), sched_count); |
| 4028 | #ifdef CONFIG_SCHEDSTATS | ||
| 4029 | if (unlikely(prev->lock_depth >= 0)) { | ||
| 4030 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); | ||
| 4031 | schedstat_inc(prev, sched_info.bkl_count); | ||
| 4032 | } | ||
| 4033 | #endif | ||
| 4034 | } | 4244 | } |
| 4035 | 4245 | ||
| 4036 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 4246 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
| 4037 | { | 4247 | { |
| 4038 | if (prev->se.on_rq) | 4248 | if (prev->on_rq || rq->skip_clock_update < 0) |
| 4039 | update_rq_clock(rq); | 4249 | update_rq_clock(rq); |
| 4040 | prev->sched_class->put_prev_task(rq, prev); | 4250 | prev->sched_class->put_prev_task(rq, prev); |
| 4041 | } | 4251 | } |
| @@ -4097,11 +4307,13 @@ need_resched: | |||
| 4097 | if (unlikely(signal_pending_state(prev->state, prev))) { | 4307 | if (unlikely(signal_pending_state(prev->state, prev))) { |
| 4098 | prev->state = TASK_RUNNING; | 4308 | prev->state = TASK_RUNNING; |
| 4099 | } else { | 4309 | } else { |
| 4310 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
| 4311 | prev->on_rq = 0; | ||
| 4312 | |||
| 4100 | /* | 4313 | /* |
| 4101 | * If a worker is going to sleep, notify and | 4314 | * If a worker went to sleep, notify and ask workqueue |
| 4102 | * ask workqueue whether it wants to wake up a | 4315 | * whether it wants to wake up a task to maintain |
| 4103 | * task to maintain concurrency. If so, wake | 4316 | * concurrency. |
| 4104 | * up the task. | ||
| 4105 | */ | 4317 | */ |
| 4106 | if (prev->flags & PF_WQ_WORKER) { | 4318 | if (prev->flags & PF_WQ_WORKER) { |
| 4107 | struct task_struct *to_wakeup; | 4319 | struct task_struct *to_wakeup; |
| @@ -4110,11 +4322,10 @@ need_resched: | |||
| 4110 | if (to_wakeup) | 4322 | if (to_wakeup) |
| 4111 | try_to_wake_up_local(to_wakeup); | 4323 | try_to_wake_up_local(to_wakeup); |
| 4112 | } | 4324 | } |
| 4113 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | ||
| 4114 | 4325 | ||
| 4115 | /* | 4326 | /* |
| 4116 | * If we are going to sleep and we have plugged IO queued, make | 4327 | * If we are going to sleep and we have plugged IO |
| 4117 | * sure to submit it to avoid deadlocks. | 4328 | * queued, make sure to submit it to avoid deadlocks. |
| 4118 | */ | 4329 | */ |
| 4119 | if (blk_needs_flush_plug(prev)) { | 4330 | if (blk_needs_flush_plug(prev)) { |
| 4120 | raw_spin_unlock(&rq->lock); | 4331 | raw_spin_unlock(&rq->lock); |
| @@ -4161,71 +4372,47 @@ need_resched: | |||
| 4161 | EXPORT_SYMBOL(schedule); | 4372 | EXPORT_SYMBOL(schedule); |
| 4162 | 4373 | ||
| 4163 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | 4374 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 4164 | /* | ||
| 4165 | * Look out! "owner" is an entirely speculative pointer | ||
| 4166 | * access and not reliable. | ||
| 4167 | */ | ||
| 4168 | int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | ||
| 4169 | { | ||
| 4170 | unsigned int cpu; | ||
| 4171 | struct rq *rq; | ||
| 4172 | 4375 | ||
| 4173 | if (!sched_feat(OWNER_SPIN)) | 4376 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) |
| 4174 | return 0; | 4377 | { |
| 4378 | if (lock->owner != owner) | ||
| 4379 | return false; | ||
| 4175 | 4380 | ||
| 4176 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
| 4177 | /* | 4381 | /* |
| 4178 | * Need to access the cpu field knowing that | 4382 | * Ensure we emit the owner->on_cpu, dereference _after_ checking |
| 4179 | * DEBUG_PAGEALLOC could have unmapped it if | 4383 | * lock->owner still matches owner, if that fails, owner might |
| 4180 | * the mutex owner just released it and exited. | 4384 | * point to free()d memory, if it still matches, the rcu_read_lock() |
| 4385 | * ensures the memory stays valid. | ||
| 4181 | */ | 4386 | */ |
| 4182 | if (probe_kernel_address(&owner->cpu, cpu)) | 4387 | barrier(); |
| 4183 | return 0; | ||
| 4184 | #else | ||
| 4185 | cpu = owner->cpu; | ||
| 4186 | #endif | ||
| 4187 | 4388 | ||
| 4188 | /* | 4389 | return owner->on_cpu; |
| 4189 | * Even if the access succeeded (likely case), | 4390 | } |
| 4190 | * the cpu field may no longer be valid. | ||
| 4191 | */ | ||
| 4192 | if (cpu >= nr_cpumask_bits) | ||
| 4193 | return 0; | ||
| 4194 | 4391 | ||
| 4195 | /* | 4392 | /* |
| 4196 | * We need to validate that we can do a | 4393 | * Look out! "owner" is an entirely speculative pointer |
| 4197 | * get_cpu() and that we have the percpu area. | 4394 | * access and not reliable. |
| 4198 | */ | 4395 | */ |
| 4199 | if (!cpu_online(cpu)) | 4396 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) |
| 4397 | { | ||
| 4398 | if (!sched_feat(OWNER_SPIN)) | ||
| 4200 | return 0; | 4399 | return 0; |
| 4201 | 4400 | ||
| 4202 | rq = cpu_rq(cpu); | 4401 | rcu_read_lock(); |
| 4203 | 4402 | while (owner_running(lock, owner)) { | |
| 4204 | for (;;) { | 4403 | if (need_resched()) |
| 4205 | /* | ||
| 4206 | * Owner changed, break to re-assess state. | ||
| 4207 | */ | ||
| 4208 | if (lock->owner != owner) { | ||
| 4209 | /* | ||
| 4210 | * If the lock has switched to a different owner, | ||
| 4211 | * we likely have heavy contention. Return 0 to quit | ||
| 4212 | * optimistic spinning and not contend further: | ||
| 4213 | */ | ||
| 4214 | if (lock->owner) | ||
| 4215 | return 0; | ||
| 4216 | break; | 4404 | break; |
| 4217 | } | ||
| 4218 | |||
| 4219 | /* | ||
| 4220 | * Is that owner really running on that cpu? | ||
| 4221 | */ | ||
| 4222 | if (task_thread_info(rq->curr) != owner || need_resched()) | ||
| 4223 | return 0; | ||
| 4224 | 4405 | ||
| 4225 | arch_mutex_cpu_relax(); | 4406 | arch_mutex_cpu_relax(); |
| 4226 | } | 4407 | } |
| 4408 | rcu_read_unlock(); | ||
| 4227 | 4409 | ||
| 4228 | return 1; | 4410 | /* |
| 4411 | * We break out the loop above on need_resched() and when the | ||
| 4412 | * owner changed, which is a sign for heavy contention. Return | ||
| 4413 | * success only when lock->owner is NULL. | ||
| 4414 | */ | ||
| 4415 | return lock->owner == NULL; | ||
| 4229 | } | 4416 | } |
| 4230 | #endif | 4417 | #endif |
| 4231 | 4418 | ||
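Context for the hunk above: the rewritten optimistic-spin path pins the owner task with rcu_read_lock() and watches just two fields, lock->owner and owner->on_cpu, instead of probing per-cpu runqueue state through probe_kernel_address(). The shape of that loop in a self-contained user-space sketch (illustrative only: the RCU pinning and the need_resched() bail-out are omitted, and struct mtx / struct task are stand-ins, not kernel types):

#include <stdatomic.h>
#include <stdbool.h>
#include <sched.h>

struct task { atomic_int on_cpu; };                 /* nonzero while the owner runs */
struct mtx  { _Atomic(struct task *) owner; };

static bool owner_still_running(struct mtx *lock, struct task *owner)
{
	/* Re-check ownership first; once it changes, "owner" may be stale. */
	if (atomic_load(&lock->owner) != owner)
		return false;
	return atomic_load(&owner->on_cpu) != 0;
}

/* Returns true only if the lock was released while we spun, mirroring
 * "return lock->owner == NULL" in the new mutex_spin_on_owner() above. */
static bool spin_on_owner(struct mtx *lock, struct task *owner)
{
	while (owner_still_running(lock, owner))
		sched_yield();          /* stand-in for arch_mutex_cpu_relax() */

	return atomic_load(&lock->owner) == NULL;
}

The ordering matters: checking lock->owner before dereferencing owner->on_cpu is exactly what the barrier() comment in the kernel version is about.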
| @@ -4684,19 +4871,18 @@ EXPORT_SYMBOL(sleep_on_timeout); | |||
| 4684 | */ | 4871 | */ |
| 4685 | void rt_mutex_setprio(struct task_struct *p, int prio) | 4872 | void rt_mutex_setprio(struct task_struct *p, int prio) |
| 4686 | { | 4873 | { |
| 4687 | unsigned long flags; | ||
| 4688 | int oldprio, on_rq, running; | 4874 | int oldprio, on_rq, running; |
| 4689 | struct rq *rq; | 4875 | struct rq *rq; |
| 4690 | const struct sched_class *prev_class; | 4876 | const struct sched_class *prev_class; |
| 4691 | 4877 | ||
| 4692 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4878 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
| 4693 | 4879 | ||
| 4694 | rq = task_rq_lock(p, &flags); | 4880 | rq = __task_rq_lock(p); |
| 4695 | 4881 | ||
| 4696 | trace_sched_pi_setprio(p, prio); | 4882 | trace_sched_pi_setprio(p, prio); |
| 4697 | oldprio = p->prio; | 4883 | oldprio = p->prio; |
| 4698 | prev_class = p->sched_class; | 4884 | prev_class = p->sched_class; |
| 4699 | on_rq = p->se.on_rq; | 4885 | on_rq = p->on_rq; |
| 4700 | running = task_current(rq, p); | 4886 | running = task_current(rq, p); |
| 4701 | if (on_rq) | 4887 | if (on_rq) |
| 4702 | dequeue_task(rq, p, 0); | 4888 | dequeue_task(rq, p, 0); |
| @@ -4716,7 +4902,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
| 4716 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4902 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
| 4717 | 4903 | ||
| 4718 | check_class_changed(rq, p, prev_class, oldprio); | 4904 | check_class_changed(rq, p, prev_class, oldprio); |
| 4719 | task_rq_unlock(rq, &flags); | 4905 | __task_rq_unlock(rq); |
| 4720 | } | 4906 | } |
| 4721 | 4907 | ||
| 4722 | #endif | 4908 | #endif |
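The switch from task_rq_lock()/task_rq_unlock() to __task_rq_lock()/__task_rq_unlock() in rt_mutex_setprio() follows the new rule, visible in the other call sites below, that task_rq_lock() now also takes p->pi_lock; the PI boosting path is already called with pi_lock held, so it only needs the runqueue lock. A reconstruction of the outer helper as it is expected to look after this series (a sketch inferred from the changed call sites; the real definition lives earlier in kernel/sched.c and may differ in detail):

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, *flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;      /* p did not migrate; both locks held */
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
	}
}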
| @@ -4744,7 +4930,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4744 | p->static_prio = NICE_TO_PRIO(nice); | 4930 | p->static_prio = NICE_TO_PRIO(nice); |
| 4745 | goto out_unlock; | 4931 | goto out_unlock; |
| 4746 | } | 4932 | } |
| 4747 | on_rq = p->se.on_rq; | 4933 | on_rq = p->on_rq; |
| 4748 | if (on_rq) | 4934 | if (on_rq) |
| 4749 | dequeue_task(rq, p, 0); | 4935 | dequeue_task(rq, p, 0); |
| 4750 | 4936 | ||
| @@ -4764,7 +4950,7 @@ void set_user_nice(struct task_struct *p, long nice) | |||
| 4764 | resched_task(rq->curr); | 4950 | resched_task(rq->curr); |
| 4765 | } | 4951 | } |
| 4766 | out_unlock: | 4952 | out_unlock: |
| 4767 | task_rq_unlock(rq, &flags); | 4953 | task_rq_unlock(rq, p, &flags); |
| 4768 | } | 4954 | } |
| 4769 | EXPORT_SYMBOL(set_user_nice); | 4955 | EXPORT_SYMBOL(set_user_nice); |
| 4770 | 4956 | ||
| @@ -4878,8 +5064,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) | |||
| 4878 | static void | 5064 | static void |
| 4879 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 5065 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
| 4880 | { | 5066 | { |
| 4881 | BUG_ON(p->se.on_rq); | ||
| 4882 | |||
| 4883 | p->policy = policy; | 5067 | p->policy = policy; |
| 4884 | p->rt_priority = prio; | 5068 | p->rt_priority = prio; |
| 4885 | p->normal_prio = normal_prio(p); | 5069 | p->normal_prio = normal_prio(p); |
| @@ -4994,20 +5178,17 @@ recheck: | |||
| 4994 | /* | 5178 | /* |
| 4995 | * make sure no PI-waiters arrive (or leave) while we are | 5179 | * make sure no PI-waiters arrive (or leave) while we are |
| 4996 | * changing the priority of the task: | 5180 | * changing the priority of the task: |
| 4997 | */ | 5181 | * |
| 4998 | raw_spin_lock_irqsave(&p->pi_lock, flags); | ||
| 4999 | /* | ||
| 5000 | * To be able to change p->policy safely, the appropriate | 5182 | * To be able to change p->policy safely, the appropriate |
| 5001 | * runqueue lock must be held. | 5183 | * runqueue lock must be held. |
| 5002 | */ | 5184 | */ |
| 5003 | rq = __task_rq_lock(p); | 5185 | rq = task_rq_lock(p, &flags); |
| 5004 | 5186 | ||
| 5005 | /* | 5187 | /* |
| 5006 | * Changing the policy of the stop threads its a very bad idea | 5188 | * Changing the policy of the stop threads its a very bad idea |
| 5007 | */ | 5189 | */ |
| 5008 | if (p == rq->stop) { | 5190 | if (p == rq->stop) { |
| 5009 | __task_rq_unlock(rq); | 5191 | task_rq_unlock(rq, p, &flags); |
| 5010 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 5011 | return -EINVAL; | 5192 | return -EINVAL; |
| 5012 | } | 5193 | } |
| 5013 | 5194 | ||
| @@ -5031,8 +5212,7 @@ recheck: | |||
| 5031 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 5212 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
| 5032 | task_group(p)->rt_bandwidth.rt_runtime == 0 && | 5213 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
| 5033 | !task_group_is_autogroup(task_group(p))) { | 5214 | !task_group_is_autogroup(task_group(p))) { |
| 5034 | __task_rq_unlock(rq); | 5215 | task_rq_unlock(rq, p, &flags); |
| 5035 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 5036 | return -EPERM; | 5216 | return -EPERM; |
| 5037 | } | 5217 | } |
| 5038 | } | 5218 | } |
| @@ -5041,11 +5221,10 @@ recheck: | |||
| 5041 | /* recheck policy now with rq lock held */ | 5221 | /* recheck policy now with rq lock held */ |
| 5042 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 5222 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
| 5043 | policy = oldpolicy = -1; | 5223 | policy = oldpolicy = -1; |
| 5044 | __task_rq_unlock(rq); | 5224 | task_rq_unlock(rq, p, &flags); |
| 5045 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 5046 | goto recheck; | 5225 | goto recheck; |
| 5047 | } | 5226 | } |
| 5048 | on_rq = p->se.on_rq; | 5227 | on_rq = p->on_rq; |
| 5049 | running = task_current(rq, p); | 5228 | running = task_current(rq, p); |
| 5050 | if (on_rq) | 5229 | if (on_rq) |
| 5051 | deactivate_task(rq, p, 0); | 5230 | deactivate_task(rq, p, 0); |
| @@ -5064,8 +5243,7 @@ recheck: | |||
| 5064 | activate_task(rq, p, 0); | 5243 | activate_task(rq, p, 0); |
| 5065 | 5244 | ||
| 5066 | check_class_changed(rq, p, prev_class, oldprio); | 5245 | check_class_changed(rq, p, prev_class, oldprio); |
| 5067 | __task_rq_unlock(rq); | 5246 | task_rq_unlock(rq, p, &flags); |
| 5068 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
| 5069 | 5247 | ||
| 5070 | rt_mutex_adjust_pi(p); | 5248 | rt_mutex_adjust_pi(p); |
| 5071 | 5249 | ||
| @@ -5316,7 +5494,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 5316 | { | 5494 | { |
| 5317 | struct task_struct *p; | 5495 | struct task_struct *p; |
| 5318 | unsigned long flags; | 5496 | unsigned long flags; |
| 5319 | struct rq *rq; | ||
| 5320 | int retval; | 5497 | int retval; |
| 5321 | 5498 | ||
| 5322 | get_online_cpus(); | 5499 | get_online_cpus(); |
| @@ -5331,9 +5508,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 5331 | if (retval) | 5508 | if (retval) |
| 5332 | goto out_unlock; | 5509 | goto out_unlock; |
| 5333 | 5510 | ||
| 5334 | rq = task_rq_lock(p, &flags); | 5511 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 5335 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 5512 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
| 5336 | task_rq_unlock(rq, &flags); | 5513 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 5337 | 5514 | ||
| 5338 | out_unlock: | 5515 | out_unlock: |
| 5339 | rcu_read_unlock(); | 5516 | rcu_read_unlock(); |
| @@ -5658,7 +5835,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
| 5658 | 5835 | ||
| 5659 | rq = task_rq_lock(p, &flags); | 5836 | rq = task_rq_lock(p, &flags); |
| 5660 | time_slice = p->sched_class->get_rr_interval(rq, p); | 5837 | time_slice = p->sched_class->get_rr_interval(rq, p); |
| 5661 | task_rq_unlock(rq, &flags); | 5838 | task_rq_unlock(rq, p, &flags); |
| 5662 | 5839 | ||
| 5663 | rcu_read_unlock(); | 5840 | rcu_read_unlock(); |
| 5664 | jiffies_to_timespec(time_slice, &t); | 5841 | jiffies_to_timespec(time_slice, &t); |
| @@ -5760,7 +5937,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5760 | idle->state = TASK_RUNNING; | 5937 | idle->state = TASK_RUNNING; |
| 5761 | idle->se.exec_start = sched_clock(); | 5938 | idle->se.exec_start = sched_clock(); |
| 5762 | 5939 | ||
| 5763 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5940 | do_set_cpus_allowed(idle, cpumask_of(cpu)); |
| 5764 | /* | 5941 | /* |
| 5765 | * We're having a chicken and egg problem, even though we are | 5942 | * We're having a chicken and egg problem, even though we are |
| 5766 | * holding rq->lock, the cpu isn't yet set to this cpu so the | 5943 | * holding rq->lock, the cpu isn't yet set to this cpu so the |
| @@ -5776,17 +5953,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 5776 | rcu_read_unlock(); | 5953 | rcu_read_unlock(); |
| 5777 | 5954 | ||
| 5778 | rq->curr = rq->idle = idle; | 5955 | rq->curr = rq->idle = idle; |
| 5779 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5956 | #if defined(CONFIG_SMP) |
| 5780 | idle->oncpu = 1; | 5957 | idle->on_cpu = 1; |
| 5781 | #endif | 5958 | #endif |
| 5782 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5959 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 5783 | 5960 | ||
| 5784 | /* Set the preempt count _outside_ the spinlocks! */ | 5961 | /* Set the preempt count _outside_ the spinlocks! */ |
| 5785 | #if defined(CONFIG_PREEMPT) | ||
| 5786 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
| 5787 | #else | ||
| 5788 | task_thread_info(idle)->preempt_count = 0; | 5962 | task_thread_info(idle)->preempt_count = 0; |
| 5789 | #endif | 5963 | |
| 5790 | /* | 5964 | /* |
| 5791 | * The idle tasks have their own, simple scheduling class: | 5965 | * The idle tasks have their own, simple scheduling class: |
| 5792 | */ | 5966 | */ |
| @@ -5851,6 +6025,16 @@ static inline void sched_init_granularity(void) | |||
| 5851 | } | 6025 | } |
| 5852 | 6026 | ||
| 5853 | #ifdef CONFIG_SMP | 6027 | #ifdef CONFIG_SMP |
| 6028 | void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | ||
| 6029 | { | ||
| 6030 | if (p->sched_class && p->sched_class->set_cpus_allowed) | ||
| 6031 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
| 6032 | else { | ||
| 6033 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
| 6034 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
| 6035 | } | ||
| 6036 | } | ||
| 6037 | |||
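The new do_set_cpus_allowed() factors the "sched-class callback or default copy" step out of set_cpus_allowed_ptr(), so callers that already hold the needed locks, init_idle() above and set_cpus_allowed_ptr() below, share one path. A minimal hypothetical caller, only to show the intended usage (bind_idle_to_cpu() is illustrative and not part of this patch):

/* Illustrative only: a context that already holds p->pi_lock and the
 * runqueue lock can update affinity through the single helper. */
static void bind_idle_to_cpu(struct task_struct *idle, int cpu)
{
	do_set_cpus_allowed(idle, cpumask_of(cpu));     /* as init_idle() now does */
}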
| 5854 | /* | 6038 | /* |
| 5855 | * This is how migration works: | 6039 | * This is how migration works: |
| 5856 | * | 6040 | * |
| @@ -5881,52 +6065,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 5881 | unsigned int dest_cpu; | 6065 | unsigned int dest_cpu; |
| 5882 | int ret = 0; | 6066 | int ret = 0; |
| 5883 | 6067 | ||
| 5884 | /* | ||
| 5885 | * Serialize against TASK_WAKING so that ttwu() and wunt() can | ||
| 5886 | * drop the rq->lock and still rely on ->cpus_allowed. | ||
| 5887 | */ | ||
| 5888 | again: | ||
| 5889 | while (task_is_waking(p)) | ||
| 5890 | cpu_relax(); | ||
| 5891 | rq = task_rq_lock(p, &flags); | 6068 | rq = task_rq_lock(p, &flags); |
| 5892 | if (task_is_waking(p)) { | 6069 | |
| 5893 | task_rq_unlock(rq, &flags); | 6070 | if (cpumask_equal(&p->cpus_allowed, new_mask)) |
| 5894 | goto again; | 6071 | goto out; |
| 5895 | } | ||
| 5896 | 6072 | ||
| 5897 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { | 6073 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
| 5898 | ret = -EINVAL; | 6074 | ret = -EINVAL; |
| 5899 | goto out; | 6075 | goto out; |
| 5900 | } | 6076 | } |
| 5901 | 6077 | ||
| 5902 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && | 6078 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { |
| 5903 | !cpumask_equal(&p->cpus_allowed, new_mask))) { | ||
| 5904 | ret = -EINVAL; | 6079 | ret = -EINVAL; |
| 5905 | goto out; | 6080 | goto out; |
| 5906 | } | 6081 | } |
| 5907 | 6082 | ||
| 5908 | if (p->sched_class->set_cpus_allowed) | 6083 | do_set_cpus_allowed(p, new_mask); |
| 5909 | p->sched_class->set_cpus_allowed(p, new_mask); | ||
| 5910 | else { | ||
| 5911 | cpumask_copy(&p->cpus_allowed, new_mask); | ||
| 5912 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | ||
| 5913 | } | ||
| 5914 | 6084 | ||
| 5915 | /* Can the task run on the task's current CPU? If so, we're done */ | 6085 | /* Can the task run on the task's current CPU? If so, we're done */ |
| 5916 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 6086 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
| 5917 | goto out; | 6087 | goto out; |
| 5918 | 6088 | ||
| 5919 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 6089 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
| 5920 | if (migrate_task(p, rq)) { | 6090 | if (p->on_rq) { |
| 5921 | struct migration_arg arg = { p, dest_cpu }; | 6091 | struct migration_arg arg = { p, dest_cpu }; |
| 5922 | /* Need help from migration thread: drop lock and wait. */ | 6092 | /* Need help from migration thread: drop lock and wait. */ |
| 5923 | task_rq_unlock(rq, &flags); | 6093 | task_rq_unlock(rq, p, &flags); |
| 5924 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); | 6094 | stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); |
| 5925 | tlb_migrate_finish(p->mm); | 6095 | tlb_migrate_finish(p->mm); |
| 5926 | return 0; | 6096 | return 0; |
| 5927 | } | 6097 | } |
| 5928 | out: | 6098 | out: |
| 5929 | task_rq_unlock(rq, &flags); | 6099 | task_rq_unlock(rq, p, &flags); |
| 5930 | 6100 | ||
| 5931 | return ret; | 6101 | return ret; |
| 5932 | } | 6102 | } |
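Note that the calling convention of set_cpus_allowed_ptr() is unchanged; only the interlock against wakeups moved from the open-coded TASK_WAKING spin to task_rq_lock() taking p->pi_lock, plus the new early return when the requested mask equals the current one. A typical caller, shown only for orientation (pin_task_to_cpu() is hypothetical):

/* Hypothetical helper pinning a task to one CPU; callers look the same
 * before and after this patch. */
static int pin_task_to_cpu(struct task_struct *p, int cpu)
{
	if (!cpu_online(cpu))
		return -EINVAL;

	return set_cpus_allowed_ptr(p, cpumask_of(cpu));
}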
| @@ -5954,6 +6124,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 5954 | rq_src = cpu_rq(src_cpu); | 6124 | rq_src = cpu_rq(src_cpu); |
| 5955 | rq_dest = cpu_rq(dest_cpu); | 6125 | rq_dest = cpu_rq(dest_cpu); |
| 5956 | 6126 | ||
| 6127 | raw_spin_lock(&p->pi_lock); | ||
| 5957 | double_rq_lock(rq_src, rq_dest); | 6128 | double_rq_lock(rq_src, rq_dest); |
| 5958 | /* Already moved. */ | 6129 | /* Already moved. */ |
| 5959 | if (task_cpu(p) != src_cpu) | 6130 | if (task_cpu(p) != src_cpu) |
| @@ -5966,7 +6137,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 5966 | * If we're not on a rq, the next wake-up will ensure we're | 6137 | * If we're not on a rq, the next wake-up will ensure we're |
| 5967 | * placed properly. | 6138 | * placed properly. |
| 5968 | */ | 6139 | */ |
| 5969 | if (p->se.on_rq) { | 6140 | if (p->on_rq) { |
| 5970 | deactivate_task(rq_src, p, 0); | 6141 | deactivate_task(rq_src, p, 0); |
| 5971 | set_task_cpu(p, dest_cpu); | 6142 | set_task_cpu(p, dest_cpu); |
| 5972 | activate_task(rq_dest, p, 0); | 6143 | activate_task(rq_dest, p, 0); |
| @@ -5976,6 +6147,7 @@ done: | |||
| 5976 | ret = 1; | 6147 | ret = 1; |
| 5977 | fail: | 6148 | fail: |
| 5978 | double_rq_unlock(rq_src, rq_dest); | 6149 | double_rq_unlock(rq_src, rq_dest); |
| 6150 | raw_spin_unlock(&p->pi_lock); | ||
| 5979 | return ret; | 6151 | return ret; |
| 5980 | } | 6152 | } |
| 5981 | 6153 | ||
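Taking p->pi_lock around double_rq_lock() gives __migrate_task() the same outer lock that the wakeup path now uses, so a concurrent ttwu() can no longer pick a CPU from ->cpus_allowed while the task is being moved. The general idiom, one designated outer lock and then the inner locks in a fixed order, in a self-contained user-space sketch (pthread mutexes as stand-ins for the kernel's raw spinlocks):

#include <pthread.h>

struct rq   { pthread_mutex_t lock; };
struct task { pthread_mutex_t pi_lock; };

/* Lock two runqueues in a stable (address) order so that concurrent
 * migrations can never deadlock against each other. */
static void double_rq_lock(struct rq *a, struct rq *b)
{
	if (a <= b) {
		pthread_mutex_lock(&a->lock);
		if (b != a)
			pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void migrate(struct task *p, struct rq *src, struct rq *dst)
{
	pthread_mutex_lock(&p->pi_lock);        /* outer lock, as in the hunk above */
	double_rq_lock(src, dst);
	/* ... move p from src to dst ... */
	pthread_mutex_unlock(&dst->lock);
	if (src != dst)
		pthread_mutex_unlock(&src->lock);
	pthread_mutex_unlock(&p->pi_lock);
}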
| @@ -6316,6 +6488,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 6316 | 6488 | ||
| 6317 | #ifdef CONFIG_HOTPLUG_CPU | 6489 | #ifdef CONFIG_HOTPLUG_CPU |
| 6318 | case CPU_DYING: | 6490 | case CPU_DYING: |
| 6491 | sched_ttwu_pending(); | ||
| 6319 | /* Update our root-domain */ | 6492 | /* Update our root-domain */ |
| 6320 | raw_spin_lock_irqsave(&rq->lock, flags); | 6493 | raw_spin_lock_irqsave(&rq->lock, flags); |
| 6321 | if (rq->rd) { | 6494 | if (rq->rd) { |
| @@ -6394,6 +6567,8 @@ early_initcall(migration_init); | |||
| 6394 | 6567 | ||
| 6395 | #ifdef CONFIG_SMP | 6568 | #ifdef CONFIG_SMP |
| 6396 | 6569 | ||
| 6570 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
| 6571 | |||
| 6397 | #ifdef CONFIG_SCHED_DEBUG | 6572 | #ifdef CONFIG_SCHED_DEBUG |
| 6398 | 6573 | ||
| 6399 | static __read_mostly int sched_domain_debug_enabled; | 6574 | static __read_mostly int sched_domain_debug_enabled; |
| @@ -6444,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 6444 | break; | 6619 | break; |
| 6445 | } | 6620 | } |
| 6446 | 6621 | ||
| 6447 | if (!group->cpu_power) { | 6622 | if (!group->sgp->power) { |
| 6448 | printk(KERN_CONT "\n"); | 6623 | printk(KERN_CONT "\n"); |
| 6449 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 6624 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 6450 | "set\n"); | 6625 | "set\n"); |
| @@ -6468,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 6468 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 6643 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
| 6469 | 6644 | ||
| 6470 | printk(KERN_CONT " %s", str); | 6645 | printk(KERN_CONT " %s", str); |
| 6471 | if (group->cpu_power != SCHED_LOAD_SCALE) { | 6646 | if (group->sgp->power != SCHED_POWER_SCALE) { |
| 6472 | printk(KERN_CONT " (cpu_power = %d)", | 6647 | printk(KERN_CONT " (cpu_power = %d)", |
| 6473 | group->cpu_power); | 6648 | group->sgp->power); |
| 6474 | } | 6649 | } |
| 6475 | 6650 | ||
| 6476 | group = group->next; | 6651 | group = group->next; |
| @@ -6489,7 +6664,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 6489 | 6664 | ||
| 6490 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 6665 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
| 6491 | { | 6666 | { |
| 6492 | cpumask_var_t groupmask; | ||
| 6493 | int level = 0; | 6667 | int level = 0; |
| 6494 | 6668 | ||
| 6495 | if (!sched_domain_debug_enabled) | 6669 | if (!sched_domain_debug_enabled) |
| @@ -6502,20 +6676,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 6502 | 6676 | ||
| 6503 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 6677 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
| 6504 | 6678 | ||
| 6505 | if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { | ||
| 6506 | printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); | ||
| 6507 | return; | ||
| 6508 | } | ||
| 6509 | |||
| 6510 | for (;;) { | 6679 | for (;;) { |
| 6511 | if (sched_domain_debug_one(sd, cpu, level, groupmask)) | 6680 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) |
| 6512 | break; | 6681 | break; |
| 6513 | level++; | 6682 | level++; |
| 6514 | sd = sd->parent; | 6683 | sd = sd->parent; |
| 6515 | if (!sd) | 6684 | if (!sd) |
| 6516 | break; | 6685 | break; |
| 6517 | } | 6686 | } |
| 6518 | free_cpumask_var(groupmask); | ||
| 6519 | } | 6687 | } |
| 6520 | #else /* !CONFIG_SCHED_DEBUG */ | 6688 | #else /* !CONFIG_SCHED_DEBUG */ |
| 6521 | # define sched_domain_debug(sd, cpu) do { } while (0) | 6689 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| @@ -6572,12 +6740,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 6572 | return 1; | 6740 | return 1; |
| 6573 | } | 6741 | } |
| 6574 | 6742 | ||
| 6575 | static void free_rootdomain(struct root_domain *rd) | 6743 | static void free_rootdomain(struct rcu_head *rcu) |
| 6576 | { | 6744 | { |
| 6577 | synchronize_sched(); | 6745 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
| 6578 | 6746 | ||
| 6579 | cpupri_cleanup(&rd->cpupri); | 6747 | cpupri_cleanup(&rd->cpupri); |
| 6580 | |||
| 6581 | free_cpumask_var(rd->rto_mask); | 6748 | free_cpumask_var(rd->rto_mask); |
| 6582 | free_cpumask_var(rd->online); | 6749 | free_cpumask_var(rd->online); |
| 6583 | free_cpumask_var(rd->span); | 6750 | free_cpumask_var(rd->span); |
| @@ -6618,7 +6785,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
| 6618 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6785 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 6619 | 6786 | ||
| 6620 | if (old_rd) | 6787 | if (old_rd) |
| 6621 | free_rootdomain(old_rd); | 6788 | call_rcu_sched(&old_rd->rcu, free_rootdomain); |
| 6622 | } | 6789 | } |
| 6623 | 6790 | ||
| 6624 | static int init_rootdomain(struct root_domain *rd) | 6791 | static int init_rootdomain(struct root_domain *rd) |
| @@ -6669,6 +6836,53 @@ static struct root_domain *alloc_rootdomain(void) | |||
| 6669 | return rd; | 6836 | return rd; |
| 6670 | } | 6837 | } |
| 6671 | 6838 | ||
| 6839 | static void free_sched_groups(struct sched_group *sg, int free_sgp) | ||
| 6840 | { | ||
| 6841 | struct sched_group *tmp, *first; | ||
| 6842 | |||
| 6843 | if (!sg) | ||
| 6844 | return; | ||
| 6845 | |||
| 6846 | first = sg; | ||
| 6847 | do { | ||
| 6848 | tmp = sg->next; | ||
| 6849 | |||
| 6850 | if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) | ||
| 6851 | kfree(sg->sgp); | ||
| 6852 | |||
| 6853 | kfree(sg); | ||
| 6854 | sg = tmp; | ||
| 6855 | } while (sg != first); | ||
| 6856 | } | ||
| 6857 | |||
| 6858 | static void free_sched_domain(struct rcu_head *rcu) | ||
| 6859 | { | ||
| 6860 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
| 6861 | |||
| 6862 | /* | ||
| 6863 | * If its an overlapping domain it has private groups, iterate and | ||
| 6864 | * nuke them all. | ||
| 6865 | */ | ||
| 6866 | if (sd->flags & SD_OVERLAP) { | ||
| 6867 | free_sched_groups(sd->groups, 1); | ||
| 6868 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
| 6869 | kfree(sd->groups->sgp); | ||
| 6870 | kfree(sd->groups); | ||
| 6871 | } | ||
| 6872 | kfree(sd); | ||
| 6873 | } | ||
| 6874 | |||
| 6875 | static void destroy_sched_domain(struct sched_domain *sd, int cpu) | ||
| 6876 | { | ||
| 6877 | call_rcu(&sd->rcu, free_sched_domain); | ||
| 6878 | } | ||
| 6879 | |||
| 6880 | static void destroy_sched_domains(struct sched_domain *sd, int cpu) | ||
| 6881 | { | ||
| 6882 | for (; sd; sd = sd->parent) | ||
| 6883 | destroy_sched_domain(sd, cpu); | ||
| 6884 | } | ||
| 6885 | |||
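The helpers above encode two lifetimes: sched_group_power (sgp) is shared by several groups and freed only when the last reference drops, while whole domains are handed to call_rcu() so lockless walkers (load balancing under rcu_read_lock()) never see freed memory. The shared-sub-object half of that, reduced to a compilable user-space analogue (the group/group_power names are stand-ins):

#include <stdatomic.h>
#include <stdlib.h>

struct group_power { atomic_int ref; unsigned long power; };
struct group       { struct group *next; struct group_power *gp; };

static void put_group(struct group *sg)
{
	/* Mirrors: if (atomic_dec_and_test(&sg->sgp->ref)) kfree(sg->sgp); */
	if (atomic_fetch_sub(&sg->gp->ref, 1) == 1)
		free(sg->gp);

	free(sg);
}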
| 6672 | /* | 6886 | /* |
| 6673 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6887 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
| 6674 | * hold the hotplug lock. | 6888 | * hold the hotplug lock. |
| @@ -6679,9 +6893,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6679 | struct rq *rq = cpu_rq(cpu); | 6893 | struct rq *rq = cpu_rq(cpu); |
| 6680 | struct sched_domain *tmp; | 6894 | struct sched_domain *tmp; |
| 6681 | 6895 | ||
| 6682 | for (tmp = sd; tmp; tmp = tmp->parent) | ||
| 6683 | tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); | ||
| 6684 | |||
| 6685 | /* Remove the sched domains which do not contribute to scheduling. */ | 6896 | /* Remove the sched domains which do not contribute to scheduling. */ |
| 6686 | for (tmp = sd; tmp; ) { | 6897 | for (tmp = sd; tmp; ) { |
| 6687 | struct sched_domain *parent = tmp->parent; | 6898 | struct sched_domain *parent = tmp->parent; |
| @@ -6692,12 +6903,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6692 | tmp->parent = parent->parent; | 6903 | tmp->parent = parent->parent; |
| 6693 | if (parent->parent) | 6904 | if (parent->parent) |
| 6694 | parent->parent->child = tmp; | 6905 | parent->parent->child = tmp; |
| 6906 | destroy_sched_domain(parent, cpu); | ||
| 6695 | } else | 6907 | } else |
| 6696 | tmp = tmp->parent; | 6908 | tmp = tmp->parent; |
| 6697 | } | 6909 | } |
| 6698 | 6910 | ||
| 6699 | if (sd && sd_degenerate(sd)) { | 6911 | if (sd && sd_degenerate(sd)) { |
| 6912 | tmp = sd; | ||
| 6700 | sd = sd->parent; | 6913 | sd = sd->parent; |
| 6914 | destroy_sched_domain(tmp, cpu); | ||
| 6701 | if (sd) | 6915 | if (sd) |
| 6702 | sd->child = NULL; | 6916 | sd->child = NULL; |
| 6703 | } | 6917 | } |
| @@ -6705,7 +6919,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | |||
| 6705 | sched_domain_debug(sd, cpu); | 6919 | sched_domain_debug(sd, cpu); |
| 6706 | 6920 | ||
| 6707 | rq_attach_root(rq, rd); | 6921 | rq_attach_root(rq, rd); |
| 6922 | tmp = rq->sd; | ||
| 6708 | rcu_assign_pointer(rq->sd, sd); | 6923 | rcu_assign_pointer(rq->sd, sd); |
| 6924 | destroy_sched_domains(tmp, cpu); | ||
| 6709 | } | 6925 | } |
| 6710 | 6926 | ||
| 6711 | /* cpus with isolated domains */ | 6927 | /* cpus with isolated domains */ |
| @@ -6721,56 +6937,6 @@ static int __init isolated_cpu_setup(char *str) | |||
| 6721 | 6937 | ||
| 6722 | __setup("isolcpus=", isolated_cpu_setup); | 6938 | __setup("isolcpus=", isolated_cpu_setup); |
| 6723 | 6939 | ||
| 6724 | /* | ||
| 6725 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | ||
| 6726 | * to a function which identifies what group(along with sched group) a CPU | ||
| 6727 | * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids | ||
| 6728 | * (due to the fact that we keep track of groups covered with a struct cpumask). | ||
| 6729 | * | ||
| 6730 | * init_sched_build_groups will build a circular linked list of the groups | ||
| 6731 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
| 6732 | * and ->cpu_power to 0. | ||
| 6733 | */ | ||
| 6734 | static void | ||
| 6735 | init_sched_build_groups(const struct cpumask *span, | ||
| 6736 | const struct cpumask *cpu_map, | ||
| 6737 | int (*group_fn)(int cpu, const struct cpumask *cpu_map, | ||
| 6738 | struct sched_group **sg, | ||
| 6739 | struct cpumask *tmpmask), | ||
| 6740 | struct cpumask *covered, struct cpumask *tmpmask) | ||
| 6741 | { | ||
| 6742 | struct sched_group *first = NULL, *last = NULL; | ||
| 6743 | int i; | ||
| 6744 | |||
| 6745 | cpumask_clear(covered); | ||
| 6746 | |||
| 6747 | for_each_cpu(i, span) { | ||
| 6748 | struct sched_group *sg; | ||
| 6749 | int group = group_fn(i, cpu_map, &sg, tmpmask); | ||
| 6750 | int j; | ||
| 6751 | |||
| 6752 | if (cpumask_test_cpu(i, covered)) | ||
| 6753 | continue; | ||
| 6754 | |||
| 6755 | cpumask_clear(sched_group_cpus(sg)); | ||
| 6756 | sg->cpu_power = 0; | ||
| 6757 | |||
| 6758 | for_each_cpu(j, span) { | ||
| 6759 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | ||
| 6760 | continue; | ||
| 6761 | |||
| 6762 | cpumask_set_cpu(j, covered); | ||
| 6763 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
| 6764 | } | ||
| 6765 | if (!first) | ||
| 6766 | first = sg; | ||
| 6767 | if (last) | ||
| 6768 | last->next = sg; | ||
| 6769 | last = sg; | ||
| 6770 | } | ||
| 6771 | last->next = first; | ||
| 6772 | } | ||
| 6773 | |||
| 6774 | #define SD_NODES_PER_DOMAIN 16 | 6940 | #define SD_NODES_PER_DOMAIN 16 |
| 6775 | 6941 | ||
| 6776 | #ifdef CONFIG_NUMA | 6942 | #ifdef CONFIG_NUMA |
| @@ -6787,7 +6953,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
| 6787 | */ | 6953 | */ |
| 6788 | static int find_next_best_node(int node, nodemask_t *used_nodes) | 6954 | static int find_next_best_node(int node, nodemask_t *used_nodes) |
| 6789 | { | 6955 | { |
| 6790 | int i, n, val, min_val, best_node = 0; | 6956 | int i, n, val, min_val, best_node = -1; |
| 6791 | 6957 | ||
| 6792 | min_val = INT_MAX; | 6958 | min_val = INT_MAX; |
| 6793 | 6959 | ||
| @@ -6811,7 +6977,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) | |||
| 6811 | } | 6977 | } |
| 6812 | } | 6978 | } |
| 6813 | 6979 | ||
| 6814 | node_set(best_node, *used_nodes); | 6980 | if (best_node != -1) |
| 6981 | node_set(best_node, *used_nodes); | ||
| 6815 | return best_node; | 6982 | return best_node; |
| 6816 | } | 6983 | } |
| 6817 | 6984 | ||
| @@ -6837,315 +7004,197 @@ static void sched_domain_node_span(int node, struct cpumask *span) | |||
| 6837 | 7004 | ||
| 6838 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 7005 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
| 6839 | int next_node = find_next_best_node(node, &used_nodes); | 7006 | int next_node = find_next_best_node(node, &used_nodes); |
| 6840 | 7007 | if (next_node < 0) | |
| 7008 | break; | ||
| 6841 | cpumask_or(span, span, cpumask_of_node(next_node)); | 7009 | cpumask_or(span, span, cpumask_of_node(next_node)); |
| 6842 | } | 7010 | } |
| 6843 | } | 7011 | } |
| 7012 | |||
| 7013 | static const struct cpumask *cpu_node_mask(int cpu) | ||
| 7014 | { | ||
| 7015 | lockdep_assert_held(&sched_domains_mutex); | ||
| 7016 | |||
| 7017 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
| 7018 | |||
| 7019 | return sched_domains_tmpmask; | ||
| 7020 | } | ||
| 7021 | |||
| 7022 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
| 7023 | { | ||
| 7024 | return cpu_possible_mask; | ||
| 7025 | } | ||
| 6844 | #endif /* CONFIG_NUMA */ | 7026 | #endif /* CONFIG_NUMA */ |
| 6845 | 7027 | ||
| 6846 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 7028 | static const struct cpumask *cpu_cpu_mask(int cpu) |
| 7029 | { | ||
| 7030 | return cpumask_of_node(cpu_to_node(cpu)); | ||
| 7031 | } | ||
| 6847 | 7032 | ||
| 6848 | /* | 7033 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
| 6849 | * The cpus mask in sched_group and sched_domain hangs off the end. | ||
| 6850 | * | ||
| 6851 | * ( See the the comments in include/linux/sched.h:struct sched_group | ||
| 6852 | * and struct sched_domain. ) | ||
| 6853 | */ | ||
| 6854 | struct static_sched_group { | ||
| 6855 | struct sched_group sg; | ||
| 6856 | DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); | ||
| 6857 | }; | ||
| 6858 | 7034 | ||
| 6859 | struct static_sched_domain { | 7035 | struct sd_data { |
| 6860 | struct sched_domain sd; | 7036 | struct sched_domain **__percpu sd; |
| 6861 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 7037 | struct sched_group **__percpu sg; |
| 7038 | struct sched_group_power **__percpu sgp; | ||
| 6862 | }; | 7039 | }; |
| 6863 | 7040 | ||
| 6864 | struct s_data { | 7041 | struct s_data { |
| 6865 | #ifdef CONFIG_NUMA | 7042 | struct sched_domain ** __percpu sd; |
| 6866 | int sd_allnodes; | ||
| 6867 | cpumask_var_t domainspan; | ||
| 6868 | cpumask_var_t covered; | ||
| 6869 | cpumask_var_t notcovered; | ||
| 6870 | #endif | ||
| 6871 | cpumask_var_t nodemask; | ||
| 6872 | cpumask_var_t this_sibling_map; | ||
| 6873 | cpumask_var_t this_core_map; | ||
| 6874 | cpumask_var_t this_book_map; | ||
| 6875 | cpumask_var_t send_covered; | ||
| 6876 | cpumask_var_t tmpmask; | ||
| 6877 | struct sched_group **sched_group_nodes; | ||
| 6878 | struct root_domain *rd; | 7043 | struct root_domain *rd; |
| 6879 | }; | 7044 | }; |
| 6880 | 7045 | ||
| 6881 | enum s_alloc { | 7046 | enum s_alloc { |
| 6882 | sa_sched_groups = 0, | ||
| 6883 | sa_rootdomain, | 7047 | sa_rootdomain, |
| 6884 | sa_tmpmask, | 7048 | sa_sd, |
| 6885 | sa_send_covered, | 7049 | sa_sd_storage, |
| 6886 | sa_this_book_map, | ||
| 6887 | sa_this_core_map, | ||
| 6888 | sa_this_sibling_map, | ||
| 6889 | sa_nodemask, | ||
| 6890 | sa_sched_group_nodes, | ||
| 6891 | #ifdef CONFIG_NUMA | ||
| 6892 | sa_notcovered, | ||
| 6893 | sa_covered, | ||
| 6894 | sa_domainspan, | ||
| 6895 | #endif | ||
| 6896 | sa_none, | 7050 | sa_none, |
| 6897 | }; | 7051 | }; |
| 6898 | 7052 | ||
| 6899 | /* | 7053 | struct sched_domain_topology_level; |
| 6900 | * SMT sched-domains: | ||
| 6901 | */ | ||
| 6902 | #ifdef CONFIG_SCHED_SMT | ||
| 6903 | static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); | ||
| 6904 | static DEFINE_PER_CPU(struct static_sched_group, sched_groups); | ||
| 6905 | 7054 | ||
| 6906 | static int | 7055 | typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); |
| 6907 | cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | 7056 | typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); |
| 6908 | struct sched_group **sg, struct cpumask *unused) | ||
| 6909 | { | ||
| 6910 | if (sg) | ||
| 6911 | *sg = &per_cpu(sched_groups, cpu).sg; | ||
| 6912 | return cpu; | ||
| 6913 | } | ||
| 6914 | #endif /* CONFIG_SCHED_SMT */ | ||
| 6915 | 7057 | ||
| 6916 | /* | 7058 | #define SDTL_OVERLAP 0x01 |
| 6917 | * multi-core sched-domains: | 7059 | |
| 6918 | */ | 7060 | struct sched_domain_topology_level { |
| 6919 | #ifdef CONFIG_SCHED_MC | 7061 | sched_domain_init_f init; |
| 6920 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 7062 | sched_domain_mask_f mask; |
| 6921 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | 7063 | int flags; |
| 7064 | struct sd_data data; | ||
| 7065 | }; | ||
| 6922 | 7066 | ||
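struct sched_domain_topology_level pairs an init function with a cpumask function (plus optional flags and per-level sd_data) for each level of the hierarchy. The expectation, hedged because the table itself is outside the lines shown here, is that domain construction is then driven off an ordered array of these levels instead of the removed per-level builders. Roughly:

/* Hedged reconstruction, not verbatim from this hunk. Only cpu_cpu_mask,
 * cpu_node_mask, cpu_allnodes_mask and SDTL_OVERLAP appear above; the
 * sd_init_*() names and cpu_smt_mask/cpu_coregroup_mask are assumed from
 * the SD_INIT_FUNC() definitions and topology helpers elsewhere. */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
	{ sd_init_MC, cpu_coregroup_mask, },
#endif
	{ sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
	{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
	{ NULL, },
};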
| 6923 | static int | 7067 | static int |
| 6924 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 7068 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
| 6925 | struct sched_group **sg, struct cpumask *mask) | ||
| 6926 | { | 7069 | { |
| 6927 | int group; | 7070 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; |
| 6928 | #ifdef CONFIG_SCHED_SMT | 7071 | const struct cpumask *span = sched_domain_span(sd); |
| 6929 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | 7072 | struct cpumask *covered = sched_domains_tmpmask; |
| 6930 | group = cpumask_first(mask); | 7073 | struct sd_data *sdd = sd->private; |
| 6931 | #else | 7074 | struct sched_domain *child; |
| 6932 | group = cpu; | 7075 | int i; |
| 6933 | #endif | ||
| 6934 | if (sg) | ||
| 6935 | *sg = &per_cpu(sched_group_core, group).sg; | ||
| 6936 | return group; | ||
| 6937 | } | ||
| 6938 | #endif /* CONFIG_SCHED_MC */ | ||
| 6939 | 7076 | ||
| 6940 | /* | 7077 | cpumask_clear(covered); |
| 6941 | * book sched-domains: | ||
| 6942 | */ | ||
| 6943 | #ifdef CONFIG_SCHED_BOOK | ||
| 6944 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
| 6945 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
| 6946 | 7078 | ||
| 6947 | static int | 7079 | for_each_cpu(i, span) { |
| 6948 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, | 7080 | struct cpumask *sg_span; |
| 6949 | struct sched_group **sg, struct cpumask *mask) | ||
| 6950 | { | ||
| 6951 | int group = cpu; | ||
| 6952 | #ifdef CONFIG_SCHED_MC | ||
| 6953 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
| 6954 | group = cpumask_first(mask); | ||
| 6955 | #elif defined(CONFIG_SCHED_SMT) | ||
| 6956 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
| 6957 | group = cpumask_first(mask); | ||
| 6958 | #endif | ||
| 6959 | if (sg) | ||
| 6960 | *sg = &per_cpu(sched_group_book, group).sg; | ||
| 6961 | return group; | ||
| 6962 | } | ||
| 6963 | #endif /* CONFIG_SCHED_BOOK */ | ||
| 6964 | 7081 | ||
| 6965 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 7082 | if (cpumask_test_cpu(i, covered)) |
| 6966 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 7083 | continue; |
| 6967 | 7084 | ||
| 6968 | static int | 7085 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| 6969 | cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | 7086 | GFP_KERNEL, cpu_to_node(i)); |
| 6970 | struct sched_group **sg, struct cpumask *mask) | ||
| 6971 | { | ||
| 6972 | int group; | ||
| 6973 | #ifdef CONFIG_SCHED_BOOK | ||
| 6974 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
| 6975 | group = cpumask_first(mask); | ||
| 6976 | #elif defined(CONFIG_SCHED_MC) | ||
| 6977 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
| 6978 | group = cpumask_first(mask); | ||
| 6979 | #elif defined(CONFIG_SCHED_SMT) | ||
| 6980 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
| 6981 | group = cpumask_first(mask); | ||
| 6982 | #else | ||
| 6983 | group = cpu; | ||
| 6984 | #endif | ||
| 6985 | if (sg) | ||
| 6986 | *sg = &per_cpu(sched_group_phys, group).sg; | ||
| 6987 | return group; | ||
| 6988 | } | ||
| 6989 | 7087 | ||
| 6990 | #ifdef CONFIG_NUMA | 7088 | if (!sg) |
| 6991 | /* | 7089 | goto fail; |
| 6992 | * The init_sched_build_groups can't handle what we want to do with node | ||
| 6993 | * groups, so roll our own. Now each node has its own list of groups which | ||
| 6994 | * gets dynamically allocated. | ||
| 6995 | */ | ||
| 6996 | static DEFINE_PER_CPU(struct static_sched_domain, node_domains); | ||
| 6997 | static struct sched_group ***sched_group_nodes_bycpu; | ||
| 6998 | 7090 | ||
| 6999 | static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); | 7091 | sg_span = sched_group_cpus(sg); |
| 7000 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); | ||
| 7001 | 7092 | ||
| 7002 | static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, | 7093 | child = *per_cpu_ptr(sdd->sd, i); |
| 7003 | struct sched_group **sg, | 7094 | if (child->child) { |
| 7004 | struct cpumask *nodemask) | 7095 | child = child->child; |
| 7005 | { | 7096 | cpumask_copy(sg_span, sched_domain_span(child)); |
| 7006 | int group; | 7097 | } else |
| 7098 | cpumask_set_cpu(i, sg_span); | ||
| 7007 | 7099 | ||
| 7008 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); | 7100 | cpumask_or(covered, covered, sg_span); |
| 7009 | group = cpumask_first(nodemask); | ||
| 7010 | 7101 | ||
| 7011 | if (sg) | 7102 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); |
| 7012 | *sg = &per_cpu(sched_group_allnodes, group).sg; | 7103 | atomic_inc(&sg->sgp->ref); |
| 7013 | return group; | ||
| 7014 | } | ||
| 7015 | 7104 | ||
| 7016 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 7105 | if (cpumask_test_cpu(cpu, sg_span)) |
| 7017 | { | 7106 | groups = sg; |
| 7018 | struct sched_group *sg = group_head; | ||
| 7019 | int j; | ||
| 7020 | 7107 | ||
| 7021 | if (!sg) | 7108 | if (!first) |
| 7022 | return; | 7109 | first = sg; |
| 7023 | do { | 7110 | if (last) |
| 7024 | for_each_cpu(j, sched_group_cpus(sg)) { | 7111 | last->next = sg; |
| 7025 | struct sched_domain *sd; | 7112 | last = sg; |
| 7113 | last->next = first; | ||
| 7114 | } | ||
| 7115 | sd->groups = groups; | ||
| 7026 | 7116 | ||
| 7027 | sd = &per_cpu(phys_domains, j).sd; | 7117 | return 0; |
| 7028 | if (j != group_first_cpu(sd->groups)) { | ||
| 7029 | /* | ||
| 7030 | * Only add "power" once for each | ||
| 7031 | * physical package. | ||
| 7032 | */ | ||
| 7033 | continue; | ||
| 7034 | } | ||
| 7035 | 7118 | ||
| 7036 | sg->cpu_power += sd->groups->cpu_power; | 7119 | fail: |
| 7037 | } | 7120 | free_sched_groups(first, 0); |
| 7038 | sg = sg->next; | 7121 | |
| 7039 | } while (sg != group_head); | 7122 | return -ENOMEM; |
| 7040 | } | 7123 | } |
| 7041 | 7124 | ||
| 7042 | static int build_numa_sched_groups(struct s_data *d, | 7125 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) |
| 7043 | const struct cpumask *cpu_map, int num) | ||
| 7044 | { | 7126 | { |
| 7045 | struct sched_domain *sd; | 7127 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
| 7046 | struct sched_group *sg, *prev; | 7128 | struct sched_domain *child = sd->child; |
| 7047 | int n, j; | ||
| 7048 | 7129 | ||
| 7049 | cpumask_clear(d->covered); | 7130 | if (child) |
| 7050 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | 7131 | cpu = cpumask_first(sched_domain_span(child)); |
| 7051 | if (cpumask_empty(d->nodemask)) { | 7132 | |
| 7052 | d->sched_group_nodes[num] = NULL; | 7133 | if (sg) { |
| 7053 | goto out; | 7134 | *sg = *per_cpu_ptr(sdd->sg, cpu); |
| 7135 | (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); | ||
| 7136 | atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ | ||
| 7054 | } | 7137 | } |
| 7055 | 7138 | ||
| 7056 | sched_domain_node_span(num, d->domainspan); | 7139 | return cpu; |
| 7057 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | 7140 | } |
| 7058 | 7141 | ||
| 7059 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 7142 | /* |
| 7060 | GFP_KERNEL, num); | 7143 | * build_sched_groups will build a circular linked list of the groups |
| 7061 | if (!sg) { | 7144 | * covered by the given span, and will set each group's ->cpumask correctly, |
| 7062 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | 7145 | * and ->cpu_power to 0. |
| 7063 | num); | 7146 | * |
| 7064 | return -ENOMEM; | 7147 | * Assumes the sched_domain tree is fully constructed |
| 7065 | } | 7148 | */ |
| 7066 | d->sched_group_nodes[num] = sg; | 7149 | static int |
| 7150 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
| 7151 | { | ||
| 7152 | struct sched_group *first = NULL, *last = NULL; | ||
| 7153 | struct sd_data *sdd = sd->private; | ||
| 7154 | const struct cpumask *span = sched_domain_span(sd); | ||
| 7155 | struct cpumask *covered; | ||
| 7156 | int i; | ||
| 7067 | 7157 | ||
| 7068 | for_each_cpu(j, d->nodemask) { | 7158 | get_group(cpu, sdd, &sd->groups); |
| 7069 | sd = &per_cpu(node_domains, j).sd; | 7159 | atomic_inc(&sd->groups->ref); |
| 7070 | sd->groups = sg; | ||
| 7071 | } | ||
| 7072 | 7160 | ||
| 7073 | sg->cpu_power = 0; | 7161 | if (cpu != cpumask_first(sched_domain_span(sd))) |
| 7074 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | 7162 | return 0; |
| 7075 | sg->next = sg; | ||
| 7076 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
| 7077 | 7163 | ||
| 7078 | prev = sg; | 7164 | lockdep_assert_held(&sched_domains_mutex); |
| 7079 | for (j = 0; j < nr_node_ids; j++) { | 7165 | covered = sched_domains_tmpmask; |
| 7080 | n = (num + j) % nr_node_ids; | ||
| 7081 | cpumask_complement(d->notcovered, d->covered); | ||
| 7082 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
| 7083 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
| 7084 | if (cpumask_empty(d->tmpmask)) | ||
| 7085 | break; | ||
| 7086 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
| 7087 | if (cpumask_empty(d->tmpmask)) | ||
| 7088 | continue; | ||
| 7089 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 7090 | GFP_KERNEL, num); | ||
| 7091 | if (!sg) { | ||
| 7092 | printk(KERN_WARNING | ||
| 7093 | "Can not alloc domain group for node %d\n", j); | ||
| 7094 | return -ENOMEM; | ||
| 7095 | } | ||
| 7096 | sg->cpu_power = 0; | ||
| 7097 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
| 7098 | sg->next = prev->next; | ||
| 7099 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
| 7100 | prev->next = sg; | ||
| 7101 | prev = sg; | ||
| 7102 | } | ||
| 7103 | out: | ||
| 7104 | return 0; | ||
| 7105 | } | ||
| 7106 | #endif /* CONFIG_NUMA */ | ||
| 7107 | 7166 | ||
| 7108 | #ifdef CONFIG_NUMA | 7167 | cpumask_clear(covered); |
| 7109 | /* Free memory allocated for various sched_group structures */ | ||
| 7110 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
| 7111 | struct cpumask *nodemask) | ||
| 7112 | { | ||
| 7113 | int cpu, i; | ||
| 7114 | 7168 | ||
| 7115 | for_each_cpu(cpu, cpu_map) { | 7169 | for_each_cpu(i, span) { |
| 7116 | struct sched_group **sched_group_nodes | 7170 | struct sched_group *sg; |
| 7117 | = sched_group_nodes_bycpu[cpu]; | 7171 | int group = get_group(i, sdd, &sg); |
| 7172 | int j; | ||
| 7118 | 7173 | ||
| 7119 | if (!sched_group_nodes) | 7174 | if (cpumask_test_cpu(i, covered)) |
| 7120 | continue; | 7175 | continue; |
| 7121 | 7176 | ||
| 7122 | for (i = 0; i < nr_node_ids; i++) { | 7177 | cpumask_clear(sched_group_cpus(sg)); |
| 7123 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 7178 | sg->sgp->power = 0; |
| 7124 | 7179 | ||
| 7125 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 7180 | for_each_cpu(j, span) { |
| 7126 | if (cpumask_empty(nodemask)) | 7181 | if (get_group(j, sdd, NULL) != group) |
| 7127 | continue; | 7182 | continue; |
| 7128 | 7183 | ||
| 7129 | if (sg == NULL) | 7184 | cpumask_set_cpu(j, covered); |
| 7130 | continue; | 7185 | cpumask_set_cpu(j, sched_group_cpus(sg)); |
| 7131 | sg = sg->next; | ||
| 7132 | next_sg: | ||
| 7133 | oldsg = sg; | ||
| 7134 | sg = sg->next; | ||
| 7135 | kfree(oldsg); | ||
| 7136 | if (oldsg != sched_group_nodes[i]) | ||
| 7137 | goto next_sg; | ||
| 7138 | } | 7186 | } |
| 7139 | kfree(sched_group_nodes); | 7187 | |
| 7140 | sched_group_nodes_bycpu[cpu] = NULL; | 7188 | if (!first) |
| 7189 | first = sg; | ||
| 7190 | if (last) | ||
| 7191 | last->next = sg; | ||
| 7192 | last = sg; | ||
| 7141 | } | 7193 | } |
| 7194 | last->next = first; | ||
| 7195 | |||
| 7196 | return 0; | ||
| 7142 | } | 7197 | } |
| 7143 | #else /* !CONFIG_NUMA */ | ||
| 7144 | static void free_sched_groups(const struct cpumask *cpu_map, | ||
| 7145 | struct cpumask *nodemask) | ||
| 7146 | { | ||
| 7147 | } | ||
| 7148 | #endif /* CONFIG_NUMA */ | ||
| 7149 | 7198 | ||
| 7150 | /* | 7199 | /* |
| 7151 | * Initialize sched groups cpu_power. | 7200 | * Initialize sched groups cpu_power. |
| @@ -7159,48 +7208,19 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
| 7159 | */ | 7208 | */ |
| 7160 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 7209 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
| 7161 | { | 7210 | { |
| 7162 | struct sched_domain *child; | 7211 | struct sched_group *sg = sd->groups; |
| 7163 | struct sched_group *group; | ||
| 7164 | long power; | ||
| 7165 | int weight; | ||
| 7166 | 7212 | ||
| 7167 | WARN_ON(!sd || !sd->groups); | 7213 | WARN_ON(!sd || !sg); |
| 7168 | |||
| 7169 | if (cpu != group_first_cpu(sd->groups)) | ||
| 7170 | return; | ||
| 7171 | 7214 | ||
| 7172 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | 7215 | do { |
| 7173 | 7216 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | |
| 7174 | child = sd->child; | 7217 | sg = sg->next; |
| 7175 | 7218 | } while (sg != sd->groups); | |
| 7176 | sd->groups->cpu_power = 0; | ||
| 7177 | 7219 | ||
| 7178 | if (!child) { | 7220 | if (cpu != group_first_cpu(sg)) |
| 7179 | power = SCHED_LOAD_SCALE; | ||
| 7180 | weight = cpumask_weight(sched_domain_span(sd)); | ||
| 7181 | /* | ||
| 7182 | * SMT siblings share the power of a single core. | ||
| 7183 | * Usually multiple threads get a better yield out of | ||
| 7184 | * that one core than a single thread would have, | ||
| 7185 | * reflect that in sd->smt_gain. | ||
| 7186 | */ | ||
| 7187 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
| 7188 | power *= sd->smt_gain; | ||
| 7189 | power /= weight; | ||
| 7190 | power >>= SCHED_LOAD_SHIFT; | ||
| 7191 | } | ||
| 7192 | sd->groups->cpu_power += power; | ||
| 7193 | return; | 7221 | return; |
| 7194 | } | ||
| 7195 | 7222 | ||
| 7196 | /* | 7223 | update_group_power(sd, cpu); |
| 7197 | * Add cpu_power of each child group to this groups cpu_power. | ||
| 7198 | */ | ||
| 7199 | group = child->groups; | ||
| 7200 | do { | ||
| 7201 | sd->groups->cpu_power += group->cpu_power; | ||
| 7202 | group = group->next; | ||
| 7203 | } while (group != child->groups); | ||
| 7204 | } | 7224 | } |
| 7205 | 7225 | ||
| 7206 | /* | 7226 | /* |
| @@ -7214,15 +7234,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 7214 | # define SD_INIT_NAME(sd, type) do { } while (0) | 7234 | # define SD_INIT_NAME(sd, type) do { } while (0) |
| 7215 | #endif | 7235 | #endif |
| 7216 | 7236 | ||
| 7217 | #define SD_INIT(sd, type) sd_init_##type(sd) | 7237 | #define SD_INIT_FUNC(type) \ |
| 7218 | 7238 | static noinline struct sched_domain * \ | |
| 7219 | #define SD_INIT_FUNC(type) \ | 7239 | sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ |
| 7220 | static noinline void sd_init_##type(struct sched_domain *sd) \ | 7240 | { \ |
| 7221 | { \ | 7241 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ |
| 7222 | memset(sd, 0, sizeof(*sd)); \ | 7242 | *sd = SD_##type##_INIT; \ |
| 7223 | *sd = SD_##type##_INIT; \ | 7243 | SD_INIT_NAME(sd, type); \ |
| 7224 | sd->level = SD_LV_##type; \ | 7244 | sd->private = &tl->data; \ |
| 7225 | SD_INIT_NAME(sd, type); \ | 7245 | return sd; \ |
| 7226 | } | 7246 | } |
| 7227 | 7247 | ||
| 7228 | SD_INIT_FUNC(CPU) | 7248 | SD_INIT_FUNC(CPU) |
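For readers tracking the macro change: the new SD_INIT_FUNC() no longer memsets a caller-supplied domain, it initializes and returns the per-cpu domain owned by the topology level. Expanding SD_INIT_FUNC(CPU) by hand gives approximately:

/* Hand expansion of the macro above; SD_CPU_INIT comes from
 * include/linux/topology.h and SD_INIT_NAME() is defined just before it. */
static noinline struct sched_domain *
sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
{
	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

	*sd = SD_CPU_INIT;
	SD_INIT_NAME(sd, CPU);
	sd->private = &tl->data;

	return sd;
}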
| @@ -7241,13 +7261,14 @@ SD_INIT_FUNC(CPU) | |||
| 7241 | #endif | 7261 | #endif |
| 7242 | 7262 | ||
| 7243 | static int default_relax_domain_level = -1; | 7263 | static int default_relax_domain_level = -1; |
| 7264 | int sched_domain_level_max; | ||
| 7244 | 7265 | ||
| 7245 | static int __init setup_relax_domain_level(char *str) | 7266 | static int __init setup_relax_domain_level(char *str) |
| 7246 | { | 7267 | { |
| 7247 | unsigned long val; | 7268 | unsigned long val; |
| 7248 | 7269 | ||
| 7249 | val = simple_strtoul(str, NULL, 0); | 7270 | val = simple_strtoul(str, NULL, 0); |
| 7250 | if (val < SD_LV_MAX) | 7271 | if (val < sched_domain_level_max) |
| 7251 | default_relax_domain_level = val; | 7272 | default_relax_domain_level = val; |
| 7252 | 7273 | ||
| 7253 | return 1; | 7274 | return 1; |
| @@ -7275,37 +7296,20 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
| 7275 | } | 7296 | } |
| 7276 | } | 7297 | } |
| 7277 | 7298 | ||
| 7299 | static void __sdt_free(const struct cpumask *cpu_map); | ||
| 7300 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
| 7301 | |||
| 7278 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | 7302 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
| 7279 | const struct cpumask *cpu_map) | 7303 | const struct cpumask *cpu_map) |
| 7280 | { | 7304 | { |
| 7281 | switch (what) { | 7305 | switch (what) { |
| 7282 | case sa_sched_groups: | ||
| 7283 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
| 7284 | d->sched_group_nodes = NULL; | ||
| 7285 | case sa_rootdomain: | 7306 | case sa_rootdomain: |
| 7286 | free_rootdomain(d->rd); /* fall through */ | 7307 | if (!atomic_read(&d->rd->refcount)) |
| 7287 | case sa_tmpmask: | 7308 | free_rootdomain(&d->rd->rcu); /* fall through */ |
| 7288 | free_cpumask_var(d->tmpmask); /* fall through */ | 7309 | case sa_sd: |
| 7289 | case sa_send_covered: | 7310 | free_percpu(d->sd); /* fall through */ |
| 7290 | free_cpumask_var(d->send_covered); /* fall through */ | 7311 | case sa_sd_storage: |
| 7291 | case sa_this_book_map: | 7312 | __sdt_free(cpu_map); /* fall through */ |
| 7292 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
| 7293 | case sa_this_core_map: | ||
| 7294 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
| 7295 | case sa_this_sibling_map: | ||
| 7296 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
| 7297 | case sa_nodemask: | ||
| 7298 | free_cpumask_var(d->nodemask); /* fall through */ | ||
| 7299 | case sa_sched_group_nodes: | ||
| 7300 | #ifdef CONFIG_NUMA | ||
| 7301 | kfree(d->sched_group_nodes); /* fall through */ | ||
| 7302 | case sa_notcovered: | ||
| 7303 | free_cpumask_var(d->notcovered); /* fall through */ | ||
| 7304 | case sa_covered: | ||
| 7305 | free_cpumask_var(d->covered); /* fall through */ | ||
| 7306 | case sa_domainspan: | ||
| 7307 | free_cpumask_var(d->domainspan); /* fall through */ | ||
| 7308 | #endif | ||
| 7309 | case sa_none: | 7313 | case sa_none: |
| 7310 | break; | 7314 | break; |
| 7311 | } | 7315 | } |
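The shrunken __free_domain_allocs() keeps the staged-teardown idiom, just with far fewer stages: the enum records how far allocation got, and every later case falls through to undo everything allocated before it. The idiom itself, as a tiny self-contained example with made-up stage names:

#include <stdio.h>

enum demo_alloc { demo_none, demo_bufs, demo_locks, demo_all };

static void demo_undo(enum demo_alloc reached)
{
	switch (reached) {
	case demo_all:
		puts("release threads");        /* fall through */
	case demo_locks:
		puts("release locks");          /* fall through */
	case demo_bufs:
		puts("release buffers");        /* fall through */
	case demo_none:
		break;
	}
}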
| @@ -7314,308 +7318,233 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
| 7314 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | 7318 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
| 7315 | const struct cpumask *cpu_map) | 7319 | const struct cpumask *cpu_map) |
| 7316 | { | 7320 | { |
| 7317 | #ifdef CONFIG_NUMA | 7321 | memset(d, 0, sizeof(*d)); |
| 7318 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) | 7322 | |
| 7319 | return sa_none; | 7323 | if (__sdt_alloc(cpu_map)) |
| 7320 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) | 7324 | return sa_sd_storage; |
| 7321 | return sa_domainspan; | 7325 | d->sd = alloc_percpu(struct sched_domain *); |
| 7322 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) | 7326 | if (!d->sd) |
| 7323 | return sa_covered; | 7327 | return sa_sd_storage; |
| 7324 | /* Allocate the per-node list of sched groups */ | ||
| 7325 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
| 7326 | sizeof(struct sched_group *), GFP_KERNEL); | ||
| 7327 | if (!d->sched_group_nodes) { | ||
| 7328 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
| 7329 | return sa_notcovered; | ||
| 7330 | } | ||
| 7331 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; | ||
| 7332 | #endif | ||
| 7333 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) | ||
| 7334 | return sa_sched_group_nodes; | ||
| 7335 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
| 7336 | return sa_nodemask; | ||
| 7337 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
| 7338 | return sa_this_sibling_map; | ||
| 7339 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) | ||
| 7340 | return sa_this_core_map; | ||
| 7341 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
| 7342 | return sa_this_book_map; | ||
| 7343 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
| 7344 | return sa_send_covered; | ||
| 7345 | d->rd = alloc_rootdomain(); | 7328 | d->rd = alloc_rootdomain(); |
| 7346 | if (!d->rd) { | 7329 | if (!d->rd) |
| 7347 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 7330 | return sa_sd; |
| 7348 | return sa_tmpmask; | ||
| 7349 | } | ||
| 7350 | return sa_rootdomain; | 7331 | return sa_rootdomain; |
| 7351 | } | 7332 | } |
| 7352 | 7333 | ||
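The allocation/teardown pair above is one pattern: __visit_domain_allocation_hell() returns the enum s_alloc stage it managed to reach, and __free_domain_allocs() switches on that stage and falls through downward, so only what really got allocated is released. A minimal userspace sketch of the same idea follows; the stage names, context struct and malloc/free are illustrative stand-ins, not the kernel's own.

    #include <stdlib.h>

    /* Stages, ordered from "nothing allocated" upward (hypothetical names). */
    enum stage { ST_NONE, ST_STORAGE, ST_PERCPU, ST_ROOTDOMAIN };

    struct ctx {
        void *storage;
        void *percpu;
        void *rootdomain;
    };

    /* Free everything allocated up to and including 'reached', in reverse. */
    static void teardown(struct ctx *c, enum stage reached)
    {
        switch (reached) {
        case ST_ROOTDOMAIN:
            free(c->rootdomain);   /* fall through */
        case ST_PERCPU:
            free(c->percpu);       /* fall through */
        case ST_STORAGE:
            free(c->storage);      /* fall through */
        case ST_NONE:
            break;
        }
    }

    /* Return the highest stage successfully reached. */
    static enum stage setup(struct ctx *c)
    {
        c->storage = malloc(64);
        if (!c->storage)
            return ST_NONE;
        c->percpu = malloc(64);
        if (!c->percpu)
            return ST_STORAGE;
        c->rootdomain = malloc(64);
        if (!c->rootdomain)
            return ST_PERCPU;
        return ST_ROOTDOMAIN;
    }

    int main(void)
    {
        struct ctx c = { 0 };
        enum stage s = setup(&c);

        if (s != ST_ROOTDOMAIN) {
            teardown(&c, s);    /* partial failure: unwind only what exists */
            return 1;
        }
        /* ... use the fully built context, then release it the same way ... */
        teardown(&c, ST_ROOTDOMAIN);
        return 0;
    }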
| 7353 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | 7334 | /* |
| 7354 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | 7335 | * NULL the sd_data elements we've used to build the sched_domain and |
| 7336 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
| 7337 | * will not free the data we're using. | ||
| 7338 | */ | ||
| 7339 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
| 7355 | { | 7340 | { |
| 7356 | struct sched_domain *sd = NULL; | 7341 | struct sd_data *sdd = sd->private; |
| 7357 | #ifdef CONFIG_NUMA | ||
| 7358 | struct sched_domain *parent; | ||
| 7359 | |||
| 7360 | d->sd_allnodes = 0; | ||
| 7361 | if (cpumask_weight(cpu_map) > | ||
| 7362 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
| 7363 | sd = &per_cpu(allnodes_domains, i).sd; | ||
| 7364 | SD_INIT(sd, ALLNODES); | ||
| 7365 | set_domain_attribute(sd, attr); | ||
| 7366 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
| 7367 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7368 | d->sd_allnodes = 1; | ||
| 7369 | } | ||
| 7370 | parent = sd; | ||
| 7371 | |||
| 7372 | sd = &per_cpu(node_domains, i).sd; | ||
| 7373 | SD_INIT(sd, NODE); | ||
| 7374 | set_domain_attribute(sd, attr); | ||
| 7375 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
| 7376 | sd->parent = parent; | ||
| 7377 | if (parent) | ||
| 7378 | parent->child = sd; | ||
| 7379 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
| 7380 | #endif | ||
| 7381 | return sd; | ||
| 7382 | } | ||
| 7383 | 7342 | ||
| 7384 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | 7343 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
| 7385 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7344 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
| 7386 | struct sched_domain *parent, int i) | ||
| 7387 | { | ||
| 7388 | struct sched_domain *sd; | ||
| 7389 | sd = &per_cpu(phys_domains, i).sd; | ||
| 7390 | SD_INIT(sd, CPU); | ||
| 7391 | set_domain_attribute(sd, attr); | ||
| 7392 | cpumask_copy(sched_domain_span(sd), d->nodemask); | ||
| 7393 | sd->parent = parent; | ||
| 7394 | if (parent) | ||
| 7395 | parent->child = sd; | ||
| 7396 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7397 | return sd; | ||
| 7398 | } | ||
| 7399 | 7345 | ||
| 7400 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | 7346 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) |
| 7401 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7347 | *per_cpu_ptr(sdd->sg, cpu) = NULL; |
| 7402 | struct sched_domain *parent, int i) | ||
| 7403 | { | ||
| 7404 | struct sched_domain *sd = parent; | ||
| 7405 | #ifdef CONFIG_SCHED_BOOK | ||
| 7406 | sd = &per_cpu(book_domains, i).sd; | ||
| 7407 | SD_INIT(sd, BOOK); | ||
| 7408 | set_domain_attribute(sd, attr); | ||
| 7409 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
| 7410 | sd->parent = parent; | ||
| 7411 | parent->child = sd; | ||
| 7412 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7413 | #endif | ||
| 7414 | return sd; | ||
| 7415 | } | ||
| 7416 | 7348 | ||
| 7417 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7349 | if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) |
| 7418 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7350 | *per_cpu_ptr(sdd->sgp, cpu) = NULL; |
| 7419 | struct sched_domain *parent, int i) | ||
| 7420 | { | ||
| 7421 | struct sched_domain *sd = parent; | ||
| 7422 | #ifdef CONFIG_SCHED_MC | ||
| 7423 | sd = &per_cpu(core_domains, i).sd; | ||
| 7424 | SD_INIT(sd, MC); | ||
| 7425 | set_domain_attribute(sd, attr); | ||
| 7426 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); | ||
| 7427 | sd->parent = parent; | ||
| 7428 | parent->child = sd; | ||
| 7429 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7430 | #endif | ||
| 7431 | return sd; | ||
| 7432 | } | 7351 | } |
| 7433 | 7352 | ||
| 7434 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
| 7435 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
| 7436 | struct sched_domain *parent, int i) | ||
| 7437 | { | ||
| 7438 | struct sched_domain *sd = parent; | ||
| 7439 | #ifdef CONFIG_SCHED_SMT | 7353 | #ifdef CONFIG_SCHED_SMT |
| 7440 | sd = &per_cpu(cpu_domains, i).sd; | 7354 | static const struct cpumask *cpu_smt_mask(int cpu) |
| 7441 | SD_INIT(sd, SIBLING); | 7355 | { |
| 7442 | set_domain_attribute(sd, attr); | 7356 | return topology_thread_cpumask(cpu); |
| 7443 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); | ||
| 7444 | sd->parent = parent; | ||
| 7445 | parent->child = sd; | ||
| 7446 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
| 7447 | #endif | ||
| 7448 | return sd; | ||
| 7449 | } | 7357 | } |
| 7358 | #endif | ||
| 7450 | 7359 | ||
| 7451 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | 7360 | /* |
| 7452 | const struct cpumask *cpu_map, int cpu) | 7361 | * Topology list, bottom-up. |
| 7453 | { | 7362 | */ |
| 7454 | switch (l) { | 7363 | static struct sched_domain_topology_level default_topology[] = { |
| 7455 | #ifdef CONFIG_SCHED_SMT | 7364 | #ifdef CONFIG_SCHED_SMT |
| 7456 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ | 7365 | { sd_init_SIBLING, cpu_smt_mask, }, |
| 7457 | cpumask_and(d->this_sibling_map, cpu_map, | ||
| 7458 | topology_thread_cpumask(cpu)); | ||
| 7459 | if (cpu == cpumask_first(d->this_sibling_map)) | ||
| 7460 | init_sched_build_groups(d->this_sibling_map, cpu_map, | ||
| 7461 | &cpu_to_cpu_group, | ||
| 7462 | d->send_covered, d->tmpmask); | ||
| 7463 | break; | ||
| 7464 | #endif | 7366 | #endif |
| 7465 | #ifdef CONFIG_SCHED_MC | 7367 | #ifdef CONFIG_SCHED_MC |
| 7466 | case SD_LV_MC: /* set up multi-core groups */ | 7368 | { sd_init_MC, cpu_coregroup_mask, }, |
| 7467 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); | ||
| 7468 | if (cpu == cpumask_first(d->this_core_map)) | ||
| 7469 | init_sched_build_groups(d->this_core_map, cpu_map, | ||
| 7470 | &cpu_to_core_group, | ||
| 7471 | d->send_covered, d->tmpmask); | ||
| 7472 | break; | ||
| 7473 | #endif | 7369 | #endif |
| 7474 | #ifdef CONFIG_SCHED_BOOK | 7370 | #ifdef CONFIG_SCHED_BOOK |
| 7475 | case SD_LV_BOOK: /* set up book groups */ | 7371 | { sd_init_BOOK, cpu_book_mask, }, |
| 7476 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
| 7477 | if (cpu == cpumask_first(d->this_book_map)) | ||
| 7478 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
| 7479 | &cpu_to_book_group, | ||
| 7480 | d->send_covered, d->tmpmask); | ||
| 7481 | break; | ||
| 7482 | #endif | 7372 | #endif |
| 7483 | case SD_LV_CPU: /* set up physical groups */ | 7373 | { sd_init_CPU, cpu_cpu_mask, }, |
| 7484 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | ||
| 7485 | if (!cpumask_empty(d->nodemask)) | ||
| 7486 | init_sched_build_groups(d->nodemask, cpu_map, | ||
| 7487 | &cpu_to_phys_group, | ||
| 7488 | d->send_covered, d->tmpmask); | ||
| 7489 | break; | ||
| 7490 | #ifdef CONFIG_NUMA | 7374 | #ifdef CONFIG_NUMA |
| 7491 | case SD_LV_ALLNODES: | 7375 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, |
| 7492 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, | 7376 | { sd_init_ALLNODES, cpu_allnodes_mask, }, |
| 7493 | d->send_covered, d->tmpmask); | ||
| 7494 | break; | ||
| 7495 | #endif | 7377 | #endif |
| 7496 | default: | 7378 | { NULL, }, |
| 7497 | break; | 7379 | }; |
| 7380 | |||
| 7381 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | ||
| 7382 | |||
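default_topology[] replaces the old per-level #ifdef'd builder functions with a data table: each entry pairs an init routine with a cpumask accessor, the entries are ordered bottom-up (SMT, MC, BOOK, CPU, NUMA), and a NULL init terminates the walk. A sketch of iterating such a sentinel-terminated table, with made-up level names and trivial init hooks:

    #include <stdio.h>

    /* One topology level: a builder hook plus a label (hypothetical fields). */
    struct level {
        const char *(*init)(int cpu);
        const char *name;
    };

    static const char *init_smt(int cpu)  { (void)cpu; return "siblings"; }
    static const char *init_core(int cpu) { (void)cpu; return "cores"; }
    static const char *init_pkg(int cpu)  { (void)cpu; return "packages"; }

    /* Bottom-up and NULL-terminated, mirroring default_topology[]. */
    static struct level topology[] = {
        { init_smt,  "SMT" },
        { init_core, "MC"  },
        { init_pkg,  "CPU" },
        { NULL, },          /* sentinel ends the walk */
    };

    int main(void)
    {
        for (struct level *tl = topology; tl->init; tl++)
            printf("cpu0: building %s level -> %s\n",
                   tl->name, tl->init(0));
        return 0;
    }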
| 7383 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
| 7384 | { | ||
| 7385 | struct sched_domain_topology_level *tl; | ||
| 7386 | int j; | ||
| 7387 | |||
| 7388 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
| 7389 | struct sd_data *sdd = &tl->data; | ||
| 7390 | |||
| 7391 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
| 7392 | if (!sdd->sd) | ||
| 7393 | return -ENOMEM; | ||
| 7394 | |||
| 7395 | sdd->sg = alloc_percpu(struct sched_group *); | ||
| 7396 | if (!sdd->sg) | ||
| 7397 | return -ENOMEM; | ||
| 7398 | |||
| 7399 | sdd->sgp = alloc_percpu(struct sched_group_power *); | ||
| 7400 | if (!sdd->sgp) | ||
| 7401 | return -ENOMEM; | ||
| 7402 | |||
| 7403 | for_each_cpu(j, cpu_map) { | ||
| 7404 | struct sched_domain *sd; | ||
| 7405 | struct sched_group *sg; | ||
| 7406 | struct sched_group_power *sgp; | ||
| 7407 | |||
| 7408 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
| 7409 | GFP_KERNEL, cpu_to_node(j)); | ||
| 7410 | if (!sd) | ||
| 7411 | return -ENOMEM; | ||
| 7412 | |||
| 7413 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
| 7414 | |||
| 7415 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
| 7416 | GFP_KERNEL, cpu_to_node(j)); | ||
| 7417 | if (!sg) | ||
| 7418 | return -ENOMEM; | ||
| 7419 | |||
| 7420 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
| 7421 | |||
| 7422 | sgp = kzalloc_node(sizeof(struct sched_group_power), | ||
| 7423 | GFP_KERNEL, cpu_to_node(j)); | ||
| 7424 | if (!sgp) | ||
| 7425 | return -ENOMEM; | ||
| 7426 | |||
| 7427 | *per_cpu_ptr(sdd->sgp, j) = sgp; | ||
| 7428 | } | ||
| 7429 | } | ||
| 7430 | |||
| 7431 | return 0; | ||
| 7432 | } | ||
| 7433 | |||
| 7434 | static void __sdt_free(const struct cpumask *cpu_map) | ||
| 7435 | { | ||
| 7436 | struct sched_domain_topology_level *tl; | ||
| 7437 | int j; | ||
| 7438 | |||
| 7439 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
| 7440 | struct sd_data *sdd = &tl->data; | ||
| 7441 | |||
| 7442 | for_each_cpu(j, cpu_map) { | ||
| 7443 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); | ||
| 7444 | if (sd && (sd->flags & SD_OVERLAP)) | ||
| 7445 | free_sched_groups(sd->groups, 0); | ||
| 7446 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
| 7447 | kfree(*per_cpu_ptr(sdd->sgp, j)); | ||
| 7448 | } | ||
| 7449 | free_percpu(sdd->sd); | ||
| 7450 | free_percpu(sdd->sg); | ||
| 7451 | free_percpu(sdd->sgp); | ||
| 7498 | } | 7452 | } |
| 7499 | } | 7453 | } |
| 7500 | 7454 | ||
| 7455 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
| 7456 | struct s_data *d, const struct cpumask *cpu_map, | ||
| 7457 | struct sched_domain_attr *attr, struct sched_domain *child, | ||
| 7458 | int cpu) | ||
| 7459 | { | ||
| 7460 | struct sched_domain *sd = tl->init(tl, cpu); | ||
| 7461 | if (!sd) | ||
| 7462 | return child; | ||
| 7463 | |||
| 7464 | set_domain_attribute(sd, attr); | ||
| 7465 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
| 7466 | if (child) { | ||
| 7467 | sd->level = child->level + 1; | ||
| 7468 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
| 7469 | child->parent = sd; | ||
| 7470 | } | ||
| 7471 | sd->child = child; | ||
| 7472 | |||
| 7473 | return sd; | ||
| 7474 | } | ||
| 7475 | |||
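build_sched_domain() links each freshly built level to the one below it in both directions (child->parent and sd->child) and derives the level number from the child, which is how sched_domain_level_max grows. A stripped-down sketch of that chain building, using a hypothetical domain struct:

    #include <stdio.h>
    #include <stdlib.h>

    struct dom {
        int level;
        struct dom *parent, *child;
    };

    /* Build one level on top of 'child' and link the two together. */
    static struct dom *build_level(struct dom *child)
    {
        struct dom *d = calloc(1, sizeof(*d));

        if (!d)
            return child;   /* mirrors the "return child when nothing built" style */

        if (child) {
            d->level = child->level + 1;
            child->parent = d;
        }
        d->child = child;
        return d;
    }

    int main(void)
    {
        struct dom *top = NULL;

        for (int i = 0; i < 3; i++)         /* e.g. SMT -> MC -> CPU */
            top = build_level(top);

        for (struct dom *d = top; d; d = d->child)
            printf("level %d\n", d->level);

        while (top) {                       /* release the chain */
            struct dom *child = top->child;
            free(top);
            top = child;
        }
        return 0;
    }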
| 7501 | /* | 7476 | /* |
| 7502 | * Build sched domains for a given set of cpus and attach the sched domains | 7477 | * Build sched domains for a given set of cpus and attach the sched domains |
| 7503 | * to the individual cpus | 7478 | * to the individual cpus |
| 7504 | */ | 7479 | */ |
| 7505 | static int __build_sched_domains(const struct cpumask *cpu_map, | 7480 | static int build_sched_domains(const struct cpumask *cpu_map, |
| 7506 | struct sched_domain_attr *attr) | 7481 | struct sched_domain_attr *attr) |
| 7507 | { | 7482 | { |
| 7508 | enum s_alloc alloc_state = sa_none; | 7483 | enum s_alloc alloc_state = sa_none; |
| 7509 | struct s_data d; | ||
| 7510 | struct sched_domain *sd; | 7484 | struct sched_domain *sd; |
| 7511 | int i; | 7485 | struct s_data d; |
| 7512 | #ifdef CONFIG_NUMA | 7486 | int i, ret = -ENOMEM; |
| 7513 | d.sd_allnodes = 0; | ||
| 7514 | #endif | ||
| 7515 | 7487 | ||
| 7516 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | 7488 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
| 7517 | if (alloc_state != sa_rootdomain) | 7489 | if (alloc_state != sa_rootdomain) |
| 7518 | goto error; | 7490 | goto error; |
| 7519 | alloc_state = sa_sched_groups; | ||
| 7520 | 7491 | ||
| 7521 | /* | 7492 | /* Set up domains for cpus specified by the cpu_map. */ |
| 7522 | * Set up domains for cpus specified by the cpu_map. | ||
| 7523 | */ | ||
| 7524 | for_each_cpu(i, cpu_map) { | 7493 | for_each_cpu(i, cpu_map) { |
| 7525 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), | 7494 | struct sched_domain_topology_level *tl; |
| 7526 | cpu_map); | 7495 | |
| 7496 | sd = NULL; | ||
| 7497 | for (tl = sched_domain_topology; tl->init; tl++) { | ||
| 7498 | sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); | ||
| 7499 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
| 7500 | sd->flags |= SD_OVERLAP; | ||
| 7501 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
| 7502 | break; | ||
| 7503 | } | ||
| 7527 | 7504 | ||
| 7528 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7505 | while (sd->child) |
| 7529 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7506 | sd = sd->child; |
| 7530 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
| 7531 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | ||
| 7532 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | ||
| 7533 | } | ||
| 7534 | 7507 | ||
| 7535 | for_each_cpu(i, cpu_map) { | 7508 | *per_cpu_ptr(d.sd, i) = sd; |
| 7536 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | ||
| 7537 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
| 7538 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
| 7539 | } | 7509 | } |
| 7540 | 7510 | ||
| 7541 | /* Set up physical groups */ | 7511 | /* Build the groups for the domains */ |
| 7542 | for (i = 0; i < nr_node_ids; i++) | ||
| 7543 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); | ||
| 7544 | |||
| 7545 | #ifdef CONFIG_NUMA | ||
| 7546 | /* Set up node groups */ | ||
| 7547 | if (d.sd_allnodes) | ||
| 7548 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
| 7549 | |||
| 7550 | for (i = 0; i < nr_node_ids; i++) | ||
| 7551 | if (build_numa_sched_groups(&d, cpu_map, i)) | ||
| 7552 | goto error; | ||
| 7553 | #endif | ||
| 7554 | |||
| 7555 | /* Calculate CPU power for physical packages and nodes */ | ||
| 7556 | #ifdef CONFIG_SCHED_SMT | ||
| 7557 | for_each_cpu(i, cpu_map) { | 7512 | for_each_cpu(i, cpu_map) { |
| 7558 | sd = &per_cpu(cpu_domains, i).sd; | 7513 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
| 7559 | init_sched_groups_power(i, sd); | 7514 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); |
| 7560 | } | 7515 | if (sd->flags & SD_OVERLAP) { |
| 7561 | #endif | 7516 | if (build_overlap_sched_groups(sd, i)) |
| 7562 | #ifdef CONFIG_SCHED_MC | 7517 | goto error; |
| 7563 | for_each_cpu(i, cpu_map) { | 7518 | } else { |
| 7564 | sd = &per_cpu(core_domains, i).sd; | 7519 | if (build_sched_groups(sd, i)) |
| 7565 | init_sched_groups_power(i, sd); | 7520 | goto error; |
| 7566 | } | 7521 | } |
| 7567 | #endif | 7522 | } |
| 7568 | #ifdef CONFIG_SCHED_BOOK | ||
| 7569 | for_each_cpu(i, cpu_map) { | ||
| 7570 | sd = &per_cpu(book_domains, i).sd; | ||
| 7571 | init_sched_groups_power(i, sd); | ||
| 7572 | } | ||
| 7573 | #endif | ||
| 7574 | |||
| 7575 | for_each_cpu(i, cpu_map) { | ||
| 7576 | sd = &per_cpu(phys_domains, i).sd; | ||
| 7577 | init_sched_groups_power(i, sd); | ||
| 7578 | } | 7523 | } |
| 7579 | 7524 | ||
| 7580 | #ifdef CONFIG_NUMA | 7525 | /* Calculate CPU power for physical packages and nodes */ |
| 7581 | for (i = 0; i < nr_node_ids; i++) | 7526 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
| 7582 | init_numa_sched_groups_power(d.sched_group_nodes[i]); | 7527 | if (!cpumask_test_cpu(i, cpu_map)) |
| 7583 | 7528 | continue; | |
| 7584 | if (d.sd_allnodes) { | ||
| 7585 | struct sched_group *sg; | ||
| 7586 | 7529 | ||
| 7587 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 7530 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
| 7588 | d.tmpmask); | 7531 | claim_allocations(i, sd); |
| 7589 | init_numa_sched_groups_power(sg); | 7532 | init_sched_groups_power(i, sd); |
| 7533 | } | ||
| 7590 | } | 7534 | } |
| 7591 | #endif | ||
| 7592 | 7535 | ||
| 7593 | /* Attach the domains */ | 7536 | /* Attach the domains */ |
| 7537 | rcu_read_lock(); | ||
| 7594 | for_each_cpu(i, cpu_map) { | 7538 | for_each_cpu(i, cpu_map) { |
| 7595 | #ifdef CONFIG_SCHED_SMT | 7539 | sd = *per_cpu_ptr(d.sd, i); |
| 7596 | sd = &per_cpu(cpu_domains, i).sd; | ||
| 7597 | #elif defined(CONFIG_SCHED_MC) | ||
| 7598 | sd = &per_cpu(core_domains, i).sd; | ||
| 7599 | #elif defined(CONFIG_SCHED_BOOK) | ||
| 7600 | sd = &per_cpu(book_domains, i).sd; | ||
| 7601 | #else | ||
| 7602 | sd = &per_cpu(phys_domains, i).sd; | ||
| 7603 | #endif | ||
| 7604 | cpu_attach_domain(sd, d.rd, i); | 7540 | cpu_attach_domain(sd, d.rd, i); |
| 7605 | } | 7541 | } |
| 7542 | rcu_read_unlock(); | ||
| 7606 | 7543 | ||
| 7607 | d.sched_group_nodes = NULL; /* don't free this we still need it */ | 7544 | ret = 0; |
| 7608 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | ||
| 7609 | return 0; | ||
| 7610 | |||
| 7611 | error: | 7545 | error: |
| 7612 | __free_domain_allocs(&d, alloc_state, cpu_map); | 7546 | __free_domain_allocs(&d, alloc_state, cpu_map); |
| 7613 | return -ENOMEM; | 7547 | return ret; |
| 7614 | } | ||
| 7615 | |||
| 7616 | static int build_sched_domains(const struct cpumask *cpu_map) | ||
| 7617 | { | ||
| 7618 | return __build_sched_domains(cpu_map, NULL); | ||
| 7619 | } | 7548 | } |
| 7620 | 7549 | ||
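In build_sched_domains() the per-cpu loop walks the topology table bottom-up and breaks out as soon as one level's span already equals cpu_map, so redundant upper levels are never instantiated; it then descends back to the bottom-most domain and records it in the per-cpu slot before groups and cpu power are set up. A toy sketch of that early-break condition, with spans modelled as bitmasks and purely hypothetical values:

    #include <stdio.h>

    #define CPU_MAP 0x0fu   /* four CPUs in the map */

    /* Per-level spans for one cpu, bottom-up (illustrative values). */
    static const unsigned int spans[] = { 0x01, 0x03, 0x0f, 0x0f };

    int main(void)
    {
        int built = 0;

        for (int lvl = 0; lvl < 4; lvl++) {
            built++;
            printf("built level %d, span 0x%02x\n", lvl, spans[lvl]);
            if (spans[lvl] == CPU_MAP)  /* span covers cpu_map: stop here */
                break;
        }
        printf("%d levels kept; the top one already spans all CPUs\n", built);
        return 0;
    }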
| 7621 | static cpumask_var_t *doms_cur; /* current sched domains */ | 7550 | static cpumask_var_t *doms_cur; /* current sched domains */ |
| @@ -7670,7 +7599,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | |||
| 7670 | * For now this just excludes isolated cpus, but could be used to | 7599 | * For now this just excludes isolated cpus, but could be used to |
| 7671 | * exclude other special cases in the future. | 7600 | * exclude other special cases in the future. |
| 7672 | */ | 7601 | */ |
| 7673 | static int arch_init_sched_domains(const struct cpumask *cpu_map) | 7602 | static int init_sched_domains(const struct cpumask *cpu_map) |
| 7674 | { | 7603 | { |
| 7675 | int err; | 7604 | int err; |
| 7676 | 7605 | ||
| @@ -7681,32 +7610,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
| 7681 | doms_cur = &fallback_doms; | 7610 | doms_cur = &fallback_doms; |
| 7682 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 7611 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
| 7683 | dattr_cur = NULL; | 7612 | dattr_cur = NULL; |
| 7684 | err = build_sched_domains(doms_cur[0]); | 7613 | err = build_sched_domains(doms_cur[0], NULL); |
| 7685 | register_sched_domain_sysctl(); | 7614 | register_sched_domain_sysctl(); |
| 7686 | 7615 | ||
| 7687 | return err; | 7616 | return err; |
| 7688 | } | 7617 | } |
| 7689 | 7618 | ||
| 7690 | static void arch_destroy_sched_domains(const struct cpumask *cpu_map, | ||
| 7691 | struct cpumask *tmpmask) | ||
| 7692 | { | ||
| 7693 | free_sched_groups(cpu_map, tmpmask); | ||
| 7694 | } | ||
| 7695 | |||
| 7696 | /* | 7619 | /* |
| 7697 | * Detach sched domains from a group of cpus specified in cpu_map | 7620 | * Detach sched domains from a group of cpus specified in cpu_map |
| 7698 | * These cpus will now be attached to the NULL domain | 7621 | * These cpus will now be attached to the NULL domain |
| 7699 | */ | 7622 | */ |
| 7700 | static void detach_destroy_domains(const struct cpumask *cpu_map) | 7623 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
| 7701 | { | 7624 | { |
| 7702 | /* Save because hotplug lock held. */ | ||
| 7703 | static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); | ||
| 7704 | int i; | 7625 | int i; |
| 7705 | 7626 | ||
| 7627 | rcu_read_lock(); | ||
| 7706 | for_each_cpu(i, cpu_map) | 7628 | for_each_cpu(i, cpu_map) |
| 7707 | cpu_attach_domain(NULL, &def_root_domain, i); | 7629 | cpu_attach_domain(NULL, &def_root_domain, i); |
| 7708 | synchronize_sched(); | 7630 | rcu_read_unlock(); |
| 7709 | arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); | ||
| 7710 | } | 7631 | } |
| 7711 | 7632 | ||
| 7712 | /* handle null as "default" */ | 7633 | /* handle null as "default" */ |
| @@ -7795,8 +7716,7 @@ match1: | |||
| 7795 | goto match2; | 7716 | goto match2; |
| 7796 | } | 7717 | } |
| 7797 | /* no match - add a new doms_new */ | 7718 | /* no match - add a new doms_new */ |
| 7798 | __build_sched_domains(doms_new[i], | 7719 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); |
| 7799 | dattr_new ? dattr_new + i : NULL); | ||
| 7800 | match2: | 7720 | match2: |
| 7801 | ; | 7721 | ; |
| 7802 | } | 7722 | } |
| @@ -7815,7 +7735,7 @@ match2: | |||
| 7815 | } | 7735 | } |
| 7816 | 7736 | ||
| 7817 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7737 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
| 7818 | static void arch_reinit_sched_domains(void) | 7738 | static void reinit_sched_domains(void) |
| 7819 | { | 7739 | { |
| 7820 | get_online_cpus(); | 7740 | get_online_cpus(); |
| 7821 | 7741 | ||
| @@ -7848,7 +7768,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | |||
| 7848 | else | 7768 | else |
| 7849 | sched_mc_power_savings = level; | 7769 | sched_mc_power_savings = level; |
| 7850 | 7770 | ||
| 7851 | arch_reinit_sched_domains(); | 7771 | reinit_sched_domains(); |
| 7852 | 7772 | ||
| 7853 | return count; | 7773 | return count; |
| 7854 | } | 7774 | } |
| @@ -7967,14 +7887,9 @@ void __init sched_init_smp(void) | |||
| 7967 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7887 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
| 7968 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7888 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
| 7969 | 7889 | ||
| 7970 | #if defined(CONFIG_NUMA) | ||
| 7971 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | ||
| 7972 | GFP_KERNEL); | ||
| 7973 | BUG_ON(sched_group_nodes_bycpu == NULL); | ||
| 7974 | #endif | ||
| 7975 | get_online_cpus(); | 7890 | get_online_cpus(); |
| 7976 | mutex_lock(&sched_domains_mutex); | 7891 | mutex_lock(&sched_domains_mutex); |
| 7977 | arch_init_sched_domains(cpu_active_mask); | 7892 | init_sched_domains(cpu_active_mask); |
| 7978 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 7893 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
| 7979 | if (cpumask_empty(non_isolated_cpus)) | 7894 | if (cpumask_empty(non_isolated_cpus)) |
| 7980 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 7895 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
| @@ -8013,18 +7928,14 @@ int in_sched_functions(unsigned long addr) | |||
| 8013 | && addr < (unsigned long)__sched_text_end); | 7928 | && addr < (unsigned long)__sched_text_end); |
| 8014 | } | 7929 | } |
| 8015 | 7930 | ||
| 8016 | static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 7931 | static void init_cfs_rq(struct cfs_rq *cfs_rq) |
| 8017 | { | 7932 | { |
| 8018 | cfs_rq->tasks_timeline = RB_ROOT; | 7933 | cfs_rq->tasks_timeline = RB_ROOT; |
| 8019 | INIT_LIST_HEAD(&cfs_rq->tasks); | 7934 | INIT_LIST_HEAD(&cfs_rq->tasks); |
| 8020 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 8021 | cfs_rq->rq = rq; | ||
| 8022 | /* allow initial update_cfs_load() to truncate */ | ||
| 8023 | #ifdef CONFIG_SMP | ||
| 8024 | cfs_rq->load_stamp = 1; | ||
| 8025 | #endif | ||
| 8026 | #endif | ||
| 8027 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7935 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
| 7936 | #ifndef CONFIG_64BIT | ||
| 7937 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
| 7938 | #endif | ||
| 8028 | } | 7939 | } |
| 8029 | 7940 | ||
| 8030 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | 7941 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) |
| @@ -8040,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
| 8040 | /* delimiter for bitsearch: */ | 7951 | /* delimiter for bitsearch: */ |
| 8041 | __set_bit(MAX_RT_PRIO, array->bitmap); | 7952 | __set_bit(MAX_RT_PRIO, array->bitmap); |
| 8042 | 7953 | ||
| 8043 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 7954 | #if defined CONFIG_SMP |
| 8044 | rt_rq->highest_prio.curr = MAX_RT_PRIO; | 7955 | rt_rq->highest_prio.curr = MAX_RT_PRIO; |
| 8045 | #ifdef CONFIG_SMP | ||
| 8046 | rt_rq->highest_prio.next = MAX_RT_PRIO; | 7956 | rt_rq->highest_prio.next = MAX_RT_PRIO; |
| 8047 | #endif | ||
| 8048 | #endif | ||
| 8049 | #ifdef CONFIG_SMP | ||
| 8050 | rt_rq->rt_nr_migratory = 0; | 7957 | rt_rq->rt_nr_migratory = 0; |
| 8051 | rt_rq->overloaded = 0; | 7958 | rt_rq->overloaded = 0; |
| 8052 | plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); | 7959 | plist_head_init(&rt_rq->pushable_tasks); |
| 8053 | #endif | 7960 | #endif |
| 8054 | 7961 | ||
| 8055 | rt_rq->rt_time = 0; | 7962 | rt_rq->rt_time = 0; |
| 8056 | rt_rq->rt_throttled = 0; | 7963 | rt_rq->rt_throttled = 0; |
| 8057 | rt_rq->rt_runtime = 0; | 7964 | rt_rq->rt_runtime = 0; |
| 8058 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); | 7965 | raw_spin_lock_init(&rt_rq->rt_runtime_lock); |
| 8059 | |||
| 8060 | #ifdef CONFIG_RT_GROUP_SCHED | ||
| 8061 | rt_rq->rt_nr_boosted = 0; | ||
| 8062 | rt_rq->rq = rq; | ||
| 8063 | #endif | ||
| 8064 | } | 7966 | } |
| 8065 | 7967 | ||
| 8066 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7968 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -8069,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
| 8069 | struct sched_entity *parent) | 7971 | struct sched_entity *parent) |
| 8070 | { | 7972 | { |
| 8071 | struct rq *rq = cpu_rq(cpu); | 7973 | struct rq *rq = cpu_rq(cpu); |
| 8072 | tg->cfs_rq[cpu] = cfs_rq; | 7974 | |
| 8073 | init_cfs_rq(cfs_rq, rq); | ||
| 8074 | cfs_rq->tg = tg; | 7975 | cfs_rq->tg = tg; |
| 7976 | cfs_rq->rq = rq; | ||
| 7977 | #ifdef CONFIG_SMP | ||
| 7978 | /* allow initial update_cfs_load() to truncate */ | ||
| 7979 | cfs_rq->load_stamp = 1; | ||
| 7980 | #endif | ||
| 8075 | 7981 | ||
| 7982 | tg->cfs_rq[cpu] = cfs_rq; | ||
| 8076 | tg->se[cpu] = se; | 7983 | tg->se[cpu] = se; |
| 7984 | |||
| 8077 | /* se could be NULL for root_task_group */ | 7985 | /* se could be NULL for root_task_group */ |
| 8078 | if (!se) | 7986 | if (!se) |
| 8079 | return; | 7987 | return; |
| @@ -8096,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
| 8096 | { | 8004 | { |
| 8097 | struct rq *rq = cpu_rq(cpu); | 8005 | struct rq *rq = cpu_rq(cpu); |
| 8098 | 8006 | ||
| 8099 | tg->rt_rq[cpu] = rt_rq; | 8007 | rt_rq->highest_prio.curr = MAX_RT_PRIO; |
| 8100 | init_rt_rq(rt_rq, rq); | 8008 | rt_rq->rt_nr_boosted = 0; |
| 8009 | rt_rq->rq = rq; | ||
| 8101 | rt_rq->tg = tg; | 8010 | rt_rq->tg = tg; |
| 8102 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8103 | 8011 | ||
| 8012 | tg->rt_rq[cpu] = rt_rq; | ||
| 8104 | tg->rt_se[cpu] = rt_se; | 8013 | tg->rt_se[cpu] = rt_se; |
| 8014 | |||
| 8105 | if (!rt_se) | 8015 | if (!rt_se) |
| 8106 | return; | 8016 | return; |
| 8107 | 8017 | ||
| @@ -8183,7 +8093,7 @@ void __init sched_init(void) | |||
| 8183 | rq->nr_running = 0; | 8093 | rq->nr_running = 0; |
| 8184 | rq->calc_load_active = 0; | 8094 | rq->calc_load_active = 0; |
| 8185 | rq->calc_load_update = jiffies + LOAD_FREQ; | 8095 | rq->calc_load_update = jiffies + LOAD_FREQ; |
| 8186 | init_cfs_rq(&rq->cfs, rq); | 8096 | init_cfs_rq(&rq->cfs); |
| 8187 | init_rt_rq(&rq->rt, rq); | 8097 | init_rt_rq(&rq->rt, rq); |
| 8188 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8098 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 8189 | root_task_group.shares = root_task_group_load; | 8099 | root_task_group.shares = root_task_group_load; |
| @@ -8224,7 +8134,7 @@ void __init sched_init(void) | |||
| 8224 | #ifdef CONFIG_SMP | 8134 | #ifdef CONFIG_SMP |
| 8225 | rq->sd = NULL; | 8135 | rq->sd = NULL; |
| 8226 | rq->rd = NULL; | 8136 | rq->rd = NULL; |
| 8227 | rq->cpu_power = SCHED_LOAD_SCALE; | 8137 | rq->cpu_power = SCHED_POWER_SCALE; |
| 8228 | rq->post_schedule = 0; | 8138 | rq->post_schedule = 0; |
| 8229 | rq->active_balance = 0; | 8139 | rq->active_balance = 0; |
| 8230 | rq->next_balance = jiffies; | 8140 | rq->next_balance = jiffies; |
| @@ -8254,7 +8164,7 @@ void __init sched_init(void) | |||
| 8254 | #endif | 8164 | #endif |
| 8255 | 8165 | ||
| 8256 | #ifdef CONFIG_RT_MUTEXES | 8166 | #ifdef CONFIG_RT_MUTEXES |
| 8257 | plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); | 8167 | plist_head_init(&init_task.pi_waiters); |
| 8258 | #endif | 8168 | #endif |
| 8259 | 8169 | ||
| 8260 | /* | 8170 | /* |
| @@ -8281,6 +8191,7 @@ void __init sched_init(void) | |||
| 8281 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 8191 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
| 8282 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 8192 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
| 8283 | #ifdef CONFIG_SMP | 8193 | #ifdef CONFIG_SMP |
| 8194 | zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); | ||
| 8284 | #ifdef CONFIG_NO_HZ | 8195 | #ifdef CONFIG_NO_HZ |
| 8285 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 8196 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
| 8286 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); | 8197 | alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); |
| @@ -8296,7 +8207,7 @@ void __init sched_init(void) | |||
| 8296 | scheduler_running = 1; | 8207 | scheduler_running = 1; |
| 8297 | } | 8208 | } |
| 8298 | 8209 | ||
| 8299 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 8210 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP |
| 8300 | static inline int preempt_count_equals(int preempt_offset) | 8211 | static inline int preempt_count_equals(int preempt_offset) |
| 8301 | { | 8212 | { |
| 8302 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); | 8213 | int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); |
| @@ -8306,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset) | |||
| 8306 | 8217 | ||
| 8307 | void __might_sleep(const char *file, int line, int preempt_offset) | 8218 | void __might_sleep(const char *file, int line, int preempt_offset) |
| 8308 | { | 8219 | { |
| 8309 | #ifdef in_atomic | ||
| 8310 | static unsigned long prev_jiffy; /* ratelimiting */ | 8220 | static unsigned long prev_jiffy; /* ratelimiting */ |
| 8311 | 8221 | ||
| 8312 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || | 8222 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
| @@ -8328,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) | |||
| 8328 | if (irqs_disabled()) | 8238 | if (irqs_disabled()) |
| 8329 | print_irqtrace_events(current); | 8239 | print_irqtrace_events(current); |
| 8330 | dump_stack(); | 8240 | dump_stack(); |
| 8331 | #endif | ||
| 8332 | } | 8241 | } |
| 8333 | EXPORT_SYMBOL(__might_sleep); | 8242 | EXPORT_SYMBOL(__might_sleep); |
| 8334 | #endif | 8243 | #endif |
| @@ -8340,7 +8249,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
| 8340 | int old_prio = p->prio; | 8249 | int old_prio = p->prio; |
| 8341 | int on_rq; | 8250 | int on_rq; |
| 8342 | 8251 | ||
| 8343 | on_rq = p->se.on_rq; | 8252 | on_rq = p->on_rq; |
| 8344 | if (on_rq) | 8253 | if (on_rq) |
| 8345 | deactivate_task(rq, p, 0); | 8254 | deactivate_task(rq, p, 0); |
| 8346 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 8255 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
| @@ -8487,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8487 | if (!se) | 8396 | if (!se) |
| 8488 | goto err_free_rq; | 8397 | goto err_free_rq; |
| 8489 | 8398 | ||
| 8399 | init_cfs_rq(cfs_rq); | ||
| 8490 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); | 8400 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
| 8491 | } | 8401 | } |
| 8492 | 8402 | ||
| @@ -8514,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | |||
| 8514 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | 8424 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); |
| 8515 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8425 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 8516 | } | 8426 | } |
| 8517 | #else /* !CONFG_FAIR_GROUP_SCHED */ | 8427 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
| 8518 | static inline void free_fair_sched_group(struct task_group *tg) | 8428 | static inline void free_fair_sched_group(struct task_group *tg) |
| 8519 | { | 8429 | { |
| 8520 | } | 8430 | } |
| @@ -8535,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg) | |||
| 8535 | { | 8445 | { |
| 8536 | int i; | 8446 | int i; |
| 8537 | 8447 | ||
| 8538 | destroy_rt_bandwidth(&tg->rt_bandwidth); | 8448 | if (tg->rt_se) |
| 8449 | destroy_rt_bandwidth(&tg->rt_bandwidth); | ||
| 8539 | 8450 | ||
| 8540 | for_each_possible_cpu(i) { | 8451 | for_each_possible_cpu(i) { |
| 8541 | if (tg->rt_rq) | 8452 | if (tg->rt_rq) |
| @@ -8553,7 +8464,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8553 | { | 8464 | { |
| 8554 | struct rt_rq *rt_rq; | 8465 | struct rt_rq *rt_rq; |
| 8555 | struct sched_rt_entity *rt_se; | 8466 | struct sched_rt_entity *rt_se; |
| 8556 | struct rq *rq; | ||
| 8557 | int i; | 8467 | int i; |
| 8558 | 8468 | ||
| 8559 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); | 8469 | tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); |
| @@ -8567,8 +8477,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8567 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); | 8477 | ktime_to_ns(def_rt_bandwidth.rt_period), 0); |
| 8568 | 8478 | ||
| 8569 | for_each_possible_cpu(i) { | 8479 | for_each_possible_cpu(i) { |
| 8570 | rq = cpu_rq(i); | ||
| 8571 | |||
| 8572 | rt_rq = kzalloc_node(sizeof(struct rt_rq), | 8480 | rt_rq = kzalloc_node(sizeof(struct rt_rq), |
| 8573 | GFP_KERNEL, cpu_to_node(i)); | 8481 | GFP_KERNEL, cpu_to_node(i)); |
| 8574 | if (!rt_rq) | 8482 | if (!rt_rq) |
| @@ -8579,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 8579 | if (!rt_se) | 8487 | if (!rt_se) |
| 8580 | goto err_free_rq; | 8488 | goto err_free_rq; |
| 8581 | 8489 | ||
| 8490 | init_rt_rq(rt_rq, cpu_rq(i)); | ||
| 8491 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | ||
| 8582 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); | 8492 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
| 8583 | } | 8493 | } |
| 8584 | 8494 | ||
| @@ -8683,7 +8593,7 @@ void sched_move_task(struct task_struct *tsk) | |||
| 8683 | rq = task_rq_lock(tsk, &flags); | 8593 | rq = task_rq_lock(tsk, &flags); |
| 8684 | 8594 | ||
| 8685 | running = task_current(rq, tsk); | 8595 | running = task_current(rq, tsk); |
| 8686 | on_rq = tsk->se.on_rq; | 8596 | on_rq = tsk->on_rq; |
| 8687 | 8597 | ||
| 8688 | if (on_rq) | 8598 | if (on_rq) |
| 8689 | dequeue_task(rq, tsk, 0); | 8599 | dequeue_task(rq, tsk, 0); |
| @@ -8702,7 +8612,7 @@ void sched_move_task(struct task_struct *tsk) | |||
| 8702 | if (on_rq) | 8612 | if (on_rq) |
| 8703 | enqueue_task(rq, tsk, 0); | 8613 | enqueue_task(rq, tsk, 0); |
| 8704 | 8614 | ||
| 8705 | task_rq_unlock(rq, &flags); | 8615 | task_rq_unlock(rq, tsk, &flags); |
| 8706 | } | 8616 | } |
| 8707 | #endif /* CONFIG_CGROUP_SCHED */ | 8617 | #endif /* CONFIG_CGROUP_SCHED */ |
| 8708 | 8618 | ||
| @@ -8720,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
| 8720 | if (!tg->se[0]) | 8630 | if (!tg->se[0]) |
| 8721 | return -EINVAL; | 8631 | return -EINVAL; |
| 8722 | 8632 | ||
| 8723 | if (shares < MIN_SHARES) | 8633 | shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); |
| 8724 | shares = MIN_SHARES; | ||
| 8725 | else if (shares > MAX_SHARES) | ||
| 8726 | shares = MAX_SHARES; | ||
| 8727 | 8634 | ||
| 8728 | mutex_lock(&shares_mutex); | 8635 | mutex_lock(&shares_mutex); |
| 8729 | if (tg->shares == shares) | 8636 | if (tg->shares == shares) |
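The clamp() line replaces the old open-coded min/max checks, and together with the cpu_shares_read_u64()/cpu_shares_write_u64() hunk further down it shows the new convention: user-visible share values are converted with scale_load() on the way in and scale_load_down() on the way out, so the scheduler can keep weights at a higher fixed-point resolution internally. A sketch of that round trip; the resolution shift and limits below are assumed, not the kernel's exact configuration-dependent values:

    #include <stdio.h>

    /* Assumed resolution and limits for illustration only. */
    #define SCHED_LOAD_RESOLUTION   10
    #define scale_load(w)       ((unsigned long)(w) << SCHED_LOAD_RESOLUTION)
    #define scale_load_down(w)  ((unsigned long)(w) >> SCHED_LOAD_RESOLUTION)

    #define MIN_SHARES  2UL
    #define MAX_SHARES  (1UL << 18)

    #define clamp(val, lo, hi) ((val) < (lo) ? (lo) : (val) > (hi) ? (hi) : (val))

    int main(void)
    {
        unsigned long shareval = 1024;                  /* value written by userspace */
        unsigned long shares   = scale_load(shareval);  /* internal high-resolution units */

        shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
        printf("internal shares %lu, reported back as %lu\n",
               shares, scale_load_down(shares));
        return 0;
    }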
| @@ -9073,42 +8980,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 9073 | return 0; | 8980 | return 0; |
| 9074 | } | 8981 | } |
| 9075 | 8982 | ||
| 9076 | static int | ||
| 9077 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
| 9078 | struct task_struct *tsk, bool threadgroup) | ||
| 9079 | { | ||
| 9080 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | ||
| 9081 | if (retval) | ||
| 9082 | return retval; | ||
| 9083 | if (threadgroup) { | ||
| 9084 | struct task_struct *c; | ||
| 9085 | rcu_read_lock(); | ||
| 9086 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
| 9087 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
| 9088 | if (retval) { | ||
| 9089 | rcu_read_unlock(); | ||
| 9090 | return retval; | ||
| 9091 | } | ||
| 9092 | } | ||
| 9093 | rcu_read_unlock(); | ||
| 9094 | } | ||
| 9095 | return 0; | ||
| 9096 | } | ||
| 9097 | |||
| 9098 | static void | 8983 | static void |
| 9099 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 8984 | cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
| 9100 | struct cgroup *old_cont, struct task_struct *tsk, | ||
| 9101 | bool threadgroup) | ||
| 9102 | { | 8985 | { |
| 9103 | sched_move_task(tsk); | 8986 | sched_move_task(tsk); |
| 9104 | if (threadgroup) { | ||
| 9105 | struct task_struct *c; | ||
| 9106 | rcu_read_lock(); | ||
| 9107 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
| 9108 | sched_move_task(c); | ||
| 9109 | } | ||
| 9110 | rcu_read_unlock(); | ||
| 9111 | } | ||
| 9112 | } | 8987 | } |
| 9113 | 8988 | ||
| 9114 | static void | 8989 | static void |
| @@ -9130,14 +9005,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
| 9130 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 9005 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
| 9131 | u64 shareval) | 9006 | u64 shareval) |
| 9132 | { | 9007 | { |
| 9133 | return sched_group_set_shares(cgroup_tg(cgrp), shareval); | 9008 | return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); |
| 9134 | } | 9009 | } |
| 9135 | 9010 | ||
| 9136 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) | 9011 | static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) |
| 9137 | { | 9012 | { |
| 9138 | struct task_group *tg = cgroup_tg(cgrp); | 9013 | struct task_group *tg = cgroup_tg(cgrp); |
| 9139 | 9014 | ||
| 9140 | return (u64) tg->shares; | 9015 | return (u64) scale_load_down(tg->shares); |
| 9141 | } | 9016 | } |
| 9142 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 9017 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
| 9143 | 9018 | ||
| @@ -9196,8 +9071,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 9196 | .name = "cpu", | 9071 | .name = "cpu", |
| 9197 | .create = cpu_cgroup_create, | 9072 | .create = cpu_cgroup_create, |
| 9198 | .destroy = cpu_cgroup_destroy, | 9073 | .destroy = cpu_cgroup_destroy, |
| 9199 | .can_attach = cpu_cgroup_can_attach, | 9074 | .can_attach_task = cpu_cgroup_can_attach_task, |
| 9200 | .attach = cpu_cgroup_attach, | 9075 | .attach_task = cpu_cgroup_attach_task, |
| 9201 | .exit = cpu_cgroup_exit, | 9076 | .exit = cpu_cgroup_exit, |
| 9202 | .populate = cpu_cgroup_populate, | 9077 | .populate = cpu_cgroup_populate, |
| 9203 | .subsys_id = cpu_cgroup_subsys_id, | 9078 | .subsys_id = cpu_cgroup_subsys_id, |
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 05577055cfca..c2f0e7248dca 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h | |||
| @@ -13,6 +13,7 @@ struct autogroup { | |||
| 13 | int nice; | 13 | int nice; |
| 14 | }; | 14 | }; |
| 15 | 15 | ||
| 16 | static inline bool task_group_is_autogroup(struct task_group *tg); | ||
| 16 | static inline struct task_group * | 17 | static inline struct task_group * |
| 17 | autogroup_task_group(struct task_struct *p, struct task_group *tg); | 18 | autogroup_task_group(struct task_struct *p, struct task_group *tg); |
| 18 | 19 | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a4158..a6710a112b4f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
| @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
| 152 | read_lock_irqsave(&tasklist_lock, flags); | 152 | read_lock_irqsave(&tasklist_lock, flags); |
| 153 | 153 | ||
| 154 | do_each_thread(g, p) { | 154 | do_each_thread(g, p) { |
| 155 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 155 | if (!p->on_rq || task_cpu(p) != rq_cpu) |
| 156 | continue; | 156 | continue; |
| 157 | 157 | ||
| 158 | print_task(m, rq, p); | 158 | print_task(m, rq, p); |
| @@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
| 296 | P(ttwu_count); | 296 | P(ttwu_count); |
| 297 | P(ttwu_local); | 297 | P(ttwu_local); |
| 298 | 298 | ||
| 299 | SEQ_printf(m, " .%-30s: %d\n", "bkl_count", | ||
| 300 | rq->rq_sched_info.bkl_count); | ||
| 301 | |||
| 302 | #undef P | 299 | #undef P |
| 303 | #undef P64 | 300 | #undef P64 |
| 304 | #endif | 301 | #endif |
| @@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
| 441 | P(se.statistics.wait_count); | 438 | P(se.statistics.wait_count); |
| 442 | PN(se.statistics.iowait_sum); | 439 | PN(se.statistics.iowait_sum); |
| 443 | P(se.statistics.iowait_count); | 440 | P(se.statistics.iowait_count); |
| 444 | P(sched_info.bkl_count); | ||
| 445 | P(se.nr_migrations); | 441 | P(se.nr_migrations); |
| 446 | P(se.statistics.nr_migrations_cold); | 442 | P(se.statistics.nr_migrations_cold); |
| 447 | P(se.statistics.nr_failed_migrations_affine); | 443 | P(se.statistics.nr_failed_migrations_affine); |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833ab2cb8..bc8ee9993814 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
| @@ -135,14 +135,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 135 | return grp->my_q; | 135 | return grp->my_q; |
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | ||
| 139 | * another cpu ('this_cpu') | ||
| 140 | */ | ||
| 141 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 142 | { | ||
| 143 | return cfs_rq->tg->cfs_rq[this_cpu]; | ||
| 144 | } | ||
| 145 | |||
| 146 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 138 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 147 | { | 139 | { |
| 148 | if (!cfs_rq->on_list) { | 140 | if (!cfs_rq->on_list) { |
| @@ -271,11 +263,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 271 | return NULL; | 263 | return NULL; |
| 272 | } | 264 | } |
| 273 | 265 | ||
| 274 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | ||
| 275 | { | ||
| 276 | return &cpu_rq(this_cpu)->cfs; | ||
| 277 | } | ||
| 278 | |||
| 279 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 266 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 280 | { | 267 | { |
| 281 | } | 268 | } |
| @@ -334,11 +321,6 @@ static inline int entity_before(struct sched_entity *a, | |||
| 334 | return (s64)(a->vruntime - b->vruntime) < 0; | 321 | return (s64)(a->vruntime - b->vruntime) < 0; |
| 335 | } | 322 | } |
| 336 | 323 | ||
| 337 | static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
| 338 | { | ||
| 339 | return se->vruntime - cfs_rq->min_vruntime; | ||
| 340 | } | ||
| 341 | |||
| 342 | static void update_min_vruntime(struct cfs_rq *cfs_rq) | 324 | static void update_min_vruntime(struct cfs_rq *cfs_rq) |
| 343 | { | 325 | { |
| 344 | u64 vruntime = cfs_rq->min_vruntime; | 326 | u64 vruntime = cfs_rq->min_vruntime; |
| @@ -358,6 +340,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) | |||
| 358 | } | 340 | } |
| 359 | 341 | ||
| 360 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); | 342 | cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); |
| 343 | #ifndef CONFIG_64BIT | ||
| 344 | smp_wmb(); | ||
| 345 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | ||
| 346 | #endif | ||
| 361 | } | 347 | } |
| 362 | 348 | ||
| 363 | /* | 349 | /* |
| @@ -368,7 +354,6 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 368 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 354 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
| 369 | struct rb_node *parent = NULL; | 355 | struct rb_node *parent = NULL; |
| 370 | struct sched_entity *entry; | 356 | struct sched_entity *entry; |
| 371 | s64 key = entity_key(cfs_rq, se); | ||
| 372 | int leftmost = 1; | 357 | int leftmost = 1; |
| 373 | 358 | ||
| 374 | /* | 359 | /* |
| @@ -381,7 +366,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 381 | * We dont care about collisions. Nodes with | 366 | * We dont care about collisions. Nodes with |
| 382 | * the same key stay together. | 367 | * the same key stay together. |
| 383 | */ | 368 | */ |
| 384 | if (key < entity_key(cfs_rq, entry)) { | 369 | if (entity_before(se, entry)) { |
| 385 | link = &parent->rb_left; | 370 | link = &parent->rb_left; |
| 386 | } else { | 371 | } else { |
| 387 | link = &parent->rb_right; | 372 | link = &parent->rb_right; |
| @@ -1072,8 +1057,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 1072 | se->on_rq = 0; | 1057 | se->on_rq = 0; |
| 1073 | update_cfs_load(cfs_rq, 0); | 1058 | update_cfs_load(cfs_rq, 0); |
| 1074 | account_entity_dequeue(cfs_rq, se); | 1059 | account_entity_dequeue(cfs_rq, se); |
| 1075 | update_min_vruntime(cfs_rq); | ||
| 1076 | update_cfs_shares(cfs_rq); | ||
| 1077 | 1060 | ||
| 1078 | /* | 1061 | /* |
| 1079 | * Normalize the entity after updating the min_vruntime because the | 1062 | * Normalize the entity after updating the min_vruntime because the |
| @@ -1082,6 +1065,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
| 1082 | */ | 1065 | */ |
| 1083 | if (!(flags & DEQUEUE_SLEEP)) | 1066 | if (!(flags & DEQUEUE_SLEEP)) |
| 1084 | se->vruntime -= cfs_rq->min_vruntime; | 1067 | se->vruntime -= cfs_rq->min_vruntime; |
| 1068 | |||
| 1069 | update_min_vruntime(cfs_rq); | ||
| 1070 | update_cfs_shares(cfs_rq); | ||
| 1085 | } | 1071 | } |
| 1086 | 1072 | ||
| 1087 | /* | 1073 | /* |
| @@ -1331,7 +1317,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1331 | } | 1317 | } |
| 1332 | 1318 | ||
| 1333 | for_each_sched_entity(se) { | 1319 | for_each_sched_entity(se) { |
| 1334 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1320 | cfs_rq = cfs_rq_of(se); |
| 1335 | 1321 | ||
| 1336 | update_cfs_load(cfs_rq, 0); | 1322 | update_cfs_load(cfs_rq, 0); |
| 1337 | update_cfs_shares(cfs_rq); | 1323 | update_cfs_shares(cfs_rq); |
| @@ -1340,6 +1326,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1340 | hrtick_update(rq); | 1326 | hrtick_update(rq); |
| 1341 | } | 1327 | } |
| 1342 | 1328 | ||
| 1329 | static void set_next_buddy(struct sched_entity *se); | ||
| 1330 | |||
| 1343 | /* | 1331 | /* |
| 1344 | * The dequeue_task method is called before nr_running is | 1332 | * The dequeue_task method is called before nr_running is |
| 1345 | * decreased. We remove the task from the rbtree and | 1333 | * decreased. We remove the task from the rbtree and |
| @@ -1349,19 +1337,30 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1349 | { | 1337 | { |
| 1350 | struct cfs_rq *cfs_rq; | 1338 | struct cfs_rq *cfs_rq; |
| 1351 | struct sched_entity *se = &p->se; | 1339 | struct sched_entity *se = &p->se; |
| 1340 | int task_sleep = flags & DEQUEUE_SLEEP; | ||
| 1352 | 1341 | ||
| 1353 | for_each_sched_entity(se) { | 1342 | for_each_sched_entity(se) { |
| 1354 | cfs_rq = cfs_rq_of(se); | 1343 | cfs_rq = cfs_rq_of(se); |
| 1355 | dequeue_entity(cfs_rq, se, flags); | 1344 | dequeue_entity(cfs_rq, se, flags); |
| 1356 | 1345 | ||
| 1357 | /* Don't dequeue parent if it has other entities besides us */ | 1346 | /* Don't dequeue parent if it has other entities besides us */ |
| 1358 | if (cfs_rq->load.weight) | 1347 | if (cfs_rq->load.weight) { |
| 1348 | /* | ||
| 1349 | * Bias pick_next to pick a task from this cfs_rq, as | ||
| 1350 | * p is sleeping when it is within its sched_slice. | ||
| 1351 | */ | ||
| 1352 | if (task_sleep && parent_entity(se)) | ||
| 1353 | set_next_buddy(parent_entity(se)); | ||
| 1354 | |||
| 1355 | /* avoid re-evaluating load for this entity */ | ||
| 1356 | se = parent_entity(se); | ||
| 1359 | break; | 1357 | break; |
| 1358 | } | ||
| 1360 | flags |= DEQUEUE_SLEEP; | 1359 | flags |= DEQUEUE_SLEEP; |
| 1361 | } | 1360 | } |
| 1362 | 1361 | ||
| 1363 | for_each_sched_entity(se) { | 1362 | for_each_sched_entity(se) { |
| 1364 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1363 | cfs_rq = cfs_rq_of(se); |
| 1365 | 1364 | ||
| 1366 | update_cfs_load(cfs_rq, 0); | 1365 | update_cfs_load(cfs_rq, 0); |
| 1367 | update_cfs_shares(cfs_rq); | 1366 | update_cfs_shares(cfs_rq); |
| @@ -1372,12 +1371,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
| 1372 | 1371 | ||
| 1373 | #ifdef CONFIG_SMP | 1372 | #ifdef CONFIG_SMP |
| 1374 | 1373 | ||
| 1375 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1374 | static void task_waking_fair(struct task_struct *p) |
| 1376 | { | 1375 | { |
| 1377 | struct sched_entity *se = &p->se; | 1376 | struct sched_entity *se = &p->se; |
| 1378 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1377 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 1378 | u64 min_vruntime; | ||
| 1379 | 1379 | ||
| 1380 | se->vruntime -= cfs_rq->min_vruntime; | 1380 | #ifndef CONFIG_64BIT |
| 1381 | u64 min_vruntime_copy; | ||
| 1382 | |||
| 1383 | do { | ||
| 1384 | min_vruntime_copy = cfs_rq->min_vruntime_copy; | ||
| 1385 | smp_rmb(); | ||
| 1386 | min_vruntime = cfs_rq->min_vruntime; | ||
| 1387 | } while (min_vruntime != min_vruntime_copy); | ||
| 1388 | #else | ||
| 1389 | min_vruntime = cfs_rq->min_vruntime; | ||
| 1390 | #endif | ||
| 1391 | |||
| 1392 | se->vruntime -= min_vruntime; | ||
| 1381 | } | 1393 | } |
| 1382 | 1394 | ||
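task_waking_fair() now reads min_vruntime without holding the runqueue lock, which is unsafe for a 64-bit value on 32-bit builds; the matching write side in the update_min_vruntime() hunk above publishes a trailing copy behind smp_wmb(), and the reader retries until copy and value agree. A userspace sketch of the same protocol, with C11 fences standing in for the kernel barriers and illustrative field names (a real concurrent version would also use relaxed atomic accesses to avoid C-level data races):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    struct clock64 {
        uint64_t value;      /* written under the owner's lock */
        uint64_t value_copy; /* trailing copy published for lockless readers */
    };

    /* Writer: update the value, then publish the copy behind a release fence. */
    static void clock_update(struct clock64 *c, uint64_t v)
    {
        c->value = v;
        atomic_thread_fence(memory_order_release);  /* kernel: smp_wmb() */
        c->value_copy = v;
    }

    /* Reader: retry until the copy and the value are seen consistent. */
    static uint64_t clock_read(const struct clock64 *c)
    {
        uint64_t copy, v;

        do {
            copy = c->value_copy;
            atomic_thread_fence(memory_order_acquire);  /* kernel: smp_rmb() */
            v = c->value;
        } while (v != copy);

        return v;
    }

    int main(void)
    {
        struct clock64 c = { 0, 0 };

        clock_update(&c, 123456789ULL);
        printf("%llu\n", (unsigned long long)clock_read(&c));
        return 0;
    }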
| 1383 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1395 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| @@ -1453,7 +1465,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 1453 | * effect of the currently running task from the load | 1465 | * effect of the currently running task from the load |
| 1454 | * of the current CPU: | 1466 | * of the current CPU: |
| 1455 | */ | 1467 | */ |
| 1456 | rcu_read_lock(); | ||
| 1457 | if (sync) { | 1468 | if (sync) { |
| 1458 | tg = task_group(current); | 1469 | tg = task_group(current); |
| 1459 | weight = current->se.load.weight; | 1470 | weight = current->se.load.weight; |
| @@ -1489,7 +1500,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) | |||
| 1489 | balanced = this_eff_load <= prev_eff_load; | 1500 | balanced = this_eff_load <= prev_eff_load; |
| 1490 | } else | 1501 | } else |
| 1491 | balanced = true; | 1502 | balanced = true; |
| 1492 | rcu_read_unlock(); | ||
| 1493 | 1503 | ||
| 1494 | /* | 1504 | /* |
| 1495 | * If the currently running task will sleep within | 1505 | * If the currently running task will sleep within |
| @@ -1557,7 +1567,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 1557 | } | 1567 | } |
| 1558 | 1568 | ||
| 1559 | /* Adjust by relative CPU power of the group */ | 1569 | /* Adjust by relative CPU power of the group */ |
| 1560 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1570 | avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; |
| 1561 | 1571 | ||
| 1562 | if (local_group) { | 1572 | if (local_group) { |
| 1563 | this_load = avg_load; | 1573 | this_load = avg_load; |
| @@ -1622,6 +1632,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 1622 | /* | 1632 | /* |
| 1623 | * Otherwise, iterate the domains and find an elegible idle cpu. | 1633 | * Otherwise, iterate the domains and find an elegible idle cpu. |
| 1624 | */ | 1634 | */ |
| 1635 | rcu_read_lock(); | ||
| 1625 | for_each_domain(target, sd) { | 1636 | for_each_domain(target, sd) { |
| 1626 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) | 1637 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) |
| 1627 | break; | 1638 | break; |
| @@ -1641,6 +1652,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 1641 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | 1652 | cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) |
| 1642 | break; | 1653 | break; |
| 1643 | } | 1654 | } |
| 1655 | rcu_read_unlock(); | ||
| 1644 | 1656 | ||
| 1645 | return target; | 1657 | return target; |
| 1646 | } | 1658 | } |
| @@ -1657,7 +1669,7 @@ static int select_idle_sibling(struct task_struct *p, int target) | |||
| 1657 | * preempt must be disabled. | 1669 | * preempt must be disabled. |
| 1658 | */ | 1670 | */ |
| 1659 | static int | 1671 | static int |
| 1660 | select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) | 1672 | select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
| 1661 | { | 1673 | { |
| 1662 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; | 1674 | struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
| 1663 | int cpu = smp_processor_id(); | 1675 | int cpu = smp_processor_id(); |
| @@ -1673,6 +1685,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
| 1673 | new_cpu = prev_cpu; | 1685 | new_cpu = prev_cpu; |
| 1674 | } | 1686 | } |
| 1675 | 1687 | ||
| 1688 | rcu_read_lock(); | ||
| 1676 | for_each_domain(cpu, tmp) { | 1689 | for_each_domain(cpu, tmp) { |
| 1677 | if (!(tmp->flags & SD_LOAD_BALANCE)) | 1690 | if (!(tmp->flags & SD_LOAD_BALANCE)) |
| 1678 | continue; | 1691 | continue; |
| @@ -1692,7 +1705,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
| 1692 | nr_running += cpu_rq(i)->cfs.nr_running; | 1705 | nr_running += cpu_rq(i)->cfs.nr_running; |
| 1693 | } | 1706 | } |
| 1694 | 1707 | ||
| 1695 | capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 1708 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); |
| 1696 | 1709 | ||
| 1697 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1710 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
| 1698 | nr_running /= 2; | 1711 | nr_running /= 2; |
| @@ -1723,9 +1736,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
| 1723 | 1736 | ||
| 1724 | if (affine_sd) { | 1737 | if (affine_sd) { |
| 1725 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1738 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
| 1726 | return select_idle_sibling(p, cpu); | 1739 | prev_cpu = cpu; |
| 1727 | else | 1740 | |
| 1728 | return select_idle_sibling(p, prev_cpu); | 1741 | new_cpu = select_idle_sibling(p, prev_cpu); |
| 1742 | goto unlock; | ||
| 1729 | } | 1743 | } |
| 1730 | 1744 | ||
| 1731 | while (sd) { | 1745 | while (sd) { |
| @@ -1766,6 +1780,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
| 1766 | } | 1780 | } |
| 1767 | /* while loop will break here if sd == NULL */ | 1781 | /* while loop will break here if sd == NULL */ |
| 1768 | } | 1782 | } |
| 1783 | unlock: | ||
| 1784 | rcu_read_unlock(); | ||
| 1769 | 1785 | ||
| 1770 | return new_cpu; | 1786 | return new_cpu; |
| 1771 | } | 1787 | } |
| @@ -1789,10 +1805,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) | |||
| 1789 | * This is especially important for buddies when the leftmost | 1805 | * This is especially important for buddies when the leftmost |
| 1790 | * task is higher priority than the buddy. | 1806 | * task is higher priority than the buddy. |
| 1791 | */ | 1807 | */ |
| 1792 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1808 | return calc_delta_fair(gran, se); |
| 1793 | gran = calc_delta_fair(gran, se); | ||
| 1794 | |||
| 1795 | return gran; | ||
| 1796 | } | 1809 | } |
| 1797 | 1810 | ||
| 1798 | /* | 1811 | /* |
| @@ -1826,26 +1839,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) | |||
| 1826 | 1839 | ||
| 1827 | static void set_last_buddy(struct sched_entity *se) | 1840 | static void set_last_buddy(struct sched_entity *se) |
| 1828 | { | 1841 | { |
| 1829 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1842 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
| 1830 | for_each_sched_entity(se) | 1843 | return; |
| 1831 | cfs_rq_of(se)->last = se; | 1844 | |
| 1832 | } | 1845 | for_each_sched_entity(se) |
| 1846 | cfs_rq_of(se)->last = se; | ||
| 1833 | } | 1847 | } |
| 1834 | 1848 | ||
| 1835 | static void set_next_buddy(struct sched_entity *se) | 1849 | static void set_next_buddy(struct sched_entity *se) |
| 1836 | { | 1850 | { |
| 1837 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1851 | if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) |
| 1838 | for_each_sched_entity(se) | 1852 | return; |
| 1839 | cfs_rq_of(se)->next = se; | 1853 | |
| 1840 | } | 1854 | for_each_sched_entity(se) |
| 1855 | cfs_rq_of(se)->next = se; | ||
| 1841 | } | 1856 | } |
| 1842 | 1857 | ||
| 1843 | static void set_skip_buddy(struct sched_entity *se) | 1858 | static void set_skip_buddy(struct sched_entity *se) |
| 1844 | { | 1859 | { |
| 1845 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | 1860 | for_each_sched_entity(se) |
| 1846 | for_each_sched_entity(se) | 1861 | cfs_rq_of(se)->skip = se; |
| 1847 | cfs_rq_of(se)->skip = se; | ||
| 1848 | } | ||
| 1849 | } | 1862 | } |
| 1850 | 1863 | ||
| 1851 | /* | 1864 | /* |
| @@ -1857,12 +1870,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1857 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1870 | struct sched_entity *se = &curr->se, *pse = &p->se; |
| 1858 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1871 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
| 1859 | int scale = cfs_rq->nr_running >= sched_nr_latency; | 1872 | int scale = cfs_rq->nr_running >= sched_nr_latency; |
| 1873 | int next_buddy_marked = 0; | ||
| 1860 | 1874 | ||
| 1861 | if (unlikely(se == pse)) | 1875 | if (unlikely(se == pse)) |
| 1862 | return; | 1876 | return; |
| 1863 | 1877 | ||
| 1864 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) | 1878 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { |
| 1865 | set_next_buddy(pse); | 1879 | set_next_buddy(pse); |
| 1880 | next_buddy_marked = 1; | ||
| 1881 | } | ||
| 1866 | 1882 | ||
| 1867 | /* | 1883 | /* |
| 1868 | * We can come here with TIF_NEED_RESCHED already set from new task | 1884 | * We can come here with TIF_NEED_RESCHED already set from new task |
| @@ -1887,11 +1903,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
| 1887 | if (!sched_feat(WAKEUP_PREEMPT)) | 1903 | if (!sched_feat(WAKEUP_PREEMPT)) |
| 1888 | return; | 1904 | return; |
| 1889 | 1905 | ||
| 1890 | update_curr(cfs_rq); | ||
| 1891 | find_matching_se(&se, &pse); | 1906 | find_matching_se(&se, &pse); |
| 1907 | update_curr(cfs_rq_of(se)); | ||
| 1892 | BUG_ON(!pse); | 1908 | BUG_ON(!pse); |
| 1893 | if (wakeup_preempt_entity(se, pse) == 1) | 1909 | if (wakeup_preempt_entity(se, pse) == 1) { |
| 1910 | /* | ||
| 1911 | * Bias pick_next to pick the sched entity that is | ||
| 1912 | * triggering this preemption. | ||
| 1913 | */ | ||
| 1914 | if (!next_buddy_marked) | ||
| 1915 | set_next_buddy(pse); | ||
| 1894 | goto preempt; | 1916 | goto preempt; |
| 1917 | } | ||
| 1895 | 1918 | ||
| 1896 | return; | 1919 | return; |
| 1897 | 1920 | ||
| @@ -2102,7 +2125,7 @@ static unsigned long | |||
| 2102 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2125 | balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 2103 | unsigned long max_load_move, struct sched_domain *sd, | 2126 | unsigned long max_load_move, struct sched_domain *sd, |
| 2104 | enum cpu_idle_type idle, int *all_pinned, | 2127 | enum cpu_idle_type idle, int *all_pinned, |
| 2105 | int *this_best_prio, struct cfs_rq *busiest_cfs_rq) | 2128 | struct cfs_rq *busiest_cfs_rq) |
| 2106 | { | 2129 | { |
| 2107 | int loops = 0, pulled = 0; | 2130 | int loops = 0, pulled = 0; |
| 2108 | long rem_load_move = max_load_move; | 2131 | long rem_load_move = max_load_move; |
| @@ -2140,9 +2163,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2140 | */ | 2163 | */ |
| 2141 | if (rem_load_move <= 0) | 2164 | if (rem_load_move <= 0) |
| 2142 | break; | 2165 | break; |
| 2143 | |||
| 2144 | if (p->prio < *this_best_prio) | ||
| 2145 | *this_best_prio = p->prio; | ||
| 2146 | } | 2166 | } |
| 2147 | out: | 2167 | out: |
| 2148 | /* | 2168 | /* |
| @@ -2193,26 +2213,56 @@ static void update_shares(int cpu) | |||
| 2193 | struct rq *rq = cpu_rq(cpu); | 2213 | struct rq *rq = cpu_rq(cpu); |
| 2194 | 2214 | ||
| 2195 | rcu_read_lock(); | 2215 | rcu_read_lock(); |
| 2216 | /* | ||
| 2217 | * Iterates the task_group tree in a bottom up fashion, see | ||
| 2218 | * list_add_leaf_cfs_rq() for details. | ||
| 2219 | */ | ||
| 2196 | for_each_leaf_cfs_rq(rq, cfs_rq) | 2220 | for_each_leaf_cfs_rq(rq, cfs_rq) |
| 2197 | update_shares_cpu(cfs_rq->tg, cpu); | 2221 | update_shares_cpu(cfs_rq->tg, cpu); |
| 2198 | rcu_read_unlock(); | 2222 | rcu_read_unlock(); |
| 2199 | } | 2223 | } |
| 2200 | 2224 | ||
| 2225 | /* | ||
| 2226 | * Compute the cpu's hierarchical load factor for each task group. | ||
| 2227 | * This needs to be done in a top-down fashion because the load of a child | ||
| 2228 | * group is a fraction of its parents load. | ||
| 2229 | */ | ||
| 2230 | static int tg_load_down(struct task_group *tg, void *data) | ||
| 2231 | { | ||
| 2232 | unsigned long load; | ||
| 2233 | long cpu = (long)data; | ||
| 2234 | |||
| 2235 | if (!tg->parent) { | ||
| 2236 | load = cpu_rq(cpu)->load.weight; | ||
| 2237 | } else { | ||
| 2238 | load = tg->parent->cfs_rq[cpu]->h_load; | ||
| 2239 | load *= tg->se[cpu]->load.weight; | ||
| 2240 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | ||
| 2241 | } | ||
| 2242 | |||
| 2243 | tg->cfs_rq[cpu]->h_load = load; | ||
| 2244 | |||
| 2245 | return 0; | ||
| 2246 | } | ||
| 2247 | |||
| 2248 | static void update_h_load(long cpu) | ||
| 2249 | { | ||
| 2250 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | ||
| 2251 | } | ||
| 2252 | |||
| 2201 | static unsigned long | 2253 | static unsigned long |
| 2202 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2254 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 2203 | unsigned long max_load_move, | 2255 | unsigned long max_load_move, |
| 2204 | struct sched_domain *sd, enum cpu_idle_type idle, | 2256 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 2205 | int *all_pinned, int *this_best_prio) | 2257 | int *all_pinned) |
| 2206 | { | 2258 | { |
| 2207 | long rem_load_move = max_load_move; | 2259 | long rem_load_move = max_load_move; |
| 2208 | int busiest_cpu = cpu_of(busiest); | 2260 | struct cfs_rq *busiest_cfs_rq; |
| 2209 | struct task_group *tg; | ||
| 2210 | 2261 | ||
| 2211 | rcu_read_lock(); | 2262 | rcu_read_lock(); |
| 2212 | update_h_load(busiest_cpu); | 2263 | update_h_load(cpu_of(busiest)); |
| 2213 | 2264 | ||
| 2214 | list_for_each_entry_rcu(tg, &task_groups, list) { | 2265 | for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { |
| 2215 | struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; | ||
| 2216 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; | 2266 | unsigned long busiest_h_load = busiest_cfs_rq->h_load; |
| 2217 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; | 2267 | unsigned long busiest_weight = busiest_cfs_rq->load.weight; |
| 2218 | u64 rem_load, moved_load; | 2268 | u64 rem_load, moved_load; |
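Editor's note: the hunk above introduces tg_load_down()/update_h_load(), which fill in each group's hierarchical load top-down, so a group's h_load is its parent's h_load scaled by the weight of the group's entity relative to the parent's runqueue weight. A standalone sketch of that arithmetic follows; the struct, field names and numbers are illustrative, not the kernel's.

#include <stdio.h>

/*
 * Toy model of the top-down h_load walk: each group's hierarchical load
 * is its parent's h_load scaled by this group's entity weight relative
 * to the parent's total cfs_rq weight.  All values are made up.
 */
struct group {
	struct group *parent;
	unsigned long se_weight;	/* weight of this group's entity in the parent */
	unsigned long cfs_weight;	/* total weight queued on this group's cfs_rq */
	unsigned long h_load;		/* computed below, top-down */
};

static void compute_h_load(struct group *g, unsigned long root_load)
{
	if (!g->parent) {
		g->h_load = root_load;			/* root: raw rq load */
		return;
	}
	compute_h_load(g->parent, root_load);		/* parent first (top-down) */
	g->h_load = g->parent->h_load * g->se_weight /
		    (g->parent->cfs_weight + 1);	/* +1 avoids division by zero */
}

int main(void)
{
	struct group root  = { NULL,   0,    3072, 0 };
	struct group child = { &root,  1024, 2048, 0 };
	struct group leaf  = { &child, 512,  512,  0 };

	compute_h_load(&leaf, 3072);
	printf("h_load: root=%lu child=%lu leaf=%lu\n",
	       root.h_load, child.h_load, leaf.h_load);
	return 0;
}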
| @@ -2227,7 +2277,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2227 | rem_load = div_u64(rem_load, busiest_h_load + 1); | 2277 | rem_load = div_u64(rem_load, busiest_h_load + 1); |
| 2228 | 2278 | ||
| 2229 | moved_load = balance_tasks(this_rq, this_cpu, busiest, | 2279 | moved_load = balance_tasks(this_rq, this_cpu, busiest, |
| 2230 | rem_load, sd, idle, all_pinned, this_best_prio, | 2280 | rem_load, sd, idle, all_pinned, |
| 2231 | busiest_cfs_rq); | 2281 | busiest_cfs_rq); |
| 2232 | 2282 | ||
| 2233 | if (!moved_load) | 2283 | if (!moved_load) |
| @@ -2253,11 +2303,11 @@ static unsigned long | |||
| 2253 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2303 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
| 2254 | unsigned long max_load_move, | 2304 | unsigned long max_load_move, |
| 2255 | struct sched_domain *sd, enum cpu_idle_type idle, | 2305 | struct sched_domain *sd, enum cpu_idle_type idle, |
| 2256 | int *all_pinned, int *this_best_prio) | 2306 | int *all_pinned) |
| 2257 | { | 2307 | { |
| 2258 | return balance_tasks(this_rq, this_cpu, busiest, | 2308 | return balance_tasks(this_rq, this_cpu, busiest, |
| 2259 | max_load_move, sd, idle, all_pinned, | 2309 | max_load_move, sd, idle, all_pinned, |
| 2260 | this_best_prio, &busiest->cfs); | 2310 | &busiest->cfs); |
| 2261 | } | 2311 | } |
| 2262 | #endif | 2312 | #endif |
| 2263 | 2313 | ||
| @@ -2274,12 +2324,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
| 2274 | int *all_pinned) | 2324 | int *all_pinned) |
| 2275 | { | 2325 | { |
| 2276 | unsigned long total_load_moved = 0, load_moved; | 2326 | unsigned long total_load_moved = 0, load_moved; |
| 2277 | int this_best_prio = this_rq->curr->prio; | ||
| 2278 | 2327 | ||
| 2279 | do { | 2328 | do { |
| 2280 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, | 2329 | load_moved = load_balance_fair(this_rq, this_cpu, busiest, |
| 2281 | max_load_move - total_load_moved, | 2330 | max_load_move - total_load_moved, |
| 2282 | sd, idle, all_pinned, &this_best_prio); | 2331 | sd, idle, all_pinned); |
| 2283 | 2332 | ||
| 2284 | total_load_moved += load_moved; | 2333 | total_load_moved += load_moved; |
| 2285 | 2334 | ||
| @@ -2534,7 +2583,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
| 2534 | 2583 | ||
| 2535 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | 2584 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) |
| 2536 | { | 2585 | { |
| 2537 | return SCHED_LOAD_SCALE; | 2586 | return SCHED_POWER_SCALE; |
| 2538 | } | 2587 | } |
| 2539 | 2588 | ||
| 2540 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | 2589 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) |
| @@ -2571,10 +2620,10 @@ unsigned long scale_rt_power(int cpu) | |||
| 2571 | available = total - rq->rt_avg; | 2620 | available = total - rq->rt_avg; |
| 2572 | } | 2621 | } |
| 2573 | 2622 | ||
| 2574 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | 2623 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
| 2575 | total = SCHED_LOAD_SCALE; | 2624 | total = SCHED_POWER_SCALE; |
| 2576 | 2625 | ||
| 2577 | total >>= SCHED_LOAD_SHIFT; | 2626 | total >>= SCHED_POWER_SHIFT; |
| 2578 | 2627 | ||
| 2579 | return div_u64(available, total); | 2628 | return div_u64(available, total); |
| 2580 | } | 2629 | } |
| @@ -2582,7 +2631,7 @@ unsigned long scale_rt_power(int cpu) | |||
| 2582 | static void update_cpu_power(struct sched_domain *sd, int cpu) | 2631 | static void update_cpu_power(struct sched_domain *sd, int cpu) |
| 2583 | { | 2632 | { |
| 2584 | unsigned long weight = sd->span_weight; | 2633 | unsigned long weight = sd->span_weight; |
| 2585 | unsigned long power = SCHED_LOAD_SCALE; | 2634 | unsigned long power = SCHED_POWER_SCALE; |
| 2586 | struct sched_group *sdg = sd->groups; | 2635 | struct sched_group *sdg = sd->groups; |
| 2587 | 2636 | ||
| 2588 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | 2637 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
| @@ -2591,26 +2640,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) | |||
| 2591 | else | 2640 | else |
| 2592 | power *= default_scale_smt_power(sd, cpu); | 2641 | power *= default_scale_smt_power(sd, cpu); |
| 2593 | 2642 | ||
| 2594 | power >>= SCHED_LOAD_SHIFT; | 2643 | power >>= SCHED_POWER_SHIFT; |
| 2595 | } | 2644 | } |
| 2596 | 2645 | ||
| 2597 | sdg->cpu_power_orig = power; | 2646 | sdg->sgp->power_orig = power; |
| 2598 | 2647 | ||
| 2599 | if (sched_feat(ARCH_POWER)) | 2648 | if (sched_feat(ARCH_POWER)) |
| 2600 | power *= arch_scale_freq_power(sd, cpu); | 2649 | power *= arch_scale_freq_power(sd, cpu); |
| 2601 | else | 2650 | else |
| 2602 | power *= default_scale_freq_power(sd, cpu); | 2651 | power *= default_scale_freq_power(sd, cpu); |
| 2603 | 2652 | ||
| 2604 | power >>= SCHED_LOAD_SHIFT; | 2653 | power >>= SCHED_POWER_SHIFT; |
| 2605 | 2654 | ||
| 2606 | power *= scale_rt_power(cpu); | 2655 | power *= scale_rt_power(cpu); |
| 2607 | power >>= SCHED_LOAD_SHIFT; | 2656 | power >>= SCHED_POWER_SHIFT; |
| 2608 | 2657 | ||
| 2609 | if (!power) | 2658 | if (!power) |
| 2610 | power = 1; | 2659 | power = 1; |
| 2611 | 2660 | ||
| 2612 | cpu_rq(cpu)->cpu_power = power; | 2661 | cpu_rq(cpu)->cpu_power = power; |
| 2613 | sdg->cpu_power = power; | 2662 | sdg->sgp->power = power; |
| 2614 | } | 2663 | } |
| 2615 | 2664 | ||
| 2616 | static void update_group_power(struct sched_domain *sd, int cpu) | 2665 | static void update_group_power(struct sched_domain *sd, int cpu) |
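Editor's note: update_cpu_power() above chains several scale factors in SCHED_POWER_SCALE fixed point. Each factor is expressed against 1024, so every multiply is followed by a right shift of SCHED_POWER_SHIFT to renormalise, and the load balancer later rounds the result to a whole-CPU capacity. A minimal user-space sketch of that arithmetic, with made-up factor values and a simplified unsigned DIV_ROUND_CLOSEST:

#include <stdio.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

/* simplified unsigned-only rounding, not the kernel's full macro */
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
	unsigned long power = SCHED_POWER_SCALE;	/* start at one full CPU */
	unsigned long smt_factor = 589;			/* ~0.58: SMT sibling sharing (illustrative) */
	unsigned long rt_factor  = 922;			/* ~0.90: time left after RT/irq (illustrative) */

	power = (power * smt_factor) >> SCHED_POWER_SHIFT;
	power = (power * rt_factor)  >> SCHED_POWER_SHIFT;

	/* group capacity as used by the load balancer: nearest whole-CPU count */
	unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("power=%lu (of %lu), capacity=%lu\n",
	       power, SCHED_POWER_SCALE, capacity);
	return 0;
}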
| @@ -2628,11 +2677,11 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
| 2628 | 2677 | ||
| 2629 | group = child->groups; | 2678 | group = child->groups; |
| 2630 | do { | 2679 | do { |
| 2631 | power += group->cpu_power; | 2680 | power += group->sgp->power; |
| 2632 | group = group->next; | 2681 | group = group->next; |
| 2633 | } while (group != child->groups); | 2682 | } while (group != child->groups); |
| 2634 | 2683 | ||
| 2635 | sdg->cpu_power = power; | 2684 | sdg->sgp->power = power; |
| 2636 | } | 2685 | } |
| 2637 | 2686 | ||
| 2638 | /* | 2687 | /* |
| @@ -2646,15 +2695,15 @@ static inline int | |||
| 2646 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | 2695 | fix_small_capacity(struct sched_domain *sd, struct sched_group *group) |
| 2647 | { | 2696 | { |
| 2648 | /* | 2697 | /* |
| 2649 | * Only siblings can have significantly less than SCHED_LOAD_SCALE | 2698 | * Only siblings can have significantly less than SCHED_POWER_SCALE |
| 2650 | */ | 2699 | */ |
| 2651 | if (sd->level != SD_LV_SIBLING) | 2700 | if (!(sd->flags & SD_SHARE_CPUPOWER)) |
| 2652 | return 0; | 2701 | return 0; |
| 2653 | 2702 | ||
| 2654 | /* | 2703 | /* |
| 2655 | * If ~90% of the cpu_power is still there, we're good. | 2704 | * If ~90% of the cpu_power is still there, we're good. |
| 2656 | */ | 2705 | */ |
| 2657 | if (group->cpu_power * 32 > group->cpu_power_orig * 29) | 2706 | if (group->sgp->power * 32 > group->sgp->power_orig * 29) |
| 2658 | return 1; | 2707 | return 1; |
| 2659 | 2708 | ||
| 2660 | return 0; | 2709 | return 0; |
| @@ -2734,7 +2783,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2734 | } | 2783 | } |
| 2735 | 2784 | ||
| 2736 | /* Adjust by relative CPU power of the group */ | 2785 | /* Adjust by relative CPU power of the group */ |
| 2737 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2786 | sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; |
| 2738 | 2787 | ||
| 2739 | /* | 2788 | /* |
| 2740 | * Consider the group unbalanced when the imbalance is larger | 2789 | * Consider the group unbalanced when the imbalance is larger |
| @@ -2751,7 +2800,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
| 2751 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) | 2800 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
| 2752 | sgs->group_imb = 1; | 2801 | sgs->group_imb = 1; |
| 2753 | 2802 | ||
| 2754 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2803 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, |
| 2804 | SCHED_POWER_SCALE); | ||
| 2755 | if (!sgs->group_capacity) | 2805 | if (!sgs->group_capacity) |
| 2756 | sgs->group_capacity = fix_small_capacity(sd, group); | 2806 | sgs->group_capacity = fix_small_capacity(sd, group); |
| 2757 | sgs->group_weight = group->group_weight; | 2807 | sgs->group_weight = group->group_weight; |
| @@ -2839,7 +2889,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
| 2839 | return; | 2889 | return; |
| 2840 | 2890 | ||
| 2841 | sds->total_load += sgs.group_load; | 2891 | sds->total_load += sgs.group_load; |
| 2842 | sds->total_pwr += sg->cpu_power; | 2892 | sds->total_pwr += sg->sgp->power; |
| 2843 | 2893 | ||
| 2844 | /* | 2894 | /* |
| 2845 | * In case the child domain prefers tasks go to siblings | 2895 | * In case the child domain prefers tasks go to siblings |
| @@ -2924,8 +2974,8 @@ static int check_asym_packing(struct sched_domain *sd, | |||
| 2924 | if (this_cpu > busiest_cpu) | 2974 | if (this_cpu > busiest_cpu) |
| 2925 | return 0; | 2975 | return 0; |
| 2926 | 2976 | ||
| 2927 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, | 2977 | *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, |
| 2928 | SCHED_LOAD_SCALE); | 2978 | SCHED_POWER_SCALE); |
| 2929 | return 1; | 2979 | return 1; |
| 2930 | } | 2980 | } |
| 2931 | 2981 | ||
| @@ -2954,8 +3004,8 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
| 2954 | cpu_avg_load_per_task(this_cpu); | 3004 | cpu_avg_load_per_task(this_cpu); |
| 2955 | 3005 | ||
| 2956 | scaled_busy_load_per_task = sds->busiest_load_per_task | 3006 | scaled_busy_load_per_task = sds->busiest_load_per_task |
| 2957 | * SCHED_LOAD_SCALE; | 3007 | * SCHED_POWER_SCALE; |
| 2958 | scaled_busy_load_per_task /= sds->busiest->cpu_power; | 3008 | scaled_busy_load_per_task /= sds->busiest->sgp->power; |
| 2959 | 3009 | ||
| 2960 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= | 3010 | if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= |
| 2961 | (scaled_busy_load_per_task * imbn)) { | 3011 | (scaled_busy_load_per_task * imbn)) { |
| @@ -2969,30 +3019,30 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
| 2969 | * moving them. | 3019 | * moving them. |
| 2970 | */ | 3020 | */ |
| 2971 | 3021 | ||
| 2972 | pwr_now += sds->busiest->cpu_power * | 3022 | pwr_now += sds->busiest->sgp->power * |
| 2973 | min(sds->busiest_load_per_task, sds->max_load); | 3023 | min(sds->busiest_load_per_task, sds->max_load); |
| 2974 | pwr_now += sds->this->cpu_power * | 3024 | pwr_now += sds->this->sgp->power * |
| 2975 | min(sds->this_load_per_task, sds->this_load); | 3025 | min(sds->this_load_per_task, sds->this_load); |
| 2976 | pwr_now /= SCHED_LOAD_SCALE; | 3026 | pwr_now /= SCHED_POWER_SCALE; |
| 2977 | 3027 | ||
| 2978 | /* Amount of load we'd subtract */ | 3028 | /* Amount of load we'd subtract */ |
| 2979 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3029 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
| 2980 | sds->busiest->cpu_power; | 3030 | sds->busiest->sgp->power; |
| 2981 | if (sds->max_load > tmp) | 3031 | if (sds->max_load > tmp) |
| 2982 | pwr_move += sds->busiest->cpu_power * | 3032 | pwr_move += sds->busiest->sgp->power * |
| 2983 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3033 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
| 2984 | 3034 | ||
| 2985 | /* Amount of load we'd add */ | 3035 | /* Amount of load we'd add */ |
| 2986 | if (sds->max_load * sds->busiest->cpu_power < | 3036 | if (sds->max_load * sds->busiest->sgp->power < |
| 2987 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3037 | sds->busiest_load_per_task * SCHED_POWER_SCALE) |
| 2988 | tmp = (sds->max_load * sds->busiest->cpu_power) / | 3038 | tmp = (sds->max_load * sds->busiest->sgp->power) / |
| 2989 | sds->this->cpu_power; | 3039 | sds->this->sgp->power; |
| 2990 | else | 3040 | else |
| 2991 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / | 3041 | tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / |
| 2992 | sds->this->cpu_power; | 3042 | sds->this->sgp->power; |
| 2993 | pwr_move += sds->this->cpu_power * | 3043 | pwr_move += sds->this->sgp->power * |
| 2994 | min(sds->this_load_per_task, sds->this_load + tmp); | 3044 | min(sds->this_load_per_task, sds->this_load + tmp); |
| 2995 | pwr_move /= SCHED_LOAD_SCALE; | 3045 | pwr_move /= SCHED_POWER_SCALE; |
| 2996 | 3046 | ||
| 2997 | /* Move if we gain throughput */ | 3047 | /* Move if we gain throughput */ |
| 2998 | if (pwr_move > pwr_now) | 3048 | if (pwr_move > pwr_now) |
| @@ -3034,9 +3084,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3034 | load_above_capacity = (sds->busiest_nr_running - | 3084 | load_above_capacity = (sds->busiest_nr_running - |
| 3035 | sds->busiest_group_capacity); | 3085 | sds->busiest_group_capacity); |
| 3036 | 3086 | ||
| 3037 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); | 3087 | load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); |
| 3038 | 3088 | ||
| 3039 | load_above_capacity /= sds->busiest->cpu_power; | 3089 | load_above_capacity /= sds->busiest->sgp->power; |
| 3040 | } | 3090 | } |
| 3041 | 3091 | ||
| 3042 | /* | 3092 | /* |
| @@ -3052,9 +3102,9 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
| 3052 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); | 3102 | max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); |
| 3053 | 3103 | ||
| 3054 | /* How much load to actually move to equalise the imbalance */ | 3104 | /* How much load to actually move to equalise the imbalance */ |
| 3055 | *imbalance = min(max_pull * sds->busiest->cpu_power, | 3105 | *imbalance = min(max_pull * sds->busiest->sgp->power, |
| 3056 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) | 3106 | (sds->avg_load - sds->this_load) * sds->this->sgp->power) |
| 3057 | / SCHED_LOAD_SCALE; | 3107 | / SCHED_POWER_SCALE; |
| 3058 | 3108 | ||
| 3059 | /* | 3109 | /* |
| 3060 | * if *imbalance is less than the average load per runnable task | 3110 | * if *imbalance is less than the average load per runnable task |
| @@ -3123,7 +3173,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 3123 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3173 | if (!sds.busiest || sds.busiest_nr_running == 0) |
| 3124 | goto out_balanced; | 3174 | goto out_balanced; |
| 3125 | 3175 | ||
| 3126 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3176 | sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; |
| 3127 | 3177 | ||
| 3128 | /* | 3178 | /* |
| 3129 | * If the busiest group is imbalanced the below checks don't | 3179 | * If the busiest group is imbalanced the below checks don't |
| @@ -3202,7 +3252,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
| 3202 | 3252 | ||
| 3203 | for_each_cpu(i, sched_group_cpus(group)) { | 3253 | for_each_cpu(i, sched_group_cpus(group)) { |
| 3204 | unsigned long power = power_of(i); | 3254 | unsigned long power = power_of(i); |
| 3205 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | 3255 | unsigned long capacity = DIV_ROUND_CLOSEST(power, |
| 3256 | SCHED_POWER_SCALE); | ||
| 3206 | unsigned long wl; | 3257 | unsigned long wl; |
| 3207 | 3258 | ||
| 3208 | if (!capacity) | 3259 | if (!capacity) |
| @@ -3227,7 +3278,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
| 3227 | * the load can be moved away from the cpu that is potentially | 3278 | * the load can be moved away from the cpu that is potentially |
| 3228 | * running at a lower capacity. | 3279 | * running at a lower capacity. |
| 3229 | */ | 3280 | */ |
| 3230 | wl = (wl * SCHED_LOAD_SCALE) / power; | 3281 | wl = (wl * SCHED_POWER_SCALE) / power; |
| 3231 | 3282 | ||
| 3232 | if (wl > max_load) { | 3283 | if (wl > max_load) { |
| 3233 | max_load = wl; | 3284 | max_load = wl; |
| @@ -3465,6 +3516,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3465 | raw_spin_unlock(&this_rq->lock); | 3516 | raw_spin_unlock(&this_rq->lock); |
| 3466 | 3517 | ||
| 3467 | update_shares(this_cpu); | 3518 | update_shares(this_cpu); |
| 3519 | rcu_read_lock(); | ||
| 3468 | for_each_domain(this_cpu, sd) { | 3520 | for_each_domain(this_cpu, sd) { |
| 3469 | unsigned long interval; | 3521 | unsigned long interval; |
| 3470 | int balance = 1; | 3522 | int balance = 1; |
| @@ -3486,6 +3538,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 3486 | break; | 3538 | break; |
| 3487 | } | 3539 | } |
| 3488 | } | 3540 | } |
| 3541 | rcu_read_unlock(); | ||
| 3489 | 3542 | ||
| 3490 | raw_spin_lock(&this_rq->lock); | 3543 | raw_spin_lock(&this_rq->lock); |
| 3491 | 3544 | ||
| @@ -3534,6 +3587,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 3534 | double_lock_balance(busiest_rq, target_rq); | 3587 | double_lock_balance(busiest_rq, target_rq); |
| 3535 | 3588 | ||
| 3536 | /* Search for an sd spanning us and the target CPU. */ | 3589 | /* Search for an sd spanning us and the target CPU. */ |
| 3590 | rcu_read_lock(); | ||
| 3537 | for_each_domain(target_cpu, sd) { | 3591 | for_each_domain(target_cpu, sd) { |
| 3538 | if ((sd->flags & SD_LOAD_BALANCE) && | 3592 | if ((sd->flags & SD_LOAD_BALANCE) && |
| 3539 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) | 3593 | cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) |
| @@ -3549,6 +3603,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
| 3549 | else | 3603 | else |
| 3550 | schedstat_inc(sd, alb_failed); | 3604 | schedstat_inc(sd, alb_failed); |
| 3551 | } | 3605 | } |
| 3606 | rcu_read_unlock(); | ||
| 3552 | double_unlock_balance(busiest_rq, target_rq); | 3607 | double_unlock_balance(busiest_rq, target_rq); |
| 3553 | out_unlock: | 3608 | out_unlock: |
| 3554 | busiest_rq->active_balance = 0; | 3609 | busiest_rq->active_balance = 0; |
| @@ -3675,6 +3730,7 @@ static int find_new_ilb(int cpu) | |||
| 3675 | { | 3730 | { |
| 3676 | struct sched_domain *sd; | 3731 | struct sched_domain *sd; |
| 3677 | struct sched_group *ilb_group; | 3732 | struct sched_group *ilb_group; |
| 3733 | int ilb = nr_cpu_ids; | ||
| 3678 | 3734 | ||
| 3679 | /* | 3735 | /* |
| 3680 | * Have idle load balancer selection from semi-idle packages only | 3736 | * Have idle load balancer selection from semi-idle packages only |
| @@ -3690,20 +3746,25 @@ static int find_new_ilb(int cpu) | |||
| 3690 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) | 3746 | if (cpumask_weight(nohz.idle_cpus_mask) < 2) |
| 3691 | goto out_done; | 3747 | goto out_done; |
| 3692 | 3748 | ||
| 3749 | rcu_read_lock(); | ||
| 3693 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { | 3750 | for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { |
| 3694 | ilb_group = sd->groups; | 3751 | ilb_group = sd->groups; |
| 3695 | 3752 | ||
| 3696 | do { | 3753 | do { |
| 3697 | if (is_semi_idle_group(ilb_group)) | 3754 | if (is_semi_idle_group(ilb_group)) { |
| 3698 | return cpumask_first(nohz.grp_idle_mask); | 3755 | ilb = cpumask_first(nohz.grp_idle_mask); |
| 3756 | goto unlock; | ||
| 3757 | } | ||
| 3699 | 3758 | ||
| 3700 | ilb_group = ilb_group->next; | 3759 | ilb_group = ilb_group->next; |
| 3701 | 3760 | ||
| 3702 | } while (ilb_group != sd->groups); | 3761 | } while (ilb_group != sd->groups); |
| 3703 | } | 3762 | } |
| 3763 | unlock: | ||
| 3764 | rcu_read_unlock(); | ||
| 3704 | 3765 | ||
| 3705 | out_done: | 3766 | out_done: |
| 3706 | return nr_cpu_ids; | 3767 | return ilb; |
| 3707 | } | 3768 | } |
| 3708 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ | 3769 | #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ |
| 3709 | static inline int find_new_ilb(int call_cpu) | 3770 | static inline int find_new_ilb(int call_cpu) |
| @@ -3848,6 +3909,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
| 3848 | 3909 | ||
| 3849 | update_shares(cpu); | 3910 | update_shares(cpu); |
| 3850 | 3911 | ||
| 3912 | rcu_read_lock(); | ||
| 3851 | for_each_domain(cpu, sd) { | 3913 | for_each_domain(cpu, sd) { |
| 3852 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3914 | if (!(sd->flags & SD_LOAD_BALANCE)) |
| 3853 | continue; | 3915 | continue; |
| @@ -3893,6 +3955,7 @@ out: | |||
| 3893 | if (!balance) | 3955 | if (!balance) |
| 3894 | break; | 3956 | break; |
| 3895 | } | 3957 | } |
| 3958 | rcu_read_unlock(); | ||
| 3896 | 3959 | ||
| 3897 | /* | 3960 | /* |
| 3898 | * next_balance will be updated only when there is a need. | 3961 | * next_balance will be updated only when there is a need. |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b9..2e74677cb040 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
| @@ -61,6 +61,14 @@ SCHED_FEAT(LB_BIAS, 1) | |||
| 61 | SCHED_FEAT(OWNER_SPIN, 1) | 61 | SCHED_FEAT(OWNER_SPIN, 1) |
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * Decrement CPU power based on irq activity | 64 | * Decrement CPU power based on time not spent running tasks |
| 65 | */ | 65 | */ |
| 66 | SCHED_FEAT(NONIRQ_POWER, 1) | 66 | SCHED_FEAT(NONTASK_POWER, 1) |
| 67 | |||
| 68 | /* | ||
| 69 | * Queue remote wakeups on the target CPU and process them | ||
| 70 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | ||
| 71 | */ | ||
| 72 | SCHED_FEAT(TTWU_QUEUE, 1) | ||
| 73 | |||
| 74 | SCHED_FEAT(FORCE_SD_OVERLAP, 0) | ||
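Editor's note: sched_features.h is included several times with different definitions of SCHED_FEAT(), so one list of entries yields an enum, a name table and a default bitmask. The sketch below shows the generic X-macro pattern with an illustrative feature list; it is not the kernel's actual expansion.

#include <stdio.h>

#define FEATURES(F)		\
	F(NONTASK_POWER, 1)	\
	F(TTWU_QUEUE, 1)	\
	F(FORCE_SD_OVERLAP, 0)

/* expansion 1: bit positions */
#define AS_ENUM(name, enabled) FEAT_##name,
enum { FEATURES(AS_ENUM) NR_FEATURES };

/* expansion 2: printable names */
#define AS_NAME(name, enabled) #name,
static const char *feat_names[] = { FEATURES(AS_NAME) };

/* expansion 3: default on/off bitmask */
#define AS_DEFAULT(name, enabled) ((enabled) << FEAT_##name) |
static const unsigned int feat_default = FEATURES(AS_DEFAULT) 0;

int main(void)
{
	for (int i = 0; i < NR_FEATURES; i++)
		printf("%-18s %s\n", feat_names[i],
		       (feat_default >> i) & 1 ? "on" : "off");
	return 0;
}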
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a6396427..0a51882534ea 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
| @@ -7,7 +7,7 @@ | |||
| 7 | 7 | ||
| 8 | #ifdef CONFIG_SMP | 8 | #ifdef CONFIG_SMP |
| 9 | static int | 9 | static int |
| 10 | select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 10 | select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
| 11 | { | 11 | { |
| 12 | return task_cpu(p); /* IDLE tasks as never migrated */ | 12 | return task_cpu(p); /* IDLE tasks as never migrated */ |
| 13 | } | 13 | } |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f82..97540f0c9e47 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
| @@ -183,6 +183,26 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
| 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); | 183 | return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); |
| 184 | } | 184 | } |
| 185 | 185 | ||
| 186 | typedef struct task_group *rt_rq_iter_t; | ||
| 187 | |||
| 188 | static inline struct task_group *next_task_group(struct task_group *tg) | ||
| 189 | { | ||
| 190 | do { | ||
| 191 | tg = list_entry_rcu(tg->list.next, | ||
| 192 | typeof(struct task_group), list); | ||
| 193 | } while (&tg->list != &task_groups && task_group_is_autogroup(tg)); | ||
| 194 | |||
| 195 | if (&tg->list == &task_groups) | ||
| 196 | tg = NULL; | ||
| 197 | |||
| 198 | return tg; | ||
| 199 | } | ||
| 200 | |||
| 201 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
| 202 | for (iter = container_of(&task_groups, typeof(*iter), list); \ | ||
| 203 | (iter = next_task_group(iter)) && \ | ||
| 204 | (rt_rq = iter->rt_rq[cpu_of(rq)]);) | ||
| 205 | |||
| 186 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 206 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
| 187 | { | 207 | { |
| 188 | list_add_rcu(&rt_rq->leaf_rt_rq_list, | 208 | list_add_rcu(&rt_rq->leaf_rt_rq_list, |
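Editor's note: for_each_rt_rq() above starts the walk from &task_groups itself, disguised as a task_group via container_of(), so the first next_task_group() step lands on the first real entry and autogroup entries are skipped on the way. A self-contained user-space sketch of the same trick; names and the list implementation are illustrative.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct group {
	const char *name;
	bool autogroup;			/* entries the iterator should skip */
	struct list_head list;
};

static struct list_head groups = { &groups, &groups };

static void add_group(struct group *g)
{
	g->list.prev = groups.prev;
	g->list.next = &groups;
	groups.prev->next = &g->list;
	groups.prev = &g->list;
}

/* advance to the next non-autogroup entry, or NULL at the end of the list */
static struct group *next_group(struct group *g)
{
	do {
		g = container_of(g->list.next, struct group, list);
	} while (&g->list != &groups && g->autogroup);

	if (&g->list == &groups)
		return NULL;
	return g;
}

/*
 * Start from the list head disguised as a fake element, just like the
 * kernel macro does with &task_groups, so the first next_group() call
 * lands on the first real entry.
 */
#define for_each_group(g, iter) \
	for (iter = container_of(&groups, struct group, list); \
	     ((iter = next_group(iter)) != NULL) && ((g) = iter);)

int main(void)
{
	struct group a = { "root", false }, b = { "autogroup-1", true }, c = { "batch", false };
	struct group *g, *iter;

	add_group(&a); add_group(&b); add_group(&c);

	for_each_group(g, iter)
		printf("%s\n", g->name);	/* prints "root" then "batch" */
	return 0;
}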
| @@ -288,6 +308,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq) | |||
| 288 | return ktime_to_ns(def_rt_bandwidth.rt_period); | 308 | return ktime_to_ns(def_rt_bandwidth.rt_period); |
| 289 | } | 309 | } |
| 290 | 310 | ||
| 311 | typedef struct rt_rq *rt_rq_iter_t; | ||
| 312 | |||
| 313 | #define for_each_rt_rq(rt_rq, iter, rq) \ | ||
| 314 | for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
| 315 | |||
| 291 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) | 316 | static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) |
| 292 | { | 317 | { |
| 293 | } | 318 | } |
| @@ -402,12 +427,13 @@ next: | |||
| 402 | static void __disable_runtime(struct rq *rq) | 427 | static void __disable_runtime(struct rq *rq) |
| 403 | { | 428 | { |
| 404 | struct root_domain *rd = rq->rd; | 429 | struct root_domain *rd = rq->rd; |
| 430 | rt_rq_iter_t iter; | ||
| 405 | struct rt_rq *rt_rq; | 431 | struct rt_rq *rt_rq; |
| 406 | 432 | ||
| 407 | if (unlikely(!scheduler_running)) | 433 | if (unlikely(!scheduler_running)) |
| 408 | return; | 434 | return; |
| 409 | 435 | ||
| 410 | for_each_leaf_rt_rq(rt_rq, rq) { | 436 | for_each_rt_rq(rt_rq, iter, rq) { |
| 411 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 437 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| 412 | s64 want; | 438 | s64 want; |
| 413 | int i; | 439 | int i; |
| @@ -487,6 +513,7 @@ static void disable_runtime(struct rq *rq) | |||
| 487 | 513 | ||
| 488 | static void __enable_runtime(struct rq *rq) | 514 | static void __enable_runtime(struct rq *rq) |
| 489 | { | 515 | { |
| 516 | rt_rq_iter_t iter; | ||
| 490 | struct rt_rq *rt_rq; | 517 | struct rt_rq *rt_rq; |
| 491 | 518 | ||
| 492 | if (unlikely(!scheduler_running)) | 519 | if (unlikely(!scheduler_running)) |
| @@ -495,7 +522,7 @@ static void __enable_runtime(struct rq *rq) | |||
| 495 | /* | 522 | /* |
| 496 | * Reset each runqueue's bandwidth settings | 523 | * Reset each runqueue's bandwidth settings |
| 497 | */ | 524 | */ |
| 498 | for_each_leaf_rt_rq(rt_rq, rq) { | 525 | for_each_rt_rq(rt_rq, iter, rq) { |
| 499 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); | 526 | struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); |
| 500 | 527 | ||
| 501 | raw_spin_lock(&rt_b->rt_runtime_lock); | 528 | raw_spin_lock(&rt_b->rt_runtime_lock); |
| @@ -562,6 +589,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) | |||
| 562 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { | 589 | if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { |
| 563 | rt_rq->rt_throttled = 0; | 590 | rt_rq->rt_throttled = 0; |
| 564 | enqueue = 1; | 591 | enqueue = 1; |
| 592 | |||
| 593 | /* | ||
| 594 | * Force a clock update if the CPU was idle, | ||
| 595 | * lest wakeup -> unthrottle time accumulate. | ||
| 596 | */ | ||
| 597 | if (rt_rq->rt_nr_running && rq->curr == rq->idle) | ||
| 598 | rq->skip_clock_update = -1; | ||
| 565 | } | 599 | } |
| 566 | if (rt_rq->rt_time || rt_rq->rt_nr_running) | 600 | if (rt_rq->rt_time || rt_rq->rt_nr_running) |
| 567 | idle = 0; | 601 | idle = 0; |
| @@ -977,13 +1011,23 @@ static void yield_task_rt(struct rq *rq) | |||
| 977 | static int find_lowest_rq(struct task_struct *task); | 1011 | static int find_lowest_rq(struct task_struct *task); |
| 978 | 1012 | ||
| 979 | static int | 1013 | static int |
| 980 | select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | 1014 | select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
| 981 | { | 1015 | { |
| 1016 | struct task_struct *curr; | ||
| 1017 | struct rq *rq; | ||
| 1018 | int cpu; | ||
| 1019 | |||
| 982 | if (sd_flag != SD_BALANCE_WAKE) | 1020 | if (sd_flag != SD_BALANCE_WAKE) |
| 983 | return smp_processor_id(); | 1021 | return smp_processor_id(); |
| 984 | 1022 | ||
| 1023 | cpu = task_cpu(p); | ||
| 1024 | rq = cpu_rq(cpu); | ||
| 1025 | |||
| 1026 | rcu_read_lock(); | ||
| 1027 | curr = ACCESS_ONCE(rq->curr); /* unlocked access */ | ||
| 1028 | |||
| 985 | /* | 1029 | /* |
| 986 | * If the current task is an RT task, then | 1030 | * If the current task on @p's runqueue is an RT task, then |
| 987 | * try to see if we can wake this RT task up on another | 1031 | * try to see if we can wake this RT task up on another |
| 988 | * runqueue. Otherwise simply start this RT task | 1032 | * runqueue. Otherwise simply start this RT task |
| 989 | * on its current runqueue. | 1033 | * on its current runqueue. |
| @@ -997,21 +1041,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) | |||
| 997 | * lock? | 1041 | * lock? |
| 998 | * | 1042 | * |
| 999 | * For equal prio tasks, we just let the scheduler sort it out. | 1043 | * For equal prio tasks, we just let the scheduler sort it out. |
| 1044 | * | ||
| 1045 | * Otherwise, just let it ride on the affined RQ and the | ||
| 1046 | * post-schedule router will push the preempted task away | ||
| 1047 | * | ||
| 1048 | * This test is optimistic, if we get it wrong the load-balancer | ||
| 1049 | * will have to sort it out. | ||
| 1000 | */ | 1050 | */ |
| 1001 | if (unlikely(rt_task(rq->curr)) && | 1051 | if (curr && unlikely(rt_task(curr)) && |
| 1002 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1052 | (curr->rt.nr_cpus_allowed < 2 || |
| 1003 | rq->curr->prio < p->prio) && | 1053 | curr->prio < p->prio) && |
| 1004 | (p->rt.nr_cpus_allowed > 1)) { | 1054 | (p->rt.nr_cpus_allowed > 1)) { |
| 1005 | int cpu = find_lowest_rq(p); | 1055 | int target = find_lowest_rq(p); |
| 1006 | 1056 | ||
| 1007 | return (cpu == -1) ? task_cpu(p) : cpu; | 1057 | if (target != -1) |
| 1058 | cpu = target; | ||
| 1008 | } | 1059 | } |
| 1060 | rcu_read_unlock(); | ||
| 1009 | 1061 | ||
| 1010 | /* | 1062 | return cpu; |
| 1011 | * Otherwise, just let it ride on the affined RQ and the | ||
| 1012 | * post-schedule router will push the preempted task away | ||
| 1013 | */ | ||
| 1014 | return task_cpu(p); | ||
| 1015 | } | 1063 | } |
| 1016 | 1064 | ||
| 1017 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1065 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
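Editor's note: the reworked select_task_rq_rt() above reads rq->curr without the runqueue lock and only searches for a lower-priority CPU when the wakee's old CPU is running an RT task that is pinned or higher priority, while the wakee itself can migrate. That decision is pure logic, sketched below with illustrative stand-in types; lower prio value means higher priority, and the rt_task() test is implied by the type.

#include <stdbool.h>
#include <stdio.h>

struct rt_task {
	int prio;
	int nr_cpus_allowed;
};

/*
 * true if the waking task @p should try another CPU instead of riding
 * on the CPU whose current task is @curr
 */
static bool should_look_elsewhere(const struct rt_task *curr,
				  const struct rt_task *p)
{
	if (!curr)			/* CPU idle or running a fair task */
		return false;
	return (curr->nr_cpus_allowed < 2 || curr->prio < p->prio) &&
	       p->nr_cpus_allowed > 1;
}

int main(void)
{
	struct rt_task curr = { .prio = 10, .nr_cpus_allowed = 1 };	/* pinned RT task */
	struct rt_task p    = { .prio = 20, .nr_cpus_allowed = 4 };

	printf("look elsewhere: %s\n",
	       should_look_elsewhere(&curr, &p) ? "yes" : "no");	/* yes */
	return 0;
}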
| @@ -1060,7 +1108,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag | |||
| 1060 | * to move current somewhere else, making room for our non-migratable | 1108 | * to move current somewhere else, making room for our non-migratable |
| 1061 | * task. | 1109 | * task. |
| 1062 | */ | 1110 | */ |
| 1063 | if (p->prio == rq->curr->prio && !need_resched()) | 1111 | if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) |
| 1064 | check_preempt_equal_prio(rq, p); | 1112 | check_preempt_equal_prio(rq, p); |
| 1065 | #endif | 1113 | #endif |
| 1066 | } | 1114 | } |
| @@ -1090,7 +1138,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
| 1090 | 1138 | ||
| 1091 | rt_rq = &rq->rt; | 1139 | rt_rq = &rq->rt; |
| 1092 | 1140 | ||
| 1093 | if (unlikely(!rt_rq->rt_nr_running)) | 1141 | if (!rt_rq->rt_nr_running) |
| 1094 | return NULL; | 1142 | return NULL; |
| 1095 | 1143 | ||
| 1096 | if (rt_rq_throttled(rt_rq)) | 1144 | if (rt_rq_throttled(rt_rq)) |
| @@ -1136,7 +1184,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1136 | * The previous task needs to be made eligible for pushing | 1184 | * The previous task needs to be made eligible for pushing |
| 1137 | * if it is still active | 1185 | * if it is still active |
| 1138 | */ | 1186 | */ |
| 1139 | if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) | 1187 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) |
| 1140 | enqueue_pushable_task(rq, p); | 1188 | enqueue_pushable_task(rq, p); |
| 1141 | } | 1189 | } |
| 1142 | 1190 | ||
| @@ -1203,6 +1251,10 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1203 | int this_cpu = smp_processor_id(); | 1251 | int this_cpu = smp_processor_id(); |
| 1204 | int cpu = task_cpu(task); | 1252 | int cpu = task_cpu(task); |
| 1205 | 1253 | ||
| 1254 | /* Make sure the mask is initialized first */ | ||
| 1255 | if (unlikely(!lowest_mask)) | ||
| 1256 | return -1; | ||
| 1257 | |||
| 1206 | if (task->rt.nr_cpus_allowed == 1) | 1258 | if (task->rt.nr_cpus_allowed == 1) |
| 1207 | return -1; /* No other targets possible */ | 1259 | return -1; /* No other targets possible */ |
| 1208 | 1260 | ||
| @@ -1227,6 +1279,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1227 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1279 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
| 1228 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1280 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
| 1229 | 1281 | ||
| 1282 | rcu_read_lock(); | ||
| 1230 | for_each_domain(cpu, sd) { | 1283 | for_each_domain(cpu, sd) { |
| 1231 | if (sd->flags & SD_WAKE_AFFINE) { | 1284 | if (sd->flags & SD_WAKE_AFFINE) { |
| 1232 | int best_cpu; | 1285 | int best_cpu; |
| @@ -1236,15 +1289,20 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1236 | * remote processor. | 1289 | * remote processor. |
| 1237 | */ | 1290 | */ |
| 1238 | if (this_cpu != -1 && | 1291 | if (this_cpu != -1 && |
| 1239 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) | 1292 | cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { |
| 1293 | rcu_read_unlock(); | ||
| 1240 | return this_cpu; | 1294 | return this_cpu; |
| 1295 | } | ||
| 1241 | 1296 | ||
| 1242 | best_cpu = cpumask_first_and(lowest_mask, | 1297 | best_cpu = cpumask_first_and(lowest_mask, |
| 1243 | sched_domain_span(sd)); | 1298 | sched_domain_span(sd)); |
| 1244 | if (best_cpu < nr_cpu_ids) | 1299 | if (best_cpu < nr_cpu_ids) { |
| 1300 | rcu_read_unlock(); | ||
| 1245 | return best_cpu; | 1301 | return best_cpu; |
| 1302 | } | ||
| 1246 | } | 1303 | } |
| 1247 | } | 1304 | } |
| 1305 | rcu_read_unlock(); | ||
| 1248 | 1306 | ||
| 1249 | /* | 1307 | /* |
| 1250 | * And finally, if there were no matches within the domains | 1308 | * And finally, if there were no matches within the domains |
| @@ -1287,7 +1345,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1287 | !cpumask_test_cpu(lowest_rq->cpu, | 1345 | !cpumask_test_cpu(lowest_rq->cpu, |
| 1288 | &task->cpus_allowed) || | 1346 | &task->cpus_allowed) || |
| 1289 | task_running(rq, task) || | 1347 | task_running(rq, task) || |
| 1290 | !task->se.on_rq)) { | 1348 | !task->on_rq)) { |
| 1291 | 1349 | ||
| 1292 | raw_spin_unlock(&lowest_rq->lock); | 1350 | raw_spin_unlock(&lowest_rq->lock); |
| 1293 | lowest_rq = NULL; | 1351 | lowest_rq = NULL; |
| @@ -1321,7 +1379,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
| 1321 | BUG_ON(task_current(rq, p)); | 1379 | BUG_ON(task_current(rq, p)); |
| 1322 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1380 | BUG_ON(p->rt.nr_cpus_allowed <= 1); |
| 1323 | 1381 | ||
| 1324 | BUG_ON(!p->se.on_rq); | 1382 | BUG_ON(!p->on_rq); |
| 1325 | BUG_ON(!rt_task(p)); | 1383 | BUG_ON(!rt_task(p)); |
| 1326 | 1384 | ||
| 1327 | return p; | 1385 | return p; |
| @@ -1467,7 +1525,7 @@ static int pull_rt_task(struct rq *this_rq) | |||
| 1467 | */ | 1525 | */ |
| 1468 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { | 1526 | if (p && (p->prio < this_rq->rt.highest_prio.curr)) { |
| 1469 | WARN_ON(p == src_rq->curr); | 1527 | WARN_ON(p == src_rq->curr); |
| 1470 | WARN_ON(!p->se.on_rq); | 1528 | WARN_ON(!p->on_rq); |
| 1471 | 1529 | ||
| 1472 | /* | 1530 | /* |
| 1473 | * There's a chance that p is higher in priority | 1531 | * There's a chance that p is higher in priority |
| @@ -1502,7 +1560,7 @@ skip: | |||
| 1502 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | 1560 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) |
| 1503 | { | 1561 | { |
| 1504 | /* Try to pull RT tasks here if we lower this rq's prio */ | 1562 | /* Try to pull RT tasks here if we lower this rq's prio */ |
| 1505 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) | 1563 | if (rq->rt.highest_prio.curr > prev->prio) |
| 1506 | pull_rt_task(rq); | 1564 | pull_rt_task(rq); |
| 1507 | } | 1565 | } |
| 1508 | 1566 | ||
| @@ -1538,7 +1596,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1538 | * Update the migration status of the RQ if we have an RT task | 1596 | * Update the migration status of the RQ if we have an RT task |
| 1539 | * which is running AND changing its weight value. | 1597 | * which is running AND changing its weight value. |
| 1540 | */ | 1598 | */ |
| 1541 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | 1599 | if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { |
| 1542 | struct rq *rq = task_rq(p); | 1600 | struct rq *rq = task_rq(p); |
| 1543 | 1601 | ||
| 1544 | if (!task_current(rq, p)) { | 1602 | if (!task_current(rq, p)) { |
| @@ -1608,7 +1666,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
| 1608 | * we may need to handle the pulling of RT tasks | 1666 | * we may need to handle the pulling of RT tasks |
| 1609 | * now. | 1667 | * now. |
| 1610 | */ | 1668 | */ |
| 1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) | 1669 | if (p->on_rq && !rq->rt.rt_nr_running) |
| 1612 | pull_rt_task(rq); | 1670 | pull_rt_task(rq); |
| 1613 | } | 1671 | } |
| 1614 | 1672 | ||
| @@ -1638,7 +1696,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1638 | * If that current running task is also an RT task | 1696 | * If that current running task is also an RT task |
| 1639 | * then see if we can move to another run queue. | 1697 | * then see if we can move to another run queue. |
| 1640 | */ | 1698 | */ |
| 1641 | if (p->se.on_rq && rq->curr != p) { | 1699 | if (p->on_rq && rq->curr != p) { |
| 1642 | #ifdef CONFIG_SMP | 1700 | #ifdef CONFIG_SMP |
| 1643 | if (rq->rt.overloaded && push_rt_task(rq) && | 1701 | if (rq->rt.overloaded && push_rt_task(rq) && |
| 1644 | /* Don't resched if we changed runqueues */ | 1702 | /* Don't resched if we changed runqueues */ |
| @@ -1657,7 +1715,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
| 1657 | static void | 1715 | static void |
| 1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | 1716 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
| 1659 | { | 1717 | { |
| 1660 | if (!p->se.on_rq) | 1718 | if (!p->on_rq) |
| 1661 | return; | 1719 | return; |
| 1662 | 1720 | ||
| 1663 | if (rq->curr == p) { | 1721 | if (rq->curr == p) { |
| @@ -1796,10 +1854,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); | |||
| 1796 | 1854 | ||
| 1797 | static void print_rt_stats(struct seq_file *m, int cpu) | 1855 | static void print_rt_stats(struct seq_file *m, int cpu) |
| 1798 | { | 1856 | { |
| 1857 | rt_rq_iter_t iter; | ||
| 1799 | struct rt_rq *rt_rq; | 1858 | struct rt_rq *rt_rq; |
| 1800 | 1859 | ||
| 1801 | rcu_read_lock(); | 1860 | rcu_read_lock(); |
| 1802 | for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu)) | 1861 | for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) |
| 1803 | print_rt_rq(m, cpu, rt_rq); | 1862 | print_rt_rq(m, cpu, rt_rq); |
| 1804 | rcu_read_unlock(); | 1863 | rcu_read_unlock(); |
| 1805 | } | 1864 | } |
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 48ddf431db0e..331e01bcd026 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
| @@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 37 | 37 | ||
| 38 | #ifdef CONFIG_SMP | 38 | #ifdef CONFIG_SMP |
| 39 | /* domain-specific stats */ | 39 | /* domain-specific stats */ |
| 40 | preempt_disable(); | 40 | rcu_read_lock(); |
| 41 | for_each_domain(cpu, sd) { | 41 | for_each_domain(cpu, sd) { |
| 42 | enum cpu_idle_type itype; | 42 | enum cpu_idle_type itype; |
| 43 | 43 | ||
| @@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 64 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
| 65 | sd->ttwu_move_balance); | 65 | sd->ttwu_move_balance); |
| 66 | } | 66 | } |
| 67 | preempt_enable(); | 67 | rcu_read_unlock(); |
| 68 | #endif | 68 | #endif |
| 69 | } | 69 | } |
| 70 | kfree(mask_str); | 70 | kfree(mask_str); |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fdac..6f437632afab 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
| @@ -9,8 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #ifdef CONFIG_SMP | 10 | #ifdef CONFIG_SMP |
| 11 | static int | 11 | static int |
| 12 | select_task_rq_stop(struct rq *rq, struct task_struct *p, | 12 | select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) |
| 13 | int sd_flag, int flags) | ||
| 14 | { | 13 | { |
| 15 | return task_cpu(p); /* stop tasks as never migrate */ | 14 | return task_cpu(p); /* stop tasks as never migrate */ |
| 16 | } | 15 | } |
| @@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) | |||
| 26 | { | 25 | { |
| 27 | struct task_struct *stop = rq->stop; | 26 | struct task_struct *stop = rq->stop; |
| 28 | 27 | ||
| 29 | if (stop && stop->se.on_rq) | 28 | if (stop && stop->on_rq) |
| 30 | return stop; | 29 | return stop; |
| 31 | 30 | ||
| 32 | return NULL; | 31 | return NULL; |
diff --git a/kernel/signal.c b/kernel/signal.c index 7165af5f1b11..291c9700be75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -87,7 +87,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) | |||
| 87 | /* | 87 | /* |
| 88 | * Tracers may want to know about even ignored signals. | 88 | * Tracers may want to know about even ignored signals. |
| 89 | */ | 89 | */ |
| 90 | return !tracehook_consider_ignored_signal(t, sig); | 90 | return !t->ptrace; |
| 91 | } | 91 | } |
| 92 | 92 | ||
| 93 | /* | 93 | /* |
| @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |||
| 124 | 124 | ||
| 125 | static int recalc_sigpending_tsk(struct task_struct *t) | 125 | static int recalc_sigpending_tsk(struct task_struct *t) |
| 126 | { | 126 | { |
| 127 | if (t->signal->group_stop_count > 0 || | 127 | if ((t->jobctl & JOBCTL_PENDING_MASK) || |
| 128 | PENDING(&t->pending, &t->blocked) || | 128 | PENDING(&t->pending, &t->blocked) || |
| 129 | PENDING(&t->signal->shared_pending, &t->blocked)) { | 129 | PENDING(&t->signal->shared_pending, &t->blocked)) { |
| 130 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 130 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
| @@ -150,9 +150,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) | |||
| 150 | 150 | ||
| 151 | void recalc_sigpending(void) | 151 | void recalc_sigpending(void) |
| 152 | { | 152 | { |
| 153 | if (unlikely(tracehook_force_sigpending())) | 153 | if (!recalc_sigpending_tsk(current) && !freezing(current)) |
| 154 | set_thread_flag(TIF_SIGPENDING); | ||
| 155 | else if (!recalc_sigpending_tsk(current) && !freezing(current)) | ||
| 156 | clear_thread_flag(TIF_SIGPENDING); | 154 | clear_thread_flag(TIF_SIGPENDING); |
| 157 | 155 | ||
| 158 | } | 156 | } |
| @@ -223,6 +221,129 @@ static inline void print_dropped_signal(int sig) | |||
| 223 | current->comm, current->pid, sig); | 221 | current->comm, current->pid, sig); |
| 224 | } | 222 | } |
| 225 | 223 | ||
| 224 | /** | ||
| 225 | * task_set_jobctl_pending - set jobctl pending bits | ||
| 226 | * @task: target task | ||
| 227 | * @mask: pending bits to set | ||
| 228 | * | ||
| 229 | * Set @mask in @task->jobctl. @mask must be subset of | ||
| 230 | * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK | | ||
| 231 | * %JOBCTL_TRAPPING. If stop signo is being set, the existing signo is | ||
| 232 | * cleared. If @task is already being killed or exiting, this function | ||
| 233 | * becomes noop. | ||
| 234 | * | ||
| 235 | * CONTEXT: | ||
| 236 | * Must be called with @task->sighand->siglock held. | ||
| 237 | * | ||
| 238 | * RETURNS: | ||
| 239 | * %true if @mask is set, %false if made noop because @task was dying. | ||
| 240 | */ | ||
| 241 | bool task_set_jobctl_pending(struct task_struct *task, unsigned int mask) | ||
| 242 | { | ||
| 243 | BUG_ON(mask & ~(JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME | | ||
| 244 | JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING)); | ||
| 245 | BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK)); | ||
| 246 | |||
| 247 | if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING))) | ||
| 248 | return false; | ||
| 249 | |||
| 250 | if (mask & JOBCTL_STOP_SIGMASK) | ||
| 251 | task->jobctl &= ~JOBCTL_STOP_SIGMASK; | ||
| 252 | |||
| 253 | task->jobctl |= mask; | ||
| 254 | return true; | ||
| 255 | } | ||
| 256 | |||
| 257 | /** | ||
| 258 | * task_clear_jobctl_trapping - clear jobctl trapping bit | ||
| 259 | * @task: target task | ||
| 260 | * | ||
| 261 | * If JOBCTL_TRAPPING is set, a ptracer is waiting for us to enter TRACED. | ||
| 262 | * Clear it and wake up the ptracer. Note that we don't need any further | ||
| 263 | * locking. @task->siglock guarantees that @task->parent points to the | ||
| 264 | * ptracer. | ||
| 265 | * | ||
| 266 | * CONTEXT: | ||
| 267 | * Must be called with @task->sighand->siglock held. | ||
| 268 | */ | ||
| 269 | void task_clear_jobctl_trapping(struct task_struct *task) | ||
| 270 | { | ||
| 271 | if (unlikely(task->jobctl & JOBCTL_TRAPPING)) { | ||
| 272 | task->jobctl &= ~JOBCTL_TRAPPING; | ||
| 273 | wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT); | ||
| 274 | } | ||
| 275 | } | ||
| 276 | |||
| 277 | /** | ||
| 278 | * task_clear_jobctl_pending - clear jobctl pending bits | ||
| 279 | * @task: target task | ||
| 280 | * @mask: pending bits to clear | ||
| 281 | * | ||
| 282 | * Clear @mask from @task->jobctl. @mask must be subset of | ||
| 283 | * %JOBCTL_PENDING_MASK. If %JOBCTL_STOP_PENDING is being cleared, other | ||
| 284 | * STOP bits are cleared together. | ||
| 285 | * | ||
| 286 | * If clearing of @mask leaves no stop or trap pending, this function calls | ||
| 287 | * task_clear_jobctl_trapping(). | ||
| 288 | * | ||
| 289 | * CONTEXT: | ||
| 290 | * Must be called with @task->sighand->siglock held. | ||
| 291 | */ | ||
| 292 | void task_clear_jobctl_pending(struct task_struct *task, unsigned int mask) | ||
| 293 | { | ||
| 294 | BUG_ON(mask & ~JOBCTL_PENDING_MASK); | ||
| 295 | |||
| 296 | if (mask & JOBCTL_STOP_PENDING) | ||
| 297 | mask |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED; | ||
| 298 | |||
| 299 | task->jobctl &= ~mask; | ||
| 300 | |||
| 301 | if (!(task->jobctl & JOBCTL_PENDING_MASK)) | ||
| 302 | task_clear_jobctl_trapping(task); | ||
| 303 | } | ||
| 304 | |||
| 305 | /** | ||
| 306 | * task_participate_group_stop - participate in a group stop | ||
| 307 | * @task: task participating in a group stop | ||
| 308 | * | ||
| 309 | * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop. | ||
| 310 | * Group stop states are cleared and the group stop count is consumed if | ||
| 311 | * %JOBCTL_STOP_CONSUME was set. If the consumption completes the group | ||
| 312 | * stop, the appropriate %SIGNAL_* flags are set. | ||
| 313 | * | ||
| 314 | * CONTEXT: | ||
| 315 | * Must be called with @task->sighand->siglock held. | ||
| 316 | * | ||
| 317 | * RETURNS: | ||
| 318 | * %true if group stop completion should be notified to the parent, %false | ||
| 319 | * otherwise. | ||
| 320 | */ | ||
| 321 | static bool task_participate_group_stop(struct task_struct *task) | ||
| 322 | { | ||
| 323 | struct signal_struct *sig = task->signal; | ||
| 324 | bool consume = task->jobctl & JOBCTL_STOP_CONSUME; | ||
| 325 | |||
| 326 | WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)); | ||
| 327 | |||
| 328 | task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING); | ||
| 329 | |||
| 330 | if (!consume) | ||
| 331 | return false; | ||
| 332 | |||
| 333 | if (!WARN_ON_ONCE(sig->group_stop_count == 0)) | ||
| 334 | sig->group_stop_count--; | ||
| 335 | |||
| 336 | /* | ||
| 337 | * Tell the caller to notify completion iff we are entering into a | ||
| 338 | * fresh group stop. Read comment in do_signal_stop() for details. | ||
| 339 | */ | ||
| 340 | if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { | ||
| 341 | sig->flags = SIGNAL_STOP_STOPPED; | ||
| 342 | return true; | ||
| 343 | } | ||
| 344 | return false; | ||
| 345 | } | ||
| 346 | |||
| 226 | /* | 347 | /* |
| 227 | * allocate a new signal queue record | 348 | * allocate a new signal queue record |
| 228 | * - this may be called without locks if and only if t == current, otherwise an | 349 | * - this may be called without locks if and only if t == current, otherwise an |
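Editor's note: the block above introduces the jobctl helpers that replace the old group_stop_count bookkeeping. Their core is bitmask accounting: setting refuses work for a dying task, clearing JOBCTL_STOP_PENDING drops the related STOP bits, and the TRAPPING bit falls once nothing is pending. The user-space sketch below models only that accounting; the bit names and values are invented for the example, and the real helpers additionally require sighand->siglock and wake the ptracer.

#include <stdbool.h>
#include <stdio.h>

/* illustrative bit layout only; the real JOBCTL_* values live in the kernel */
enum {
	JC_STOP_DEQUEUED = 1 << 0,
	JC_STOP_PENDING  = 1 << 1,
	JC_STOP_CONSUME  = 1 << 2,
	JC_TRAP_STOP     = 1 << 3,
	JC_TRAP_NOTIFY   = 1 << 4,
	JC_TRAPPING      = 1 << 5,
};
#define JC_PENDING_MASK (JC_STOP_PENDING | JC_TRAP_STOP | JC_TRAP_NOTIFY)

struct task {
	unsigned int jobctl;
	bool dying;
};

/* mirrors task_set_jobctl_pending(): refuses to queue work for a dying task */
static bool set_jobctl_pending(struct task *t, unsigned int mask)
{
	if (t->dying)
		return false;
	t->jobctl |= mask;
	return true;
}

/*
 * mirrors task_clear_jobctl_pending(): clearing STOP_PENDING drops the
 * helper STOP bits too, and TRAPPING falls once nothing is pending
 */
static void clear_jobctl_pending(struct task *t, unsigned int mask)
{
	if (mask & JC_STOP_PENDING)
		mask |= JC_STOP_CONSUME | JC_STOP_DEQUEUED;

	t->jobctl &= ~mask;

	if (!(t->jobctl & JC_PENDING_MASK))
		t->jobctl &= ~JC_TRAPPING;
}

int main(void)
{
	struct task t = { 0, false };

	set_jobctl_pending(&t, JC_STOP_PENDING | JC_STOP_CONSUME | JC_TRAPPING);
	clear_jobctl_pending(&t, JC_STOP_PENDING);
	printf("jobctl after clear: %#x\n", t.jobctl);	/* 0: everything dropped */
	return 0;
}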
| @@ -372,7 +493,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) | |||
| 372 | return 1; | 493 | return 1; |
| 373 | if (handler != SIG_IGN && handler != SIG_DFL) | 494 | if (handler != SIG_IGN && handler != SIG_DFL) |
| 374 | return 0; | 495 | return 0; |
| 375 | return !tracehook_consider_fatal_signal(tsk, sig); | 496 | /* if ptraced, let the tracer determine */ |
| 497 | return !tsk->ptrace; | ||
| 376 | } | 498 | } |
| 377 | 499 | ||
| 378 | /* | 500 | /* |
| @@ -527,7 +649,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
| 527 | * is to alert stop-signal processing code when another | 649 | * is to alert stop-signal processing code when another |
| 528 | * processor has come along and cleared the flag. | 650 | * processor has come along and cleared the flag. |
| 529 | */ | 651 | */ |
| 530 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 652 | current->jobctl |= JOBCTL_STOP_DEQUEUED; |
| 531 | } | 653 | } |
| 532 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { | 654 | if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { |
| 533 | /* | 655 | /* |
| @@ -592,7 +714,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | |||
| 592 | if (sigisemptyset(&m)) | 714 | if (sigisemptyset(&m)) |
| 593 | return 0; | 715 | return 0; |
| 594 | 716 | ||
| 595 | signandsets(&s->signal, &s->signal, mask); | 717 | sigandnsets(&s->signal, &s->signal, mask); |
| 596 | list_for_each_entry_safe(q, n, &s->list, list) { | 718 | list_for_each_entry_safe(q, n, &s->list, list) { |
| 597 | if (sigismember(mask, q->info.si_signo)) { | 719 | if (sigismember(mask, q->info.si_signo)) { |
| 598 | list_del_init(&q->list); | 720 | list_del_init(&q->list); |
| @@ -696,6 +818,32 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 696 | return security_task_kill(t, info, sig, 0); | 818 | return security_task_kill(t, info, sig, 0); |
| 697 | } | 819 | } |
| 698 | 820 | ||
| 821 | /** | ||
| 822 | * ptrace_trap_notify - schedule trap to notify ptracer | ||
| 823 | * @t: tracee wanting to notify tracer | ||
| 824 | * | ||
| 825 | * This function schedules sticky ptrace trap which is cleared on the next | ||
| 826 | * TRAP_STOP to notify ptracer of an event. @t must have been seized by | ||
| 827 | * ptracer. | ||
| 828 | * | ||
| 829 | * If @t is running, STOP trap will be taken. If trapped for STOP and | ||
| 830 | * ptracer is listening for events, tracee is woken up so that it can | ||
| 831 | * re-trap for the new event. If trapped otherwise, STOP trap will be | ||
| 832 | * eventually taken without returning to userland after the existing traps | ||
| 833 | * are finished by PTRACE_CONT. | ||
| 834 | * | ||
| 835 | * CONTEXT: | ||
| 836 | * Must be called with @task->sighand->siglock held. | ||
| 837 | */ | ||
| 838 | static void ptrace_trap_notify(struct task_struct *t) | ||
| 839 | { | ||
| 840 | WARN_ON_ONCE(!(t->ptrace & PT_SEIZED)); | ||
| 841 | assert_spin_locked(&t->sighand->siglock); | ||
| 842 | |||
| 843 | task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); | ||
| 844 | signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); | ||
| 845 | } | ||
| 846 | |||
| 699 | /* | 847 | /* |
| 700 | * Handle magic process-wide effects of stop/continue signals. Unlike | 848 | * Handle magic process-wide effects of stop/continue signals. Unlike |
| 701 | * the signal actions, these happen immediately at signal-generation | 849 | * the signal actions, these happen immediately at signal-generation |
| @@ -727,34 +875,17 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
| 727 | } else if (sig == SIGCONT) { | 875 | } else if (sig == SIGCONT) { |
| 728 | unsigned int why; | 876 | unsigned int why; |
| 729 | /* | 877 | /* |
| 730 | * Remove all stop signals from all queues, | 878 | * Remove all stop signals from all queues, wake all threads. |
| 731 | * and wake all threads. | ||
| 732 | */ | 879 | */ |
| 733 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); | 880 | rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); |
| 734 | t = p; | 881 | t = p; |
| 735 | do { | 882 | do { |
| 736 | unsigned int state; | 883 | task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING); |
| 737 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | 884 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); |
| 738 | /* | 885 | if (likely(!(t->ptrace & PT_SEIZED))) |
| 739 | * If there is a handler for SIGCONT, we must make | 886 | wake_up_state(t, __TASK_STOPPED); |
| 740 | * sure that no thread returns to user mode before | 887 | else |
| 741 | * we post the signal, in case it was the only | 888 | ptrace_trap_notify(t); |
| 742 | * thread eligible to run the signal handler--then | ||
| 743 | * it must not do anything between resuming and | ||
| 744 | * running the handler. With the TIF_SIGPENDING | ||
| 745 | * flag set, the thread will pause and acquire the | ||
| 746 | * siglock that we hold now and until we've queued | ||
| 747 | * the pending signal. | ||
| 748 | * | ||
| 749 | * Wake up the stopped thread _after_ setting | ||
| 750 | * TIF_SIGPENDING | ||
| 751 | */ | ||
| 752 | state = __TASK_STOPPED; | ||
| 753 | if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { | ||
| 754 | set_tsk_thread_flag(t, TIF_SIGPENDING); | ||
| 755 | state |= TASK_INTERRUPTIBLE; | ||
| 756 | } | ||
| 757 | wake_up_state(t, state); | ||
| 758 | } while_each_thread(p, t); | 889 | } while_each_thread(p, t); |
| 759 | 890 | ||
| 760 | /* | 891 | /* |
| @@ -780,13 +911,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) | |||
| 780 | signal->flags = why | SIGNAL_STOP_CONTINUED; | 911 | signal->flags = why | SIGNAL_STOP_CONTINUED; |
| 781 | signal->group_stop_count = 0; | 912 | signal->group_stop_count = 0; |
| 782 | signal->group_exit_code = 0; | 913 | signal->group_exit_code = 0; |
| 783 | } else { | ||
| 784 | /* | ||
| 785 | * We are not stopped, but there could be a stop | ||
| 786 | * signal in the middle of being processed after | ||
| 787 | * being removed from the queue. Clear that too. | ||
| 788 | */ | ||
| 789 | signal->flags &= ~SIGNAL_STOP_DEQUEUED; | ||
| 790 | } | 914 | } |
| 791 | } | 915 | } |
| 792 | 916 | ||
| @@ -858,8 +982,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) | |||
| 858 | if (sig_fatal(p, sig) && | 982 | if (sig_fatal(p, sig) && |
| 859 | !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && | 983 | !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && |
| 860 | !sigismember(&t->real_blocked, sig) && | 984 | !sigismember(&t->real_blocked, sig) && |
| 861 | (sig == SIGKILL || | 985 | (sig == SIGKILL || !t->ptrace)) { |
| 862 | !tracehook_consider_fatal_signal(t, sig))) { | ||
| 863 | /* | 986 | /* |
| 864 | * This signal will be fatal to the whole group. | 987 | * This signal will be fatal to the whole group. |
| 865 | */ | 988 | */ |
| @@ -875,6 +998,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) | |||
| 875 | signal->group_stop_count = 0; | 998 | signal->group_stop_count = 0; |
| 876 | t = p; | 999 | t = p; |
| 877 | do { | 1000 | do { |
| 1001 | task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); | ||
| 878 | sigaddset(&t->pending.signal, SIGKILL); | 1002 | sigaddset(&t->pending.signal, SIGKILL); |
| 879 | signal_wake_up(t, 1); | 1003 | signal_wake_up(t, 1); |
| 880 | } while_each_thread(p, t); | 1004 | } while_each_thread(p, t); |
| @@ -1109,6 +1233,7 @@ int zap_other_threads(struct task_struct *p) | |||
| 1109 | p->signal->group_stop_count = 0; | 1233 | p->signal->group_stop_count = 0; |
| 1110 | 1234 | ||
| 1111 | while_each_thread(p, t) { | 1235 | while_each_thread(p, t) { |
| 1236 | task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); | ||
| 1112 | count++; | 1237 | count++; |
| 1113 | 1238 | ||
| 1114 | /* Don't bother with already dead threads */ | 1239 | /* Don't bother with already dead threads */ |
| @@ -1126,18 +1251,25 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, | |||
| 1126 | { | 1251 | { |
| 1127 | struct sighand_struct *sighand; | 1252 | struct sighand_struct *sighand; |
| 1128 | 1253 | ||
| 1129 | rcu_read_lock(); | ||
| 1130 | for (;;) { | 1254 | for (;;) { |
| 1255 | local_irq_save(*flags); | ||
| 1256 | rcu_read_lock(); | ||
| 1131 | sighand = rcu_dereference(tsk->sighand); | 1257 | sighand = rcu_dereference(tsk->sighand); |
| 1132 | if (unlikely(sighand == NULL)) | 1258 | if (unlikely(sighand == NULL)) { |
| 1259 | rcu_read_unlock(); | ||
| 1260 | local_irq_restore(*flags); | ||
| 1133 | break; | 1261 | break; |
| 1262 | } | ||
| 1134 | 1263 | ||
| 1135 | spin_lock_irqsave(&sighand->siglock, *flags); | 1264 | spin_lock(&sighand->siglock); |
| 1136 | if (likely(sighand == tsk->sighand)) | 1265 | if (likely(sighand == tsk->sighand)) { |
| 1266 | rcu_read_unlock(); | ||
| 1137 | break; | 1267 | break; |
| 1138 | spin_unlock_irqrestore(&sighand->siglock, *flags); | 1268 | } |
| 1269 | spin_unlock(&sighand->siglock); | ||
| 1270 | rcu_read_unlock(); | ||
| 1271 | local_irq_restore(*flags); | ||
| 1139 | } | 1272 | } |
| 1140 | rcu_read_unlock(); | ||
| 1141 | 1273 | ||
| 1142 | return sighand; | 1274 | return sighand; |
| 1143 | } | 1275 | } |
| @@ -1452,22 +1584,22 @@ ret: | |||
| 1452 | * Let a parent know about the death of a child. | 1584 | * Let a parent know about the death of a child. |
| 1453 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. | 1585 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. |
| 1454 | * | 1586 | * |
| 1455 | * Returns -1 if our parent ignored us and so we've switched to | 1587 | * Returns true if our parent ignored us and so we've switched to |
| 1456 | * self-reaping, or else @sig. | 1588 | * self-reaping. |
| 1457 | */ | 1589 | */ |
| 1458 | int do_notify_parent(struct task_struct *tsk, int sig) | 1590 | bool do_notify_parent(struct task_struct *tsk, int sig) |
| 1459 | { | 1591 | { |
| 1460 | struct siginfo info; | 1592 | struct siginfo info; |
| 1461 | unsigned long flags; | 1593 | unsigned long flags; |
| 1462 | struct sighand_struct *psig; | 1594 | struct sighand_struct *psig; |
| 1463 | int ret = sig; | 1595 | bool autoreap = false; |
| 1464 | 1596 | ||
| 1465 | BUG_ON(sig == -1); | 1597 | BUG_ON(sig == -1); |
| 1466 | 1598 | ||
| 1467 | /* do_notify_parent_cldstop should have been called instead. */ | 1599 | /* do_notify_parent_cldstop should have been called instead. */ |
| 1468 | BUG_ON(task_is_stopped_or_traced(tsk)); | 1600 | BUG_ON(task_is_stopped_or_traced(tsk)); |
| 1469 | 1601 | ||
| 1470 | BUG_ON(!task_ptrace(tsk) && | 1602 | BUG_ON(!tsk->ptrace && |
| 1471 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); | 1603 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); |
| 1472 | 1604 | ||
| 1473 | info.si_signo = sig; | 1605 | info.si_signo = sig; |
| @@ -1506,7 +1638,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1506 | 1638 | ||
| 1507 | psig = tsk->parent->sighand; | 1639 | psig = tsk->parent->sighand; |
| 1508 | spin_lock_irqsave(&psig->siglock, flags); | 1640 | spin_lock_irqsave(&psig->siglock, flags); |
| 1509 | if (!task_ptrace(tsk) && sig == SIGCHLD && | 1641 | if (!tsk->ptrace && sig == SIGCHLD && |
| 1510 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || | 1642 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || |
| 1511 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { | 1643 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { |
| 1512 | /* | 1644 | /* |
| @@ -1524,28 +1656,42 @@ int do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1524 | * is implementation-defined: we do (if you don't want | 1656 | * is implementation-defined: we do (if you don't want |
| 1525 | * it, just use SIG_IGN instead). | 1657 | * it, just use SIG_IGN instead). |
| 1526 | */ | 1658 | */ |
| 1527 | ret = tsk->exit_signal = -1; | 1659 | autoreap = true; |
| 1528 | if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) | 1660 | if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) |
| 1529 | sig = -1; | 1661 | sig = 0; |
| 1530 | } | 1662 | } |
| 1531 | if (valid_signal(sig) && sig > 0) | 1663 | if (valid_signal(sig) && sig) |
| 1532 | __group_send_sig_info(sig, &info, tsk->parent); | 1664 | __group_send_sig_info(sig, &info, tsk->parent); |
| 1533 | __wake_up_parent(tsk, tsk->parent); | 1665 | __wake_up_parent(tsk, tsk->parent); |
| 1534 | spin_unlock_irqrestore(&psig->siglock, flags); | 1666 | spin_unlock_irqrestore(&psig->siglock, flags); |
| 1535 | 1667 | ||
| 1536 | return ret; | 1668 | return autoreap; |
| 1537 | } | 1669 | } |
| 1538 | 1670 | ||
| 1539 | static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | 1671 | /** |
| 1672 | * do_notify_parent_cldstop - notify parent of stopped/continued state change | ||
| 1673 | * @tsk: task reporting the state change | ||
| 1674 | * @for_ptracer: the notification is for ptracer | ||
| 1675 | * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report | ||
| 1676 | * | ||
| 1677 | * Notify @tsk's parent that the stopped/continued state has changed. If | ||
| 1678 | * @for_ptracer is %false, @tsk's group leader notifies its real parent. | ||
| 1679 | * If %true, @tsk reports to @tsk->parent which should be the ptracer. | ||
| 1680 | * | ||
| 1681 | * CONTEXT: | ||
| 1682 | * Must be called with tasklist_lock at least read locked. | ||
| 1683 | */ | ||
| 1684 | static void do_notify_parent_cldstop(struct task_struct *tsk, | ||
| 1685 | bool for_ptracer, int why) | ||
| 1540 | { | 1686 | { |
| 1541 | struct siginfo info; | 1687 | struct siginfo info; |
| 1542 | unsigned long flags; | 1688 | unsigned long flags; |
| 1543 | struct task_struct *parent; | 1689 | struct task_struct *parent; |
| 1544 | struct sighand_struct *sighand; | 1690 | struct sighand_struct *sighand; |
| 1545 | 1691 | ||
| 1546 | if (task_ptrace(tsk)) | 1692 | if (for_ptracer) { |
| 1547 | parent = tsk->parent; | 1693 | parent = tsk->parent; |
| 1548 | else { | 1694 | } else { |
| 1549 | tsk = tsk->group_leader; | 1695 | tsk = tsk->group_leader; |
| 1550 | parent = tsk->real_parent; | 1696 | parent = tsk->real_parent; |
| 1551 | } | 1697 | } |
| @@ -1592,7 +1738,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
| 1592 | 1738 | ||
| 1593 | static inline int may_ptrace_stop(void) | 1739 | static inline int may_ptrace_stop(void) |
| 1594 | { | 1740 | { |
| 1595 | if (!likely(task_ptrace(current))) | 1741 | if (!likely(current->ptrace)) |
| 1596 | return 0; | 1742 | return 0; |
| 1597 | /* | 1743 | /* |
| 1598 | * Are we in the middle of do_coredump? | 1744 | * Are we in the middle of do_coredump? |
| @@ -1631,10 +1777,12 @@ static int sigkill_pending(struct task_struct *tsk) | |||
| 1631 | * If we actually decide not to stop at all because the tracer | 1777 | * If we actually decide not to stop at all because the tracer |
| 1632 | * is gone, we keep current->exit_code unless clear_code. | 1778 | * is gone, we keep current->exit_code unless clear_code. |
| 1633 | */ | 1779 | */ |
| 1634 | static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | 1780 | static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) |
| 1635 | __releases(¤t->sighand->siglock) | 1781 | __releases(¤t->sighand->siglock) |
| 1636 | __acquires(¤t->sighand->siglock) | 1782 | __acquires(¤t->sighand->siglock) |
| 1637 | { | 1783 | { |
| 1784 | bool gstop_done = false; | ||
| 1785 | |||
| 1638 | if (arch_ptrace_stop_needed(exit_code, info)) { | 1786 | if (arch_ptrace_stop_needed(exit_code, info)) { |
| 1639 | /* | 1787 | /* |
| 1640 | * The arch code has something special to do before a | 1788 | * The arch code has something special to do before a |
| @@ -1655,21 +1803,52 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
| 1655 | } | 1803 | } |
| 1656 | 1804 | ||
| 1657 | /* | 1805 | /* |
| 1658 | * If there is a group stop in progress, | 1806 | * We're committing to trapping. TRACED should be visible before |
| 1659 | * we must participate in the bookkeeping. | 1807 | * TRAPPING is cleared; otherwise, the tracer might fail do_wait(). |
| 1808 | * Also, transition to TRACED and updates to ->jobctl should be | ||
| 1809 | * atomic with respect to siglock and should be done after the arch | ||
| 1810 | * hook as siglock is released and regrabbed across it. | ||
| 1660 | */ | 1811 | */ |
| 1661 | if (current->signal->group_stop_count > 0) | 1812 | set_current_state(TASK_TRACED); |
| 1662 | --current->signal->group_stop_count; | ||
| 1663 | 1813 | ||
| 1664 | current->last_siginfo = info; | 1814 | current->last_siginfo = info; |
| 1665 | current->exit_code = exit_code; | 1815 | current->exit_code = exit_code; |
| 1666 | 1816 | ||
| 1667 | /* Let the debugger run. */ | 1817 | /* |
| 1668 | __set_current_state(TASK_TRACED); | 1818 | * If @why is CLD_STOPPED, we're trapping to participate in a group |
| 1819 | * stop. Do the bookkeeping. Note that if SIGCONT was delivered | ||
| 1820 | * across siglock relocks since INTERRUPT was scheduled, PENDING | ||
| 1821 | * could be clear now. We act as if SIGCONT is received after | ||
| 1822 | * TASK_TRACED is entered - ignore it. | ||
| 1823 | */ | ||
| 1824 | if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING)) | ||
| 1825 | gstop_done = task_participate_group_stop(current); | ||
| 1826 | |||
| 1827 | /* any trap clears pending STOP trap, STOP trap clears NOTIFY */ | ||
| 1828 | task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP); | ||
| 1829 | if (info && info->si_code >> 8 == PTRACE_EVENT_STOP) | ||
| 1830 | task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY); | ||
| 1831 | |||
| 1832 | /* entering a trap, clear TRAPPING */ | ||
| 1833 | task_clear_jobctl_trapping(current); | ||
| 1834 | |||
| 1669 | spin_unlock_irq(¤t->sighand->siglock); | 1835 | spin_unlock_irq(¤t->sighand->siglock); |
| 1670 | read_lock(&tasklist_lock); | 1836 | read_lock(&tasklist_lock); |
| 1671 | if (may_ptrace_stop()) { | 1837 | if (may_ptrace_stop()) { |
| 1672 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1838 | /* |
| 1839 | * Notify parents of the stop. | ||
| 1840 | * | ||
| 1841 | * While ptraced, there are two parents - the ptracer and | ||
| 1842 | * the real_parent of the group_leader. The ptracer should | ||
| 1843 | * know about every stop while the real parent is only | ||
| 1844 | * interested in the completion of group stop. The states | ||
| 1845 | * for the two don't interact with each other. Notify | ||
| 1846 | * separately unless they're gonna be duplicates. | ||
| 1847 | */ | ||
| 1848 | do_notify_parent_cldstop(current, true, why); | ||
| 1849 | if (gstop_done && ptrace_reparented(current)) | ||
| 1850 | do_notify_parent_cldstop(current, false, why); | ||
| 1851 | |||
| 1673 | /* | 1852 | /* |
| 1674 | * Don't want to allow preemption here, because | 1853 | * Don't want to allow preemption here, because |
| 1675 | * sys_ptrace() needs this task to be inactive. | 1854 | * sys_ptrace() needs this task to be inactive. |
| @@ -1684,7 +1863,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
| 1684 | /* | 1863 | /* |
| 1685 | * By the time we got the lock, our tracer went away. | 1864 | * By the time we got the lock, our tracer went away. |
| 1686 | * Don't drop the lock yet, another tracer may come. | 1865 | * Don't drop the lock yet, another tracer may come. |
| 1866 | * | ||
| 1867 | * If @gstop_done, the ptracer went away between group stop | ||
| 1868 | * completion and here. During detach, it would have set | ||
| 1869 | * JOBCTL_STOP_PENDING on us and we'll re-enter | ||
| 1870 | * TASK_STOPPED in do_signal_stop() on return, so notifying | ||
| 1871 | * the real parent of the group stop completion is enough. | ||
| 1687 | */ | 1872 | */ |
| 1873 | if (gstop_done) | ||
| 1874 | do_notify_parent_cldstop(current, false, why); | ||
| 1875 | |||
| 1688 | __set_current_state(TASK_RUNNING); | 1876 | __set_current_state(TASK_RUNNING); |
| 1689 | if (clear_code) | 1877 | if (clear_code) |
| 1690 | current->exit_code = 0; | 1878 | current->exit_code = 0; |
| @@ -1706,6 +1894,9 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
| 1706 | spin_lock_irq(¤t->sighand->siglock); | 1894 | spin_lock_irq(¤t->sighand->siglock); |
| 1707 | current->last_siginfo = NULL; | 1895 | current->last_siginfo = NULL; |
| 1708 | 1896 | ||
| 1897 | /* LISTENING can be set only during STOP traps, clear it */ | ||
| 1898 | current->jobctl &= ~JOBCTL_LISTENING; | ||
| 1899 | |||
| 1709 | /* | 1900 | /* |
| 1710 | * Queued signals ignored us while we were stopped for tracing. | 1901 | * Queued signals ignored us while we were stopped for tracing. |
| 1711 | * So check for any that we should take before resuming user mode. | 1902 | * So check for any that we should take before resuming user mode. |
| @@ -1714,107 +1905,204 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) | |||
| 1714 | recalc_sigpending_tsk(current); | 1905 | recalc_sigpending_tsk(current); |
| 1715 | } | 1906 | } |
| 1716 | 1907 | ||
| 1717 | void ptrace_notify(int exit_code) | 1908 | static void ptrace_do_notify(int signr, int exit_code, int why) |
| 1718 | { | 1909 | { |
| 1719 | siginfo_t info; | 1910 | siginfo_t info; |
| 1720 | 1911 | ||
| 1721 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | ||
| 1722 | |||
| 1723 | memset(&info, 0, sizeof info); | 1912 | memset(&info, 0, sizeof info); |
| 1724 | info.si_signo = SIGTRAP; | 1913 | info.si_signo = signr; |
| 1725 | info.si_code = exit_code; | 1914 | info.si_code = exit_code; |
| 1726 | info.si_pid = task_pid_vnr(current); | 1915 | info.si_pid = task_pid_vnr(current); |
| 1727 | info.si_uid = current_uid(); | 1916 | info.si_uid = current_uid(); |
| 1728 | 1917 | ||
| 1729 | /* Let the debugger run. */ | 1918 | /* Let the debugger run. */ |
| 1919 | ptrace_stop(exit_code, why, 1, &info); | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | void ptrace_notify(int exit_code) | ||
| 1923 | { | ||
| 1924 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | ||
| 1925 | |||
| 1730 | spin_lock_irq(¤t->sighand->siglock); | 1926 | spin_lock_irq(¤t->sighand->siglock); |
| 1731 | ptrace_stop(exit_code, 1, &info); | 1927 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); |
| 1732 | spin_unlock_irq(¤t->sighand->siglock); | 1928 | spin_unlock_irq(¤t->sighand->siglock); |
| 1733 | } | 1929 | } |
| 1734 | 1930 | ||
| 1735 | /* | 1931 | /** |
| 1736 | * This performs the stopping for SIGSTOP and other stop signals. | 1932 | * do_signal_stop - handle group stop for SIGSTOP and other stop signals |
| 1737 | * We have to stop all threads in the thread group. | 1933 | * @signr: signr causing group stop if initiating |
| 1738 | * Returns non-zero if we've actually stopped and released the siglock. | 1934 | * |
| 1739 | * Returns zero if we didn't stop and still hold the siglock. | 1935 | * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr |
| 1936 | * and participate in it. If already set, participate in the existing | ||
| 1937 | * group stop. If participated in a group stop (and thus slept), %true is | ||
| 1938 | * returned with siglock released. | ||
| 1939 | * | ||
| 1940 | * If ptraced, this function doesn't handle stop itself. Instead, | ||
| 1941 | * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock | ||
| 1942 | * untouched. The caller must ensure that INTERRUPT trap handling takes | ||
| 1943 | * places afterwards. | ||
| 1944 | * | ||
| 1945 | * CONTEXT: | ||
| 1946 | * Must be called with @current->sighand->siglock held, which is released | ||
| 1947 | * on %true return. | ||
| 1948 | * | ||
| 1949 | * RETURNS: | ||
| 1950 | * %false if group stop is already cancelled or ptrace trap is scheduled. | ||
| 1951 | * %true if participated in group stop. | ||
| 1740 | */ | 1952 | */ |
| 1741 | static int do_signal_stop(int signr) | 1953 | static bool do_signal_stop(int signr) |
| 1954 | __releases(¤t->sighand->siglock) | ||
| 1742 | { | 1955 | { |
| 1743 | struct signal_struct *sig = current->signal; | 1956 | struct signal_struct *sig = current->signal; |
| 1744 | int notify; | ||
| 1745 | 1957 | ||
| 1746 | if (!sig->group_stop_count) { | 1958 | if (!(current->jobctl & JOBCTL_STOP_PENDING)) { |
| 1959 | unsigned int gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; | ||
| 1747 | struct task_struct *t; | 1960 | struct task_struct *t; |
| 1748 | 1961 | ||
| 1749 | if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || | 1962 | /* signr will be recorded in task->jobctl for retries */ |
| 1963 | WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); | ||
| 1964 | |||
| 1965 | if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || | ||
| 1750 | unlikely(signal_group_exit(sig))) | 1966 | unlikely(signal_group_exit(sig))) |
| 1751 | return 0; | 1967 | return false; |
| 1752 | /* | 1968 | /* |
| 1753 | * There is no group stop already in progress. | 1969 | * There is no group stop already in progress. We must |
| 1754 | * We must initiate one now. | 1970 | * initiate one now. |
| 1971 | * | ||
| 1972 | * While ptraced, a task may be resumed while group stop is | ||
| 1973 | * still in effect and then receive a stop signal and | ||
| 1974 | * initiate another group stop. This deviates from the | ||
| 1975 | * usual behavior as two consecutive stop signals can't | ||
| 1976 | * cause two group stops when !ptraced. That is why we | ||
| 1977 | * also check !task_is_stopped(t) below. | ||
| 1978 | * | ||
| 1979 | * The condition can be distinguished by testing whether | ||
| 1980 | * SIGNAL_STOP_STOPPED is already set. Don't generate | ||
| 1981 | * group_exit_code in such case. | ||
| 1982 | * | ||
| 1983 | * This is not necessary for SIGNAL_STOP_CONTINUED because | ||
| 1984 | * an intervening stop signal is required to cause two | ||
| 1985 | * continued events regardless of ptrace. | ||
| 1755 | */ | 1986 | */ |
| 1756 | sig->group_exit_code = signr; | 1987 | if (!(sig->flags & SIGNAL_STOP_STOPPED)) |
| 1988 | sig->group_exit_code = signr; | ||
| 1989 | else | ||
| 1990 | WARN_ON_ONCE(!current->ptrace); | ||
| 1991 | |||
| 1992 | sig->group_stop_count = 0; | ||
| 1757 | 1993 | ||
| 1758 | sig->group_stop_count = 1; | 1994 | if (task_set_jobctl_pending(current, signr | gstop)) |
| 1759 | for (t = next_thread(current); t != current; t = next_thread(t)) | 1995 | sig->group_stop_count++; |
| 1996 | |||
| 1997 | for (t = next_thread(current); t != current; | ||
| 1998 | t = next_thread(t)) { | ||
| 1760 | /* | 1999 | /* |
| 1761 | * Setting state to TASK_STOPPED for a group | 2000 | * Setting state to TASK_STOPPED for a group |
| 1762 | * stop is always done with the siglock held, | 2001 | * stop is always done with the siglock held, |
| 1763 | * so this check has no races. | 2002 | * so this check has no races. |
| 1764 | */ | 2003 | */ |
| 1765 | if (!(t->flags & PF_EXITING) && | 2004 | if (!task_is_stopped(t) && |
| 1766 | !task_is_stopped_or_traced(t)) { | 2005 | task_set_jobctl_pending(t, signr | gstop)) { |
| 1767 | sig->group_stop_count++; | 2006 | sig->group_stop_count++; |
| 1768 | signal_wake_up(t, 0); | 2007 | if (likely(!(t->ptrace & PT_SEIZED))) |
| 2008 | signal_wake_up(t, 0); | ||
| 2009 | else | ||
| 2010 | ptrace_trap_notify(t); | ||
| 1769 | } | 2011 | } |
| 2012 | } | ||
| 1770 | } | 2013 | } |
| 1771 | /* | 2014 | |
| 1772 | * If there are no other threads in the group, or if there is | 2015 | if (likely(!current->ptrace)) { |
| 1773 | * a group stop in progress and we are the last to stop, report | 2016 | int notify = 0; |
| 1774 | * to the parent. When ptraced, every thread reports itself. | 2017 | |
| 1775 | */ | 2018 | /* |
| 1776 | notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; | 2019 | * If there are no other threads in the group, or if there |
| 1777 | notify = tracehook_notify_jctl(notify, CLD_STOPPED); | 2020 | * is a group stop in progress and we are the last to stop, |
| 1778 | /* | 2021 | * report to the parent. |
| 1779 | * tracehook_notify_jctl() can drop and reacquire siglock, so | 2022 | */ |
| 1780 | * we keep ->group_stop_count != 0 before the call. If SIGCONT | 2023 | if (task_participate_group_stop(current)) |
| 1781 | * or SIGKILL comes in between ->group_stop_count == 0. | 2024 | notify = CLD_STOPPED; |
| 1782 | */ | 2025 | |
| 1783 | if (sig->group_stop_count) { | ||
| 1784 | if (!--sig->group_stop_count) | ||
| 1785 | sig->flags = SIGNAL_STOP_STOPPED; | ||
| 1786 | current->exit_code = sig->group_exit_code; | ||
| 1787 | __set_current_state(TASK_STOPPED); | 2026 | __set_current_state(TASK_STOPPED); |
| 1788 | } | 2027 | spin_unlock_irq(¤t->sighand->siglock); |
| 1789 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 1790 | 2028 | ||
| 1791 | if (notify) { | 2029 | /* |
| 1792 | read_lock(&tasklist_lock); | 2030 | * Notify the parent of the group stop completion. Because |
| 1793 | do_notify_parent_cldstop(current, notify); | 2031 | * we're not holding either the siglock or tasklist_lock |
| 1794 | read_unlock(&tasklist_lock); | 2032 | * here, ptracer may attach inbetween; however, this is for |
| 1795 | } | 2033 | * group stop and should always be delivered to the real |
| 2034 | * parent of the group leader. The new ptracer will get | ||
| 2035 | * its notification when this task transitions into | ||
| 2036 | * TASK_TRACED. | ||
| 2037 | */ | ||
| 2038 | if (notify) { | ||
| 2039 | read_lock(&tasklist_lock); | ||
| 2040 | do_notify_parent_cldstop(current, false, notify); | ||
| 2041 | read_unlock(&tasklist_lock); | ||
| 2042 | } | ||
| 1796 | 2043 | ||
| 1797 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ | 2044 | /* Now we don't run again until woken by SIGCONT or SIGKILL */ |
| 1798 | do { | ||
| 1799 | schedule(); | 2045 | schedule(); |
| 1800 | } while (try_to_freeze()); | 2046 | return true; |
| 1801 | 2047 | } else { | |
| 1802 | tracehook_finish_jctl(); | 2048 | /* |
| 1803 | current->exit_code = 0; | 2049 | * While ptraced, group stop is handled by STOP trap. |
| 2050 | * Schedule it and let the caller deal with it. | ||
| 2051 | */ | ||
| 2052 | task_set_jobctl_pending(current, JOBCTL_TRAP_STOP); | ||
| 2053 | return false; | ||
| 2054 | } | ||
| 2055 | } | ||
| 1804 | 2056 | ||
| 1805 | return 1; | 2057 | /** |
| 2058 | * do_jobctl_trap - take care of ptrace jobctl traps | ||
| 2059 | * | ||
| 2060 | * When PT_SEIZED, it's used for both group stop and explicit | ||
| 2061 | * SEIZE/INTERRUPT traps. Both generate PTRACE_EVENT_STOP trap with | ||
| 2062 | * accompanying siginfo. If stopped, lower eight bits of exit_code contain | ||
| 2063 | * the stop signal; otherwise, %SIGTRAP. | ||
| 2064 | * | ||
| 2065 | * When !PT_SEIZED, it's used only for group stop trap with stop signal | ||
| 2066 | * number as exit_code and no siginfo. | ||
| 2067 | * | ||
| 2068 | * CONTEXT: | ||
| 2069 | * Must be called with @current->sighand->siglock held, which may be | ||
| 2070 | * released and re-acquired before returning with intervening sleep. | ||
| 2071 | */ | ||
| 2072 | static void do_jobctl_trap(void) | ||
| 2073 | { | ||
| 2074 | struct signal_struct *signal = current->signal; | ||
| 2075 | int signr = current->jobctl & JOBCTL_STOP_SIGMASK; | ||
| 2076 | |||
| 2077 | if (current->ptrace & PT_SEIZED) { | ||
| 2078 | if (!signal->group_stop_count && | ||
| 2079 | !(signal->flags & SIGNAL_STOP_STOPPED)) | ||
| 2080 | signr = SIGTRAP; | ||
| 2081 | WARN_ON_ONCE(!signr); | ||
| 2082 | ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8), | ||
| 2083 | CLD_STOPPED); | ||
| 2084 | } else { | ||
| 2085 | WARN_ON_ONCE(!signr); | ||
| 2086 | ptrace_stop(signr, CLD_STOPPED, 0, NULL); | ||
| 2087 | current->exit_code = 0; | ||
| 2088 | } | ||
| 1806 | } | 2089 | } |
| 1807 | 2090 | ||
| 1808 | static int ptrace_signal(int signr, siginfo_t *info, | 2091 | static int ptrace_signal(int signr, siginfo_t *info, |
| 1809 | struct pt_regs *regs, void *cookie) | 2092 | struct pt_regs *regs, void *cookie) |
| 1810 | { | 2093 | { |
| 1811 | if (!task_ptrace(current)) | ||
| 1812 | return signr; | ||
| 1813 | |||
| 1814 | ptrace_signal_deliver(regs, cookie); | 2094 | ptrace_signal_deliver(regs, cookie); |
| 1815 | 2095 | /* | |
| 1816 | /* Let the debugger run. */ | 2096 | * We do not check sig_kernel_stop(signr) but set this marker |
| 1817 | ptrace_stop(signr, 0, info); | 2097 | * unconditionally because we do not know whether debugger will |
| 2098 | * change signr. This flag has no meaning unless we are going | ||
| 2099 | * to stop after return from ptrace_stop(). In this case it will | ||
| 2100 | * be checked in do_signal_stop(), we should only stop if it was | ||
| 2101 | * not cleared by SIGCONT while we were sleeping. See also the | ||
| 2102 | * comment in dequeue_signal(). | ||
| 2103 | */ | ||
| 2104 | current->jobctl |= JOBCTL_STOP_DEQUEUED; | ||
| 2105 | ptrace_stop(signr, CLD_TRAPPED, 0, info); | ||
| 1818 | 2106 | ||
| 1819 | /* We're back. Did the debugger cancel the sig? */ | 2107 | /* We're back. Did the debugger cancel the sig? */ |
| 1820 | signr = current->exit_code; | 2108 | signr = current->exit_code; |
| @@ -1869,54 +2157,63 @@ relock: | |||
| 1869 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. | 2157 | * the CLD_ si_code into SIGNAL_CLD_MASK bits. |
| 1870 | */ | 2158 | */ |
| 1871 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { | 2159 | if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { |
| 1872 | int why = (signal->flags & SIGNAL_STOP_CONTINUED) | 2160 | int why; |
| 1873 | ? CLD_CONTINUED : CLD_STOPPED; | 2161 | |
| 2162 | if (signal->flags & SIGNAL_CLD_CONTINUED) | ||
| 2163 | why = CLD_CONTINUED; | ||
| 2164 | else | ||
| 2165 | why = CLD_STOPPED; | ||
| 2166 | |||
| 1874 | signal->flags &= ~SIGNAL_CLD_MASK; | 2167 | signal->flags &= ~SIGNAL_CLD_MASK; |
| 1875 | 2168 | ||
| 1876 | why = tracehook_notify_jctl(why, CLD_CONTINUED); | ||
| 1877 | spin_unlock_irq(&sighand->siglock); | 2169 | spin_unlock_irq(&sighand->siglock); |
| 1878 | 2170 | ||
| 1879 | if (why) { | 2171 | /* |
| 1880 | read_lock(&tasklist_lock); | 2173 | * always per-process and doesn't make a whole lot of sense |
| 1881 | do_notify_parent_cldstop(current->group_leader, why); | 2173 | * always per-process and doesn't make whole lot of sense |
| 1882 | read_unlock(&tasklist_lock); | 2174 | * for ptracers, who shouldn't consume the state via |
| 1883 | } | 2175 | * wait(2) either, but, for backward compatibility, notify |
| 2176 | * the ptracer of the group leader too unless it's gonna be | ||
| 2177 | * a duplicate. | ||
| 2178 | */ | ||
| 2179 | read_lock(&tasklist_lock); | ||
| 2180 | do_notify_parent_cldstop(current, false, why); | ||
| 2181 | |||
| 2182 | if (ptrace_reparented(current->group_leader)) | ||
| 2183 | do_notify_parent_cldstop(current->group_leader, | ||
| 2184 | true, why); | ||
| 2185 | read_unlock(&tasklist_lock); | ||
| 2186 | |||
| 1884 | goto relock; | 2187 | goto relock; |
| 1885 | } | 2188 | } |
| 1886 | 2189 | ||
| 1887 | for (;;) { | 2190 | for (;;) { |
| 1888 | struct k_sigaction *ka; | 2191 | struct k_sigaction *ka; |
| 1889 | /* | 2192 | |
| 1890 | * Tracing can induce an artificial signal and choose sigaction. | 2193 | if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && |
| 1891 | * The return value in @signr determines the default action, | 2194 | do_signal_stop(0)) |
| 1892 | * but @info->si_signo is the signal number we will report. | ||
| 1893 | */ | ||
| 1894 | signr = tracehook_get_signal(current, regs, info, return_ka); | ||
| 1895 | if (unlikely(signr < 0)) | ||
| 1896 | goto relock; | 2195 | goto relock; |
| 1897 | if (unlikely(signr != 0)) | ||
| 1898 | ka = return_ka; | ||
| 1899 | else { | ||
| 1900 | if (unlikely(signal->group_stop_count > 0) && | ||
| 1901 | do_signal_stop(0)) | ||
| 1902 | goto relock; | ||
| 1903 | 2196 | ||
| 1904 | signr = dequeue_signal(current, ¤t->blocked, | 2197 | if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) { |
| 1905 | info); | 2198 | do_jobctl_trap(); |
| 2199 | spin_unlock_irq(&sighand->siglock); | ||
| 2200 | goto relock; | ||
| 2201 | } | ||
| 1906 | 2202 | ||
| 1907 | if (!signr) | 2203 | signr = dequeue_signal(current, ¤t->blocked, info); |
| 1908 | break; /* will return 0 */ | ||
| 1909 | 2204 | ||
| 1910 | if (signr != SIGKILL) { | 2205 | if (!signr) |
| 1911 | signr = ptrace_signal(signr, info, | 2206 | break; /* will return 0 */ |
| 1912 | regs, cookie); | ||
| 1913 | if (!signr) | ||
| 1914 | continue; | ||
| 1915 | } | ||
| 1916 | 2207 | ||
| 1917 | ka = &sighand->action[signr-1]; | 2208 | if (unlikely(current->ptrace) && signr != SIGKILL) { |
| 2209 | signr = ptrace_signal(signr, info, | ||
| 2210 | regs, cookie); | ||
| 2211 | if (!signr) | ||
| 2212 | continue; | ||
| 1918 | } | 2213 | } |
| 1919 | 2214 | ||
| 2215 | ka = &sighand->action[signr-1]; | ||
| 2216 | |||
| 1920 | /* Trace actually delivered signals. */ | 2217 | /* Trace actually delivered signals. */ |
| 1921 | trace_signal_deliver(signr, info, ka); | 2218 | trace_signal_deliver(signr, info, ka); |
| 1922 | 2219 | ||
| @@ -2017,10 +2314,42 @@ relock: | |||
| 2017 | return signr; | 2314 | return signr; |
| 2018 | } | 2315 | } |
| 2019 | 2316 | ||
| 2317 | /* | ||
| 2318 | * It could be that complete_signal() picked us to notify about the | ||
| 2319 | * group-wide signal. Other threads should be notified now to take | ||
| 2320 | * the shared signals in @which since we will not. | ||
| 2321 | */ | ||
| 2322 | static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) | ||
| 2323 | { | ||
| 2324 | sigset_t retarget; | ||
| 2325 | struct task_struct *t; | ||
| 2326 | |||
| 2327 | sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); | ||
| 2328 | if (sigisemptyset(&retarget)) | ||
| 2329 | return; | ||
| 2330 | |||
| 2331 | t = tsk; | ||
| 2332 | while_each_thread(tsk, t) { | ||
| 2333 | if (t->flags & PF_EXITING) | ||
| 2334 | continue; | ||
| 2335 | |||
| 2336 | if (!has_pending_signals(&retarget, &t->blocked)) | ||
| 2337 | continue; | ||
| 2338 | /* Remove the signals this thread can handle. */ | ||
| 2339 | sigandsets(&retarget, &retarget, &t->blocked); | ||
| 2340 | |||
| 2341 | if (!signal_pending(t)) | ||
| 2342 | signal_wake_up(t, 0); | ||
| 2343 | |||
| 2344 | if (sigisemptyset(&retarget)) | ||
| 2345 | break; | ||
| 2346 | } | ||
| 2347 | } | ||
| 2348 | |||
| 2020 | void exit_signals(struct task_struct *tsk) | 2349 | void exit_signals(struct task_struct *tsk) |
| 2021 | { | 2350 | { |
| 2022 | int group_stop = 0; | 2351 | int group_stop = 0; |
| 2023 | struct task_struct *t; | 2352 | sigset_t unblocked; |
| 2024 | 2353 | ||
| 2025 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { | 2354 | if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { |
| 2026 | tsk->flags |= PF_EXITING; | 2355 | tsk->flags |= PF_EXITING; |
| @@ -2036,26 +2365,23 @@ void exit_signals(struct task_struct *tsk) | |||
| 2036 | if (!signal_pending(tsk)) | 2365 | if (!signal_pending(tsk)) |
| 2037 | goto out; | 2366 | goto out; |
| 2038 | 2367 | ||
| 2039 | /* | 2368 | unblocked = tsk->blocked; |
| 2040 | * It could be that __group_complete_signal() choose us to | 2369 | signotset(&unblocked); |
| 2041 | * notify about group-wide signal. Another thread should be | 2370 | retarget_shared_pending(tsk, &unblocked); |
| 2042 | * woken now to take the signal since we will not. | ||
| 2043 | */ | ||
| 2044 | for (t = tsk; (t = next_thread(t)) != tsk; ) | ||
| 2045 | if (!signal_pending(t) && !(t->flags & PF_EXITING)) | ||
| 2046 | recalc_sigpending_and_wake(t); | ||
| 2047 | 2371 | ||
| 2048 | if (unlikely(tsk->signal->group_stop_count) && | 2372 | if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) && |
| 2049 | !--tsk->signal->group_stop_count) { | 2373 | task_participate_group_stop(tsk)) |
| 2050 | tsk->signal->flags = SIGNAL_STOP_STOPPED; | 2374 | group_stop = CLD_STOPPED; |
| 2051 | group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); | ||
| 2052 | } | ||
| 2053 | out: | 2375 | out: |
| 2054 | spin_unlock_irq(&tsk->sighand->siglock); | 2376 | spin_unlock_irq(&tsk->sighand->siglock); |
| 2055 | 2377 | ||
| 2378 | /* | ||
| 2379 | * If group stop has completed, deliver the notification. This | ||
| 2380 | * should always go to the real parent of the group leader. | ||
| 2381 | */ | ||
| 2056 | if (unlikely(group_stop)) { | 2382 | if (unlikely(group_stop)) { |
| 2057 | read_lock(&tasklist_lock); | 2383 | read_lock(&tasklist_lock); |
| 2058 | do_notify_parent_cldstop(tsk, group_stop); | 2384 | do_notify_parent_cldstop(tsk, false, group_stop); |
| 2059 | read_unlock(&tasklist_lock); | 2385 | read_unlock(&tasklist_lock); |
| 2060 | } | 2386 | } |
| 2061 | } | 2387 | } |
| @@ -2089,11 +2415,33 @@ long do_no_restart_syscall(struct restart_block *param) | |||
| 2089 | return -EINTR; | 2415 | return -EINTR; |
| 2090 | } | 2416 | } |
| 2091 | 2417 | ||
| 2092 | /* | 2418 | static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) |
| 2093 | * We don't need to get the kernel lock - this is all local to this | 2419 | { |
| 2094 | * particular thread.. (and that's good, because this is _heavily_ | 2420 | if (signal_pending(tsk) && !thread_group_empty(tsk)) { |
| 2095 | * used by various programs) | 2421 | sigset_t newblocked; |
| 2422 | /* A set of now blocked but previously unblocked signals. */ | ||
| 2423 | sigandnsets(&newblocked, newset, ¤t->blocked); | ||
| 2424 | retarget_shared_pending(tsk, &newblocked); | ||
| 2425 | } | ||
| 2426 | tsk->blocked = *newset; | ||
| 2427 | recalc_sigpending(); | ||
| 2428 | } | ||
| 2429 | |||
| 2430 | /** | ||
| 2431 | * set_current_blocked - change current->blocked mask | ||
| 2432 | * @newset: new mask | ||
| 2433 | * | ||
| 2434 | * It is wrong to change ->blocked directly, this helper should be used | ||
| 2435 | * to ensure the process can't miss a shared signal we are going to block. | ||
| 2096 | */ | 2436 | */ |
| 2437 | void set_current_blocked(const sigset_t *newset) | ||
| 2438 | { | ||
| 2439 | struct task_struct *tsk = current; | ||
| 2440 | |||
| 2441 | spin_lock_irq(&tsk->sighand->siglock); | ||
| 2442 | __set_task_blocked(tsk, newset); | ||
| 2443 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 2444 | } | ||
| 2097 | 2445 | ||
| 2098 | /* | 2446 | /* |
| 2099 | * This is also useful for kernel threads that want to temporarily | 2447 | * This is also useful for kernel threads that want to temporarily |
| @@ -2105,73 +2453,66 @@ long do_no_restart_syscall(struct restart_block *param) | |||
| 2105 | */ | 2453 | */ |
| 2106 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) | 2454 | int sigprocmask(int how, sigset_t *set, sigset_t *oldset) |
| 2107 | { | 2455 | { |
| 2108 | int error; | 2456 | struct task_struct *tsk = current; |
| 2457 | sigset_t newset; | ||
| 2109 | 2458 | ||
| 2110 | spin_lock_irq(¤t->sighand->siglock); | 2459 | /* Lockless, only current can change ->blocked, never from irq */ |
| 2111 | if (oldset) | 2460 | if (oldset) |
| 2112 | *oldset = current->blocked; | 2461 | *oldset = tsk->blocked; |
| 2113 | 2462 | ||
| 2114 | error = 0; | ||
| 2115 | switch (how) { | 2463 | switch (how) { |
| 2116 | case SIG_BLOCK: | 2464 | case SIG_BLOCK: |
| 2117 | sigorsets(¤t->blocked, ¤t->blocked, set); | 2465 | sigorsets(&newset, &tsk->blocked, set); |
| 2118 | break; | 2466 | break; |
| 2119 | case SIG_UNBLOCK: | 2467 | case SIG_UNBLOCK: |
| 2120 | signandsets(¤t->blocked, ¤t->blocked, set); | 2468 | sigandnsets(&newset, &tsk->blocked, set); |
| 2121 | break; | 2469 | break; |
| 2122 | case SIG_SETMASK: | 2470 | case SIG_SETMASK: |
| 2123 | current->blocked = *set; | 2471 | newset = *set; |
| 2124 | break; | 2472 | break; |
| 2125 | default: | 2473 | default: |
| 2126 | error = -EINVAL; | 2474 | return -EINVAL; |
| 2127 | } | 2475 | } |
| 2128 | recalc_sigpending(); | ||
| 2129 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 2130 | 2476 | ||
| 2131 | return error; | 2477 | set_current_blocked(&newset); |
| 2478 | return 0; | ||
| 2132 | } | 2479 | } |
| 2133 | 2480 | ||
| 2134 | /** | 2481 | /** |
| 2135 | * sys_rt_sigprocmask - change the list of currently blocked signals | 2482 | * sys_rt_sigprocmask - change the list of currently blocked signals |
| 2136 | * @how: whether to add, remove, or set signals | 2483 | * @how: whether to add, remove, or set signals |
| 2137 | * @set: stores pending signals | 2484 | * @nset: stores pending signals |
| 2138 | * @oset: previous value of signal mask if non-null | 2485 | * @oset: previous value of signal mask if non-null |
| 2139 | * @sigsetsize: size of sigset_t type | 2486 | * @sigsetsize: size of sigset_t type |
| 2140 | */ | 2487 | */ |
| 2141 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, | 2488 | SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, |
| 2142 | sigset_t __user *, oset, size_t, sigsetsize) | 2489 | sigset_t __user *, oset, size_t, sigsetsize) |
| 2143 | { | 2490 | { |
| 2144 | int error = -EINVAL; | ||
| 2145 | sigset_t old_set, new_set; | 2491 | sigset_t old_set, new_set; |
| 2492 | int error; | ||
| 2146 | 2493 | ||
| 2147 | /* XXX: Don't preclude handling different sized sigset_t's. */ | 2494 | /* XXX: Don't preclude handling different sized sigset_t's. */ |
| 2148 | if (sigsetsize != sizeof(sigset_t)) | 2495 | if (sigsetsize != sizeof(sigset_t)) |
| 2149 | goto out; | 2496 | return -EINVAL; |
| 2150 | 2497 | ||
| 2151 | if (set) { | 2498 | old_set = current->blocked; |
| 2152 | error = -EFAULT; | 2499 | |
| 2153 | if (copy_from_user(&new_set, set, sizeof(*set))) | 2500 | if (nset) { |
| 2154 | goto out; | 2501 | if (copy_from_user(&new_set, nset, sizeof(sigset_t))) |
| 2502 | return -EFAULT; | ||
| 2155 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 2503 | sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
| 2156 | 2504 | ||
| 2157 | error = sigprocmask(how, &new_set, &old_set); | 2505 | error = sigprocmask(how, &new_set, NULL); |
| 2158 | if (error) | 2506 | if (error) |
| 2159 | goto out; | 2507 | return error; |
| 2160 | if (oset) | 2508 | } |
| 2161 | goto set_old; | ||
| 2162 | } else if (oset) { | ||
| 2163 | spin_lock_irq(¤t->sighand->siglock); | ||
| 2164 | old_set = current->blocked; | ||
| 2165 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 2166 | 2509 | ||
| 2167 | set_old: | 2510 | if (oset) { |
| 2168 | error = -EFAULT; | 2511 | if (copy_to_user(oset, &old_set, sizeof(sigset_t))) |
| 2169 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | 2512 | return -EFAULT; |
| 2170 | goto out; | ||
| 2171 | } | 2513 | } |
| 2172 | error = 0; | 2514 | |
| 2173 | out: | 2515 | return 0; |
| 2174 | return error; | ||
| 2175 | } | 2516 | } |
| 2176 | 2517 | ||
| 2177 | long do_sigpending(void __user *set, unsigned long sigsetsize) | 2518 | long do_sigpending(void __user *set, unsigned long sigsetsize) |
| @@ -2284,6 +2625,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) | |||
| 2284 | #endif | 2625 | #endif |
| 2285 | 2626 | ||
| 2286 | /** | 2627 | /** |
| 2628 | * do_sigtimedwait - wait for queued signals specified in @which | ||
| 2629 | * @which: queued signals to wait for | ||
| 2630 | * @info: if non-null, the signal's siginfo is returned here | ||
| 2631 | * @ts: upper bound on process time suspension | ||
| 2632 | */ | ||
| 2633 | int do_sigtimedwait(const sigset_t *which, siginfo_t *info, | ||
| 2634 | const struct timespec *ts) | ||
| 2635 | { | ||
| 2636 | struct task_struct *tsk = current; | ||
| 2637 | long timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 2638 | sigset_t mask = *which; | ||
| 2639 | int sig; | ||
| 2640 | |||
| 2641 | if (ts) { | ||
| 2642 | if (!timespec_valid(ts)) | ||
| 2643 | return -EINVAL; | ||
| 2644 | timeout = timespec_to_jiffies(ts); | ||
| 2645 | /* | ||
| 2646 | * We can be close to the next tick, add another one | ||
| 2647 | * to ensure we will wait at least the time asked for. | ||
| 2648 | */ | ||
| 2649 | if (ts->tv_sec || ts->tv_nsec) | ||
| 2650 | timeout++; | ||
| 2651 | } | ||
| 2652 | |||
| 2653 | /* | ||
| 2654 | * Invert the set of allowed signals to get those we want to block. | ||
| 2655 | */ | ||
| 2656 | sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); | ||
| 2657 | signotset(&mask); | ||
| 2658 | |||
| 2659 | spin_lock_irq(&tsk->sighand->siglock); | ||
| 2660 | sig = dequeue_signal(tsk, &mask, info); | ||
| 2661 | if (!sig && timeout) { | ||
| 2662 | /* | ||
| 2663 | * None ready, temporarily unblock those we're interested in | ||
| 2664 | * while we are sleeping so that we'll be awakened when | ||
| 2665 | * they arrive. Unblocking is always fine, we can avoid | ||
| 2666 | * set_current_blocked(). | ||
| 2667 | */ | ||
| 2668 | tsk->real_blocked = tsk->blocked; | ||
| 2669 | sigandsets(&tsk->blocked, &tsk->blocked, &mask); | ||
| 2670 | recalc_sigpending(); | ||
| 2671 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 2672 | |||
| 2673 | timeout = schedule_timeout_interruptible(timeout); | ||
| 2674 | |||
| 2675 | spin_lock_irq(&tsk->sighand->siglock); | ||
| 2676 | __set_task_blocked(tsk, &tsk->real_blocked); | ||
| 2677 | siginitset(&tsk->real_blocked, 0); | ||
| 2678 | sig = dequeue_signal(tsk, &mask, info); | ||
| 2679 | } | ||
| 2680 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 2681 | |||
| 2682 | if (sig) | ||
| 2683 | return sig; | ||
| 2684 | return timeout ? -EINTR : -EAGAIN; | ||
| 2685 | } | ||
| 2686 | |||
| 2687 | /** | ||
| 2287 | * sys_rt_sigtimedwait - synchronously wait for queued signals specified | 2688 | * sys_rt_sigtimedwait - synchronously wait for queued signals specified |
| 2288 | * in @uthese | 2689 | * in @uthese |
| 2289 | * @uthese: queued signals to wait for | 2690 | * @uthese: queued signals to wait for |
| @@ -2295,11 +2696,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2295 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, | 2696 | siginfo_t __user *, uinfo, const struct timespec __user *, uts, |
| 2296 | size_t, sigsetsize) | 2697 | size_t, sigsetsize) |
| 2297 | { | 2698 | { |
| 2298 | int ret, sig; | ||
| 2299 | sigset_t these; | 2699 | sigset_t these; |
| 2300 | struct timespec ts; | 2700 | struct timespec ts; |
| 2301 | siginfo_t info; | 2701 | siginfo_t info; |
| 2302 | long timeout = 0; | 2702 | int ret; |
| 2303 | 2703 | ||
| 2304 | /* XXX: Don't preclude handling different sized sigset_t's. */ | 2704 | /* XXX: Don't preclude handling different sized sigset_t's. */ |
| 2305 | if (sigsetsize != sizeof(sigset_t)) | 2705 | if (sigsetsize != sizeof(sigset_t)) |
| @@ -2308,61 +2708,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, | |||
| 2308 | if (copy_from_user(&these, uthese, sizeof(these))) | 2708 | if (copy_from_user(&these, uthese, sizeof(these))) |
| 2309 | return -EFAULT; | 2709 | return -EFAULT; |
| 2310 | 2710 | ||
| 2311 | /* | ||
| 2312 | * Invert the set of allowed signals to get those we | ||
| 2313 | * want to block. | ||
| 2314 | */ | ||
| 2315 | sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); | ||
| 2316 | signotset(&these); | ||
| 2317 | |||
| 2318 | if (uts) { | 2711 | if (uts) { |
| 2319 | if (copy_from_user(&ts, uts, sizeof(ts))) | 2712 | if (copy_from_user(&ts, uts, sizeof(ts))) |
| 2320 | return -EFAULT; | 2713 | return -EFAULT; |
| 2321 | if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 | ||
| 2322 | || ts.tv_sec < 0) | ||
| 2323 | return -EINVAL; | ||
| 2324 | } | 2714 | } |
| 2325 | 2715 | ||
| 2326 | spin_lock_irq(¤t->sighand->siglock); | 2716 | ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); |
| 2327 | sig = dequeue_signal(current, &these, &info); | ||
| 2328 | if (!sig) { | ||
| 2329 | timeout = MAX_SCHEDULE_TIMEOUT; | ||
| 2330 | if (uts) | ||
| 2331 | timeout = (timespec_to_jiffies(&ts) | ||
| 2332 | + (ts.tv_sec || ts.tv_nsec)); | ||
| 2333 | |||
| 2334 | if (timeout) { | ||
| 2335 | /* | ||
| 2336 | * None ready -- temporarily unblock those we're | ||
| 2337 | * interested while we are sleeping in so that we'll | ||
| 2338 | * be awakened when they arrive. | ||
| 2339 | */ | ||
| 2340 | current->real_blocked = current->blocked; | ||
| 2341 | sigandsets(¤t->blocked, ¤t->blocked, &these); | ||
| 2342 | recalc_sigpending(); | ||
| 2343 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 2344 | |||
| 2345 | timeout = schedule_timeout_interruptible(timeout); | ||
| 2346 | |||
| 2347 | spin_lock_irq(¤t->sighand->siglock); | ||
| 2348 | sig = dequeue_signal(current, &these, &info); | ||
| 2349 | current->blocked = current->real_blocked; | ||
| 2350 | siginitset(¤t->real_blocked, 0); | ||
| 2351 | recalc_sigpending(); | ||
| 2352 | } | ||
| 2353 | } | ||
| 2354 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 2355 | 2717 | ||
| 2356 | if (sig) { | 2718 | if (ret > 0 && uinfo) { |
| 2357 | ret = sig; | 2719 | if (copy_siginfo_to_user(uinfo, &info)) |
| 2358 | if (uinfo) { | 2720 | ret = -EFAULT; |
| 2359 | if (copy_siginfo_to_user(uinfo, &info)) | ||
| 2360 | ret = -EFAULT; | ||
| 2361 | } | ||
| 2362 | } else { | ||
| 2363 | ret = -EAGAIN; | ||
| 2364 | if (timeout) | ||
| 2365 | ret = -EINTR; | ||
| 2366 | } | 2721 | } |
| 2367 | 2722 | ||
| 2368 | return ret; | 2723 | return ret; |
| @@ -2650,60 +3005,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) | |||
| 2650 | /** | 3005 | /** |
| 2651 | * sys_sigprocmask - examine and change blocked signals | 3006 | * sys_sigprocmask - examine and change blocked signals |
| 2652 | * @how: whether to add, remove, or set signals | 3007 | * @how: whether to add, remove, or set signals |
| 2653 | * @set: signals to add or remove (if non-null) | 3008 | * @nset: signals to add or remove (if non-null) |
| 2654 | * @oset: previous value of signal mask if non-null | 3009 | * @oset: previous value of signal mask if non-null |
| 2655 | * | 3010 | * |
| 2656 | * Some platforms have their own version with special arguments; | 3011 | * Some platforms have their own version with special arguments; |
| 2657 | * others support only sys_rt_sigprocmask. | 3012 | * others support only sys_rt_sigprocmask. |
| 2658 | */ | 3013 | */ |
| 2659 | 3014 | ||
| 2660 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, | 3015 | SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, |
| 2661 | old_sigset_t __user *, oset) | 3016 | old_sigset_t __user *, oset) |
| 2662 | { | 3017 | { |
| 2663 | int error; | ||
| 2664 | old_sigset_t old_set, new_set; | 3018 | old_sigset_t old_set, new_set; |
| 3019 | sigset_t new_blocked; | ||
| 2665 | 3020 | ||
| 2666 | if (set) { | 3021 | old_set = current->blocked.sig[0]; |
| 2667 | error = -EFAULT; | 3022 | |
| 2668 | if (copy_from_user(&new_set, set, sizeof(*set))) | 3023 | if (nset) { |
| 2669 | goto out; | 3024 | if (copy_from_user(&new_set, nset, sizeof(*nset))) |
| 3025 | return -EFAULT; | ||
| 2670 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); | 3026 | new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); |
| 2671 | 3027 | ||
| 2672 | spin_lock_irq(¤t->sighand->siglock); | 3028 | new_blocked = current->blocked; |
| 2673 | old_set = current->blocked.sig[0]; | ||
| 2674 | 3029 | ||
| 2675 | error = 0; | ||
| 2676 | switch (how) { | 3030 | switch (how) { |
| 2677 | default: | ||
| 2678 | error = -EINVAL; | ||
| 2679 | break; | ||
| 2680 | case SIG_BLOCK: | 3031 | case SIG_BLOCK: |
| 2681 | sigaddsetmask(¤t->blocked, new_set); | 3032 | sigaddsetmask(&new_blocked, new_set); |
| 2682 | break; | 3033 | break; |
| 2683 | case SIG_UNBLOCK: | 3034 | case SIG_UNBLOCK: |
| 2684 | sigdelsetmask(¤t->blocked, new_set); | 3035 | sigdelsetmask(&new_blocked, new_set); |
| 2685 | break; | 3036 | break; |
| 2686 | case SIG_SETMASK: | 3037 | case SIG_SETMASK: |
| 2687 | current->blocked.sig[0] = new_set; | 3038 | new_blocked.sig[0] = new_set; |
| 2688 | break; | 3039 | break; |
| 3040 | default: | ||
| 3041 | return -EINVAL; | ||
| 2689 | } | 3042 | } |
| 2690 | 3043 | ||
| 2691 | recalc_sigpending(); | 3044 | set_current_blocked(&new_blocked); |
| 2692 | spin_unlock_irq(¤t->sighand->siglock); | 3045 | } |
| 2693 | if (error) | 3046 | |
| 2694 | goto out; | 3047 | if (oset) { |
| 2695 | if (oset) | ||
| 2696 | goto set_old; | ||
| 2697 | } else if (oset) { | ||
| 2698 | old_set = current->blocked.sig[0]; | ||
| 2699 | set_old: | ||
| 2700 | error = -EFAULT; | ||
| 2701 | if (copy_to_user(oset, &old_set, sizeof(*oset))) | 3048 | if (copy_to_user(oset, &old_set, sizeof(*oset))) |
| 2702 | goto out; | 3049 | return -EFAULT; |
| 2703 | } | 3050 | } |
| 2704 | error = 0; | 3051 | |
| 2705 | out: | 3052 | return 0; |
| 2706 | return error; | ||
| 2707 | } | 3053 | } |
| 2708 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ | 3054 | #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ |
| 2709 | 3055 | ||
| @@ -2756,15 +3102,11 @@ SYSCALL_DEFINE0(sgetmask) | |||
| 2756 | 3102 | ||
| 2757 | SYSCALL_DEFINE1(ssetmask, int, newmask) | 3103 | SYSCALL_DEFINE1(ssetmask, int, newmask) |
| 2758 | { | 3104 | { |
| 2759 | int old; | 3105 | int old = current->blocked.sig[0]; |
| 2760 | 3106 | sigset_t newset; | |
| 2761 | spin_lock_irq(¤t->sighand->siglock); | ||
| 2762 | old = current->blocked.sig[0]; | ||
| 2763 | 3107 | ||
| 2764 | siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| | 3108 | siginitset(&newset, newmask & ~(sigmask(SIGKILL) | sigmask(SIGSTOP))); |
| 2765 | sigmask(SIGSTOP))); | 3109 | set_current_blocked(&newset); |
| 2766 | recalc_sigpending(); | ||
| 2767 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 2768 | 3110 | ||
| 2769 | return old; | 3111 | return old; |
| 2770 | } | 3112 | } |
| @@ -2793,8 +3135,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) | |||
| 2793 | 3135 | ||
| 2794 | SYSCALL_DEFINE0(pause) | 3136 | SYSCALL_DEFINE0(pause) |
| 2795 | { | 3137 | { |
| 2796 | current->state = TASK_INTERRUPTIBLE; | 3138 | while (!signal_pending(current)) { |
| 2797 | schedule(); | 3139 | current->state = TASK_INTERRUPTIBLE; |
| 3140 | schedule(); | ||
| 3141 | } | ||
| 2798 | return -ERESTARTNOHAND; | 3142 | return -ERESTARTNOHAND; |
| 2799 | } | 3143 | } |
| 2800 | 3144 | ||
| @@ -2819,11 +3163,8 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) | |||
| 2819 | return -EFAULT; | 3163 | return -EFAULT; |
| 2820 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); | 3164 | sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); |
| 2821 | 3165 | ||
| 2822 | spin_lock_irq(¤t->sighand->siglock); | ||
| 2823 | current->saved_sigmask = current->blocked; | 3166 | current->saved_sigmask = current->blocked; |
| 2824 | current->blocked = newset; | 3167 | set_current_blocked(&newset); |
| 2825 | recalc_sigpending(); | ||
| 2826 | spin_unlock_irq(¤t->sighand->siglock); | ||
| 2827 | 3168 | ||
| 2828 | current->state = TASK_INTERRUPTIBLE; | 3169 | current->state = TASK_INTERRUPTIBLE; |
| 2829 | schedule(); | 3170 | schedule(); |
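The signal.c hunks above all replace the open-coded spin_lock_irq(&current->sighand->siglock) / recalc_sigpending() sequence with the set_current_blocked() helper. As a rough sketch of the consolidated pattern (the caller below is hypothetical and not part of this diff), kernel code that wants to block a couple of signals for the current task would now do:

	sigset_t newset;

	/* set_current_blocked() takes the siglock and recalculates the
	 * pending state itself, so the caller only builds the mask. */
	newset = current->blocked;
	sigaddsetmask(&newset, sigmask(SIGUSR1) | sigmask(SIGUSR2));
	set_current_blocked(&newset);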
diff --git a/kernel/smp.c b/kernel/smp.c index 73a195193558..fb67dfa8394e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -74,7 +74,7 @@ static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { | |||
| 74 | .notifier_call = hotplug_cfd, | 74 | .notifier_call = hotplug_cfd, |
| 75 | }; | 75 | }; |
| 76 | 76 | ||
| 77 | static int __cpuinit init_call_single_data(void) | 77 | void __init call_function_init(void) |
| 78 | { | 78 | { |
| 79 | void *cpu = (void *)(long)smp_processor_id(); | 79 | void *cpu = (void *)(long)smp_processor_id(); |
| 80 | int i; | 80 | int i; |
| @@ -88,10 +88,7 @@ static int __cpuinit init_call_single_data(void) | |||
| 88 | 88 | ||
| 89 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); | 89 | hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); |
| 90 | register_cpu_notifier(&hotplug_cfd_notifier); | 90 | register_cpu_notifier(&hotplug_cfd_notifier); |
| 91 | |||
| 92 | return 0; | ||
| 93 | } | 91 | } |
| 94 | early_initcall(init_call_single_data); | ||
| 95 | 92 | ||
| 96 | /* | 93 | /* |
| 97 | * csd_lock/csd_unlock used to serialize access to per-cpu csd resources | 94 | * csd_lock/csd_unlock used to serialize access to per-cpu csd resources |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 174f976c2874..fca82c32042b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -58,7 +58,7 @@ DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | |||
| 58 | 58 | ||
| 59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
| 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
| 61 | "TASKLET", "SCHED", "HRTIMER", "RCU" | 61 | "TASKLET", "SCHED", "HRTIMER", "RCU" |
| 62 | }; | 62 | }; |
| 63 | 63 | ||
| 64 | /* | 64 | /* |
| @@ -315,16 +315,24 @@ static inline void invoke_softirq(void) | |||
| 315 | { | 315 | { |
| 316 | if (!force_irqthreads) | 316 | if (!force_irqthreads) |
| 317 | __do_softirq(); | 317 | __do_softirq(); |
| 318 | else | 318 | else { |
| 319 | __local_bh_disable((unsigned long)__builtin_return_address(0), | ||
| 320 | SOFTIRQ_OFFSET); | ||
| 319 | wakeup_softirqd(); | 321 | wakeup_softirqd(); |
| 322 | __local_bh_enable(SOFTIRQ_OFFSET); | ||
| 323 | } | ||
| 320 | } | 324 | } |
| 321 | #else | 325 | #else |
| 322 | static inline void invoke_softirq(void) | 326 | static inline void invoke_softirq(void) |
| 323 | { | 327 | { |
| 324 | if (!force_irqthreads) | 328 | if (!force_irqthreads) |
| 325 | do_softirq(); | 329 | do_softirq(); |
| 326 | else | 330 | else { |
| 331 | __local_bh_disable((unsigned long)__builtin_return_address(0), | ||
| 332 | SOFTIRQ_OFFSET); | ||
| 327 | wakeup_softirqd(); | 333 | wakeup_softirqd(); |
| 334 | __local_bh_enable(SOFTIRQ_OFFSET); | ||
| 335 | } | ||
| 328 | } | 336 | } |
| 329 | #endif | 337 | #endif |
| 330 | 338 | ||
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index eb212f8f8bc8..d20c6983aad9 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
| @@ -26,12 +26,18 @@ void print_stack_trace(struct stack_trace *trace, int spaces) | |||
| 26 | EXPORT_SYMBOL_GPL(print_stack_trace); | 26 | EXPORT_SYMBOL_GPL(print_stack_trace); |
| 27 | 27 | ||
| 28 | /* | 28 | /* |
| 29 | * Architectures that do not implement save_stack_trace_tsk get this | 29 | * Architectures that do not implement save_stack_trace_tsk or |
| 30 | * weak alias and a once-per-bootup warning (whenever this facility | 30 | * save_stack_trace_regs get this weak alias and a once-per-bootup warning |
| 31 | * is utilized - for example by procfs): | 31 | * (whenever this facility is utilized - for example by procfs): |
| 32 | */ | 32 | */ |
| 33 | __weak void | 33 | __weak void |
| 34 | save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) | 34 | save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) |
| 35 | { | 35 | { |
| 36 | WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); | 36 | WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); |
| 37 | } | 37 | } |
| 38 | |||
| 39 | __weak void | ||
| 40 | save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) | ||
| 41 | { | ||
| 42 | WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); | ||
| 43 | } | ||
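The new save_stack_trace_regs() weak stub is meant to be overridden by architectures that can walk a stack starting from a pt_regs snapshot. A hedged sketch of such an override (the frame-walking part is architecture specific and only hinted at here):

	void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
	{
		/* Record the interrupted PC first, then walk the frames. */
		if (trace->nr_entries < trace->max_entries)
			trace->entries[trace->nr_entries++] = instruction_pointer(regs);
		/* ... arch-specific unwind from regs (frame/stack pointer) ... */
		if (trace->nr_entries < trace->max_entries)
			trace->entries[trace->nr_entries] = ULONG_MAX;
	}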
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e3516b29076c..ba5070ce5765 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
| 20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
| 21 | 21 | ||
| 22 | #include <asm/atomic.h> | 22 | #include <linux/atomic.h> |
| 23 | 23 | ||
| 24 | /* | 24 | /* |
| 25 | * Structure to determine completion condition and record errors. May | 25 | * Structure to determine completion condition and record errors. May |
| @@ -136,10 +136,11 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, | |||
| 136 | static DEFINE_MUTEX(stop_cpus_mutex); | 136 | static DEFINE_MUTEX(stop_cpus_mutex); |
| 137 | static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); | 137 | static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); |
| 138 | 138 | ||
| 139 | int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | 139 | static void queue_stop_cpus_work(const struct cpumask *cpumask, |
| 140 | cpu_stop_fn_t fn, void *arg, | ||
| 141 | struct cpu_stop_done *done) | ||
| 140 | { | 142 | { |
| 141 | struct cpu_stop_work *work; | 143 | struct cpu_stop_work *work; |
| 142 | struct cpu_stop_done done; | ||
| 143 | unsigned int cpu; | 144 | unsigned int cpu; |
| 144 | 145 | ||
| 145 | /* initialize works and done */ | 146 | /* initialize works and done */ |
| @@ -147,9 +148,8 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |||
| 147 | work = &per_cpu(stop_cpus_work, cpu); | 148 | work = &per_cpu(stop_cpus_work, cpu); |
| 148 | work->fn = fn; | 149 | work->fn = fn; |
| 149 | work->arg = arg; | 150 | work->arg = arg; |
| 150 | work->done = &done; | 151 | work->done = done; |
| 151 | } | 152 | } |
| 152 | cpu_stop_init_done(&done, cpumask_weight(cpumask)); | ||
| 153 | 153 | ||
| 154 | /* | 154 | /* |
| 155 | * Disable preemption while queueing to avoid getting | 155 | * Disable preemption while queueing to avoid getting |
| @@ -161,7 +161,15 @@ int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) | |||
| 161 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), | 161 | cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), |
| 162 | &per_cpu(stop_cpus_work, cpu)); | 162 | &per_cpu(stop_cpus_work, cpu)); |
| 163 | preempt_enable(); | 163 | preempt_enable(); |
| 164 | } | ||
| 164 | 165 | ||
| 166 | static int __stop_cpus(const struct cpumask *cpumask, | ||
| 167 | cpu_stop_fn_t fn, void *arg) | ||
| 168 | { | ||
| 169 | struct cpu_stop_done done; | ||
| 170 | |||
| 171 | cpu_stop_init_done(&done, cpumask_weight(cpumask)); | ||
| 172 | queue_stop_cpus_work(cpumask, fn, arg, &done); | ||
| 165 | wait_for_completion(&done.completion); | 173 | wait_for_completion(&done.completion); |
| 166 | return done.executed ? done.ret : -ENOENT; | 174 | return done.executed ? done.ret : -ENOENT; |
| 167 | } | 175 | } |
| @@ -431,8 +439,15 @@ static int stop_machine_cpu_stop(void *data) | |||
| 431 | struct stop_machine_data *smdata = data; | 439 | struct stop_machine_data *smdata = data; |
| 432 | enum stopmachine_state curstate = STOPMACHINE_NONE; | 440 | enum stopmachine_state curstate = STOPMACHINE_NONE; |
| 433 | int cpu = smp_processor_id(), err = 0; | 441 | int cpu = smp_processor_id(), err = 0; |
| 442 | unsigned long flags; | ||
| 434 | bool is_active; | 443 | bool is_active; |
| 435 | 444 | ||
| 445 | /* | ||
| 446 | * When called from stop_machine_from_inactive_cpu(), irq might | ||
| 447 | * already be disabled. Save the state and restore it on exit. | ||
| 448 | */ | ||
| 449 | local_save_flags(flags); | ||
| 450 | |||
| 436 | if (!smdata->active_cpus) | 451 | if (!smdata->active_cpus) |
| 437 | is_active = cpu == cpumask_first(cpu_online_mask); | 452 | is_active = cpu == cpumask_first(cpu_online_mask); |
| 438 | else | 453 | else |
| @@ -460,7 +475,7 @@ static int stop_machine_cpu_stop(void *data) | |||
| 460 | } | 475 | } |
| 461 | } while (curstate != STOPMACHINE_EXIT); | 476 | } while (curstate != STOPMACHINE_EXIT); |
| 462 | 477 | ||
| 463 | local_irq_enable(); | 478 | local_irq_restore(flags); |
| 464 | return err; | 479 | return err; |
| 465 | } | 480 | } |
| 466 | 481 | ||
| @@ -487,4 +502,57 @@ int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) | |||
| 487 | } | 502 | } |
| 488 | EXPORT_SYMBOL_GPL(stop_machine); | 503 | EXPORT_SYMBOL_GPL(stop_machine); |
| 489 | 504 | ||
| 505 | /** | ||
| 506 | * stop_machine_from_inactive_cpu - stop_machine() from inactive CPU | ||
| 507 | * @fn: the function to run | ||
| 508 | * @data: the data ptr for the @fn() | ||
| 509 | * @cpus: the cpus to run the @fn() on (NULL = any online cpu) | ||
| 510 | * | ||
| 511 | * This is identical to stop_machine() but can be called from a CPU which | ||
| 512 | * is not active. The local CPU is in the process of hotplug (so no other | ||
| 513 | * CPU hotplug can start) and not marked active and doesn't have enough | ||
| 514 | * context to sleep. | ||
| 515 | * | ||
| 516 | * This function provides stop_machine() functionality for such state by | ||
| 517 | * using busy-wait for synchronization and executing @fn directly for local | ||
| 518 | * CPU. | ||
| 519 | * | ||
| 520 | * CONTEXT: | ||
| 521 | * Local CPU is inactive. Temporarily stops all active CPUs. | ||
| 522 | * | ||
| 523 | * RETURNS: | ||
| 524 | * 0 if all executions of @fn returned 0, any non-zero return value if any | ||
| 525 | * returned non-zero. | ||
| 526 | */ | ||
| 527 | int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, | ||
| 528 | const struct cpumask *cpus) | ||
| 529 | { | ||
| 530 | struct stop_machine_data smdata = { .fn = fn, .data = data, | ||
| 531 | .active_cpus = cpus }; | ||
| 532 | struct cpu_stop_done done; | ||
| 533 | int ret; | ||
| 534 | |||
| 535 | /* Local CPU must be inactive and CPU hotplug in progress. */ | ||
| 536 | BUG_ON(cpu_active(raw_smp_processor_id())); | ||
| 537 | smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ | ||
| 538 | |||
| 539 | /* No proper task established and can't sleep - busy wait for lock. */ | ||
| 540 | while (!mutex_trylock(&stop_cpus_mutex)) | ||
| 541 | cpu_relax(); | ||
| 542 | |||
| 543 | /* Schedule work on other CPUs and execute directly for local CPU */ | ||
| 544 | set_state(&smdata, STOPMACHINE_PREPARE); | ||
| 545 | cpu_stop_init_done(&done, num_active_cpus()); | ||
| 546 | queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, | ||
| 547 | &done); | ||
| 548 | ret = stop_machine_cpu_stop(&smdata); | ||
| 549 | |||
| 550 | /* Busy wait for completion. */ | ||
| 551 | while (!completion_done(&done.completion)) | ||
| 552 | cpu_relax(); | ||
| 553 | |||
| 554 | mutex_unlock(&stop_cpus_mutex); | ||
| 555 | return ret ?: done.ret; | ||
| 556 | } | ||
| 557 | |||
| 490 | #endif /* CONFIG_STOP_MACHINE */ | 558 | #endif /* CONFIG_STOP_MACHINE */ |
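For orientation, a minimal and purely illustrative call site for the new stop_machine_from_inactive_cpu() (the callback name and caller are assumptions, not taken from this diff):

	static int resync_hw_state(void *unused)
	{
		/* Runs with interrupts disabled while the remaining CPUs
		 * spin in the stop_machine state machine. */
		return 0;
	}

	/* On a CPU that is online but not yet cpu_active(): */
	err = stop_machine_from_inactive_cpu(resync_hw_state, NULL, cpu_online_mask);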
diff --git a/kernel/sys.c b/kernel/sys.c index af468edf096a..dd948a1fca4c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -8,7 +8,6 @@ | |||
| 8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
| 9 | #include <linux/utsname.h> | 9 | #include <linux/utsname.h> |
| 10 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
| 11 | #include <linux/notifier.h> | ||
| 12 | #include <linux/reboot.h> | 11 | #include <linux/reboot.h> |
| 13 | #include <linux/prctl.h> | 12 | #include <linux/prctl.h> |
| 14 | #include <linux/highuid.h> | 13 | #include <linux/highuid.h> |
| @@ -314,12 +313,43 @@ void kernel_restart_prepare(char *cmd) | |||
| 314 | { | 313 | { |
| 315 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 314 | blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
| 316 | system_state = SYSTEM_RESTART; | 315 | system_state = SYSTEM_RESTART; |
| 316 | usermodehelper_disable(); | ||
| 317 | device_shutdown(); | 317 | device_shutdown(); |
| 318 | sysdev_shutdown(); | ||
| 319 | syscore_shutdown(); | 318 | syscore_shutdown(); |
| 320 | } | 319 | } |
| 321 | 320 | ||
| 322 | /** | 321 | /** |
| 322 | * register_reboot_notifier - Register function to be called at reboot time | ||
| 323 | * @nb: Info about notifier function to be called | ||
| 324 | * | ||
| 325 | * Registers a function with the list of functions | ||
| 326 | * to be called at reboot time. | ||
| 327 | * | ||
| 328 | * Currently always returns zero, as blocking_notifier_chain_register() | ||
| 329 | * always returns zero. | ||
| 330 | */ | ||
| 331 | int register_reboot_notifier(struct notifier_block *nb) | ||
| 332 | { | ||
| 333 | return blocking_notifier_chain_register(&reboot_notifier_list, nb); | ||
| 334 | } | ||
| 335 | EXPORT_SYMBOL(register_reboot_notifier); | ||
| 336 | |||
| 337 | /** | ||
| 338 | * unregister_reboot_notifier - Unregister previously registered reboot notifier | ||
| 339 | * @nb: Hook to be unregistered | ||
| 340 | * | ||
| 341 | * Unregisters a previously registered reboot | ||
| 342 | * notifier function. | ||
| 343 | * | ||
| 344 | * Returns zero on success, or %-ENOENT on failure. | ||
| 345 | */ | ||
| 346 | int unregister_reboot_notifier(struct notifier_block *nb) | ||
| 347 | { | ||
| 348 | return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); | ||
| 349 | } | ||
| 350 | EXPORT_SYMBOL(unregister_reboot_notifier); | ||
| 351 | |||
| 352 | /** | ||
| 323 | * kernel_restart - reboot the system | 353 | * kernel_restart - reboot the system |
| 324 | * @cmd: pointer to buffer containing command to execute for restart | 354 | * @cmd: pointer to buffer containing command to execute for restart |
| 325 | * or %NULL | 355 | * or %NULL |
| @@ -344,6 +374,7 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
| 344 | blocking_notifier_call_chain(&reboot_notifier_list, | 374 | blocking_notifier_call_chain(&reboot_notifier_list, |
| 345 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); | 375 | (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); |
| 346 | system_state = state; | 376 | system_state = state; |
| 377 | usermodehelper_disable(); | ||
| 347 | device_shutdown(); | 378 | device_shutdown(); |
| 348 | } | 379 | } |
| 349 | /** | 380 | /** |
| @@ -354,7 +385,6 @@ static void kernel_shutdown_prepare(enum system_states state) | |||
| 354 | void kernel_halt(void) | 385 | void kernel_halt(void) |
| 355 | { | 386 | { |
| 356 | kernel_shutdown_prepare(SYSTEM_HALT); | 387 | kernel_shutdown_prepare(SYSTEM_HALT); |
| 357 | sysdev_shutdown(); | ||
| 358 | syscore_shutdown(); | 388 | syscore_shutdown(); |
| 359 | printk(KERN_EMERG "System halted.\n"); | 389 | printk(KERN_EMERG "System halted.\n"); |
| 360 | kmsg_dump(KMSG_DUMP_HALT); | 390 | kmsg_dump(KMSG_DUMP_HALT); |
| @@ -374,7 +404,6 @@ void kernel_power_off(void) | |||
| 374 | if (pm_power_off_prepare) | 404 | if (pm_power_off_prepare) |
| 375 | pm_power_off_prepare(); | 405 | pm_power_off_prepare(); |
| 376 | disable_nonboot_cpus(); | 406 | disable_nonboot_cpus(); |
| 377 | sysdev_shutdown(); | ||
| 378 | syscore_shutdown(); | 407 | syscore_shutdown(); |
| 379 | printk(KERN_EMERG "Power down.\n"); | 408 | printk(KERN_EMERG "Power down.\n"); |
| 380 | kmsg_dump(KMSG_DUMP_POWEROFF); | 409 | kmsg_dump(KMSG_DUMP_POWEROFF); |
| @@ -592,11 +621,18 @@ static int set_user(struct cred *new) | |||
| 592 | if (!new_user) | 621 | if (!new_user) |
| 593 | return -EAGAIN; | 622 | return -EAGAIN; |
| 594 | 623 | ||
| 624 | /* | ||
| 625 | * We don't fail in case of NPROC limit excess here because too many | ||
| 626 | * poorly written programs don't check set*uid() return code, assuming | ||
| 627 | * it never fails if called by root. We may still enforce NPROC limit | ||
| 628 | * for programs doing set*uid()+execve() by harmlessly deferring the | ||
| 629 | * failure to the execve() stage. | ||
| 630 | */ | ||
| 595 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && | 631 | if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && |
| 596 | new_user != INIT_USER) { | 632 | new_user != INIT_USER) |
| 597 | free_uid(new_user); | 633 | current->flags |= PF_NPROC_EXCEEDED; |
| 598 | return -EAGAIN; | 634 | else |
| 599 | } | 635 | current->flags &= ~PF_NPROC_EXCEEDED; |
| 600 | 636 | ||
| 601 | free_uid(new->user); | 637 | free_uid(new->user); |
| 602 | new->user = new_user; | 638 | new->user = new_user; |
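The register_reboot_notifier()/unregister_reboot_notifier() pair moved into sys.c above follows the usual notifier_block pattern. A hypothetical user (names are illustrative only):

	static int mydrv_reboot(struct notifier_block *nb, unsigned long action,
				void *data)
	{
		/* action is SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
		return NOTIFY_DONE;
	}

	static struct notifier_block mydrv_reboot_nb = {
		.notifier_call = mydrv_reboot,
	};

	/* driver init */
	register_reboot_notifier(&mydrv_reboot_nb);
	/* driver exit */
	unregister_reboot_notifier(&mydrv_reboot_nb);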
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 25cc41cd8f33..62cbc8877fef 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt); | |||
| 46 | cond_syscall(compat_sys_getsockopt); | 46 | cond_syscall(compat_sys_getsockopt); |
| 47 | cond_syscall(sys_shutdown); | 47 | cond_syscall(sys_shutdown); |
| 48 | cond_syscall(sys_sendmsg); | 48 | cond_syscall(sys_sendmsg); |
| 49 | cond_syscall(sys_sendmmsg); | ||
| 49 | cond_syscall(compat_sys_sendmsg); | 50 | cond_syscall(compat_sys_sendmsg); |
| 51 | cond_syscall(compat_sys_sendmmsg); | ||
| 50 | cond_syscall(sys_recvmsg); | 52 | cond_syscall(sys_recvmsg); |
| 51 | cond_syscall(sys_recvmmsg); | 53 | cond_syscall(sys_recvmmsg); |
| 52 | cond_syscall(compat_sys_recvmsg); | 54 | cond_syscall(compat_sys_recvmsg); |
| @@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait); | |||
| 69 | cond_syscall(sys_semget); | 71 | cond_syscall(sys_semget); |
| 70 | cond_syscall(sys_semop); | 72 | cond_syscall(sys_semop); |
| 71 | cond_syscall(sys_semtimedop); | 73 | cond_syscall(sys_semtimedop); |
| 74 | cond_syscall(compat_sys_semtimedop); | ||
| 72 | cond_syscall(sys_semctl); | 75 | cond_syscall(sys_semctl); |
| 76 | cond_syscall(compat_sys_semctl); | ||
| 73 | cond_syscall(sys_msgget); | 77 | cond_syscall(sys_msgget); |
| 74 | cond_syscall(sys_msgsnd); | 78 | cond_syscall(sys_msgsnd); |
| 79 | cond_syscall(compat_sys_msgsnd); | ||
| 75 | cond_syscall(sys_msgrcv); | 80 | cond_syscall(sys_msgrcv); |
| 81 | cond_syscall(compat_sys_msgrcv); | ||
| 76 | cond_syscall(sys_msgctl); | 82 | cond_syscall(sys_msgctl); |
| 83 | cond_syscall(compat_sys_msgctl); | ||
| 77 | cond_syscall(sys_shmget); | 84 | cond_syscall(sys_shmget); |
| 78 | cond_syscall(sys_shmat); | 85 | cond_syscall(sys_shmat); |
| 86 | cond_syscall(compat_sys_shmat); | ||
| 79 | cond_syscall(sys_shmdt); | 87 | cond_syscall(sys_shmdt); |
| 80 | cond_syscall(sys_shmctl); | 88 | cond_syscall(sys_shmctl); |
| 89 | cond_syscall(compat_sys_shmctl); | ||
| 81 | cond_syscall(sys_mq_open); | 90 | cond_syscall(sys_mq_open); |
| 82 | cond_syscall(sys_mq_unlink); | 91 | cond_syscall(sys_mq_unlink); |
| 83 | cond_syscall(sys_mq_timedsend); | 92 | cond_syscall(sys_mq_timedsend); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b17..11d65b531e50 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -56,6 +56,7 @@ | |||
| 56 | #include <linux/kprobes.h> | 56 | #include <linux/kprobes.h> |
| 57 | #include <linux/pipe_fs_i.h> | 57 | #include <linux/pipe_fs_i.h> |
| 58 | #include <linux/oom.h> | 58 | #include <linux/oom.h> |
| 59 | #include <linux/kmod.h> | ||
| 59 | 60 | ||
| 60 | #include <asm/uaccess.h> | 61 | #include <asm/uaccess.h> |
| 61 | #include <asm/processor.h> | 62 | #include <asm/processor.h> |
| @@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = { | |||
| 616 | .child = random_table, | 617 | .child = random_table, |
| 617 | }, | 618 | }, |
| 618 | { | 619 | { |
| 620 | .procname = "usermodehelper", | ||
| 621 | .mode = 0555, | ||
| 622 | .child = usermodehelper_table, | ||
| 623 | }, | ||
| 624 | { | ||
| 619 | .procname = "overflowuid", | 625 | .procname = "overflowuid", |
| 620 | .data = &overflowuid, | 626 | .data = &overflowuid, |
| 621 | .maxlen = sizeof(int), | 627 | .maxlen = sizeof(int), |
| @@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = { | |||
| 730 | .data = &watchdog_enabled, | 736 | .data = &watchdog_enabled, |
| 731 | .maxlen = sizeof (int), | 737 | .maxlen = sizeof (int), |
| 732 | .mode = 0644, | 738 | .mode = 0644, |
| 733 | .proc_handler = proc_dowatchdog_enabled, | 739 | .proc_handler = proc_dowatchdog, |
| 740 | .extra1 = &zero, | ||
| 741 | .extra2 = &one, | ||
| 734 | }, | 742 | }, |
| 735 | { | 743 | { |
| 736 | .procname = "watchdog_thresh", | 744 | .procname = "watchdog_thresh", |
| 737 | .data = &softlockup_thresh, | 745 | .data = &watchdog_thresh, |
| 738 | .maxlen = sizeof(int), | 746 | .maxlen = sizeof(int), |
| 739 | .mode = 0644, | 747 | .mode = 0644, |
| 740 | .proc_handler = proc_dowatchdog_thresh, | 748 | .proc_handler = proc_dowatchdog, |
| 741 | .extra1 = &neg_one, | 749 | .extra1 = &neg_one, |
| 742 | .extra2 = &sixty, | 750 | .extra2 = &sixty, |
| 743 | }, | 751 | }, |
| @@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = { | |||
| 755 | .data = &watchdog_enabled, | 763 | .data = &watchdog_enabled, |
| 756 | .maxlen = sizeof (int), | 764 | .maxlen = sizeof (int), |
| 757 | .mode = 0644, | 765 | .mode = 0644, |
| 758 | .proc_handler = proc_dowatchdog_enabled, | 766 | .proc_handler = proc_dowatchdog, |
| 767 | .extra1 = &zero, | ||
| 768 | .extra2 = &one, | ||
| 759 | }, | 769 | }, |
| 760 | #endif | 770 | #endif |
| 761 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 771 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
| @@ -928,6 +938,12 @@ static struct ctl_table kern_table[] = { | |||
| 928 | }, | 938 | }, |
| 929 | #endif | 939 | #endif |
| 930 | #ifdef CONFIG_PERF_EVENTS | 940 | #ifdef CONFIG_PERF_EVENTS |
| 941 | /* | ||
| 942 | * User-space scripts rely on the existence of this file | ||
| 943 | * as a feature check for perf_events being enabled. | ||
| 944 | * | ||
| 945 | * So it's an ABI, do not remove! | ||
| 946 | */ | ||
| 931 | { | 947 | { |
| 932 | .procname = "perf_event_paranoid", | 948 | .procname = "perf_event_paranoid", |
| 933 | .data = &sysctl_perf_event_paranoid, | 949 | .data = &sysctl_perf_event_paranoid, |
| @@ -1496,7 +1512,7 @@ static struct ctl_table fs_table[] = { | |||
| 1496 | 1512 | ||
| 1497 | static struct ctl_table debug_table[] = { | 1513 | static struct ctl_table debug_table[] = { |
| 1498 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ | 1514 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ |
| 1499 | defined(CONFIG_S390) | 1515 | defined(CONFIG_S390) || defined(CONFIG_TILE) |
| 1500 | { | 1516 | { |
| 1501 | .procname = "exception-trace", | 1517 | .procname = "exception-trace", |
| 1502 | .data = &show_unhandled_signals, | 1518 | .data = &show_unhandled_signals, |
| @@ -1574,16 +1590,11 @@ void sysctl_head_get(struct ctl_table_header *head) | |||
| 1574 | spin_unlock(&sysctl_lock); | 1590 | spin_unlock(&sysctl_lock); |
| 1575 | } | 1591 | } |
| 1576 | 1592 | ||
| 1577 | static void free_head(struct rcu_head *rcu) | ||
| 1578 | { | ||
| 1579 | kfree(container_of(rcu, struct ctl_table_header, rcu)); | ||
| 1580 | } | ||
| 1581 | |||
| 1582 | void sysctl_head_put(struct ctl_table_header *head) | 1593 | void sysctl_head_put(struct ctl_table_header *head) |
| 1583 | { | 1594 | { |
| 1584 | spin_lock(&sysctl_lock); | 1595 | spin_lock(&sysctl_lock); |
| 1585 | if (!--head->count) | 1596 | if (!--head->count) |
| 1586 | call_rcu(&head->rcu, free_head); | 1597 | kfree_rcu(head, rcu); |
| 1587 | spin_unlock(&sysctl_lock); | 1598 | spin_unlock(&sysctl_lock); |
| 1588 | } | 1599 | } |
| 1589 | 1600 | ||
| @@ -1955,10 +1966,10 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
| 1955 | start_unregistering(header); | 1966 | start_unregistering(header); |
| 1956 | if (!--header->parent->count) { | 1967 | if (!--header->parent->count) { |
| 1957 | WARN_ON(1); | 1968 | WARN_ON(1); |
| 1958 | call_rcu(&header->parent->rcu, free_head); | 1969 | kfree_rcu(header->parent, rcu); |
| 1959 | } | 1970 | } |
| 1960 | if (!--header->count) | 1971 | if (!--header->count) |
| 1961 | call_rcu(&header->rcu, free_head); | 1972 | kfree_rcu(header, rcu); |
| 1962 | spin_unlock(&sysctl_lock); | 1973 | spin_unlock(&sysctl_lock); |
| 1963 | } | 1974 | } |
| 1964 | 1975 | ||
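The kfree_rcu() conversion above works because the second argument names the rcu_head member embedded in the structure, which makes the dedicated free_head() callback unnecessary. A generic sketch of the pattern (the struct is hypothetical):

	struct item {
		int		value;
		struct rcu_head	rcu;
	};

	static void release_item(struct item *p)
	{
		/* Frees p after a grace period; shorthand for call_rcu()
		 * plus a callback doing kfree(container_of(...)). */
		kfree_rcu(p, rcu);
	}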
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 9ffea360a778..e19ce1454ee1 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -28,7 +28,7 @@ | |||
| 28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
| 29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
| 30 | #include <net/genetlink.h> | 30 | #include <net/genetlink.h> |
| 31 | #include <asm/atomic.h> | 31 | #include <linux/atomic.h> |
| 32 | 32 | ||
| 33 | /* | 33 | /* |
| 34 | * Maximum length of a cpumask that can be specified in | 34 | * Maximum length of a cpumask that can be specified in |
| @@ -285,7 +285,7 @@ ret: | |||
| 285 | static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | 285 | static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) |
| 286 | { | 286 | { |
| 287 | struct listener_list *listeners; | 287 | struct listener_list *listeners; |
| 288 | struct listener *s, *tmp; | 288 | struct listener *s, *tmp, *s2; |
| 289 | unsigned int cpu; | 289 | unsigned int cpu; |
| 290 | 290 | ||
| 291 | if (!cpumask_subset(mask, cpu_possible_mask)) | 291 | if (!cpumask_subset(mask, cpu_possible_mask)) |
| @@ -293,18 +293,25 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
| 293 | 293 | ||
| 294 | if (isadd == REGISTER) { | 294 | if (isadd == REGISTER) { |
| 295 | for_each_cpu(cpu, mask) { | 295 | for_each_cpu(cpu, mask) { |
| 296 | s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, | 296 | s = kmalloc_node(sizeof(struct listener), |
| 297 | cpu_to_node(cpu)); | 297 | GFP_KERNEL, cpu_to_node(cpu)); |
| 298 | if (!s) | 298 | if (!s) |
| 299 | goto cleanup; | 299 | goto cleanup; |
| 300 | |||
| 300 | s->pid = pid; | 301 | s->pid = pid; |
| 301 | INIT_LIST_HEAD(&s->list); | ||
| 302 | s->valid = 1; | 302 | s->valid = 1; |
| 303 | 303 | ||
| 304 | listeners = &per_cpu(listener_array, cpu); | 304 | listeners = &per_cpu(listener_array, cpu); |
| 305 | down_write(&listeners->sem); | 305 | down_write(&listeners->sem); |
| 306 | list_for_each_entry(s2, &listeners->list, list) { | ||
| 307 | if (s2->pid == pid && s2->valid) | ||
| 308 | goto exists; | ||
| 309 | } | ||
| 306 | list_add(&s->list, &listeners->list); | 310 | list_add(&s->list, &listeners->list); |
| 311 | s = NULL; | ||
| 312 | exists: | ||
| 307 | up_write(&listeners->sem); | 313 | up_write(&listeners->sem); |
| 314 | kfree(s); /* nop if NULL */ | ||
| 308 | } | 315 | } |
| 309 | return 0; | 316 | return 0; |
| 310 | } | 317 | } |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index b0425991e9ac..e2fd74b8e8c2 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o | 1 | obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o |
| 2 | obj-y += timeconv.o posix-clock.o | 2 | obj-y += timeconv.o posix-clock.o alarmtimer.o |
| 3 | 3 | ||
| 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o | 4 | obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o |
| 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o | 5 | obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 000000000000..59f369f98a04 --- /dev/null +++ b/kernel/time/alarmtimer.c | |||
| @@ -0,0 +1,720 @@ | |||
| 1 | /* | ||
| 2 | * Alarmtimer interface | ||
| 3 | * | ||
| 4 | * This interface provides a timer which is similar to hrtimers, | ||
| 5 | * but triggers an RTC alarm if the box is suspended. | ||
| 6 | * | ||
| 7 | * This interface is influenced by the Android RTC Alarm timer | ||
| 8 | * interface. | ||
| 9 | * | ||
| 10 | * Copyright (C) 2010 IBM Corporation | ||
| 11 | * | ||
| 12 | * Author: John Stultz <john.stultz@linaro.org> | ||
| 13 | * | ||
| 14 | * This program is free software; you can redistribute it and/or modify | ||
| 15 | * it under the terms of the GNU General Public License version 2 as | ||
| 16 | * published by the Free Software Foundation. | ||
| 17 | */ | ||
| 18 | #include <linux/time.h> | ||
| 19 | #include <linux/hrtimer.h> | ||
| 20 | #include <linux/timerqueue.h> | ||
| 21 | #include <linux/rtc.h> | ||
| 22 | #include <linux/alarmtimer.h> | ||
| 23 | #include <linux/mutex.h> | ||
| 24 | #include <linux/platform_device.h> | ||
| 25 | #include <linux/posix-timers.h> | ||
| 26 | #include <linux/workqueue.h> | ||
| 27 | #include <linux/freezer.h> | ||
| 28 | |||
| 29 | /** | ||
| 30 | * struct alarm_base - Alarm timer bases | ||
| 31 | * @lock: Lock for synchronized access to the base | ||
| 32 | * @timerqueue: Timerqueue head managing the list of events | ||
| 33 | * @timer: hrtimer used to schedule events while running | ||
| 34 | * @gettime: Function to read the time correlating to the base | ||
| 35 | * @base_clockid: clockid for the base | ||
| 36 | */ | ||
| 37 | static struct alarm_base { | ||
| 38 | spinlock_t lock; | ||
| 39 | struct timerqueue_head timerqueue; | ||
| 40 | struct hrtimer timer; | ||
| 41 | ktime_t (*gettime)(void); | ||
| 42 | clockid_t base_clockid; | ||
| 43 | } alarm_bases[ALARM_NUMTYPE]; | ||
| 44 | |||
| 45 | /* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ | ||
| 46 | static ktime_t freezer_delta; | ||
| 47 | static DEFINE_SPINLOCK(freezer_delta_lock); | ||
| 48 | |||
| 49 | #ifdef CONFIG_RTC_CLASS | ||
| 50 | /* rtc timer and device for setting alarm wakeups at suspend */ | ||
| 51 | static struct rtc_timer rtctimer; | ||
| 52 | static struct rtc_device *rtcdev; | ||
| 53 | static DEFINE_SPINLOCK(rtcdev_lock); | ||
| 54 | |||
| 55 | /** | ||
| 56 | * has_wakealarm - check rtc device has wakealarm ability | ||
| 57 | * @dev: current device | ||
| 58 | * @name_ptr: name to be returned | ||
| 59 | * | ||
| 60 | * This helper function checks to see if the rtc device can wake | ||
| 61 | * from suspend. | ||
| 62 | */ | ||
| 63 | static int has_wakealarm(struct device *dev, void *name_ptr) | ||
| 64 | { | ||
| 65 | struct rtc_device *candidate = to_rtc_device(dev); | ||
| 66 | |||
| 67 | if (!candidate->ops->set_alarm) | ||
| 68 | return 0; | ||
| 69 | if (!device_may_wakeup(candidate->dev.parent)) | ||
| 70 | return 0; | ||
| 71 | |||
| 72 | *(const char **)name_ptr = dev_name(dev); | ||
| 73 | return 1; | ||
| 74 | } | ||
| 75 | |||
| 76 | /** | ||
| 77 | * alarmtimer_get_rtcdev - Return selected rtcdevice | ||
| 78 | * | ||
| 79 | * This function returns the rtc device to use for wakealarms. | ||
| 80 | * If one has not already been chosen, it checks to see if a | ||
| 81 | * functional rtc device is available. | ||
| 82 | */ | ||
| 83 | static struct rtc_device *alarmtimer_get_rtcdev(void) | ||
| 84 | { | ||
| 85 | struct device *dev; | ||
| 86 | char *str; | ||
| 87 | unsigned long flags; | ||
| 88 | struct rtc_device *ret; | ||
| 89 | |||
| 90 | spin_lock_irqsave(&rtcdev_lock, flags); | ||
| 91 | if (!rtcdev) { | ||
| 92 | /* Find an rtc device and init the rtc_timer */ | ||
| 93 | dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); | ||
| 94 | /* If we have a device then str is valid. See has_wakealarm() */ | ||
| 95 | if (dev) { | ||
| 96 | rtcdev = rtc_class_open(str); | ||
| 97 | /* | ||
| 98 | * Drop the reference we got in class_find_device, | ||
| 99 | * rtc_class_open() takes its own. | ||
| 100 | */ | ||
| 101 | put_device(dev); | ||
| 102 | rtc_timer_init(&rtctimer, NULL, NULL); | ||
| 103 | } | ||
| 104 | } | ||
| 105 | ret = rtcdev; | ||
| 106 | spin_unlock_irqrestore(&rtcdev_lock, flags); | ||
| 107 | |||
| 108 | return ret; | ||
| 109 | } | ||
| 110 | #else | ||
| 111 | #define alarmtimer_get_rtcdev() (0) | ||
| 112 | #define rtcdev (0) | ||
| 113 | #endif | ||
| 114 | |||
| 115 | |||
| 116 | /** | ||
| 117 | * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue | ||
| 118 | * @base: pointer to the base where the timer is being run | ||
| 119 | * @alarm: pointer to alarm being enqueued. | ||
| 120 | * | ||
| 121 | * Adds the alarm to an alarm_base timerqueue and if necessary sets | ||
| 122 | * an hrtimer to run. | ||
| 123 | * | ||
| 124 | * Must hold base->lock when calling. | ||
| 125 | */ | ||
| 126 | static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) | ||
| 127 | { | ||
| 128 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
| 129 | if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { | ||
| 130 | hrtimer_try_to_cancel(&base->timer); | ||
| 131 | hrtimer_start(&base->timer, alarm->node.expires, | ||
| 132 | HRTIMER_MODE_ABS); | ||
| 133 | } | ||
| 134 | } | ||
| 135 | |||
| 136 | /** | ||
| 137 | * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue | ||
| 138 | * @base: pointer to the base where the timer is running | ||
| 139 | * @alarm: pointer to alarm being removed | ||
| 140 | * | ||
| 141 | * Removes the alarm from an alarm_base timerqueue and if necessary sets | ||
| 142 | * a new timer to run. | ||
| 143 | * | ||
| 144 | * Must hold base->lock when calling. | ||
| 145 | */ | ||
| 146 | static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) | ||
| 147 | { | ||
| 148 | struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); | ||
| 149 | |||
| 150 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
| 151 | if (next == &alarm->node) { | ||
| 152 | hrtimer_try_to_cancel(&base->timer); | ||
| 153 | next = timerqueue_getnext(&base->timerqueue); | ||
| 154 | if (!next) | ||
| 155 | return; | ||
| 156 | hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); | ||
| 157 | } | ||
| 158 | } | ||
| 159 | |||
| 160 | |||
| 161 | /** | ||
| 162 | * alarmtimer_fired - Handles alarm hrtimer being fired. | ||
| 163 | * @timer: pointer to hrtimer being run | ||
| 164 | * | ||
| 165 | * When an alarm timer fires, this runs through the timerqueue to | ||
| 166 | * see which alarms expired, and runs those. If there are more alarm | ||
| 167 | * timers queued for the future, we set the hrtimer to fire | ||
| 168 | * when the next future alarm timer expires. | ||
| 169 | */ | ||
| 170 | static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) | ||
| 171 | { | ||
| 172 | struct alarm_base *base = container_of(timer, struct alarm_base, timer); | ||
| 173 | struct timerqueue_node *next; | ||
| 174 | unsigned long flags; | ||
| 175 | ktime_t now; | ||
| 176 | int ret = HRTIMER_NORESTART; | ||
| 177 | |||
| 178 | spin_lock_irqsave(&base->lock, flags); | ||
| 179 | now = base->gettime(); | ||
| 180 | while ((next = timerqueue_getnext(&base->timerqueue))) { | ||
| 181 | struct alarm *alarm; | ||
| 182 | ktime_t expired = next->expires; | ||
| 183 | |||
| 184 | if (expired.tv64 >= now.tv64) | ||
| 185 | break; | ||
| 186 | |||
| 187 | alarm = container_of(next, struct alarm, node); | ||
| 188 | |||
| 189 | timerqueue_del(&base->timerqueue, &alarm->node); | ||
| 190 | alarm->enabled = 0; | ||
| 191 | /* Re-add periodic timers */ | ||
| 192 | if (alarm->period.tv64) { | ||
| 193 | alarm->node.expires = ktime_add(expired, alarm->period); | ||
| 194 | timerqueue_add(&base->timerqueue, &alarm->node); | ||
| 195 | alarm->enabled = 1; | ||
| 196 | } | ||
| 197 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 198 | if (alarm->function) | ||
| 199 | alarm->function(alarm); | ||
| 200 | spin_lock_irqsave(&base->lock, flags); | ||
| 201 | } | ||
| 202 | |||
| 203 | if (next) { | ||
| 204 | hrtimer_set_expires(&base->timer, next->expires); | ||
| 205 | ret = HRTIMER_RESTART; | ||
| 206 | } | ||
| 207 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 208 | |||
| 209 | return ret; | ||
| 210 | |||
| 211 | } | ||
| 212 | |||
| 213 | #ifdef CONFIG_RTC_CLASS | ||
| 214 | /** | ||
| 215 | * alarmtimer_suspend - Suspend time callback | ||
| 216 | * @dev: unused | ||
| 217 | * @state: unused | ||
| 218 | * | ||
| 219 | * When we are going into suspend, we look through the bases | ||
| 220 | * to see which is the soonest timer to expire. We then | ||
| 221 | * set an rtc timer to fire that far into the future, which | ||
| 222 | * will wake us from suspend. | ||
| 223 | */ | ||
| 224 | static int alarmtimer_suspend(struct device *dev) | ||
| 225 | { | ||
| 226 | struct rtc_time tm; | ||
| 227 | ktime_t min, now; | ||
| 228 | unsigned long flags; | ||
| 229 | struct rtc_device *rtc; | ||
| 230 | int i; | ||
| 231 | |||
| 232 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
| 233 | min = freezer_delta; | ||
| 234 | freezer_delta = ktime_set(0, 0); | ||
| 235 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
| 236 | |||
| 237 | rtc = rtcdev; | ||
| 238 | /* If we have no rtcdev, just return */ | ||
| 239 | if (!rtc) | ||
| 240 | return 0; | ||
| 241 | |||
| 242 | /* Find the soonest timer to expire */ | ||
| 243 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
| 244 | struct alarm_base *base = &alarm_bases[i]; | ||
| 245 | struct timerqueue_node *next; | ||
| 246 | ktime_t delta; | ||
| 247 | |||
| 248 | spin_lock_irqsave(&base->lock, flags); | ||
| 249 | next = timerqueue_getnext(&base->timerqueue); | ||
| 250 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 251 | if (!next) | ||
| 252 | continue; | ||
| 253 | delta = ktime_sub(next->expires, base->gettime()); | ||
| 254 | if (!min.tv64 || (delta.tv64 < min.tv64)) | ||
| 255 | min = delta; | ||
| 256 | } | ||
| 257 | if (min.tv64 == 0) | ||
| 258 | return 0; | ||
| 259 | |||
| 260 | /* XXX - Should we enforce a minimum sleep time? */ | ||
| 261 | WARN_ON(min.tv64 < NSEC_PER_SEC); | ||
| 262 | |||
| 263 | /* Setup an rtc timer to fire that far in the future */ | ||
| 264 | rtc_timer_cancel(rtc, &rtctimer); | ||
| 265 | rtc_read_time(rtc, &tm); | ||
| 266 | now = rtc_tm_to_ktime(tm); | ||
| 267 | now = ktime_add(now, min); | ||
| 268 | |||
| 269 | rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); | ||
| 270 | |||
| 271 | return 0; | ||
| 272 | } | ||
| 273 | #else | ||
| 274 | static int alarmtimer_suspend(struct device *dev) | ||
| 275 | { | ||
| 276 | return 0; | ||
| 277 | } | ||
| 278 | #endif | ||
| 279 | |||
| 280 | static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) | ||
| 281 | { | ||
| 282 | ktime_t delta; | ||
| 283 | unsigned long flags; | ||
| 284 | struct alarm_base *base = &alarm_bases[type]; | ||
| 285 | |||
| 286 | delta = ktime_sub(absexp, base->gettime()); | ||
| 287 | |||
| 288 | spin_lock_irqsave(&freezer_delta_lock, flags); | ||
| 289 | if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64)) | ||
| 290 | freezer_delta = delta; | ||
| 291 | spin_unlock_irqrestore(&freezer_delta_lock, flags); | ||
| 292 | } | ||
| 293 | |||
| 294 | |||
| 295 | /** | ||
| 296 | * alarm_init - Initialize an alarm structure | ||
| 297 | * @alarm: ptr to alarm to be initialized | ||
| 298 | * @type: the type of the alarm | ||
| 299 | * @function: callback that is run when the alarm fires | ||
| 300 | */ | ||
| 301 | void alarm_init(struct alarm *alarm, enum alarmtimer_type type, | ||
| 302 | void (*function)(struct alarm *)) | ||
| 303 | { | ||
| 304 | timerqueue_init(&alarm->node); | ||
| 305 | alarm->period = ktime_set(0, 0); | ||
| 306 | alarm->function = function; | ||
| 307 | alarm->type = type; | ||
| 308 | alarm->enabled = 0; | ||
| 309 | } | ||
| 310 | |||
| 311 | /** | ||
| 312 | * alarm_start - Sets an alarm to fire | ||
| 313 | * @alarm: ptr to alarm to set | ||
| 314 | * @start: time to run the alarm | ||
| 315 | * @period: period at which the alarm will recur | ||
| 316 | */ | ||
| 317 | void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period) | ||
| 318 | { | ||
| 319 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
| 320 | unsigned long flags; | ||
| 321 | |||
| 322 | spin_lock_irqsave(&base->lock, flags); | ||
| 323 | if (alarm->enabled) | ||
| 324 | alarmtimer_remove(base, alarm); | ||
| 325 | alarm->node.expires = start; | ||
| 326 | alarm->period = period; | ||
| 327 | alarmtimer_enqueue(base, alarm); | ||
| 328 | alarm->enabled = 1; | ||
| 329 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 330 | } | ||
| 331 | |||
| 332 | /** | ||
| 333 | * alarm_cancel - Tries to cancel an alarm timer | ||
| 334 | * @alarm: ptr to alarm to be canceled | ||
| 335 | */ | ||
| 336 | void alarm_cancel(struct alarm *alarm) | ||
| 337 | { | ||
| 338 | struct alarm_base *base = &alarm_bases[alarm->type]; | ||
| 339 | unsigned long flags; | ||
| 340 | |||
| 341 | spin_lock_irqsave(&base->lock, flags); | ||
| 342 | if (alarm->enabled) | ||
| 343 | alarmtimer_remove(base, alarm); | ||
| 344 | alarm->enabled = 0; | ||
| 345 | spin_unlock_irqrestore(&base->lock, flags); | ||
| 346 | } | ||
| 347 | |||
| 348 | |||
| 349 | /** | ||
| 350 | * clock2alarm - helper that converts from clockid to alarmtypes | ||
| 351 | * @clockid: clockid. | ||
| 352 | */ | ||
| 353 | static enum alarmtimer_type clock2alarm(clockid_t clockid) | ||
| 354 | { | ||
| 355 | if (clockid == CLOCK_REALTIME_ALARM) | ||
| 356 | return ALARM_REALTIME; | ||
| 357 | if (clockid == CLOCK_BOOTTIME_ALARM) | ||
| 358 | return ALARM_BOOTTIME; | ||
| 359 | return -1; | ||
| 360 | } | ||
| 361 | |||
| 362 | /** | ||
| 363 | * alarm_handle_timer - Callback for posix timers | ||
| 364 | * @alarm: alarm that fired | ||
| 365 | * | ||
| 366 | * Posix timer callback for expired alarm timers. | ||
| 367 | */ | ||
| 368 | static void alarm_handle_timer(struct alarm *alarm) | ||
| 369 | { | ||
| 370 | struct k_itimer *ptr = container_of(alarm, struct k_itimer, | ||
| 371 | it.alarmtimer); | ||
| 372 | if (posix_timer_event(ptr, 0) != 0) | ||
| 373 | ptr->it_overrun++; | ||
| 374 | } | ||
| 375 | |||
| 376 | /** | ||
| 377 | * alarm_clock_getres - posix getres interface | ||
| 378 | * @which_clock: clockid | ||
| 379 | * @tp: timespec to fill | ||
| 380 | * | ||
| 381 | * Returns the granularity of the underlying alarm base clock | ||
| 382 | */ | ||
| 383 | static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) | ||
| 384 | { | ||
| 385 | clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; | ||
| 386 | |||
| 387 | if (!alarmtimer_get_rtcdev()) | ||
| 388 | return -ENOTSUPP; | ||
| 389 | |||
| 390 | return hrtimer_get_res(baseid, tp); | ||
| 391 | } | ||
| 392 | |||
| 393 | /** | ||
| 394 | * alarm_clock_get - posix clock_get interface | ||
| 395 | * @which_clock: clockid | ||
| 396 | * @tp: timespec to fill. | ||
| 397 | * | ||
| 398 | * Provides the underlying alarm base time. | ||
| 399 | */ | ||
| 400 | static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) | ||
| 401 | { | ||
| 402 | struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; | ||
| 403 | |||
| 404 | if (!alarmtimer_get_rtcdev()) | ||
| 405 | return -ENOTSUPP; | ||
| 406 | |||
| 407 | *tp = ktime_to_timespec(base->gettime()); | ||
| 408 | return 0; | ||
| 409 | } | ||
| 410 | |||
| 411 | /** | ||
| 412 | * alarm_timer_create - posix timer_create interface | ||
| 413 | * @new_timer: k_itimer pointer to manage | ||
| 414 | * | ||
| 415 | * Initializes the k_itimer structure. | ||
| 416 | */ | ||
| 417 | static int alarm_timer_create(struct k_itimer *new_timer) | ||
| 418 | { | ||
| 419 | enum alarmtimer_type type; | ||
| 420 | struct alarm_base *base; | ||
| 421 | |||
| 422 | if (!alarmtimer_get_rtcdev()) | ||
| 423 | return -ENOTSUPP; | ||
| 424 | |||
| 425 | if (!capable(CAP_WAKE_ALARM)) | ||
| 426 | return -EPERM; | ||
| 427 | |||
| 428 | type = clock2alarm(new_timer->it_clock); | ||
| 429 | base = &alarm_bases[type]; | ||
| 430 | alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); | ||
| 431 | return 0; | ||
| 432 | } | ||
| 433 | |||
| 434 | /** | ||
| 435 | * alarm_timer_get - posix timer_get interface | ||
| 436 | * @new_timer: k_itimer pointer | ||
| 437 | * @cur_setting: itimerspec data to fill | ||
| 438 | * | ||
| 439 | * Copies the itimerspec data out from the k_itimer | ||
| 440 | */ | ||
| 441 | static void alarm_timer_get(struct k_itimer *timr, | ||
| 442 | struct itimerspec *cur_setting) | ||
| 443 | { | ||
| 444 | cur_setting->it_interval = | ||
| 445 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
| 446 | cur_setting->it_value = | ||
| 447 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
| 448 | return; | ||
| 449 | } | ||
| 450 | |||
| 451 | /** | ||
| 452 | * alarm_timer_del - posix timer_del interface | ||
| 453 | * @timr: k_itimer pointer to be deleted | ||
| 454 | * | ||
| 455 | * Cancels any programmed alarms for the given timer. | ||
| 456 | */ | ||
| 457 | static int alarm_timer_del(struct k_itimer *timr) | ||
| 458 | { | ||
| 459 | if (!rtcdev) | ||
| 460 | return -ENOTSUPP; | ||
| 461 | |||
| 462 | alarm_cancel(&timr->it.alarmtimer); | ||
| 463 | return 0; | ||
| 464 | } | ||
| 465 | |||
| 466 | /** | ||
| 467 | * alarm_timer_set - posix timer_set interface | ||
| 468 | * @timr: k_itimer pointer to be deleted | ||
| 469 | * @flags: timer flags | ||
| 470 | * @new_setting: itimerspec to be used | ||
| 471 | * @old_setting: itimerspec being replaced | ||
| 472 | * | ||
| 473 | * Sets the timer to new_setting, and starts the timer. | ||
| 474 | */ | ||
| 475 | static int alarm_timer_set(struct k_itimer *timr, int flags, | ||
| 476 | struct itimerspec *new_setting, | ||
| 477 | struct itimerspec *old_setting) | ||
| 478 | { | ||
| 479 | if (!rtcdev) | ||
| 480 | return -ENOTSUPP; | ||
| 481 | |||
| 482 | /* Save old values */ | ||
| 483 | old_setting->it_interval = | ||
| 484 | ktime_to_timespec(timr->it.alarmtimer.period); | ||
| 485 | old_setting->it_value = | ||
| 486 | ktime_to_timespec(timr->it.alarmtimer.node.expires); | ||
| 487 | |||
| 488 | /* If the timer was already set, cancel it */ | ||
| 489 | alarm_cancel(&timr->it.alarmtimer); | ||
| 490 | |||
| 491 | /* start the timer */ | ||
| 492 | alarm_start(&timr->it.alarmtimer, | ||
| 493 | timespec_to_ktime(new_setting->it_value), | ||
| 494 | timespec_to_ktime(new_setting->it_interval)); | ||
| 495 | return 0; | ||
| 496 | } | ||
| 497 | |||
| 498 | /** | ||
| 499 | * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep | ||
| 500 | * @alarm: ptr to alarm that fired | ||
| 501 | * | ||
| 502 | * Wakes up the task that set the alarmtimer | ||
| 503 | */ | ||
| 504 | static void alarmtimer_nsleep_wakeup(struct alarm *alarm) | ||
| 505 | { | ||
| 506 | struct task_struct *task = (struct task_struct *)alarm->data; | ||
| 507 | |||
| 508 | alarm->data = NULL; | ||
| 509 | if (task) | ||
| 510 | wake_up_process(task); | ||
| 511 | } | ||
| 512 | |||
| 513 | /** | ||
| 514 | * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation | ||
| 515 | * @alarm: ptr to alarmtimer | ||
| 516 | * @absexp: absolute expiration time | ||
| 517 | * | ||
| 518 | * Sets the alarm timer and sleeps until it is fired or interrupted. | ||
| 519 | */ | ||
| 520 | static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp) | ||
| 521 | { | ||
| 522 | alarm->data = (void *)current; | ||
| 523 | do { | ||
| 524 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 525 | alarm_start(alarm, absexp, ktime_set(0, 0)); | ||
| 526 | if (likely(alarm->data)) | ||
| 527 | schedule(); | ||
| 528 | |||
| 529 | alarm_cancel(alarm); | ||
| 530 | } while (alarm->data && !signal_pending(current)); | ||
| 531 | |||
| 532 | __set_current_state(TASK_RUNNING); | ||
| 533 | |||
| 534 | return (alarm->data == NULL); | ||
| 535 | } | ||
| 536 | |||
| 537 | |||
| 538 | /** | ||
| 539 | * update_rmtp - Update remaining timespec value | ||
| 540 | * @exp: expiration time | ||
| 541 | * @type: timer type | ||
| 542 | * @rmtp: user pointer to remaining timespec value | ||
| 543 | * | ||
| 544 | * Helper function that fills in rmtp value with time between | ||
| 545 | * now and the exp value | ||
| 546 | */ | ||
| 547 | static int update_rmtp(ktime_t exp, enum alarmtimer_type type, | ||
| 548 | struct timespec __user *rmtp) | ||
| 549 | { | ||
| 550 | struct timespec rmt; | ||
| 551 | ktime_t rem; | ||
| 552 | |||
| 553 | rem = ktime_sub(exp, alarm_bases[type].gettime()); | ||
| 554 | |||
| 555 | if (rem.tv64 <= 0) | ||
| 556 | return 0; | ||
| 557 | rmt = ktime_to_timespec(rem); | ||
| 558 | |||
| 559 | if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) | ||
| 560 | return -EFAULT; | ||
| 561 | |||
| 562 | return 1; | ||
| 563 | |||
| 564 | } | ||
| 565 | |||
| 566 | /** | ||
| 567 | * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep | ||
| 568 | * @restart: ptr to restart block | ||
| 569 | * | ||
| 570 | * Handles restarted clock_nanosleep calls | ||
| 571 | */ | ||
| 572 | static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) | ||
| 573 | { | ||
| 574 | enum alarmtimer_type type = restart->nanosleep.clockid; | ||
| 575 | ktime_t exp; | ||
| 576 | struct timespec __user *rmtp; | ||
| 577 | struct alarm alarm; | ||
| 578 | int ret = 0; | ||
| 579 | |||
| 580 | exp.tv64 = restart->nanosleep.expires; | ||
| 581 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
| 582 | |||
| 583 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
| 584 | goto out; | ||
| 585 | |||
| 586 | if (freezing(current)) | ||
| 587 | alarmtimer_freezerset(exp, type); | ||
| 588 | |||
| 589 | rmtp = restart->nanosleep.rmtp; | ||
| 590 | if (rmtp) { | ||
| 591 | ret = update_rmtp(exp, type, rmtp); | ||
| 592 | if (ret <= 0) | ||
| 593 | goto out; | ||
| 594 | } | ||
| 595 | |||
| 596 | |||
| 597 | /* The other values in restart are already filled in */ | ||
| 598 | ret = -ERESTART_RESTARTBLOCK; | ||
| 599 | out: | ||
| 600 | return ret; | ||
| 601 | } | ||
| 602 | |||
| 603 | /** | ||
| 604 | * alarm_timer_nsleep - alarmtimer nanosleep | ||
| 605 | * @which_clock: clockid | ||
| 606 | * @flags: determines abstime or relative | ||
| 607 | * @tsreq: requested sleep time (abs or rel) | ||
| 608 | * @rmtp: remaining sleep time saved | ||
| 609 | * | ||
| 610 | * Handles clock_nanosleep calls against _ALARM clockids | ||
| 611 | */ | ||
| 612 | static int alarm_timer_nsleep(const clockid_t which_clock, int flags, | ||
| 613 | struct timespec *tsreq, struct timespec __user *rmtp) | ||
| 614 | { | ||
| 615 | enum alarmtimer_type type = clock2alarm(which_clock); | ||
| 616 | struct alarm alarm; | ||
| 617 | ktime_t exp; | ||
| 618 | int ret = 0; | ||
| 619 | struct restart_block *restart; | ||
| 620 | |||
| 621 | if (!alarmtimer_get_rtcdev()) | ||
| 622 | return -ENOTSUPP; | ||
| 623 | |||
| 624 | if (!capable(CAP_WAKE_ALARM)) | ||
| 625 | return -EPERM; | ||
| 626 | |||
| 627 | alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); | ||
| 628 | |||
| 629 | exp = timespec_to_ktime(*tsreq); | ||
| 630 | /* Convert (if necessary) to absolute time */ | ||
| 631 | if (flags != TIMER_ABSTIME) { | ||
| 632 | ktime_t now = alarm_bases[type].gettime(); | ||
| 633 | exp = ktime_add(now, exp); | ||
| 634 | } | ||
| 635 | |||
| 636 | if (alarmtimer_do_nsleep(&alarm, exp)) | ||
| 637 | goto out; | ||
| 638 | |||
| 639 | if (freezing(current)) | ||
| 640 | alarmtimer_freezerset(exp, type); | ||
| 641 | |||
| 642 | /* abs timers don't set remaining time or restart */ | ||
| 643 | if (flags == TIMER_ABSTIME) { | ||
| 644 | ret = -ERESTARTNOHAND; | ||
| 645 | goto out; | ||
| 646 | } | ||
| 647 | |||
| 648 | if (rmtp) { | ||
| 649 | ret = update_rmtp(exp, type, rmtp); | ||
| 650 | if (ret <= 0) | ||
| 651 | goto out; | ||
| 652 | } | ||
| 653 | |||
| 654 | restart = ¤t_thread_info()->restart_block; | ||
| 655 | restart->fn = alarm_timer_nsleep_restart; | ||
| 656 | restart->nanosleep.clockid = type; | ||
| 657 | restart->nanosleep.expires = exp.tv64; | ||
| 658 | restart->nanosleep.rmtp = rmtp; | ||
| 659 | ret = -ERESTART_RESTARTBLOCK; | ||
| 660 | |||
| 661 | out: | ||
| 662 | return ret; | ||
| 663 | } | ||
| 664 | |||
| 665 | |||
| 666 | /* Suspend hook structures */ | ||
| 667 | static const struct dev_pm_ops alarmtimer_pm_ops = { | ||
| 668 | .suspend = alarmtimer_suspend, | ||
| 669 | }; | ||
| 670 | |||
| 671 | static struct platform_driver alarmtimer_driver = { | ||
| 672 | .driver = { | ||
| 673 | .name = "alarmtimer", | ||
| 674 | .pm = &alarmtimer_pm_ops, | ||
| 675 | } | ||
| 676 | }; | ||
| 677 | |||
| 678 | /** | ||
| 679 | * alarmtimer_init - Initialize alarm timer code | ||
| 680 | * | ||
| 681 | * This function initializes the alarm bases and registers | ||
| 682 | * the posix clock ids. | ||
| 683 | */ | ||
| 684 | static int __init alarmtimer_init(void) | ||
| 685 | { | ||
| 686 | int error = 0; | ||
| 687 | int i; | ||
| 688 | struct k_clock alarm_clock = { | ||
| 689 | .clock_getres = alarm_clock_getres, | ||
| 690 | .clock_get = alarm_clock_get, | ||
| 691 | .timer_create = alarm_timer_create, | ||
| 692 | .timer_set = alarm_timer_set, | ||
| 693 | .timer_del = alarm_timer_del, | ||
| 694 | .timer_get = alarm_timer_get, | ||
| 695 | .nsleep = alarm_timer_nsleep, | ||
| 696 | }; | ||
| 697 | |||
| 698 | posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); | ||
| 699 | posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); | ||
| 700 | |||
| 701 | /* Initialize alarm bases */ | ||
| 702 | alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME; | ||
| 703 | alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real; | ||
| 704 | alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME; | ||
| 705 | alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime; | ||
| 706 | for (i = 0; i < ALARM_NUMTYPE; i++) { | ||
| 707 | timerqueue_init_head(&alarm_bases[i].timerqueue); | ||
| 708 | spin_lock_init(&alarm_bases[i].lock); | ||
| 709 | hrtimer_init(&alarm_bases[i].timer, | ||
| 710 | alarm_bases[i].base_clockid, | ||
| 711 | HRTIMER_MODE_ABS); | ||
| 712 | alarm_bases[i].timer.function = alarmtimer_fired; | ||
| 713 | } | ||
| 714 | error = platform_driver_register(&alarmtimer_driver); | ||
| 715 | platform_device_register_simple("alarmtimer", -1, NULL, 0); | ||
| 716 | |||
| 717 | return error; | ||
| 718 | } | ||
| 719 | device_initcall(alarmtimer_init); | ||
| 720 | |||
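As a hedged usage sketch for the alarm API defined above (the callback and the intervals below are invented for illustration):

	static void my_alarm_fired(struct alarm *alarm)
	{
		/* Called from the hrtimer expiry path when the alarm fires. */
	}

	static struct alarm my_alarm;

	alarm_init(&my_alarm, ALARM_REALTIME, my_alarm_fired);
	/* Fire 10 seconds from now (CLOCK_REALTIME), then every 10 seconds. */
	alarm_start(&my_alarm, ktime_add(ktime_get_real(), ktime_set(10, 0)),
		    ktime_set(10, 0));
	/* later */
	alarm_cancel(&my_alarm);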
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 0d74b9ba90c8..e4c699dfa4e8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
| 182 | unsigned long flags; | 182 | unsigned long flags; |
| 183 | 183 | ||
| 184 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 184 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); |
| 185 | BUG_ON(!dev->cpumask); | 185 | if (!dev->cpumask) { |
| 186 | WARN_ON(num_possible_cpus() > 1); | ||
| 187 | dev->cpumask = cpumask_of(smp_processor_id()); | ||
| 188 | } | ||
| 186 | 189 | ||
| 187 | raw_spin_lock_irqsave(&clockevents_lock, flags); | 190 | raw_spin_lock_irqsave(&clockevents_lock, flags); |
| 188 | 191 | ||
| @@ -194,6 +197,70 @@ void clockevents_register_device(struct clock_event_device *dev) | |||
| 194 | } | 197 | } |
| 195 | EXPORT_SYMBOL_GPL(clockevents_register_device); | 198 | EXPORT_SYMBOL_GPL(clockevents_register_device); |
| 196 | 199 | ||
| 200 | static void clockevents_config(struct clock_event_device *dev, | ||
| 201 | u32 freq) | ||
| 202 | { | ||
| 203 | u64 sec; | ||
| 204 | |||
| 205 | if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) | ||
| 206 | return; | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Calculate the maximum number of seconds we can sleep. Limit | ||
| 210 | * to 10 minutes for hardware which can program more than | ||
| 211 | * 32bit ticks so we still get reasonable conversion values. | ||
| 212 | */ | ||
| 213 | sec = dev->max_delta_ticks; | ||
| 214 | do_div(sec, freq); | ||
| 215 | if (!sec) | ||
| 216 | sec = 1; | ||
| 217 | else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) | ||
| 218 | sec = 600; | ||
| 219 | |||
| 220 | clockevents_calc_mult_shift(dev, freq, sec); | ||
| 221 | dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); | ||
| 222 | dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); | ||
| 223 | } | ||
| 224 | |||
| 225 | /** | ||
| 226 | * clockevents_config_and_register - Configure and register a clock event device | ||
| 227 | * @dev: device to register | ||
| 228 | * @freq: The clock frequency | ||
| 229 | * @min_delta: The minimum clock ticks to program in oneshot mode | ||
| 230 | * @max_delta: The maximum clock ticks to program in oneshot mode | ||
| 231 | * | ||
| 232 | * min/max_delta can be 0 for devices which do not support oneshot mode. | ||
| 233 | */ | ||
| 234 | void clockevents_config_and_register(struct clock_event_device *dev, | ||
| 235 | u32 freq, unsigned long min_delta, | ||
| 236 | unsigned long max_delta) | ||
| 237 | { | ||
| 238 | dev->min_delta_ticks = min_delta; | ||
| 239 | dev->max_delta_ticks = max_delta; | ||
| 240 | clockevents_config(dev, freq); | ||
| 241 | clockevents_register_device(dev); | ||
| 242 | } | ||
| 243 | |||
| 244 | /** | ||
| 245 | * clockevents_update_freq - Update frequency and reprogram a clock event device. | ||
| 246 | * @dev: device to modify | ||
| 247 | * @freq: new device frequency | ||
| 248 | * | ||
| 249 | * Reconfigure and reprogram a clock event device in oneshot | ||
| 250 | * mode. Must be called on the cpu for which the device delivers per | ||
| 251 | * cpu timer events with interrupts disabled! Returns 0 on success, | ||
| 252 | * -ETIME when the event is in the past. | ||
| 253 | */ | ||
| 254 | int clockevents_update_freq(struct clock_event_device *dev, u32 freq) | ||
| 255 | { | ||
| 256 | clockevents_config(dev, freq); | ||
| 257 | |||
| 258 | if (dev->mode != CLOCK_EVT_MODE_ONESHOT) | ||
| 259 | return 0; | ||
| 260 | |||
| 261 | return clockevents_program_event(dev, dev->next_event, ktime_get()); | ||
| 262 | } | ||
| 263 | |||
| 197 | /* | 264 | /* |
| 198 | * Noop handler when we shut down an event device | 265 | * Noop handler when we shut down an event device |
| 199 | */ | 266 | */ |
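Editor's note: clockevents_config() above bounds the programmable sleep length by max_delta_ticks / freq, forces at least one second, and caps wide (more than 32-bit) devices at 600 seconds before deriving mult/shift. A small userspace sketch of that clamp, assuming nothing beyond standard C:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

static uint64_t max_sleep_seconds(uint64_t max_delta_ticks, uint32_t freq)
{
	uint64_t sec = max_delta_ticks / freq;

	if (!sec)
		sec = 1;			/* always allow at least 1s */
	else if (sec > 600 && max_delta_ticks > UINT_MAX)
		sec = 600;			/* cap devices wider than 32bit */

	return sec;
}

int main(void)
{
	/* 32bit device at 1MHz: wrap-limited to ~4294s, no cap applied */
	printf("%llu\n", (unsigned long long)max_sleep_seconds(0xffffffffULL, 1000000));
	/* 64bit device at 1GHz: capped at 600s for conversion precision */
	printf("%llu\n", (unsigned long long)max_sleep_seconds(~0ULL, 1000000000));
	return 0;
}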
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6519cf62d9cd..e0980f0d9a0a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -185,7 +185,6 @@ static struct clocksource *watchdog; | |||
| 185 | static struct timer_list watchdog_timer; | 185 | static struct timer_list watchdog_timer; |
| 186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); | 186 | static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); |
| 187 | static DEFINE_SPINLOCK(watchdog_lock); | 187 | static DEFINE_SPINLOCK(watchdog_lock); |
| 188 | static cycle_t watchdog_last; | ||
| 189 | static int watchdog_running; | 188 | static int watchdog_running; |
| 190 | 189 | ||
| 191 | static int clocksource_watchdog_kthread(void *data); | 190 | static int clocksource_watchdog_kthread(void *data); |
| @@ -254,11 +253,6 @@ static void clocksource_watchdog(unsigned long data) | |||
| 254 | if (!watchdog_running) | 253 | if (!watchdog_running) |
| 255 | goto out; | 254 | goto out; |
| 256 | 255 | ||
| 257 | wdnow = watchdog->read(watchdog); | ||
| 258 | wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask, | ||
| 259 | watchdog->mult, watchdog->shift); | ||
| 260 | watchdog_last = wdnow; | ||
| 261 | |||
| 262 | list_for_each_entry(cs, &watchdog_list, wd_list) { | 256 | list_for_each_entry(cs, &watchdog_list, wd_list) { |
| 263 | 257 | ||
| 264 | /* Clocksource already marked unstable? */ | 258 | /* Clocksource already marked unstable? */ |
| @@ -268,19 +262,28 @@ static void clocksource_watchdog(unsigned long data) | |||
| 268 | continue; | 262 | continue; |
| 269 | } | 263 | } |
| 270 | 264 | ||
| 265 | local_irq_disable(); | ||
| 271 | csnow = cs->read(cs); | 266 | csnow = cs->read(cs); |
| 267 | wdnow = watchdog->read(watchdog); | ||
| 268 | local_irq_enable(); | ||
| 272 | 269 | ||
| 273 | /* Clocksource initialized ? */ | 270 | /* Clocksource initialized ? */ |
| 274 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { | 271 | if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { |
| 275 | cs->flags |= CLOCK_SOURCE_WATCHDOG; | 272 | cs->flags |= CLOCK_SOURCE_WATCHDOG; |
| 276 | cs->wd_last = csnow; | 273 | cs->wd_last = wdnow; |
| 274 | cs->cs_last = csnow; | ||
| 277 | continue; | 275 | continue; |
| 278 | } | 276 | } |
| 279 | 277 | ||
| 280 | /* Check the deviation from the watchdog clocksource. */ | 278 | wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, |
| 281 | cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) & | 279 | watchdog->mult, watchdog->shift); |
| 280 | |||
| 281 | cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & | ||
| 282 | cs->mask, cs->mult, cs->shift); | 282 | cs->mask, cs->mult, cs->shift); |
| 283 | cs->wd_last = csnow; | 283 | cs->cs_last = csnow; |
| 284 | cs->wd_last = wdnow; | ||
| 285 | |||
| 286 | /* Check the deviation from the watchdog clocksource. */ | ||
| 284 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { | 287 | if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { |
| 285 | clocksource_unstable(cs, cs_nsec - wd_nsec); | 288 | clocksource_unstable(cs, cs_nsec - wd_nsec); |
| 286 | continue; | 289 | continue; |
| @@ -318,7 +321,6 @@ static inline void clocksource_start_watchdog(void) | |||
| 318 | return; | 321 | return; |
| 319 | init_timer(&watchdog_timer); | 322 | init_timer(&watchdog_timer); |
| 320 | watchdog_timer.function = clocksource_watchdog; | 323 | watchdog_timer.function = clocksource_watchdog; |
| 321 | watchdog_last = watchdog->read(watchdog); | ||
| 322 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 324 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
| 323 | add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); | 325 | add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); |
| 324 | watchdog_running = 1; | 326 | watchdog_running = 1; |
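Editor's note: the watchdog loop above now reads both clocks back to back with interrupts disabled and keeps per-clocksource wd_last/cs_last values; each delta is masked to survive counter wrap before being scaled to nanoseconds. A userspace sketch of that delta computation (illustrative mult/shift values, not from any real clocksource):

#include <stdio.h>
#include <stdint.h>

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

static uint64_t delta_ns(uint64_t now, uint64_t last, uint64_t mask,
			 uint32_t mult, uint32_t shift)
{
	/* masking the subtraction handles a counter that wrapped between reads */
	return cyc2ns((now - last) & mask, mult, shift);
}

int main(void)
{
	/* 24bit counter wrapped: (0x10 - 0xfffff0) & 0xffffff = 0x20 cycles */
	uint64_t ns = delta_ns(0x10, 0xfffff0, 0xffffff, 1000, 0);
	printf("%llu ns\n", (unsigned long long)ns);	/* 32000 */
	return 0;
}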
| @@ -626,19 +628,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 626 | list_add(&cs->list, entry); | 628 | list_add(&cs->list, entry); |
| 627 | } | 629 | } |
| 628 | 630 | ||
| 629 | |||
| 630 | /* | ||
| 631 | * Maximum time we expect to go between ticks. This includes idle | ||
| 632 | * tickless time. It provides the trade off between selecting a | ||
| 633 | * mult/shift pair that is very precise but can only handle a short | ||
| 634 | * period of time, vs. a mult/shift pair that can handle long periods | ||
| 635 | * of time but isn't as precise. | ||
| 636 | * | ||
| 637 | * This is a subsystem constant, and actual hardware limitations | ||
| 638 | * may override it (ie: clocksources that wrap every 3 seconds). | ||
| 639 | */ | ||
| 640 | #define MAX_UPDATE_LENGTH 5 /* Seconds */ | ||
| 641 | |||
| 642 | /** | 631 | /** |
| 643 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq | 632 | * __clocksource_updatefreq_scale - Used to update clocksource with new freq |
| 644 | * @t: clocksource to be registered | 633 | * @t: clocksource to be registered |
| @@ -652,15 +641,28 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
| 652 | */ | 641 | */ |
| 653 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 642 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
| 654 | { | 643 | { |
| 644 | u64 sec; | ||
| 645 | |||
| 655 | /* | 646 | /* |
| 656 | * Ideally we want to use some of the limits used in | 647 | * Calc the maximum number of seconds which we can run before |
| 657 | * clocksource_max_deferment, to provide a more informed | 648 | * wrapping around. For clocksources which have a mask > 32bit |
| 658 | * MAX_UPDATE_LENGTH. But for now this just gets the | 649 | * we need to limit the max sleep time to have a good |
| 659 | * register interface working properly. | 650 | * conversion precision. 10 minutes is still a reasonable |
| 651 | * amount. That results in a shift value of 24 for a | ||
| 652 | * clocksource with mask >= 40bit and f >= 4GHz. That maps to | ||
| 653 | * ~ 0.06ppm granularity for NTP. We apply the same 12.5% | ||
| 654 | * margin as we do in clocksource_max_deferment() | ||
| 660 | */ | 655 | */ |
| 656 | sec = (cs->mask - (cs->mask >> 5)); | ||
| 657 | do_div(sec, freq); | ||
| 658 | do_div(sec, scale); | ||
| 659 | if (!sec) | ||
| 660 | sec = 1; | ||
| 661 | else if (sec > 600 && cs->mask > UINT_MAX) | ||
| 662 | sec = 600; | ||
| 663 | |||
| 661 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 664 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
| 662 | NSEC_PER_SEC/scale, | 665 | NSEC_PER_SEC / scale, sec * scale); |
| 663 | MAX_UPDATE_LENGTH*scale); | ||
| 664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 666 | cs->max_idle_ns = clocksource_max_deferment(cs); |
| 665 | } | 667 | } |
| 666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 668 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
| @@ -685,8 +687,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
| 685 | /* Add clocksource to the clocksource list */ | 687 | /* Add clocksource to the clocksource list */ |
| 686 | mutex_lock(&clocksource_mutex); | 688 | mutex_lock(&clocksource_mutex); |
| 687 | clocksource_enqueue(cs); | 689 | clocksource_enqueue(cs); |
| 688 | clocksource_select(); | ||
| 689 | clocksource_enqueue_watchdog(cs); | 690 | clocksource_enqueue_watchdog(cs); |
| 691 | clocksource_select(); | ||
| 690 | mutex_unlock(&clocksource_mutex); | 692 | mutex_unlock(&clocksource_mutex); |
| 691 | return 0; | 693 | return 0; |
| 692 | } | 694 | } |
| @@ -706,8 +708,8 @@ int clocksource_register(struct clocksource *cs) | |||
| 706 | 708 | ||
| 707 | mutex_lock(&clocksource_mutex); | 709 | mutex_lock(&clocksource_mutex); |
| 708 | clocksource_enqueue(cs); | 710 | clocksource_enqueue(cs); |
| 709 | clocksource_select(); | ||
| 710 | clocksource_enqueue_watchdog(cs); | 711 | clocksource_enqueue_watchdog(cs); |
| 712 | clocksource_select(); | ||
| 711 | mutex_unlock(&clocksource_mutex); | 713 | mutex_unlock(&clocksource_mutex); |
| 712 | return 0; | 714 | return 0; |
| 713 | } | 715 | } |
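Editor's note: __clocksource_updatefreq_scale() above now derives the usable interval from the counter mask (minus a mask >> 5 safety margin), divided by freq and scale, and clamps it to [1, 600] seconds for counters wider than 32 bits. A userspace sketch of that calculation, using standard C only:

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

static uint64_t usable_seconds(uint64_t mask, uint32_t scale, uint32_t freq)
{
	uint64_t sec = mask - (mask >> 5);	/* keep a small margin below wrap */

	sec /= freq;
	sec /= scale;

	if (!sec)
		sec = 1;
	else if (sec > 600 && mask > UINT_MAX)
		sec = 600;			/* cap wide counters at 10 minutes */

	return sec;
}

int main(void)
{
	/* 64bit counter at 2.4GHz: capped at 600s */
	printf("%llu\n", (unsigned long long)usable_seconds(~0ULL, 1, 2400000000U));
	/* 32bit counter at 32kHz: limited by wrap, not by the cap */
	printf("%llu\n", (unsigned long long)usable_seconds(0xffffffffULL, 1, 32768));
	return 0;
}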
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da800ffa810c..c7218d132738 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
| @@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
| 456 | unsigned long flags; | 456 | unsigned long flags; |
| 457 | int cpu; | 457 | int cpu; |
| 458 | 458 | ||
| 459 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
| 460 | |||
| 461 | /* | 459 | /* |
| 462 | * Periodic mode does not care about the enter/exit of power | 460 | * Periodic mode does not care about the enter/exit of power |
| 463 | * states | 461 | * states |
| 464 | */ | 462 | */ |
| 465 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) | 463 | if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) |
| 466 | goto out; | 464 | return; |
| 467 | 465 | ||
| 468 | bc = tick_broadcast_device.evtdev; | 466 | /* |
| 467 | * We are called with preemption disabled from the depth of the | ||
| 468 | * idle code, so we can't be moved away. | ||
| 469 | */ | ||
| 469 | cpu = smp_processor_id(); | 470 | cpu = smp_processor_id(); |
| 470 | td = &per_cpu(tick_cpu_device, cpu); | 471 | td = &per_cpu(tick_cpu_device, cpu); |
| 471 | dev = td->evtdev; | 472 | dev = td->evtdev; |
| 472 | 473 | ||
| 473 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) | 474 | if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) |
| 474 | goto out; | 475 | return; |
| 475 | 476 | ||
| 477 | bc = tick_broadcast_device.evtdev; | ||
| 478 | |||
| 479 | raw_spin_lock_irqsave(&tick_broadcast_lock, flags); | ||
| 476 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { | 480 | if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { |
| 477 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { | 481 | if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { |
| 478 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); | 482 | cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); |
| @@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) | |||
| 489 | tick_program_event(dev->next_event, 1); | 493 | tick_program_event(dev->next_event, 1); |
| 490 | } | 494 | } |
| 491 | } | 495 | } |
| 492 | |||
| 493 | out: | ||
| 494 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); | 496 | raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); |
| 495 | } | 497 | } |
| 496 | 498 | ||
| @@ -522,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, | |||
| 522 | */ | 524 | */ |
| 523 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 525 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
| 524 | { | 526 | { |
| 527 | int cpu = smp_processor_id(); | ||
| 528 | |||
| 525 | /* Set it up only once ! */ | 529 | /* Set it up only once ! */ |
| 526 | if (bc->event_handler != tick_handle_oneshot_broadcast) { | 530 | if (bc->event_handler != tick_handle_oneshot_broadcast) { |
| 527 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; | 531 | int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; |
| 528 | int cpu = smp_processor_id(); | ||
| 529 | 532 | ||
| 530 | bc->event_handler = tick_handle_oneshot_broadcast; | 533 | bc->event_handler = tick_handle_oneshot_broadcast; |
| 531 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); | 534 | clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); |
| @@ -551,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
| 551 | tick_broadcast_set_event(tick_next_period, 1); | 554 | tick_broadcast_set_event(tick_next_period, 1); |
| 552 | } else | 555 | } else |
| 553 | bc->next_event.tv64 = KTIME_MAX; | 556 | bc->next_event.tv64 = KTIME_MAX; |
| 557 | } else { | ||
| 558 | /* | ||
| 559 | * The first cpu which switches to oneshot mode sets | ||
| 560 | * the bit for all other cpus which are in the general | ||
| 561 | * (periodic) broadcast mask. So the bit is set and | ||
| 562 | * would prevent the first broadcast enter after this | ||
| 563 | * from programming the bc device. | ||
| 564 | */ | ||
| 565 | tick_broadcast_clear_oneshot(cpu); | ||
| 554 | } | 566 | } |
| 555 | } | 567 | } |
| 556 | 568 | ||
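Editor's note: the tick_broadcast_oneshot_control() rework above moves the cheap per-cpu checks in front of the broadcast lock, so the common "nothing to do" paths never take it. A userspace sketch of that restructuring, using a pthread mutex in place of the raw spinlock; the struct and field names are illustrative:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t broadcast_lock = PTHREAD_MUTEX_INITIALIZER;

struct dev { bool periodic; bool c3stop; };

static void oneshot_control(struct dev *d, bool enter)
{
	if (d->periodic)	/* periodic mode: nothing to do, no lock taken */
		return;
	if (!d->c3stop)		/* timer survives deep idle: no lock taken */
		return;

	pthread_mutex_lock(&broadcast_lock);
	/* ... update the broadcast mask and (re)program the device ... */
	(void)enter;
	pthread_mutex_unlock(&broadcast_lock);
}

int main(void)
{
	struct dev d = { .periodic = false, .c3stop = true };
	oneshot_control(&d, true);
	return 0;
}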
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8ad5d576755e..2b021b0e8507 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -596,6 +596,64 @@ void __init timekeeping_init(void) | |||
| 596 | static struct timespec timekeeping_suspend_time; | 596 | static struct timespec timekeeping_suspend_time; |
| 597 | 597 | ||
| 598 | /** | 598 | /** |
| 599 | * __timekeeping_inject_sleeptime - Internal function to add sleep interval | ||
| 600 | * @delta: pointer to a timespec delta value | ||
| 601 | * | ||
| 602 | * Takes a timespec offset measuring a suspend interval and properly | ||
| 603 | * adds the sleep offset to the timekeeping variables. | ||
| 604 | */ | ||
| 605 | static void __timekeeping_inject_sleeptime(struct timespec *delta) | ||
| 606 | { | ||
| 607 | if (!timespec_valid(delta)) { | ||
| 608 | printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " | ||
| 609 | "sleep delta value!\n"); | ||
| 610 | return; | ||
| 611 | } | ||
| 612 | |||
| 613 | xtime = timespec_add(xtime, *delta); | ||
| 614 | wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); | ||
| 615 | total_sleep_time = timespec_add(total_sleep_time, *delta); | ||
| 616 | } | ||
| 617 | |||
| 618 | |||
| 619 | /** | ||
| 620 | * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values | ||
| 621 | * @delta: pointer to a timespec delta value | ||
| 622 | * | ||
| 623 | * This hook is for architectures that cannot support read_persistent_clock | ||
| 624 | * because their RTC/persistent clock is only accessible when irqs are enabled. | ||
| 625 | * | ||
| 626 | * This function should only be called by rtc_resume(), and allows | ||
| 627 | * a suspend offset to be injected into the timekeeping values. | ||
| 628 | */ | ||
| 629 | void timekeeping_inject_sleeptime(struct timespec *delta) | ||
| 630 | { | ||
| 631 | unsigned long flags; | ||
| 632 | struct timespec ts; | ||
| 633 | |||
| 634 | /* Make sure we don't set the clock twice */ | ||
| 635 | read_persistent_clock(&ts); | ||
| 636 | if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) | ||
| 637 | return; | ||
| 638 | |||
| 639 | write_seqlock_irqsave(&xtime_lock, flags); | ||
| 640 | timekeeping_forward_now(); | ||
| 641 | |||
| 642 | __timekeeping_inject_sleeptime(delta); | ||
| 643 | |||
| 644 | timekeeper.ntp_error = 0; | ||
| 645 | ntp_clear(); | ||
| 646 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
| 647 | timekeeper.mult); | ||
| 648 | |||
| 649 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
| 650 | |||
| 651 | /* signal hrtimers about time change */ | ||
| 652 | clock_was_set(); | ||
| 653 | } | ||
| 654 | |||
| 655 | |||
| 656 | /** | ||
| 599 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | 657 | * timekeeping_resume - Resumes the generic timekeeping subsystem. |
| 600 | * | 658 | * |
| 601 | * This is for the generic clocksource timekeeping. | 659 | * This is for the generic clocksource timekeeping. |
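Editor's note: __timekeeping_inject_sleeptime() above adds the suspend interval to wall time and total sleep time and subtracts it from the wall-to-monotonic offset. A userspace sketch of that timespec arithmetic (simplified; the real code also resets NTP state and updates the vsyscall data):

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

static struct timespec ts_add(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec + b.tv_sec, a.tv_nsec + b.tv_nsec };
	if (r.tv_nsec >= NSEC_PER_SEC) { r.tv_sec++; r.tv_nsec -= NSEC_PER_SEC; }
	return r;
}

static struct timespec ts_sub(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };
	if (r.tv_nsec < 0) { r.tv_sec--; r.tv_nsec += NSEC_PER_SEC; }
	return r;
}

int main(void)
{
	struct timespec xtime = { 1000, 0 }, wtom = { -900, 0 }, sleep = { 0, 0 };
	struct timespec delta = { 30, 500000000 };	/* 30.5s spent suspended */

	xtime = ts_add(xtime, delta);	/* wall clock moves forward */
	wtom  = ts_sub(wtom, delta);	/* monotonic offset compensates */
	sleep = ts_add(sleep, delta);	/* boottime accounting */

	printf("xtime=%ld.%09ld wtom=%ld total_sleep=%ld.%09ld\n",
	       xtime.tv_sec, xtime.tv_nsec, wtom.tv_sec,
	       sleep.tv_sec, sleep.tv_nsec);
	return 0;
}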
| @@ -615,9 +673,7 @@ static void timekeeping_resume(void) | |||
| 615 | 673 | ||
| 616 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { | 674 | if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { |
| 617 | ts = timespec_sub(ts, timekeeping_suspend_time); | 675 | ts = timespec_sub(ts, timekeeping_suspend_time); |
| 618 | xtime = timespec_add(xtime, ts); | 676 | __timekeeping_inject_sleeptime(&ts); |
| 619 | wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); | ||
| 620 | total_sleep_time = timespec_add(total_sleep_time, ts); | ||
| 621 | } | 677 | } |
| 622 | /* re-base the last cycle value */ | 678 | /* re-base the last cycle value */ |
| 623 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); | 679 | timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); |
| @@ -630,18 +686,40 @@ static void timekeeping_resume(void) | |||
| 630 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); | 686 | clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); |
| 631 | 687 | ||
| 632 | /* Resume hrtimers */ | 688 | /* Resume hrtimers */ |
| 633 | hres_timers_resume(); | 689 | hrtimers_resume(); |
| 634 | } | 690 | } |
| 635 | 691 | ||
| 636 | static int timekeeping_suspend(void) | 692 | static int timekeeping_suspend(void) |
| 637 | { | 693 | { |
| 638 | unsigned long flags; | 694 | unsigned long flags; |
| 695 | struct timespec delta, delta_delta; | ||
| 696 | static struct timespec old_delta; | ||
| 639 | 697 | ||
| 640 | read_persistent_clock(&timekeeping_suspend_time); | 698 | read_persistent_clock(&timekeeping_suspend_time); |
| 641 | 699 | ||
| 642 | write_seqlock_irqsave(&xtime_lock, flags); | 700 | write_seqlock_irqsave(&xtime_lock, flags); |
| 643 | timekeeping_forward_now(); | 701 | timekeeping_forward_now(); |
| 644 | timekeeping_suspended = 1; | 702 | timekeeping_suspended = 1; |
| 703 | |||
| 704 | /* | ||
| 705 | * To avoid drift caused by repeated suspend/resumes, | ||
| 706 | * each of which can add ~1 second of drift error, | ||
| 707 | * try to compensate so the difference in system time | ||
| 708 | * and persistent_clock time stays close to constant. | ||
| 709 | */ | ||
| 710 | delta = timespec_sub(xtime, timekeeping_suspend_time); | ||
| 711 | delta_delta = timespec_sub(delta, old_delta); | ||
| 712 | if (abs(delta_delta.tv_sec) >= 2) { | ||
| 713 | /* | ||
| 714 | * if delta_delta is too large, assume time correction | ||
| 715 | * has occurred and set old_delta to the current delta. | ||
| 716 | */ | ||
| 717 | old_delta = delta; | ||
| 718 | } else { | ||
| 719 | /* Otherwise adjust the recorded suspend time to compensate */ | ||
| 720 | timekeeping_suspend_time = | ||
| 721 | timespec_add(timekeeping_suspend_time, delta_delta); | ||
| 722 | } | ||
| 645 | write_sequnlock_irqrestore(&xtime_lock, flags); | 723 | write_sequnlock_irqrestore(&xtime_lock, flags); |
| 646 | 724 | ||
| 647 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 725 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
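Editor's note: the suspend-path hunk above tracks the difference between system time and the persistent clock across suspends; changes under two seconds are folded back into the recorded suspend time, larger jumps reset the baseline. A userspace sketch of that compensation, reduced to whole seconds for clarity:

#include <stdio.h>
#include <stdlib.h>

static long old_delta;	/* system - persistent, from the previous suspend */

static long compensate(long xtime_sec, long persistent_sec)
{
	long delta = xtime_sec - persistent_sec;
	long delta_delta = delta - old_delta;

	if (labs(delta_delta) >= 2) {
		old_delta = delta;		/* time was corrected: new baseline */
		return persistent_sec;
	}
	return persistent_sec + delta_delta;	/* absorb sub-second drift */
}

int main(void)
{
	old_delta = 100;
	printf("%ld\n", compensate(1100, 999));		/* 1s of drift absorbed */
	printf("%ld\n", compensate(1200, 950));		/* big jump: baseline reset */
	return 0;
}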
| @@ -1049,6 +1127,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, | |||
| 1049 | } | 1127 | } |
| 1050 | 1128 | ||
| 1051 | /** | 1129 | /** |
| 1130 | * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format | ||
| 1131 | */ | ||
| 1132 | ktime_t ktime_get_monotonic_offset(void) | ||
| 1133 | { | ||
| 1134 | unsigned long seq; | ||
| 1135 | struct timespec wtom; | ||
| 1136 | |||
| 1137 | do { | ||
| 1138 | seq = read_seqbegin(&xtime_lock); | ||
| 1139 | wtom = wall_to_monotonic; | ||
| 1140 | } while (read_seqretry(&xtime_lock, seq)); | ||
| 1141 | return timespec_to_ktime(wtom); | ||
| 1142 | } | ||
| 1143 | |||
| 1144 | /** | ||
| 1052 | * xtime_update() - advances the timekeeping infrastructure | 1145 | * xtime_update() - advances the timekeeping infrastructure |
| 1053 | * @ticks: number of ticks, that have elapsed since the last call. | 1146 | * @ticks: number of ticks, that have elapsed since the last call. |
| 1054 | * | 1147 | * |
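Editor's note: ktime_get_monotonic_offset() above uses the usual seqlock read pattern: snapshot the protected data and retry if the sequence count changed or was odd during the copy. A userspace sketch of that reader side with C11 atomics; single-threaded here, so it is only illustrating the retry shape, not a full seqlock implementation:

#include <stdatomic.h>
#include <stdio.h>

struct snap { long sec; long nsec; };

static _Atomic unsigned seq;			/* even = stable, odd = writer active */
static struct snap wall_to_monotonic = { -5, 0 };

static struct snap read_offset(void)
{
	struct snap s;
	unsigned start;

	do {
		start = atomic_load(&seq);
		s = wall_to_monotonic;		/* copy the protected data */
	} while ((start & 1) || atomic_load(&seq) != start);

	return s;
}

int main(void)
{
	struct snap s = read_offset();
	printf("%ld.%09ld\n", s.sec, s.nsec);
	return 0;
}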
diff --git a/kernel/timer.c b/kernel/timer.c index fd6198692b57..8cff36119e4d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
| 749 | unsigned long expires_limit, mask; | 749 | unsigned long expires_limit, mask; |
| 750 | int bit; | 750 | int bit; |
| 751 | 751 | ||
| 752 | expires_limit = expires; | ||
| 753 | |||
| 754 | if (timer->slack >= 0) { | 752 | if (timer->slack >= 0) { |
| 755 | expires_limit = expires + timer->slack; | 753 | expires_limit = expires + timer->slack; |
| 756 | } else { | 754 | } else { |
| 757 | unsigned long now = jiffies; | 755 | long delta = expires - jiffies; |
| 756 | |||
| 757 | if (delta < 256) | ||
| 758 | return expires; | ||
| 758 | 759 | ||
| 759 | /* No slack, if already expired else auto slack 0.4% */ | 760 | expires_limit = expires + delta / 256; |
| 760 | if (time_after(expires, now)) | ||
| 761 | expires_limit = expires + (expires - now)/256; | ||
| 762 | } | 761 | } |
| 763 | mask = expires ^ expires_limit; | 762 | mask = expires ^ expires_limit; |
| 764 | if (mask == 0) | 763 | if (mask == 0) |
| @@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) | |||
| 795 | */ | 794 | */ |
| 796 | int mod_timer(struct timer_list *timer, unsigned long expires) | 795 | int mod_timer(struct timer_list *timer, unsigned long expires) |
| 797 | { | 796 | { |
| 797 | expires = apply_slack(timer, expires); | ||
| 798 | |||
| 798 | /* | 799 | /* |
| 799 | * This is a common optimization triggered by the | 800 | * This is a common optimization triggered by the |
| 800 | * networking code - if the timer is re-modified | 801 | * networking code - if the timer is re-modified |
| @@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
| 803 | if (timer_pending(timer) && timer->expires == expires) | 804 | if (timer_pending(timer) && timer->expires == expires) |
| 804 | return 1; | 805 | return 1; |
| 805 | 806 | ||
| 806 | expires = apply_slack(timer, expires); | ||
| 807 | |||
| 808 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); | 807 | return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); |
| 809 | } | 808 | } |
| 810 | EXPORT_SYMBOL(mod_timer); | 809 | EXPORT_SYMBOL(mod_timer); |
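Editor's note: apply_slack() above allows a window of expires .. expires + delta/256 (about 0.4%) and rounds the expiry by clearing every bit below the highest bit where the two ends of the window differ, so nearby timers coalesce onto the same jiffy. A userspace sketch of that rounding; __builtin_clzl is a GCC/Clang builtin and a 64-bit long is assumed:

#include <stdio.h>

static unsigned long apply_slack(unsigned long expires, unsigned long now)
{
	long delta = expires - now;
	unsigned long expires_limit, mask;
	int bit;

	if (delta < 256)
		return expires;			/* too close: no slack */

	expires_limit = expires + delta / 256;	/* ~0.4% window */

	mask = expires ^ expires_limit;
	if (!mask)
		return expires;

	bit = 63 - __builtin_clzl(mask);	/* highest differing bit */
	mask = (1UL << bit) - 1;

	return expires_limit & ~mask;		/* round within the window */
}

int main(void)
{
	/* now=1000, expires=11000: window [11000, 11039], rounded to 11008 */
	printf("%lu\n", apply_slack(11000, 1000));
	return 0;
}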
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2ad39e556cb4..cd3134510f3d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED | |||
| 82 | power:power_frequency | 82 | power:power_frequency |
| 83 | This is for userspace compatibility | 83 | This is for userspace compatibility |
| 84 | and will vanish after 5 kernel iterations, | 84 | and will vanish after 5 kernel iterations, |
| 85 | namely 2.6.41. | 85 | namely 3.1. |
| 86 | 86 | ||
| 87 | config CONTEXT_SWITCH_TRACER | 87 | config CONTEXT_SWITCH_TRACER |
| 88 | bool | 88 | bool |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ee24fa1935ac..c3e4575e7829 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -32,27 +32,32 @@ | |||
| 32 | 32 | ||
| 33 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
| 34 | 34 | ||
| 35 | #include <asm/ftrace.h> | ||
| 36 | #include <asm/setup.h> | 35 | #include <asm/setup.h> |
| 37 | 36 | ||
| 38 | #include "trace_output.h" | 37 | #include "trace_output.h" |
| 39 | #include "trace_stat.h" | 38 | #include "trace_stat.h" |
| 40 | 39 | ||
| 41 | #define FTRACE_WARN_ON(cond) \ | 40 | #define FTRACE_WARN_ON(cond) \ |
| 42 | do { \ | 41 | ({ \ |
| 43 | if (WARN_ON(cond)) \ | 42 | int ___r = cond; \ |
| 43 | if (WARN_ON(___r)) \ | ||
| 44 | ftrace_kill(); \ | 44 | ftrace_kill(); \ |
| 45 | } while (0) | 45 | ___r; \ |
| 46 | }) | ||
| 46 | 47 | ||
| 47 | #define FTRACE_WARN_ON_ONCE(cond) \ | 48 | #define FTRACE_WARN_ON_ONCE(cond) \ |
| 48 | do { \ | 49 | ({ \ |
| 49 | if (WARN_ON_ONCE(cond)) \ | 50 | int ___r = cond; \ |
| 51 | if (WARN_ON_ONCE(___r)) \ | ||
| 50 | ftrace_kill(); \ | 52 | ftrace_kill(); \ |
| 51 | } while (0) | 53 | ___r; \ |
| 54 | }) | ||
| 52 | 55 | ||
| 53 | /* hash bits for specific function selection */ | 56 | /* hash bits for specific function selection */ |
| 54 | #define FTRACE_HASH_BITS 7 | 57 | #define FTRACE_HASH_BITS 7 |
| 55 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) | 58 | #define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) |
| 59 | #define FTRACE_HASH_DEFAULT_BITS 10 | ||
| 60 | #define FTRACE_HASH_MAX_BITS 12 | ||
| 56 | 61 | ||
| 57 | /* ftrace_enabled is a method to turn ftrace on or off */ | 62 | /* ftrace_enabled is a method to turn ftrace on or off */ |
| 58 | int ftrace_enabled __read_mostly; | 63 | int ftrace_enabled __read_mostly; |
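Editor's note: the hunk above turns the FTRACE_WARN_ON() do/while(0) macros into GNU statement expressions so callers can test the condition's value while the kill side effect still runs. A userspace sketch of the same pattern with made-up names (not the kernel macros):

#include <stdio.h>

static void kill_tracer(void) { puts("tracer disabled"); }

#define WARN_ON_AND_KILL(cond)				\
({							\
	int ___r = !!(cond);				\
	if (___r) {					\
		fprintf(stderr, "warn: %s\n", #cond);	\
		kill_tracer();				\
	}						\
	___r;	/* value of the whole expression */	\
})

int main(void)
{
	if (WARN_ON_AND_KILL(1 + 1 != 2))
		return 1;		/* not taken */
	if (WARN_ON_AND_KILL(sizeof(int) < 2))
		return 1;		/* not taken either */
	puts("ok");
	return 0;
}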
| @@ -76,33 +81,45 @@ static int ftrace_disabled __read_mostly; | |||
| 76 | 81 | ||
| 77 | static DEFINE_MUTEX(ftrace_lock); | 82 | static DEFINE_MUTEX(ftrace_lock); |
| 78 | 83 | ||
| 79 | static struct ftrace_ops ftrace_list_end __read_mostly = | 84 | static struct ftrace_ops ftrace_list_end __read_mostly = { |
| 80 | { | ||
| 81 | .func = ftrace_stub, | 85 | .func = ftrace_stub, |
| 82 | }; | 86 | }; |
| 83 | 87 | ||
| 84 | static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; | 88 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
| 89 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | ||
| 85 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 90 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
| 91 | static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; | ||
| 86 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | 92 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; |
| 87 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 93 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
| 94 | static struct ftrace_ops global_ops; | ||
| 95 | |||
| 96 | static void | ||
| 97 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | ||
| 88 | 98 | ||
| 89 | /* | 99 | /* |
| 90 | * Traverse the ftrace_list, invoking all entries. The reason that we | 100 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
| 91 | * can use rcu_dereference_raw() is that elements removed from this list | 101 | * can use rcu_dereference_raw() is that elements removed from this list |
| 92 | * are simply leaked, so there is no need to interact with a grace-period | 102 | * are simply leaked, so there is no need to interact with a grace-period |
| 93 | * mechanism. The rcu_dereference_raw() calls are needed to handle | 103 | * mechanism. The rcu_dereference_raw() calls are needed to handle |
| 94 | * concurrent insertions into the ftrace_list. | 104 | * concurrent insertions into the ftrace_global_list. |
| 95 | * | 105 | * |
| 96 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 106 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
| 97 | */ | 107 | */ |
| 98 | static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) | 108 | static void ftrace_global_list_func(unsigned long ip, |
| 109 | unsigned long parent_ip) | ||
| 99 | { | 110 | { |
| 100 | struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ | 111 | struct ftrace_ops *op; |
| 112 | |||
| 113 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) | ||
| 114 | return; | ||
| 101 | 115 | ||
| 116 | trace_recursion_set(TRACE_GLOBAL_BIT); | ||
| 117 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ | ||
| 102 | while (op != &ftrace_list_end) { | 118 | while (op != &ftrace_list_end) { |
| 103 | op->func(ip, parent_ip); | 119 | op->func(ip, parent_ip); |
| 104 | op = rcu_dereference_raw(op->next); /*see above*/ | 120 | op = rcu_dereference_raw(op->next); /*see above*/ |
| 105 | }; | 121 | }; |
| 122 | trace_recursion_clear(TRACE_GLOBAL_BIT); | ||
| 106 | } | 123 | } |
| 107 | 124 | ||
| 108 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) | 125 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) |
| @@ -130,9 +147,11 @@ void clear_ftrace_function(void) | |||
| 130 | { | 147 | { |
| 131 | ftrace_trace_function = ftrace_stub; | 148 | ftrace_trace_function = ftrace_stub; |
| 132 | __ftrace_trace_function = ftrace_stub; | 149 | __ftrace_trace_function = ftrace_stub; |
| 150 | __ftrace_trace_function_delay = ftrace_stub; | ||
| 133 | ftrace_pid_function = ftrace_stub; | 151 | ftrace_pid_function = ftrace_stub; |
| 134 | } | 152 | } |
| 135 | 153 | ||
| 154 | #undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
| 136 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 155 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
| 137 | /* | 156 | /* |
| 138 | * For those archs that do not test ftrace_trace_stop in their | 157 | * For those archs that do not test ftrace_trace_stop in their |
| @@ -147,46 +166,74 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | |||
| 147 | } | 166 | } |
| 148 | #endif | 167 | #endif |
| 149 | 168 | ||
| 150 | static int __register_ftrace_function(struct ftrace_ops *ops) | 169 | static void update_global_ops(void) |
| 151 | { | 170 | { |
| 152 | ops->next = ftrace_list; | 171 | ftrace_func_t func; |
| 172 | |||
| 153 | /* | 173 | /* |
| 154 | * We are entering ops into the ftrace_list but another | 174 | * If there's only one function registered, then call that |
| 155 | * CPU might be walking that list. We need to make sure | 175 | * function directly. Otherwise, we need to iterate over the |
| 156 | * the ops->next pointer is valid before another CPU sees | 176 | * registered callers. |
| 157 | * the ops pointer included into the ftrace_list. | ||
| 158 | */ | 177 | */ |
| 159 | rcu_assign_pointer(ftrace_list, ops); | 178 | if (ftrace_global_list == &ftrace_list_end || |
| 179 | ftrace_global_list->next == &ftrace_list_end) | ||
| 180 | func = ftrace_global_list->func; | ||
| 181 | else | ||
| 182 | func = ftrace_global_list_func; | ||
| 160 | 183 | ||
| 161 | if (ftrace_enabled) { | 184 | /* If we filter on pids, update to use the pid function */ |
| 162 | ftrace_func_t func; | 185 | if (!list_empty(&ftrace_pids)) { |
| 186 | set_ftrace_pid_function(func); | ||
| 187 | func = ftrace_pid_func; | ||
| 188 | } | ||
| 163 | 189 | ||
| 164 | if (ops->next == &ftrace_list_end) | 190 | global_ops.func = func; |
| 165 | func = ops->func; | 191 | } |
| 166 | else | ||
| 167 | func = ftrace_list_func; | ||
| 168 | 192 | ||
| 169 | if (!list_empty(&ftrace_pids)) { | 193 | static void update_ftrace_function(void) |
| 170 | set_ftrace_pid_function(func); | 194 | { |
| 171 | func = ftrace_pid_func; | 195 | ftrace_func_t func; |
| 172 | } | 196 | |
| 197 | update_global_ops(); | ||
| 198 | |||
| 199 | /* | ||
| 200 | * If we are at the end of the list and this ops is | ||
| 201 | * not dynamic, then have the mcount trampoline call | ||
| 202 | * the function directly | ||
| 203 | */ | ||
| 204 | if (ftrace_ops_list == &ftrace_list_end || | ||
| 205 | (ftrace_ops_list->next == &ftrace_list_end && | ||
| 206 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) | ||
| 207 | func = ftrace_ops_list->func; | ||
| 208 | else | ||
| 209 | func = ftrace_ops_list_func; | ||
| 173 | 210 | ||
| 174 | /* | ||
| 175 | * For one func, simply call it directly. | ||
| 176 | * For more than one func, call the chain. | ||
| 177 | */ | ||
| 178 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 211 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST |
| 179 | ftrace_trace_function = func; | 212 | ftrace_trace_function = func; |
| 180 | #else | 213 | #else |
| 181 | __ftrace_trace_function = func; | 214 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 182 | ftrace_trace_function = ftrace_test_stop_func; | 215 | /* do not update till all functions have been modified */ |
| 216 | __ftrace_trace_function_delay = func; | ||
| 217 | #else | ||
| 218 | __ftrace_trace_function = func; | ||
| 183 | #endif | 219 | #endif |
| 184 | } | 220 | ftrace_trace_function = ftrace_test_stop_func; |
| 221 | #endif | ||
| 222 | } | ||
| 185 | 223 | ||
| 186 | return 0; | 224 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
| 225 | { | ||
| 226 | ops->next = *list; | ||
| 227 | /* | ||
| 228 | * We are entering ops into the list but another | ||
| 229 | * CPU might be walking that list. We need to make sure | ||
| 230 | * the ops->next pointer is valid before another CPU sees | ||
| 231 | * the ops pointer included into the list. | ||
| 232 | */ | ||
| 233 | rcu_assign_pointer(*list, ops); | ||
| 187 | } | 234 | } |
| 188 | 235 | ||
| 189 | static int __unregister_ftrace_function(struct ftrace_ops *ops) | 236 | static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
| 190 | { | 237 | { |
| 191 | struct ftrace_ops **p; | 238 | struct ftrace_ops **p; |
| 192 | 239 | ||
| @@ -194,13 +241,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
| 194 | * If we are removing the last function, then simply point | 241 | * If we are removing the last function, then simply point |
| 195 | * to the ftrace_stub. | 242 | * to the ftrace_stub. |
| 196 | */ | 243 | */ |
| 197 | if (ftrace_list == ops && ops->next == &ftrace_list_end) { | 244 | if (*list == ops && ops->next == &ftrace_list_end) { |
| 198 | ftrace_trace_function = ftrace_stub; | 245 | *list = &ftrace_list_end; |
| 199 | ftrace_list = &ftrace_list_end; | ||
| 200 | return 0; | 246 | return 0; |
| 201 | } | 247 | } |
| 202 | 248 | ||
| 203 | for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) | 249 | for (p = list; *p != &ftrace_list_end; p = &(*p)->next) |
| 204 | if (*p == ops) | 250 | if (*p == ops) |
| 205 | break; | 251 | break; |
| 206 | 252 | ||
| @@ -208,53 +254,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) | |||
| 208 | return -1; | 254 | return -1; |
| 209 | 255 | ||
| 210 | *p = (*p)->next; | 256 | *p = (*p)->next; |
| 257 | return 0; | ||
| 258 | } | ||
| 211 | 259 | ||
| 212 | if (ftrace_enabled) { | 260 | static int __register_ftrace_function(struct ftrace_ops *ops) |
| 213 | /* If we only have one func left, then call that directly */ | 261 | { |
| 214 | if (ftrace_list->next == &ftrace_list_end) { | 262 | if (ftrace_disabled) |
| 215 | ftrace_func_t func = ftrace_list->func; | 263 | return -ENODEV; |
| 216 | 264 | ||
| 217 | if (!list_empty(&ftrace_pids)) { | 265 | if (FTRACE_WARN_ON(ops == &global_ops)) |
| 218 | set_ftrace_pid_function(func); | 266 | return -EINVAL; |
| 219 | func = ftrace_pid_func; | 267 | |
| 220 | } | 268 | if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) |
| 221 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 269 | return -EBUSY; |
| 222 | ftrace_trace_function = func; | 270 | |
| 223 | #else | 271 | if (!core_kernel_data((unsigned long)ops)) |
| 224 | __ftrace_trace_function = func; | 272 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
| 225 | #endif | 273 | |
| 226 | } | 274 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
| 227 | } | 275 | int first = ftrace_global_list == &ftrace_list_end; |
| 276 | add_ftrace_ops(&ftrace_global_list, ops); | ||
| 277 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
| 278 | if (first) | ||
| 279 | add_ftrace_ops(&ftrace_ops_list, &global_ops); | ||
| 280 | } else | ||
| 281 | add_ftrace_ops(&ftrace_ops_list, ops); | ||
| 282 | |||
| 283 | if (ftrace_enabled) | ||
| 284 | update_ftrace_function(); | ||
| 228 | 285 | ||
| 229 | return 0; | 286 | return 0; |
| 230 | } | 287 | } |
| 231 | 288 | ||
| 232 | static void ftrace_update_pid_func(void) | 289 | static int __unregister_ftrace_function(struct ftrace_ops *ops) |
| 233 | { | 290 | { |
| 234 | ftrace_func_t func; | 291 | int ret; |
| 235 | 292 | ||
| 236 | if (ftrace_trace_function == ftrace_stub) | 293 | if (ftrace_disabled) |
| 237 | return; | 294 | return -ENODEV; |
| 238 | 295 | ||
| 239 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 296 | if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) |
| 240 | func = ftrace_trace_function; | 297 | return -EBUSY; |
| 241 | #else | ||
| 242 | func = __ftrace_trace_function; | ||
| 243 | #endif | ||
| 244 | 298 | ||
| 245 | if (!list_empty(&ftrace_pids)) { | 299 | if (FTRACE_WARN_ON(ops == &global_ops)) |
| 246 | set_ftrace_pid_function(func); | 300 | return -EINVAL; |
| 247 | func = ftrace_pid_func; | ||
| 248 | } else { | ||
| 249 | if (func == ftrace_pid_func) | ||
| 250 | func = ftrace_pid_function; | ||
| 251 | } | ||
| 252 | 301 | ||
| 253 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | 302 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { |
| 254 | ftrace_trace_function = func; | 303 | ret = remove_ftrace_ops(&ftrace_global_list, ops); |
| 255 | #else | 304 | if (!ret && ftrace_global_list == &ftrace_list_end) |
| 256 | __ftrace_trace_function = func; | 305 | ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); |
| 257 | #endif | 306 | if (!ret) |
| 307 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
| 308 | } else | ||
| 309 | ret = remove_ftrace_ops(&ftrace_ops_list, ops); | ||
| 310 | |||
| 311 | if (ret < 0) | ||
| 312 | return ret; | ||
| 313 | |||
| 314 | if (ftrace_enabled) | ||
| 315 | update_ftrace_function(); | ||
| 316 | |||
| 317 | /* | ||
| 318 | * Dynamic ops may be freed, we must make sure that all | ||
| 319 | * callers are done before leaving this function. | ||
| 320 | */ | ||
| 321 | if (ops->flags & FTRACE_OPS_FL_DYNAMIC) | ||
| 322 | synchronize_sched(); | ||
| 323 | |||
| 324 | return 0; | ||
| 325 | } | ||
| 326 | |||
| 327 | static void ftrace_update_pid_func(void) | ||
| 328 | { | ||
| 329 | /* Only do something if we are tracing something */ | ||
| 330 | if (ftrace_trace_function == ftrace_stub) | ||
| 331 | return; | ||
| 332 | |||
| 333 | update_ftrace_function(); | ||
| 258 | } | 334 | } |
| 259 | 335 | ||
| 260 | #ifdef CONFIG_FUNCTION_PROFILER | 336 | #ifdef CONFIG_FUNCTION_PROFILER |
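Editor's note: update_ftrace_function() above picks the trampoline target: a single static ops gets called directly, anything else goes through the list iterator. A userspace sketch of that dispatch policy with a sentinel-terminated list; names and the NULL handling are illustrative, and the RCU publication used by the kernel is omitted:

#include <stdio.h>

typedef void (*trace_fn)(unsigned long ip);

struct ops { struct ops *next; trace_fn func; };

static struct ops list_end = { .next = &list_end };	/* sentinel */
static struct ops *ops_list = &list_end;
static trace_fn trace_function;

static void list_func(unsigned long ip)
{
	for (struct ops *op = ops_list; op != &list_end; op = op->next)
		op->func(ip);
}

static void update_function(void)
{
	if (ops_list == &list_end)
		trace_function = NULL;			/* nothing registered */
	else if (ops_list->next == &list_end)
		trace_function = ops_list->func;	/* one ops: call it directly */
	else
		trace_function = list_func;		/* several: walk the list */
}

static void cb_a(unsigned long ip) { printf("a %lx\n", ip); }
static void cb_b(unsigned long ip) { printf("b %lx\n", ip); }

int main(void)
{
	struct ops a = { .next = ops_list, .func = cb_a };
	ops_list = &a;
	update_function();
	trace_function(0x100);		/* direct call to cb_a */

	struct ops b = { .next = ops_list, .func = cb_b };
	ops_list = &b;
	update_function();
	trace_function(0x200);		/* iterator: cb_b then cb_a */
	return 0;
}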
| @@ -715,8 +791,7 @@ static void unregister_ftrace_profiler(void) | |||
| 715 | unregister_ftrace_graph(); | 791 | unregister_ftrace_graph(); |
| 716 | } | 792 | } |
| 717 | #else | 793 | #else |
| 718 | static struct ftrace_ops ftrace_profile_ops __read_mostly = | 794 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { |
| 719 | { | ||
| 720 | .func = function_profile_call, | 795 | .func = function_profile_call, |
| 721 | }; | 796 | }; |
| 722 | 797 | ||
| @@ -736,19 +811,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, | |||
| 736 | size_t cnt, loff_t *ppos) | 811 | size_t cnt, loff_t *ppos) |
| 737 | { | 812 | { |
| 738 | unsigned long val; | 813 | unsigned long val; |
| 739 | char buf[64]; /* big enough to hold a number */ | ||
| 740 | int ret; | 814 | int ret; |
| 741 | 815 | ||
| 742 | if (cnt >= sizeof(buf)) | 816 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 743 | return -EINVAL; | 817 | if (ret) |
| 744 | |||
| 745 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 746 | return -EFAULT; | ||
| 747 | |||
| 748 | buf[cnt] = 0; | ||
| 749 | |||
| 750 | ret = strict_strtoul(buf, 10, &val); | ||
| 751 | if (ret < 0) | ||
| 752 | return ret; | 818 | return ret; |
| 753 | 819 | ||
| 754 | val = !!val; | 820 | val = !!val; |
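Editor's note: the hunk above replaces the hand-rolled copy-and-parse in ftrace_profile_write() with kstrtoul_from_user(). A userspace sketch of what that helper does conceptually: copy a bounded, NUL-terminated buffer and parse it strictly, rejecting trailing junk other than a newline.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_ulong(const char *ubuf, size_t cnt, unsigned long *val)
{
	char buf[64];
	char *end;

	if (cnt >= sizeof(buf))
		return -EINVAL;			/* refuse oversized input */
	memcpy(buf, ubuf, cnt);
	buf[cnt] = '\0';

	errno = 0;
	*val = strtoul(buf, &end, 10);
	if (errno || end == buf || (*end && *end != '\n'))
		return -EINVAL;			/* not a clean decimal number */
	return 0;
}

int main(void)
{
	unsigned long v;
	printf("%d %lu\n", parse_ulong("1\n", 2, &v), v);	/* 0 1 */
	printf("%d\n", parse_ulong("abc", 3, &v));		/* negative: invalid */
	return 0;
}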
| @@ -888,8 +954,35 @@ enum { | |||
| 888 | FTRACE_START_FUNC_RET = (1 << 3), | 954 | FTRACE_START_FUNC_RET = (1 << 3), |
| 889 | FTRACE_STOP_FUNC_RET = (1 << 4), | 955 | FTRACE_STOP_FUNC_RET = (1 << 4), |
| 890 | }; | 956 | }; |
| 957 | struct ftrace_func_entry { | ||
| 958 | struct hlist_node hlist; | ||
| 959 | unsigned long ip; | ||
| 960 | }; | ||
| 891 | 961 | ||
| 892 | static int ftrace_filtered; | 962 | struct ftrace_hash { |
| 963 | unsigned long size_bits; | ||
| 964 | struct hlist_head *buckets; | ||
| 965 | unsigned long count; | ||
| 966 | struct rcu_head rcu; | ||
| 967 | }; | ||
| 968 | |||
| 969 | /* | ||
| 970 | * We make these constant because no one should touch them, | ||
| 971 | * but they are used as the default "empty hash", to avoid allocating | ||
| 972 | * it all the time. These are in a read only section such that if | ||
| 973 | * anyone does try to modify it, it will cause an exception. | ||
| 974 | */ | ||
| 975 | static const struct hlist_head empty_buckets[1]; | ||
| 976 | static const struct ftrace_hash empty_hash = { | ||
| 977 | .buckets = (struct hlist_head *)empty_buckets, | ||
| 978 | }; | ||
| 979 | #define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) | ||
| 980 | |||
| 981 | static struct ftrace_ops global_ops = { | ||
| 982 | .func = ftrace_stub, | ||
| 983 | .notrace_hash = EMPTY_HASH, | ||
| 984 | .filter_hash = EMPTY_HASH, | ||
| 985 | }; | ||
| 893 | 986 | ||
| 894 | static struct dyn_ftrace *ftrace_new_addrs; | 987 | static struct dyn_ftrace *ftrace_new_addrs; |
| 895 | 988 | ||
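Editor's note: the hunk above introduces EMPTY_HASH, a single read-only instance that stands in for every empty filter/notrace hash, so no allocation is needed to represent "no entries" and stray writes fault instead of corrupting shared state. A userspace sketch of that sentinel trick with simplified types:

#include <stdio.h>

struct bucket { struct bucket *next; };
struct table {
	unsigned long size_bits;
	const struct bucket *buckets;
	unsigned long count;
};

/* one shared, zero-filled, read-only "empty" instance */
static const struct bucket empty_buckets[1];
static const struct table empty_table = { .buckets = empty_buckets };
#define EMPTY_TABLE ((struct table *)&empty_table)

static int table_is_empty(const struct table *t)
{
	return !t || !t->count;
}

int main(void)
{
	struct table *t = EMPTY_TABLE;		/* no allocation needed */
	printf("empty: %d\n", table_is_empty(t));
	return 0;
}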
| @@ -912,6 +1005,292 @@ static struct ftrace_page *ftrace_pages; | |||
| 912 | 1005 | ||
| 913 | static struct dyn_ftrace *ftrace_free_records; | 1006 | static struct dyn_ftrace *ftrace_free_records; |
| 914 | 1007 | ||
| 1008 | static struct ftrace_func_entry * | ||
| 1009 | ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) | ||
| 1010 | { | ||
| 1011 | unsigned long key; | ||
| 1012 | struct ftrace_func_entry *entry; | ||
| 1013 | struct hlist_head *hhd; | ||
| 1014 | struct hlist_node *n; | ||
| 1015 | |||
| 1016 | if (!hash->count) | ||
| 1017 | return NULL; | ||
| 1018 | |||
| 1019 | if (hash->size_bits > 0) | ||
| 1020 | key = hash_long(ip, hash->size_bits); | ||
| 1021 | else | ||
| 1022 | key = 0; | ||
| 1023 | |||
| 1024 | hhd = &hash->buckets[key]; | ||
| 1025 | |||
| 1026 | hlist_for_each_entry_rcu(entry, n, hhd, hlist) { | ||
| 1027 | if (entry->ip == ip) | ||
| 1028 | return entry; | ||
| 1029 | } | ||
| 1030 | return NULL; | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | static void __add_hash_entry(struct ftrace_hash *hash, | ||
| 1034 | struct ftrace_func_entry *entry) | ||
| 1035 | { | ||
| 1036 | struct hlist_head *hhd; | ||
| 1037 | unsigned long key; | ||
| 1038 | |||
| 1039 | if (hash->size_bits) | ||
| 1040 | key = hash_long(entry->ip, hash->size_bits); | ||
| 1041 | else | ||
| 1042 | key = 0; | ||
| 1043 | |||
| 1044 | hhd = &hash->buckets[key]; | ||
| 1045 | hlist_add_head(&entry->hlist, hhd); | ||
| 1046 | hash->count++; | ||
| 1047 | } | ||
| 1048 | |||
| 1049 | static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) | ||
| 1050 | { | ||
| 1051 | struct ftrace_func_entry *entry; | ||
| 1052 | |||
| 1053 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | ||
| 1054 | if (!entry) | ||
| 1055 | return -ENOMEM; | ||
| 1056 | |||
| 1057 | entry->ip = ip; | ||
| 1058 | __add_hash_entry(hash, entry); | ||
| 1059 | |||
| 1060 | return 0; | ||
| 1061 | } | ||
| 1062 | |||
| 1063 | static void | ||
| 1064 | free_hash_entry(struct ftrace_hash *hash, | ||
| 1065 | struct ftrace_func_entry *entry) | ||
| 1066 | { | ||
| 1067 | hlist_del(&entry->hlist); | ||
| 1068 | kfree(entry); | ||
| 1069 | hash->count--; | ||
| 1070 | } | ||
| 1071 | |||
| 1072 | static void | ||
| 1073 | remove_hash_entry(struct ftrace_hash *hash, | ||
| 1074 | struct ftrace_func_entry *entry) | ||
| 1075 | { | ||
| 1076 | hlist_del(&entry->hlist); | ||
| 1077 | hash->count--; | ||
| 1078 | } | ||
| 1079 | |||
| 1080 | static void ftrace_hash_clear(struct ftrace_hash *hash) | ||
| 1081 | { | ||
| 1082 | struct hlist_head *hhd; | ||
| 1083 | struct hlist_node *tp, *tn; | ||
| 1084 | struct ftrace_func_entry *entry; | ||
| 1085 | int size = 1 << hash->size_bits; | ||
| 1086 | int i; | ||
| 1087 | |||
| 1088 | if (!hash->count) | ||
| 1089 | return; | ||
| 1090 | |||
| 1091 | for (i = 0; i < size; i++) { | ||
| 1092 | hhd = &hash->buckets[i]; | ||
| 1093 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) | ||
| 1094 | free_hash_entry(hash, entry); | ||
| 1095 | } | ||
| 1096 | FTRACE_WARN_ON(hash->count); | ||
| 1097 | } | ||
| 1098 | |||
| 1099 | static void free_ftrace_hash(struct ftrace_hash *hash) | ||
| 1100 | { | ||
| 1101 | if (!hash || hash == EMPTY_HASH) | ||
| 1102 | return; | ||
| 1103 | ftrace_hash_clear(hash); | ||
| 1104 | kfree(hash->buckets); | ||
| 1105 | kfree(hash); | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | static void __free_ftrace_hash_rcu(struct rcu_head *rcu) | ||
| 1109 | { | ||
| 1110 | struct ftrace_hash *hash; | ||
| 1111 | |||
| 1112 | hash = container_of(rcu, struct ftrace_hash, rcu); | ||
| 1113 | free_ftrace_hash(hash); | ||
| 1114 | } | ||
| 1115 | |||
| 1116 | static void free_ftrace_hash_rcu(struct ftrace_hash *hash) | ||
| 1117 | { | ||
| 1118 | if (!hash || hash == EMPTY_HASH) | ||
| 1119 | return; | ||
| 1120 | call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); | ||
| 1121 | } | ||
| 1122 | |||
| 1123 | static struct ftrace_hash *alloc_ftrace_hash(int size_bits) | ||
| 1124 | { | ||
| 1125 | struct ftrace_hash *hash; | ||
| 1126 | int size; | ||
| 1127 | |||
| 1128 | hash = kzalloc(sizeof(*hash), GFP_KERNEL); | ||
| 1129 | if (!hash) | ||
| 1130 | return NULL; | ||
| 1131 | |||
| 1132 | size = 1 << size_bits; | ||
| 1133 | hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); | ||
| 1134 | |||
| 1135 | if (!hash->buckets) { | ||
| 1136 | kfree(hash); | ||
| 1137 | return NULL; | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | hash->size_bits = size_bits; | ||
| 1141 | |||
| 1142 | return hash; | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | static struct ftrace_hash * | ||
| 1146 | alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) | ||
| 1147 | { | ||
| 1148 | struct ftrace_func_entry *entry; | ||
| 1149 | struct ftrace_hash *new_hash; | ||
| 1150 | struct hlist_node *tp; | ||
| 1151 | int size; | ||
| 1152 | int ret; | ||
| 1153 | int i; | ||
| 1154 | |||
| 1155 | new_hash = alloc_ftrace_hash(size_bits); | ||
| 1156 | if (!new_hash) | ||
| 1157 | return NULL; | ||
| 1158 | |||
| 1159 | /* Empty hash? */ | ||
| 1160 | if (!hash || !hash->count) | ||
| 1161 | return new_hash; | ||
| 1162 | |||
| 1163 | size = 1 << hash->size_bits; | ||
| 1164 | for (i = 0; i < size; i++) { | ||
| 1165 | hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { | ||
| 1166 | ret = add_hash_entry(new_hash, entry->ip); | ||
| 1167 | if (ret < 0) | ||
| 1168 | goto free_hash; | ||
| 1169 | } | ||
| 1170 | } | ||
| 1171 | |||
| 1172 | FTRACE_WARN_ON(new_hash->count != hash->count); | ||
| 1173 | |||
| 1174 | return new_hash; | ||
| 1175 | |||
| 1176 | free_hash: | ||
| 1177 | free_ftrace_hash(new_hash); | ||
| 1178 | return NULL; | ||
| 1179 | } | ||
| 1180 | |||
| 1181 | static void | ||
| 1182 | ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); | ||
| 1183 | static void | ||
| 1184 | ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); | ||
| 1185 | |||
| 1186 | static int | ||
| 1187 | ftrace_hash_move(struct ftrace_ops *ops, int enable, | ||
| 1188 | struct ftrace_hash **dst, struct ftrace_hash *src) | ||
| 1189 | { | ||
| 1190 | struct ftrace_func_entry *entry; | ||
| 1191 | struct hlist_node *tp, *tn; | ||
| 1192 | struct hlist_head *hhd; | ||
| 1193 | struct ftrace_hash *old_hash; | ||
| 1194 | struct ftrace_hash *new_hash; | ||
| 1195 | unsigned long key; | ||
| 1196 | int size = src->count; | ||
| 1197 | int bits = 0; | ||
| 1198 | int ret; | ||
| 1199 | int i; | ||
| 1200 | |||
| 1201 | /* | ||
| 1202 | * Remove the current set, update the hash and add | ||
| 1203 | * them back. | ||
| 1204 | */ | ||
| 1205 | ftrace_hash_rec_disable(ops, enable); | ||
| 1206 | |||
| 1207 | /* | ||
| 1208 | * If the new source is empty, just free dst and assign it | ||
| 1209 | * the empty_hash. | ||
| 1210 | */ | ||
| 1211 | if (!src->count) { | ||
| 1212 | free_ftrace_hash_rcu(*dst); | ||
| 1213 | rcu_assign_pointer(*dst, EMPTY_HASH); | ||
| 1214 | return 0; | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | /* | ||
| 1218 | * Make the hash size about 1/2 the # found | ||
| 1219 | */ | ||
| 1220 | for (size /= 2; size; size >>= 1) | ||
| 1221 | bits++; | ||
| 1222 | |||
| 1223 | /* Don't allocate too much */ | ||
| 1224 | if (bits > FTRACE_HASH_MAX_BITS) | ||
| 1225 | bits = FTRACE_HASH_MAX_BITS; | ||
| 1226 | |||
| 1227 | ret = -ENOMEM; | ||
| 1228 | new_hash = alloc_ftrace_hash(bits); | ||
| 1229 | if (!new_hash) | ||
| 1230 | goto out; | ||
| 1231 | |||
| 1232 | size = 1 << src->size_bits; | ||
| 1233 | for (i = 0; i < size; i++) { | ||
| 1234 | hhd = &src->buckets[i]; | ||
| 1235 | hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { | ||
| 1236 | if (bits > 0) | ||
| 1237 | key = hash_long(entry->ip, bits); | ||
| 1238 | else | ||
| 1239 | key = 0; | ||
| 1240 | remove_hash_entry(src, entry); | ||
| 1241 | __add_hash_entry(new_hash, entry); | ||
| 1242 | } | ||
| 1243 | } | ||
| 1244 | |||
| 1245 | old_hash = *dst; | ||
| 1246 | rcu_assign_pointer(*dst, new_hash); | ||
| 1247 | free_ftrace_hash_rcu(old_hash); | ||
| 1248 | |||
| 1249 | ret = 0; | ||
| 1250 | out: | ||
| 1251 | /* | ||
| 1252 | * Enable regardless of ret: | ||
| 1253 | * On success, we enable the new hash. | ||
| 1254 | * On failure, we re-enable the original hash. | ||
| 1255 | */ | ||
| 1256 | ftrace_hash_rec_enable(ops, enable); | ||
| 1257 | |||
| 1258 | return ret; | ||
| 1259 | } | ||
| 1260 | |||
| 1261 | /* | ||
| 1262 | * Test the hashes for this ops to see if we want to call | ||
| 1263 | * the ops->func or not. | ||
| 1264 | * | ||
| 1265 | * It's a match if the ip is in the ops->filter_hash or | ||
| 1266 | * the filter_hash does not exist or is empty, | ||
| 1267 | * AND | ||
| 1268 | * the ip is not in the ops->notrace_hash. | ||
| 1269 | * | ||
| 1270 | * This needs to be called with preemption disabled as | ||
| 1271 | * the hashes are freed with call_rcu_sched(). | ||
| 1272 | */ | ||
| 1273 | static int | ||
| 1274 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
| 1275 | { | ||
| 1276 | struct ftrace_hash *filter_hash; | ||
| 1277 | struct ftrace_hash *notrace_hash; | ||
| 1278 | int ret; | ||
| 1279 | |||
| 1280 | filter_hash = rcu_dereference_raw(ops->filter_hash); | ||
| 1281 | notrace_hash = rcu_dereference_raw(ops->notrace_hash); | ||
| 1282 | |||
| 1283 | if ((!filter_hash || !filter_hash->count || | ||
| 1284 | ftrace_lookup_ip(filter_hash, ip)) && | ||
| 1285 | (!notrace_hash || !notrace_hash->count || | ||
| 1286 | !ftrace_lookup_ip(notrace_hash, ip))) | ||
| 1287 | ret = 1; | ||
| 1288 | else | ||
| 1289 | ret = 0; | ||
| 1290 | |||
| 1291 | return ret; | ||
| 1292 | } | ||
| 1293 | |||
| 915 | /* | 1294 | /* |
| 916 | * This is a double for. Do not use 'break' to break out of the loop, | 1295 | * This is a double for. Do not use 'break' to break out of the loop, |
| 917 | * you must use a goto. | 1296 | * you must use a goto. |
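Editor's note: ftrace_hash_move() above sizes the new table to roughly half the number of entries (capped at FTRACE_HASH_MAX_BITS) and keys each entry by hashing its address into that many bits. A userspace sketch of the sizing rule and the hash_long-style keying; the multiplier constant is an assumption taken from the classic golden-ratio hash, not something this diff defines:

#include <stdio.h>

#define HASH_MAX_BITS 12
#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL	/* assumed constant */

static int pick_bits(int count)
{
	int bits = 0;
	int size;

	/* enough bits so the table is about half the entry count */
	for (size = count / 2; size; size >>= 1)
		bits++;
	if (bits > HASH_MAX_BITS)
		bits = HASH_MAX_BITS;		/* don't allocate too much */
	return bits;
}

static unsigned long long hash_bits(unsigned long long val, int bits)
{
	return bits ? (val * GOLDEN_RATIO_PRIME_64) >> (64 - bits) : 0;
}

int main(void)
{
	printf("bits for 100 entries: %d\n", pick_bits(100));	/* 6 -> 64 buckets */
	printf("bucket: %llu\n", hash_bits(0xffffffff811234a0ULL, pick_bits(100)));
	return 0;
}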
| @@ -926,6 +1305,105 @@ static struct dyn_ftrace *ftrace_free_records; | |||
| 926 | } \ | 1305 | } \ |
| 927 | } | 1306 | } |
| 928 | 1307 | ||
| 1308 | static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | ||
| 1309 | int filter_hash, | ||
| 1310 | bool inc) | ||
| 1311 | { | ||
| 1312 | struct ftrace_hash *hash; | ||
| 1313 | struct ftrace_hash *other_hash; | ||
| 1314 | struct ftrace_page *pg; | ||
| 1315 | struct dyn_ftrace *rec; | ||
| 1316 | int count = 0; | ||
| 1317 | int all = 0; | ||
| 1318 | |||
| 1319 | /* Only update if the ops has been registered */ | ||
| 1320 | if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) | ||
| 1321 | return; | ||
| 1322 | |||
| 1323 | /* | ||
| 1324 | * In the filter_hash case: | ||
| 1325 | * If the count is zero, we update all records. | ||
| 1326 | * Otherwise we just update the items in the hash. | ||
| 1327 | * | ||
| 1328 | * In the notrace_hash case: | ||
| 1329 | * We enable the update in the hash. | ||
| 1330 | * As disabling notrace means enabling the tracing, | ||
| 1331 | * and enabling notrace means disabling, the inc variable | ||
| 1332 | * gets inverted. | ||
| 1333 | */ | ||
| 1334 | if (filter_hash) { | ||
| 1335 | hash = ops->filter_hash; | ||
| 1336 | other_hash = ops->notrace_hash; | ||
| 1337 | if (!hash || !hash->count) | ||
| 1338 | all = 1; | ||
| 1339 | } else { | ||
| 1340 | inc = !inc; | ||
| 1341 | hash = ops->notrace_hash; | ||
| 1342 | other_hash = ops->filter_hash; | ||
| 1343 | /* | ||
| 1344 | * If the notrace hash has no items, | ||
| 1345 | * then there's nothing to do. | ||
| 1346 | */ | ||
| 1347 | if (hash && !hash->count) | ||
| 1348 | return; | ||
| 1349 | } | ||
| 1350 | |||
| 1351 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1352 | int in_other_hash = 0; | ||
| 1353 | int in_hash = 0; | ||
| 1354 | int match = 0; | ||
| 1355 | |||
| 1356 | if (all) { | ||
| 1357 | /* | ||
| 1358 | * Only the filter_hash affects all records. | ||
| 1359 | * Update if the record is not in the notrace hash. | ||
| 1360 | */ | ||
| 1361 | if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) | ||
| 1362 | match = 1; | ||
| 1363 | } else { | ||
| 1364 | in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); | ||
| 1365 | in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); | ||
| 1366 | |||
| 1367 | /* | ||
| 1368 | * | ||
| 1369 | */ | ||
| 1370 | if (filter_hash && in_hash && !in_other_hash) | ||
| 1371 | match = 1; | ||
| 1372 | else if (!filter_hash && in_hash && | ||
| 1373 | (in_other_hash || !other_hash->count)) | ||
| 1374 | match = 1; | ||
| 1375 | } | ||
| 1376 | if (!match) | ||
| 1377 | continue; | ||
| 1378 | |||
| 1379 | if (inc) { | ||
| 1380 | rec->flags++; | ||
| 1381 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | ||
| 1382 | return; | ||
| 1383 | } else { | ||
| 1384 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | ||
| 1385 | return; | ||
| 1386 | rec->flags--; | ||
| 1387 | } | ||
| 1388 | count++; | ||
| 1389 | /* Shortcut, if we handled all records, we are done. */ | ||
| 1390 | if (!all && count == hash->count) | ||
| 1391 | return; | ||
| 1392 | } while_for_each_ftrace_rec(); | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | static void ftrace_hash_rec_disable(struct ftrace_ops *ops, | ||
| 1396 | int filter_hash) | ||
| 1397 | { | ||
| 1398 | __ftrace_hash_rec_update(ops, filter_hash, 0); | ||
| 1399 | } | ||
| 1400 | |||
| 1401 | static void ftrace_hash_rec_enable(struct ftrace_ops *ops, | ||
| 1402 | int filter_hash) | ||
| 1403 | { | ||
| 1404 | __ftrace_hash_rec_update(ops, filter_hash, 1); | ||
| 1405 | } | ||
| 1406 | |||
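The update rules implemented above are compact but easy to misread, so here is a stand-alone sketch of the same decision table (plain C with invented toy types, not kernel code): with a non-empty filter hash a record is only touched when it is in the filter hash and not in the notrace hash; with an empty filter hash every record outside the notrace hash is touched; for the notrace hash the increment is inverted and only applies to records the filter side also covers.

    #include <stdbool.h>

    /* Toy stand-in for struct ftrace_hash: count == 0 means "nothing configured". */
    struct toy_hash {
        int count;
        bool (*has)(unsigned long ip);
    };

    /* Mirrors the match logic of __ftrace_hash_rec_update() for one record. */
    static bool rec_matches(bool filter_hash, struct toy_hash *hash,
                            struct toy_hash *other, unsigned long ip)
    {
        bool in_hash  = hash->count && hash->has(ip);
        bool in_other = other->count && other->has(ip);

        if (filter_hash && !hash->count)
            return !in_other;            /* empty filter: everything but notrace */
        if (filter_hash)
            return in_hash && !in_other; /* filtered in and not vetoed by notrace */
        /* notrace case: only records the filter side enables (or filter is empty) */
        return in_hash && (in_other || !other->count);
    }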
| 929 | static void ftrace_free_rec(struct dyn_ftrace *rec) | 1407 | static void ftrace_free_rec(struct dyn_ftrace *rec) |
| 930 | { | 1408 | { |
| 931 | rec->freelist = ftrace_free_records; | 1409 | rec->freelist = ftrace_free_records; |
| @@ -1047,18 +1525,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
| 1047 | ftrace_addr = (unsigned long)FTRACE_ADDR; | 1525 | ftrace_addr = (unsigned long)FTRACE_ADDR; |
| 1048 | 1526 | ||
| 1049 | /* | 1527 | /* |
| 1050 | * If this record is not to be traced or we want to disable it, | 1528 | * If we are enabling tracing: |
| 1051 | * then disable it. | 1529 | * |
| 1530 | * If the record has a ref count, then we need to enable it | ||
| 1531 | * because someone is using it. | ||
| 1052 | * | 1532 | * |
| 1053 | * If we want to enable it and filtering is off, then enable it. | 1533 | * Otherwise we make sure it is disabled. |
| 1054 | * | 1534 | * |
| 1055 | * If we want to enable it and filtering is on, enable it only if | 1535 | * If we are disabling tracing, then disable all records that |
| 1056 | * it's filtered | 1536 | * are enabled. |
| 1057 | */ | 1537 | */ |
| 1058 | if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) { | 1538 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
| 1059 | if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER)) | 1539 | flag = FTRACE_FL_ENABLED; |
| 1060 | flag = FTRACE_FL_ENABLED; | ||
| 1061 | } | ||
| 1062 | 1540 | ||
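With the hashes doing the bookkeeping, the enable test above reduces to "turn the call site on if tracing is being enabled and at least one ops references the record". A small model of that flag arithmetic; the TOY_* bit values are illustrative stand-ins, not the real FTRACE_FL_* definitions:

    /* The low bits of rec->flags (outside the flag mask) form a ref count. */
    #define TOY_FL_ENABLED  (1UL << 30)
    #define TOY_FL_FREE     (1UL << 31)
    #define TOY_FL_MASK     (TOY_FL_ENABLED | TOY_FL_FREE)

    static unsigned long toy_wanted_flag(unsigned long flags, int enable)
    {
        unsigned long ref = flags & ~TOY_FL_MASK;   /* number of interested ops */

        /* enable only when tracing is turned on AND someone references the record */
        return (enable && ref) ? TOY_FL_ENABLED : 0;
    }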
| 1063 | /* If the state of this record hasn't changed, then do nothing */ | 1541 | /* If the state of this record hasn't changed, then do nothing */ |
| 1064 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1542 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
| @@ -1079,19 +1557,16 @@ static void ftrace_replace_code(int enable) | |||
| 1079 | struct ftrace_page *pg; | 1557 | struct ftrace_page *pg; |
| 1080 | int failed; | 1558 | int failed; |
| 1081 | 1559 | ||
| 1560 | if (unlikely(ftrace_disabled)) | ||
| 1561 | return; | ||
| 1562 | |||
| 1082 | do_for_each_ftrace_rec(pg, rec) { | 1563 | do_for_each_ftrace_rec(pg, rec) { |
| 1083 | /* | 1564 | /* Skip over free records */ |
| 1084 | * Skip over free records, records that have | 1565 | if (rec->flags & FTRACE_FL_FREE) |
| 1085 | * failed and not converted. | ||
| 1086 | */ | ||
| 1087 | if (rec->flags & FTRACE_FL_FREE || | ||
| 1088 | rec->flags & FTRACE_FL_FAILED || | ||
| 1089 | !(rec->flags & FTRACE_FL_CONVERTED)) | ||
| 1090 | continue; | 1566 | continue; |
| 1091 | 1567 | ||
| 1092 | failed = __ftrace_replace_code(rec, enable); | 1568 | failed = __ftrace_replace_code(rec, enable); |
| 1093 | if (failed) { | 1569 | if (failed) { |
| 1094 | rec->flags |= FTRACE_FL_FAILED; | ||
| 1095 | ftrace_bug(failed, rec->ip); | 1570 | ftrace_bug(failed, rec->ip); |
| 1096 | /* Stop processing */ | 1571 | /* Stop processing */ |
| 1097 | return; | 1572 | return; |
| @@ -1107,10 +1582,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) | |||
| 1107 | 1582 | ||
| 1108 | ip = rec->ip; | 1583 | ip = rec->ip; |
| 1109 | 1584 | ||
| 1585 | if (unlikely(ftrace_disabled)) | ||
| 1586 | return 0; | ||
| 1587 | |||
| 1110 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); | 1588 | ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); |
| 1111 | if (ret) { | 1589 | if (ret) { |
| 1112 | ftrace_bug(ret, ip); | 1590 | ftrace_bug(ret, ip); |
| 1113 | rec->flags |= FTRACE_FL_FAILED; | ||
| 1114 | return 0; | 1591 | return 0; |
| 1115 | } | 1592 | } |
| 1116 | return 1; | 1593 | return 1; |
| @@ -1138,6 +1615,12 @@ static int __ftrace_modify_code(void *data) | |||
| 1138 | { | 1615 | { |
| 1139 | int *command = data; | 1616 | int *command = data; |
| 1140 | 1617 | ||
| 1618 | /* | ||
| 1619 | * Do not call function tracer while we update the code. | ||
| 1620 | * We are in stop machine, so there is no need to worry about races. | ||
| 1621 | */ | ||
| 1622 | function_trace_stop++; | ||
| 1623 | |||
| 1141 | if (*command & FTRACE_ENABLE_CALLS) | 1624 | if (*command & FTRACE_ENABLE_CALLS) |
| 1142 | ftrace_replace_code(1); | 1625 | ftrace_replace_code(1); |
| 1143 | else if (*command & FTRACE_DISABLE_CALLS) | 1626 | else if (*command & FTRACE_DISABLE_CALLS) |
| @@ -1151,6 +1634,18 @@ static int __ftrace_modify_code(void *data) | |||
| 1151 | else if (*command & FTRACE_STOP_FUNC_RET) | 1634 | else if (*command & FTRACE_STOP_FUNC_RET) |
| 1152 | ftrace_disable_ftrace_graph_caller(); | 1635 | ftrace_disable_ftrace_graph_caller(); |
| 1153 | 1636 | ||
| 1637 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
| 1638 | /* | ||
| 1639 | * For archs that call ftrace_test_stop_func(), we must | ||
| 1640 | * wait till after we update all the function callers | ||
| 1641 | * before we update the callback. This keeps different | ||
| 1642 | * ops that record different functions from corrupting | ||
| 1643 | * each other. | ||
| 1644 | */ | ||
| 1645 | __ftrace_trace_function = __ftrace_trace_function_delay; | ||
| 1646 | #endif | ||
| 1647 | function_trace_stop--; | ||
| 1648 | |||
| 1154 | return 0; | 1649 | return 0; |
| 1155 | } | 1650 | } |
| 1156 | 1651 | ||
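__ftrace_modify_code() is not called directly; ftrace_run_update_code(), whose body is not shown in this hunk, hands it to stop_machine() so no other CPU executes the call sites while they are rewritten (which is what the "We are in stop machine" comment relies on). The sketch below shows that call pattern as an assumption about the surrounding code, not a quote from this patch:

    static void ftrace_run_update_code(int command)
    {
        /*
         * All other CPUs are parked while __ftrace_modify_code() patches
         * the mcount call sites, so bumping function_trace_stop inside it
         * cannot race with a concurrent tracer invocation.
         */
        stop_machine(__ftrace_modify_code, &command, NULL);
    }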
| @@ -1171,6 +1666,7 @@ static void ftrace_run_update_code(int command) | |||
| 1171 | 1666 | ||
| 1172 | static ftrace_func_t saved_ftrace_func; | 1667 | static ftrace_func_t saved_ftrace_func; |
| 1173 | static int ftrace_start_up; | 1668 | static int ftrace_start_up; |
| 1669 | static int global_start_up; | ||
| 1174 | 1670 | ||
| 1175 | static void ftrace_startup_enable(int command) | 1671 | static void ftrace_startup_enable(int command) |
| 1176 | { | 1672 | { |
| @@ -1185,19 +1681,38 @@ static void ftrace_startup_enable(int command) | |||
| 1185 | ftrace_run_update_code(command); | 1681 | ftrace_run_update_code(command); |
| 1186 | } | 1682 | } |
| 1187 | 1683 | ||
| 1188 | static void ftrace_startup(int command) | 1684 | static int ftrace_startup(struct ftrace_ops *ops, int command) |
| 1189 | { | 1685 | { |
| 1686 | bool hash_enable = true; | ||
| 1687 | |||
| 1190 | if (unlikely(ftrace_disabled)) | 1688 | if (unlikely(ftrace_disabled)) |
| 1191 | return; | 1689 | return -ENODEV; |
| 1192 | 1690 | ||
| 1193 | ftrace_start_up++; | 1691 | ftrace_start_up++; |
| 1194 | command |= FTRACE_ENABLE_CALLS; | 1692 | command |= FTRACE_ENABLE_CALLS; |
| 1195 | 1693 | ||
| 1694 | /* ops marked global share the filter hashes */ | ||
| 1695 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
| 1696 | ops = &global_ops; | ||
| 1697 | /* Don't update hash if global is already set */ | ||
| 1698 | if (global_start_up) | ||
| 1699 | hash_enable = false; | ||
| 1700 | global_start_up++; | ||
| 1701 | } | ||
| 1702 | |||
| 1703 | ops->flags |= FTRACE_OPS_FL_ENABLED; | ||
| 1704 | if (hash_enable) | ||
| 1705 | ftrace_hash_rec_enable(ops, 1); | ||
| 1706 | |||
| 1196 | ftrace_startup_enable(command); | 1707 | ftrace_startup_enable(command); |
| 1708 | |||
| 1709 | return 0; | ||
| 1197 | } | 1710 | } |
| 1198 | 1711 | ||
| 1199 | static void ftrace_shutdown(int command) | 1712 | static void ftrace_shutdown(struct ftrace_ops *ops, int command) |
| 1200 | { | 1713 | { |
| 1714 | bool hash_disable = true; | ||
| 1715 | |||
| 1201 | if (unlikely(ftrace_disabled)) | 1716 | if (unlikely(ftrace_disabled)) |
| 1202 | return; | 1717 | return; |
| 1203 | 1718 | ||
| @@ -1209,6 +1724,23 @@ static void ftrace_shutdown(int command) | |||
| 1209 | */ | 1724 | */ |
| 1210 | WARN_ON_ONCE(ftrace_start_up < 0); | 1725 | WARN_ON_ONCE(ftrace_start_up < 0); |
| 1211 | 1726 | ||
| 1727 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) { | ||
| 1728 | ops = &global_ops; | ||
| 1729 | global_start_up--; | ||
| 1730 | WARN_ON_ONCE(global_start_up < 0); | ||
| 1731 | /* Don't update hash if global still has users */ | ||
| 1732 | if (global_start_up) { | ||
| 1733 | WARN_ON_ONCE(!ftrace_start_up); | ||
| 1734 | hash_disable = false; | ||
| 1735 | } | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | if (hash_disable) | ||
| 1739 | ftrace_hash_rec_disable(ops, 1); | ||
| 1740 | |||
| 1741 | if (ops != &global_ops || !global_start_up) | ||
| 1742 | ops->flags &= ~FTRACE_OPS_FL_ENABLED; | ||
| 1743 | |||
| 1212 | if (!ftrace_start_up) | 1744 | if (!ftrace_start_up) |
| 1213 | command |= FTRACE_DISABLE_CALLS; | 1745 | command |= FTRACE_DISABLE_CALLS; |
| 1214 | 1746 | ||
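Ops flagged FTRACE_OPS_FL_GLOBAL are folded onto global_ops in these two paths, so they all share one filter/notrace hash and one global_start_up count. A hedged sketch of what such an ops looks like to a caller; the callback name is invented for illustration:

    #include <linux/ftrace.h>

    static void my_global_callback(unsigned long ip, unsigned long parent_ip)
    {
        /* runs for every function that passes the shared global filters */
    }

    static struct ftrace_ops my_global_ops = {
        .func  = my_global_callback,
        .flags = FTRACE_OPS_FL_GLOBAL,  /* share global_ops filter/notrace hashes */
    };

Registering it goes through register_ftrace_function(), which per the hunks further down calls ftrace_startup(&my_global_ops, 0); the shared hash is enabled only for the first global user and disabled only when the last one shuts down.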
| @@ -1249,10 +1781,36 @@ static cycle_t ftrace_update_time; | |||
| 1249 | static unsigned long ftrace_update_cnt; | 1781 | static unsigned long ftrace_update_cnt; |
| 1250 | unsigned long ftrace_update_tot_cnt; | 1782 | unsigned long ftrace_update_tot_cnt; |
| 1251 | 1783 | ||
| 1784 | static int ops_traces_mod(struct ftrace_ops *ops) | ||
| 1785 | { | ||
| 1786 | struct ftrace_hash *hash; | ||
| 1787 | |||
| 1788 | hash = ops->filter_hash; | ||
| 1789 | return !!(!hash || !hash->count); | ||
| 1790 | } | ||
| 1791 | |||
| 1252 | static int ftrace_update_code(struct module *mod) | 1792 | static int ftrace_update_code(struct module *mod) |
| 1253 | { | 1793 | { |
| 1254 | struct dyn_ftrace *p; | 1794 | struct dyn_ftrace *p; |
| 1255 | cycle_t start, stop; | 1795 | cycle_t start, stop; |
| 1796 | unsigned long ref = 0; | ||
| 1797 | |||
| 1798 | /* | ||
| 1799 | * When adding a module, we need to check if tracers are | ||
| 1800 | * currently enabled and if they are set to trace all functions. | ||
| 1801 | * If they are, we need to enable the module functions as well | ||
| 1802 | * as update the reference counts for those function records. | ||
| 1803 | */ | ||
| 1804 | if (mod) { | ||
| 1805 | struct ftrace_ops *ops; | ||
| 1806 | |||
| 1807 | for (ops = ftrace_ops_list; | ||
| 1808 | ops != &ftrace_list_end; ops = ops->next) { | ||
| 1809 | if (ops->flags & FTRACE_OPS_FL_ENABLED && | ||
| 1810 | ops_traces_mod(ops)) | ||
| 1811 | ref++; | ||
| 1812 | } | ||
| 1813 | } | ||
| 1256 | 1814 | ||
| 1257 | start = ftrace_now(raw_smp_processor_id()); | 1815 | start = ftrace_now(raw_smp_processor_id()); |
| 1258 | ftrace_update_cnt = 0; | 1816 | ftrace_update_cnt = 0; |
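A worked example of the ref computation above: if two ops are currently enabled and both have an empty filter hash, ops_traces_mod() returns 1 for each (an empty filter means "trace everything"), ref becomes 2, and every mcount record of the incoming module is created with flags == 2 below, so it is patched to call the tracer right away. A compact stand-alone model of that walk, using toy types rather than the kernel structures:

    struct toy_ops {
        struct toy_ops *next;
        int enabled;        /* stands in for FTRACE_OPS_FL_ENABLED */
        int filter_count;   /* 0 means "no filter, trace everything" */
    };

    static unsigned long count_module_refs(const struct toy_ops *list)
    {
        unsigned long ref = 0;

        for (; list; list = list->next)
            if (list->enabled && list->filter_count == 0)
                ref++;      /* this ops will also trace the new module */
        return ref;
    }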
| @@ -1265,7 +1823,7 @@ static int ftrace_update_code(struct module *mod) | |||
| 1265 | 1823 | ||
| 1266 | p = ftrace_new_addrs; | 1824 | p = ftrace_new_addrs; |
| 1267 | ftrace_new_addrs = p->newlist; | 1825 | ftrace_new_addrs = p->newlist; |
| 1268 | p->flags = 0L; | 1826 | p->flags = ref; |
| 1269 | 1827 | ||
| 1270 | /* | 1828 | /* |
| 1271 | * Do the initial record conversion from mcount jump | 1829 | * Do the initial record conversion from mcount jump |
| @@ -1273,10 +1831,10 @@ static int ftrace_update_code(struct module *mod) | |||
| 1273 | */ | 1831 | */ |
| 1274 | if (!ftrace_code_disable(mod, p)) { | 1832 | if (!ftrace_code_disable(mod, p)) { |
| 1275 | ftrace_free_rec(p); | 1833 | ftrace_free_rec(p); |
| 1276 | continue; | 1834 | /* Game over */ |
| 1835 | break; | ||
| 1277 | } | 1836 | } |
| 1278 | 1837 | ||
| 1279 | p->flags |= FTRACE_FL_CONVERTED; | ||
| 1280 | ftrace_update_cnt++; | 1838 | ftrace_update_cnt++; |
| 1281 | 1839 | ||
| 1282 | /* | 1840 | /* |
| @@ -1288,7 +1846,7 @@ static int ftrace_update_code(struct module *mod) | |||
| 1288 | * conversion puts the module to the correct state, thus | 1846 | * conversion puts the module to the correct state, thus |
| 1289 | * passing the ftrace_make_call check. | 1847 | * passing the ftrace_make_call check. |
| 1290 | */ | 1848 | */ |
| 1291 | if (ftrace_start_up) { | 1849 | if (ftrace_start_up && ref) { |
| 1292 | int failed = __ftrace_replace_code(p, 1); | 1850 | int failed = __ftrace_replace_code(p, 1); |
| 1293 | if (failed) { | 1851 | if (failed) { |
| 1294 | ftrace_bug(failed, p->ip); | 1852 | ftrace_bug(failed, p->ip); |
| @@ -1351,9 +1909,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) | |||
| 1351 | enum { | 1909 | enum { |
| 1352 | FTRACE_ITER_FILTER = (1 << 0), | 1910 | FTRACE_ITER_FILTER = (1 << 0), |
| 1353 | FTRACE_ITER_NOTRACE = (1 << 1), | 1911 | FTRACE_ITER_NOTRACE = (1 << 1), |
| 1354 | FTRACE_ITER_FAILURES = (1 << 2), | 1912 | FTRACE_ITER_PRINTALL = (1 << 2), |
| 1355 | FTRACE_ITER_PRINTALL = (1 << 3), | 1913 | FTRACE_ITER_HASH = (1 << 3), |
| 1356 | FTRACE_ITER_HASH = (1 << 4), | 1914 | FTRACE_ITER_ENABLED = (1 << 4), |
| 1357 | }; | 1915 | }; |
| 1358 | 1916 | ||
| 1359 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ | 1917 | #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ |
| @@ -1365,6 +1923,8 @@ struct ftrace_iterator { | |||
| 1365 | struct dyn_ftrace *func; | 1923 | struct dyn_ftrace *func; |
| 1366 | struct ftrace_func_probe *probe; | 1924 | struct ftrace_func_probe *probe; |
| 1367 | struct trace_parser parser; | 1925 | struct trace_parser parser; |
| 1926 | struct ftrace_hash *hash; | ||
| 1927 | struct ftrace_ops *ops; | ||
| 1368 | int hidx; | 1928 | int hidx; |
| 1369 | int idx; | 1929 | int idx; |
| 1370 | unsigned flags; | 1930 | unsigned flags; |
| @@ -1461,8 +2021,12 @@ static void * | |||
| 1461 | t_next(struct seq_file *m, void *v, loff_t *pos) | 2021 | t_next(struct seq_file *m, void *v, loff_t *pos) |
| 1462 | { | 2022 | { |
| 1463 | struct ftrace_iterator *iter = m->private; | 2023 | struct ftrace_iterator *iter = m->private; |
| 2024 | struct ftrace_ops *ops = &global_ops; | ||
| 1464 | struct dyn_ftrace *rec = NULL; | 2025 | struct dyn_ftrace *rec = NULL; |
| 1465 | 2026 | ||
| 2027 | if (unlikely(ftrace_disabled)) | ||
| 2028 | return NULL; | ||
| 2029 | |||
| 1466 | if (iter->flags & FTRACE_ITER_HASH) | 2030 | if (iter->flags & FTRACE_ITER_HASH) |
| 1467 | return t_hash_next(m, pos); | 2031 | return t_hash_next(m, pos); |
| 1468 | 2032 | ||
| @@ -1483,17 +2047,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos) | |||
| 1483 | rec = &iter->pg->records[iter->idx++]; | 2047 | rec = &iter->pg->records[iter->idx++]; |
| 1484 | if ((rec->flags & FTRACE_FL_FREE) || | 2048 | if ((rec->flags & FTRACE_FL_FREE) || |
| 1485 | 2049 | ||
| 1486 | (!(iter->flags & FTRACE_ITER_FAILURES) && | ||
| 1487 | (rec->flags & FTRACE_FL_FAILED)) || | ||
| 1488 | |||
| 1489 | ((iter->flags & FTRACE_ITER_FAILURES) && | ||
| 1490 | !(rec->flags & FTRACE_FL_FAILED)) || | ||
| 1491 | |||
| 1492 | ((iter->flags & FTRACE_ITER_FILTER) && | 2050 | ((iter->flags & FTRACE_ITER_FILTER) && |
| 1493 | !(rec->flags & FTRACE_FL_FILTER)) || | 2051 | !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || |
| 1494 | 2052 | ||
| 1495 | ((iter->flags & FTRACE_ITER_NOTRACE) && | 2053 | ((iter->flags & FTRACE_ITER_NOTRACE) && |
| 1496 | !(rec->flags & FTRACE_FL_NOTRACE))) { | 2054 | !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || |
| 2055 | |||
| 2056 | ((iter->flags & FTRACE_ITER_ENABLED) && | ||
| 2057 | !(rec->flags & ~FTRACE_FL_MASK))) { | ||
| 2058 | |||
| 1497 | rec = NULL; | 2059 | rec = NULL; |
| 1498 | goto retry; | 2060 | goto retry; |
| 1499 | } | 2061 | } |
| @@ -1517,10 +2079,15 @@ static void reset_iter_read(struct ftrace_iterator *iter) | |||
| 1517 | static void *t_start(struct seq_file *m, loff_t *pos) | 2079 | static void *t_start(struct seq_file *m, loff_t *pos) |
| 1518 | { | 2080 | { |
| 1519 | struct ftrace_iterator *iter = m->private; | 2081 | struct ftrace_iterator *iter = m->private; |
| 2082 | struct ftrace_ops *ops = &global_ops; | ||
| 1520 | void *p = NULL; | 2083 | void *p = NULL; |
| 1521 | loff_t l; | 2084 | loff_t l; |
| 1522 | 2085 | ||
| 1523 | mutex_lock(&ftrace_lock); | 2086 | mutex_lock(&ftrace_lock); |
| 2087 | |||
| 2088 | if (unlikely(ftrace_disabled)) | ||
| 2089 | return NULL; | ||
| 2090 | |||
| 1524 | /* | 2091 | /* |
| 1525 | * If an lseek was done, then reset and start from beginning. | 2092 | * If an lseek was done, then reset and start from beginning. |
| 1526 | */ | 2093 | */ |
| @@ -1532,7 +2099,7 @@ static void *t_start(struct seq_file *m, loff_t *pos) | |||
| 1532 | * off, we can short cut and just print out that all | 2099 | * off, we can short cut and just print out that all |
| 1533 | * functions are enabled. | 2100 | * functions are enabled. |
| 1534 | */ | 2101 | */ |
| 1535 | if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) { | 2102 | if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { |
| 1536 | if (*pos > 0) | 2103 | if (*pos > 0) |
| 1537 | return t_hash_start(m, pos); | 2104 | return t_hash_start(m, pos); |
| 1538 | iter->flags |= FTRACE_ITER_PRINTALL; | 2105 | iter->flags |= FTRACE_ITER_PRINTALL; |
| @@ -1590,7 +2157,11 @@ static int t_show(struct seq_file *m, void *v) | |||
| 1590 | if (!rec) | 2157 | if (!rec) |
| 1591 | return 0; | 2158 | return 0; |
| 1592 | 2159 | ||
| 1593 | seq_printf(m, "%ps\n", (void *)rec->ip); | 2160 | seq_printf(m, "%ps", (void *)rec->ip); |
| 2161 | if (iter->flags & FTRACE_ITER_ENABLED) | ||
| 2162 | seq_printf(m, " (%ld)", | ||
| 2163 | rec->flags & ~FTRACE_FL_MASK); | ||
| 2164 | seq_printf(m, "\n"); | ||
| 1594 | 2165 | ||
| 1595 | return 0; | 2166 | return 0; |
| 1596 | } | 2167 | } |
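Together with the new FTRACE_ITER_ENABLED flag, this turns the removed failures file into an enabled_functions view: each line is a function name followed by its reference count in parentheses, so a record used by exactly one ops would read something like "schedule (1)". The sample line is inferred from the seq_printf() format above, not copied from documentation.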
| @@ -1630,44 +2201,46 @@ ftrace_avail_open(struct inode *inode, struct file *file) | |||
| 1630 | } | 2201 | } |
| 1631 | 2202 | ||
| 1632 | static int | 2203 | static int |
| 1633 | ftrace_failures_open(struct inode *inode, struct file *file) | 2204 | ftrace_enabled_open(struct inode *inode, struct file *file) |
| 1634 | { | 2205 | { |
| 1635 | int ret; | ||
| 1636 | struct seq_file *m; | ||
| 1637 | struct ftrace_iterator *iter; | 2206 | struct ftrace_iterator *iter; |
| 2207 | int ret; | ||
| 1638 | 2208 | ||
| 1639 | ret = ftrace_avail_open(inode, file); | 2209 | if (unlikely(ftrace_disabled)) |
| 2210 | return -ENODEV; | ||
| 2211 | |||
| 2212 | iter = kzalloc(sizeof(*iter), GFP_KERNEL); | ||
| 2213 | if (!iter) | ||
| 2214 | return -ENOMEM; | ||
| 2215 | |||
| 2216 | iter->pg = ftrace_pages_start; | ||
| 2217 | iter->flags = FTRACE_ITER_ENABLED; | ||
| 2218 | |||
| 2219 | ret = seq_open(file, &show_ftrace_seq_ops); | ||
| 1640 | if (!ret) { | 2220 | if (!ret) { |
| 1641 | m = file->private_data; | 2221 | struct seq_file *m = file->private_data; |
| 1642 | iter = m->private; | 2222 | |
| 1643 | iter->flags = FTRACE_ITER_FAILURES; | 2223 | m->private = iter; |
| 2224 | } else { | ||
| 2225 | kfree(iter); | ||
| 1644 | } | 2226 | } |
| 1645 | 2227 | ||
| 1646 | return ret; | 2228 | return ret; |
| 1647 | } | 2229 | } |
| 1648 | 2230 | ||
| 1649 | 2231 | static void ftrace_filter_reset(struct ftrace_hash *hash) | |
| 1650 | static void ftrace_filter_reset(int enable) | ||
| 1651 | { | 2232 | { |
| 1652 | struct ftrace_page *pg; | ||
| 1653 | struct dyn_ftrace *rec; | ||
| 1654 | unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
| 1655 | |||
| 1656 | mutex_lock(&ftrace_lock); | 2233 | mutex_lock(&ftrace_lock); |
| 1657 | if (enable) | 2234 | ftrace_hash_clear(hash); |
| 1658 | ftrace_filtered = 0; | ||
| 1659 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1660 | if (rec->flags & FTRACE_FL_FAILED) | ||
| 1661 | continue; | ||
| 1662 | rec->flags &= ~type; | ||
| 1663 | } while_for_each_ftrace_rec(); | ||
| 1664 | mutex_unlock(&ftrace_lock); | 2235 | mutex_unlock(&ftrace_lock); |
| 1665 | } | 2236 | } |
| 1666 | 2237 | ||
| 1667 | static int | 2238 | static int |
| 1668 | ftrace_regex_open(struct inode *inode, struct file *file, int enable) | 2239 | ftrace_regex_open(struct ftrace_ops *ops, int flag, |
| 2240 | struct inode *inode, struct file *file) | ||
| 1669 | { | 2241 | { |
| 1670 | struct ftrace_iterator *iter; | 2242 | struct ftrace_iterator *iter; |
| 2243 | struct ftrace_hash *hash; | ||
| 1671 | int ret = 0; | 2244 | int ret = 0; |
| 1672 | 2245 | ||
| 1673 | if (unlikely(ftrace_disabled)) | 2246 | if (unlikely(ftrace_disabled)) |
| @@ -1682,21 +2255,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
| 1682 | return -ENOMEM; | 2255 | return -ENOMEM; |
| 1683 | } | 2256 | } |
| 1684 | 2257 | ||
| 2258 | if (flag & FTRACE_ITER_NOTRACE) | ||
| 2259 | hash = ops->notrace_hash; | ||
| 2260 | else | ||
| 2261 | hash = ops->filter_hash; | ||
| 2262 | |||
| 2263 | iter->ops = ops; | ||
| 2264 | iter->flags = flag; | ||
| 2265 | |||
| 2266 | if (file->f_mode & FMODE_WRITE) { | ||
| 2267 | mutex_lock(&ftrace_lock); | ||
| 2268 | iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); | ||
| 2269 | mutex_unlock(&ftrace_lock); | ||
| 2270 | |||
| 2271 | if (!iter->hash) { | ||
| 2272 | trace_parser_put(&iter->parser); | ||
| 2273 | kfree(iter); | ||
| 2274 | return -ENOMEM; | ||
| 2275 | } | ||
| 2276 | } | ||
| 2277 | |||
| 1685 | mutex_lock(&ftrace_regex_lock); | 2278 | mutex_lock(&ftrace_regex_lock); |
| 2279 | |||
| 1686 | if ((file->f_mode & FMODE_WRITE) && | 2280 | if ((file->f_mode & FMODE_WRITE) && |
| 1687 | (file->f_flags & O_TRUNC)) | 2281 | (file->f_flags & O_TRUNC)) |
| 1688 | ftrace_filter_reset(enable); | 2282 | ftrace_filter_reset(iter->hash); |
| 1689 | 2283 | ||
| 1690 | if (file->f_mode & FMODE_READ) { | 2284 | if (file->f_mode & FMODE_READ) { |
| 1691 | iter->pg = ftrace_pages_start; | 2285 | iter->pg = ftrace_pages_start; |
| 1692 | iter->flags = enable ? FTRACE_ITER_FILTER : | ||
| 1693 | FTRACE_ITER_NOTRACE; | ||
| 1694 | 2286 | ||
| 1695 | ret = seq_open(file, &show_ftrace_seq_ops); | 2287 | ret = seq_open(file, &show_ftrace_seq_ops); |
| 1696 | if (!ret) { | 2288 | if (!ret) { |
| 1697 | struct seq_file *m = file->private_data; | 2289 | struct seq_file *m = file->private_data; |
| 1698 | m->private = iter; | 2290 | m->private = iter; |
| 1699 | } else { | 2291 | } else { |
| 2292 | /* Failed */ | ||
| 2293 | free_ftrace_hash(iter->hash); | ||
| 1700 | trace_parser_put(&iter->parser); | 2294 | trace_parser_put(&iter->parser); |
| 1701 | kfree(iter); | 2295 | kfree(iter); |
| 1702 | } | 2296 | } |
| @@ -1710,13 +2304,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) | |||
| 1710 | static int | 2304 | static int |
| 1711 | ftrace_filter_open(struct inode *inode, struct file *file) | 2305 | ftrace_filter_open(struct inode *inode, struct file *file) |
| 1712 | { | 2306 | { |
| 1713 | return ftrace_regex_open(inode, file, 1); | 2307 | return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, |
| 2308 | inode, file); | ||
| 1714 | } | 2309 | } |
| 1715 | 2310 | ||
| 1716 | static int | 2311 | static int |
| 1717 | ftrace_notrace_open(struct inode *inode, struct file *file) | 2312 | ftrace_notrace_open(struct inode *inode, struct file *file) |
| 1718 | { | 2313 | { |
| 1719 | return ftrace_regex_open(inode, file, 0); | 2314 | return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, |
| 2315 | inode, file); | ||
| 1720 | } | 2316 | } |
| 1721 | 2317 | ||
| 1722 | static loff_t | 2318 | static loff_t |
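The open path above sets up a copy-update-swap cycle: a writer edits a private duplicate of the ops hash, and the live hash is only replaced when the file is released (see ftrace_regex_release() further down). The sketch below strings the helpers from this patch together to show the shape of that cycle; it is an illustration, not a function that exists in the file:

    /* Illustrative only: the real steps are split across open/write/release. */
    static int example_filter_update(struct ftrace_ops *ops, char *buf, int len)
    {
        struct ftrace_hash *hash;
        int ret;

        /* 1. duplicate the live filter hash for private editing */
        hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
                                          ops->filter_hash);
        if (!hash)
            return -ENOMEM;

        /* 2. apply the pattern to the copy only; readers are undisturbed */
        ftrace_match_records(hash, buf, len);

        /* 3. swap the copy in under ftrace_lock, then drop our reference */
        mutex_lock(&ftrace_lock);
        ret = ftrace_hash_move(ops, 1, &ops->filter_hash, hash);
        mutex_unlock(&ftrace_lock);

        free_ftrace_hash(hash);
        return ret;
    }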
| @@ -1761,86 +2357,99 @@ static int ftrace_match(char *str, char *regex, int len, int type) | |||
| 1761 | } | 2357 | } |
| 1762 | 2358 | ||
| 1763 | static int | 2359 | static int |
| 1764 | ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type) | 2360 | enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) |
| 2361 | { | ||
| 2362 | struct ftrace_func_entry *entry; | ||
| 2363 | int ret = 0; | ||
| 2364 | |||
| 2365 | entry = ftrace_lookup_ip(hash, rec->ip); | ||
| 2366 | if (not) { | ||
| 2367 | /* Do nothing if it doesn't exist */ | ||
| 2368 | if (!entry) | ||
| 2369 | return 0; | ||
| 2370 | |||
| 2371 | free_hash_entry(hash, entry); | ||
| 2372 | } else { | ||
| 2373 | /* Do nothing if it exists */ | ||
| 2374 | if (entry) | ||
| 2375 | return 0; | ||
| 2376 | |||
| 2377 | ret = add_hash_entry(hash, rec->ip); | ||
| 2378 | } | ||
| 2379 | return ret; | ||
| 2380 | } | ||
| 2381 | |||
| 2382 | static int | ||
| 2383 | ftrace_match_record(struct dyn_ftrace *rec, char *mod, | ||
| 2384 | char *regex, int len, int type) | ||
| 1765 | { | 2385 | { |
| 1766 | char str[KSYM_SYMBOL_LEN]; | 2386 | char str[KSYM_SYMBOL_LEN]; |
| 2387 | char *modname; | ||
| 2388 | |||
| 2389 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
| 2390 | |||
| 2391 | if (mod) { | ||
| 2392 | /* module lookup requires matching the module */ | ||
| 2393 | if (!modname || strcmp(modname, mod)) | ||
| 2394 | return 0; | ||
| 2395 | |||
| 2396 | /* blank search means to match all funcs in the mod */ | ||
| 2397 | if (!len) | ||
| 2398 | return 1; | ||
| 2399 | } | ||
| 1767 | 2400 | ||
| 1768 | kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); | ||
| 1769 | return ftrace_match(str, regex, len, type); | 2401 | return ftrace_match(str, regex, len, type); |
| 1770 | } | 2402 | } |
| 1771 | 2403 | ||
| 1772 | static int ftrace_match_records(char *buff, int len, int enable) | 2404 | static int |
| 2405 | match_records(struct ftrace_hash *hash, char *buff, | ||
| 2406 | int len, char *mod, int not) | ||
| 1773 | { | 2407 | { |
| 1774 | unsigned int search_len; | 2408 | unsigned search_len = 0; |
| 1775 | struct ftrace_page *pg; | 2409 | struct ftrace_page *pg; |
| 1776 | struct dyn_ftrace *rec; | 2410 | struct dyn_ftrace *rec; |
| 1777 | unsigned long flag; | 2411 | int type = MATCH_FULL; |
| 1778 | char *search; | 2412 | char *search = buff; |
| 1779 | int type; | ||
| 1780 | int not; | ||
| 1781 | int found = 0; | 2413 | int found = 0; |
| 2414 | int ret; | ||
| 1782 | 2415 | ||
| 1783 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | 2416 | if (len) { |
| 1784 | type = filter_parse_regex(buff, len, &search, ¬); | 2417 | type = filter_parse_regex(buff, len, &search, ¬); |
| 1785 | 2418 | search_len = strlen(search); | |
| 1786 | search_len = strlen(search); | 2419 | } |
| 1787 | 2420 | ||
| 1788 | mutex_lock(&ftrace_lock); | 2421 | mutex_lock(&ftrace_lock); |
| 1789 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1790 | 2422 | ||
| 1791 | if (rec->flags & FTRACE_FL_FAILED) | 2423 | if (unlikely(ftrace_disabled)) |
| 1792 | continue; | 2424 | goto out_unlock; |
| 1793 | 2425 | ||
| 1794 | if (ftrace_match_record(rec, search, search_len, type)) { | 2426 | do_for_each_ftrace_rec(pg, rec) { |
| 1795 | if (not) | 2427 | |
| 1796 | rec->flags &= ~flag; | 2428 | if (ftrace_match_record(rec, mod, search, search_len, type)) { |
| 1797 | else | 2429 | ret = enter_record(hash, rec, not); |
| 1798 | rec->flags |= flag; | 2430 | if (ret < 0) { |
| 2431 | found = ret; | ||
| 2432 | goto out_unlock; | ||
| 2433 | } | ||
| 1799 | found = 1; | 2434 | found = 1; |
| 1800 | } | 2435 | } |
| 1801 | /* | ||
| 1802 | * Only enable filtering if we have a function that | ||
| 1803 | * is filtered on. | ||
| 1804 | */ | ||
| 1805 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
| 1806 | ftrace_filtered = 1; | ||
| 1807 | } while_for_each_ftrace_rec(); | 2436 | } while_for_each_ftrace_rec(); |
| 2437 | out_unlock: | ||
| 1808 | mutex_unlock(&ftrace_lock); | 2438 | mutex_unlock(&ftrace_lock); |
| 1809 | 2439 | ||
| 1810 | return found; | 2440 | return found; |
| 1811 | } | 2441 | } |
| 1812 | 2442 | ||
| 1813 | static int | 2443 | static int |
| 1814 | ftrace_match_module_record(struct dyn_ftrace *rec, char *mod, | 2444 | ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) |
| 1815 | char *regex, int len, int type) | ||
| 1816 | { | 2445 | { |
| 1817 | char str[KSYM_SYMBOL_LEN]; | 2446 | return match_records(hash, buff, len, NULL, 0); |
| 1818 | char *modname; | ||
| 1819 | |||
| 1820 | kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); | ||
| 1821 | |||
| 1822 | if (!modname || strcmp(modname, mod)) | ||
| 1823 | return 0; | ||
| 1824 | |||
| 1825 | /* blank search means to match all funcs in the mod */ | ||
| 1826 | if (len) | ||
| 1827 | return ftrace_match(str, regex, len, type); | ||
| 1828 | else | ||
| 1829 | return 1; | ||
| 1830 | } | 2447 | } |
| 1831 | 2448 | ||
| 1832 | static int ftrace_match_module_records(char *buff, char *mod, int enable) | 2449 | static int |
| 2450 | ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) | ||
| 1833 | { | 2451 | { |
| 1834 | unsigned search_len = 0; | ||
| 1835 | struct ftrace_page *pg; | ||
| 1836 | struct dyn_ftrace *rec; | ||
| 1837 | int type = MATCH_FULL; | ||
| 1838 | char *search = buff; | ||
| 1839 | unsigned long flag; | ||
| 1840 | int not = 0; | 2452 | int not = 0; |
| 1841 | int found = 0; | ||
| 1842 | |||
| 1843 | flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; | ||
| 1844 | 2453 | ||
| 1845 | /* blank or '*' mean the same */ | 2454 | /* blank or '*' mean the same */ |
| 1846 | if (strcmp(buff, "*") == 0) | 2455 | if (strcmp(buff, "*") == 0) |
| @@ -1852,32 +2461,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
| 1852 | not = 1; | 2461 | not = 1; |
| 1853 | } | 2462 | } |
| 1854 | 2463 | ||
| 1855 | if (strlen(buff)) { | 2464 | return match_records(hash, buff, strlen(buff), mod, not); |
| 1856 | type = filter_parse_regex(buff, strlen(buff), &search, ¬); | ||
| 1857 | search_len = strlen(search); | ||
| 1858 | } | ||
| 1859 | |||
| 1860 | mutex_lock(&ftrace_lock); | ||
| 1861 | do_for_each_ftrace_rec(pg, rec) { | ||
| 1862 | |||
| 1863 | if (rec->flags & FTRACE_FL_FAILED) | ||
| 1864 | continue; | ||
| 1865 | |||
| 1866 | if (ftrace_match_module_record(rec, mod, | ||
| 1867 | search, search_len, type)) { | ||
| 1868 | if (not) | ||
| 1869 | rec->flags &= ~flag; | ||
| 1870 | else | ||
| 1871 | rec->flags |= flag; | ||
| 1872 | found = 1; | ||
| 1873 | } | ||
| 1874 | if (enable && (rec->flags & FTRACE_FL_FILTER)) | ||
| 1875 | ftrace_filtered = 1; | ||
| 1876 | |||
| 1877 | } while_for_each_ftrace_rec(); | ||
| 1878 | mutex_unlock(&ftrace_lock); | ||
| 1879 | |||
| 1880 | return found; | ||
| 1881 | } | 2465 | } |
| 1882 | 2466 | ||
| 1883 | /* | 2467 | /* |
| @@ -1886,9 +2470,11 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable) | |||
| 1886 | */ | 2470 | */ |
| 1887 | 2471 | ||
| 1888 | static int | 2472 | static int |
| 1889 | ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | 2473 | ftrace_mod_callback(struct ftrace_hash *hash, |
| 2474 | char *func, char *cmd, char *param, int enable) | ||
| 1890 | { | 2475 | { |
| 1891 | char *mod; | 2476 | char *mod; |
| 2477 | int ret = -EINVAL; | ||
| 1892 | 2478 | ||
| 1893 | /* | 2479 | /* |
| 1894 | * cmd == 'mod' because we only registered this func | 2480 | * cmd == 'mod' because we only registered this func |
| @@ -1900,15 +2486,19 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable) | |||
| 1900 | 2486 | ||
| 1901 | /* we must have a module name */ | 2487 | /* we must have a module name */ |
| 1902 | if (!param) | 2488 | if (!param) |
| 1903 | return -EINVAL; | 2489 | return ret; |
| 1904 | 2490 | ||
| 1905 | mod = strsep(¶m, ":"); | 2491 | mod = strsep(¶m, ":"); |
| 1906 | if (!strlen(mod)) | 2492 | if (!strlen(mod)) |
| 1907 | return -EINVAL; | 2493 | return ret; |
| 1908 | 2494 | ||
| 1909 | if (ftrace_match_module_records(func, mod, enable)) | 2495 | ret = ftrace_match_module_records(hash, func, mod); |
| 1910 | return 0; | 2496 | if (!ret) |
| 1911 | return -EINVAL; | 2497 | ret = -EINVAL; |
| 2498 | if (ret < 0) | ||
| 2499 | return ret; | ||
| 2500 | |||
| 2501 | return 0; | ||
| 1912 | } | 2502 | } |
| 1913 | 2503 | ||
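Following the parsing above, this command is reached from user space by writing, for example, ":mod:ext4" to set_ftrace_filter: the empty function part hits the blank-search case in ftrace_match_record() and selects every function of that module. The syntax here is reasoned from the parser in this file, not quoted from the ftrace documentation.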
| 1914 | static struct ftrace_func_command ftrace_mod_cmd = { | 2504 | static struct ftrace_func_command ftrace_mod_cmd = { |
| @@ -1959,6 +2549,7 @@ static int ftrace_probe_registered; | |||
| 1959 | 2549 | ||
| 1960 | static void __enable_ftrace_function_probe(void) | 2550 | static void __enable_ftrace_function_probe(void) |
| 1961 | { | 2551 | { |
| 2552 | int ret; | ||
| 1962 | int i; | 2553 | int i; |
| 1963 | 2554 | ||
| 1964 | if (ftrace_probe_registered) | 2555 | if (ftrace_probe_registered) |
| @@ -1973,13 +2564,16 @@ static void __enable_ftrace_function_probe(void) | |||
| 1973 | if (i == FTRACE_FUNC_HASHSIZE) | 2564 | if (i == FTRACE_FUNC_HASHSIZE) |
| 1974 | return; | 2565 | return; |
| 1975 | 2566 | ||
| 1976 | __register_ftrace_function(&trace_probe_ops); | 2567 | ret = __register_ftrace_function(&trace_probe_ops); |
| 1977 | ftrace_startup(0); | 2568 | if (!ret) |
| 2569 | ret = ftrace_startup(&trace_probe_ops, 0); | ||
| 2570 | |||
| 1978 | ftrace_probe_registered = 1; | 2571 | ftrace_probe_registered = 1; |
| 1979 | } | 2572 | } |
| 1980 | 2573 | ||
| 1981 | static void __disable_ftrace_function_probe(void) | 2574 | static void __disable_ftrace_function_probe(void) |
| 1982 | { | 2575 | { |
| 2576 | int ret; | ||
| 1983 | int i; | 2577 | int i; |
| 1984 | 2578 | ||
| 1985 | if (!ftrace_probe_registered) | 2579 | if (!ftrace_probe_registered) |
| @@ -1992,8 +2586,10 @@ static void __disable_ftrace_function_probe(void) | |||
| 1992 | } | 2586 | } |
| 1993 | 2587 | ||
| 1994 | /* no more funcs left */ | 2588 | /* no more funcs left */ |
| 1995 | __unregister_ftrace_function(&trace_probe_ops); | 2589 | ret = __unregister_ftrace_function(&trace_probe_ops); |
| 1996 | ftrace_shutdown(0); | 2590 | if (!ret) |
| 2591 | ftrace_shutdown(&trace_probe_ops, 0); | ||
| 2592 | |||
| 1997 | ftrace_probe_registered = 0; | 2593 | ftrace_probe_registered = 0; |
| 1998 | } | 2594 | } |
| 1999 | 2595 | ||
| @@ -2029,12 +2625,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, | |||
| 2029 | return -EINVAL; | 2625 | return -EINVAL; |
| 2030 | 2626 | ||
| 2031 | mutex_lock(&ftrace_lock); | 2627 | mutex_lock(&ftrace_lock); |
| 2032 | do_for_each_ftrace_rec(pg, rec) { | ||
| 2033 | 2628 | ||
| 2034 | if (rec->flags & FTRACE_FL_FAILED) | 2629 | if (unlikely(ftrace_disabled)) |
| 2035 | continue; | 2630 | goto out_unlock; |
| 2036 | 2631 | ||
| 2037 | if (!ftrace_match_record(rec, search, len, type)) | 2632 | do_for_each_ftrace_rec(pg, rec) { |
| 2633 | |||
| 2634 | if (!ftrace_match_record(rec, NULL, search, len, type)) | ||
| 2038 | continue; | 2635 | continue; |
| 2039 | 2636 | ||
| 2040 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); | 2637 | entry = kmalloc(sizeof(*entry), GFP_KERNEL); |
| @@ -2195,7 +2792,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd) | |||
| 2195 | return ret; | 2792 | return ret; |
| 2196 | } | 2793 | } |
| 2197 | 2794 | ||
| 2198 | static int ftrace_process_regex(char *buff, int len, int enable) | 2795 | static int ftrace_process_regex(struct ftrace_hash *hash, |
| 2796 | char *buff, int len, int enable) | ||
| 2199 | { | 2797 | { |
| 2200 | char *func, *command, *next = buff; | 2798 | char *func, *command, *next = buff; |
| 2201 | struct ftrace_func_command *p; | 2799 | struct ftrace_func_command *p; |
| @@ -2204,9 +2802,12 @@ static int ftrace_process_regex(char *buff, int len, int enable) | |||
| 2204 | func = strsep(&next, ":"); | 2802 | func = strsep(&next, ":"); |
| 2205 | 2803 | ||
| 2206 | if (!next) { | 2804 | if (!next) { |
| 2207 | if (ftrace_match_records(func, len, enable)) | 2805 | ret = ftrace_match_records(hash, func, len); |
| 2208 | return 0; | 2806 | if (!ret) |
| 2209 | return ret; | 2807 | ret = -EINVAL; |
| 2808 | if (ret < 0) | ||
| 2809 | return ret; | ||
| 2810 | return 0; | ||
| 2210 | } | 2811 | } |
| 2211 | 2812 | ||
| 2212 | /* command found */ | 2813 | /* command found */ |
| @@ -2216,7 +2817,7 @@ static int ftrace_process_regex(char *buff, int len, int enable) | |||
| 2216 | mutex_lock(&ftrace_cmd_mutex); | 2817 | mutex_lock(&ftrace_cmd_mutex); |
| 2217 | list_for_each_entry(p, &ftrace_commands, list) { | 2818 | list_for_each_entry(p, &ftrace_commands, list) { |
| 2218 | if (strcmp(p->name, command) == 0) { | 2819 | if (strcmp(p->name, command) == 0) { |
| 2219 | ret = p->func(func, command, next, enable); | 2820 | ret = p->func(hash, func, command, next, enable); |
| 2220 | goto out_unlock; | 2821 | goto out_unlock; |
| 2221 | } | 2822 | } |
| 2222 | } | 2823 | } |
| @@ -2239,6 +2840,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
| 2239 | 2840 | ||
| 2240 | mutex_lock(&ftrace_regex_lock); | 2841 | mutex_lock(&ftrace_regex_lock); |
| 2241 | 2842 | ||
| 2843 | ret = -ENODEV; | ||
| 2844 | if (unlikely(ftrace_disabled)) | ||
| 2845 | goto out_unlock; | ||
| 2846 | |||
| 2242 | if (file->f_mode & FMODE_READ) { | 2847 | if (file->f_mode & FMODE_READ) { |
| 2243 | struct seq_file *m = file->private_data; | 2848 | struct seq_file *m = file->private_data; |
| 2244 | iter = m->private; | 2849 | iter = m->private; |
| @@ -2250,7 +2855,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
| 2250 | 2855 | ||
| 2251 | if (read >= 0 && trace_parser_loaded(parser) && | 2856 | if (read >= 0 && trace_parser_loaded(parser) && |
| 2252 | !trace_parser_cont(parser)) { | 2857 | !trace_parser_cont(parser)) { |
| 2253 | ret = ftrace_process_regex(parser->buffer, | 2858 | ret = ftrace_process_regex(iter->hash, parser->buffer, |
| 2254 | parser->idx, enable); | 2859 | parser->idx, enable); |
| 2255 | trace_parser_clear(parser); | 2860 | trace_parser_clear(parser); |
| 2256 | if (ret) | 2861 | if (ret) |
| @@ -2278,22 +2883,53 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, | |||
| 2278 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); | 2883 | return ftrace_regex_write(file, ubuf, cnt, ppos, 0); |
| 2279 | } | 2884 | } |
| 2280 | 2885 | ||
| 2281 | static void | 2886 | static int |
| 2282 | ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | 2887 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, |
| 2888 | int reset, int enable) | ||
| 2283 | { | 2889 | { |
| 2890 | struct ftrace_hash **orig_hash; | ||
| 2891 | struct ftrace_hash *hash; | ||
| 2892 | int ret; | ||
| 2893 | |||
| 2894 | /* All global ops uses the global ops filters */ | ||
| 2895 | if (ops->flags & FTRACE_OPS_FL_GLOBAL) | ||
| 2896 | ops = &global_ops; | ||
| 2897 | |||
| 2284 | if (unlikely(ftrace_disabled)) | 2898 | if (unlikely(ftrace_disabled)) |
| 2285 | return; | 2899 | return -ENODEV; |
| 2900 | |||
| 2901 | if (enable) | ||
| 2902 | orig_hash = &ops->filter_hash; | ||
| 2903 | else | ||
| 2904 | orig_hash = &ops->notrace_hash; | ||
| 2905 | |||
| 2906 | hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); | ||
| 2907 | if (!hash) | ||
| 2908 | return -ENOMEM; | ||
| 2286 | 2909 | ||
| 2287 | mutex_lock(&ftrace_regex_lock); | 2910 | mutex_lock(&ftrace_regex_lock); |
| 2288 | if (reset) | 2911 | if (reset) |
| 2289 | ftrace_filter_reset(enable); | 2912 | ftrace_filter_reset(hash); |
| 2290 | if (buf) | 2913 | if (buf) |
| 2291 | ftrace_match_records(buf, len, enable); | 2914 | ftrace_match_records(hash, buf, len); |
| 2915 | |||
| 2916 | mutex_lock(&ftrace_lock); | ||
| 2917 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | ||
| 2918 | if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED | ||
| 2919 | && ftrace_enabled) | ||
| 2920 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
| 2921 | |||
| 2922 | mutex_unlock(&ftrace_lock); | ||
| 2923 | |||
| 2292 | mutex_unlock(&ftrace_regex_lock); | 2924 | mutex_unlock(&ftrace_regex_lock); |
| 2925 | |||
| 2926 | free_ftrace_hash(hash); | ||
| 2927 | return ret; | ||
| 2293 | } | 2928 | } |
| 2294 | 2929 | ||
| 2295 | /** | 2930 | /** |
| 2296 | * ftrace_set_filter - set a function to filter on in ftrace | 2931 | * ftrace_set_filter - set a function to filter on in ftrace |
| 2932 | * @ops - the ops to set the filter with | ||
| 2297 | * @buf - the string that holds the function filter text. | 2933 | * @buf - the string that holds the function filter text. |
| 2298 | * @len - the length of the string. | 2934 | * @len - the length of the string. |
| 2299 | * @reset - non zero to reset all filters before applying this filter. | 2935 | * @reset - non zero to reset all filters before applying this filter. |
| @@ -2301,13 +2937,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable) | |||
| 2301 | * Filters denote which functions should be enabled when tracing is enabled. | 2937 | * Filters denote which functions should be enabled when tracing is enabled. |
| 2302 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | 2938 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. |
| 2303 | */ | 2939 | */ |
| 2304 | void ftrace_set_filter(unsigned char *buf, int len, int reset) | 2940 | void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, |
| 2941 | int len, int reset) | ||
| 2305 | { | 2942 | { |
| 2306 | ftrace_set_regex(buf, len, reset, 1); | 2943 | ftrace_set_regex(ops, buf, len, reset, 1); |
| 2307 | } | 2944 | } |
| 2945 | EXPORT_SYMBOL_GPL(ftrace_set_filter); | ||
| 2308 | 2946 | ||
| 2309 | /** | 2947 | /** |
| 2310 | * ftrace_set_notrace - set a function to not trace in ftrace | 2948 | * ftrace_set_notrace - set a function to not trace in ftrace |
| 2949 | * @ops - the ops to set the notrace filter with | ||
| 2311 | * @buf - the string that holds the function notrace text. | 2950 | * @buf - the string that holds the function notrace text. |
| 2312 | * @len - the length of the string. | 2951 | * @len - the length of the string. |
| 2313 | * @reset - non zero to reset all filters before applying this filter. | 2952 | * @reset - non zero to reset all filters before applying this filter. |
| @@ -2316,10 +2955,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset) | |||
| 2316 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | 2955 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled |
| 2317 | * for tracing. | 2956 | * for tracing. |
| 2318 | */ | 2957 | */ |
| 2319 | void ftrace_set_notrace(unsigned char *buf, int len, int reset) | 2958 | void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, |
| 2959 | int len, int reset) | ||
| 2320 | { | 2960 | { |
| 2321 | ftrace_set_regex(buf, len, reset, 0); | 2961 | ftrace_set_regex(ops, buf, len, reset, 0); |
| 2322 | } | 2962 | } |
| 2963 | EXPORT_SYMBOL_GPL(ftrace_set_notrace); | ||
| 2964 | /** | ||
| 2965 | * ftrace_set_global_filter - set a function to filter on with the global tracers | ||
| 2966 | * @buf - the string that holds the function filter text. | ||
| 2967 | * @len - the length of the string. | ||
| 2968 | * @reset - non zero to reset all filters before applying this filter. | ||
| 2969 | * | ||
| 2970 | * The filter is applied to the shared global_ops used by the global tracers. | ||
| 2971 | * Filters denote which functions should be enabled when tracing is enabled. | ||
| 2972 | * If @buf is NULL and reset is set, all functions will be enabled for tracing. | ||
| 2973 | */ | ||
| 2974 | void ftrace_set_global_filter(unsigned char *buf, int len, int reset) | ||
| 2975 | { | ||
| 2976 | ftrace_set_regex(&global_ops, buf, len, reset, 1); | ||
| 2977 | } | ||
| 2978 | EXPORT_SYMBOL_GPL(ftrace_set_global_filter); | ||
| 2979 | |||
| 2980 | /** | ||
| 2981 | * ftrace_set_global_notrace - set a function to not trace with the global tracers | ||
| 2982 | * @buf - the string that holds the function notrace text. | ||
| 2983 | * @len - the length of the string. | ||
| 2984 | * @reset - non zero to reset all filters before applying this filter. | ||
| 2985 | * | ||
| 2986 | * The notrace list is applied to the shared global_ops used by the global tracers. | ||
| 2987 | * Notrace Filters denote which functions should not be enabled when tracing | ||
| 2988 | * is enabled. If @buf is NULL and reset is set, all functions will be enabled | ||
| 2989 | * for tracing. | ||
| 2990 | */ | ||
| 2991 | void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) | ||
| 2992 | { | ||
| 2993 | ftrace_set_regex(&global_ops, buf, len, reset, 0); | ||
| 2994 | } | ||
| 2995 | EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); | ||
| 2323 | 2996 | ||
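The visible outcome of the refactoring is this API split: ftrace_set_filter()/ftrace_set_notrace() now act on one caller-owned ftrace_ops, while the *_global_* variants keep the old behaviour for the shared global_ops. A hedged usage sketch from a hypothetical caller; the names and the pattern are invented for illustration:

    #include <linux/ftrace.h>
    #include <linux/string.h>

    static void my_callback(unsigned long ip, unsigned long parent_ip)
    {
        /* called only for functions matching my_ops' private filter hash */
    }

    static struct ftrace_ops my_ops = {
        .func = my_callback,
    };

    static char my_pattern[] = "kmalloc*";

    static int my_tracer_start(void)
    {
        /* filter my_ops only; tracers using the global filter are unaffected */
        ftrace_set_filter(&my_ops, (unsigned char *)my_pattern,
                          strlen(my_pattern), 1);
        return register_ftrace_function(&my_ops);
    }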
| 2324 | /* | 2997 | /* |
| 2325 | * command line interface to allow users to set filters on boot up. | 2998 | * command line interface to allow users to set filters on boot up. |
| @@ -2370,22 +3043,23 @@ static void __init set_ftrace_early_graph(char *buf) | |||
| 2370 | } | 3043 | } |
| 2371 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ | 3044 | #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ |
| 2372 | 3045 | ||
| 2373 | static void __init set_ftrace_early_filter(char *buf, int enable) | 3046 | static void __init |
| 3047 | set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) | ||
| 2374 | { | 3048 | { |
| 2375 | char *func; | 3049 | char *func; |
| 2376 | 3050 | ||
| 2377 | while (buf) { | 3051 | while (buf) { |
| 2378 | func = strsep(&buf, ","); | 3052 | func = strsep(&buf, ","); |
| 2379 | ftrace_set_regex(func, strlen(func), 0, enable); | 3053 | ftrace_set_regex(ops, func, strlen(func), 0, enable); |
| 2380 | } | 3054 | } |
| 2381 | } | 3055 | } |
| 2382 | 3056 | ||
| 2383 | static void __init set_ftrace_early_filters(void) | 3057 | static void __init set_ftrace_early_filters(void) |
| 2384 | { | 3058 | { |
| 2385 | if (ftrace_filter_buf[0]) | 3059 | if (ftrace_filter_buf[0]) |
| 2386 | set_ftrace_early_filter(ftrace_filter_buf, 1); | 3060 | set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); |
| 2387 | if (ftrace_notrace_buf[0]) | 3061 | if (ftrace_notrace_buf[0]) |
| 2388 | set_ftrace_early_filter(ftrace_notrace_buf, 0); | 3062 | set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); |
| 2389 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3063 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 2390 | if (ftrace_graph_buf[0]) | 3064 | if (ftrace_graph_buf[0]) |
| 2391 | set_ftrace_early_graph(ftrace_graph_buf); | 3065 | set_ftrace_early_graph(ftrace_graph_buf); |
| @@ -2393,11 +3067,14 @@ static void __init set_ftrace_early_filters(void) | |||
| 2393 | } | 3067 | } |
| 2394 | 3068 | ||
| 2395 | static int | 3069 | static int |
| 2396 | ftrace_regex_release(struct inode *inode, struct file *file, int enable) | 3070 | ftrace_regex_release(struct inode *inode, struct file *file) |
| 2397 | { | 3071 | { |
| 2398 | struct seq_file *m = (struct seq_file *)file->private_data; | 3072 | struct seq_file *m = (struct seq_file *)file->private_data; |
| 2399 | struct ftrace_iterator *iter; | 3073 | struct ftrace_iterator *iter; |
| 3074 | struct ftrace_hash **orig_hash; | ||
| 2400 | struct trace_parser *parser; | 3075 | struct trace_parser *parser; |
| 3076 | int filter_hash; | ||
| 3077 | int ret; | ||
| 2401 | 3078 | ||
| 2402 | mutex_lock(&ftrace_regex_lock); | 3079 | mutex_lock(&ftrace_regex_lock); |
| 2403 | if (file->f_mode & FMODE_READ) { | 3080 | if (file->f_mode & FMODE_READ) { |
| @@ -2410,33 +3087,35 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) | |||
| 2410 | parser = &iter->parser; | 3087 | parser = &iter->parser; |
| 2411 | if (trace_parser_loaded(parser)) { | 3088 | if (trace_parser_loaded(parser)) { |
| 2412 | parser->buffer[parser->idx] = 0; | 3089 | parser->buffer[parser->idx] = 0; |
| 2413 | ftrace_match_records(parser->buffer, parser->idx, enable); | 3090 | ftrace_match_records(iter->hash, parser->buffer, parser->idx); |
| 2414 | } | 3091 | } |
| 2415 | 3092 | ||
| 2416 | mutex_lock(&ftrace_lock); | ||
| 2417 | if (ftrace_start_up && ftrace_enabled) | ||
| 2418 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
| 2419 | mutex_unlock(&ftrace_lock); | ||
| 2420 | |||
| 2421 | trace_parser_put(parser); | 3093 | trace_parser_put(parser); |
| 3094 | |||
| 3095 | if (file->f_mode & FMODE_WRITE) { | ||
| 3096 | filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); | ||
| 3097 | |||
| 3098 | if (filter_hash) | ||
| 3099 | orig_hash = &iter->ops->filter_hash; | ||
| 3100 | else | ||
| 3101 | orig_hash = &iter->ops->notrace_hash; | ||
| 3102 | |||
| 3103 | mutex_lock(&ftrace_lock); | ||
| 3104 | ret = ftrace_hash_move(iter->ops, filter_hash, | ||
| 3105 | orig_hash, iter->hash); | ||
| 3106 | if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) | ||
| 3107 | && ftrace_enabled) | ||
| 3108 | ftrace_run_update_code(FTRACE_ENABLE_CALLS); | ||
| 3109 | |||
| 3110 | mutex_unlock(&ftrace_lock); | ||
| 3111 | } | ||
| 3112 | free_ftrace_hash(iter->hash); | ||
| 2422 | kfree(iter); | 3113 | kfree(iter); |
| 2423 | 3114 | ||
| 2424 | mutex_unlock(&ftrace_regex_lock); | 3115 | mutex_unlock(&ftrace_regex_lock); |
| 2425 | return 0; | 3116 | return 0; |
| 2426 | } | 3117 | } |
| 2427 | 3118 | ||
| 2428 | static int | ||
| 2429 | ftrace_filter_release(struct inode *inode, struct file *file) | ||
| 2430 | { | ||
| 2431 | return ftrace_regex_release(inode, file, 1); | ||
| 2432 | } | ||
| 2433 | |||
| 2434 | static int | ||
| 2435 | ftrace_notrace_release(struct inode *inode, struct file *file) | ||
| 2436 | { | ||
| 2437 | return ftrace_regex_release(inode, file, 0); | ||
| 2438 | } | ||
| 2439 | |||
| 2440 | static const struct file_operations ftrace_avail_fops = { | 3119 | static const struct file_operations ftrace_avail_fops = { |
| 2441 | .open = ftrace_avail_open, | 3120 | .open = ftrace_avail_open, |
| 2442 | .read = seq_read, | 3121 | .read = seq_read, |
| @@ -2444,8 +3123,8 @@ static const struct file_operations ftrace_avail_fops = { | |||
| 2444 | .release = seq_release_private, | 3123 | .release = seq_release_private, |
| 2445 | }; | 3124 | }; |
| 2446 | 3125 | ||
| 2447 | static const struct file_operations ftrace_failures_fops = { | 3126 | static const struct file_operations ftrace_enabled_fops = { |
| 2448 | .open = ftrace_failures_open, | 3127 | .open = ftrace_enabled_open, |
| 2449 | .read = seq_read, | 3128 | .read = seq_read, |
| 2450 | .llseek = seq_lseek, | 3129 | .llseek = seq_lseek, |
| 2451 | .release = seq_release_private, | 3130 | .release = seq_release_private, |
| @@ -2456,7 +3135,7 @@ static const struct file_operations ftrace_filter_fops = { | |||
| 2456 | .read = seq_read, | 3135 | .read = seq_read, |
| 2457 | .write = ftrace_filter_write, | 3136 | .write = ftrace_filter_write, |
| 2458 | .llseek = ftrace_regex_lseek, | 3137 | .llseek = ftrace_regex_lseek, |
| 2459 | .release = ftrace_filter_release, | 3138 | .release = ftrace_regex_release, |
| 2460 | }; | 3139 | }; |
| 2461 | 3140 | ||
| 2462 | static const struct file_operations ftrace_notrace_fops = { | 3141 | static const struct file_operations ftrace_notrace_fops = { |
| @@ -2464,7 +3143,7 @@ static const struct file_operations ftrace_notrace_fops = { | |||
| 2464 | .read = seq_read, | 3143 | .read = seq_read, |
| 2465 | .write = ftrace_notrace_write, | 3144 | .write = ftrace_notrace_write, |
| 2466 | .llseek = ftrace_regex_lseek, | 3145 | .llseek = ftrace_regex_lseek, |
| 2467 | .release = ftrace_notrace_release, | 3146 | .release = ftrace_regex_release, |
| 2468 | }; | 3147 | }; |
| 2469 | 3148 | ||
| 2470 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 3149 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| @@ -2573,9 +3252,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
| 2573 | bool exists; | 3252 | bool exists; |
| 2574 | int i; | 3253 | int i; |
| 2575 | 3254 | ||
| 2576 | if (ftrace_disabled) | ||
| 2577 | return -ENODEV; | ||
| 2578 | |||
| 2579 | /* decode regex */ | 3255 | /* decode regex */ |
| 2580 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); | 3256 | type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); |
| 2581 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) | 3257 | if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) |
| @@ -2584,12 +3260,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) | |||
| 2584 | search_len = strlen(search); | 3260 | search_len = strlen(search); |
| 2585 | 3261 | ||
| 2586 | mutex_lock(&ftrace_lock); | 3262 | mutex_lock(&ftrace_lock); |
| 3263 | |||
| 3264 | if (unlikely(ftrace_disabled)) { | ||
| 3265 | mutex_unlock(&ftrace_lock); | ||
| 3266 | return -ENODEV; | ||
| 3267 | } | ||
| 3268 | |||
| 2587 | do_for_each_ftrace_rec(pg, rec) { | 3269 | do_for_each_ftrace_rec(pg, rec) { |
| 2588 | 3270 | ||
| 2589 | if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) | 3271 | if (rec->flags & FTRACE_FL_FREE) |
| 2590 | continue; | 3272 | continue; |
| 2591 | 3273 | ||
| 2592 | if (ftrace_match_record(rec, search, search_len, type)) { | 3274 | if (ftrace_match_record(rec, NULL, search, search_len, type)) { |
| 2593 | /* if it is in the array */ | 3275 | /* if it is in the array */ |
| 2594 | exists = false; | 3276 | exists = false; |
| 2595 | for (i = 0; i < *idx; i++) { | 3277 | for (i = 0; i < *idx; i++) { |
| @@ -2679,8 +3361,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) | |||
| 2679 | trace_create_file("available_filter_functions", 0444, | 3361 | trace_create_file("available_filter_functions", 0444, |
| 2680 | d_tracer, NULL, &ftrace_avail_fops); | 3362 | d_tracer, NULL, &ftrace_avail_fops); |
| 2681 | 3363 | ||
| 2682 | trace_create_file("failures", 0444, | 3364 | trace_create_file("enabled_functions", 0444, |
| 2683 | d_tracer, NULL, &ftrace_failures_fops); | 3365 | d_tracer, NULL, &ftrace_enabled_fops); |
| 2684 | 3366 | ||
| 2685 | trace_create_file("set_ftrace_filter", 0644, d_tracer, | 3367 | trace_create_file("set_ftrace_filter", 0644, d_tracer, |
| 2686 | NULL, &ftrace_filter_fops); | 3368 | NULL, &ftrace_filter_fops); |
| @@ -2703,7 +3385,7 @@ static int ftrace_process_locs(struct module *mod, | |||
| 2703 | { | 3385 | { |
| 2704 | unsigned long *p; | 3386 | unsigned long *p; |
| 2705 | unsigned long addr; | 3387 | unsigned long addr; |
| 2706 | unsigned long flags; | 3388 | unsigned long flags = 0; /* Shut up gcc */ |
| 2707 | 3389 | ||
| 2708 | mutex_lock(&ftrace_lock); | 3390 | mutex_lock(&ftrace_lock); |
| 2709 | p = start; | 3391 | p = start; |
| @@ -2720,10 +3402,19 @@ static int ftrace_process_locs(struct module *mod, | |||
| 2720 | ftrace_record_ip(addr); | 3402 | ftrace_record_ip(addr); |
| 2721 | } | 3403 | } |
| 2722 | 3404 | ||
| 2723 | /* disable interrupts to prevent kstop machine */ | 3405 | /* |
| 2724 | local_irq_save(flags); | 3406 | * We only need to disable interrupts on start up |
| 3407 | * because we are modifying code that an interrupt | ||
| 3408 | * may execute, and the modification is not atomic. | ||
| 3409 | * But for modules, nothing runs the code we modify | ||
| 3410 | * until we are finished with it, and there's no | ||
| 3411 | * reason to cause large interrupt latencies while we do it. | ||
| 3412 | */ | ||
| 3413 | if (!mod) | ||
| 3414 | local_irq_save(flags); | ||
| 2725 | ftrace_update_code(mod); | 3415 | ftrace_update_code(mod); |
| 2726 | local_irq_restore(flags); | 3416 | if (!mod) |
| 3417 | local_irq_restore(flags); | ||
| 2727 | mutex_unlock(&ftrace_lock); | 3418 | mutex_unlock(&ftrace_lock); |
| 2728 | 3419 | ||
| 2729 | return 0; | 3420 | return 0; |
| @@ -2735,10 +3426,11 @@ void ftrace_release_mod(struct module *mod) | |||
| 2735 | struct dyn_ftrace *rec; | 3426 | struct dyn_ftrace *rec; |
| 2736 | struct ftrace_page *pg; | 3427 | struct ftrace_page *pg; |
| 2737 | 3428 | ||
| 3429 | mutex_lock(&ftrace_lock); | ||
| 3430 | |||
| 2738 | if (ftrace_disabled) | 3431 | if (ftrace_disabled) |
| 2739 | return; | 3432 | goto out_unlock; |
| 2740 | 3433 | ||
| 2741 | mutex_lock(&ftrace_lock); | ||
| 2742 | do_for_each_ftrace_rec(pg, rec) { | 3434 | do_for_each_ftrace_rec(pg, rec) { |
| 2743 | if (within_module_core(rec->ip, mod)) { | 3435 | if (within_module_core(rec->ip, mod)) { |
| 2744 | /* | 3436 | /* |
| @@ -2749,6 +3441,7 @@ void ftrace_release_mod(struct module *mod) | |||
| 2749 | ftrace_free_rec(rec); | 3441 | ftrace_free_rec(rec); |
| 2750 | } | 3442 | } |
| 2751 | } while_for_each_ftrace_rec(); | 3443 | } while_for_each_ftrace_rec(); |
| 3444 | out_unlock: | ||
| 2752 | mutex_unlock(&ftrace_lock); | 3445 | mutex_unlock(&ftrace_lock); |
| 2753 | } | 3446 | } |
| 2754 | 3447 | ||
| @@ -2835,6 +3528,10 @@ void __init ftrace_init(void) | |||
| 2835 | 3528 | ||
| 2836 | #else | 3529 | #else |
| 2837 | 3530 | ||
| 3531 | static struct ftrace_ops global_ops = { | ||
| 3532 | .func = ftrace_stub, | ||
| 3533 | }; | ||
| 3534 | |||
| 2838 | static int __init ftrace_nodyn_init(void) | 3535 | static int __init ftrace_nodyn_init(void) |
| 2839 | { | 3536 | { |
| 2840 | ftrace_enabled = 1; | 3537 | ftrace_enabled = 1; |
| @@ -2845,12 +3542,47 @@ device_initcall(ftrace_nodyn_init); | |||
| 2845 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } | 3542 | static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } |
| 2846 | static inline void ftrace_startup_enable(int command) { } | 3543 | static inline void ftrace_startup_enable(int command) { } |
| 2847 | /* Keep as macros so we do not need to define the commands */ | 3544 | /* Keep as macros so we do not need to define the commands */ |
| 2848 | # define ftrace_startup(command) do { } while (0) | 3545 | # define ftrace_startup(ops, command) \ |
| 2849 | # define ftrace_shutdown(command) do { } while (0) | 3546 | ({ \ |
| 3547 | (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ | ||
| 3548 | 0; \ | ||
| 3549 | }) | ||
| 3550 | # define ftrace_shutdown(ops, command) do { } while (0) | ||
| 2850 | # define ftrace_startup_sysctl() do { } while (0) | 3551 | # define ftrace_startup_sysctl() do { } while (0) |
| 2851 | # define ftrace_shutdown_sysctl() do { } while (0) | 3552 | # define ftrace_shutdown_sysctl() do { } while (0) |
| 3553 | |||
| 3554 | static inline int | ||
| 3555 | ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | ||
| 3556 | { | ||
| 3557 | return 1; | ||
| 3558 | } | ||
| 3559 | |||
| 2852 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 3560 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
| 2853 | 3561 | ||
| 3562 | static void | ||
| 3563 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | ||
| 3564 | { | ||
| 3565 | struct ftrace_ops *op; | ||
| 3566 | |||
| 3567 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) | ||
| 3568 | return; | ||
| 3569 | |||
| 3570 | trace_recursion_set(TRACE_INTERNAL_BIT); | ||
| 3571 | /* | ||
| 3572 | * Some of the ops may be dynamically allocated, | ||
| 3573 | * they must be freed after a synchronize_sched(). | ||
| 3574 | */ | ||
| 3575 | preempt_disable_notrace(); | ||
| 3576 | op = rcu_dereference_raw(ftrace_ops_list); | ||
| 3577 | while (op != &ftrace_list_end) { | ||
| 3578 | if (ftrace_ops_test(op, ip)) | ||
| 3579 | op->func(ip, parent_ip); | ||
| 3580 | op = rcu_dereference_raw(op->next); | ||
| 3581 | } | ||
| 3582 | preempt_enable_notrace(); | ||
| 3583 | trace_recursion_clear(TRACE_INTERNAL_BIT); | ||
| 3584 | } | ||
| 3585 | |||
| 2854 | static void clear_ftrace_swapper(void) | 3586 | static void clear_ftrace_swapper(void) |
| 2855 | { | 3587 | { |
| 2856 | struct task_struct *p; | 3588 | struct task_struct *p; |
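ftrace_ops_list_func() is the new multiplexer: it sets TRACE_INTERNAL_BIT to keep itself from recursing, disables preemption (dynamically allocated ops are only freed after synchronize_sched()), and calls every registered ops->func in turn. A hedged sketch of a client of this interface, assuming the callback prototype used elsewhere in this patch (ip, parent_ip); the names are hypothetical:

    #include <linux/ftrace.h>

    static void my_callback(unsigned long ip, unsigned long parent_ip)
    {
            /* runs for every traced function while my_ops is registered;
             * it must be re-entrant and must not sleep */
    }

    static struct ftrace_ops my_ops = {
            .func   = my_callback,
    };

With a single ops registered the trampoline calls my_callback directly; as soon as a second one is registered, ftrace_trace_function is switched to ftrace_ops_list_func() (see the sysctl hunk further down) and the list walk above takes over.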
| @@ -3143,19 +3875,23 @@ void ftrace_kill(void) | |||
| 3143 | */ | 3875 | */ |
| 3144 | int register_ftrace_function(struct ftrace_ops *ops) | 3876 | int register_ftrace_function(struct ftrace_ops *ops) |
| 3145 | { | 3877 | { |
| 3146 | int ret; | 3878 | int ret = -1; |
| 3147 | |||
| 3148 | if (unlikely(ftrace_disabled)) | ||
| 3149 | return -1; | ||
| 3150 | 3879 | ||
| 3151 | mutex_lock(&ftrace_lock); | 3880 | mutex_lock(&ftrace_lock); |
| 3152 | 3881 | ||
| 3882 | if (unlikely(ftrace_disabled)) | ||
| 3883 | goto out_unlock; | ||
| 3884 | |||
| 3153 | ret = __register_ftrace_function(ops); | 3885 | ret = __register_ftrace_function(ops); |
| 3154 | ftrace_startup(0); | 3886 | if (!ret) |
| 3887 | ret = ftrace_startup(ops, 0); | ||
| 3888 | |||
| 3155 | 3889 | ||
| 3890 | out_unlock: | ||
| 3156 | mutex_unlock(&ftrace_lock); | 3891 | mutex_unlock(&ftrace_lock); |
| 3157 | return ret; | 3892 | return ret; |
| 3158 | } | 3893 | } |
| 3894 | EXPORT_SYMBOL_GPL(register_ftrace_function); | ||
| 3159 | 3895 | ||
| 3160 | /** | 3896 | /** |
| 3161 | * unregister_ftrace_function - unregister a function for profiling. | 3897 | * unregister_ftrace_function - unregister a function for profiling. |
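register_ftrace_function() now checks ftrace_disabled under the lock, calls ftrace_startup() only when registration succeeded, returns its result, and is exported for modules (as is unregister_ftrace_function() in the next hunk). A short hedged usage sketch, reusing the hypothetical my_ops from the previous sketch:

    int ret;

    ret = register_ftrace_function(&my_ops);
    if (ret) {
            pr_warn("could not register ftrace callback: %d\n", ret);
            return ret;
    }

    /* ... traced workload runs here ... */

    unregister_ftrace_function(&my_ops);

The event self-test at the end of this patch adopts exactly this check.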
| @@ -3169,25 +3905,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops) | |||
| 3169 | 3905 | ||
| 3170 | mutex_lock(&ftrace_lock); | 3906 | mutex_lock(&ftrace_lock); |
| 3171 | ret = __unregister_ftrace_function(ops); | 3907 | ret = __unregister_ftrace_function(ops); |
| 3172 | ftrace_shutdown(0); | 3908 | if (!ret) |
| 3909 | ftrace_shutdown(ops, 0); | ||
| 3173 | mutex_unlock(&ftrace_lock); | 3910 | mutex_unlock(&ftrace_lock); |
| 3174 | 3911 | ||
| 3175 | return ret; | 3912 | return ret; |
| 3176 | } | 3913 | } |
| 3914 | EXPORT_SYMBOL_GPL(unregister_ftrace_function); | ||
| 3177 | 3915 | ||
| 3178 | int | 3916 | int |
| 3179 | ftrace_enable_sysctl(struct ctl_table *table, int write, | 3917 | ftrace_enable_sysctl(struct ctl_table *table, int write, |
| 3180 | void __user *buffer, size_t *lenp, | 3918 | void __user *buffer, size_t *lenp, |
| 3181 | loff_t *ppos) | 3919 | loff_t *ppos) |
| 3182 | { | 3920 | { |
| 3183 | int ret; | 3921 | int ret = -ENODEV; |
| 3184 | |||
| 3185 | if (unlikely(ftrace_disabled)) | ||
| 3186 | return -ENODEV; | ||
| 3187 | 3922 | ||
| 3188 | mutex_lock(&ftrace_lock); | 3923 | mutex_lock(&ftrace_lock); |
| 3189 | 3924 | ||
| 3190 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 3925 | if (unlikely(ftrace_disabled)) |
| 3926 | goto out; | ||
| 3927 | |||
| 3928 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 3191 | 3929 | ||
| 3192 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) | 3930 | if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) |
| 3193 | goto out; | 3931 | goto out; |
| @@ -3199,11 +3937,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, | |||
| 3199 | ftrace_startup_sysctl(); | 3937 | ftrace_startup_sysctl(); |
| 3200 | 3938 | ||
| 3201 | /* we are starting ftrace again */ | 3939 | /* we are starting ftrace again */ |
| 3202 | if (ftrace_list != &ftrace_list_end) { | 3940 | if (ftrace_ops_list != &ftrace_list_end) { |
| 3203 | if (ftrace_list->next == &ftrace_list_end) | 3941 | if (ftrace_ops_list->next == &ftrace_list_end) |
| 3204 | ftrace_trace_function = ftrace_list->func; | 3942 | ftrace_trace_function = ftrace_ops_list->func; |
| 3205 | else | 3943 | else |
| 3206 | ftrace_trace_function = ftrace_list_func; | 3944 | ftrace_trace_function = ftrace_ops_list_func; |
| 3207 | } | 3945 | } |
| 3208 | 3946 | ||
| 3209 | } else { | 3947 | } else { |
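With ftrace_list renamed to ftrace_ops_list, the sysctl path re-derives which function to hook in the same way as the registration code. The decision rule, pulled out into a hedged stand-alone sketch (pick_trace_function() is not a real helper in this patch):

    static ftrace_func_t pick_trace_function(void)
    {
            if (ftrace_ops_list == &ftrace_list_end)
                    return ftrace_stub;             /* nothing registered */
            if (ftrace_ops_list->next == &ftrace_list_end)
                    return ftrace_ops_list->func;   /* one ops: call it directly */
            return ftrace_ops_list_func;            /* several: walk the list */
    }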
| @@ -3392,7 +4130,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, | |||
| 3392 | ftrace_graph_return = retfunc; | 4130 | ftrace_graph_return = retfunc; |
| 3393 | ftrace_graph_entry = entryfunc; | 4131 | ftrace_graph_entry = entryfunc; |
| 3394 | 4132 | ||
| 3395 | ftrace_startup(FTRACE_START_FUNC_RET); | 4133 | ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); |
| 3396 | 4134 | ||
| 3397 | out: | 4135 | out: |
| 3398 | mutex_unlock(&ftrace_lock); | 4136 | mutex_unlock(&ftrace_lock); |
| @@ -3409,7 +4147,7 @@ void unregister_ftrace_graph(void) | |||
| 3409 | ftrace_graph_active--; | 4147 | ftrace_graph_active--; |
| 3410 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; | 4148 | ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; |
| 3411 | ftrace_graph_entry = ftrace_graph_entry_stub; | 4149 | ftrace_graph_entry = ftrace_graph_entry_stub; |
| 3412 | ftrace_shutdown(FTRACE_STOP_FUNC_RET); | 4150 | ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); |
| 3413 | unregister_pm_notifier(&ftrace_suspend_notifier); | 4151 | unregister_pm_notifier(&ftrace_suspend_notifier); |
| 3414 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); | 4152 | unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); |
| 3415 | 4153 | ||
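The function-graph tracer now reserves the function hook through the new global_ops, and register_ftrace_graph() returns whatever ftrace_startup() returned, so callers finally see start-up failures. A hedged caller sketch; the callback prototypes are assumed from this kernel generation and all names are hypothetical:

    #include <linux/ftrace.h>

    static int my_graph_entry(struct ftrace_graph_ent *trace)
    {
            return 1;       /* non-zero: record this call and its children */
    }

    static void my_graph_return(struct ftrace_graph_ret *trace)
    {
    }

    static int __init my_graph_init(void)
    {
            /* the return value now includes ftrace_startup(&global_ops, ...) */
            return register_ftrace_graph(my_graph_return, my_graph_entry);
    }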
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0ef7b4b2a1f7..731201bf4acc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -997,15 +997,21 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
| 997 | unsigned nr_pages) | 997 | unsigned nr_pages) |
| 998 | { | 998 | { |
| 999 | struct buffer_page *bpage, *tmp; | 999 | struct buffer_page *bpage, *tmp; |
| 1000 | unsigned long addr; | ||
| 1001 | LIST_HEAD(pages); | 1000 | LIST_HEAD(pages); |
| 1002 | unsigned i; | 1001 | unsigned i; |
| 1003 | 1002 | ||
| 1004 | WARN_ON(!nr_pages); | 1003 | WARN_ON(!nr_pages); |
| 1005 | 1004 | ||
| 1006 | for (i = 0; i < nr_pages; i++) { | 1005 | for (i = 0; i < nr_pages; i++) { |
| 1006 | struct page *page; | ||
| 1007 | /* | ||
| 1008 | * The __GFP_NORETRY flag makes sure that the allocation fails | ||
| 1009 | * gracefully without invoking the OOM killer, so the system is | ||
| 1010 | * not destabilized. | ||
| 1011 | */ | ||
| 1007 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 1012 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
| 1008 | GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); | 1013 | GFP_KERNEL | __GFP_NORETRY, |
| 1014 | cpu_to_node(cpu_buffer->cpu)); | ||
| 1009 | if (!bpage) | 1015 | if (!bpage) |
| 1010 | goto free_pages; | 1016 | goto free_pages; |
| 1011 | 1017 | ||
| @@ -1013,10 +1019,11 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
| 1013 | 1019 | ||
| 1014 | list_add(&bpage->list, &pages); | 1020 | list_add(&bpage->list, &pages); |
| 1015 | 1021 | ||
| 1016 | addr = __get_free_page(GFP_KERNEL); | 1022 | page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), |
| 1017 | if (!addr) | 1023 | GFP_KERNEL | __GFP_NORETRY, 0); |
| 1024 | if (!page) | ||
| 1018 | goto free_pages; | 1025 | goto free_pages; |
| 1019 | bpage->page = (void *)addr; | 1026 | bpage->page = page_address(page); |
| 1020 | rb_init_page(bpage->page); | 1027 | rb_init_page(bpage->page); |
| 1021 | } | 1028 | } |
| 1022 | 1029 | ||
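The buffer pages themselves move from __get_free_page() to alloc_pages_node(), so each per-CPU buffer is backed by memory on its own NUMA node, and __GFP_NORETRY keeps a large allocation from escalating into the OOM killer (the single reader page allocated a couple of hunks further down stays plain GFP_KERNEL). A hedged sketch of the allocation/free pairing, outside any real function of this patch:

    struct page *page;
    void *data;

    /* one page from this CPU's node; fail fast under memory pressure */
    page = alloc_pages_node(cpu_to_node(cpu),
                            GFP_KERNEL | __GFP_NORETRY, 0);
    if (!page)
            return -ENOMEM;
    data = page_address(page);

    /* ... hand the page to the ring buffer ... */

    free_page((unsigned long)data);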
| @@ -1045,7 +1052,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
| 1045 | { | 1052 | { |
| 1046 | struct ring_buffer_per_cpu *cpu_buffer; | 1053 | struct ring_buffer_per_cpu *cpu_buffer; |
| 1047 | struct buffer_page *bpage; | 1054 | struct buffer_page *bpage; |
| 1048 | unsigned long addr; | 1055 | struct page *page; |
| 1049 | int ret; | 1056 | int ret; |
| 1050 | 1057 | ||
| 1051 | cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), | 1058 | cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), |
| @@ -1067,10 +1074,10 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
| 1067 | rb_check_bpage(cpu_buffer, bpage); | 1074 | rb_check_bpage(cpu_buffer, bpage); |
| 1068 | 1075 | ||
| 1069 | cpu_buffer->reader_page = bpage; | 1076 | cpu_buffer->reader_page = bpage; |
| 1070 | addr = __get_free_page(GFP_KERNEL); | 1077 | page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0); |
| 1071 | if (!addr) | 1078 | if (!page) |
| 1072 | goto fail_free_reader; | 1079 | goto fail_free_reader; |
| 1073 | bpage->page = (void *)addr; | 1080 | bpage->page = page_address(page); |
| 1074 | rb_init_page(bpage->page); | 1081 | rb_init_page(bpage->page); |
| 1075 | 1082 | ||
| 1076 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 1083 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
| @@ -1314,7 +1321,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
| 1314 | unsigned nr_pages, rm_pages, new_pages; | 1321 | unsigned nr_pages, rm_pages, new_pages; |
| 1315 | struct buffer_page *bpage, *tmp; | 1322 | struct buffer_page *bpage, *tmp; |
| 1316 | unsigned long buffer_size; | 1323 | unsigned long buffer_size; |
| 1317 | unsigned long addr; | ||
| 1318 | LIST_HEAD(pages); | 1324 | LIST_HEAD(pages); |
| 1319 | int i, cpu; | 1325 | int i, cpu; |
| 1320 | 1326 | ||
| @@ -1375,16 +1381,24 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
| 1375 | 1381 | ||
| 1376 | for_each_buffer_cpu(buffer, cpu) { | 1382 | for_each_buffer_cpu(buffer, cpu) { |
| 1377 | for (i = 0; i < new_pages; i++) { | 1383 | for (i = 0; i < new_pages; i++) { |
| 1384 | struct page *page; | ||
| 1385 | /* | ||
| 1386 | * The __GFP_NORETRY flag makes sure that the allocation | ||
| 1387 | * fails gracefully without invoking the OOM killer, so | ||
| 1388 | * the system is not destabilized. | ||
| 1389 | */ | ||
| 1378 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), | 1390 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), |
| 1379 | cache_line_size()), | 1391 | cache_line_size()), |
| 1380 | GFP_KERNEL, cpu_to_node(cpu)); | 1392 | GFP_KERNEL | __GFP_NORETRY, |
| 1393 | cpu_to_node(cpu)); | ||
| 1381 | if (!bpage) | 1394 | if (!bpage) |
| 1382 | goto free_pages; | 1395 | goto free_pages; |
| 1383 | list_add(&bpage->list, &pages); | 1396 | list_add(&bpage->list, &pages); |
| 1384 | addr = __get_free_page(GFP_KERNEL); | 1397 | page = alloc_pages_node(cpu_to_node(cpu), |
| 1385 | if (!addr) | 1398 | GFP_KERNEL | __GFP_NORETRY, 0); |
| 1399 | if (!page) | ||
| 1386 | goto free_pages; | 1400 | goto free_pages; |
| 1387 | bpage->page = (void *)addr; | 1401 | bpage->page = page_address(page); |
| 1388 | rb_init_page(bpage->page); | 1402 | rb_init_page(bpage->page); |
| 1389 | } | 1403 | } |
| 1390 | } | 1404 | } |
| @@ -2216,7 +2230,7 @@ static noinline void trace_recursive_fail(void) | |||
| 2216 | 2230 | ||
| 2217 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" | 2231 | printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" |
| 2218 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", | 2232 | "HC[%lu]:SC[%lu]:NMI[%lu]\n", |
| 2219 | current->trace_recursion, | 2233 | trace_recursion_buffer(), |
| 2220 | hardirq_count() >> HARDIRQ_SHIFT, | 2234 | hardirq_count() >> HARDIRQ_SHIFT, |
| 2221 | softirq_count() >> SOFTIRQ_SHIFT, | 2235 | softirq_count() >> SOFTIRQ_SHIFT, |
| 2222 | in_nmi()); | 2236 | in_nmi()); |
| @@ -2226,9 +2240,9 @@ static noinline void trace_recursive_fail(void) | |||
| 2226 | 2240 | ||
| 2227 | static inline int trace_recursive_lock(void) | 2241 | static inline int trace_recursive_lock(void) |
| 2228 | { | 2242 | { |
| 2229 | current->trace_recursion++; | 2243 | trace_recursion_inc(); |
| 2230 | 2244 | ||
| 2231 | if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) | 2245 | if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) |
| 2232 | return 0; | 2246 | return 0; |
| 2233 | 2247 | ||
| 2234 | trace_recursive_fail(); | 2248 | trace_recursive_fail(); |
| @@ -2238,9 +2252,9 @@ static inline int trace_recursive_lock(void) | |||
| 2238 | 2252 | ||
| 2239 | static inline void trace_recursive_unlock(void) | 2253 | static inline void trace_recursive_unlock(void) |
| 2240 | { | 2254 | { |
| 2241 | WARN_ON_ONCE(!current->trace_recursion); | 2255 | WARN_ON_ONCE(!trace_recursion_buffer()); |
| 2242 | 2256 | ||
| 2243 | current->trace_recursion--; | 2257 | trace_recursion_dec(); |
| 2244 | } | 2258 | } |
| 2245 | 2259 | ||
| 2246 | #else | 2260 | #else |
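The open-coded current->trace_recursion++/-- becomes trace_recursion_inc()/dec()/buffer() because the same per-task word now also carries flag bits (defined in the trace.h hunk near the end of this patch), so the ring-buffer depth has to be read through a mask. A hedged illustration of why the wrappers matter:

    trace_recursion_set(TRACE_INTERNAL_BIT);        /* flag bit 11 */
    trace_recursion_inc();                          /* depth: 0 -> 1 */

    /* trace_recursion_buffer() reads only the low 10 bits, so it still
     * reports 1 here; the flag above does not leak into the depth count */

    trace_recursion_dec();
    trace_recursion_clear(TRACE_INTERNAL_BIT);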
| @@ -3730,16 +3744,17 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); | |||
| 3730 | * Returns: | 3744 | * Returns: |
| 3731 | * The page allocated, or NULL on error. | 3745 | * The page allocated, or NULL on error. |
| 3732 | */ | 3746 | */ |
| 3733 | void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) | 3747 | void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) |
| 3734 | { | 3748 | { |
| 3735 | struct buffer_data_page *bpage; | 3749 | struct buffer_data_page *bpage; |
| 3736 | unsigned long addr; | 3750 | struct page *page; |
| 3737 | 3751 | ||
| 3738 | addr = __get_free_page(GFP_KERNEL); | 3752 | page = alloc_pages_node(cpu_to_node(cpu), |
| 3739 | if (!addr) | 3753 | GFP_KERNEL | __GFP_NORETRY, 0); |
| 3754 | if (!page) | ||
| 3740 | return NULL; | 3755 | return NULL; |
| 3741 | 3756 | ||
| 3742 | bpage = (void *)addr; | 3757 | bpage = page_address(page); |
| 3743 | 3758 | ||
| 3744 | rb_init_page(bpage); | 3759 | rb_init_page(bpage); |
| 3745 | 3760 | ||
| @@ -3978,20 +3993,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf, | |||
| 3978 | size_t cnt, loff_t *ppos) | 3993 | size_t cnt, loff_t *ppos) |
| 3979 | { | 3994 | { |
| 3980 | unsigned long *p = filp->private_data; | 3995 | unsigned long *p = filp->private_data; |
| 3981 | char buf[64]; | ||
| 3982 | unsigned long val; | 3996 | unsigned long val; |
| 3983 | int ret; | 3997 | int ret; |
| 3984 | 3998 | ||
| 3985 | if (cnt >= sizeof(buf)) | 3999 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 3986 | return -EINVAL; | 4000 | if (ret) |
| 3987 | |||
| 3988 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 3989 | return -EFAULT; | ||
| 3990 | |||
| 3991 | buf[cnt] = 0; | ||
| 3992 | |||
| 3993 | ret = strict_strtoul(buf, 10, &val); | ||
| 3994 | if (ret < 0) | ||
| 3995 | return ret; | 4001 | return ret; |
| 3996 | 4002 | ||
| 3997 | if (val) | 4003 | if (val) |
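rb_simple_write() is the first of many handlers in this patch converted to kstrtoul_from_user(), which parses the user buffer directly and drops the 64-byte stack buffer, the length check and the copy_from_user()/strict_strtoul() pair. The shape every converted handler now follows, as a hedged sketch with hypothetical names:

    #include <linux/kernel.h>
    #include <linux/fs.h>

    static unsigned long my_flag;

    static ssize_t my_write(struct file *filp, const char __user *ubuf,
                            size_t cnt, loff_t *ppos)
    {
            unsigned long val;
            int ret;

            ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
            if (ret)
                    return ret;

            my_flag = !!val;
            *ppos += cnt;

            return cnt;
    }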
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 302f8a614635..a5457d577b98 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -106,7 +106,7 @@ static enum event_status read_page(int cpu) | |||
| 106 | int inc; | 106 | int inc; |
| 107 | int i; | 107 | int i; |
| 108 | 108 | ||
| 109 | bpage = ring_buffer_alloc_read_page(buffer); | 109 | bpage = ring_buffer_alloc_read_page(buffer, cpu); |
| 110 | if (!bpage) | 110 | if (!bpage) |
| 111 | return EVENT_DROPPED; | 111 | return EVENT_DROPPED; |
| 112 | 112 | ||
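With ring_buffer_alloc_read_page() taking a cpu argument, the spare page a reader hands back to the buffer comes from the same NUMA node as the per-CPU buffer it will be swapped into; the benchmark above is the simplest caller to update. A hedged round-trip sketch, assuming the read-page prototype of this era (buffer, &data, len, cpu, full):

    void *data;
    int ret;

    data = ring_buffer_alloc_read_page(buffer, cpu);
    if (!data)
            return -ENOMEM;

    ret = ring_buffer_read_page(buffer, &data, PAGE_SIZE, cpu, 0);
    if (ret >= 0) {
            /* ... consume the events now sitting in data ... */
    }

    ring_buffer_free_read_page(buffer, data);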
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1cb49be7c7fb..e5df02c69b1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -343,26 +343,27 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | |||
| 343 | static int trace_stop_count; | 343 | static int trace_stop_count; |
| 344 | static DEFINE_SPINLOCK(tracing_start_lock); | 344 | static DEFINE_SPINLOCK(tracing_start_lock); |
| 345 | 345 | ||
| 346 | static void wakeup_work_handler(struct work_struct *work) | ||
| 347 | { | ||
| 348 | wake_up(&trace_wait); | ||
| 349 | } | ||
| 350 | |||
| 351 | static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); | ||
| 352 | |||
| 346 | /** | 353 | /** |
| 347 | * trace_wake_up - wake up tasks waiting for trace input | 354 | * trace_wake_up - wake up tasks waiting for trace input |
| 348 | * | 355 | * |
| 349 | * Simply wakes up any task that is blocked on the trace_wait | 356 | * Schedules a delayed work to wake up any task that is blocked on the |
| 350 | * queue. This is used with trace_poll for tasks polling the trace. | 357 | * trace_wait queue. This is used with trace_poll for tasks polling the |
| 358 | * trace. | ||
| 351 | */ | 359 | */ |
| 352 | void trace_wake_up(void) | 360 | void trace_wake_up(void) |
| 353 | { | 361 | { |
| 354 | int cpu; | 362 | const unsigned long delay = msecs_to_jiffies(2); |
| 355 | 363 | ||
| 356 | if (trace_flags & TRACE_ITER_BLOCK) | 364 | if (trace_flags & TRACE_ITER_BLOCK) |
| 357 | return; | 365 | return; |
| 358 | /* | 366 | schedule_delayed_work(&wakeup_work, delay); |
| 359 | * The runqueue_is_locked() can fail, but this is the best we | ||
| 360 | * have for now: | ||
| 361 | */ | ||
| 362 | cpu = get_cpu(); | ||
| 363 | if (!runqueue_is_locked(cpu)) | ||
| 364 | wake_up(&trace_wait); | ||
| 365 | put_cpu(); | ||
| 366 | } | 367 | } |
| 367 | 368 | ||
| 368 | static int __init set_buf_size(char *str) | 369 | static int __init set_buf_size(char *str) |
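trace_wake_up() no longer pokes the scheduler from tracing context at all: it schedules a delayed work item, so the wake_up() happens from a workqueue roughly 2 ms later and the old runqueue_is_locked() heuristic can go away. The same deferral trick in isolation, as a hedged sketch with hypothetical names:

    #include <linux/workqueue.h>
    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_waitq);

    static void my_wakeup(struct work_struct *work)
    {
            wake_up(&my_waitq);
    }
    static DECLARE_DELAYED_WORK(my_wakeup_work, my_wakeup);

    /* caller side: defer the wakeup instead of doing it inline */
    schedule_delayed_work(&my_wakeup_work, msecs_to_jiffies(2));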
| @@ -424,6 +425,7 @@ static const char *trace_options[] = { | |||
| 424 | "graph-time", | 425 | "graph-time", |
| 425 | "record-cmd", | 426 | "record-cmd", |
| 426 | "overwrite", | 427 | "overwrite", |
| 428 | "disable_on_free", | ||
| 427 | NULL | 429 | NULL |
| 428 | }; | 430 | }; |
| 429 | 431 | ||
| @@ -1191,6 +1193,18 @@ void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, | |||
| 1191 | } | 1193 | } |
| 1192 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); | 1194 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); |
| 1193 | 1195 | ||
| 1196 | void trace_nowake_buffer_unlock_commit_regs(struct ring_buffer *buffer, | ||
| 1197 | struct ring_buffer_event *event, | ||
| 1198 | unsigned long flags, int pc, | ||
| 1199 | struct pt_regs *regs) | ||
| 1200 | { | ||
| 1201 | ring_buffer_unlock_commit(buffer, event); | ||
| 1202 | |||
| 1203 | ftrace_trace_stack_regs(buffer, flags, 0, pc, regs); | ||
| 1204 | ftrace_trace_userstack(buffer, flags, pc); | ||
| 1205 | } | ||
| 1206 | EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit_regs); | ||
| 1207 | |||
| 1194 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, | 1208 | void trace_current_buffer_discard_commit(struct ring_buffer *buffer, |
| 1195 | struct ring_buffer_event *event) | 1209 | struct ring_buffer_event *event) |
| 1196 | { | 1210 | { |
| @@ -1234,30 +1248,103 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, | |||
| 1234 | } | 1248 | } |
| 1235 | 1249 | ||
| 1236 | #ifdef CONFIG_STACKTRACE | 1250 | #ifdef CONFIG_STACKTRACE |
| 1251 | |||
| 1252 | #define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) | ||
| 1253 | struct ftrace_stack { | ||
| 1254 | unsigned long calls[FTRACE_STACK_MAX_ENTRIES]; | ||
| 1255 | }; | ||
| 1256 | |||
| 1257 | static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); | ||
| 1258 | static DEFINE_PER_CPU(int, ftrace_stack_reserve); | ||
| 1259 | |||
| 1237 | static void __ftrace_trace_stack(struct ring_buffer *buffer, | 1260 | static void __ftrace_trace_stack(struct ring_buffer *buffer, |
| 1238 | unsigned long flags, | 1261 | unsigned long flags, |
| 1239 | int skip, int pc) | 1262 | int skip, int pc, struct pt_regs *regs) |
| 1240 | { | 1263 | { |
| 1241 | struct ftrace_event_call *call = &event_kernel_stack; | 1264 | struct ftrace_event_call *call = &event_kernel_stack; |
| 1242 | struct ring_buffer_event *event; | 1265 | struct ring_buffer_event *event; |
| 1243 | struct stack_entry *entry; | 1266 | struct stack_entry *entry; |
| 1244 | struct stack_trace trace; | 1267 | struct stack_trace trace; |
| 1268 | int use_stack; | ||
| 1269 | int size = FTRACE_STACK_ENTRIES; | ||
| 1270 | |||
| 1271 | trace.nr_entries = 0; | ||
| 1272 | trace.skip = skip; | ||
| 1273 | |||
| 1274 | /* | ||
| 1275 | * Since events can happen in NMIs there's no safe way to | ||
| 1276 | * use the per cpu ftrace_stacks. We reserve it and if an interrupt | ||
| 1277 | * or NMI comes in, it will just have to use the default | ||
| 1278 | * FTRACE_STACK_SIZE. | ||
| 1279 | */ | ||
| 1280 | preempt_disable_notrace(); | ||
| 1281 | |||
| 1282 | use_stack = ++__get_cpu_var(ftrace_stack_reserve); | ||
| 1283 | /* | ||
| 1284 | * We don't need any atomic variables, just a barrier. | ||
| 1285 | * If an interrupt comes in, we don't care, because it would | ||
| 1286 | * have exited and put the counter back to what we want. | ||
| 1287 | * We just need a barrier to keep gcc from moving things | ||
| 1288 | * around. | ||
| 1289 | */ | ||
| 1290 | barrier(); | ||
| 1291 | if (use_stack == 1) { | ||
| 1292 | trace.entries = &__get_cpu_var(ftrace_stack).calls[0]; | ||
| 1293 | trace.max_entries = FTRACE_STACK_MAX_ENTRIES; | ||
| 1294 | |||
| 1295 | if (regs) | ||
| 1296 | save_stack_trace_regs(regs, &trace); | ||
| 1297 | else | ||
| 1298 | save_stack_trace(&trace); | ||
| 1299 | |||
| 1300 | if (trace.nr_entries > size) | ||
| 1301 | size = trace.nr_entries; | ||
| 1302 | } else | ||
| 1303 | /* From now on, use_stack is a boolean */ | ||
| 1304 | use_stack = 0; | ||
| 1305 | |||
| 1306 | size *= sizeof(unsigned long); | ||
| 1245 | 1307 | ||
| 1246 | event = trace_buffer_lock_reserve(buffer, TRACE_STACK, | 1308 | event = trace_buffer_lock_reserve(buffer, TRACE_STACK, |
| 1247 | sizeof(*entry), flags, pc); | 1309 | sizeof(*entry) + size, flags, pc); |
| 1248 | if (!event) | 1310 | if (!event) |
| 1249 | return; | 1311 | goto out; |
| 1250 | entry = ring_buffer_event_data(event); | 1312 | entry = ring_buffer_event_data(event); |
| 1251 | memset(&entry->caller, 0, sizeof(entry->caller)); | ||
| 1252 | 1313 | ||
| 1253 | trace.nr_entries = 0; | 1314 | memset(&entry->caller, 0, size); |
| 1254 | trace.max_entries = FTRACE_STACK_ENTRIES; | 1315 | |
| 1255 | trace.skip = skip; | 1316 | if (use_stack) |
| 1256 | trace.entries = entry->caller; | 1317 | memcpy(&entry->caller, trace.entries, |
| 1318 | trace.nr_entries * sizeof(unsigned long)); | ||
| 1319 | else { | ||
| 1320 | trace.max_entries = FTRACE_STACK_ENTRIES; | ||
| 1321 | trace.entries = entry->caller; | ||
| 1322 | if (regs) | ||
| 1323 | save_stack_trace_regs(regs, &trace); | ||
| 1324 | else | ||
| 1325 | save_stack_trace(&trace); | ||
| 1326 | } | ||
| 1327 | |||
| 1328 | entry->size = trace.nr_entries; | ||
| 1257 | 1329 | ||
| 1258 | save_stack_trace(&trace); | ||
| 1259 | if (!filter_check_discard(call, entry, buffer, event)) | 1330 | if (!filter_check_discard(call, entry, buffer, event)) |
| 1260 | ring_buffer_unlock_commit(buffer, event); | 1331 | ring_buffer_unlock_commit(buffer, event); |
| 1332 | |||
| 1333 | out: | ||
| 1334 | /* Again, don't let gcc optimize things here */ | ||
| 1335 | barrier(); | ||
| 1336 | __get_cpu_var(ftrace_stack_reserve)--; | ||
| 1337 | preempt_enable_notrace(); | ||
| 1338 | |||
| 1339 | } | ||
| 1340 | |||
| 1341 | void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, | ||
| 1342 | int skip, int pc, struct pt_regs *regs) | ||
| 1343 | { | ||
| 1344 | if (!(trace_flags & TRACE_ITER_STACKTRACE)) | ||
| 1345 | return; | ||
| 1346 | |||
| 1347 | __ftrace_trace_stack(buffer, flags, skip, pc, regs); | ||
| 1261 | } | 1348 | } |
| 1262 | 1349 | ||
| 1263 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | 1350 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, |
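__ftrace_trace_stack() can now capture much deeper stacks by borrowing a page-sized per-CPU scratch array; a reserve counter plus a compiler barrier is enough because only an interrupt or NMI on the same CPU can nest, and it always restores the counter before returning. The scheme reduced to its shape, as a hedged sketch (struct my_scratch and the names are hypothetical; __get_cpu_var() matches the accessor used in this era):

    struct my_scratch {
            unsigned long slot[128];
    };
    static DEFINE_PER_CPU(struct my_scratch, my_scratch_buf);
    static DEFINE_PER_CPU(int, my_scratch_reserve);

    int use;

    preempt_disable_notrace();
    use = ++__get_cpu_var(my_scratch_reserve);
    barrier();              /* compiler barrier only; see the comment above */
    if (use == 1) {
            /* first user on this CPU: the large per-CPU array is ours */
    } else {
            /* nested (irq/NMI) user: fall back to a small local buffer */
    }

    /* ... save and copy the stack trace ... */

    barrier();
    __get_cpu_var(my_scratch_reserve)--;
    preempt_enable_notrace();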
| @@ -1266,13 +1353,13 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | |||
| 1266 | if (!(trace_flags & TRACE_ITER_STACKTRACE)) | 1353 | if (!(trace_flags & TRACE_ITER_STACKTRACE)) |
| 1267 | return; | 1354 | return; |
| 1268 | 1355 | ||
| 1269 | __ftrace_trace_stack(buffer, flags, skip, pc); | 1356 | __ftrace_trace_stack(buffer, flags, skip, pc, NULL); |
| 1270 | } | 1357 | } |
| 1271 | 1358 | ||
| 1272 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, | 1359 | void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, |
| 1273 | int pc) | 1360 | int pc) |
| 1274 | { | 1361 | { |
| 1275 | __ftrace_trace_stack(tr->buffer, flags, skip, pc); | 1362 | __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); |
| 1276 | } | 1363 | } |
| 1277 | 1364 | ||
| 1278 | /** | 1365 | /** |
| @@ -1288,7 +1375,7 @@ void trace_dump_stack(void) | |||
| 1288 | local_save_flags(flags); | 1375 | local_save_flags(flags); |
| 1289 | 1376 | ||
| 1290 | /* skipping 3 traces, seems to get us at the caller of this function */ | 1377 | /* skipping 3 traces, seems to get us at the caller of this function */ |
| 1291 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); | 1378 | __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); |
| 1292 | } | 1379 | } |
| 1293 | 1380 | ||
| 1294 | static DEFINE_PER_CPU(int, user_stack_count); | 1381 | static DEFINE_PER_CPU(int, user_stack_count); |
| @@ -1536,7 +1623,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, | |||
| 1536 | 1623 | ||
| 1537 | ftrace_enable_cpu(); | 1624 | ftrace_enable_cpu(); |
| 1538 | 1625 | ||
| 1539 | return event ? ring_buffer_event_data(event) : NULL; | 1626 | if (event) { |
| 1627 | iter->ent_size = ring_buffer_event_length(event); | ||
| 1628 | return ring_buffer_event_data(event); | ||
| 1629 | } | ||
| 1630 | iter->ent_size = 0; | ||
| 1631 | return NULL; | ||
| 1540 | } | 1632 | } |
| 1541 | 1633 | ||
| 1542 | static struct trace_entry * | 1634 | static struct trace_entry * |
| @@ -2014,9 +2106,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) | |||
| 2014 | { | 2106 | { |
| 2015 | enum print_line_t ret; | 2107 | enum print_line_t ret; |
| 2016 | 2108 | ||
| 2017 | if (iter->lost_events) | 2109 | if (iter->lost_events && |
| 2018 | trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", | 2110 | !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", |
| 2019 | iter->cpu, iter->lost_events); | 2111 | iter->cpu, iter->lost_events)) |
| 2112 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 2020 | 2113 | ||
| 2021 | if (iter->trace && iter->trace->print_line) { | 2114 | if (iter->trace && iter->trace->print_line) { |
| 2022 | ret = iter->trace->print_line(iter); | 2115 | ret = iter->trace->print_line(iter); |
| @@ -2050,6 +2143,9 @@ void trace_default_header(struct seq_file *m) | |||
| 2050 | { | 2143 | { |
| 2051 | struct trace_iterator *iter = m->private; | 2144 | struct trace_iterator *iter = m->private; |
| 2052 | 2145 | ||
| 2146 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
| 2147 | return; | ||
| 2148 | |||
| 2053 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) { | 2149 | if (iter->iter_flags & TRACE_FILE_LAT_FMT) { |
| 2054 | /* print nothing if the buffers are empty */ | 2150 | /* print nothing if the buffers are empty */ |
| 2055 | if (trace_empty(iter)) | 2151 | if (trace_empty(iter)) |
| @@ -2700,20 +2796,11 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
| 2700 | size_t cnt, loff_t *ppos) | 2796 | size_t cnt, loff_t *ppos) |
| 2701 | { | 2797 | { |
| 2702 | struct trace_array *tr = filp->private_data; | 2798 | struct trace_array *tr = filp->private_data; |
| 2703 | char buf[64]; | ||
| 2704 | unsigned long val; | 2799 | unsigned long val; |
| 2705 | int ret; | 2800 | int ret; |
| 2706 | 2801 | ||
| 2707 | if (cnt >= sizeof(buf)) | 2802 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 2708 | return -EINVAL; | 2803 | if (ret) |
| 2709 | |||
| 2710 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 2711 | return -EFAULT; | ||
| 2712 | |||
| 2713 | buf[cnt] = 0; | ||
| 2714 | |||
| 2715 | ret = strict_strtoul(buf, 10, &val); | ||
| 2716 | if (ret < 0) | ||
| 2717 | return ret; | 2804 | return ret; |
| 2718 | 2805 | ||
| 2719 | val = !!val; | 2806 | val = !!val; |
| @@ -2766,7 +2853,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) | |||
| 2766 | return t->init(tr); | 2853 | return t->init(tr); |
| 2767 | } | 2854 | } |
| 2768 | 2855 | ||
| 2769 | static int tracing_resize_ring_buffer(unsigned long size) | 2856 | static int __tracing_resize_ring_buffer(unsigned long size) |
| 2770 | { | 2857 | { |
| 2771 | int ret; | 2858 | int ret; |
| 2772 | 2859 | ||
| @@ -2818,6 +2905,41 @@ static int tracing_resize_ring_buffer(unsigned long size) | |||
| 2818 | return ret; | 2905 | return ret; |
| 2819 | } | 2906 | } |
| 2820 | 2907 | ||
| 2908 | static ssize_t tracing_resize_ring_buffer(unsigned long size) | ||
| 2909 | { | ||
| 2910 | int cpu, ret = size; | ||
| 2911 | |||
| 2912 | mutex_lock(&trace_types_lock); | ||
| 2913 | |||
| 2914 | tracing_stop(); | ||
| 2915 | |||
| 2916 | /* disable all cpu buffers */ | ||
| 2917 | for_each_tracing_cpu(cpu) { | ||
| 2918 | if (global_trace.data[cpu]) | ||
| 2919 | atomic_inc(&global_trace.data[cpu]->disabled); | ||
| 2920 | if (max_tr.data[cpu]) | ||
| 2921 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
| 2922 | } | ||
| 2923 | |||
| 2924 | if (size != global_trace.entries) | ||
| 2925 | ret = __tracing_resize_ring_buffer(size); | ||
| 2926 | |||
| 2927 | if (ret < 0) | ||
| 2928 | ret = -ENOMEM; | ||
| 2929 | |||
| 2930 | for_each_tracing_cpu(cpu) { | ||
| 2931 | if (global_trace.data[cpu]) | ||
| 2932 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
| 2933 | if (max_tr.data[cpu]) | ||
| 2934 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
| 2935 | } | ||
| 2936 | |||
| 2937 | tracing_start(); | ||
| 2938 | mutex_unlock(&trace_types_lock); | ||
| 2939 | |||
| 2940 | return ret; | ||
| 2941 | } | ||
| 2942 | |||
| 2821 | 2943 | ||
| 2822 | /** | 2944 | /** |
| 2823 | * tracing_update_buffers - used by tracing facility to expand ring buffers | 2945 | * tracing_update_buffers - used by tracing facility to expand ring buffers |
| @@ -2835,7 +2957,7 @@ int tracing_update_buffers(void) | |||
| 2835 | 2957 | ||
| 2836 | mutex_lock(&trace_types_lock); | 2958 | mutex_lock(&trace_types_lock); |
| 2837 | if (!ring_buffer_expanded) | 2959 | if (!ring_buffer_expanded) |
| 2838 | ret = tracing_resize_ring_buffer(trace_buf_size); | 2960 | ret = __tracing_resize_ring_buffer(trace_buf_size); |
| 2839 | mutex_unlock(&trace_types_lock); | 2961 | mutex_unlock(&trace_types_lock); |
| 2840 | 2962 | ||
| 2841 | return ret; | 2963 | return ret; |
| @@ -2859,7 +2981,7 @@ static int tracing_set_tracer(const char *buf) | |||
| 2859 | mutex_lock(&trace_types_lock); | 2981 | mutex_lock(&trace_types_lock); |
| 2860 | 2982 | ||
| 2861 | if (!ring_buffer_expanded) { | 2983 | if (!ring_buffer_expanded) { |
| 2862 | ret = tracing_resize_ring_buffer(trace_buf_size); | 2984 | ret = __tracing_resize_ring_buffer(trace_buf_size); |
| 2863 | if (ret < 0) | 2985 | if (ret < 0) |
| 2864 | goto out; | 2986 | goto out; |
| 2865 | ret = 0; | 2987 | ret = 0; |
| @@ -2965,20 +3087,11 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, | |||
| 2965 | size_t cnt, loff_t *ppos) | 3087 | size_t cnt, loff_t *ppos) |
| 2966 | { | 3088 | { |
| 2967 | unsigned long *ptr = filp->private_data; | 3089 | unsigned long *ptr = filp->private_data; |
| 2968 | char buf[64]; | ||
| 2969 | unsigned long val; | 3090 | unsigned long val; |
| 2970 | int ret; | 3091 | int ret; |
| 2971 | 3092 | ||
| 2972 | if (cnt >= sizeof(buf)) | 3093 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 2973 | return -EINVAL; | 3094 | if (ret) |
| 2974 | |||
| 2975 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 2976 | return -EFAULT; | ||
| 2977 | |||
| 2978 | buf[cnt] = 0; | ||
| 2979 | |||
| 2980 | ret = strict_strtoul(buf, 10, &val); | ||
| 2981 | if (ret < 0) | ||
| 2982 | return ret; | 3095 | return ret; |
| 2983 | 3096 | ||
| 2984 | *ptr = val * 1000; | 3097 | *ptr = val * 1000; |
| @@ -3230,6 +3343,14 @@ waitagain: | |||
| 3230 | 3343 | ||
| 3231 | if (iter->seq.len >= cnt) | 3344 | if (iter->seq.len >= cnt) |
| 3232 | break; | 3345 | break; |
| 3346 | |||
| 3347 | /* | ||
| 3348 | * Setting the full flag means we reached the trace_seq buffer | ||
| 3349 | * size and we should have left via the partial output condition above. | ||
| 3350 | * One of the trace_seq_* functions is not being used properly. | ||
| 3351 | */ | ||
| 3352 | WARN_ONCE(iter->seq.full, "full flag set for trace type %d", | ||
| 3353 | iter->ent->type); | ||
| 3233 | } | 3354 | } |
| 3234 | trace_access_unlock(iter->cpu_file); | 3355 | trace_access_unlock(iter->cpu_file); |
| 3235 | trace_event_read_unlock(); | 3356 | trace_event_read_unlock(); |
| @@ -3425,67 +3546,54 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
| 3425 | size_t cnt, loff_t *ppos) | 3546 | size_t cnt, loff_t *ppos) |
| 3426 | { | 3547 | { |
| 3427 | unsigned long val; | 3548 | unsigned long val; |
| 3428 | char buf[64]; | 3549 | int ret; |
| 3429 | int ret, cpu; | ||
| 3430 | |||
| 3431 | if (cnt >= sizeof(buf)) | ||
| 3432 | return -EINVAL; | ||
| 3433 | |||
| 3434 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 3435 | return -EFAULT; | ||
| 3436 | |||
| 3437 | buf[cnt] = 0; | ||
| 3438 | 3550 | ||
| 3439 | ret = strict_strtoul(buf, 10, &val); | 3551 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 3440 | if (ret < 0) | 3552 | if (ret) |
| 3441 | return ret; | 3553 | return ret; |
| 3442 | 3554 | ||
| 3443 | /* must have at least 1 entry */ | 3555 | /* must have at least 1 entry */ |
| 3444 | if (!val) | 3556 | if (!val) |
| 3445 | return -EINVAL; | 3557 | return -EINVAL; |
| 3446 | 3558 | ||
| 3447 | mutex_lock(&trace_types_lock); | ||
| 3448 | |||
| 3449 | tracing_stop(); | ||
| 3450 | |||
| 3451 | /* disable all cpu buffers */ | ||
| 3452 | for_each_tracing_cpu(cpu) { | ||
| 3453 | if (global_trace.data[cpu]) | ||
| 3454 | atomic_inc(&global_trace.data[cpu]->disabled); | ||
| 3455 | if (max_tr.data[cpu]) | ||
| 3456 | atomic_inc(&max_tr.data[cpu]->disabled); | ||
| 3457 | } | ||
| 3458 | |||
| 3459 | /* value is in KB */ | 3559 | /* value is in KB */ |
| 3460 | val <<= 10; | 3560 | val <<= 10; |
| 3461 | 3561 | ||
| 3462 | if (val != global_trace.entries) { | 3562 | ret = tracing_resize_ring_buffer(val); |
| 3463 | ret = tracing_resize_ring_buffer(val); | 3563 | if (ret < 0) |
| 3464 | if (ret < 0) { | 3564 | return ret; |
| 3465 | cnt = ret; | ||
| 3466 | goto out; | ||
| 3467 | } | ||
| 3468 | } | ||
| 3469 | 3565 | ||
| 3470 | *ppos += cnt; | 3566 | *ppos += cnt; |
| 3471 | 3567 | ||
| 3472 | /* If check pages failed, return ENOMEM */ | 3568 | return cnt; |
| 3473 | if (tracing_disabled) | 3569 | } |
| 3474 | cnt = -ENOMEM; | ||
| 3475 | out: | ||
| 3476 | for_each_tracing_cpu(cpu) { | ||
| 3477 | if (global_trace.data[cpu]) | ||
| 3478 | atomic_dec(&global_trace.data[cpu]->disabled); | ||
| 3479 | if (max_tr.data[cpu]) | ||
| 3480 | atomic_dec(&max_tr.data[cpu]->disabled); | ||
| 3481 | } | ||
| 3482 | 3570 | ||
| 3483 | tracing_start(); | 3571 | static ssize_t |
| 3484 | mutex_unlock(&trace_types_lock); | 3572 | tracing_free_buffer_write(struct file *filp, const char __user *ubuf, |
| 3573 | size_t cnt, loff_t *ppos) | ||
| 3574 | { | ||
| 3575 | /* | ||
| 3576 | * There is no need to read what the user has written; this function | ||
| 3577 | * just makes sure that there is no error when "echo" is used | ||
| 3578 | */ | ||
| 3579 | |||
| 3580 | *ppos += cnt; | ||
| 3485 | 3581 | ||
| 3486 | return cnt; | 3582 | return cnt; |
| 3487 | } | 3583 | } |
| 3488 | 3584 | ||
| 3585 | static int | ||
| 3586 | tracing_free_buffer_release(struct inode *inode, struct file *filp) | ||
| 3587 | { | ||
| 3588 | /* disable tracing ? */ | ||
| 3589 | if (trace_flags & TRACE_ITER_STOP_ON_FREE) | ||
| 3590 | tracing_off(); | ||
| 3591 | /* resize the ring buffer to 0 */ | ||
| 3592 | tracing_resize_ring_buffer(0); | ||
| 3593 | |||
| 3594 | return 0; | ||
| 3595 | } | ||
| 3596 | |||
| 3489 | static int mark_printk(const char *fmt, ...) | 3597 | static int mark_printk(const char *fmt, ...) |
| 3490 | { | 3598 | { |
| 3491 | int ret; | 3599 | int ret; |
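buffer_size_kb writes now funnel through the new tracing_resize_ring_buffer() wrapper, and the new free_buffer file turns the same path into a watchdog: writes are accepted but ignored, and when the file is released the ring buffer is resized to zero (and tracing is switched off if the new disable_on_free option is set). A hedged userspace sketch of the intended use, assuming debugfs is mounted at /sys/kernel/debug:

    #include <fcntl.h>
    #include <unistd.h>

    int fd = open("/sys/kernel/debug/tracing/free_buffer", O_WRONLY);

    /* ... run the monitored workload; if this process dies, the kernel
     * releases the file and the ring buffer is freed automatically ... */

    close(fd);      /* explicit release: buffer resized to 0 here */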
| @@ -3631,6 +3739,11 @@ static const struct file_operations tracing_entries_fops = { | |||
| 3631 | .llseek = generic_file_llseek, | 3739 | .llseek = generic_file_llseek, |
| 3632 | }; | 3740 | }; |
| 3633 | 3741 | ||
| 3742 | static const struct file_operations tracing_free_buffer_fops = { | ||
| 3743 | .write = tracing_free_buffer_write, | ||
| 3744 | .release = tracing_free_buffer_release, | ||
| 3745 | }; | ||
| 3746 | |||
| 3634 | static const struct file_operations tracing_mark_fops = { | 3747 | static const struct file_operations tracing_mark_fops = { |
| 3635 | .open = tracing_open_generic, | 3748 | .open = tracing_open_generic, |
| 3636 | .write = tracing_mark_write, | 3749 | .write = tracing_mark_write, |
| @@ -3687,7 +3800,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, | |||
| 3687 | return 0; | 3800 | return 0; |
| 3688 | 3801 | ||
| 3689 | if (!info->spare) | 3802 | if (!info->spare) |
| 3690 | info->spare = ring_buffer_alloc_read_page(info->tr->buffer); | 3803 | info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); |
| 3691 | if (!info->spare) | 3804 | if (!info->spare) |
| 3692 | return -ENOMEM; | 3805 | return -ENOMEM; |
| 3693 | 3806 | ||
| @@ -3844,7 +3957,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 3844 | 3957 | ||
| 3845 | ref->ref = 1; | 3958 | ref->ref = 1; |
| 3846 | ref->buffer = info->tr->buffer; | 3959 | ref->buffer = info->tr->buffer; |
| 3847 | ref->page = ring_buffer_alloc_read_page(ref->buffer); | 3960 | ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); |
| 3848 | if (!ref->page) { | 3961 | if (!ref->page) { |
| 3849 | kfree(ref); | 3962 | kfree(ref); |
| 3850 | break; | 3963 | break; |
| @@ -3853,8 +3966,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, | |||
| 3853 | r = ring_buffer_read_page(ref->buffer, &ref->page, | 3966 | r = ring_buffer_read_page(ref->buffer, &ref->page, |
| 3854 | len, info->cpu, 1); | 3967 | len, info->cpu, 1); |
| 3855 | if (r < 0) { | 3968 | if (r < 0) { |
| 3856 | ring_buffer_free_read_page(ref->buffer, | 3969 | ring_buffer_free_read_page(ref->buffer, ref->page); |
| 3857 | ref->page); | ||
| 3858 | kfree(ref); | 3970 | kfree(ref); |
| 3859 | break; | 3971 | break; |
| 3860 | } | 3972 | } |
| @@ -4090,19 +4202,10 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
| 4090 | { | 4202 | { |
| 4091 | struct trace_option_dentry *topt = filp->private_data; | 4203 | struct trace_option_dentry *topt = filp->private_data; |
| 4092 | unsigned long val; | 4204 | unsigned long val; |
| 4093 | char buf[64]; | ||
| 4094 | int ret; | 4205 | int ret; |
| 4095 | 4206 | ||
| 4096 | if (cnt >= sizeof(buf)) | 4207 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 4097 | return -EINVAL; | 4208 | if (ret) |
| 4098 | |||
| 4099 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 4100 | return -EFAULT; | ||
| 4101 | |||
| 4102 | buf[cnt] = 0; | ||
| 4103 | |||
| 4104 | ret = strict_strtoul(buf, 10, &val); | ||
| 4105 | if (ret < 0) | ||
| 4106 | return ret; | 4209 | return ret; |
| 4107 | 4210 | ||
| 4108 | if (val != 0 && val != 1) | 4211 | if (val != 0 && val != 1) |
| @@ -4150,20 +4253,11 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
| 4150 | loff_t *ppos) | 4253 | loff_t *ppos) |
| 4151 | { | 4254 | { |
| 4152 | long index = (long)filp->private_data; | 4255 | long index = (long)filp->private_data; |
| 4153 | char buf[64]; | ||
| 4154 | unsigned long val; | 4256 | unsigned long val; |
| 4155 | int ret; | 4257 | int ret; |
| 4156 | 4258 | ||
| 4157 | if (cnt >= sizeof(buf)) | 4259 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 4158 | return -EINVAL; | 4260 | if (ret) |
| 4159 | |||
| 4160 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 4161 | return -EFAULT; | ||
| 4162 | |||
| 4163 | buf[cnt] = 0; | ||
| 4164 | |||
| 4165 | ret = strict_strtoul(buf, 10, &val); | ||
| 4166 | if (ret < 0) | ||
| 4167 | return ret; | 4261 | return ret; |
| 4168 | 4262 | ||
| 4169 | if (val != 0 && val != 1) | 4263 | if (val != 0 && val != 1) |
| @@ -4356,6 +4450,9 @@ static __init int tracer_init_debugfs(void) | |||
| 4356 | trace_create_file("buffer_size_kb", 0644, d_tracer, | 4450 | trace_create_file("buffer_size_kb", 0644, d_tracer, |
| 4357 | &global_trace, &tracing_entries_fops); | 4451 | &global_trace, &tracing_entries_fops); |
| 4358 | 4452 | ||
| 4453 | trace_create_file("free_buffer", 0644, d_tracer, | ||
| 4454 | &global_trace, &tracing_free_buffer_fops); | ||
| 4455 | |||
| 4359 | trace_create_file("trace_marker", 0220, d_tracer, | 4456 | trace_create_file("trace_marker", 0220, d_tracer, |
| 4360 | NULL, &tracing_mark_fops); | 4457 | NULL, &tracing_mark_fops); |
| 4361 | 4458 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5e9dfc6286dd..616846bcfee5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -2,7 +2,7 @@ | |||
| 2 | #define _LINUX_KERNEL_TRACE_H | 2 | #define _LINUX_KERNEL_TRACE_H |
| 3 | 3 | ||
| 4 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
| 5 | #include <asm/atomic.h> | 5 | #include <linux/atomic.h> |
| 6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
| 7 | #include <linux/clocksource.h> | 7 | #include <linux/clocksource.h> |
| 8 | #include <linux/ring_buffer.h> | 8 | #include <linux/ring_buffer.h> |
| @@ -278,6 +278,29 @@ struct tracer { | |||
| 278 | }; | 278 | }; |
| 279 | 279 | ||
| 280 | 280 | ||
| 281 | /* Only current can touch trace_recursion */ | ||
| 282 | #define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) | ||
| 283 | #define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) | ||
| 284 | |||
| 285 | /* Ring buffer has the 10 LSB bits to count */ | ||
| 286 | #define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) | ||
| 287 | |||
| 288 | /* for function tracing recursion */ | ||
| 289 | #define TRACE_INTERNAL_BIT (1<<11) | ||
| 290 | #define TRACE_GLOBAL_BIT (1<<12) | ||
| 291 | /* | ||
| 292 | * Abuse of the trace_recursion word: | ||
| 293 | * we need a way to maintain state if we are tracing the function | ||
| 294 | * graph in irq, because we want to trace a particular function that | ||
| 295 | * was called in irq context while we have irq tracing off. Since this | ||
| 296 | * can only be modified by current, we can reuse trace_recursion. | ||
| 297 | */ | ||
| 298 | #define TRACE_IRQ_BIT (1<<13) | ||
| 299 | |||
| 300 | #define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) | ||
| 301 | #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) | ||
| 302 | #define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) | ||
| 303 | |||
| 281 | #define TRACE_PIPE_ALL_CPU -1 | 304 | #define TRACE_PIPE_ALL_CPU -1 |
| 282 | 305 | ||
| 283 | int tracer_init(struct tracer *t, struct trace_array *tr); | 306 | int tracer_init(struct tracer *t, struct trace_array *tr); |
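The per-task trace_recursion word is now shared between a counter and several flag bits, which is what the ring_buffer.c and ftrace.c hunks earlier rely on. A hedged summary of the layout as defined above, plus the usual set/test/clear pattern:

    /*
     *   bits 0-9 : ring-buffer recursion depth (trace_recursion_buffer())
     *   bit  11  : TRACE_INTERNAL_BIT - ftrace_ops_list_func() re-entry guard
     *   bit  12  : TRACE_GLOBAL_BIT   - a second, independent re-entry guard
     *   bit  13  : TRACE_IRQ_BIT      - set_graph_function hit from irq context
     */
    if (trace_recursion_test(TRACE_INTERNAL_BIT))
            return;                         /* already inside the guarded path */
    trace_recursion_set(TRACE_INTERNAL_BIT);

    /* ... work that must not recurse into itself ... */

    trace_recursion_clear(TRACE_INTERNAL_BIT);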
| @@ -389,6 +412,9 @@ void update_max_tr_single(struct trace_array *tr, | |||
| 389 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, | 412 | void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, |
| 390 | int skip, int pc); | 413 | int skip, int pc); |
| 391 | 414 | ||
| 415 | void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags, | ||
| 416 | int skip, int pc, struct pt_regs *regs); | ||
| 417 | |||
| 392 | void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, | 418 | void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, |
| 393 | int pc); | 419 | int pc); |
| 394 | 420 | ||
| @@ -400,6 +426,12 @@ static inline void ftrace_trace_stack(struct ring_buffer *buffer, | |||
| 400 | { | 426 | { |
| 401 | } | 427 | } |
| 402 | 428 | ||
| 429 | static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer, | ||
| 430 | unsigned long flags, int skip, | ||
| 431 | int pc, struct pt_regs *regs) | ||
| 432 | { | ||
| 433 | } | ||
| 434 | |||
| 403 | static inline void ftrace_trace_userstack(struct ring_buffer *buffer, | 435 | static inline void ftrace_trace_userstack(struct ring_buffer *buffer, |
| 404 | unsigned long flags, int pc) | 436 | unsigned long flags, int pc) |
| 405 | { | 437 | { |
| @@ -419,6 +451,8 @@ extern void trace_find_cmdline(int pid, char comm[]); | |||
| 419 | extern unsigned long ftrace_update_tot_cnt; | 451 | extern unsigned long ftrace_update_tot_cnt; |
| 420 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 452 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
| 421 | extern int DYN_FTRACE_TEST_NAME(void); | 453 | extern int DYN_FTRACE_TEST_NAME(void); |
| 454 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | ||
| 455 | extern int DYN_FTRACE_TEST_NAME2(void); | ||
| 422 | #endif | 456 | #endif |
| 423 | 457 | ||
| 424 | extern int ring_buffer_expanded; | 458 | extern int ring_buffer_expanded; |
| @@ -505,8 +539,18 @@ static inline int ftrace_graph_addr(unsigned long addr) | |||
| 505 | return 1; | 539 | return 1; |
| 506 | 540 | ||
| 507 | for (i = 0; i < ftrace_graph_count; i++) { | 541 | for (i = 0; i < ftrace_graph_count; i++) { |
| 508 | if (addr == ftrace_graph_funcs[i]) | 542 | if (addr == ftrace_graph_funcs[i]) { |
| 543 | /* | ||
| 544 | * If no irqs are to be traced, but a set_graph_function | ||
| 545 | * is set, and called by an interrupt handler, we still | ||
| 546 | * want to trace it. | ||
| 547 | */ | ||
| 548 | if (in_irq()) | ||
| 549 | trace_recursion_set(TRACE_IRQ_BIT); | ||
| 550 | else | ||
| 551 | trace_recursion_clear(TRACE_IRQ_BIT); | ||
| 509 | return 1; | 552 | return 1; |
| 553 | } | ||
| 510 | } | 554 | } |
| 511 | 555 | ||
| 512 | return 0; | 556 | return 0; |
| @@ -607,6 +651,7 @@ enum trace_iterator_flags { | |||
| 607 | TRACE_ITER_GRAPH_TIME = 0x80000, | 651 | TRACE_ITER_GRAPH_TIME = 0x80000, |
| 608 | TRACE_ITER_RECORD_CMD = 0x100000, | 652 | TRACE_ITER_RECORD_CMD = 0x100000, |
| 609 | TRACE_ITER_OVERWRITE = 0x200000, | 653 | TRACE_ITER_OVERWRITE = 0x200000, |
| 654 | TRACE_ITER_STOP_ON_FREE = 0x400000, | ||
| 610 | }; | 655 | }; |
| 611 | 656 | ||
| 612 | /* | 657 | /* |
| @@ -675,6 +720,7 @@ struct event_subsystem { | |||
| 675 | struct dentry *entry; | 720 | struct dentry *entry; |
| 676 | struct event_filter *filter; | 721 | struct event_filter *filter; |
| 677 | int nr_events; | 722 | int nr_events; |
| 723 | int ref_count; | ||
| 678 | }; | 724 | }; |
| 679 | 725 | ||
| 680 | #define FILTER_PRED_INVALID ((unsigned short)-1) | 726 | #define FILTER_PRED_INVALID ((unsigned short)-1) |
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e32744c84d94..93365907f219 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h | |||
| @@ -161,7 +161,8 @@ FTRACE_ENTRY(kernel_stack, stack_entry, | |||
| 161 | TRACE_STACK, | 161 | TRACE_STACK, |
| 162 | 162 | ||
| 163 | F_STRUCT( | 163 | F_STRUCT( |
| 164 | __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) | 164 | __field( int, size ) |
| 165 | __dynamic_array(unsigned long, caller ) | ||
| 165 | ), | 166 | ), |
| 166 | 167 | ||
| 167 | F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" | 168 | F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" |
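The kernel-stack event drops its fixed FTRACE_STACK_ENTRIES array in favour of an explicit size field plus a dynamic array, which is what lets __ftrace_trace_stack() in trace.c reserve exactly sizeof(*entry) + size bytes per event. A rough, illustrative sketch of the record this generates (not the macro expansion verbatim):

    struct stack_entry {
            struct trace_entry      ent;
            int                     size;           /* entries actually saved */
            unsigned long           caller[];       /* variable length; was a
                                                     * fixed-size array */
    };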
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2fe110341359..581876f9f387 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -244,6 +244,35 @@ static void ftrace_clear_events(void) | |||
| 244 | mutex_unlock(&event_mutex); | 244 | mutex_unlock(&event_mutex); |
| 245 | } | 245 | } |
| 246 | 246 | ||
| 247 | static void __put_system(struct event_subsystem *system) | ||
| 248 | { | ||
| 249 | struct event_filter *filter = system->filter; | ||
| 250 | |||
| 251 | WARN_ON_ONCE(system->ref_count == 0); | ||
| 252 | if (--system->ref_count) | ||
| 253 | return; | ||
| 254 | |||
| 255 | if (filter) { | ||
| 256 | kfree(filter->filter_string); | ||
| 257 | kfree(filter); | ||
| 258 | } | ||
| 259 | kfree(system->name); | ||
| 260 | kfree(system); | ||
| 261 | } | ||
| 262 | |||
| 263 | static void __get_system(struct event_subsystem *system) | ||
| 264 | { | ||
| 265 | WARN_ON_ONCE(system->ref_count == 0); | ||
| 266 | system->ref_count++; | ||
| 267 | } | ||
| 268 | |||
| 269 | static void put_system(struct event_subsystem *system) | ||
| 270 | { | ||
| 271 | mutex_lock(&event_mutex); | ||
| 272 | __put_system(system); | ||
| 273 | mutex_unlock(&event_mutex); | ||
| 274 | } | ||
| 275 | |||
| 247 | /* | 276 | /* |
| 248 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. | 277 | * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. |
| 249 | */ | 278 | */ |
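Subsystem lifetimes become reference counted: the directory creation code holds one reference, and (in the hunks below) every opener of a subsystem's "enable" or "filter" file takes another, so the event_subsystem and its name/filter can no longer be freed while a file descriptor still points at them. The intended pairing, sketched:

    /* open side, e.g. subsystem_open() below */
    mutex_lock(&event_mutex);
    __get_system(system);
    mutex_unlock(&event_mutex);

    /* ... file stays open, system->name remains valid ... */

    /* release side, e.g. subsystem_release() below */
    put_system(system);     /* frees the subsystem when the count hits zero */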
| @@ -486,20 +515,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
| 486 | loff_t *ppos) | 515 | loff_t *ppos) |
| 487 | { | 516 | { |
| 488 | struct ftrace_event_call *call = filp->private_data; | 517 | struct ftrace_event_call *call = filp->private_data; |
| 489 | char buf[64]; | ||
| 490 | unsigned long val; | 518 | unsigned long val; |
| 491 | int ret; | 519 | int ret; |
| 492 | 520 | ||
| 493 | if (cnt >= sizeof(buf)) | 521 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 494 | return -EINVAL; | 522 | if (ret) |
| 495 | |||
| 496 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 497 | return -EFAULT; | ||
| 498 | |||
| 499 | buf[cnt] = 0; | ||
| 500 | |||
| 501 | ret = strict_strtoul(buf, 10, &val); | ||
| 502 | if (ret < 0) | ||
| 503 | return ret; | 523 | return ret; |
| 504 | 524 | ||
| 505 | ret = tracing_update_buffers(); | 525 | ret = tracing_update_buffers(); |
| @@ -528,7 +548,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 528 | loff_t *ppos) | 548 | loff_t *ppos) |
| 529 | { | 549 | { |
| 530 | const char set_to_char[4] = { '?', '0', '1', 'X' }; | 550 | const char set_to_char[4] = { '?', '0', '1', 'X' }; |
| 531 | const char *system = filp->private_data; | 551 | struct event_subsystem *system = filp->private_data; |
| 532 | struct ftrace_event_call *call; | 552 | struct ftrace_event_call *call; |
| 533 | char buf[2]; | 553 | char buf[2]; |
| 534 | int set = 0; | 554 | int set = 0; |
| @@ -539,7 +559,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, | |||
| 539 | if (!call->name || !call->class || !call->class->reg) | 559 | if (!call->name || !call->class || !call->class->reg) |
| 540 | continue; | 560 | continue; |
| 541 | 561 | ||
| 542 | if (system && strcmp(call->class->system, system) != 0) | 562 | if (system && strcmp(call->class->system, system->name) != 0) |
| 543 | continue; | 563 | continue; |
| 544 | 564 | ||
| 545 | /* | 565 | /* |
| @@ -569,21 +589,13 @@ static ssize_t | |||
| 569 | system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | 589 | system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, |
| 570 | loff_t *ppos) | 590 | loff_t *ppos) |
| 571 | { | 591 | { |
| 572 | const char *system = filp->private_data; | 592 | struct event_subsystem *system = filp->private_data; |
| 593 | const char *name = NULL; | ||
| 573 | unsigned long val; | 594 | unsigned long val; |
| 574 | char buf[64]; | ||
| 575 | ssize_t ret; | 595 | ssize_t ret; |
| 576 | 596 | ||
| 577 | if (cnt >= sizeof(buf)) | 597 | ret = kstrtoul_from_user(ubuf, cnt, 10, &val); |
| 578 | return -EINVAL; | 598 | if (ret) |
| 579 | |||
| 580 | if (copy_from_user(&buf, ubuf, cnt)) | ||
| 581 | return -EFAULT; | ||
| 582 | |||
| 583 | buf[cnt] = 0; | ||
| 584 | |||
| 585 | ret = strict_strtoul(buf, 10, &val); | ||
| 586 | if (ret < 0) | ||
| 587 | return ret; | 599 | return ret; |
| 588 | 600 | ||
| 589 | ret = tracing_update_buffers(); | 601 | ret = tracing_update_buffers(); |
| @@ -593,7 +605,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
| 593 | if (val != 0 && val != 1) | 605 | if (val != 0 && val != 1) |
| 594 | return -EINVAL; | 606 | return -EINVAL; |
| 595 | 607 | ||
| 596 | ret = __ftrace_set_clr_event(NULL, system, NULL, val); | 608 | /* |
| 609 | * Opening of "enable" adds a ref count to system, | ||
| 610 | * so the name is safe to use. | ||
| 611 | */ | ||
| 612 | if (system) | ||
| 613 | name = system->name; | ||
| 614 | |||
| 615 | ret = __ftrace_set_clr_event(NULL, name, NULL, val); | ||
| 597 | if (ret) | 616 | if (ret) |
| 598 | goto out; | 617 | goto out; |
| 599 | 618 | ||
| @@ -826,6 +845,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
| 826 | return cnt; | 845 | return cnt; |
| 827 | } | 846 | } |
| 828 | 847 | ||
| 848 | static LIST_HEAD(event_subsystems); | ||
| 849 | |||
| 850 | static int subsystem_open(struct inode *inode, struct file *filp) | ||
| 851 | { | ||
| 852 | struct event_subsystem *system = NULL; | ||
| 853 | int ret; | ||
| 854 | |||
| 855 | if (!inode->i_private) | ||
| 856 | goto skip_search; | ||
| 857 | |||
| 858 | /* Make sure the system still exists */ | ||
| 859 | mutex_lock(&event_mutex); | ||
| 860 | list_for_each_entry(system, &event_subsystems, list) { | ||
| 861 | if (system == inode->i_private) { | ||
| 862 | /* Don't open systems with no events */ | ||
| 863 | if (!system->nr_events) { | ||
| 864 | system = NULL; | ||
| 865 | break; | ||
| 866 | } | ||
| 867 | __get_system(system); | ||
| 868 | break; | ||
| 869 | } | ||
| 870 | } | ||
| 871 | mutex_unlock(&event_mutex); | ||
| 872 | |||
| 873 | if (system != inode->i_private) | ||
| 874 | return -ENODEV; | ||
| 875 | |||
| 876 | skip_search: | ||
| 877 | ret = tracing_open_generic(inode, filp); | ||
| 878 | if (ret < 0 && system) | ||
| 879 | put_system(system); | ||
| 880 | |||
| 881 | return ret; | ||
| 882 | } | ||
| 883 | |||
| 884 | static int subsystem_release(struct inode *inode, struct file *file) | ||
| 885 | { | ||
| 886 | struct event_subsystem *system = inode->i_private; | ||
| 887 | |||
| 888 | if (system) | ||
| 889 | put_system(system); | ||
| 890 | |||
| 891 | return 0; | ||
| 892 | } | ||
| 893 | |||
| 829 | static ssize_t | 894 | static ssize_t |
| 830 | subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, | 895 | subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, |
| 831 | loff_t *ppos) | 896 | loff_t *ppos) |
| @@ -963,17 +1028,19 @@ static const struct file_operations ftrace_event_filter_fops = { | |||
| 963 | }; | 1028 | }; |
| 964 | 1029 | ||
| 965 | static const struct file_operations ftrace_subsystem_filter_fops = { | 1030 | static const struct file_operations ftrace_subsystem_filter_fops = { |
| 966 | .open = tracing_open_generic, | 1031 | .open = subsystem_open, |
| 967 | .read = subsystem_filter_read, | 1032 | .read = subsystem_filter_read, |
| 968 | .write = subsystem_filter_write, | 1033 | .write = subsystem_filter_write, |
| 969 | .llseek = default_llseek, | 1034 | .llseek = default_llseek, |
| 1035 | .release = subsystem_release, | ||
| 970 | }; | 1036 | }; |
| 971 | 1037 | ||
| 972 | static const struct file_operations ftrace_system_enable_fops = { | 1038 | static const struct file_operations ftrace_system_enable_fops = { |
| 973 | .open = tracing_open_generic, | 1039 | .open = subsystem_open, |
| 974 | .read = system_enable_read, | 1040 | .read = system_enable_read, |
| 975 | .write = system_enable_write, | 1041 | .write = system_enable_write, |
| 976 | .llseek = default_llseek, | 1042 | .llseek = default_llseek, |
| 1043 | .release = subsystem_release, | ||
| 977 | }; | 1044 | }; |
| 978 | 1045 | ||
| 979 | static const struct file_operations ftrace_show_header_fops = { | 1046 | static const struct file_operations ftrace_show_header_fops = { |
| @@ -1002,8 +1069,6 @@ static struct dentry *event_trace_events_dir(void) | |||
| 1002 | return d_events; | 1069 | return d_events; |
| 1003 | } | 1070 | } |
| 1004 | 1071 | ||
| 1005 | static LIST_HEAD(event_subsystems); | ||
| 1006 | |||
| 1007 | static struct dentry * | 1072 | static struct dentry * |
| 1008 | event_subsystem_dir(const char *name, struct dentry *d_events) | 1073 | event_subsystem_dir(const char *name, struct dentry *d_events) |
| 1009 | { | 1074 | { |
| @@ -1013,6 +1078,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
| 1013 | /* First see if we did not already create this dir */ | 1078 | /* First see if we did not already create this dir */ |
| 1014 | list_for_each_entry(system, &event_subsystems, list) { | 1079 | list_for_each_entry(system, &event_subsystems, list) { |
| 1015 | if (strcmp(system->name, name) == 0) { | 1080 | if (strcmp(system->name, name) == 0) { |
| 1081 | __get_system(system); | ||
| 1016 | system->nr_events++; | 1082 | system->nr_events++; |
| 1017 | return system->entry; | 1083 | return system->entry; |
| 1018 | } | 1084 | } |
| @@ -1035,6 +1101,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
| 1035 | } | 1101 | } |
| 1036 | 1102 | ||
| 1037 | system->nr_events = 1; | 1103 | system->nr_events = 1; |
| 1104 | system->ref_count = 1; | ||
| 1038 | system->name = kstrdup(name, GFP_KERNEL); | 1105 | system->name = kstrdup(name, GFP_KERNEL); |
| 1039 | if (!system->name) { | 1106 | if (!system->name) { |
| 1040 | debugfs_remove(system->entry); | 1107 | debugfs_remove(system->entry); |
| @@ -1062,8 +1129,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) | |||
| 1062 | "'%s/filter' entry\n", name); | 1129 | "'%s/filter' entry\n", name); |
| 1063 | } | 1130 | } |
| 1064 | 1131 | ||
| 1065 | trace_create_file("enable", 0644, system->entry, | 1132 | trace_create_file("enable", 0644, system->entry, system, |
| 1066 | (void *)system->name, | ||
| 1067 | &ftrace_system_enable_fops); | 1133 | &ftrace_system_enable_fops); |
| 1068 | 1134 | ||
| 1069 | return system->entry; | 1135 | return system->entry; |
| @@ -1184,16 +1250,9 @@ static void remove_subsystem_dir(const char *name) | |||
| 1184 | list_for_each_entry(system, &event_subsystems, list) { | 1250 | list_for_each_entry(system, &event_subsystems, list) { |
| 1185 | if (strcmp(system->name, name) == 0) { | 1251 | if (strcmp(system->name, name) == 0) { |
| 1186 | if (!--system->nr_events) { | 1252 | if (!--system->nr_events) { |
| 1187 | struct event_filter *filter = system->filter; | ||
| 1188 | |||
| 1189 | debugfs_remove_recursive(system->entry); | 1253 | debugfs_remove_recursive(system->entry); |
| 1190 | list_del(&system->list); | 1254 | list_del(&system->list); |
| 1191 | if (filter) { | 1255 | __put_system(system); |
| 1192 | kfree(filter->filter_string); | ||
| 1193 | kfree(filter); | ||
| 1194 | } | ||
| 1195 | kfree(system->name); | ||
| 1196 | kfree(system); | ||
| 1197 | } | 1256 | } |
| 1198 | break; | 1257 | break; |
| 1199 | } | 1258 | } |
| @@ -1657,7 +1716,12 @@ static struct ftrace_ops trace_ops __initdata = | |||
| 1657 | 1716 | ||
| 1658 | static __init void event_trace_self_test_with_function(void) | 1717 | static __init void event_trace_self_test_with_function(void) |
| 1659 | { | 1718 | { |
| 1660 | register_ftrace_function(&trace_ops); | 1719 | int ret; |
| 1720 | ret = register_ftrace_function(&trace_ops); | ||
| 1721 | if (WARN_ON(ret < 0)) { | ||
| 1722 | pr_info("Failed to enable function tracer for event tests\n"); | ||
| 1723 | return; | ||
| 1724 | } | ||
| 1661 | pr_info("Running tests again, along with the function tracer\n"); | 1725 | pr_info("Running tests again, along with the function tracer\n"); |
| 1662 | event_trace_self_tests(); | 1726 | event_trace_self_tests(); |
| 1663 | unregister_ftrace_function(&trace_ops); | 1727 | unregister_ftrace_function(&trace_ops); |
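The trace_events.c hunk above ties the lifetime of an event_subsystem to a reference count: subsystem_open() takes a reference under event_mutex only if the subsystem still has events, and subsystem_release() drops it. The following is a minimal userspace sketch of that open/release pattern, not the kernel code itself; the struct, the pthread mutex standing in for event_mutex, and all function names here are simplified inventions.

/* Simplified stand-in for the get/put lifetime pattern used by
 * subsystem_open()/subsystem_release() above. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct subsys {
	char *name;
	int nr_events;
	int ref_count;			/* creation holds one reference */
};

static pthread_mutex_t subsys_mutex = PTHREAD_MUTEX_INITIALIZER;

static void subsys_get(struct subsys *s)
{
	s->ref_count++;			/* caller holds subsys_mutex */
}

static void subsys_put(struct subsys *s)
{
	if (--s->ref_count == 0) {
		free(s->name);
		free(s);
	}
}

/* "open": take a reference only while the subsystem still has events */
static struct subsys *subsys_open(struct subsys *s)
{
	struct subsys *found = NULL;

	pthread_mutex_lock(&subsys_mutex);
	if (s->nr_events) {
		subsys_get(s);
		found = s;
	}
	pthread_mutex_unlock(&subsys_mutex);
	return found;			/* NULL plays the role of -ENODEV */
}

/* "release": drop the reference taken at open time */
static void subsys_release(struct subsys *s)
{
	pthread_mutex_lock(&subsys_mutex);
	subsys_put(s);
	pthread_mutex_unlock(&subsys_mutex);
}

int main(void)
{
	struct subsys *s = calloc(1, sizeof(*s));

	s->name = strdup("sched");
	s->nr_events = 1;
	s->ref_count = 1;		/* reference held by the registration side */

	struct subsys *h = subsys_open(s);
	printf("open %s\n", h ? "succeeded" : "failed");
	if (h)
		subsys_release(h);
	subsys_put(s);			/* registration side drops its reference */
	return 0;
}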
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddcfbf20..256764ecccd6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, | |||
| 1886 | 1886 | ||
| 1887 | mutex_lock(&event_mutex); | 1887 | mutex_lock(&event_mutex); |
| 1888 | 1888 | ||
| 1889 | /* Make sure the system still has events */ | ||
| 1890 | if (!system->nr_events) { | ||
| 1891 | err = -ENODEV; | ||
| 1892 | goto out_unlock; | ||
| 1893 | } | ||
| 1894 | |||
| 1889 | if (!strcmp(strstrip(filter_string), "0")) { | 1895 | if (!strcmp(strstrip(filter_string), "0")) { |
| 1890 | filter_free_subsystem_preds(system); | 1896 | filter_free_subsystem_preds(system); |
| 1891 | remove_filter_string(system->filter); | 1897 | remove_filter_string(system->filter); |
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 16aee4d44e8f..c7b0c6a7db09 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 149 | static struct ftrace_ops trace_ops __read_mostly = | 149 | static struct ftrace_ops trace_ops __read_mostly = |
| 150 | { | 150 | { |
| 151 | .func = function_trace_call, | 151 | .func = function_trace_call, |
| 152 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
| 152 | }; | 153 | }; |
| 153 | 154 | ||
| 154 | static struct ftrace_ops trace_stack_ops __read_mostly = | 155 | static struct ftrace_ops trace_stack_ops __read_mostly = |
| 155 | { | 156 | { |
| 156 | .func = function_stack_trace_call, | 157 | .func = function_stack_trace_call, |
| 158 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
| 157 | }; | 159 | }; |
| 158 | 160 | ||
| 159 | /* Our two options */ | 161 | /* Our two options */ |
| @@ -322,7 +324,8 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) | |||
| 322 | } | 324 | } |
| 323 | 325 | ||
| 324 | static int | 326 | static int |
| 325 | ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable) | 327 | ftrace_trace_onoff_callback(struct ftrace_hash *hash, |
| 328 | char *glob, char *cmd, char *param, int enable) | ||
| 326 | { | 329 | { |
| 327 | struct ftrace_probe_ops *ops; | 330 | struct ftrace_probe_ops *ops; |
| 328 | void *count = (void *)-1; | 331 | void *count = (void *)-1; |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 962cdb24ed81..a7d2a4c653d8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -74,6 +74,20 @@ static struct tracer_flags tracer_flags = { | |||
| 74 | 74 | ||
| 75 | static struct trace_array *graph_array; | 75 | static struct trace_array *graph_array; |
| 76 | 76 | ||
| 77 | /* | ||
| 78 | * The DURATION column is also used to display IRQ signs; the | ||
| 79 | * following values are used by print_graph_irq and others | ||
| 80 | * to fill in space in the DURATION column. | ||
| 81 | */ | ||
| 82 | enum { | ||
| 83 | DURATION_FILL_FULL = -1, | ||
| 84 | DURATION_FILL_START = -2, | ||
| 85 | DURATION_FILL_END = -3, | ||
| 86 | }; | ||
| 87 | |||
| 88 | static enum print_line_t | ||
| 89 | print_graph_duration(unsigned long long duration, struct trace_seq *s, | ||
| 90 | u32 flags); | ||
| 77 | 91 | ||
| 78 | /* Add a function return address to the trace stack on thread info.*/ | 92 | /* Add a function return address to the trace stack on thread info.*/ |
| 79 | int | 93 | int |
| @@ -213,7 +227,7 @@ int __trace_graph_entry(struct trace_array *tr, | |||
| 213 | 227 | ||
| 214 | static inline int ftrace_graph_ignore_irqs(void) | 228 | static inline int ftrace_graph_ignore_irqs(void) |
| 215 | { | 229 | { |
| 216 | if (!ftrace_graph_skip_irqs) | 230 | if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) |
| 217 | return 0; | 231 | return 0; |
| 218 | 232 | ||
| 219 | return in_irq(); | 233 | return in_irq(); |
| @@ -577,32 +591,6 @@ get_return_for_leaf(struct trace_iterator *iter, | |||
| 577 | return next; | 591 | return next; |
| 578 | } | 592 | } |
| 579 | 593 | ||
| 580 | /* Signal a overhead of time execution to the output */ | ||
| 581 | static int | ||
| 582 | print_graph_overhead(unsigned long long duration, struct trace_seq *s, | ||
| 583 | u32 flags) | ||
| 584 | { | ||
| 585 | /* If duration disappear, we don't need anything */ | ||
| 586 | if (!(flags & TRACE_GRAPH_PRINT_DURATION)) | ||
| 587 | return 1; | ||
| 588 | |||
| 589 | /* Non nested entry or return */ | ||
| 590 | if (duration == -1) | ||
| 591 | return trace_seq_printf(s, " "); | ||
| 592 | |||
| 593 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | ||
| 594 | /* Duration exceeded 100 msecs */ | ||
| 595 | if (duration > 100000ULL) | ||
| 596 | return trace_seq_printf(s, "! "); | ||
| 597 | |||
| 598 | /* Duration exceeded 10 msecs */ | ||
| 599 | if (duration > 10000ULL) | ||
| 600 | return trace_seq_printf(s, "+ "); | ||
| 601 | } | ||
| 602 | |||
| 603 | return trace_seq_printf(s, " "); | ||
| 604 | } | ||
| 605 | |||
| 606 | static int print_graph_abs_time(u64 t, struct trace_seq *s) | 594 | static int print_graph_abs_time(u64 t, struct trace_seq *s) |
| 607 | { | 595 | { |
| 608 | unsigned long usecs_rem; | 596 | unsigned long usecs_rem; |
| @@ -625,34 +613,36 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
| 625 | addr >= (unsigned long)__irqentry_text_end) | 613 | addr >= (unsigned long)__irqentry_text_end) |
| 626 | return TRACE_TYPE_UNHANDLED; | 614 | return TRACE_TYPE_UNHANDLED; |
| 627 | 615 | ||
| 628 | /* Absolute time */ | 616 | if (trace_flags & TRACE_ITER_CONTEXT_INFO) { |
| 629 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 617 | /* Absolute time */ |
| 630 | ret = print_graph_abs_time(iter->ts, s); | 618 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { |
| 631 | if (!ret) | 619 | ret = print_graph_abs_time(iter->ts, s); |
| 632 | return TRACE_TYPE_PARTIAL_LINE; | 620 | if (!ret) |
| 633 | } | 621 | return TRACE_TYPE_PARTIAL_LINE; |
| 622 | } | ||
| 634 | 623 | ||
| 635 | /* Cpu */ | 624 | /* Cpu */ |
| 636 | if (flags & TRACE_GRAPH_PRINT_CPU) { | 625 | if (flags & TRACE_GRAPH_PRINT_CPU) { |
| 637 | ret = print_graph_cpu(s, cpu); | 626 | ret = print_graph_cpu(s, cpu); |
| 638 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 627 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
| 639 | return TRACE_TYPE_PARTIAL_LINE; | 628 | return TRACE_TYPE_PARTIAL_LINE; |
| 640 | } | 629 | } |
| 641 | 630 | ||
| 642 | /* Proc */ | 631 | /* Proc */ |
| 643 | if (flags & TRACE_GRAPH_PRINT_PROC) { | 632 | if (flags & TRACE_GRAPH_PRINT_PROC) { |
| 644 | ret = print_graph_proc(s, pid); | 633 | ret = print_graph_proc(s, pid); |
| 645 | if (ret == TRACE_TYPE_PARTIAL_LINE) | 634 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
| 646 | return TRACE_TYPE_PARTIAL_LINE; | 635 | return TRACE_TYPE_PARTIAL_LINE; |
| 647 | ret = trace_seq_printf(s, " | "); | 636 | ret = trace_seq_printf(s, " | "); |
| 648 | if (!ret) | 637 | if (!ret) |
| 649 | return TRACE_TYPE_PARTIAL_LINE; | 638 | return TRACE_TYPE_PARTIAL_LINE; |
| 639 | } | ||
| 650 | } | 640 | } |
| 651 | 641 | ||
| 652 | /* No overhead */ | 642 | /* No overhead */ |
| 653 | ret = print_graph_overhead(-1, s, flags); | 643 | ret = print_graph_duration(DURATION_FILL_START, s, flags); |
| 654 | if (!ret) | 644 | if (ret != TRACE_TYPE_HANDLED) |
| 655 | return TRACE_TYPE_PARTIAL_LINE; | 645 | return ret; |
| 656 | 646 | ||
| 657 | if (type == TRACE_GRAPH_ENT) | 647 | if (type == TRACE_GRAPH_ENT) |
| 658 | ret = trace_seq_printf(s, "==========>"); | 648 | ret = trace_seq_printf(s, "==========>"); |
| @@ -662,9 +652,10 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, | |||
| 662 | if (!ret) | 652 | if (!ret) |
| 663 | return TRACE_TYPE_PARTIAL_LINE; | 653 | return TRACE_TYPE_PARTIAL_LINE; |
| 664 | 654 | ||
| 665 | /* Don't close the duration column if haven't one */ | 655 | ret = print_graph_duration(DURATION_FILL_END, s, flags); |
| 666 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 656 | if (ret != TRACE_TYPE_HANDLED) |
| 667 | trace_seq_printf(s, " |"); | 657 | return ret; |
| 658 | |||
| 668 | ret = trace_seq_printf(s, "\n"); | 659 | ret = trace_seq_printf(s, "\n"); |
| 669 | 660 | ||
| 670 | if (!ret) | 661 | if (!ret) |
| @@ -716,9 +707,49 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) | |||
| 716 | } | 707 | } |
| 717 | 708 | ||
| 718 | static enum print_line_t | 709 | static enum print_line_t |
| 719 | print_graph_duration(unsigned long long duration, struct trace_seq *s) | 710 | print_graph_duration(unsigned long long duration, struct trace_seq *s, |
| 711 | u32 flags) | ||
| 720 | { | 712 | { |
| 721 | int ret; | 713 | int ret = -1; |
| 714 | |||
| 715 | if (!(flags & TRACE_GRAPH_PRINT_DURATION) || | ||
| 716 | !(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
| 717 | return TRACE_TYPE_HANDLED; | ||
| 718 | |||
| 719 | /* No real data, just filling the column with spaces */ | ||
| 720 | switch (duration) { | ||
| 721 | case DURATION_FILL_FULL: | ||
| 722 | ret = trace_seq_printf(s, " | "); | ||
| 723 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | ||
| 724 | case DURATION_FILL_START: | ||
| 725 | ret = trace_seq_printf(s, " "); | ||
| 726 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | ||
| 727 | case DURATION_FILL_END: | ||
| 728 | ret = trace_seq_printf(s, " |"); | ||
| 729 | return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; | ||
| 730 | } | ||
| 731 | |||
| 732 | /* Signal an overhead of execution time to the output */ | ||
| 733 | if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { | ||
| 734 | /* Duration exceeded 100 msecs */ | ||
| 735 | if (duration > 100000ULL) | ||
| 736 | ret = trace_seq_printf(s, "! "); | ||
| 737 | /* Duration exceeded 10 msecs */ | ||
| 738 | else if (duration > 10000ULL) | ||
| 739 | ret = trace_seq_printf(s, "+ "); | ||
| 740 | } | ||
| 741 | |||
| 742 | /* | ||
| 743 | * The -1 means we either did not exceed the duration thresholds | ||
| 744 | * or we don't want to print out the overhead. Either way we need | ||
| 745 | * to fill out the space. | ||
| 746 | */ | ||
| 747 | if (ret == -1) | ||
| 748 | ret = trace_seq_printf(s, " "); | ||
| 749 | |||
| 750 | /* Catch here any failure that happened above */ | ||
| 751 | if (!ret) | ||
| 752 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 722 | 753 | ||
| 723 | ret = trace_print_graph_duration(duration, s); | 754 | ret = trace_print_graph_duration(duration, s); |
| 724 | if (ret != TRACE_TYPE_HANDLED) | 755 | if (ret != TRACE_TYPE_HANDLED) |
| @@ -767,18 +798,11 @@ print_graph_entry_leaf(struct trace_iterator *iter, | |||
| 767 | cpu_data->enter_funcs[call->depth] = 0; | 798 | cpu_data->enter_funcs[call->depth] = 0; |
| 768 | } | 799 | } |
| 769 | 800 | ||
| 770 | /* Overhead */ | 801 | /* Overhead and duration */ |
| 771 | ret = print_graph_overhead(duration, s, flags); | 802 | ret = print_graph_duration(duration, s, flags); |
| 772 | if (!ret) | 803 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
| 773 | return TRACE_TYPE_PARTIAL_LINE; | 804 | return TRACE_TYPE_PARTIAL_LINE; |
| 774 | 805 | ||
| 775 | /* Duration */ | ||
| 776 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | ||
| 777 | ret = print_graph_duration(duration, s); | ||
| 778 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 779 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 780 | } | ||
| 781 | |||
| 782 | /* Function */ | 806 | /* Function */ |
| 783 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 807 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { |
| 784 | ret = trace_seq_printf(s, " "); | 808 | ret = trace_seq_printf(s, " "); |
| @@ -815,17 +839,10 @@ print_graph_entry_nested(struct trace_iterator *iter, | |||
| 815 | cpu_data->enter_funcs[call->depth] = call->func; | 839 | cpu_data->enter_funcs[call->depth] = call->func; |
| 816 | } | 840 | } |
| 817 | 841 | ||
| 818 | /* No overhead */ | ||
| 819 | ret = print_graph_overhead(-1, s, flags); | ||
| 820 | if (!ret) | ||
| 821 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 822 | |||
| 823 | /* No time */ | 842 | /* No time */ |
| 824 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | 843 | ret = print_graph_duration(DURATION_FILL_FULL, s, flags); |
| 825 | ret = trace_seq_printf(s, " | "); | 844 | if (ret != TRACE_TYPE_HANDLED) |
| 826 | if (!ret) | 845 | return ret; |
| 827 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 828 | } | ||
| 829 | 846 | ||
| 830 | /* Function */ | 847 | /* Function */ |
| 831 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { | 848 | for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { |
| @@ -865,6 +882,9 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, | |||
| 865 | return TRACE_TYPE_PARTIAL_LINE; | 882 | return TRACE_TYPE_PARTIAL_LINE; |
| 866 | } | 883 | } |
| 867 | 884 | ||
| 885 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
| 886 | return 0; | ||
| 887 | |||
| 868 | /* Absolute time */ | 888 | /* Absolute time */ |
| 869 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { | 889 | if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { |
| 870 | ret = print_graph_abs_time(iter->ts, s); | 890 | ret = print_graph_abs_time(iter->ts, s); |
| @@ -1078,18 +1098,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, | |||
| 1078 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1098 | if (print_graph_prologue(iter, s, 0, 0, flags)) |
| 1079 | return TRACE_TYPE_PARTIAL_LINE; | 1099 | return TRACE_TYPE_PARTIAL_LINE; |
| 1080 | 1100 | ||
| 1081 | /* Overhead */ | 1101 | /* Overhead and duration */ |
| 1082 | ret = print_graph_overhead(duration, s, flags); | 1102 | ret = print_graph_duration(duration, s, flags); |
| 1083 | if (!ret) | 1103 | if (ret == TRACE_TYPE_PARTIAL_LINE) |
| 1084 | return TRACE_TYPE_PARTIAL_LINE; | 1104 | return TRACE_TYPE_PARTIAL_LINE; |
| 1085 | 1105 | ||
| 1086 | /* Duration */ | ||
| 1087 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | ||
| 1088 | ret = print_graph_duration(duration, s); | ||
| 1089 | if (ret == TRACE_TYPE_PARTIAL_LINE) | ||
| 1090 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1091 | } | ||
| 1092 | |||
| 1093 | /* Closing brace */ | 1106 | /* Closing brace */ |
| 1094 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { | 1107 | for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { |
| 1095 | ret = trace_seq_printf(s, " "); | 1108 | ret = trace_seq_printf(s, " "); |
| @@ -1146,17 +1159,10 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1146 | if (print_graph_prologue(iter, s, 0, 0, flags)) | 1159 | if (print_graph_prologue(iter, s, 0, 0, flags)) |
| 1147 | return TRACE_TYPE_PARTIAL_LINE; | 1160 | return TRACE_TYPE_PARTIAL_LINE; |
| 1148 | 1161 | ||
| 1149 | /* No overhead */ | ||
| 1150 | ret = print_graph_overhead(-1, s, flags); | ||
| 1151 | if (!ret) | ||
| 1152 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1153 | |||
| 1154 | /* No time */ | 1162 | /* No time */ |
| 1155 | if (flags & TRACE_GRAPH_PRINT_DURATION) { | 1163 | ret = print_graph_duration(DURATION_FILL_FULL, s, flags); |
| 1156 | ret = trace_seq_printf(s, " | "); | 1164 | if (ret != TRACE_TYPE_HANDLED) |
| 1157 | if (!ret) | 1165 | return ret; |
| 1158 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 1159 | } | ||
| 1160 | 1166 | ||
| 1161 | /* Indentation */ | 1167 | /* Indentation */ |
| 1162 | if (depth > 0) | 1168 | if (depth > 0) |
| @@ -1207,7 +1213,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, | |||
| 1207 | 1213 | ||
| 1208 | 1214 | ||
| 1209 | enum print_line_t | 1215 | enum print_line_t |
| 1210 | __print_graph_function_flags(struct trace_iterator *iter, u32 flags) | 1216 | print_graph_function_flags(struct trace_iterator *iter, u32 flags) |
| 1211 | { | 1217 | { |
| 1212 | struct ftrace_graph_ent_entry *field; | 1218 | struct ftrace_graph_ent_entry *field; |
| 1213 | struct fgraph_data *data = iter->private; | 1219 | struct fgraph_data *data = iter->private; |
| @@ -1270,18 +1276,7 @@ __print_graph_function_flags(struct trace_iterator *iter, u32 flags) | |||
| 1270 | static enum print_line_t | 1276 | static enum print_line_t |
| 1271 | print_graph_function(struct trace_iterator *iter) | 1277 | print_graph_function(struct trace_iterator *iter) |
| 1272 | { | 1278 | { |
| 1273 | return __print_graph_function_flags(iter, tracer_flags.val); | 1279 | return print_graph_function_flags(iter, tracer_flags.val); |
| 1274 | } | ||
| 1275 | |||
| 1276 | enum print_line_t print_graph_function_flags(struct trace_iterator *iter, | ||
| 1277 | u32 flags) | ||
| 1278 | { | ||
| 1279 | if (trace_flags & TRACE_ITER_LATENCY_FMT) | ||
| 1280 | flags |= TRACE_GRAPH_PRINT_DURATION; | ||
| 1281 | else | ||
| 1282 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
| 1283 | |||
| 1284 | return __print_graph_function_flags(iter, flags); | ||
| 1285 | } | 1280 | } |
| 1286 | 1281 | ||
| 1287 | static enum print_line_t | 1282 | static enum print_line_t |
| @@ -1309,8 +1304,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) | |||
| 1309 | seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); | 1304 | seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); |
| 1310 | seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); | 1305 | seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); |
| 1311 | seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); | 1306 | seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); |
| 1312 | seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); | 1307 | seq_printf(s, "#%.*s||| / \n", size, spaces); |
| 1313 | seq_printf(s, "#%.*s|||| / \n", size, spaces); | ||
| 1314 | } | 1308 | } |
| 1315 | 1309 | ||
| 1316 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | 1310 | static void __print_graph_headers_flags(struct seq_file *s, u32 flags) |
| @@ -1329,7 +1323,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
| 1329 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1323 | if (flags & TRACE_GRAPH_PRINT_PROC) |
| 1330 | seq_printf(s, " TASK/PID "); | 1324 | seq_printf(s, " TASK/PID "); |
| 1331 | if (lat) | 1325 | if (lat) |
| 1332 | seq_printf(s, "|||||"); | 1326 | seq_printf(s, "||||"); |
| 1333 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1327 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
| 1334 | seq_printf(s, " DURATION "); | 1328 | seq_printf(s, " DURATION "); |
| 1335 | seq_printf(s, " FUNCTION CALLS\n"); | 1329 | seq_printf(s, " FUNCTION CALLS\n"); |
| @@ -1343,7 +1337,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
| 1343 | if (flags & TRACE_GRAPH_PRINT_PROC) | 1337 | if (flags & TRACE_GRAPH_PRINT_PROC) |
| 1344 | seq_printf(s, " | | "); | 1338 | seq_printf(s, " | | "); |
| 1345 | if (lat) | 1339 | if (lat) |
| 1346 | seq_printf(s, "|||||"); | 1340 | seq_printf(s, "||||"); |
| 1347 | if (flags & TRACE_GRAPH_PRINT_DURATION) | 1341 | if (flags & TRACE_GRAPH_PRINT_DURATION) |
| 1348 | seq_printf(s, " | | "); | 1342 | seq_printf(s, " | | "); |
| 1349 | seq_printf(s, " | | | |\n"); | 1343 | seq_printf(s, " | | | |\n"); |
| @@ -1358,15 +1352,16 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) | |||
| 1358 | { | 1352 | { |
| 1359 | struct trace_iterator *iter = s->private; | 1353 | struct trace_iterator *iter = s->private; |
| 1360 | 1354 | ||
| 1355 | if (!(trace_flags & TRACE_ITER_CONTEXT_INFO)) | ||
| 1356 | return; | ||
| 1357 | |||
| 1361 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { | 1358 | if (trace_flags & TRACE_ITER_LATENCY_FMT) { |
| 1362 | /* print nothing if the buffers are empty */ | 1359 | /* print nothing if the buffers are empty */ |
| 1363 | if (trace_empty(iter)) | 1360 | if (trace_empty(iter)) |
| 1364 | return; | 1361 | return; |
| 1365 | 1362 | ||
| 1366 | print_trace_header(s, iter); | 1363 | print_trace_header(s, iter); |
| 1367 | flags |= TRACE_GRAPH_PRINT_DURATION; | 1364 | } |
| 1368 | } else | ||
| 1369 | flags |= TRACE_GRAPH_PRINT_ABS_TIME; | ||
| 1370 | 1365 | ||
| 1371 | __print_graph_headers_flags(s, flags); | 1366 | __print_graph_headers_flags(s, flags); |
| 1372 | } | 1367 | } |
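The graph-tracer hunk above folds print_graph_overhead() into print_graph_duration() and uses negative sentinel values (DURATION_FILL_FULL/START/END) to fill the DURATION column when there is no real duration. A rough, self-contained illustration of that sentinel dispatch follows; the constant names, column widths and plain printf output are stand-ins for the kernel's trace_seq calls, not its actual formatting.

/* Illustration of the DURATION_FILL_* sentinel dispatch. */
#include <stdio.h>

enum {
	FILL_FULL  = -1,	/* fill the whole column, then the separator */
	FILL_START = -2,	/* leading spaces only */
	FILL_END   = -3,	/* closing separator only */
};

#define PRINT_DURATION	0x1
#define PRINT_OVERHEAD	0x2

static void print_duration(long long duration, unsigned int flags)
{
	if (!(flags & PRINT_DURATION))
		return;

	/* Sentinels: no real data, just fill the column with spaces */
	switch (duration) {
	case FILL_FULL:
		printf("            |  ");
		return;
	case FILL_START:
		printf("  ");
		return;
	case FILL_END:
		printf(" |");
		return;
	}

	/* Overhead marker, then the duration itself (in microseconds) */
	if ((flags & PRINT_OVERHEAD) && duration > 100000LL)
		printf("! ");
	else if ((flags & PRINT_OVERHEAD) && duration > 10000LL)
		printf("+ ");
	else
		printf("  ");
	printf("%lld us |", duration);
}

int main(void)
{
	unsigned int flags = PRINT_DURATION | PRINT_OVERHEAD;

	print_duration(FILL_FULL, flags);
	printf(" nested_call();\n");
	print_duration(123456, flags);
	printf(" slow_leaf();\n");
	return 0;
}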
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a4969b47afc1..667aa8cc0cfc 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 153 | static struct ftrace_ops trace_ops __read_mostly = | 153 | static struct ftrace_ops trace_ops __read_mostly = |
| 154 | { | 154 | { |
| 155 | .func = irqsoff_tracer_call, | 155 | .func = irqsoff_tracer_call, |
| 156 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
| 156 | }; | 157 | }; |
| 157 | #endif /* CONFIG_FUNCTION_TRACER */ | 158 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 158 | 159 | ||
| @@ -225,7 +226,9 @@ static void irqsoff_trace_close(struct trace_iterator *iter) | |||
| 225 | } | 226 | } |
| 226 | 227 | ||
| 227 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ | 228 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \ |
| 228 | TRACE_GRAPH_PRINT_PROC) | 229 | TRACE_GRAPH_PRINT_PROC | \ |
| 230 | TRACE_GRAPH_PRINT_ABS_TIME | \ | ||
| 231 | TRACE_GRAPH_PRINT_DURATION) | ||
| 229 | 232 | ||
| 230 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) | 233 | static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) |
| 231 | { | 234 | { |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35d55a386145..5fb3697bf0e5 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -53,7 +53,6 @@ const char *reserved_field_names[] = { | |||
| 53 | "common_preempt_count", | 53 | "common_preempt_count", |
| 54 | "common_pid", | 54 | "common_pid", |
| 55 | "common_tgid", | 55 | "common_tgid", |
| 56 | "common_lock_depth", | ||
| 57 | FIELD_STRING_IP, | 56 | FIELD_STRING_IP, |
| 58 | FIELD_STRING_RETIP, | 57 | FIELD_STRING_RETIP, |
| 59 | FIELD_STRING_FUNC, | 58 | FIELD_STRING_FUNC, |
| @@ -344,6 +343,14 @@ DEFINE_BASIC_FETCH_FUNCS(deref) | |||
| 344 | DEFINE_FETCH_deref(string) | 343 | DEFINE_FETCH_deref(string) |
| 345 | DEFINE_FETCH_deref(string_size) | 344 | DEFINE_FETCH_deref(string_size) |
| 346 | 345 | ||
| 346 | static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data) | ||
| 347 | { | ||
| 348 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
| 349 | update_deref_fetch_param(data->orig.data); | ||
| 350 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
| 351 | update_symbol_cache(data->orig.data); | ||
| 352 | } | ||
| 353 | |||
| 347 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) | 354 | static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) |
| 348 | { | 355 | { |
| 349 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | 356 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) |
| @@ -378,6 +385,19 @@ DEFINE_BASIC_FETCH_FUNCS(bitfield) | |||
| 378 | #define fetch_bitfield_string_size NULL | 385 | #define fetch_bitfield_string_size NULL |
| 379 | 386 | ||
| 380 | static __kprobes void | 387 | static __kprobes void |
| 388 | update_bitfield_fetch_param(struct bitfield_fetch_param *data) | ||
| 389 | { | ||
| 390 | /* | ||
| 391 | * Don't check the bitfield itself, because this must be the | ||
| 392 | * last fetch function. | ||
| 393 | */ | ||
| 394 | if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) | ||
| 395 | update_deref_fetch_param(data->orig.data); | ||
| 396 | else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) | ||
| 397 | update_symbol_cache(data->orig.data); | ||
| 398 | } | ||
| 399 | |||
| 400 | static __kprobes void | ||
| 381 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) | 401 | free_bitfield_fetch_param(struct bitfield_fetch_param *data) |
| 382 | { | 402 | { |
| 383 | /* | 403 | /* |
| @@ -390,6 +410,7 @@ free_bitfield_fetch_param(struct bitfield_fetch_param *data) | |||
| 390 | free_symbol_cache(data->orig.data); | 410 | free_symbol_cache(data->orig.data); |
| 391 | kfree(data); | 411 | kfree(data); |
| 392 | } | 412 | } |
| 413 | |||
| 393 | /* Default (unsigned long) fetch type */ | 414 | /* Default (unsigned long) fetch type */ |
| 394 | #define __DEFAULT_FETCH_TYPE(t) u##t | 415 | #define __DEFAULT_FETCH_TYPE(t) u##t |
| 395 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) | 416 | #define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) |
| @@ -537,6 +558,7 @@ struct probe_arg { | |||
| 537 | /* Flags for trace_probe */ | 558 | /* Flags for trace_probe */ |
| 538 | #define TP_FLAG_TRACE 1 | 559 | #define TP_FLAG_TRACE 1 |
| 539 | #define TP_FLAG_PROFILE 2 | 560 | #define TP_FLAG_PROFILE 2 |
| 561 | #define TP_FLAG_REGISTERED 4 | ||
| 540 | 562 | ||
| 541 | struct trace_probe { | 563 | struct trace_probe { |
| 542 | struct list_head list; | 564 | struct list_head list; |
| @@ -556,16 +578,49 @@ struct trace_probe { | |||
| 556 | (sizeof(struct probe_arg) * (n))) | 578 | (sizeof(struct probe_arg) * (n))) |
| 557 | 579 | ||
| 558 | 580 | ||
| 559 | static __kprobes int probe_is_return(struct trace_probe *tp) | 581 | static __kprobes int trace_probe_is_return(struct trace_probe *tp) |
| 560 | { | 582 | { |
| 561 | return tp->rp.handler != NULL; | 583 | return tp->rp.handler != NULL; |
| 562 | } | 584 | } |
| 563 | 585 | ||
| 564 | static __kprobes const char *probe_symbol(struct trace_probe *tp) | 586 | static __kprobes const char *trace_probe_symbol(struct trace_probe *tp) |
| 565 | { | 587 | { |
| 566 | return tp->symbol ? tp->symbol : "unknown"; | 588 | return tp->symbol ? tp->symbol : "unknown"; |
| 567 | } | 589 | } |
| 568 | 590 | ||
| 591 | static __kprobes unsigned long trace_probe_offset(struct trace_probe *tp) | ||
| 592 | { | ||
| 593 | return tp->rp.kp.offset; | ||
| 594 | } | ||
| 595 | |||
| 596 | static __kprobes bool trace_probe_is_enabled(struct trace_probe *tp) | ||
| 597 | { | ||
| 598 | return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); | ||
| 599 | } | ||
| 600 | |||
| 601 | static __kprobes bool trace_probe_is_registered(struct trace_probe *tp) | ||
| 602 | { | ||
| 603 | return !!(tp->flags & TP_FLAG_REGISTERED); | ||
| 604 | } | ||
| 605 | |||
| 606 | static __kprobes bool trace_probe_has_gone(struct trace_probe *tp) | ||
| 607 | { | ||
| 608 | return !!(kprobe_gone(&tp->rp.kp)); | ||
| 609 | } | ||
| 610 | |||
| 611 | static __kprobes bool trace_probe_within_module(struct trace_probe *tp, | ||
| 612 | struct module *mod) | ||
| 613 | { | ||
| 614 | int len = strlen(mod->name); | ||
| 615 | const char *name = trace_probe_symbol(tp); | ||
| 616 | return strncmp(mod->name, name, len) == 0 && name[len] == ':'; | ||
| 617 | } | ||
| 618 | |||
| 619 | static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp) | ||
| 620 | { | ||
| 621 | return !!strchr(trace_probe_symbol(tp), ':'); | ||
| 622 | } | ||
| 623 | |||
| 569 | static int register_probe_event(struct trace_probe *tp); | 624 | static int register_probe_event(struct trace_probe *tp); |
| 570 | static void unregister_probe_event(struct trace_probe *tp); | 625 | static void unregister_probe_event(struct trace_probe *tp); |
| 571 | 626 | ||
| @@ -647,6 +702,16 @@ error: | |||
| 647 | return ERR_PTR(ret); | 702 | return ERR_PTR(ret); |
| 648 | } | 703 | } |
| 649 | 704 | ||
| 705 | static void update_probe_arg(struct probe_arg *arg) | ||
| 706 | { | ||
| 707 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | ||
| 708 | update_bitfield_fetch_param(arg->fetch.data); | ||
| 709 | else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) | ||
| 710 | update_deref_fetch_param(arg->fetch.data); | ||
| 711 | else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) | ||
| 712 | update_symbol_cache(arg->fetch.data); | ||
| 713 | } | ||
| 714 | |||
| 650 | static void free_probe_arg(struct probe_arg *arg) | 715 | static void free_probe_arg(struct probe_arg *arg) |
| 651 | { | 716 | { |
| 652 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) | 717 | if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn)) |
| @@ -672,7 +737,7 @@ static void free_trace_probe(struct trace_probe *tp) | |||
| 672 | kfree(tp); | 737 | kfree(tp); |
| 673 | } | 738 | } |
| 674 | 739 | ||
| 675 | static struct trace_probe *find_probe_event(const char *event, | 740 | static struct trace_probe *find_trace_probe(const char *event, |
| 676 | const char *group) | 741 | const char *group) |
| 677 | { | 742 | { |
| 678 | struct trace_probe *tp; | 743 | struct trace_probe *tp; |
| @@ -684,13 +749,96 @@ static struct trace_probe *find_probe_event(const char *event, | |||
| 684 | return NULL; | 749 | return NULL; |
| 685 | } | 750 | } |
| 686 | 751 | ||
| 752 | /* Enable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ | ||
| 753 | static int enable_trace_probe(struct trace_probe *tp, int flag) | ||
| 754 | { | ||
| 755 | int ret = 0; | ||
| 756 | |||
| 757 | tp->flags |= flag; | ||
| 758 | if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && | ||
| 759 | !trace_probe_has_gone(tp)) { | ||
| 760 | if (trace_probe_is_return(tp)) | ||
| 761 | ret = enable_kretprobe(&tp->rp); | ||
| 762 | else | ||
| 763 | ret = enable_kprobe(&tp->rp.kp); | ||
| 764 | } | ||
| 765 | |||
| 766 | return ret; | ||
| 767 | } | ||
| 768 | |||
| 769 | /* Disable trace_probe - @flag must be TP_FLAG_TRACE or TP_FLAG_PROFILE */ | ||
| 770 | static void disable_trace_probe(struct trace_probe *tp, int flag) | ||
| 771 | { | ||
| 772 | tp->flags &= ~flag; | ||
| 773 | if (!trace_probe_is_enabled(tp) && trace_probe_is_registered(tp)) { | ||
| 774 | if (trace_probe_is_return(tp)) | ||
| 775 | disable_kretprobe(&tp->rp); | ||
| 776 | else | ||
| 777 | disable_kprobe(&tp->rp.kp); | ||
| 778 | } | ||
| 779 | } | ||
| 780 | |||
| 781 | /* Internal register function - just handle k*probes and flags */ | ||
| 782 | static int __register_trace_probe(struct trace_probe *tp) | ||
| 783 | { | ||
| 784 | int i, ret; | ||
| 785 | |||
| 786 | if (trace_probe_is_registered(tp)) | ||
| 787 | return -EINVAL; | ||
| 788 | |||
| 789 | for (i = 0; i < tp->nr_args; i++) | ||
| 790 | update_probe_arg(&tp->args[i]); | ||
| 791 | |||
| 792 | /* Set/clear disabled flag according to tp->flags */ | ||
| 793 | if (trace_probe_is_enabled(tp)) | ||
| 794 | tp->rp.kp.flags &= ~KPROBE_FLAG_DISABLED; | ||
| 795 | else | ||
| 796 | tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; | ||
| 797 | |||
| 798 | if (trace_probe_is_return(tp)) | ||
| 799 | ret = register_kretprobe(&tp->rp); | ||
| 800 | else | ||
| 801 | ret = register_kprobe(&tp->rp.kp); | ||
| 802 | |||
| 803 | if (ret == 0) | ||
| 804 | tp->flags |= TP_FLAG_REGISTERED; | ||
| 805 | else { | ||
| 806 | pr_warning("Could not insert probe at %s+%lu: %d\n", | ||
| 807 | trace_probe_symbol(tp), trace_probe_offset(tp), ret); | ||
| 808 | if (ret == -ENOENT && trace_probe_is_on_module(tp)) { | ||
| 809 | pr_warning("This probe might be able to register after " | ||
| 810 | "target module is loaded. Continue.\n"); | ||
| 811 | ret = 0; | ||
| 812 | } else if (ret == -EILSEQ) { | ||
| 813 | pr_warning("Probing address(0x%p) is not an " | ||
| 814 | "instruction boundary.\n", | ||
| 815 | tp->rp.kp.addr); | ||
| 816 | ret = -EINVAL; | ||
| 817 | } | ||
| 818 | } | ||
| 819 | |||
| 820 | return ret; | ||
| 821 | } | ||
| 822 | |||
| 823 | /* Internal unregister function - just handle k*probes and flags */ | ||
| 824 | static void __unregister_trace_probe(struct trace_probe *tp) | ||
| 825 | { | ||
| 826 | if (trace_probe_is_registered(tp)) { | ||
| 827 | if (trace_probe_is_return(tp)) | ||
| 828 | unregister_kretprobe(&tp->rp); | ||
| 829 | else | ||
| 830 | unregister_kprobe(&tp->rp.kp); | ||
| 831 | tp->flags &= ~TP_FLAG_REGISTERED; | ||
| 832 | /* Cleanup kprobe for reuse */ | ||
| 833 | if (tp->rp.kp.symbol_name) | ||
| 834 | tp->rp.kp.addr = NULL; | ||
| 835 | } | ||
| 836 | } | ||
| 837 | |||
| 687 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ | 838 | /* Unregister a trace_probe and probe_event: call with locking probe_lock */ |
| 688 | static void unregister_trace_probe(struct trace_probe *tp) | 839 | static void unregister_trace_probe(struct trace_probe *tp) |
| 689 | { | 840 | { |
| 690 | if (probe_is_return(tp)) | 841 | __unregister_trace_probe(tp); |
| 691 | unregister_kretprobe(&tp->rp); | ||
| 692 | else | ||
| 693 | unregister_kprobe(&tp->rp.kp); | ||
| 694 | list_del(&tp->list); | 842 | list_del(&tp->list); |
| 695 | unregister_probe_event(tp); | 843 | unregister_probe_event(tp); |
| 696 | } | 844 | } |
| @@ -703,41 +851,65 @@ static int register_trace_probe(struct trace_probe *tp) | |||
| 703 | 851 | ||
| 704 | mutex_lock(&probe_lock); | 852 | mutex_lock(&probe_lock); |
| 705 | 853 | ||
| 706 | /* register as an event */ | 854 | /* Delete old (same name) event if it exists */ |
| 707 | old_tp = find_probe_event(tp->call.name, tp->call.class->system); | 855 | old_tp = find_trace_probe(tp->call.name, tp->call.class->system); |
| 708 | if (old_tp) { | 856 | if (old_tp) { |
| 709 | /* delete old event */ | ||
| 710 | unregister_trace_probe(old_tp); | 857 | unregister_trace_probe(old_tp); |
| 711 | free_trace_probe(old_tp); | 858 | free_trace_probe(old_tp); |
| 712 | } | 859 | } |
| 860 | |||
| 861 | /* Register new event */ | ||
| 713 | ret = register_probe_event(tp); | 862 | ret = register_probe_event(tp); |
| 714 | if (ret) { | 863 | if (ret) { |
| 715 | pr_warning("Failed to register probe event(%d)\n", ret); | 864 | pr_warning("Failed to register probe event(%d)\n", ret); |
| 716 | goto end; | 865 | goto end; |
| 717 | } | 866 | } |
| 718 | 867 | ||
| 719 | tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; | 868 | /* Register k*probe */ |
| 720 | if (probe_is_return(tp)) | 869 | ret = __register_trace_probe(tp); |
| 721 | ret = register_kretprobe(&tp->rp); | 870 | if (ret < 0) |
| 722 | else | ||
| 723 | ret = register_kprobe(&tp->rp.kp); | ||
| 724 | |||
| 725 | if (ret) { | ||
| 726 | pr_warning("Could not insert probe(%d)\n", ret); | ||
| 727 | if (ret == -EILSEQ) { | ||
| 728 | pr_warning("Probing address(0x%p) is not an " | ||
| 729 | "instruction boundary.\n", | ||
| 730 | tp->rp.kp.addr); | ||
| 731 | ret = -EINVAL; | ||
| 732 | } | ||
| 733 | unregister_probe_event(tp); | 871 | unregister_probe_event(tp); |
| 734 | } else | 872 | else |
| 735 | list_add_tail(&tp->list, &probe_list); | 873 | list_add_tail(&tp->list, &probe_list); |
| 874 | |||
| 736 | end: | 875 | end: |
| 737 | mutex_unlock(&probe_lock); | 876 | mutex_unlock(&probe_lock); |
| 738 | return ret; | 877 | return ret; |
| 739 | } | 878 | } |
| 740 | 879 | ||
| 880 | /* Module notifier callback, checking events on the module */ | ||
| 881 | static int trace_probe_module_callback(struct notifier_block *nb, | ||
| 882 | unsigned long val, void *data) | ||
| 883 | { | ||
| 884 | struct module *mod = data; | ||
| 885 | struct trace_probe *tp; | ||
| 886 | int ret; | ||
| 887 | |||
| 888 | if (val != MODULE_STATE_COMING) | ||
| 889 | return NOTIFY_DONE; | ||
| 890 | |||
| 891 | /* Update probes on coming module */ | ||
| 892 | mutex_lock(&probe_lock); | ||
| 893 | list_for_each_entry(tp, &probe_list, list) { | ||
| 894 | if (trace_probe_within_module(tp, mod)) { | ||
| 895 | __unregister_trace_probe(tp); | ||
| 896 | ret = __register_trace_probe(tp); | ||
| 897 | if (ret) | ||
| 898 | pr_warning("Failed to re-register probe %s on " | ||
| 899 | "%s: %d\n", | ||
| 900 | tp->call.name, mod->name, ret); | ||
| 901 | } | ||
| 902 | } | ||
| 903 | mutex_unlock(&probe_lock); | ||
| 904 | |||
| 905 | return NOTIFY_DONE; | ||
| 906 | } | ||
| 907 | |||
| 908 | static struct notifier_block trace_probe_module_nb = { | ||
| 909 | .notifier_call = trace_probe_module_callback, | ||
| 910 | .priority = 1 /* Invoked after kprobe module callback */ | ||
| 911 | }; | ||
| 912 | |||
| 741 | /* Split symbol and offset. */ | 913 | /* Split symbol and offset. */ |
| 742 | static int split_symbol_offset(char *symbol, unsigned long *offset) | 914 | static int split_symbol_offset(char *symbol, unsigned long *offset) |
| 743 | { | 915 | { |
| @@ -963,8 +1135,8 @@ static int create_trace_probe(int argc, char **argv) | |||
| 963 | { | 1135 | { |
| 964 | /* | 1136 | /* |
| 965 | * Argument syntax: | 1137 | * Argument syntax: |
| 966 | * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] | 1138 | * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] |
| 967 | * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] | 1139 | * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] |
| 968 | * Fetch args: | 1140 | * Fetch args: |
| 969 | * $retval : fetch return value | 1141 | * $retval : fetch return value |
| 970 | * $stack : fetch stack address | 1142 | * $stack : fetch stack address |
| @@ -1026,7 +1198,7 @@ static int create_trace_probe(int argc, char **argv) | |||
| 1026 | return -EINVAL; | 1198 | return -EINVAL; |
| 1027 | } | 1199 | } |
| 1028 | mutex_lock(&probe_lock); | 1200 | mutex_lock(&probe_lock); |
| 1029 | tp = find_probe_event(event, group); | 1201 | tp = find_trace_probe(event, group); |
| 1030 | if (!tp) { | 1202 | if (!tp) { |
| 1031 | mutex_unlock(&probe_lock); | 1203 | mutex_unlock(&probe_lock); |
| 1032 | pr_info("Event %s/%s doesn't exist.\n", group, event); | 1204 | pr_info("Event %s/%s doesn't exist.\n", group, event); |
| @@ -1145,7 +1317,7 @@ error: | |||
| 1145 | return ret; | 1317 | return ret; |
| 1146 | } | 1318 | } |
| 1147 | 1319 | ||
| 1148 | static void cleanup_all_probes(void) | 1320 | static void release_all_trace_probes(void) |
| 1149 | { | 1321 | { |
| 1150 | struct trace_probe *tp; | 1322 | struct trace_probe *tp; |
| 1151 | 1323 | ||
| @@ -1159,7 +1331,6 @@ static void cleanup_all_probes(void) | |||
| 1159 | mutex_unlock(&probe_lock); | 1331 | mutex_unlock(&probe_lock); |
| 1160 | } | 1332 | } |
| 1161 | 1333 | ||
| 1162 | |||
| 1163 | /* Probes listing interfaces */ | 1334 | /* Probes listing interfaces */ |
| 1164 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) | 1335 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) |
| 1165 | { | 1336 | { |
| @@ -1182,15 +1353,16 @@ static int probes_seq_show(struct seq_file *m, void *v) | |||
| 1182 | struct trace_probe *tp = v; | 1353 | struct trace_probe *tp = v; |
| 1183 | int i; | 1354 | int i; |
| 1184 | 1355 | ||
| 1185 | seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); | 1356 | seq_printf(m, "%c", trace_probe_is_return(tp) ? 'r' : 'p'); |
| 1186 | seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); | 1357 | seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); |
| 1187 | 1358 | ||
| 1188 | if (!tp->symbol) | 1359 | if (!tp->symbol) |
| 1189 | seq_printf(m, " 0x%p", tp->rp.kp.addr); | 1360 | seq_printf(m, " 0x%p", tp->rp.kp.addr); |
| 1190 | else if (tp->rp.kp.offset) | 1361 | else if (tp->rp.kp.offset) |
| 1191 | seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); | 1362 | seq_printf(m, " %s+%u", trace_probe_symbol(tp), |
| 1363 | tp->rp.kp.offset); | ||
| 1192 | else | 1364 | else |
| 1193 | seq_printf(m, " %s", probe_symbol(tp)); | 1365 | seq_printf(m, " %s", trace_probe_symbol(tp)); |
| 1194 | 1366 | ||
| 1195 | for (i = 0; i < tp->nr_args; i++) | 1367 | for (i = 0; i < tp->nr_args; i++) |
| 1196 | seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); | 1368 | seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); |
| @@ -1210,7 +1382,7 @@ static int probes_open(struct inode *inode, struct file *file) | |||
| 1210 | { | 1382 | { |
| 1211 | if ((file->f_mode & FMODE_WRITE) && | 1383 | if ((file->f_mode & FMODE_WRITE) && |
| 1212 | (file->f_flags & O_TRUNC)) | 1384 | (file->f_flags & O_TRUNC)) |
| 1213 | cleanup_all_probes(); | 1385 | release_all_trace_probes(); |
| 1214 | 1386 | ||
| 1215 | return seq_open(file, &probes_seq_op); | 1387 | return seq_open(file, &probes_seq_op); |
| 1216 | } | 1388 | } |
| @@ -1398,7 +1570,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | |||
| 1398 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1570 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
| 1399 | 1571 | ||
| 1400 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1572 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 1401 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1573 | trace_nowake_buffer_unlock_commit_regs(buffer, event, |
| 1574 | irq_flags, pc, regs); | ||
| 1402 | } | 1575 | } |
| 1403 | 1576 | ||
| 1404 | /* Kretprobe handler */ | 1577 | /* Kretprobe handler */ |
| @@ -1430,7 +1603,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, | |||
| 1430 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); | 1603 | store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); |
| 1431 | 1604 | ||
| 1432 | if (!filter_current_check_discard(buffer, call, entry, event)) | 1605 | if (!filter_current_check_discard(buffer, call, entry, event)) |
| 1433 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | 1606 | trace_nowake_buffer_unlock_commit_regs(buffer, event, |
| 1607 | irq_flags, pc, regs); | ||
| 1434 | } | 1608 | } |
| 1435 | 1609 | ||
| 1436 | /* Event entry printers */ | 1610 | /* Event entry printers */ |
| @@ -1512,30 +1686,6 @@ partial: | |||
| 1512 | return TRACE_TYPE_PARTIAL_LINE; | 1686 | return TRACE_TYPE_PARTIAL_LINE; |
| 1513 | } | 1687 | } |
| 1514 | 1688 | ||
| 1515 | static int probe_event_enable(struct ftrace_event_call *call) | ||
| 1516 | { | ||
| 1517 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
| 1518 | |||
| 1519 | tp->flags |= TP_FLAG_TRACE; | ||
| 1520 | if (probe_is_return(tp)) | ||
| 1521 | return enable_kretprobe(&tp->rp); | ||
| 1522 | else | ||
| 1523 | return enable_kprobe(&tp->rp.kp); | ||
| 1524 | } | ||
| 1525 | |||
| 1526 | static void probe_event_disable(struct ftrace_event_call *call) | ||
| 1527 | { | ||
| 1528 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
| 1529 | |||
| 1530 | tp->flags &= ~TP_FLAG_TRACE; | ||
| 1531 | if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { | ||
| 1532 | if (probe_is_return(tp)) | ||
| 1533 | disable_kretprobe(&tp->rp); | ||
| 1534 | else | ||
| 1535 | disable_kprobe(&tp->rp.kp); | ||
| 1536 | } | ||
| 1537 | } | ||
| 1538 | |||
| 1539 | #undef DEFINE_FIELD | 1689 | #undef DEFINE_FIELD |
| 1540 | #define DEFINE_FIELD(type, item, name, is_signed) \ | 1690 | #define DEFINE_FIELD(type, item, name, is_signed) \ |
| 1541 | do { \ | 1691 | do { \ |
| @@ -1597,7 +1747,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) | |||
| 1597 | 1747 | ||
| 1598 | const char *fmt, *arg; | 1748 | const char *fmt, *arg; |
| 1599 | 1749 | ||
| 1600 | if (!probe_is_return(tp)) { | 1750 | if (!trace_probe_is_return(tp)) { |
| 1601 | fmt = "(%lx)"; | 1751 | fmt = "(%lx)"; |
| 1602 | arg = "REC->" FIELD_STRING_IP; | 1752 | arg = "REC->" FIELD_STRING_IP; |
| 1603 | } else { | 1753 | } else { |
| @@ -1714,49 +1864,25 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, | |||
| 1714 | head = this_cpu_ptr(call->perf_events); | 1864 | head = this_cpu_ptr(call->perf_events); |
| 1715 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); | 1865 | perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); |
| 1716 | } | 1866 | } |
| 1717 | |||
| 1718 | static int probe_perf_enable(struct ftrace_event_call *call) | ||
| 1719 | { | ||
| 1720 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
| 1721 | |||
| 1722 | tp->flags |= TP_FLAG_PROFILE; | ||
| 1723 | |||
| 1724 | if (probe_is_return(tp)) | ||
| 1725 | return enable_kretprobe(&tp->rp); | ||
| 1726 | else | ||
| 1727 | return enable_kprobe(&tp->rp.kp); | ||
| 1728 | } | ||
| 1729 | |||
| 1730 | static void probe_perf_disable(struct ftrace_event_call *call) | ||
| 1731 | { | ||
| 1732 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
| 1733 | |||
| 1734 | tp->flags &= ~TP_FLAG_PROFILE; | ||
| 1735 | |||
| 1736 | if (!(tp->flags & TP_FLAG_TRACE)) { | ||
| 1737 | if (probe_is_return(tp)) | ||
| 1738 | disable_kretprobe(&tp->rp); | ||
| 1739 | else | ||
| 1740 | disable_kprobe(&tp->rp.kp); | ||
| 1741 | } | ||
| 1742 | } | ||
| 1743 | #endif /* CONFIG_PERF_EVENTS */ | 1867 | #endif /* CONFIG_PERF_EVENTS */ |
| 1744 | 1868 | ||
| 1745 | static __kprobes | 1869 | static __kprobes |
| 1746 | int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) | 1870 | int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) |
| 1747 | { | 1871 | { |
| 1872 | struct trace_probe *tp = (struct trace_probe *)event->data; | ||
| 1873 | |||
| 1748 | switch (type) { | 1874 | switch (type) { |
| 1749 | case TRACE_REG_REGISTER: | 1875 | case TRACE_REG_REGISTER: |
| 1750 | return probe_event_enable(event); | 1876 | return enable_trace_probe(tp, TP_FLAG_TRACE); |
| 1751 | case TRACE_REG_UNREGISTER: | 1877 | case TRACE_REG_UNREGISTER: |
| 1752 | probe_event_disable(event); | 1878 | disable_trace_probe(tp, TP_FLAG_TRACE); |
| 1753 | return 0; | 1879 | return 0; |
| 1754 | 1880 | ||
| 1755 | #ifdef CONFIG_PERF_EVENTS | 1881 | #ifdef CONFIG_PERF_EVENTS |
| 1756 | case TRACE_REG_PERF_REGISTER: | 1882 | case TRACE_REG_PERF_REGISTER: |
| 1757 | return probe_perf_enable(event); | 1883 | return enable_trace_probe(tp, TP_FLAG_PROFILE); |
| 1758 | case TRACE_REG_PERF_UNREGISTER: | 1884 | case TRACE_REG_PERF_UNREGISTER: |
| 1759 | probe_perf_disable(event); | 1885 | disable_trace_probe(tp, TP_FLAG_PROFILE); |
| 1760 | return 0; | 1886 | return 0; |
| 1761 | #endif | 1887 | #endif |
| 1762 | } | 1888 | } |
| @@ -1806,7 +1932,7 @@ static int register_probe_event(struct trace_probe *tp) | |||
| 1806 | 1932 | ||
| 1807 | /* Initialize ftrace_event_call */ | 1933 | /* Initialize ftrace_event_call */ |
| 1808 | INIT_LIST_HEAD(&call->class->fields); | 1934 | INIT_LIST_HEAD(&call->class->fields); |
| 1809 | if (probe_is_return(tp)) { | 1935 | if (trace_probe_is_return(tp)) { |
| 1810 | call->event.funcs = &kretprobe_funcs; | 1936 | call->event.funcs = &kretprobe_funcs; |
| 1811 | call->class->define_fields = kretprobe_event_define_fields; | 1937 | call->class->define_fields = kretprobe_event_define_fields; |
| 1812 | } else { | 1938 | } else { |
| @@ -1845,6 +1971,9 @@ static __init int init_kprobe_trace(void) | |||
| 1845 | struct dentry *d_tracer; | 1971 | struct dentry *d_tracer; |
| 1846 | struct dentry *entry; | 1972 | struct dentry *entry; |
| 1847 | 1973 | ||
| 1974 | if (register_module_notifier(&trace_probe_module_nb)) | ||
| 1975 | return -EINVAL; | ||
| 1976 | |||
| 1848 | d_tracer = tracing_init_dentry(); | 1977 | d_tracer = tracing_init_dentry(); |
| 1849 | if (!d_tracer) | 1978 | if (!d_tracer) |
| 1850 | return 0; | 1979 | return 0; |
| @@ -1871,8 +2000,12 @@ fs_initcall(init_kprobe_trace); | |||
| 1871 | 2000 | ||
| 1872 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 2001 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
| 1873 | 2002 | ||
| 1874 | static int kprobe_trace_selftest_target(int a1, int a2, int a3, | 2003 | /* |
| 1875 | int a4, int a5, int a6) | 2004 | * The "__used" keeps gcc from removing the function symbol |
| 2005 | * from the kallsyms table. | ||
| 2006 | */ | ||
| 2007 | static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, | ||
| 2008 | int a4, int a5, int a6) | ||
| 1876 | { | 2009 | { |
| 1877 | return a1 + a2 + a3 + a4 + a5 + a6; | 2010 | return a1 + a2 + a3 + a4 + a5 + a6; |
| 1878 | } | 2011 | } |
| @@ -1894,12 +2027,12 @@ static __init int kprobe_trace_self_tests_init(void) | |||
| 1894 | warn++; | 2027 | warn++; |
| 1895 | } else { | 2028 | } else { |
| 1896 | /* Enable trace point */ | 2029 | /* Enable trace point */ |
| 1897 | tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); | 2030 | tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); |
| 1898 | if (WARN_ON_ONCE(tp == NULL)) { | 2031 | if (WARN_ON_ONCE(tp == NULL)) { |
| 1899 | pr_warning("error on getting new probe.\n"); | 2032 | pr_warning("error on getting new probe.\n"); |
| 1900 | warn++; | 2033 | warn++; |
| 1901 | } else | 2034 | } else |
| 1902 | probe_event_enable(&tp->call); | 2035 | enable_trace_probe(tp, TP_FLAG_TRACE); |
| 1903 | } | 2036 | } |
| 1904 | 2037 | ||
| 1905 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | 2038 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " |
| @@ -1909,12 +2042,12 @@ static __init int kprobe_trace_self_tests_init(void) | |||
| 1909 | warn++; | 2042 | warn++; |
| 1910 | } else { | 2043 | } else { |
| 1911 | /* Enable trace point */ | 2044 | /* Enable trace point */ |
| 1912 | tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); | 2045 | tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); |
| 1913 | if (WARN_ON_ONCE(tp == NULL)) { | 2046 | if (WARN_ON_ONCE(tp == NULL)) { |
| 1914 | pr_warning("error on getting new probe.\n"); | 2047 | pr_warning("error on getting new probe.\n"); |
| 1915 | warn++; | 2048 | warn++; |
| 1916 | } else | 2049 | } else |
| 1917 | probe_event_enable(&tp->call); | 2050 | enable_trace_probe(tp, TP_FLAG_TRACE); |
| 1918 | } | 2051 | } |
| 1919 | 2052 | ||
| 1920 | if (warn) | 2053 | if (warn) |
| @@ -1935,7 +2068,7 @@ static __init int kprobe_trace_self_tests_init(void) | |||
| 1935 | } | 2068 | } |
| 1936 | 2069 | ||
| 1937 | end: | 2070 | end: |
| 1938 | cleanup_all_probes(); | 2071 | release_all_trace_probes(); |
| 1939 | if (warn) | 2072 | if (warn) |
| 1940 | pr_cont("NG: Some tests are failed. Please check them.\n"); | 2073 | pr_cont("NG: Some tests are failed. Please check them.\n"); |
| 1941 | else | 2074 | else |
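In the trace_kprobe.c rework above, a probe now carries a TP_FLAG_REGISTERED bit alongside TP_FLAG_TRACE/TP_FLAG_PROFILE, enable_trace_probe()/disable_trace_probe() arm and disarm the underlying k*probe only when those states line up, and a module notifier re-registers probes whose target module has just loaded. The sketch below models only the flag bookkeeping in plain userspace C; the names are invented and the printf calls stand in for the real enable_kprobe()/disable_kprobe() calls.

/* Stand-in for the TP_FLAG_* bookkeeping: enabling arms the probe only if
 * it is registered; disabling disarms it only when the last user (trace or
 * perf) goes away. */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_TRACE	0x1
#define FLAG_PROFILE	0x2
#define FLAG_REGISTERED	0x4

struct probe {
	const char *name;
	unsigned int flags;
};

static bool probe_is_enabled(const struct probe *p)
{
	return p->flags & (FLAG_TRACE | FLAG_PROFILE);
}

static void probe_enable(struct probe *p, unsigned int flag)
{
	p->flags |= flag;
	if (probe_is_enabled(p) && (p->flags & FLAG_REGISTERED))
		printf("%s: armed\n", p->name);		/* enable_kprobe() here */
}

static void probe_disable(struct probe *p, unsigned int flag)
{
	p->flags &= ~flag;
	if (!probe_is_enabled(p) && (p->flags & FLAG_REGISTERED))
		printf("%s: disarmed\n", p->name);	/* disable_kprobe() here */
}

int main(void)
{
	struct probe p = { .name = "testprobe", .flags = FLAG_REGISTERED };

	probe_enable(&p, FLAG_TRACE);	/* prints "armed" */
	probe_enable(&p, FLAG_PROFILE);	/* prints "armed" again (refcounted in the kernel) */
	probe_disable(&p, FLAG_TRACE);	/* perf still holds it: no output */
	probe_disable(&p, FLAG_PROFILE);	/* last user gone: prints "disarmed" */
	return 0;
}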
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 017fa376505d..fd3c8aae55e5 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c | |||
| @@ -12,7 +12,7 @@ | |||
| 12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 13 | #include <linux/time.h> | 13 | #include <linux/time.h> |
| 14 | 14 | ||
| 15 | #include <asm/atomic.h> | 15 | #include <linux/atomic.h> |
| 16 | 16 | ||
| 17 | #include "trace.h" | 17 | #include "trace.h" |
| 18 | #include "trace_output.h" | 18 | #include "trace_output.h" |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 456be9063c2d..51999309a6cf 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, | |||
| 353 | } | 353 | } |
| 354 | EXPORT_SYMBOL(ftrace_print_symbols_seq); | 354 | EXPORT_SYMBOL(ftrace_print_symbols_seq); |
| 355 | 355 | ||
| 356 | #if BITS_PER_LONG == 32 | ||
| 357 | const char * | ||
| 358 | ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, | ||
| 359 | const struct trace_print_flags_u64 *symbol_array) | ||
| 360 | { | ||
| 361 | int i; | ||
| 362 | const char *ret = p->buffer + p->len; | ||
| 363 | |||
| 364 | for (i = 0; symbol_array[i].name; i++) { | ||
| 365 | |||
| 366 | if (val != symbol_array[i].mask) | ||
| 367 | continue; | ||
| 368 | |||
| 369 | trace_seq_puts(p, symbol_array[i].name); | ||
| 370 | break; | ||
| 371 | } | ||
| 372 | |||
| 373 | if (!p->len) | ||
| 374 | trace_seq_printf(p, "0x%llx", val); | ||
| 375 | |||
| 376 | trace_seq_putc(p, 0); | ||
| 377 | |||
| 378 | return ret; | ||
| 379 | } | ||
| 380 | EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); | ||
| 381 | #endif | ||
| 382 | |||
| 356 | const char * | 383 | const char * |
| 357 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 384 | ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) |
| 358 | { | 385 | { |
| @@ -830,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); | |||
| 830 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, | 857 | enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, |
| 831 | struct trace_event *event) | 858 | struct trace_event *event) |
| 832 | { | 859 | { |
| 860 | if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) | ||
| 861 | return TRACE_TYPE_PARTIAL_LINE; | ||
| 862 | |||
| 833 | return TRACE_TYPE_HANDLED; | 863 | return TRACE_TYPE_HANDLED; |
| 834 | } | 864 | } |
| 835 | 865 | ||
| @@ -1077,19 +1107,20 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
| 1077 | { | 1107 | { |
| 1078 | struct stack_entry *field; | 1108 | struct stack_entry *field; |
| 1079 | struct trace_seq *s = &iter->seq; | 1109 | struct trace_seq *s = &iter->seq; |
| 1080 | int i; | 1110 | unsigned long *p; |
| 1111 | unsigned long *end; | ||
| 1081 | 1112 | ||
| 1082 | trace_assign_type(field, iter->ent); | 1113 | trace_assign_type(field, iter->ent); |
| 1114 | end = (unsigned long *)((long)iter->ent + iter->ent_size); | ||
| 1083 | 1115 | ||
| 1084 | if (!trace_seq_puts(s, "<stack trace>\n")) | 1116 | if (!trace_seq_puts(s, "<stack trace>\n")) |
| 1085 | goto partial; | 1117 | goto partial; |
| 1086 | for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { | 1118 | |
| 1087 | if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) | 1119 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { |
| 1088 | break; | ||
| 1089 | if (!trace_seq_puts(s, " => ")) | 1120 | if (!trace_seq_puts(s, " => ")) |
| 1090 | goto partial; | 1121 | goto partial; |
| 1091 | 1122 | ||
| 1092 | if (!seq_print_ip_sym(s, field->caller[i], flags)) | 1123 | if (!seq_print_ip_sym(s, *p, flags)) |
| 1093 | goto partial; | 1124 | goto partial; |
| 1094 | if (!trace_seq_puts(s, "\n")) | 1125 | if (!trace_seq_puts(s, "\n")) |
| 1095 | goto partial; | 1126 | goto partial; |
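The stack-trace printer above no longer assumes a fixed FTRACE_STACK_ENTRIES array; it walks the saved addresses until it hits ULONG_MAX or runs past the record, whose length comes from iter->ent_size. A condensed sketch of that bounded-walk idiom, with illustrative names rather than the exact kernel code:

#include <linux/kernel.h>
#include <linux/ftrace_event.h>	/* struct trace_seq, trace_seq_printf() */

/* print return addresses stored inline in a variable-sized record */
static void print_callers(struct trace_seq *s, void *record,
			  int record_size, unsigned long *callers)
{
	unsigned long *end = (unsigned long *)((char *)record + record_size);
	unsigned long *p;

	for (p = callers; p < end && *p != ULONG_MAX; p++)
		trace_seq_printf(s, " => %pS\n", (void *)*p);
}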
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf0..1f06468a10d7 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c | |||
| @@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex); | |||
| 32 | 32 | ||
| 33 | struct trace_bprintk_fmt { | 33 | struct trace_bprintk_fmt { |
| 34 | struct list_head list; | 34 | struct list_head list; |
| 35 | char fmt[0]; | 35 | const char *fmt; |
| 36 | }; | 36 | }; |
| 37 | 37 | ||
| 38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) | 38 | static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) |
| @@ -49,6 +49,7 @@ static | |||
| 49 | void hold_module_trace_bprintk_format(const char **start, const char **end) | 49 | void hold_module_trace_bprintk_format(const char **start, const char **end) |
| 50 | { | 50 | { |
| 51 | const char **iter; | 51 | const char **iter; |
| 52 | char *fmt; | ||
| 52 | 53 | ||
| 53 | mutex_lock(&btrace_mutex); | 54 | mutex_lock(&btrace_mutex); |
| 54 | for (iter = start; iter < end; iter++) { | 55 | for (iter = start; iter < end; iter++) { |
| @@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) | |||
| 58 | continue; | 59 | continue; |
| 59 | } | 60 | } |
| 60 | 61 | ||
| 61 | tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt) | 62 | tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); |
| 62 | + strlen(*iter) + 1, GFP_KERNEL); | 63 | if (tb_fmt) |
| 63 | if (tb_fmt) { | 64 | fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); |
| 65 | if (tb_fmt && fmt) { | ||
| 64 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); | 66 | list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); |
| 65 | strcpy(tb_fmt->fmt, *iter); | 67 | strcpy(fmt, *iter); |
| 68 | tb_fmt->fmt = fmt; | ||
| 66 | *iter = tb_fmt->fmt; | 69 | *iter = tb_fmt->fmt; |
| 67 | } else | 70 | } else { |
| 71 | kfree(tb_fmt); | ||
| 68 | *iter = NULL; | 72 | *iter = NULL; |
| 73 | } | ||
| 69 | } | 74 | } |
| 70 | mutex_unlock(&btrace_mutex); | 75 | mutex_unlock(&btrace_mutex); |
| 71 | } | 76 | } |
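Because ->fmt is now a plain pointer rather than a flexible array, each entry owns two allocations that live and die together. The open-coded kmalloc()+strcpy() above is equivalent to the usual kstrdup() pattern, sketched here with an illustrative fmt_entry type:

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/string.h>

struct fmt_entry {			/* stand-in for trace_bprintk_fmt */
	struct list_head list;
	const char *fmt;
};

static struct fmt_entry *add_format(struct list_head *head, const char *src)
{
	struct fmt_entry *e = kmalloc(sizeof(*e), GFP_KERNEL);

	if (!e)
		return NULL;
	e->fmt = kstrdup(src, GFP_KERNEL);
	if (!e->fmt) {
		kfree(e);		/* never leak the node without its string */
		return NULL;
	}
	list_add_tail(&e->list, head);
	return e;
}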
| @@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self, | |||
| 84 | return 0; | 89 | return 0; |
| 85 | } | 90 | } |
| 86 | 91 | ||
| 92 | /* | ||
| 93 | * The debugfs/tracing/printk_formats file maps the addresses with | ||
| 94 | * the ASCII formats that are used in the bprintk events in the | ||
| 95 | * buffer. For userspace tools to be able to decode the events from | ||
| 96 | * the buffer, they need to be able to map the address with the format. | ||
| 97 | * | ||
| 98 | * The addresses of the bprintk formats are in their own section | ||
| 99 | * __trace_printk_fmt. But for modules we copy them into a link list. | ||
| 100 | * The code to print the formats and their addresses passes around the | ||
| 101 | * address of the fmt string. If the fmt address passed into the seq | ||
| 102 | * functions is within the kernel core __trace_printk_fmt section, then | ||
| 103 | * it simply uses the next pointer in the list. | ||
| 104 | * | ||
| 105 | * When the fmt pointer is outside the kernel core __trace_printk_fmt | ||
| 106 | * section, then we need to read the link list pointers. The trick is | ||
| 107 | * we pass the address of the string to the seq function just like | ||
| 108 | * we do for the kernel core formats. To get back the structure that | ||
| 109 | holds the format, we simply use container_of() and then go to the | ||
| 110 | * next format in the list. | ||
| 111 | */ | ||
| 112 | static const char ** | ||
| 113 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
| 114 | { | ||
| 115 | struct trace_bprintk_fmt *mod_fmt; | ||
| 116 | |||
| 117 | if (list_empty(&trace_bprintk_fmt_list)) | ||
| 118 | return NULL; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * v will point to the address of the fmt record from t_next | ||
| 122 | * v will be NULL from t_start. | ||
| 123 | * If this is the first pointer or called from start | ||
| 124 | * then we need to walk the list. | ||
| 125 | */ | ||
| 126 | if (!v || start_index == *pos) { | ||
| 127 | struct trace_bprintk_fmt *p; | ||
| 128 | |||
| 129 | /* search the module list */ | ||
| 130 | list_for_each_entry(p, &trace_bprintk_fmt_list, list) { | ||
| 131 | if (start_index == *pos) | ||
| 132 | return &p->fmt; | ||
| 133 | start_index++; | ||
| 134 | } | ||
| 135 | /* pos > index */ | ||
| 136 | return NULL; | ||
| 137 | } | ||
| 138 | |||
| 139 | /* | ||
| 140 | * v points to the address of the fmt field in the mod list | ||
| 141 | * structure that holds the module print format. | ||
| 142 | */ | ||
| 143 | mod_fmt = container_of(v, typeof(*mod_fmt), fmt); | ||
| 144 | if (mod_fmt->list.next == &trace_bprintk_fmt_list) | ||
| 145 | return NULL; | ||
| 146 | |||
| 147 | mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); | ||
| 148 | |||
| 149 | return &mod_fmt->fmt; | ||
| 150 | } | ||
| 151 | |||
| 152 | static void format_mod_start(void) | ||
| 153 | { | ||
| 154 | mutex_lock(&btrace_mutex); | ||
| 155 | } | ||
| 156 | |||
| 157 | static void format_mod_stop(void) | ||
| 158 | { | ||
| 159 | mutex_unlock(&btrace_mutex); | ||
| 160 | } | ||
| 161 | |||
| 87 | #else /* !CONFIG_MODULES */ | 162 | #else /* !CONFIG_MODULES */ |
| 88 | __init static int | 163 | __init static int |
| 89 | module_trace_bprintk_format_notify(struct notifier_block *self, | 164 | module_trace_bprintk_format_notify(struct notifier_block *self, |
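The comment block documents the trick this relies on: the seq iterator only ever carries a const char ** that points at a ->fmt member, and container_of() turns that back into the list node so the walk can advance. The essential step, condensed (illustrative, not the exact kernel code):

static const char **next_mod_fmt(const char **v, struct list_head *head)
{
	struct trace_bprintk_fmt *cur =
		container_of(v, struct trace_bprintk_fmt, fmt);

	if (cur->list.next == head)
		return NULL;		/* ran off the end of the module list */

	cur = list_entry(cur->list.next, struct trace_bprintk_fmt, list);
	return &cur->fmt;
}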
| @@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self, | |||
| 91 | { | 166 | { |
| 92 | return 0; | 167 | return 0; |
| 93 | } | 168 | } |
| 169 | static inline const char ** | ||
| 170 | find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) | ||
| 171 | { | ||
| 172 | return NULL; | ||
| 173 | } | ||
| 174 | static inline void format_mod_start(void) { } | ||
| 175 | static inline void format_mod_stop(void) { } | ||
| 94 | #endif /* CONFIG_MODULES */ | 176 | #endif /* CONFIG_MODULES */ |
| 95 | 177 | ||
| 96 | 178 | ||
| @@ -153,20 +235,30 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) | |||
| 153 | } | 235 | } |
| 154 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); | 236 | EXPORT_SYMBOL_GPL(__ftrace_vprintk); |
| 155 | 237 | ||
| 238 | static const char **find_next(void *v, loff_t *pos) | ||
| 239 | { | ||
| 240 | const char **fmt = v; | ||
| 241 | int start_index; | ||
| 242 | |||
| 243 | start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; | ||
| 244 | |||
| 245 | if (*pos < start_index) | ||
| 246 | return __start___trace_bprintk_fmt + *pos; | ||
| 247 | |||
| 248 | return find_next_mod_format(start_index, v, fmt, pos); | ||
| 249 | } | ||
| 250 | |||
| 156 | static void * | 251 | static void * |
| 157 | t_start(struct seq_file *m, loff_t *pos) | 252 | t_start(struct seq_file *m, loff_t *pos) |
| 158 | { | 253 | { |
| 159 | const char **fmt = __start___trace_bprintk_fmt + *pos; | 254 | format_mod_start(); |
| 160 | 255 | return find_next(NULL, pos); | |
| 161 | if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt) | ||
| 162 | return NULL; | ||
| 163 | return fmt; | ||
| 164 | } | 256 | } |
| 165 | 257 | ||
| 166 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) | 258 | static void *t_next(struct seq_file *m, void * v, loff_t *pos) |
| 167 | { | 259 | { |
| 168 | (*pos)++; | 260 | (*pos)++; |
| 169 | return t_start(m, pos); | 261 | return find_next(v, pos); |
| 170 | } | 262 | } |
| 171 | 263 | ||
| 172 | static int t_show(struct seq_file *m, void *v) | 264 | static int t_show(struct seq_file *m, void *v) |
| @@ -205,6 +297,7 @@ static int t_show(struct seq_file *m, void *v) | |||
| 205 | 297 | ||
| 206 | static void t_stop(struct seq_file *m, void *p) | 298 | static void t_stop(struct seq_file *m, void *p) |
| 207 | { | 299 | { |
| 300 | format_mod_stop(); | ||
| 208 | } | 301 | } |
| 209 | 302 | ||
| 210 | static const struct seq_operations show_format_seq_ops = { | 303 | static const struct seq_operations show_format_seq_ops = { |
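With the module list in play, btrace_mutex now has to be held across the whole read: t_start() takes it, t_stop() drops it, and one find_next() helper serves both ->start and ->next. The same shape, reduced to a self-contained seq_file skeleton over a static table (names are hypothetical):

#include <linux/seq_file.h>
#include <linux/mutex.h>
#include <linux/kernel.h>

static DEFINE_MUTEX(my_lock);
static const char *my_items[] = { "alpha", "beta", "gamma" };

static void *my_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&my_lock);		/* held for the whole iteration */
	return *pos < ARRAY_SIZE(my_items) ? (void *)&my_items[*pos] : NULL;
}

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return *pos < ARRAY_SIZE(my_items) ? (void *)&my_items[*pos] : NULL;
}

static void my_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&my_lock);		/* mirrors format_mod_stop() */
}

static int my_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char **)v);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start = my_start,
	.next  = my_next,
	.stop  = my_stop,
	.show  = my_show,
};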
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 7319559ed59f..e4a70c0c71b6 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 129 | static struct ftrace_ops trace_ops __read_mostly = | 129 | static struct ftrace_ops trace_ops __read_mostly = |
| 130 | { | 130 | { |
| 131 | .func = wakeup_tracer_call, | 131 | .func = wakeup_tracer_call, |
| 132 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
| 132 | }; | 133 | }; |
| 133 | #endif /* CONFIG_FUNCTION_TRACER */ | 134 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 134 | 135 | ||
| @@ -226,7 +227,9 @@ static void wakeup_trace_close(struct trace_iterator *iter) | |||
| 226 | graph_trace_close(iter); | 227 | graph_trace_close(iter); |
| 227 | } | 228 | } |
| 228 | 229 | ||
| 229 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) | 230 | #define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC | \ |
| 231 | TRACE_GRAPH_PRINT_ABS_TIME | \ | ||
| 232 | TRACE_GRAPH_PRINT_DURATION) | ||
| 230 | 233 | ||
| 231 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) | 234 | static enum print_line_t wakeup_print_line(struct trace_iterator *iter) |
| 232 | { | 235 | { |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 659732eba07c..288541f977fb 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) | |||
| 101 | 101 | ||
| 102 | #ifdef CONFIG_DYNAMIC_FTRACE | 102 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 103 | 103 | ||
| 104 | static int trace_selftest_test_probe1_cnt; | ||
| 105 | static void trace_selftest_test_probe1_func(unsigned long ip, | ||
| 106 | unsigned long pip) | ||
| 107 | { | ||
| 108 | trace_selftest_test_probe1_cnt++; | ||
| 109 | } | ||
| 110 | |||
| 111 | static int trace_selftest_test_probe2_cnt; | ||
| 112 | static void trace_selftest_test_probe2_func(unsigned long ip, | ||
| 113 | unsigned long pip) | ||
| 114 | { | ||
| 115 | trace_selftest_test_probe2_cnt++; | ||
| 116 | } | ||
| 117 | |||
| 118 | static int trace_selftest_test_probe3_cnt; | ||
| 119 | static void trace_selftest_test_probe3_func(unsigned long ip, | ||
| 120 | unsigned long pip) | ||
| 121 | { | ||
| 122 | trace_selftest_test_probe3_cnt++; | ||
| 123 | } | ||
| 124 | |||
| 125 | static int trace_selftest_test_global_cnt; | ||
| 126 | static void trace_selftest_test_global_func(unsigned long ip, | ||
| 127 | unsigned long pip) | ||
| 128 | { | ||
| 129 | trace_selftest_test_global_cnt++; | ||
| 130 | } | ||
| 131 | |||
| 132 | static int trace_selftest_test_dyn_cnt; | ||
| 133 | static void trace_selftest_test_dyn_func(unsigned long ip, | ||
| 134 | unsigned long pip) | ||
| 135 | { | ||
| 136 | trace_selftest_test_dyn_cnt++; | ||
| 137 | } | ||
| 138 | |||
| 139 | static struct ftrace_ops test_probe1 = { | ||
| 140 | .func = trace_selftest_test_probe1_func, | ||
| 141 | }; | ||
| 142 | |||
| 143 | static struct ftrace_ops test_probe2 = { | ||
| 144 | .func = trace_selftest_test_probe2_func, | ||
| 145 | }; | ||
| 146 | |||
| 147 | static struct ftrace_ops test_probe3 = { | ||
| 148 | .func = trace_selftest_test_probe3_func, | ||
| 149 | }; | ||
| 150 | |||
| 151 | static struct ftrace_ops test_global = { | ||
| 152 | .func = trace_selftest_test_global_func, | ||
| 153 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
| 154 | }; | ||
| 155 | |||
| 156 | static void print_counts(void) | ||
| 157 | { | ||
| 158 | printk("(%d %d %d %d %d) ", | ||
| 159 | trace_selftest_test_probe1_cnt, | ||
| 160 | trace_selftest_test_probe2_cnt, | ||
| 161 | trace_selftest_test_probe3_cnt, | ||
| 162 | trace_selftest_test_global_cnt, | ||
| 163 | trace_selftest_test_dyn_cnt); | ||
| 164 | } | ||
| 165 | |||
| 166 | static void reset_counts(void) | ||
| 167 | { | ||
| 168 | trace_selftest_test_probe1_cnt = 0; | ||
| 169 | trace_selftest_test_probe2_cnt = 0; | ||
| 170 | trace_selftest_test_probe3_cnt = 0; | ||
| 171 | trace_selftest_test_global_cnt = 0; | ||
| 172 | trace_selftest_test_dyn_cnt = 0; | ||
| 173 | } | ||
| 174 | |||
| 175 | static int trace_selftest_ops(int cnt) | ||
| 176 | { | ||
| 177 | int save_ftrace_enabled = ftrace_enabled; | ||
| 178 | struct ftrace_ops *dyn_ops; | ||
| 179 | char *func1_name; | ||
| 180 | char *func2_name; | ||
| 181 | int len1; | ||
| 182 | int len2; | ||
| 183 | int ret = -1; | ||
| 184 | |||
| 185 | printk(KERN_CONT "PASSED\n"); | ||
| 186 | pr_info("Testing dynamic ftrace ops #%d: ", cnt); | ||
| 187 | |||
| 188 | ftrace_enabled = 1; | ||
| 189 | reset_counts(); | ||
| 190 | |||
| 191 | /* Handle PPC64 '.' name */ | ||
| 192 | func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
| 193 | func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); | ||
| 194 | len1 = strlen(func1_name); | ||
| 195 | len2 = strlen(func2_name); | ||
| 196 | |||
| 197 | /* | ||
| 198 | * Probe 1 will trace function 1. | ||
| 199 | * Probe 2 will trace function 2. | ||
| 200 | * Probe 3 will trace functions 1 and 2. | ||
| 201 | */ | ||
| 202 | ftrace_set_filter(&test_probe1, func1_name, len1, 1); | ||
| 203 | ftrace_set_filter(&test_probe2, func2_name, len2, 1); | ||
| 204 | ftrace_set_filter(&test_probe3, func1_name, len1, 1); | ||
| 205 | ftrace_set_filter(&test_probe3, func2_name, len2, 0); | ||
| 206 | |||
| 207 | register_ftrace_function(&test_probe1); | ||
| 208 | register_ftrace_function(&test_probe2); | ||
| 209 | register_ftrace_function(&test_probe3); | ||
| 210 | register_ftrace_function(&test_global); | ||
| 211 | |||
| 212 | DYN_FTRACE_TEST_NAME(); | ||
| 213 | |||
| 214 | print_counts(); | ||
| 215 | |||
| 216 | if (trace_selftest_test_probe1_cnt != 1) | ||
| 217 | goto out; | ||
| 218 | if (trace_selftest_test_probe2_cnt != 0) | ||
| 219 | goto out; | ||
| 220 | if (trace_selftest_test_probe3_cnt != 1) | ||
| 221 | goto out; | ||
| 222 | if (trace_selftest_test_global_cnt == 0) | ||
| 223 | goto out; | ||
| 224 | |||
| 225 | DYN_FTRACE_TEST_NAME2(); | ||
| 226 | |||
| 227 | print_counts(); | ||
| 228 | |||
| 229 | if (trace_selftest_test_probe1_cnt != 1) | ||
| 230 | goto out; | ||
| 231 | if (trace_selftest_test_probe2_cnt != 1) | ||
| 232 | goto out; | ||
| 233 | if (trace_selftest_test_probe3_cnt != 2) | ||
| 234 | goto out; | ||
| 235 | |||
| 236 | /* Add a dynamic probe */ | ||
| 237 | dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); | ||
| 238 | if (!dyn_ops) { | ||
| 239 | printk("MEMORY ERROR "); | ||
| 240 | goto out; | ||
| 241 | } | ||
| 242 | |||
| 243 | dyn_ops->func = trace_selftest_test_dyn_func; | ||
| 244 | |||
| 245 | register_ftrace_function(dyn_ops); | ||
| 246 | |||
| 247 | trace_selftest_test_global_cnt = 0; | ||
| 248 | |||
| 249 | DYN_FTRACE_TEST_NAME(); | ||
| 250 | |||
| 251 | print_counts(); | ||
| 252 | |||
| 253 | if (trace_selftest_test_probe1_cnt != 2) | ||
| 254 | goto out_free; | ||
| 255 | if (trace_selftest_test_probe2_cnt != 1) | ||
| 256 | goto out_free; | ||
| 257 | if (trace_selftest_test_probe3_cnt != 3) | ||
| 258 | goto out_free; | ||
| 259 | if (trace_selftest_test_global_cnt == 0) | ||
| 260 | goto out; | ||
| 261 | if (trace_selftest_test_dyn_cnt == 0) | ||
| 262 | goto out_free; | ||
| 263 | |||
| 264 | DYN_FTRACE_TEST_NAME2(); | ||
| 265 | |||
| 266 | print_counts(); | ||
| 267 | |||
| 268 | if (trace_selftest_test_probe1_cnt != 2) | ||
| 269 | goto out_free; | ||
| 270 | if (trace_selftest_test_probe2_cnt != 2) | ||
| 271 | goto out_free; | ||
| 272 | if (trace_selftest_test_probe3_cnt != 4) | ||
| 273 | goto out_free; | ||
| 274 | |||
| 275 | ret = 0; | ||
| 276 | out_free: | ||
| 277 | unregister_ftrace_function(dyn_ops); | ||
| 278 | kfree(dyn_ops); | ||
| 279 | |||
| 280 | out: | ||
| 281 | /* Purposely unregister in the same order */ | ||
| 282 | unregister_ftrace_function(&test_probe1); | ||
| 283 | unregister_ftrace_function(&test_probe2); | ||
| 284 | unregister_ftrace_function(&test_probe3); | ||
| 285 | unregister_ftrace_function(&test_global); | ||
| 286 | |||
| 287 | /* Make sure everything is off */ | ||
| 288 | reset_counts(); | ||
| 289 | DYN_FTRACE_TEST_NAME(); | ||
| 290 | DYN_FTRACE_TEST_NAME(); | ||
| 291 | |||
| 292 | if (trace_selftest_test_probe1_cnt || | ||
| 293 | trace_selftest_test_probe2_cnt || | ||
| 294 | trace_selftest_test_probe3_cnt || | ||
| 295 | trace_selftest_test_global_cnt || | ||
| 296 | trace_selftest_test_dyn_cnt) | ||
| 297 | ret = -1; | ||
| 298 | |||
| 299 | ftrace_enabled = save_ftrace_enabled; | ||
| 300 | |||
| 301 | return ret; | ||
| 302 | } | ||
| 303 | |||
| 104 | /* Test dynamic code modification and ftrace filters */ | 304 | /* Test dynamic code modification and ftrace filters */ |
| 105 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | 305 | int trace_selftest_startup_dynamic_tracing(struct tracer *trace, |
| 106 | struct trace_array *tr, | 306 | struct trace_array *tr, |
| @@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 131 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | 331 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); |
| 132 | 332 | ||
| 133 | /* filter only on our function */ | 333 | /* filter only on our function */ |
| 134 | ftrace_set_filter(func_name, strlen(func_name), 1); | 334 | ftrace_set_global_filter(func_name, strlen(func_name), 1); |
| 135 | 335 | ||
| 136 | /* enable tracing */ | 336 | /* enable tracing */ |
| 137 | ret = tracer_init(trace, tr); | 337 | ret = tracer_init(trace, tr); |
| @@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 166 | 366 | ||
| 167 | /* check the trace buffer */ | 367 | /* check the trace buffer */ |
| 168 | ret = trace_test_buffer(tr, &count); | 368 | ret = trace_test_buffer(tr, &count); |
| 169 | trace->reset(tr); | ||
| 170 | tracing_start(); | 369 | tracing_start(); |
| 171 | 370 | ||
| 172 | /* we should only have one item */ | 371 | /* we should only have one item */ |
| 173 | if (!ret && count != 1) { | 372 | if (!ret && count != 1) { |
| 373 | trace->reset(tr); | ||
| 174 | printk(KERN_CONT ".. filter failed count=%ld ..", count); | 374 | printk(KERN_CONT ".. filter failed count=%ld ..", count); |
| 175 | ret = -1; | 375 | ret = -1; |
| 176 | goto out; | 376 | goto out; |
| 177 | } | 377 | } |
| 178 | 378 | ||
| 379 | /* Test the ops with global tracing running */ | ||
| 380 | ret = trace_selftest_ops(1); | ||
| 381 | trace->reset(tr); | ||
| 382 | |||
| 179 | out: | 383 | out: |
| 180 | ftrace_enabled = save_ftrace_enabled; | 384 | ftrace_enabled = save_ftrace_enabled; |
| 181 | tracer_enabled = save_tracer_enabled; | 385 | tracer_enabled = save_tracer_enabled; |
| 182 | 386 | ||
| 183 | /* Enable tracing on all functions again */ | 387 | /* Enable tracing on all functions again */ |
| 184 | ftrace_set_filter(NULL, 0, 1); | 388 | ftrace_set_global_filter(NULL, 0, 1); |
| 389 | |||
| 390 | /* Test the ops with global tracing off */ | ||
| 391 | if (!ret) | ||
| 392 | ret = trace_selftest_ops(2); | ||
| 185 | 393 | ||
| 186 | return ret; | 394 | return ret; |
| 187 | } | 395 | } |
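The selftest exercises the new per-ops filtering: each ftrace_ops carries its own filter set via ftrace_set_filter(ops, ...), while the old global call becomes ftrace_set_global_filter(). A minimal sketch of hooking a single function this way (the callback body and the chosen function are illustrative, and error checking is trimmed):

#include <linux/ftrace.h>
#include <linux/init.h>
#include <linux/string.h>

static void my_trace_cb(unsigned long ip, unsigned long parent_ip)
{
	/* runs on every hit of the filtered function(s) */
}

static struct ftrace_ops my_ops = {
	.func = my_trace_cb,
};

static int __init my_probe_init(void)
{
	char *name = "kfree";

	/* final '1' resets any previous filter held by this ops */
	ftrace_set_filter(&my_ops, name, strlen(name), 1);
	return register_ftrace_function(&my_ops);
}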
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 54dd77cce5bf..b4c475a0a48b 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c | |||
| @@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void) | |||
| 5 | /* used to call mcount */ | 5 | /* used to call mcount */ |
| 6 | return 0; | 6 | return 0; |
| 7 | } | 7 | } |
| 8 | |||
| 9 | int DYN_FTRACE_TEST_NAME2(void) | ||
| 10 | { | ||
| 11 | /* used to call mcount */ | ||
| 12 | return 0; | ||
| 13 | } | ||
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 4c5dead0c239..77575b386d97 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 133 | static struct ftrace_ops trace_ops __read_mostly = | 133 | static struct ftrace_ops trace_ops __read_mostly = |
| 134 | { | 134 | { |
| 135 | .func = stack_trace_call, | 135 | .func = stack_trace_call, |
| 136 | .flags = FTRACE_OPS_FL_GLOBAL, | ||
| 136 | }; | 137 | }; |
| 137 | 138 | ||
| 138 | static ssize_t | 139 | static ssize_t |
| @@ -155,20 +156,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, | |||
| 155 | { | 156 | { |
| 156 | long *ptr = filp->private_data; | 157 | long *ptr = filp->private_data; |
| 157 | unsigned long val, flags; | 158 | unsigned long val, flags; |
| 158 | char buf[64]; | ||
| 159 | int ret; | 159 | int ret; |
| 160 | int cpu; | 160 | int cpu; |
| 161 | 161 | ||
| 162 | if (count >= sizeof(buf)) | 162 | ret = kstrtoul_from_user(ubuf, count, 10, &val); |
| 163 | return -EINVAL; | 163 | if (ret) |
| 164 | |||
| 165 | if (copy_from_user(&buf, ubuf, count)) | ||
| 166 | return -EFAULT; | ||
| 167 | |||
| 168 | buf[count] = 0; | ||
| 169 | |||
| 170 | ret = strict_strtoul(buf, 10, &val); | ||
| 171 | if (ret < 0) | ||
| 172 | return ret; | 164 | return ret; |
| 173 | 165 | ||
| 174 | local_irq_save(flags); | 166 | local_irq_save(flags); |
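kstrtoul_from_user() folds the stack buffer, copy_from_user() and strict_strtoul() sequence into one call. A sketch of the same pattern in a hypothetical debugfs-style write handler:

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/uaccess.h>

static unsigned long my_value;

static ssize_t my_write(struct file *filp, const char __user *ubuf,
			size_t count, loff_t *ppos)
{
	unsigned long val;
	int ret;

	ret = kstrtoul_from_user(ubuf, count, 10, &val);
	if (ret)
		return ret;		/* -EFAULT, -EINVAL or -ERANGE */

	my_value = val;
	return count;
}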
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af4889e..b219f1449c54 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
| @@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
| 251 | { | 251 | { |
| 252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); | 252 | WARN_ON(strcmp((*entry)->name, elem->name) != 0); |
| 253 | 253 | ||
| 254 | if (elem->regfunc && !elem->state && active) | 254 | if (elem->regfunc && !jump_label_enabled(&elem->key) && active) |
| 255 | elem->regfunc(); | 255 | elem->regfunc(); |
| 256 | else if (elem->unregfunc && elem->state && !active) | 256 | else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) |
| 257 | elem->unregfunc(); | 257 | elem->unregfunc(); |
| 258 | 258 | ||
| 259 | /* | 259 | /* |
| @@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
| 264 | * is used. | 264 | * is used. |
| 265 | */ | 265 | */ |
| 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); | 266 | rcu_assign_pointer(elem->funcs, (*entry)->funcs); |
| 267 | if (!elem->state && active) { | 267 | if (active && !jump_label_enabled(&elem->key)) |
| 268 | jump_label_enable(&elem->state); | 268 | jump_label_inc(&elem->key); |
| 269 | elem->state = active; | 269 | else if (!active && jump_label_enabled(&elem->key)) |
| 270 | } else if (elem->state && !active) { | 270 | jump_label_dec(&elem->key); |
| 271 | jump_label_disable(&elem->state); | ||
| 272 | elem->state = active; | ||
| 273 | } | ||
| 274 | } | 271 | } |
| 275 | 272 | ||
| 276 | /* | 273 | /* |
| @@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, | |||
| 281 | */ | 278 | */ |
| 282 | static void disable_tracepoint(struct tracepoint *elem) | 279 | static void disable_tracepoint(struct tracepoint *elem) |
| 283 | { | 280 | { |
| 284 | if (elem->unregfunc && elem->state) | 281 | if (elem->unregfunc && jump_label_enabled(&elem->key)) |
| 285 | elem->unregfunc(); | 282 | elem->unregfunc(); |
| 286 | 283 | ||
| 287 | if (elem->state) { | 284 | if (jump_label_enabled(&elem->key)) |
| 288 | jump_label_disable(&elem->state); | 285 | jump_label_dec(&elem->key); |
| 289 | elem->state = 0; | ||
| 290 | } | ||
| 291 | rcu_assign_pointer(elem->funcs, NULL); | 286 | rcu_assign_pointer(elem->funcs, NULL); |
| 292 | } | 287 | } |
| 293 | 288 | ||
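The tracepoint code drops its private ->state flag and leans on the jump-label API directly: jump_label_inc()/jump_label_dec() reference-count the key and patch the branch, jump_label_enabled() reports its current state. A sketch of the same enable/disable shape, assuming the key type of this era is struct jump_label_key (the field behind &elem->key is not shown in this hunk):

#include <linux/jump_label.h>

static struct jump_label_key my_key;	/* assumed key type for this API */

static void my_feature_set(bool active)
{
	/* only inc/dec on a real transition, as set_tracepoint() does */
	if (active && !jump_label_enabled(&my_key))
		jump_label_inc(&my_key);
	else if (!active && jump_label_enabled(&my_key))
		jump_label_dec(&my_key);
}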
diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eaba..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
| 16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 17 | #include <linux/user_namespace.h> | 17 | #include <linux/user_namespace.h> |
| 18 | #include <linux/proc_fs.h> | ||
| 18 | 19 | ||
| 19 | static struct uts_namespace *create_uts_ns(void) | 20 | static struct uts_namespace *create_uts_ns(void) |
| 20 | { | 21 | { |
| @@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref) | |||
| 79 | put_user_ns(ns->user_ns); | 80 | put_user_ns(ns->user_ns); |
| 80 | kfree(ns); | 81 | kfree(ns); |
| 81 | } | 82 | } |
| 83 | |||
| 84 | static void *utsns_get(struct task_struct *task) | ||
| 85 | { | ||
| 86 | struct uts_namespace *ns = NULL; | ||
| 87 | struct nsproxy *nsproxy; | ||
| 88 | |||
| 89 | rcu_read_lock(); | ||
| 90 | nsproxy = task_nsproxy(task); | ||
| 91 | if (nsproxy) { | ||
| 92 | ns = nsproxy->uts_ns; | ||
| 93 | get_uts_ns(ns); | ||
| 94 | } | ||
| 95 | rcu_read_unlock(); | ||
| 96 | |||
| 97 | return ns; | ||
| 98 | } | ||
| 99 | |||
| 100 | static void utsns_put(void *ns) | ||
| 101 | { | ||
| 102 | put_uts_ns(ns); | ||
| 103 | } | ||
| 104 | |||
| 105 | static int utsns_install(struct nsproxy *nsproxy, void *ns) | ||
| 106 | { | ||
| 107 | get_uts_ns(ns); | ||
| 108 | put_uts_ns(nsproxy->uts_ns); | ||
| 109 | nsproxy->uts_ns = ns; | ||
| 110 | return 0; | ||
| 111 | } | ||
| 112 | |||
| 113 | const struct proc_ns_operations utsns_operations = { | ||
| 114 | .name = "uts", | ||
| 115 | .type = CLONE_NEWUTS, | ||
| 116 | .get = utsns_get, | ||
| 117 | .put = utsns_put, | ||
| 118 | .install = utsns_install, | ||
| 119 | }; | ||
| 120 | |||
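With utsns_operations registered, "uts" shows up under /proc/<pid>/ns/ and setns(2) can land in utsns_install() above. A hedged userspace sketch, assuming a libc that exposes setns(); the target pid is hypothetical and error handling is minimal:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

static int join_uts_ns(pid_t pid)
{
	char path[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/ns/uts", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	/* nstype must match the .type (CLONE_NEWUTS) checked by the kernel */
	ret = setns(fd, CLONE_NEWUTS);
	close(fd);
	return ret;
}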
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14733d4d156b..36491cd5b7d4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -28,7 +28,7 @@ | |||
| 28 | #include <linux/perf_event.h> | 28 | #include <linux/perf_event.h> |
| 29 | 29 | ||
| 30 | int watchdog_enabled = 1; | 30 | int watchdog_enabled = 1; |
| 31 | int __read_mostly softlockup_thresh = 60; | 31 | int __read_mostly watchdog_thresh = 10; |
| 32 | 32 | ||
| 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
| 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
| @@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) | |||
| 91 | __setup("nosoftlockup", nosoftlockup_setup); | 91 | __setup("nosoftlockup", nosoftlockup_setup); |
| 92 | /* */ | 92 | /* */ |
| 93 | 93 | ||
| 94 | /* | ||
| 95 | * Hard-lockup warnings should be triggered after just a few seconds. Soft- | ||
| 96 | * lockups can have false positives under extreme conditions. So we generally | ||
| 97 | * want a higher threshold for soft lockups than for hard lockups. So we couple | ||
| 98 | * the thresholds with a factor: we make the soft threshold twice the amount of | ||
| 99 | * time the hard threshold is. | ||
| 100 | */ | ||
| 101 | static int get_softlockup_thresh(void) | ||
| 102 | { | ||
| 103 | return watchdog_thresh * 2; | ||
| 104 | } | ||
| 94 | 105 | ||
| 95 | /* | 106 | /* |
| 96 | * Returns seconds, approximately. We don't need nanosecond | 107 | * Returns seconds, approximately. We don't need nanosecond |
| @@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu) | |||
| 105 | static unsigned long get_sample_period(void) | 116 | static unsigned long get_sample_period(void) |
| 106 | { | 117 | { |
| 107 | /* | 118 | /* |
| 108 | * convert softlockup_thresh from seconds to ns | 119 | * convert watchdog_thresh from seconds to ns |
| 109 | * the divide by 5 is to give hrtimer 5 chances to | 120 | * the divide by 5 is to give hrtimer 5 chances to |
| 110 | * increment before the hardlockup detector generates | 121 | * increment before the hardlockup detector generates |
| 111 | * a warning | 122 | * a warning |
| 112 | */ | 123 | */ |
| 113 | return softlockup_thresh / 5 * NSEC_PER_SEC; | 124 | return get_softlockup_thresh() * (NSEC_PER_SEC / 5); |
| 114 | } | 125 | } |
| 115 | 126 | ||
| 116 | /* Commands for resetting the watchdog */ | 127 | /* Commands for resetting the watchdog */ |
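Worked example of the new arithmetic: with the default watchdog_thresh of 10 seconds, get_softlockup_thresh() returns 20, so get_sample_period() is 20 * (NSEC_PER_SEC / 5) = 4 * NSEC_PER_SEC; the hrtimer fires every 4 seconds, five times per soft-lockup window, while the hard-lockup perf event is programmed from the raw 10-second watchdog_thresh. Reordering the expression as thresh * (NSEC_PER_SEC / 5) also avoids the integer truncation the old softlockup_thresh / 5 * NSEC_PER_SEC form suffered for thresholds not divisible by five.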
| @@ -182,13 +193,14 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 182 | unsigned long now = get_timestamp(smp_processor_id()); | 193 | unsigned long now = get_timestamp(smp_processor_id()); |
| 183 | 194 | ||
| 184 | /* Warn about unreasonable delays: */ | 195 | /* Warn about unreasonable delays: */ |
| 185 | if (time_after(now, touch_ts + softlockup_thresh)) | 196 | if (time_after(now, touch_ts + get_softlockup_thresh())) |
| 186 | return now - touch_ts; | 197 | return now - touch_ts; |
| 187 | 198 | ||
| 188 | return 0; | 199 | return 0; |
| 189 | } | 200 | } |
| 190 | 201 | ||
| 191 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 202 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 203 | |||
| 192 | static struct perf_event_attr wd_hw_attr = { | 204 | static struct perf_event_attr wd_hw_attr = { |
| 193 | .type = PERF_TYPE_HARDWARE, | 205 | .type = PERF_TYPE_HARDWARE, |
| 194 | .config = PERF_COUNT_HW_CPU_CYCLES, | 206 | .config = PERF_COUNT_HW_CPU_CYCLES, |
| @@ -198,7 +210,7 @@ static struct perf_event_attr wd_hw_attr = { | |||
| 198 | }; | 210 | }; |
| 199 | 211 | ||
| 200 | /* Callback function for perf event subsystem */ | 212 | /* Callback function for perf event subsystem */ |
| 201 | static void watchdog_overflow_callback(struct perf_event *event, int nmi, | 213 | static void watchdog_overflow_callback(struct perf_event *event, |
| 202 | struct perf_sample_data *data, | 214 | struct perf_sample_data *data, |
| 203 | struct pt_regs *regs) | 215 | struct pt_regs *regs) |
| 204 | { | 216 | { |
| @@ -357,10 +369,11 @@ static int watchdog_nmi_enable(int cpu) | |||
| 357 | if (event != NULL) | 369 | if (event != NULL) |
| 358 | goto out_enable; | 370 | goto out_enable; |
| 359 | 371 | ||
| 360 | /* Try to register using hardware perf events */ | ||
| 361 | wd_attr = &wd_hw_attr; | 372 | wd_attr = &wd_hw_attr; |
| 362 | wd_attr->sample_period = hw_nmi_get_sample_period(); | 373 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); |
| 363 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); | 374 | |
| 375 | /* Try to register using hardware perf events */ | ||
| 376 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | ||
| 364 | if (!IS_ERR(event)) { | 377 | if (!IS_ERR(event)) { |
| 365 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); | 378 | printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |
| 366 | goto out_save; | 379 | goto out_save; |
| @@ -404,15 +417,13 @@ static void watchdog_nmi_disable(int cpu) { return; } | |||
| 404 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 417 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
| 405 | 418 | ||
| 406 | /* prepare/enable/disable routines */ | 419 | /* prepare/enable/disable routines */ |
| 407 | static int watchdog_prepare_cpu(int cpu) | 420 | static void watchdog_prepare_cpu(int cpu) |
| 408 | { | 421 | { |
| 409 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | 422 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); |
| 410 | 423 | ||
| 411 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | 424 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); |
| 412 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 425 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 413 | hrtimer->function = watchdog_timer_fn; | 426 | hrtimer->function = watchdog_timer_fn; |
| 414 | |||
| 415 | return 0; | ||
| 416 | } | 427 | } |
| 417 | 428 | ||
| 418 | static int watchdog_enable(int cpu) | 429 | static int watchdog_enable(int cpu) |
| @@ -501,28 +512,25 @@ static void watchdog_disable_all_cpus(void) | |||
| 501 | /* sysctl functions */ | 512 | /* sysctl functions */ |
| 502 | #ifdef CONFIG_SYSCTL | 513 | #ifdef CONFIG_SYSCTL |
| 503 | /* | 514 | /* |
| 504 | * proc handler for /proc/sys/kernel/nmi_watchdog | 515 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh |
| 505 | */ | 516 | */ |
| 506 | 517 | ||
| 507 | int proc_dowatchdog_enabled(struct ctl_table *table, int write, | 518 | int proc_dowatchdog(struct ctl_table *table, int write, |
| 508 | void __user *buffer, size_t *length, loff_t *ppos) | 519 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 509 | { | 520 | { |
| 510 | proc_dointvec(table, write, buffer, length, ppos); | 521 | int ret; |
| 511 | 522 | ||
| 512 | if (write) { | 523 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 513 | if (watchdog_enabled) | 524 | if (ret || !write) |
| 514 | watchdog_enable_all_cpus(); | 525 | goto out; |
| 515 | else | ||
| 516 | watchdog_disable_all_cpus(); | ||
| 517 | } | ||
| 518 | return 0; | ||
| 519 | } | ||
| 520 | 526 | ||
| 521 | int proc_dowatchdog_thresh(struct ctl_table *table, int write, | 527 | if (watchdog_enabled && watchdog_thresh) |
| 522 | void __user *buffer, | 528 | watchdog_enable_all_cpus(); |
| 523 | size_t *lenp, loff_t *ppos) | 529 | else |
| 524 | { | 530 | watchdog_disable_all_cpus(); |
| 525 | return proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 531 | |
| 532 | out: | ||
| 533 | return ret; | ||
| 526 | } | 534 | } |
| 527 | #endif /* CONFIG_SYSCTL */ | 535 | #endif /* CONFIG_SYSCTL */ |
| 528 | 536 | ||
| @@ -534,17 +542,16 @@ static int __cpuinit | |||
| 534 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 542 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
| 535 | { | 543 | { |
| 536 | int hotcpu = (unsigned long)hcpu; | 544 | int hotcpu = (unsigned long)hcpu; |
| 537 | int err = 0; | ||
| 538 | 545 | ||
| 539 | switch (action) { | 546 | switch (action) { |
| 540 | case CPU_UP_PREPARE: | 547 | case CPU_UP_PREPARE: |
| 541 | case CPU_UP_PREPARE_FROZEN: | 548 | case CPU_UP_PREPARE_FROZEN: |
| 542 | err = watchdog_prepare_cpu(hotcpu); | 549 | watchdog_prepare_cpu(hotcpu); |
| 543 | break; | 550 | break; |
| 544 | case CPU_ONLINE: | 551 | case CPU_ONLINE: |
| 545 | case CPU_ONLINE_FROZEN: | 552 | case CPU_ONLINE_FROZEN: |
| 546 | if (watchdog_enabled) | 553 | if (watchdog_enabled) |
| 547 | err = watchdog_enable(hotcpu); | 554 | watchdog_enable(hotcpu); |
| 548 | break; | 555 | break; |
| 549 | #ifdef CONFIG_HOTPLUG_CPU | 556 | #ifdef CONFIG_HOTPLUG_CPU |
| 550 | case CPU_UP_CANCELED: | 557 | case CPU_UP_CANCELED: |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e3378e8d3a5c..25fb1b0e53fa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -221,7 +221,7 @@ typedef unsigned long mayday_mask_t; | |||
| 221 | * per-CPU workqueues: | 221 | * per-CPU workqueues: |
| 222 | */ | 222 | */ |
| 223 | struct workqueue_struct { | 223 | struct workqueue_struct { |
| 224 | unsigned int flags; /* I: WQ_* flags */ | 224 | unsigned int flags; /* W: WQ_* flags */ |
| 225 | union { | 225 | union { |
| 226 | struct cpu_workqueue_struct __percpu *pcpu; | 226 | struct cpu_workqueue_struct __percpu *pcpu; |
| 227 | struct cpu_workqueue_struct *single; | 227 | struct cpu_workqueue_struct *single; |
| @@ -240,6 +240,7 @@ struct workqueue_struct { | |||
| 240 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ | 240 | mayday_mask_t mayday_mask; /* cpus requesting rescue */ |
| 241 | struct worker *rescuer; /* I: rescue worker */ | 241 | struct worker *rescuer; /* I: rescue worker */ |
| 242 | 242 | ||
| 243 | int nr_drainers; /* W: drain in progress */ | ||
| 243 | int saved_max_active; /* W: saved cwq max_active */ | 244 | int saved_max_active; /* W: saved cwq max_active */ |
| 244 | const char *name; /* I: workqueue name */ | 245 | const char *name; /* I: workqueue name */ |
| 245 | #ifdef CONFIG_LOCKDEP | 246 | #ifdef CONFIG_LOCKDEP |
| @@ -990,7 +991,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 990 | debug_work_activate(work); | 991 | debug_work_activate(work); |
| 991 | 992 | ||
| 992 | /* if dying, only works from the same workqueue are allowed */ | 993 | /* if dying, only works from the same workqueue are allowed */ |
| 993 | if (unlikely(wq->flags & WQ_DYING) && | 994 | if (unlikely(wq->flags & WQ_DRAINING) && |
| 994 | WARN_ON_ONCE(!is_chained_work(wq))) | 995 | WARN_ON_ONCE(!is_chained_work(wq))) |
| 995 | return; | 996 | return; |
| 996 | 997 | ||
| @@ -2381,6 +2382,54 @@ out_unlock: | |||
| 2381 | } | 2382 | } |
| 2382 | EXPORT_SYMBOL_GPL(flush_workqueue); | 2383 | EXPORT_SYMBOL_GPL(flush_workqueue); |
| 2383 | 2384 | ||
| 2385 | /** | ||
| 2386 | * drain_workqueue - drain a workqueue | ||
| 2387 | * @wq: workqueue to drain | ||
| 2388 | * | ||
| 2389 | * Wait until the workqueue becomes empty. While draining is in progress, | ||
| 2390 | * only chain queueing is allowed. IOW, only currently pending or running | ||
| 2391 | * work items on @wq can queue further work items on it. @wq is flushed | ||
| 2392 | * repeatedly until it becomes empty. The number of flushing is detemined | ||
| 2393 | * by the depth of chaining and should be relatively short. Whine if it | ||
| 2394 | * takes too long. | ||
| 2395 | */ | ||
| 2396 | void drain_workqueue(struct workqueue_struct *wq) | ||
| 2397 | { | ||
| 2398 | unsigned int flush_cnt = 0; | ||
| 2399 | unsigned int cpu; | ||
| 2400 | |||
| 2401 | /* | ||
| 2402 | * __queue_work() needs to test whether there are drainers, is much | ||
| 2403 | * hotter than drain_workqueue() and already looks at @wq->flags. | ||
| 2404 | * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. | ||
| 2405 | */ | ||
| 2406 | spin_lock(&workqueue_lock); | ||
| 2407 | if (!wq->nr_drainers++) | ||
| 2408 | wq->flags |= WQ_DRAINING; | ||
| 2409 | spin_unlock(&workqueue_lock); | ||
| 2410 | reflush: | ||
| 2411 | flush_workqueue(wq); | ||
| 2412 | |||
| 2413 | for_each_cwq_cpu(cpu, wq) { | ||
| 2414 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 2415 | |||
| 2416 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
| 2417 | continue; | ||
| 2418 | |||
| 2419 | if (++flush_cnt == 10 || | ||
| 2420 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
| 2421 | pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", | ||
| 2422 | wq->name, flush_cnt); | ||
| 2423 | goto reflush; | ||
| 2424 | } | ||
| 2425 | |||
| 2426 | spin_lock(&workqueue_lock); | ||
| 2427 | if (!--wq->nr_drainers) | ||
| 2428 | wq->flags &= ~WQ_DRAINING; | ||
| 2429 | spin_unlock(&workqueue_lock); | ||
| 2430 | } | ||
| 2431 | EXPORT_SYMBOL_GPL(drain_workqueue); | ||
| 2432 | |||
| 2384 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | 2433 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, |
| 2385 | bool wait_executing) | 2434 | bool wait_executing) |
| 2386 | { | 2435 | { |
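drain_workqueue() is the flush-until-empty loop pulled out of destroy_workqueue(): while WQ_DRAINING is set, only "chained" queueing is allowed, i.e. work already on the queue may requeue follow-ups on the same queue. A sketch of a teardown path that relies on this (setup via alloc_workqueue()/INIT_WORK() omitted; names are hypothetical):

#include <linux/workqueue.h>
#include <linux/atomic.h>

static struct workqueue_struct *my_wq;
static struct work_struct my_cleanup;
static atomic_t my_pending = ATOMIC_INIT(3);

static void my_cleanup_fn(struct work_struct *work)
{
	/* chained queueing: a draining wq still accepts work queued
	 * from one of its own items */
	if (atomic_dec_return(&my_pending) > 0)
		queue_work(my_wq, &my_cleanup);
}

static void my_teardown(void)
{
	/* returns only once my_wq is empty, chained items included;
	 * destroy_workqueue() now performs this drain itself */
	drain_workqueue(my_wq);
	destroy_workqueue(my_wq);
}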
| @@ -2866,9 +2915,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) | |||
| 2866 | } | 2915 | } |
| 2867 | } | 2916 | } |
| 2868 | 2917 | ||
| 2869 | /* just in case, make sure it's actually aligned | 2918 | /* just in case, make sure it's actually aligned */ |
| 2870 | * - this is affected by PERCPU() alignment in vmlinux.lds.S | ||
| 2871 | */ | ||
| 2872 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); | 2919 | BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); |
| 2873 | return wq->cpu_wq.v ? 0 : -ENOMEM; | 2920 | return wq->cpu_wq.v ? 0 : -ENOMEM; |
| 2874 | } | 2921 | } |
| @@ -3011,34 +3058,10 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); | |||
| 3011 | */ | 3058 | */ |
| 3012 | void destroy_workqueue(struct workqueue_struct *wq) | 3059 | void destroy_workqueue(struct workqueue_struct *wq) |
| 3013 | { | 3060 | { |
| 3014 | unsigned int flush_cnt = 0; | ||
| 3015 | unsigned int cpu; | 3061 | unsigned int cpu; |
| 3016 | 3062 | ||
| 3017 | /* | 3063 | /* drain it before proceeding with destruction */ |
| 3018 | * Mark @wq dying and drain all pending works. Once WQ_DYING is | 3064 | drain_workqueue(wq); |
| 3019 | * set, only chain queueing is allowed. IOW, only currently | ||
| 3020 | * pending or running work items on @wq can queue further work | ||
| 3021 | * items on it. @wq is flushed repeatedly until it becomes empty. | ||
| 3022 | * The number of flushing is detemined by the depth of chaining and | ||
| 3023 | * should be relatively short. Whine if it takes too long. | ||
| 3024 | */ | ||
| 3025 | wq->flags |= WQ_DYING; | ||
| 3026 | reflush: | ||
| 3027 | flush_workqueue(wq); | ||
| 3028 | |||
| 3029 | for_each_cwq_cpu(cpu, wq) { | ||
| 3030 | struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); | ||
| 3031 | |||
| 3032 | if (!cwq->nr_active && list_empty(&cwq->delayed_works)) | ||
| 3033 | continue; | ||
| 3034 | |||
| 3035 | if (++flush_cnt == 10 || | ||
| 3036 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | ||
| 3037 | printk(KERN_WARNING "workqueue %s: flush on " | ||
| 3038 | "destruction isn't complete after %u tries\n", | ||
| 3039 | wq->name, flush_cnt); | ||
| 3040 | goto reflush; | ||
| 3041 | } | ||
| 3042 | 3065 | ||
| 3043 | /* | 3066 | /* |
| 3044 | * wq list is used to freeze wq, remove from list after | 3067 | * wq list is used to freeze wq, remove from list after |
