author     Herbert Xu <herbert@gondor.apana.org.au>  2013-09-06 22:53:35 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>  2013-09-06 22:53:35 -0400
commit     eeca9fad52fc4bfdf42c38bfcf383e932eb3e9d6 (patch)
tree       cc51c880459d41c0e8d7576405bef4c987bc7aa0 /kernel
parent     ff6f83fc9d44db09997937c3475d525a6866fbb4 (diff)
parent     b48a97be8e6c2afdba2f3b61fd88c3c7743fbd73 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux
Merge upstream tree in order to reinstate crct10dif.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 2
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 8
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/cgroup.c | 42
-rw-r--r--  kernel/cpu.c | 6
-rw-r--r--  kernel/events/core.c | 38
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  kernel/hrtimer.c | 38
-rw-r--r--  kernel/irq/generic-chip.c | 6
-rw-r--r--  kernel/irq/irqdomain.c | 591
-rw-r--r--  kernel/irq/proc.c | 2
-rw-r--r--  kernel/module.c | 77
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/panic.c | 8
-rw-r--r--  kernel/params.c | 2
-rw-r--r--  kernel/posix-cpu-timers.c | 395
-rw-r--r--  kernel/power/autosleep.c | 3
-rw-r--r--  kernel/printk.c | 4
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 17
-rw-r--r--  kernel/rcutorture.c | 6
-rw-r--r--  kernel/rcutree.c | 6
-rw-r--r--  kernel/rcutree.h | 4
-rw-r--r--  kernel/rcutree_plugin.h | 6
-rw-r--r--  kernel/reboot.c | 419
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/sched/core.c | 32
-rw-r--r--  kernel/sched/fair.c | 2
-rw-r--r--  kernel/sched/stats.h | 39
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/smpboot.c | 2
-rw-r--r--  kernel/softirq.c | 8
-rw-r--r--  kernel/sys.c | 336
-rw-r--r--  kernel/sysctl.c | 11
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/alarmtimer.c | 47
-rw-r--r--  kernel/time/clockevents.c | 271
-rw-r--r--  kernel/time/clocksource.c | 266
-rw-r--r--  kernel/time/sched_clock.c | 212
-rw-r--r--  kernel/time/tick-broadcast.c | 129
-rw-r--r--  kernel/time/tick-common.c | 197
-rw-r--r--  kernel/time/tick-internal.h | 17
-rw-r--r--  kernel/time/tick-sched.c | 17
-rw-r--r--  kernel/time/timekeeping.c | 65
-rw-r--r--  kernel/time/timekeeping_debug.c | 72
-rw-r--r--  kernel/time/timekeeping_internal.h | 14
-rw-r--r--  kernel/timer.c | 18
-rw-r--r--  kernel/trace/ftrace.c | 38
-rw-r--r--  kernel/trace/ring_buffer.c | 26
-rw-r--r--  kernel/trace/trace.c | 368
-rw-r--r--  kernel/trace/trace.h | 27
-rw-r--r--  kernel/trace/trace_event_perf.c | 10
-rw-r--r--  kernel/trace/trace_events.c | 264
-rw-r--r--  kernel/trace/trace_events_filter.c | 10
-rw-r--r--  kernel/trace/trace_functions.c | 105
-rw-r--r--  kernel/trace/trace_functions_graph.c | 54
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 209
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_selftest.c | 18
-rw-r--r--  kernel/trace/trace_syscalls.c | 47
-rw-r--r--  kernel/trace/trace_uprobe.c | 6
-rw-r--r--  kernel/wait.c | 3
-rw-r--r--  kernel/watchdog.c | 113
-rw-r--r--  kernel/workqueue.c | 4
69 files changed, 2886 insertions, 1908 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 271fd3119af9..470839d1a30e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o cred.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
14 14
15ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131ef760..123c9b7c3979 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
85 85
86 struct filename *name; 86 struct filename *name;
87 int name_len; /* number of chars to log */ 87 int name_len; /* number of chars to log */
88 bool hidden; /* don't log this record */
88 bool name_put; /* call __putname()? */ 89 bool name_put; /* call __putname()? */
89 90
90 unsigned long ino; 91 unsigned long ino;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90d1991..f7aee8be7fb2 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
423 f->lsm_rule = NULL; 423 f->lsm_rule = NULL;
424 424
425 /* Support legacy tests for a valid loginuid */ 425 /* Support legacy tests for a valid loginuid */
426 if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) { 426 if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
427 f->type = AUDIT_LOGINUID_SET; 427 f->type = AUDIT_LOGINUID_SET;
428 f->val = 0; 428 f->val = 0;
429 } 429 }
@@ -865,6 +865,12 @@ static inline int audit_add_rule(struct audit_entry *entry)
865 err = audit_add_watch(&entry->rule, &list); 865 err = audit_add_watch(&entry->rule, &list);
866 if (err) { 866 if (err) {
867 mutex_unlock(&audit_filter_mutex); 867 mutex_unlock(&audit_filter_mutex);
868 /*
869 * normally audit_add_tree_rule() will free it
870 * on failure
871 */
872 if (tree)
873 audit_put_tree(tree);
868 goto error; 874 goto error;
869 } 875 }
870 } 876 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601324a2..9845cb32b60a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1399 } 1399 }
1400 1400
1401 i = 0; 1401 i = 0;
1402 list_for_each_entry(n, &context->names_list, list) 1402 list_for_each_entry(n, &context->names_list, list) {
1403 if (n->hidden)
1404 continue;
1403 audit_log_name(context, n, NULL, i++, &call_panic); 1405 audit_log_name(context, n, NULL, i++, &call_panic);
1406 }
1404 1407
1405 /* Send end of event record to help user space know we are finished */ 1408 /* Send end of event record to help user space know we are finished */
1406 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); 1409 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
1769 * __audit_inode - store the inode and device from a lookup 1772 * __audit_inode - store the inode and device from a lookup
1770 * @name: name being audited 1773 * @name: name being audited
1771 * @dentry: dentry being audited 1774 * @dentry: dentry being audited
1772 * @parent: does this dentry represent the parent? 1775 * @flags: attributes for this particular entry
1773 */ 1776 */
1774void __audit_inode(struct filename *name, const struct dentry *dentry, 1777void __audit_inode(struct filename *name, const struct dentry *dentry,
1775 unsigned int parent) 1778 unsigned int flags)
1776{ 1779{
1777 struct audit_context *context = current->audit_context; 1780 struct audit_context *context = current->audit_context;
1778 const struct inode *inode = dentry->d_inode; 1781 const struct inode *inode = dentry->d_inode;
1779 struct audit_names *n; 1782 struct audit_names *n;
1783 bool parent = flags & AUDIT_INODE_PARENT;
1780 1784
1781 if (!context->in_syscall) 1785 if (!context->in_syscall)
1782 return; 1786 return;
@@ -1831,6 +1835,8 @@ out:
1831 if (parent) { 1835 if (parent) {
1832 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; 1836 n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
1833 n->type = AUDIT_TYPE_PARENT; 1837 n->type = AUDIT_TYPE_PARENT;
1838 if (flags & AUDIT_INODE_HIDDEN)
1839 n->hidden = true;
1834 } else { 1840 } else {
1835 n->name_len = AUDIT_NAME_FULL; 1841 n->name_len = AUDIT_NAME_FULL;
1836 n->type = AUDIT_TYPE_NORMAL; 1842 n->type = AUDIT_TYPE_NORMAL;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e5583d10a325..789ec4683db3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -802,7 +802,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
802 */ 802 */
803 803
804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 804static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
805static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
806static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 805static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
807static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 806static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
808 unsigned long subsys_mask); 807 unsigned long subsys_mask);
@@ -1846,36 +1845,43 @@ out:
1846EXPORT_SYMBOL_GPL(cgroup_path); 1845EXPORT_SYMBOL_GPL(cgroup_path);
1847 1846
1848/** 1847/**
1849 * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy 1848 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1850 * @task: target task 1849 * @task: target task
1851 * @hierarchy_id: the hierarchy to look up @task's cgroup from
1852 * @buf: the buffer to write the path into 1850 * @buf: the buffer to write the path into
1853 * @buflen: the length of the buffer 1851 * @buflen: the length of the buffer
1854 * 1852 *
1855 * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and 1853 * Determine @task's cgroup on the first (the one with the lowest non-zero
1856 * copy its path into @buf. This function grabs cgroup_mutex and shouldn't 1854 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1857 * be used inside locks used by cgroup controller callbacks. 1855 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1856 * cgroup controller callbacks.
1857 *
1858 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
1858 */ 1859 */
1859int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id, 1860int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1860 char *buf, size_t buflen)
1861{ 1861{
1862 struct cgroupfs_root *root; 1862 struct cgroupfs_root *root;
1863 struct cgroup *cgrp = NULL; 1863 struct cgroup *cgrp;
1864 int ret = -ENOENT; 1864 int hierarchy_id = 1, ret = 0;
1865
1866 if (buflen < 2)
1867 return -ENAMETOOLONG;
1865 1868
1866 mutex_lock(&cgroup_mutex); 1869 mutex_lock(&cgroup_mutex);
1867 1870
1868 root = idr_find(&cgroup_hierarchy_idr, hierarchy_id); 1871 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1872
1869 if (root) { 1873 if (root) {
1870 cgrp = task_cgroup_from_root(task, root); 1874 cgrp = task_cgroup_from_root(task, root);
1871 ret = cgroup_path(cgrp, buf, buflen); 1875 ret = cgroup_path(cgrp, buf, buflen);
1876 } else {
1877 /* if no hierarchy exists, everyone is in "/" */
1878 memcpy(buf, "/", 2);
1872 } 1879 }
1873 1880
1874 mutex_unlock(&cgroup_mutex); 1881 mutex_unlock(&cgroup_mutex);
1875
1876 return ret; 1882 return ret;
1877} 1883}
1878EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); 1884EXPORT_SYMBOL_GPL(task_cgroup_path);
1879 1885
1880/* 1886/*
1881 * Control Group taskset 1887 * Control Group taskset
@@ -2642,7 +2648,7 @@ static const struct inode_operations cgroup_file_inode_operations = {
2642}; 2648};
2643 2649
2644static const struct inode_operations cgroup_dir_inode_operations = { 2650static const struct inode_operations cgroup_dir_inode_operations = {
2645 .lookup = cgroup_lookup, 2651 .lookup = simple_lookup,
2646 .mkdir = cgroup_mkdir, 2652 .mkdir = cgroup_mkdir,
2647 .rmdir = cgroup_rmdir, 2653 .rmdir = cgroup_rmdir,
2648 .rename = cgroup_rename, 2654 .rename = cgroup_rename,
@@ -2652,14 +2658,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
2652 .removexattr = cgroup_removexattr, 2658 .removexattr = cgroup_removexattr,
2653}; 2659};
2654 2660
2655static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2656{
2657 if (dentry->d_name.len > NAME_MAX)
2658 return ERR_PTR(-ENAMETOOLONG);
2659 d_add(dentry, NULL);
2660 return NULL;
2661}
2662
2663/* 2661/*
2664 * Check if a file is a control file 2662 * Check if a file is a control file
2665 */ 2663 */
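Note on the cgroup.c hunk above: task_cgroup_path_from_hierarchy() is renamed to task_cgroup_path() and the hierarchy_id argument is dropped; the helper now resolves the first (lowest non-zero hierarchy_id) hierarchy itself and returns 0 on success or -ENAMETOOLONG. As a hedged illustration only, the caller below is hypothetical and not part of this patch; it is sketched purely from the new signature and kerneldoc shown above.

/* Hypothetical caller sketch, not part of this patch. */
static void foo_report_cgroup(struct task_struct *task)
{
	char buf[256];

	/* returns 0 on success, -ENAMETOOLONG if buf is too small */
	if (!task_cgroup_path(task, buf, sizeof(buf)))
		pr_info("pid %d: cgroup %s\n", task_pid_nr(task), buf);
}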
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 198a38883e64..b2b227b82123 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down);
366#endif /*CONFIG_HOTPLUG_CPU*/ 366#endif /*CONFIG_HOTPLUG_CPU*/
367 367
368/* Requires cpu_add_remove_lock to be held */ 368/* Requires cpu_add_remove_lock to be held */
369static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) 369static int _cpu_up(unsigned int cpu, int tasks_frozen)
370{ 370{
371 int ret, nr_calls = 0; 371 int ret, nr_calls = 0;
372 void *hcpu = (void *)(long)cpu; 372 void *hcpu = (void *)(long)cpu;
@@ -419,7 +419,7 @@ out:
419 return ret; 419 return ret;
420} 420}
421 421
422int __cpuinit cpu_up(unsigned int cpu) 422int cpu_up(unsigned int cpu)
423{ 423{
424 int err = 0; 424 int err = 0;
425 425
@@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
618 * It must be called by the arch code on the new cpu, before the new cpu 618 * It must be called by the arch code on the new cpu, before the new cpu
619 * enables interrupts and before the "boot" cpu returns from __cpu_up(). 619 * enables interrupts and before the "boot" cpu returns from __cpu_up().
620 */ 620 */
621void __cpuinit notify_cpu_starting(unsigned int cpu) 621void notify_cpu_starting(unsigned int cpu)
622{ 622{
623 unsigned long val = CPU_STARTING; 623 unsigned long val = CPU_STARTING;
624 624
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1db3af933704..f86599e8c123 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -182,7 +182,7 @@ void update_perf_cpu_limits(void)
182 u64 tmp = perf_sample_period_ns; 182 u64 tmp = perf_sample_period_ns;
183 183
184 tmp *= sysctl_perf_cpu_time_max_percent; 184 tmp *= sysctl_perf_cpu_time_max_percent;
185 tmp = do_div(tmp, 100); 185 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp); 186 atomic_set(&perf_sample_allowed_ns, tmp);
187} 187}
188 188
@@ -232,7 +232,7 @@ DEFINE_PER_CPU(u64, running_sample_length);
232void perf_sample_event_took(u64 sample_len_ns) 232void perf_sample_event_took(u64 sample_len_ns)
233{ 233{
234 u64 avg_local_sample_len; 234 u64 avg_local_sample_len;
235 u64 local_samples_len = __get_cpu_var(running_sample_length); 235 u64 local_samples_len;
236 236
237 if (atomic_read(&perf_sample_allowed_ns) == 0) 237 if (atomic_read(&perf_sample_allowed_ns) == 0)
238 return; 238 return;
@@ -947,8 +947,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
947{ 947{
948 struct perf_event_context *ctx; 948 struct perf_event_context *ctx;
949 949
950 rcu_read_lock();
951retry: 950retry:
951 /*
952 * One of the few rules of preemptible RCU is that one cannot do
953 * rcu_read_unlock() while holding a scheduler (or nested) lock when
954 * part of the read side critical section was preemptible -- see
955 * rcu_read_unlock_special().
956 *
957 * Since ctx->lock nests under rq->lock we must ensure the entire read
958 * side critical section is non-preemptible.
959 */
960 preempt_disable();
961 rcu_read_lock();
952 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); 962 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
953 if (ctx) { 963 if (ctx) {
954 /* 964 /*
@@ -964,6 +974,8 @@ retry:
964 raw_spin_lock_irqsave(&ctx->lock, *flags); 974 raw_spin_lock_irqsave(&ctx->lock, *flags);
965 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { 975 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
966 raw_spin_unlock_irqrestore(&ctx->lock, *flags); 976 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
977 rcu_read_unlock();
978 preempt_enable();
967 goto retry; 979 goto retry;
968 } 980 }
969 981
@@ -973,6 +985,7 @@ retry:
973 } 985 }
974 } 986 }
975 rcu_read_unlock(); 987 rcu_read_unlock();
988 preempt_enable();
976 return ctx; 989 return ctx;
977} 990}
978 991
@@ -1950,7 +1963,16 @@ static int __perf_event_enable(void *info)
1950 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1963 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1951 int err; 1964 int err;
1952 1965
1953 if (WARN_ON_ONCE(!ctx->is_active)) 1966 /*
1967 * There's a time window between 'ctx->is_active' check
1968 * in perf_event_enable function and this place having:
1969 * - IRQs on
1970 * - ctx->lock unlocked
1971 *
1972 * where the task could be killed and 'ctx' deactivated
1973 * by perf_event_exit_task.
1974 */
1975 if (!ctx->is_active)
1954 return -EINVAL; 1976 return -EINVAL;
1955 1977
1956 raw_spin_lock(&ctx->lock); 1978 raw_spin_lock(&ctx->lock);
@@ -6212,8 +6234,6 @@ perf_event_mux_interval_ms_store(struct device *dev,
6212 return count; 6234 return count;
6213} 6235}
6214 6236
6215#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
6216
6217static struct device_attribute pmu_dev_attrs[] = { 6237static struct device_attribute pmu_dev_attrs[] = {
6218 __ATTR_RO(type), 6238 __ATTR_RO(type),
6219 __ATTR_RW(perf_event_mux_interval_ms), 6239 __ATTR_RW(perf_event_mux_interval_ms),
@@ -7465,7 +7485,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
7465 * child. 7485 * child.
7466 */ 7486 */
7467 7487
7468 child_ctx = alloc_perf_context(event->pmu, child); 7488 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
7469 if (!child_ctx) 7489 if (!child_ctx)
7470 return -ENOMEM; 7490 return -ENOMEM;
7471 7491
@@ -7608,7 +7628,7 @@ static void __init perf_event_init_all_cpus(void)
7608 } 7628 }
7609} 7629}
7610 7630
7611static void __cpuinit perf_event_init_cpu(int cpu) 7631static void perf_event_init_cpu(int cpu)
7612{ 7632{
7613 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); 7633 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7614 7634
@@ -7697,7 +7717,7 @@ static struct notifier_block perf_reboot_notifier = {
7697 .priority = INT_MIN, 7717 .priority = INT_MIN,
7698}; 7718};
7699 7719
7700static int __cpuinit 7720static int
7701perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 7721perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7702{ 7722{
7703 unsigned int cpu = (long)hcpu; 7723 unsigned int cpu = (long)hcpu;
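Aside on the update_perf_cpu_limits() hunk above, which drops the assignment around do_div(): do_div() divides its first argument in place and returns the remainder, so storing the return value would overwrite the quotient. The values in the sketch below are made up and the snippet is not part of the patch.

/* Minimal sketch of do_div() semantics, not part of this patch. */
u64 tmp = 250;              /* assumed sample value */
u32 rem = do_div(tmp, 100); /* tmp is now 2 (the quotient), rem is 50 */
/* so "tmp = do_div(tmp, 100)" would have kept the remainder, not the quotient */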
diff --git a/kernel/exit.c b/kernel/exit.c
index fafe75d9e6f6..a949819055d5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -808,7 +808,7 @@ void do_exit(long code)
808 /* 808 /*
809 * FIXME: do that only when needed, using sched_exit tracepoint 809 * FIXME: do that only when needed, using sched_exit tracepoint
810 */ 810 */
811 ptrace_put_breakpoints(tsk); 811 flush_ptrace_hw_breakpoint(tsk);
812 812
813 exit_notify(tsk, group_dead); 813 exit_notify(tsk, group_dead);
814#ifdef CONFIG_NUMA 814#ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 6e6a1c11b3e5..403d2bb8a968 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -365,8 +365,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
365 mm->locked_vm = 0; 365 mm->locked_vm = 0;
366 mm->mmap = NULL; 366 mm->mmap = NULL;
367 mm->mmap_cache = NULL; 367 mm->mmap_cache = NULL;
368 mm->free_area_cache = oldmm->mmap_base;
369 mm->cached_hole_size = ~0UL;
370 mm->map_count = 0; 368 mm->map_count = 0;
371 cpumask_clear(mm_cpumask(mm)); 369 cpumask_clear(mm_cpumask(mm));
372 mm->mm_rb = RB_ROOT; 370 mm->mm_rb = RB_ROOT;
@@ -540,8 +538,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540 mm->nr_ptes = 0; 538 mm->nr_ptes = 0;
541 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 539 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
542 spin_lock_init(&mm->page_table_lock); 540 spin_lock_init(&mm->page_table_lock);
543 mm->free_area_cache = TASK_UNMAPPED_BASE;
544 mm->cached_hole_size = ~0UL;
545 mm_init_aio(mm); 541 mm_init_aio(mm);
546 mm_init_owner(mm, p); 542 mm_init_owner(mm, p);
547 543
@@ -1550,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links)
1550 } 1546 }
1551} 1547}
1552 1548
1553struct task_struct * __cpuinit fork_idle(int cpu) 1549struct task_struct *fork_idle(int cpu)
1554{ 1550{
1555 struct task_struct *task; 1551 struct task_struct *task;
1556 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0); 1552 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3ee4d06c6fc2..383319bae3f7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -722,17 +722,20 @@ static int hrtimer_switch_to_hres(void)
722 return 1; 722 return 1;
723} 723}
724 724
725static void clock_was_set_work(struct work_struct *work)
726{
727 clock_was_set();
728}
729
730static DECLARE_WORK(hrtimer_work, clock_was_set_work);
731
725/* 732/*
726 * Called from timekeeping code to reprogramm the hrtimer interrupt 733 * Called from timekeeping and resume code to reprogramm the hrtimer
727 * device. If called from the timer interrupt context we defer it to 734 * interrupt device on all cpus.
728 * softirq context.
729 */ 735 */
730void clock_was_set_delayed(void) 736void clock_was_set_delayed(void)
731{ 737{
732 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 738 schedule_work(&hrtimer_work);
733
734 cpu_base->clock_was_set = 1;
735 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
736} 739}
737 740
738#else 741#else
@@ -774,15 +777,19 @@ void clock_was_set(void)
774 777
775/* 778/*
776 * During resume we might have to reprogram the high resolution timer 779 * During resume we might have to reprogram the high resolution timer
777 * interrupt (on the local CPU): 780 * interrupt on all online CPUs. However, all other CPUs will be
781 * stopped with IRQs interrupts disabled so the clock_was_set() call
782 * must be deferred.
778 */ 783 */
779void hrtimers_resume(void) 784void hrtimers_resume(void)
780{ 785{
781 WARN_ONCE(!irqs_disabled(), 786 WARN_ONCE(!irqs_disabled(),
782 KERN_INFO "hrtimers_resume() called with IRQs enabled!"); 787 KERN_INFO "hrtimers_resume() called with IRQs enabled!");
783 788
789 /* Retrigger on the local CPU */
784 retrigger_next_event(NULL); 790 retrigger_next_event(NULL);
785 timerfd_clock_was_set(); 791 /* And schedule a retrigger for all others */
792 clock_was_set_delayed();
786} 793}
787 794
788static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) 795static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1433,13 +1440,6 @@ void hrtimer_peek_ahead_timers(void)
1433 1440
1434static void run_hrtimer_softirq(struct softirq_action *h) 1441static void run_hrtimer_softirq(struct softirq_action *h)
1435{ 1442{
1436 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1437
1438 if (cpu_base->clock_was_set) {
1439 cpu_base->clock_was_set = 0;
1440 clock_was_set();
1441 }
1442
1443 hrtimer_peek_ahead_timers(); 1443 hrtimer_peek_ahead_timers();
1444} 1444}
1445 1445
@@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1659/* 1659/*
1660 * Functions related to boot-time initialization: 1660 * Functions related to boot-time initialization:
1661 */ 1661 */
1662static void __cpuinit init_hrtimers_cpu(int cpu) 1662static void init_hrtimers_cpu(int cpu)
1663{ 1663{
1664 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1664 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1665 int i; 1665 int i;
@@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu)
1740 1740
1741#endif /* CONFIG_HOTPLUG_CPU */ 1741#endif /* CONFIG_HOTPLUG_CPU */
1742 1742
1743static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1743static int hrtimer_cpu_notify(struct notifier_block *self,
1744 unsigned long action, void *hcpu) 1744 unsigned long action, void *hcpu)
1745{ 1745{
1746 int scpu = (long)hcpu; 1746 int scpu = (long)hcpu;
@@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1773 return NOTIFY_OK; 1773 return NOTIFY_OK;
1774} 1774}
1775 1775
1776static struct notifier_block __cpuinitdata hrtimers_nb = { 1776static struct notifier_block hrtimers_nb = {
1777 .notifier_call = hrtimer_cpu_notify, 1777 .notifier_call = hrtimer_cpu_notify,
1778}; 1778};
1779 1779
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index e3544c19bdd2..452d6f2ba21d 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -275,10 +275,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
275 if (d->gc) 275 if (d->gc)
276 return -EBUSY; 276 return -EBUSY;
277 277
278 if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR) 278 numchips = DIV_ROUND_UP(d->revmap_size, irqs_per_chip);
279 return -EINVAL;
280
281 numchips = d->revmap_data.linear.size / irqs_per_chip;
282 if (!numchips) 279 if (!numchips)
283 return -EINVAL; 280 return -EINVAL;
284 281
@@ -310,6 +307,7 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
310 /* Calc pointer to the next generic chip */ 307 /* Calc pointer to the next generic chip */
311 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); 308 tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
312 } 309 }
310 d->name = name;
313 return 0; 311 return 0;
314} 312}
315EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips); 313EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1ed8dff17eb9..706724e9835d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -23,9 +23,11 @@ static DEFINE_MUTEX(revmap_trees_mutex);
23static struct irq_domain *irq_default_domain; 23static struct irq_domain *irq_default_domain;
24 24
25/** 25/**
26 * irq_domain_alloc() - Allocate a new irq_domain data structure 26 * __irq_domain_add() - Allocate a new irq_domain data structure
27 * @of_node: optional device-tree node of the interrupt controller 27 * @of_node: optional device-tree node of the interrupt controller
28 * @revmap_type: type of reverse mapping to use 28 * @size: Size of linear map; 0 for radix mapping only
29 * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no
30 * direct mapping
29 * @ops: map/unmap domain callbacks 31 * @ops: map/unmap domain callbacks
30 * @host_data: Controller private data pointer 32 * @host_data: Controller private data pointer
31 * 33 *
@@ -33,41 +35,35 @@ static struct irq_domain *irq_default_domain;
33 * register allocated irq_domain with irq_domain_register(). Returns pointer 35 * register allocated irq_domain with irq_domain_register(). Returns pointer
34 * to IRQ domain, or NULL on failure. 36 * to IRQ domain, or NULL on failure.
35 */ 37 */
36static struct irq_domain *irq_domain_alloc(struct device_node *of_node, 38struct irq_domain *__irq_domain_add(struct device_node *of_node, int size,
37 unsigned int revmap_type, 39 irq_hw_number_t hwirq_max, int direct_max,
38 const struct irq_domain_ops *ops, 40 const struct irq_domain_ops *ops,
39 void *host_data) 41 void *host_data)
40{ 42{
41 struct irq_domain *domain; 43 struct irq_domain *domain;
42 44
43 domain = kzalloc_node(sizeof(*domain), GFP_KERNEL, 45 domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
44 of_node_to_nid(of_node)); 46 GFP_KERNEL, of_node_to_nid(of_node));
45 if (WARN_ON(!domain)) 47 if (WARN_ON(!domain))
46 return NULL; 48 return NULL;
47 49
48 /* Fill structure */ 50 /* Fill structure */
49 domain->revmap_type = revmap_type; 51 INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
50 domain->ops = ops; 52 domain->ops = ops;
51 domain->host_data = host_data; 53 domain->host_data = host_data;
52 domain->of_node = of_node_get(of_node); 54 domain->of_node = of_node_get(of_node);
55 domain->hwirq_max = hwirq_max;
56 domain->revmap_size = size;
57 domain->revmap_direct_max_irq = direct_max;
53 58
54 return domain;
55}
56
57static void irq_domain_free(struct irq_domain *domain)
58{
59 of_node_put(domain->of_node);
60 kfree(domain);
61}
62
63static void irq_domain_add(struct irq_domain *domain)
64{
65 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
66 list_add(&domain->link, &irq_domain_list); 60 list_add(&domain->link, &irq_domain_list);
67 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
68 pr_debug("Allocated domain of type %d @0x%p\n", 62
69 domain->revmap_type, domain); 63 pr_debug("Added domain %s\n", domain->name);
64 return domain;
70} 65}
66EXPORT_SYMBOL_GPL(__irq_domain_add);
71 67
72/** 68/**
73 * irq_domain_remove() - Remove an irq domain. 69 * irq_domain_remove() - Remove an irq domain.
@@ -81,29 +77,12 @@ void irq_domain_remove(struct irq_domain *domain)
81{ 77{
82 mutex_lock(&irq_domain_mutex); 78 mutex_lock(&irq_domain_mutex);
83 79
84 switch (domain->revmap_type) { 80 /*
85 case IRQ_DOMAIN_MAP_LEGACY: 81 * radix_tree_delete() takes care of destroying the root
86 /* 82 * node when all entries are removed. Shout if there are
87 * Legacy domains don't manage their own irq_desc 83 * any mappings left.
88 * allocations, we expect the caller to handle irq_desc 84 */
89 * freeing on their own. 85 WARN_ON(domain->revmap_tree.height);
90 */
91 break;
92 case IRQ_DOMAIN_MAP_TREE:
93 /*
94 * radix_tree_delete() takes care of destroying the root
95 * node when all entries are removed. Shout if there are
96 * any mappings left.
97 */
98 WARN_ON(domain->revmap_data.tree.height);
99 break;
100 case IRQ_DOMAIN_MAP_LINEAR:
101 kfree(domain->revmap_data.linear.revmap);
102 domain->revmap_data.linear.size = 0;
103 break;
104 case IRQ_DOMAIN_MAP_NOMAP:
105 break;
106 }
107 86
108 list_del(&domain->link); 87 list_del(&domain->link);
109 88
@@ -115,44 +94,30 @@ void irq_domain_remove(struct irq_domain *domain)
115 94
116 mutex_unlock(&irq_domain_mutex); 95 mutex_unlock(&irq_domain_mutex);
117 96
118 pr_debug("Removed domain of type %d @0x%p\n", 97 pr_debug("Removed domain %s\n", domain->name);
119 domain->revmap_type, domain);
120 98
121 irq_domain_free(domain); 99 of_node_put(domain->of_node);
100 kfree(domain);
122} 101}
123EXPORT_SYMBOL_GPL(irq_domain_remove); 102EXPORT_SYMBOL_GPL(irq_domain_remove);
124 103
125static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
126 irq_hw_number_t hwirq)
127{
128 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
129 int size = domain->revmap_data.legacy.size;
130
131 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
132 return 0;
133 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
134}
135
136/** 104/**
137 * irq_domain_add_simple() - Allocate and register a simple irq_domain. 105 * irq_domain_add_simple() - Register an irq_domain and optionally map a range of irqs
138 * @of_node: pointer to interrupt controller's device tree node. 106 * @of_node: pointer to interrupt controller's device tree node.
139 * @size: total number of irqs in mapping 107 * @size: total number of irqs in mapping
140 * @first_irq: first number of irq block assigned to the domain, 108 * @first_irq: first number of irq block assigned to the domain,
141 * pass zero to assign irqs on-the-fly. This will result in a 109 * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then
142 * linear IRQ domain so it is important to use irq_create_mapping() 110 * pre-map all of the irqs in the domain to virqs starting at first_irq.
143 * for each used IRQ, especially when SPARSE_IRQ is enabled.
144 * @ops: map/unmap domain callbacks 111 * @ops: map/unmap domain callbacks
145 * @host_data: Controller private data pointer 112 * @host_data: Controller private data pointer
146 * 113 *
147 * Allocates a legacy irq_domain if irq_base is positive or a linear 114 * Allocates an irq_domain, and optionally if first_irq is positive then also
148 * domain otherwise. For the legacy domain, IRQ descriptors will also 115 * allocate irq_descs and map all of the hwirqs to virqs starting at first_irq.
149 * be allocated.
150 * 116 *
151 * This is intended to implement the expected behaviour for most 117 * This is intended to implement the expected behaviour for most
152 * interrupt controllers which is that a linear mapping should 118 * interrupt controllers. If device tree is used, then first_irq will be 0 and
153 * normally be used unless the system requires a legacy mapping in 119 * irqs get mapped dynamically on the fly. However, if the controller requires
154 * order to support supplying interrupt numbers during non-DT 120 * static virq assignments (non-DT boot) then it will set that up correctly.
155 * registration of devices.
156 */ 121 */
157struct irq_domain *irq_domain_add_simple(struct device_node *of_node, 122struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
158 unsigned int size, 123 unsigned int size,
@@ -160,33 +125,25 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
160 const struct irq_domain_ops *ops, 125 const struct irq_domain_ops *ops,
161 void *host_data) 126 void *host_data)
162{ 127{
163 if (first_irq > 0) { 128 struct irq_domain *domain;
164 int irq_base; 129
130 domain = __irq_domain_add(of_node, size, size, 0, ops, host_data);
131 if (!domain)
132 return NULL;
165 133
134 if (first_irq > 0) {
166 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) { 135 if (IS_ENABLED(CONFIG_SPARSE_IRQ)) {
167 /* 136 /* attempt to allocated irq_descs */
168 * Set the descriptor allocator to search for a 137 int rc = irq_alloc_descs(first_irq, first_irq, size,
169 * 1-to-1 mapping, such as irq_alloc_desc_at(). 138 of_node_to_nid(of_node));
170 * Use of_node_to_nid() which is defined to 139 if (rc < 0)
171 * numa_node_id() on platforms that have no custom
172 * implementation.
173 */
174 irq_base = irq_alloc_descs(first_irq, first_irq, size,
175 of_node_to_nid(of_node));
176 if (irq_base < 0) {
177 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", 140 pr_info("Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
178 first_irq); 141 first_irq);
179 irq_base = first_irq; 142 }
180 } 143 irq_domain_associate_many(domain, first_irq, 0, size);
181 } else
182 irq_base = first_irq;
183
184 return irq_domain_add_legacy(of_node, size, irq_base, 0,
185 ops, host_data);
186 } 144 }
187 145
188 /* A linear domain is the default */ 146 return domain;
189 return irq_domain_add_linear(of_node, size, ops, host_data);
190} 147}
191EXPORT_SYMBOL_GPL(irq_domain_add_simple); 148EXPORT_SYMBOL_GPL(irq_domain_add_simple);
192 149
@@ -213,131 +170,19 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
213 void *host_data) 170 void *host_data)
214{ 171{
215 struct irq_domain *domain; 172 struct irq_domain *domain;
216 unsigned int i;
217 173
218 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); 174 domain = __irq_domain_add(of_node, first_hwirq + size,
175 first_hwirq + size, 0, ops, host_data);
219 if (!domain) 176 if (!domain)
220 return NULL; 177 return NULL;
221 178
222 domain->revmap_data.legacy.first_irq = first_irq; 179 irq_domain_associate_many(domain, first_irq, first_hwirq, size);
223 domain->revmap_data.legacy.first_hwirq = first_hwirq;
224 domain->revmap_data.legacy.size = size;
225
226 mutex_lock(&irq_domain_mutex);
227 /* Verify that all the irqs are available */
228 for (i = 0; i < size; i++) {
229 int irq = first_irq + i;
230 struct irq_data *irq_data = irq_get_irq_data(irq);
231
232 if (WARN_ON(!irq_data || irq_data->domain)) {
233 mutex_unlock(&irq_domain_mutex);
234 irq_domain_free(domain);
235 return NULL;
236 }
237 }
238
239 /* Claim all of the irqs before registering a legacy domain */
240 for (i = 0; i < size; i++) {
241 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
242 irq_data->hwirq = first_hwirq + i;
243 irq_data->domain = domain;
244 }
245 mutex_unlock(&irq_domain_mutex);
246
247 for (i = 0; i < size; i++) {
248 int irq = first_irq + i;
249 int hwirq = first_hwirq + i;
250
251 /* IRQ0 gets ignored */
252 if (!irq)
253 continue;
254
255 /* Legacy flags are left to default at this point,
256 * one can then use irq_create_mapping() to
257 * explicitly change them
258 */
259 if (ops->map)
260 ops->map(domain, irq, hwirq);
261
262 /* Clear norequest flags */
263 irq_clear_status_flags(irq, IRQ_NOREQUEST);
264 }
265 180
266 irq_domain_add(domain);
267 return domain; 181 return domain;
268} 182}
269EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 183EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
270 184
271/** 185/**
272 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
273 * @of_node: pointer to interrupt controller's device tree node.
274 * @size: Number of interrupts in the domain.
275 * @ops: map/unmap domain callbacks
276 * @host_data: Controller private data pointer
277 */
278struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
279 unsigned int size,
280 const struct irq_domain_ops *ops,
281 void *host_data)
282{
283 struct irq_domain *domain;
284 unsigned int *revmap;
285
286 revmap = kzalloc_node(sizeof(*revmap) * size, GFP_KERNEL,
287 of_node_to_nid(of_node));
288 if (WARN_ON(!revmap))
289 return NULL;
290
291 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
292 if (!domain) {
293 kfree(revmap);
294 return NULL;
295 }
296 domain->revmap_data.linear.size = size;
297 domain->revmap_data.linear.revmap = revmap;
298 irq_domain_add(domain);
299 return domain;
300}
301EXPORT_SYMBOL_GPL(irq_domain_add_linear);
302
303struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
304 unsigned int max_irq,
305 const struct irq_domain_ops *ops,
306 void *host_data)
307{
308 struct irq_domain *domain = irq_domain_alloc(of_node,
309 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
310 if (domain) {
311 domain->revmap_data.nomap.max_irq = max_irq ? max_irq : ~0;
312 irq_domain_add(domain);
313 }
314 return domain;
315}
316EXPORT_SYMBOL_GPL(irq_domain_add_nomap);
317
318/**
319 * irq_domain_add_tree()
320 * @of_node: pointer to interrupt controller's device tree node.
321 * @ops: map/unmap domain callbacks
322 *
323 * Note: The radix tree will be allocated later during boot automatically
324 * (the reverse mapping will use the slow path until that happens).
325 */
326struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
327 const struct irq_domain_ops *ops,
328 void *host_data)
329{
330 struct irq_domain *domain = irq_domain_alloc(of_node,
331 IRQ_DOMAIN_MAP_TREE, ops, host_data);
332 if (domain) {
333 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
334 irq_domain_add(domain);
335 }
336 return domain;
337}
338EXPORT_SYMBOL_GPL(irq_domain_add_tree);
339
340/**
341 * irq_find_host() - Locates a domain for a given device node 186 * irq_find_host() - Locates a domain for a given device node
342 * @node: device-tree node of the interrupt controller 187 * @node: device-tree node of the interrupt controller
343 */ 188 */
@@ -385,125 +230,108 @@ void irq_set_default_host(struct irq_domain *domain)
385} 230}
386EXPORT_SYMBOL_GPL(irq_set_default_host); 231EXPORT_SYMBOL_GPL(irq_set_default_host);
387 232
388static void irq_domain_disassociate_many(struct irq_domain *domain, 233static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
389 unsigned int irq_base, int count)
390{ 234{
391 /* 235 struct irq_data *irq_data = irq_get_irq_data(irq);
392 * disassociate in reverse order; 236 irq_hw_number_t hwirq;
393 * not strictly necessary, but nice for unwinding
394 */
395 while (count--) {
396 int irq = irq_base + count;
397 struct irq_data *irq_data = irq_get_irq_data(irq);
398 irq_hw_number_t hwirq;
399 237
400 if (WARN_ON(!irq_data || irq_data->domain != domain)) 238 if (WARN(!irq_data || irq_data->domain != domain,
401 continue; 239 "virq%i doesn't exist; cannot disassociate\n", irq))
240 return;
402 241
403 hwirq = irq_data->hwirq; 242 hwirq = irq_data->hwirq;
404 irq_set_status_flags(irq, IRQ_NOREQUEST); 243 irq_set_status_flags(irq, IRQ_NOREQUEST);
405 244
406 /* remove chip and handler */ 245 /* remove chip and handler */
407 irq_set_chip_and_handler(irq, NULL, NULL); 246 irq_set_chip_and_handler(irq, NULL, NULL);
408 247
409 /* Make sure it's completed */ 248 /* Make sure it's completed */
410 synchronize_irq(irq); 249 synchronize_irq(irq);
411 250
412 /* Tell the PIC about it */ 251 /* Tell the PIC about it */
413 if (domain->ops->unmap) 252 if (domain->ops->unmap)
414 domain->ops->unmap(domain, irq); 253 domain->ops->unmap(domain, irq);
415 smp_mb(); 254 smp_mb();
416 255
417 irq_data->domain = NULL; 256 irq_data->domain = NULL;
418 irq_data->hwirq = 0; 257 irq_data->hwirq = 0;
419 258
420 /* Clear reverse map */ 259 /* Clear reverse map for this hwirq */
421 switch(domain->revmap_type) { 260 if (hwirq < domain->revmap_size) {
422 case IRQ_DOMAIN_MAP_LINEAR: 261 domain->linear_revmap[hwirq] = 0;
423 if (hwirq < domain->revmap_data.linear.size) 262 } else {
424 domain->revmap_data.linear.revmap[hwirq] = 0; 263 mutex_lock(&revmap_trees_mutex);
425 break; 264 radix_tree_delete(&domain->revmap_tree, hwirq);
426 case IRQ_DOMAIN_MAP_TREE: 265 mutex_unlock(&revmap_trees_mutex);
427 mutex_lock(&revmap_trees_mutex);
428 radix_tree_delete(&domain->revmap_data.tree, hwirq);
429 mutex_unlock(&revmap_trees_mutex);
430 break;
431 }
432 } 266 }
433} 267}
434 268
435int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, 269int irq_domain_associate(struct irq_domain *domain, unsigned int virq,
436 irq_hw_number_t hwirq_base, int count) 270 irq_hw_number_t hwirq)
437{ 271{
438 unsigned int virq = irq_base; 272 struct irq_data *irq_data = irq_get_irq_data(virq);
439 irq_hw_number_t hwirq = hwirq_base; 273 int ret;
440 int i, ret;
441 274
442 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__, 275 if (WARN(hwirq >= domain->hwirq_max,
443 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count); 276 "error: hwirq 0x%x is too large for %s\n", (int)hwirq, domain->name))
277 return -EINVAL;
278 if (WARN(!irq_data, "error: virq%i is not allocated", virq))
279 return -EINVAL;
280 if (WARN(irq_data->domain, "error: virq%i is already associated", virq))
281 return -EINVAL;
444 282
445 for (i = 0; i < count; i++) { 283 mutex_lock(&irq_domain_mutex);
446 struct irq_data *irq_data = irq_get_irq_data(virq + i); 284 irq_data->hwirq = hwirq;
447 285 irq_data->domain = domain;
448 if (WARN(!irq_data, "error: irq_desc not allocated; " 286 if (domain->ops->map) {
449 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 287 ret = domain->ops->map(domain, virq, hwirq);
450 return -EINVAL; 288 if (ret != 0) {
451 if (WARN(irq_data->domain, "error: irq_desc already associated; " 289 /*
452 "irq=%i hwirq=0x%x\n", virq + i, (int)hwirq + i)) 290 * If map() returns -EPERM, this interrupt is protected
453 return -EINVAL; 291 * by the firmware or some other service and shall not
454 }; 292 * be mapped. Don't bother telling the user about it.
455 293 */
456 for (i = 0; i < count; i++, virq++, hwirq++) { 294 if (ret != -EPERM) {
457 struct irq_data *irq_data = irq_get_irq_data(virq); 295 pr_info("%s didn't like hwirq-0x%lx to VIRQ%i mapping (rc=%d)\n",
458 296 domain->name, hwirq, virq, ret);
459 irq_data->hwirq = hwirq;
460 irq_data->domain = domain;
461 if (domain->ops->map) {
462 ret = domain->ops->map(domain, virq, hwirq);
463 if (ret != 0) {
464 /*
465 * If map() returns -EPERM, this interrupt is protected
466 * by the firmware or some other service and shall not
467 * be mapped.
468 *
469 * Since on some platforms we blindly try to map everything
470 * we end up with a log full of backtraces.
471 *
472 * So instead, we silently fail on -EPERM, it is the
473 * responsibility of the PIC driver to display a relevant
474 * message if needed.
475 */
476 if (ret != -EPERM) {
477 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
478 virq, hwirq, ret);
479 WARN_ON(1);
480 }
481 irq_data->domain = NULL;
482 irq_data->hwirq = 0;
483 goto err_unmap;
484 } 297 }
298 irq_data->domain = NULL;
299 irq_data->hwirq = 0;
300 mutex_unlock(&irq_domain_mutex);
301 return ret;
485 } 302 }
486 303
487 switch (domain->revmap_type) { 304 /* If not already assigned, give the domain the chip's name */
488 case IRQ_DOMAIN_MAP_LINEAR: 305 if (!domain->name && irq_data->chip)
489 if (hwirq < domain->revmap_data.linear.size) 306 domain->name = irq_data->chip->name;
490 domain->revmap_data.linear.revmap[hwirq] = virq; 307 }
491 break;
492 case IRQ_DOMAIN_MAP_TREE:
493 mutex_lock(&revmap_trees_mutex);
494 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
495 mutex_unlock(&revmap_trees_mutex);
496 break;
497 }
498 308
499 irq_clear_status_flags(virq, IRQ_NOREQUEST); 309 if (hwirq < domain->revmap_size) {
310 domain->linear_revmap[hwirq] = virq;
311 } else {
312 mutex_lock(&revmap_trees_mutex);
313 radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
314 mutex_unlock(&revmap_trees_mutex);
500 } 315 }
316 mutex_unlock(&irq_domain_mutex);
317
318 irq_clear_status_flags(virq, IRQ_NOREQUEST);
501 319
502 return 0; 320 return 0;
321}
322EXPORT_SYMBOL_GPL(irq_domain_associate);
503 323
504 err_unmap: 324void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
505 irq_domain_disassociate_many(domain, irq_base, i); 325 irq_hw_number_t hwirq_base, int count)
506 return -EINVAL; 326{
327 int i;
328
329 pr_debug("%s(%s, irqbase=%i, hwbase=%i, count=%i)\n", __func__,
330 of_node_full_name(domain->of_node), irq_base, (int)hwirq_base, count);
331
332 for (i = 0; i < count; i++) {
333 irq_domain_associate(domain, irq_base + i, hwirq_base + i);
334 }
507} 335}
508EXPORT_SYMBOL_GPL(irq_domain_associate_many); 336EXPORT_SYMBOL_GPL(irq_domain_associate_many);
509 337
@@ -513,7 +341,9 @@ EXPORT_SYMBOL_GPL(irq_domain_associate_many);
513 * 341 *
514 * This routine is used for irq controllers which can choose the hardware 342 * This routine is used for irq controllers which can choose the hardware
515 * interrupt numbers they generate. In such a case it's simplest to use 343 * interrupt numbers they generate. In such a case it's simplest to use
516 * the linux irq as the hardware interrupt number. 344 * the linux irq as the hardware interrupt number. It still uses the linear
345 * or radix tree to store the mapping, but the irq controller can optimize
346 * the revmap path by using the hwirq directly.
517 */ 347 */
518unsigned int irq_create_direct_mapping(struct irq_domain *domain) 348unsigned int irq_create_direct_mapping(struct irq_domain *domain)
519{ 349{
@@ -522,17 +352,14 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
522 if (domain == NULL) 352 if (domain == NULL)
523 domain = irq_default_domain; 353 domain = irq_default_domain;
524 354
525 if (WARN_ON(!domain || domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP))
526 return 0;
527
528 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); 355 virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node));
529 if (!virq) { 356 if (!virq) {
530 pr_debug("create_direct virq allocation failed\n"); 357 pr_debug("create_direct virq allocation failed\n");
531 return 0; 358 return 0;
532 } 359 }
533 if (virq >= domain->revmap_data.nomap.max_irq) { 360 if (virq >= domain->revmap_direct_max_irq) {
534 pr_err("ERROR: no free irqs available below %i maximum\n", 361 pr_err("ERROR: no free irqs available below %i maximum\n",
535 domain->revmap_data.nomap.max_irq); 362 domain->revmap_direct_max_irq);
536 irq_free_desc(virq); 363 irq_free_desc(virq);
537 return 0; 364 return 0;
538 } 365 }
@@ -569,9 +396,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
569 if (domain == NULL) 396 if (domain == NULL)
570 domain = irq_default_domain; 397 domain = irq_default_domain;
571 if (domain == NULL) { 398 if (domain == NULL) {
572 pr_warning("irq_create_mapping called for" 399 WARN(1, "%s(, %lx) called with NULL domain\n", __func__, hwirq);
573 " NULL domain, hwirq=%lx\n", hwirq);
574 WARN_ON(1);
575 return 0; 400 return 0;
576 } 401 }
577 pr_debug("-> using domain @%p\n", domain); 402 pr_debug("-> using domain @%p\n", domain);
@@ -583,10 +408,6 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
583 return virq; 408 return virq;
584 } 409 }
585 410
586 /* Get a virtual interrupt number */
587 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
588 return irq_domain_legacy_revmap(domain, hwirq);
589
590 /* Allocate a virtual interrupt number */ 411 /* Allocate a virtual interrupt number */
591 hint = hwirq % nr_irqs; 412 hint = hwirq % nr_irqs;
592 if (hint == 0) 413 if (hint == 0)
@@ -639,12 +460,7 @@ int irq_create_strict_mappings(struct irq_domain *domain, unsigned int irq_base,
639 if (unlikely(ret < 0)) 460 if (unlikely(ret < 0))
640 return ret; 461 return ret;
641 462
642 ret = irq_domain_associate_many(domain, irq_base, hwirq_base, count); 463 irq_domain_associate_many(domain, irq_base, hwirq_base, count);
643 if (unlikely(ret < 0)) {
644 irq_free_descs(irq_base, count);
645 return ret;
646 }
647
648 return 0; 464 return 0;
649} 465}
650EXPORT_SYMBOL_GPL(irq_create_strict_mappings); 466EXPORT_SYMBOL_GPL(irq_create_strict_mappings);
@@ -659,20 +475,8 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
659 475
660 domain = controller ? irq_find_host(controller) : irq_default_domain; 476 domain = controller ? irq_find_host(controller) : irq_default_domain;
661 if (!domain) { 477 if (!domain) {
662#ifdef CONFIG_MIPS 478 pr_warn("no irq domain found for %s !\n",
663 /* 479 of_node_full_name(controller));
664 * Workaround to avoid breaking interrupt controller drivers
665 * that don't yet register an irq_domain. This is temporary
666 * code. ~~~gcl, Feb 24, 2012
667 *
668 * Scheduled for removal in Linux v3.6. That should be enough
669 * time.
670 */
671 if (intsize > 0)
672 return intspec[0];
673#endif
674 pr_warning("no irq domain found for %s !\n",
675 of_node_full_name(controller));
676 return 0; 480 return 0;
677 } 481 }
678 482
@@ -714,11 +518,7 @@ void irq_dispose_mapping(unsigned int virq)
714 if (WARN_ON(domain == NULL)) 518 if (WARN_ON(domain == NULL))
715 return; 519 return;
716 520
717 /* Never unmap legacy interrupts */ 521 irq_domain_disassociate(domain, virq);
718 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
719 return;
720
721 irq_domain_disassociate_many(domain, virq, 1);
722 irq_free_desc(virq); 522 irq_free_desc(virq);
723} 523}
724EXPORT_SYMBOL_GPL(irq_dispose_mapping); 524EXPORT_SYMBOL_GPL(irq_dispose_mapping);
@@ -739,63 +539,51 @@ unsigned int irq_find_mapping(struct irq_domain *domain,
739 if (domain == NULL) 539 if (domain == NULL)
740 return 0; 540 return 0;
741 541
742 switch (domain->revmap_type) { 542 if (hwirq < domain->revmap_direct_max_irq) {
743 case IRQ_DOMAIN_MAP_LEGACY:
744 return irq_domain_legacy_revmap(domain, hwirq);
745 case IRQ_DOMAIN_MAP_LINEAR:
746 return irq_linear_revmap(domain, hwirq);
747 case IRQ_DOMAIN_MAP_TREE:
748 rcu_read_lock();
749 data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
750 rcu_read_unlock();
751 if (data)
752 return data->irq;
753 break;
754 case IRQ_DOMAIN_MAP_NOMAP:
755 data = irq_get_irq_data(hwirq); 543 data = irq_get_irq_data(hwirq);
756 if (data && (data->domain == domain) && (data->hwirq == hwirq)) 544 if (data && (data->domain == domain) && (data->hwirq == hwirq))
757 return hwirq; 545 return hwirq;
758 break;
759 } 546 }
760 547
761 return 0; 548 /* Check if the hwirq is in the linear revmap. */
762} 549 if (hwirq < domain->revmap_size)
763EXPORT_SYMBOL_GPL(irq_find_mapping); 550 return domain->linear_revmap[hwirq];
764 551
765/** 552 rcu_read_lock();
766 * irq_linear_revmap() - Find a linux irq from a hw irq number. 553 data = radix_tree_lookup(&domain->revmap_tree, hwirq);
767 * @domain: domain owning this hardware interrupt 554 rcu_read_unlock();
768 * @hwirq: hardware irq number in that domain space 555 return data ? data->irq : 0;
769 *
770 * This is a fast path that can be called directly by irq controller code to
771 * save a handful of instructions.
772 */
773unsigned int irq_linear_revmap(struct irq_domain *domain,
774 irq_hw_number_t hwirq)
775{
776 BUG_ON(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR);
777
778 /* Check revmap bounds; complain if exceeded */
779 if (WARN_ON(hwirq >= domain->revmap_data.linear.size))
780 return 0;
781
782 return domain->revmap_data.linear.revmap[hwirq];
783} 556}
784EXPORT_SYMBOL_GPL(irq_linear_revmap); 557EXPORT_SYMBOL_GPL(irq_find_mapping);
785 558
786#ifdef CONFIG_IRQ_DOMAIN_DEBUG 559#ifdef CONFIG_IRQ_DOMAIN_DEBUG
787static int virq_debug_show(struct seq_file *m, void *private) 560static int virq_debug_show(struct seq_file *m, void *private)
788{ 561{
789 unsigned long flags; 562 unsigned long flags;
790 struct irq_desc *desc; 563 struct irq_desc *desc;
791 const char *p; 564 struct irq_domain *domain;
792 static const char none[] = "none"; 565 struct radix_tree_iter iter;
793 void *data; 566 void *data, **slot;
794 int i; 567 int i;
795 568
796 seq_printf(m, "%-5s %-7s %-15s %-*s %s\n", "irq", "hwirq", 569 seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
570 "name", "mapped", "linear-max", "direct-max", "devtree-node");
571 mutex_lock(&irq_domain_mutex);
572 list_for_each_entry(domain, &irq_domain_list, link) {
573 int count = 0;
574 radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0)
575 count++;
576 seq_printf(m, "%c%-16s %6u %10u %10u %s\n",
577 domain == irq_default_domain ? '*' : ' ', domain->name,
578 domain->revmap_size + count, domain->revmap_size,
579 domain->revmap_direct_max_irq,
580 domain->of_node ? of_node_full_name(domain->of_node) : "");
581 }
582 mutex_unlock(&irq_domain_mutex);
583
584 seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq",
797 "chip name", (int)(2 * sizeof(void *) + 2), "chip data", 585 "chip name", (int)(2 * sizeof(void *) + 2), "chip data",
798 "domain name"); 586 "active", "type", "domain");
799 587
800 for (i = 1; i < nr_irqs; i++) { 588 for (i = 1; i < nr_irqs; i++) {
801 desc = irq_to_desc(i); 589 desc = irq_to_desc(i);
@@ -803,28 +591,28 @@ static int virq_debug_show(struct seq_file *m, void *private)
803 continue; 591 continue;
804 592
805 raw_spin_lock_irqsave(&desc->lock, flags); 593 raw_spin_lock_irqsave(&desc->lock, flags);
594 domain = desc->irq_data.domain;
806 595
807 if (desc->action && desc->action->handler) { 596 if (domain) {
808 struct irq_chip *chip; 597 struct irq_chip *chip;
598 int hwirq = desc->irq_data.hwirq;
599 bool direct;
809 600
810 seq_printf(m, "%5d ", i); 601 seq_printf(m, "%5d ", i);
811 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); 602 seq_printf(m, "0x%05x ", hwirq);
812 603
813 chip = irq_desc_get_chip(desc); 604 chip = irq_desc_get_chip(desc);
814 if (chip && chip->name) 605 seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none");
815 p = chip->name;
816 else
817 p = none;
818 seq_printf(m, "%-15s ", p);
819 606
820 data = irq_desc_get_chip_data(desc); 607 data = irq_desc_get_chip_data(desc);
821 seq_printf(m, data ? "0x%p " : " %p ", data); 608 seq_printf(m, data ? "0x%p " : " %p ", data);
822 609
823 if (desc->irq_data.domain) 610 seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' ');
824 p = of_node_full_name(desc->irq_data.domain->of_node); 611 direct = (i == hwirq) && (i < domain->revmap_direct_max_irq);
825 else 612 seq_printf(m, "%6s%-8s ",
826 p = none; 613 (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX",
827 seq_printf(m, "%s\n", p); 614 direct ? "(DIRECT)" : "");
615 seq_printf(m, "%s\n", desc->irq_data.domain->name);
828 } 616 }
829 617
830 raw_spin_unlock_irqrestore(&desc->lock, flags); 618 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -921,18 +709,3 @@ const struct irq_domain_ops irq_domain_simple_ops = {
921 .xlate = irq_domain_xlate_onetwocell, 709 .xlate = irq_domain_xlate_onetwocell,
922}; 710};
923EXPORT_SYMBOL_GPL(irq_domain_simple_ops); 711EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
924
925#ifdef CONFIG_OF_IRQ
926void irq_domain_generate_simple(const struct of_device_id *match,
927 u64 phys_base, unsigned int irq_start)
928{
929 struct device_node *node;
930 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
931 (unsigned long long) phys_base, (int) irq_start);
932 node = of_find_matching_node_by_address(NULL, match, phys_base);
933 if (node)
934 irq_domain_add_legacy(node, 32, irq_start, 0,
935 &irq_domain_simple_ops, NULL);
936}
937EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
938#endif
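The irqdomain rework above folds the linear revmap into the domain allocation (__irq_domain_add()) and drops the per-revmap_type special cases. Purely as an illustrative sketch, and not part of this patch, a linear registration on top of the reworked API might look like the following; foo_irq_map(), foo_irq_ops and foo_init() are invented names, while the called helpers all appear in the diff above or elsewhere in the kernel.

/* Hypothetical driver sketch; names prefixed foo_ are assumptions. */
static int foo_irq_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
	return 0;
}

static const struct irq_domain_ops foo_irq_ops = {
	.map   = foo_irq_map,
	.xlate = irq_domain_xlate_onetwocell,
};

static int foo_init(struct device_node *np)
{
	/* size == hwirq_max == 32, no direct mapping; revmap array is inline */
	struct irq_domain *d = irq_domain_add_simple(np, 32, 0,
						     &foo_irq_ops, NULL);
	if (!d)
		return -ENOMEM;

	/* mappings are created on demand and recorded in the linear revmap */
	return irq_create_mapping(d, 7) ? 0 : -EINVAL;
}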
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 19ed5c425c3b..36f6ee181b0c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -462,6 +462,8 @@ int show_interrupts(struct seq_file *p, void *v)
462 } else { 462 } else {
463 seq_printf(p, " %8s", "None"); 463 seq_printf(p, " %8s", "None");
464 } 464 }
465 if (desc->irq_data.domain)
466 seq_printf(p, " %*d", prec, (int) desc->irq_data.hwirq);
465#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL 467#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
466 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge"); 468 seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
467#endif 469#endif
diff --git a/kernel/module.c b/kernel/module.c
index cab4bce49c23..206915830d29 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -455,7 +455,7 @@ const struct kernel_symbol *find_symbol(const char *name,
455EXPORT_SYMBOL_GPL(find_symbol); 455EXPORT_SYMBOL_GPL(find_symbol);
456 456
457/* Search for module by name: must hold module_mutex. */ 457/* Search for module by name: must hold module_mutex. */
458static struct module *find_module_all(const char *name, 458static struct module *find_module_all(const char *name, size_t len,
459 bool even_unformed) 459 bool even_unformed)
460{ 460{
461 struct module *mod; 461 struct module *mod;
@@ -463,7 +463,7 @@ static struct module *find_module_all(const char *name,
463 list_for_each_entry(mod, &modules, list) { 463 list_for_each_entry(mod, &modules, list) {
464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 464 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
465 continue; 465 continue;
466 if (strcmp(mod->name, name) == 0) 466 if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
467 return mod; 467 return mod;
468 } 468 }
469 return NULL; 469 return NULL;
@@ -471,7 +471,7 @@ static struct module *find_module_all(const char *name,
471 471
472struct module *find_module(const char *name) 472struct module *find_module(const char *name)
473{ 473{
474 return find_module_all(name, false); 474 return find_module_all(name, strlen(name), false);
475} 475}
476EXPORT_SYMBOL_GPL(find_module); 476EXPORT_SYMBOL_GPL(find_module);
477 477
@@ -482,23 +482,28 @@ static inline void __percpu *mod_percpu(struct module *mod)
482 return mod->percpu; 482 return mod->percpu;
483} 483}
484 484
485static int percpu_modalloc(struct module *mod, 485static int percpu_modalloc(struct module *mod, struct load_info *info)
486 unsigned long size, unsigned long align)
487{ 486{
487 Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
488 unsigned long align = pcpusec->sh_addralign;
489
490 if (!pcpusec->sh_size)
491 return 0;
492
488 if (align > PAGE_SIZE) { 493 if (align > PAGE_SIZE) {
489 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 494 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
490 mod->name, align, PAGE_SIZE); 495 mod->name, align, PAGE_SIZE);
491 align = PAGE_SIZE; 496 align = PAGE_SIZE;
492 } 497 }
493 498
494 mod->percpu = __alloc_reserved_percpu(size, align); 499 mod->percpu = __alloc_reserved_percpu(pcpusec->sh_size, align);
495 if (!mod->percpu) { 500 if (!mod->percpu) {
496 printk(KERN_WARNING 501 printk(KERN_WARNING
497 "%s: Could not allocate %lu bytes percpu data\n", 502 "%s: Could not allocate %lu bytes percpu data\n",
498 mod->name, size); 503 mod->name, (unsigned long)pcpusec->sh_size);
499 return -ENOMEM; 504 return -ENOMEM;
500 } 505 }
501 mod->percpu_size = size; 506 mod->percpu_size = pcpusec->sh_size;
502 return 0; 507 return 0;
503} 508}
504 509
@@ -563,10 +568,12 @@ static inline void __percpu *mod_percpu(struct module *mod)
563{ 568{
564 return NULL; 569 return NULL;
565} 570}
566static inline int percpu_modalloc(struct module *mod, 571static int percpu_modalloc(struct module *mod, struct load_info *info)
567 unsigned long size, unsigned long align)
568{ 572{
569 return -ENOMEM; 573 /* UP modules shouldn't have this section: ENOMEM isn't quite right */
574 if (info->sechdrs[info->index.pcpu].sh_size != 0)
575 return -ENOMEM;
576 return 0;
570} 577}
571static inline void percpu_modfree(struct module *mod) 578static inline void percpu_modfree(struct module *mod)
572{ 579{
@@ -2927,7 +2934,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2927{ 2934{
2928 /* Module within temporary copy. */ 2935 /* Module within temporary copy. */
2929 struct module *mod; 2936 struct module *mod;
2930 Elf_Shdr *pcpusec;
2931 int err; 2937 int err;
2932 2938
2933 mod = setup_load_info(info, flags); 2939 mod = setup_load_info(info, flags);
@@ -2942,17 +2948,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2942 err = module_frob_arch_sections(info->hdr, info->sechdrs, 2948 err = module_frob_arch_sections(info->hdr, info->sechdrs,
2943 info->secstrings, mod); 2949 info->secstrings, mod);
2944 if (err < 0) 2950 if (err < 0)
2945 goto out; 2951 return ERR_PTR(err);
2946 2952
2947 pcpusec = &info->sechdrs[info->index.pcpu]; 2953 /* We will do a special allocation for per-cpu sections later. */
2948 if (pcpusec->sh_size) { 2954 info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
2949 /* We have a special allocation for this section. */
2950 err = percpu_modalloc(mod,
2951 pcpusec->sh_size, pcpusec->sh_addralign);
2952 if (err)
2953 goto out;
2954 pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
2955 }
2956 2955
2957 /* Determine total sizes, and put offsets in sh_entsize. For now 2956 /* Determine total sizes, and put offsets in sh_entsize. For now
2958 this is done generically; there doesn't appear to be any 2957 this is done generically; there doesn't appear to be any
@@ -2963,17 +2962,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
2963 /* Allocate and move to the final place */ 2962 /* Allocate and move to the final place */
2964 err = move_module(mod, info); 2963 err = move_module(mod, info);
2965 if (err) 2964 if (err)
2966 goto free_percpu; 2965 return ERR_PTR(err);
2967 2966
2968 /* Module has been copied to its final place now: return it. */ 2967 /* Module has been copied to its final place now: return it. */
2969 mod = (void *)info->sechdrs[info->index.mod].sh_addr; 2968 mod = (void *)info->sechdrs[info->index.mod].sh_addr;
2970 kmemleak_load_module(mod, info); 2969 kmemleak_load_module(mod, info);
2971 return mod; 2970 return mod;
2972
2973free_percpu:
2974 percpu_modfree(mod);
2975out:
2976 return ERR_PTR(err);
2977} 2971}
2978 2972
2979/* mod is no longer valid after this! */ 2973/* mod is no longer valid after this! */
@@ -3014,7 +3008,7 @@ static bool finished_loading(const char *name)
3014 bool ret; 3008 bool ret;
3015 3009
3016 mutex_lock(&module_mutex); 3010 mutex_lock(&module_mutex);
3017 mod = find_module_all(name, true); 3011 mod = find_module_all(name, strlen(name), true);
3018 ret = !mod || mod->state == MODULE_STATE_LIVE 3012 ret = !mod || mod->state == MODULE_STATE_LIVE
3019 || mod->state == MODULE_STATE_GOING; 3013 || mod->state == MODULE_STATE_GOING;
3020 mutex_unlock(&module_mutex); 3014 mutex_unlock(&module_mutex);
@@ -3152,7 +3146,8 @@ static int add_unformed_module(struct module *mod)
3152 3146
3153again: 3147again:
3154 mutex_lock(&module_mutex); 3148 mutex_lock(&module_mutex);
3155 if ((old = find_module_all(mod->name, true)) != NULL) { 3149 old = find_module_all(mod->name, strlen(mod->name), true);
3150 if (old != NULL) {
3156 if (old->state == MODULE_STATE_COMING 3151 if (old->state == MODULE_STATE_COMING
3157 || old->state == MODULE_STATE_UNFORMED) { 3152 || old->state == MODULE_STATE_UNFORMED) {
3158 /* Wait in case it fails to load. */ 3153 /* Wait in case it fails to load. */
@@ -3198,6 +3193,17 @@ out:
3198 return err; 3193 return err;
3199} 3194}
3200 3195
3196static int unknown_module_param_cb(char *param, char *val, const char *modname)
3197{
3198 /* Check for magic 'dyndbg' arg */
3199 int ret = ddebug_dyndbg_module_param_cb(param, val, modname);
3200 if (ret != 0) {
3201 printk(KERN_WARNING "%s: unknown parameter '%s' ignored\n",
3202 modname, param);
3203 }
3204 return 0;
3205}
3206
3201/* Allocate and load the module: note that size of section 0 is always 3207/* Allocate and load the module: note that size of section 0 is always
3202 zero, and we rely on this for optional sections. */ 3208 zero, and we rely on this for optional sections. */
3203static int load_module(struct load_info *info, const char __user *uargs, 3209static int load_module(struct load_info *info, const char __user *uargs,
@@ -3237,6 +3243,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3237 } 3243 }
3238#endif 3244#endif
3239 3245
3246 /* To avoid stressing percpu allocator, do this once we're unique. */
3247 err = percpu_modalloc(mod, info);
3248 if (err)
3249 goto unlink_mod;
3250
3240 /* Now module is in final location, initialize linked lists, etc. */ 3251 /* Now module is in final location, initialize linked lists, etc. */
3241 err = module_unload_init(mod); 3252 err = module_unload_init(mod);
3242 if (err) 3253 if (err)
@@ -3284,7 +3295,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
3284 3295
3285 /* Module is ready to execute: parsing args may do that. */ 3296 /* Module is ready to execute: parsing args may do that. */
3286 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, 3297 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
3287 -32768, 32767, &ddebug_dyndbg_module_param_cb); 3298 -32768, 32767, unknown_module_param_cb);
3288 if (err < 0) 3299 if (err < 0)
3289 goto bug_cleanup; 3300 goto bug_cleanup;
3290 3301
@@ -3563,10 +3574,8 @@ unsigned long module_kallsyms_lookup_name(const char *name)
3563 /* Don't lock: we're in enough trouble already. */ 3574 /* Don't lock: we're in enough trouble already. */
3564 preempt_disable(); 3575 preempt_disable();
3565 if ((colon = strchr(name, ':')) != NULL) { 3576 if ((colon = strchr(name, ':')) != NULL) {
3566 *colon = '\0'; 3577 if ((mod = find_module_all(name, colon - name, false)) != NULL)
3567 if ((mod = find_module(name)) != NULL)
3568 ret = mod_find_symname(mod, colon+1); 3578 ret = mod_find_symname(mod, colon+1);
3569 *colon = ':';
3570 } else { 3579 } else {
3571 list_for_each_entry_rcu(mod, &modules, list) { 3580 list_for_each_entry_rcu(mod, &modules, list) {
3572 if (mod->state == MODULE_STATE_UNFORMED) 3581 if (mod->state == MODULE_STATE_UNFORMED)
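The removed lines in module_kallsyms_lookup_name() split a "module:symbol" string by writing a NUL over the colon and restoring it afterwards; with the new length parameter, find_module_all() matches on strlen(mod->name) == len plus memcmp(), so the prefix can be compared without modifying the string. A small stand-alone sketch of that matching idea (the helper and module names are illustrative, not kernel API):

#include <stdio.h>
#include <string.h>

/* Match name[0..len) against a NUL-terminated module name. */
static int name_matches(const char *mod_name, const char *name, size_t len)
{
	return strlen(mod_name) == len && memcmp(mod_name, name, len) == 0;
}

int main(void)
{
	const char *modules[] = { "ext4", "e1000e", "snd_hda_intel" };
	const char *query = "e1000e:e1000e_open";	/* "module:symbol" form */
	const char *colon = strchr(query, ':');
	size_t len = colon ? (size_t)(colon - query) : strlen(query);

	for (size_t i = 0; i < sizeof(modules) / sizeof(modules[0]); i++)
		if (name_matches(modules[i], query, len))
			printf("matched %s (symbol part: %s)\n",
			       modules[i], colon ? colon + 1 : "");
	return 0;
}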
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e581ada5faf4..ff05f4bd86eb 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -18,6 +18,7 @@
18 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/mutex-design.txt.
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/ww_mutex.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/sched/rt.h> 23#include <linux/sched/rt.h>
23#include <linux/export.h> 24#include <linux/export.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index 167ec097ce8b..801864600514 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -15,6 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/random.h> 17#include <linux/random.h>
18#include <linux/ftrace.h>
18#include <linux/reboot.h> 19#include <linux/reboot.h>
19#include <linux/delay.h> 20#include <linux/delay.h>
20#include <linux/kexec.h> 21#include <linux/kexec.h>
@@ -399,8 +400,11 @@ struct slowpath_args {
399static void warn_slowpath_common(const char *file, int line, void *caller, 400static void warn_slowpath_common(const char *file, int line, void *caller,
400 unsigned taint, struct slowpath_args *args) 401 unsigned taint, struct slowpath_args *args)
401{ 402{
402 printk(KERN_WARNING "------------[ cut here ]------------\n"); 403 disable_trace_on_warning();
403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 404
405 pr_warn("------------[ cut here ]------------\n");
406 pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n",
407 raw_smp_processor_id(), current->pid, file, line, caller);
404 408
405 if (args) 409 if (args)
406 vprintk(args->fmt, args->args); 410 vprintk(args->fmt, args->args);
diff --git a/kernel/params.c b/kernel/params.c
index 53b958fcd639..440e65d1a544 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -787,7 +787,7 @@ static void __init kernel_add_sysfs_param(const char *name,
787} 787}
788 788
789/* 789/*
790 * param_sysfs_builtin - add contents in /sys/parameters for built-in modules 790 * param_sysfs_builtin - add sysfs parameters for built-in modules
791 * 791 *
792 * Add module_parameters to sysfs for "modules" built into the kernel. 792 * Add module_parameters to sysfs for "modules" built into the kernel.
793 * 793 *
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 42670e9b44e0..c7f31aa272f7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -51,59 +51,28 @@ static int check_clock(const clockid_t which_clock)
51 return error; 51 return error;
52} 52}
53 53
54static inline union cpu_time_count 54static inline unsigned long long
55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) 55timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
56{ 56{
57 union cpu_time_count ret; 57 unsigned long long ret;
58 ret.sched = 0; /* high half always zero when .cpu used */ 58
59 ret = 0; /* high half always zero when .cpu used */
59 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 60 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
60 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 61 ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
61 } else { 62 } else {
62 ret.cpu = timespec_to_cputime(tp); 63 ret = cputime_to_expires(timespec_to_cputime(tp));
63 } 64 }
64 return ret; 65 return ret;
65} 66}
66 67
67static void sample_to_timespec(const clockid_t which_clock, 68static void sample_to_timespec(const clockid_t which_clock,
68 union cpu_time_count cpu, 69 unsigned long long expires,
69 struct timespec *tp) 70 struct timespec *tp)
70{ 71{
71 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) 72 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
72 *tp = ns_to_timespec(cpu.sched); 73 *tp = ns_to_timespec(expires);
73 else 74 else
74 cputime_to_timespec(cpu.cpu, tp); 75 cputime_to_timespec((__force cputime_t)expires, tp);
75}
76
77static inline int cpu_time_before(const clockid_t which_clock,
78 union cpu_time_count now,
79 union cpu_time_count then)
80{
81 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
82 return now.sched < then.sched;
83 } else {
84 return now.cpu < then.cpu;
85 }
86}
87static inline void cpu_time_add(const clockid_t which_clock,
88 union cpu_time_count *acc,
89 union cpu_time_count val)
90{
91 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
92 acc->sched += val.sched;
93 } else {
94 acc->cpu += val.cpu;
95 }
96}
97static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 union cpu_time_count a,
99 union cpu_time_count b)
100{
101 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
102 a.sched -= b.sched;
103 } else {
104 a.cpu -= b.cpu;
105 }
106 return a;
107} 76}
108 77
109/* 78/*
@@ -111,47 +80,31 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
111 * given the current clock sample. 80 * given the current clock sample.
112 */ 81 */
113static void bump_cpu_timer(struct k_itimer *timer, 82static void bump_cpu_timer(struct k_itimer *timer,
114 union cpu_time_count now) 83 unsigned long long now)
115{ 84{
116 int i; 85 int i;
86 unsigned long long delta, incr;
117 87
118 if (timer->it.cpu.incr.sched == 0) 88 if (timer->it.cpu.incr == 0)
119 return; 89 return;
120 90
121 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 91 if (now < timer->it.cpu.expires)
122 unsigned long long delta, incr; 92 return;
123 93
124 if (now.sched < timer->it.cpu.expires.sched) 94 incr = timer->it.cpu.incr;
125 return; 95 delta = now + incr - timer->it.cpu.expires;
126 incr = timer->it.cpu.incr.sched;
127 delta = now.sched + incr - timer->it.cpu.expires.sched;
128 /* Don't use (incr*2 < delta), incr*2 might overflow. */
129 for (i = 0; incr < delta - incr; i++)
130 incr = incr << 1;
131 for (; i >= 0; incr >>= 1, i--) {
132 if (delta < incr)
133 continue;
134 timer->it.cpu.expires.sched += incr;
135 timer->it_overrun += 1 << i;
136 delta -= incr;
137 }
138 } else {
139 cputime_t delta, incr;
140 96
141 if (now.cpu < timer->it.cpu.expires.cpu) 97 /* Don't use (incr*2 < delta), incr*2 might overflow. */
142 return; 98 for (i = 0; incr < delta - incr; i++)
143 incr = timer->it.cpu.incr.cpu; 99 incr = incr << 1;
144 delta = now.cpu + incr - timer->it.cpu.expires.cpu; 100
145 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 101 for (; i >= 0; incr >>= 1, i--) {
146 for (i = 0; incr < delta - incr; i++) 102 if (delta < incr)
147 incr += incr; 103 continue;
148 for (; i >= 0; incr = incr >> 1, i--) { 104
149 if (delta < incr) 105 timer->it.cpu.expires += incr;
150 continue; 106 timer->it_overrun += 1 << i;
151 timer->it.cpu.expires.cpu += incr; 107 delta -= incr;
152 timer->it_overrun += 1 << i;
153 delta -= incr;
154 }
155 } 108 }
156} 109}
157 110
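The consolidated bump_cpu_timer() above keeps the doubling trick that the two removed per-clock copies carried: incr is doubled while it is still less than half of the backlog, then walked back down, adding 1 << i to the overrun count for every step that still fits, so a timer that is many periods behind catches up in logarithmically many iterations. A stand-alone sketch of the arithmetic with a worked example (plain integers, no timer structures):

#include <stdio.h>

/*
 * Advance 'expires' past 'now' in steps of 'incr', counting the skipped
 * periods with the same doubling scheme as bump_cpu_timer() above.
 */
static unsigned long long catch_up(unsigned long long now,
				   unsigned long long expires,
				   unsigned long long incr,
				   unsigned long long *overruns)
{
	unsigned long long delta;
	int i;

	*overruns = 0;
	if (incr == 0 || now < expires)
		return expires;

	delta = now + incr - expires;
	/* Don't test (incr * 2 < delta): incr * 2 might overflow. */
	for (i = 0; incr < delta - incr; i++)
		incr <<= 1;
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		expires += incr;
		*overruns += 1ULL << i;
		delta -= incr;
	}
	return expires;
}

int main(void)
{
	unsigned long long overruns;
	/* Period-10 timer armed to expire at 100, serviced when the clock reads 175. */
	unsigned long long exp = catch_up(175, 100, 10, &overruns);

	/* Prints: new expiry 180, 8 overruns (expiries 100,110,...,170 were missed). */
	printf("new expiry %llu, %llu overruns\n", exp, overruns);
	return 0;
}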
@@ -170,21 +123,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime)
170 return 0; 123 return 0;
171} 124}
172 125
173static inline cputime_t prof_ticks(struct task_struct *p) 126static inline unsigned long long prof_ticks(struct task_struct *p)
174{ 127{
175 cputime_t utime, stime; 128 cputime_t utime, stime;
176 129
177 task_cputime(p, &utime, &stime); 130 task_cputime(p, &utime, &stime);
178 131
179 return utime + stime; 132 return cputime_to_expires(utime + stime);
180} 133}
181static inline cputime_t virt_ticks(struct task_struct *p) 134static inline unsigned long long virt_ticks(struct task_struct *p)
182{ 135{
183 cputime_t utime; 136 cputime_t utime;
184 137
185 task_cputime(p, &utime, NULL); 138 task_cputime(p, &utime, NULL);
186 139
187 return utime; 140 return cputime_to_expires(utime);
188} 141}
189 142
190static int 143static int
@@ -225,19 +178,19 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
225 * Sample a per-thread clock for the given task. 178 * Sample a per-thread clock for the given task.
226 */ 179 */
227static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, 180static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
228 union cpu_time_count *cpu) 181 unsigned long long *sample)
229{ 182{
230 switch (CPUCLOCK_WHICH(which_clock)) { 183 switch (CPUCLOCK_WHICH(which_clock)) {
231 default: 184 default:
232 return -EINVAL; 185 return -EINVAL;
233 case CPUCLOCK_PROF: 186 case CPUCLOCK_PROF:
234 cpu->cpu = prof_ticks(p); 187 *sample = prof_ticks(p);
235 break; 188 break;
236 case CPUCLOCK_VIRT: 189 case CPUCLOCK_VIRT:
237 cpu->cpu = virt_ticks(p); 190 *sample = virt_ticks(p);
238 break; 191 break;
239 case CPUCLOCK_SCHED: 192 case CPUCLOCK_SCHED:
240 cpu->sched = task_sched_runtime(p); 193 *sample = task_sched_runtime(p);
241 break; 194 break;
242 } 195 }
243 return 0; 196 return 0;
@@ -284,7 +237,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 */ 237 */
285static int cpu_clock_sample_group(const clockid_t which_clock, 238static int cpu_clock_sample_group(const clockid_t which_clock,
286 struct task_struct *p, 239 struct task_struct *p,
287 union cpu_time_count *cpu) 240 unsigned long long *sample)
288{ 241{
289 struct task_cputime cputime; 242 struct task_cputime cputime;
290 243
@@ -293,15 +246,15 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
293 return -EINVAL; 246 return -EINVAL;
294 case CPUCLOCK_PROF: 247 case CPUCLOCK_PROF:
295 thread_group_cputime(p, &cputime); 248 thread_group_cputime(p, &cputime);
296 cpu->cpu = cputime.utime + cputime.stime; 249 *sample = cputime_to_expires(cputime.utime + cputime.stime);
297 break; 250 break;
298 case CPUCLOCK_VIRT: 251 case CPUCLOCK_VIRT:
299 thread_group_cputime(p, &cputime); 252 thread_group_cputime(p, &cputime);
300 cpu->cpu = cputime.utime; 253 *sample = cputime_to_expires(cputime.utime);
301 break; 254 break;
302 case CPUCLOCK_SCHED: 255 case CPUCLOCK_SCHED:
303 thread_group_cputime(p, &cputime); 256 thread_group_cputime(p, &cputime);
304 cpu->sched = cputime.sum_exec_runtime; 257 *sample = cputime.sum_exec_runtime;
305 break; 258 break;
306 } 259 }
307 return 0; 260 return 0;
@@ -312,7 +265,7 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
312{ 265{
313 const pid_t pid = CPUCLOCK_PID(which_clock); 266 const pid_t pid = CPUCLOCK_PID(which_clock);
314 int error = -EINVAL; 267 int error = -EINVAL;
315 union cpu_time_count rtn; 268 unsigned long long rtn;
316 269
317 if (pid == 0) { 270 if (pid == 0) {
318 /* 271 /*
@@ -446,6 +399,15 @@ static int posix_cpu_timer_del(struct k_itimer *timer)
446 return ret; 399 return ret;
447} 400}
448 401
402static void cleanup_timers_list(struct list_head *head,
403 unsigned long long curr)
404{
405 struct cpu_timer_list *timer, *next;
406
407 list_for_each_entry_safe(timer, next, head, entry)
408 list_del_init(&timer->entry);
409}
410
449/* 411/*
450 * Clean out CPU timers still ticking when a thread exited. The task 412 * Clean out CPU timers still ticking when a thread exited. The task
451 * pointer is cleared, and the expiry time is replaced with the residual 413 * pointer is cleared, and the expiry time is replaced with the residual
@@ -456,37 +418,12 @@ static void cleanup_timers(struct list_head *head,
456 cputime_t utime, cputime_t stime, 418 cputime_t utime, cputime_t stime,
457 unsigned long long sum_exec_runtime) 419 unsigned long long sum_exec_runtime)
458{ 420{
459 struct cpu_timer_list *timer, *next;
460 cputime_t ptime = utime + stime;
461
462 list_for_each_entry_safe(timer, next, head, entry) {
463 list_del_init(&timer->entry);
464 if (timer->expires.cpu < ptime) {
465 timer->expires.cpu = 0;
466 } else {
467 timer->expires.cpu -= ptime;
468 }
469 }
470 421
471 ++head; 422 cputime_t ptime = utime + stime;
472 list_for_each_entry_safe(timer, next, head, entry) {
473 list_del_init(&timer->entry);
474 if (timer->expires.cpu < utime) {
475 timer->expires.cpu = 0;
476 } else {
477 timer->expires.cpu -= utime;
478 }
479 }
480 423
481 ++head; 424 cleanup_timers_list(head, cputime_to_expires(ptime));
482 list_for_each_entry_safe(timer, next, head, entry) { 425 cleanup_timers_list(++head, cputime_to_expires(utime));
483 list_del_init(&timer->entry); 426 cleanup_timers_list(++head, sum_exec_runtime);
484 if (timer->expires.sched < sum_exec_runtime) {
485 timer->expires.sched = 0;
486 } else {
487 timer->expires.sched -= sum_exec_runtime;
488 }
489 }
490} 427}
491 428
492/* 429/*
@@ -516,17 +453,21 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
516 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 453 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
517} 454}
518 455
519static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 456static void clear_dead_task(struct k_itimer *itimer, unsigned long long now)
520{ 457{
458 struct cpu_timer_list *timer = &itimer->it.cpu;
459
521 /* 460 /*
522 * That's all for this thread or process. 461 * That's all for this thread or process.
523 * We leave our residual in expires to be reported. 462 * We leave our residual in expires to be reported.
524 */ 463 */
525 put_task_struct(timer->it.cpu.task); 464 put_task_struct(timer->task);
526 timer->it.cpu.task = NULL; 465 timer->task = NULL;
527 timer->it.cpu.expires = cpu_time_sub(timer->it_clock, 466 if (timer->expires < now) {
528 timer->it.cpu.expires, 467 timer->expires = 0;
529 now); 468 } else {
469 timer->expires -= now;
470 }
530} 471}
531 472
532static inline int expires_gt(cputime_t expires, cputime_t new_exp) 473static inline int expires_gt(cputime_t expires, cputime_t new_exp)
@@ -558,14 +499,14 @@ static void arm_timer(struct k_itimer *timer)
558 499
559 listpos = head; 500 listpos = head;
560 list_for_each_entry(next, head, entry) { 501 list_for_each_entry(next, head, entry) {
561 if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) 502 if (nt->expires < next->expires)
562 break; 503 break;
563 listpos = &next->entry; 504 listpos = &next->entry;
564 } 505 }
565 list_add(&nt->entry, listpos); 506 list_add(&nt->entry, listpos);
566 507
567 if (listpos == head) { 508 if (listpos == head) {
568 union cpu_time_count *exp = &nt->expires; 509 unsigned long long exp = nt->expires;
569 510
570 /* 511 /*
571 * We are the new earliest-expiring POSIX 1.b timer, hence 512 * We are the new earliest-expiring POSIX 1.b timer, hence
@@ -576,17 +517,17 @@ static void arm_timer(struct k_itimer *timer)
576 517
577 switch (CPUCLOCK_WHICH(timer->it_clock)) { 518 switch (CPUCLOCK_WHICH(timer->it_clock)) {
578 case CPUCLOCK_PROF: 519 case CPUCLOCK_PROF:
579 if (expires_gt(cputime_expires->prof_exp, exp->cpu)) 520 if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp)))
580 cputime_expires->prof_exp = exp->cpu; 521 cputime_expires->prof_exp = expires_to_cputime(exp);
581 break; 522 break;
582 case CPUCLOCK_VIRT: 523 case CPUCLOCK_VIRT:
583 if (expires_gt(cputime_expires->virt_exp, exp->cpu)) 524 if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp)))
584 cputime_expires->virt_exp = exp->cpu; 525 cputime_expires->virt_exp = expires_to_cputime(exp);
585 break; 526 break;
586 case CPUCLOCK_SCHED: 527 case CPUCLOCK_SCHED:
587 if (cputime_expires->sched_exp == 0 || 528 if (cputime_expires->sched_exp == 0 ||
588 cputime_expires->sched_exp > exp->sched) 529 cputime_expires->sched_exp > exp)
589 cputime_expires->sched_exp = exp->sched; 530 cputime_expires->sched_exp = exp;
590 break; 531 break;
591 } 532 }
592 } 533 }
@@ -601,20 +542,20 @@ static void cpu_timer_fire(struct k_itimer *timer)
601 /* 542 /*
602 * User don't want any signal. 543 * User don't want any signal.
603 */ 544 */
604 timer->it.cpu.expires.sched = 0; 545 timer->it.cpu.expires = 0;
605 } else if (unlikely(timer->sigq == NULL)) { 546 } else if (unlikely(timer->sigq == NULL)) {
606 /* 547 /*
607 * This a special case for clock_nanosleep, 548 * This a special case for clock_nanosleep,
608 * not a normal timer from sys_timer_create. 549 * not a normal timer from sys_timer_create.
609 */ 550 */
610 wake_up_process(timer->it_process); 551 wake_up_process(timer->it_process);
611 timer->it.cpu.expires.sched = 0; 552 timer->it.cpu.expires = 0;
612 } else if (timer->it.cpu.incr.sched == 0) { 553 } else if (timer->it.cpu.incr == 0) {
613 /* 554 /*
614 * One-shot timer. Clear it as soon as it's fired. 555 * One-shot timer. Clear it as soon as it's fired.
615 */ 556 */
616 posix_timer_event(timer, 0); 557 posix_timer_event(timer, 0);
617 timer->it.cpu.expires.sched = 0; 558 timer->it.cpu.expires = 0;
618 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { 559 } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) {
619 /* 560 /*
620 * The signal did not get queued because the signal 561 * The signal did not get queued because the signal
@@ -632,7 +573,7 @@ static void cpu_timer_fire(struct k_itimer *timer)
632 */ 573 */
633static int cpu_timer_sample_group(const clockid_t which_clock, 574static int cpu_timer_sample_group(const clockid_t which_clock,
634 struct task_struct *p, 575 struct task_struct *p,
635 union cpu_time_count *cpu) 576 unsigned long long *sample)
636{ 577{
637 struct task_cputime cputime; 578 struct task_cputime cputime;
638 579
@@ -641,13 +582,13 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
641 default: 582 default:
642 return -EINVAL; 583 return -EINVAL;
643 case CPUCLOCK_PROF: 584 case CPUCLOCK_PROF:
644 cpu->cpu = cputime.utime + cputime.stime; 585 *sample = cputime_to_expires(cputime.utime + cputime.stime);
645 break; 586 break;
646 case CPUCLOCK_VIRT: 587 case CPUCLOCK_VIRT:
647 cpu->cpu = cputime.utime; 588 *sample = cputime_to_expires(cputime.utime);
648 break; 589 break;
649 case CPUCLOCK_SCHED: 590 case CPUCLOCK_SCHED:
650 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 591 *sample = cputime.sum_exec_runtime + task_delta_exec(p);
651 break; 592 break;
652 } 593 }
653 return 0; 594 return 0;
@@ -694,7 +635,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
694 struct itimerspec *new, struct itimerspec *old) 635 struct itimerspec *new, struct itimerspec *old)
695{ 636{
696 struct task_struct *p = timer->it.cpu.task; 637 struct task_struct *p = timer->it.cpu.task;
697 union cpu_time_count old_expires, new_expires, old_incr, val; 638 unsigned long long old_expires, new_expires, old_incr, val;
698 int ret; 639 int ret;
699 640
700 if (unlikely(p == NULL)) { 641 if (unlikely(p == NULL)) {
@@ -749,7 +690,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
749 } 690 }
750 691
751 if (old) { 692 if (old) {
752 if (old_expires.sched == 0) { 693 if (old_expires == 0) {
753 old->it_value.tv_sec = 0; 694 old->it_value.tv_sec = 0;
754 old->it_value.tv_nsec = 0; 695 old->it_value.tv_nsec = 0;
755 } else { 696 } else {
@@ -764,11 +705,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
764 * new setting. 705 * new setting.
765 */ 706 */
766 bump_cpu_timer(timer, val); 707 bump_cpu_timer(timer, val);
767 if (cpu_time_before(timer->it_clock, val, 708 if (val < timer->it.cpu.expires) {
768 timer->it.cpu.expires)) { 709 old_expires = timer->it.cpu.expires - val;
769 old_expires = cpu_time_sub(
770 timer->it_clock,
771 timer->it.cpu.expires, val);
772 sample_to_timespec(timer->it_clock, 710 sample_to_timespec(timer->it_clock,
773 old_expires, 711 old_expires,
774 &old->it_value); 712 &old->it_value);
@@ -791,8 +729,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 goto out; 729 goto out;
792 } 730 }
793 731
794 if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { 732 if (new_expires != 0 && !(flags & TIMER_ABSTIME)) {
795 cpu_time_add(timer->it_clock, &new_expires, val); 733 new_expires += val;
796 } 734 }
797 735
798 /* 736 /*
@@ -801,8 +739,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
801 * arm the timer (we'll just fake it for timer_gettime). 739 * arm the timer (we'll just fake it for timer_gettime).
802 */ 740 */
803 timer->it.cpu.expires = new_expires; 741 timer->it.cpu.expires = new_expires;
804 if (new_expires.sched != 0 && 742 if (new_expires != 0 && val < new_expires) {
805 cpu_time_before(timer->it_clock, val, new_expires)) {
806 arm_timer(timer); 743 arm_timer(timer);
807 } 744 }
808 745
@@ -826,8 +763,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
826 timer->it_overrun_last = 0; 763 timer->it_overrun_last = 0;
827 timer->it_overrun = -1; 764 timer->it_overrun = -1;
828 765
829 if (new_expires.sched != 0 && 766 if (new_expires != 0 && !(val < new_expires)) {
830 !cpu_time_before(timer->it_clock, val, new_expires)) {
831 /* 767 /*
832 * The designated time already passed, so we notify 768 * The designated time already passed, so we notify
833 * immediately, even if the thread never runs to 769 * immediately, even if the thread never runs to
@@ -849,7 +785,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
849 785
850static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) 786static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
851{ 787{
852 union cpu_time_count now; 788 unsigned long long now;
853 struct task_struct *p = timer->it.cpu.task; 789 struct task_struct *p = timer->it.cpu.task;
854 int clear_dead; 790 int clear_dead;
855 791
@@ -859,7 +795,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
859 sample_to_timespec(timer->it_clock, 795 sample_to_timespec(timer->it_clock,
860 timer->it.cpu.incr, &itp->it_interval); 796 timer->it.cpu.incr, &itp->it_interval);
861 797
862 if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ 798 if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */
863 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; 799 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
864 return; 800 return;
865 } 801 }
@@ -891,7 +827,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
891 */ 827 */
892 put_task_struct(p); 828 put_task_struct(p);
893 timer->it.cpu.task = NULL; 829 timer->it.cpu.task = NULL;
894 timer->it.cpu.expires.sched = 0; 830 timer->it.cpu.expires = 0;
895 read_unlock(&tasklist_lock); 831 read_unlock(&tasklist_lock);
896 goto dead; 832 goto dead;
897 } else { 833 } else {
@@ -912,10 +848,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
912 goto dead; 848 goto dead;
913 } 849 }
914 850
915 if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { 851 if (now < timer->it.cpu.expires) {
916 sample_to_timespec(timer->it_clock, 852 sample_to_timespec(timer->it_clock,
917 cpu_time_sub(timer->it_clock, 853 timer->it.cpu.expires - now,
918 timer->it.cpu.expires, now),
919 &itp->it_value); 854 &itp->it_value);
920 } else { 855 } else {
921 /* 856 /*
@@ -927,6 +862,28 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 } 862 }
928} 863}
929 864
865static unsigned long long
866check_timers_list(struct list_head *timers,
867 struct list_head *firing,
868 unsigned long long curr)
869{
870 int maxfire = 20;
871
872 while (!list_empty(timers)) {
873 struct cpu_timer_list *t;
874
875 t = list_first_entry(timers, struct cpu_timer_list, entry);
876
877 if (!--maxfire || curr < t->expires)
878 return t->expires;
879
880 t->firing = 1;
881 list_move_tail(&t->entry, firing);
882 }
883
884 return 0;
885}
886
930/* 887/*
931 * Check for any per-thread CPU timers that have fired and move them off 888 * Check for any per-thread CPU timers that have fired and move them off
932 * the tsk->cpu_timers[N] list onto the firing list. Here we update the 889 * the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -935,54 +892,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
935static void check_thread_timers(struct task_struct *tsk, 892static void check_thread_timers(struct task_struct *tsk,
936 struct list_head *firing) 893 struct list_head *firing)
937{ 894{
938 int maxfire;
939 struct list_head *timers = tsk->cpu_timers; 895 struct list_head *timers = tsk->cpu_timers;
940 struct signal_struct *const sig = tsk->signal; 896 struct signal_struct *const sig = tsk->signal;
897 struct task_cputime *tsk_expires = &tsk->cputime_expires;
898 unsigned long long expires;
941 unsigned long soft; 899 unsigned long soft;
942 900
943 maxfire = 20; 901 expires = check_timers_list(timers, firing, prof_ticks(tsk));
944 tsk->cputime_expires.prof_exp = 0; 902 tsk_expires->prof_exp = expires_to_cputime(expires);
945 while (!list_empty(timers)) {
946 struct cpu_timer_list *t = list_first_entry(timers,
947 struct cpu_timer_list,
948 entry);
949 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
950 tsk->cputime_expires.prof_exp = t->expires.cpu;
951 break;
952 }
953 t->firing = 1;
954 list_move_tail(&t->entry, firing);
955 }
956 903
957 ++timers; 904 expires = check_timers_list(++timers, firing, virt_ticks(tsk));
958 maxfire = 20; 905 tsk_expires->virt_exp = expires_to_cputime(expires);
959 tsk->cputime_expires.virt_exp = 0;
960 while (!list_empty(timers)) {
961 struct cpu_timer_list *t = list_first_entry(timers,
962 struct cpu_timer_list,
963 entry);
964 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
965 tsk->cputime_expires.virt_exp = t->expires.cpu;
966 break;
967 }
968 t->firing = 1;
969 list_move_tail(&t->entry, firing);
970 }
971 906
972 ++timers; 907 tsk_expires->sched_exp = check_timers_list(++timers, firing,
973 maxfire = 20; 908 tsk->se.sum_exec_runtime);
974 tsk->cputime_expires.sched_exp = 0;
975 while (!list_empty(timers)) {
976 struct cpu_timer_list *t = list_first_entry(timers,
977 struct cpu_timer_list,
978 entry);
979 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
980 tsk->cputime_expires.sched_exp = t->expires.sched;
981 break;
982 }
983 t->firing = 1;
984 list_move_tail(&t->entry, firing);
985 }
986 909
987 /* 910 /*
988 * Check for the special case thread timers. 911 * Check for the special case thread timers.
@@ -1030,7 +953,8 @@ static void stop_process_timers(struct signal_struct *sig)
1030static u32 onecputick; 953static u32 onecputick;
1031 954
1032static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 955static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1033 cputime_t *expires, cputime_t cur_time, int signo) 956 unsigned long long *expires,
957 unsigned long long cur_time, int signo)
1034{ 958{
1035 if (!it->expires) 959 if (!it->expires)
1036 return; 960 return;
@@ -1066,9 +990,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1066static void check_process_timers(struct task_struct *tsk, 990static void check_process_timers(struct task_struct *tsk,
1067 struct list_head *firing) 991 struct list_head *firing)
1068{ 992{
1069 int maxfire;
1070 struct signal_struct *const sig = tsk->signal; 993 struct signal_struct *const sig = tsk->signal;
1071 cputime_t utime, ptime, virt_expires, prof_expires; 994 unsigned long long utime, ptime, virt_expires, prof_expires;
1072 unsigned long long sum_sched_runtime, sched_expires; 995 unsigned long long sum_sched_runtime, sched_expires;
1073 struct list_head *timers = sig->cpu_timers; 996 struct list_head *timers = sig->cpu_timers;
1074 struct task_cputime cputime; 997 struct task_cputime cputime;
@@ -1078,52 +1001,13 @@ static void check_process_timers(struct task_struct *tsk,
1078 * Collect the current process totals. 1001 * Collect the current process totals.
1079 */ 1002 */
1080 thread_group_cputimer(tsk, &cputime); 1003 thread_group_cputimer(tsk, &cputime);
1081 utime = cputime.utime; 1004 utime = cputime_to_expires(cputime.utime);
1082 ptime = utime + cputime.stime; 1005 ptime = utime + cputime_to_expires(cputime.stime);
1083 sum_sched_runtime = cputime.sum_exec_runtime; 1006 sum_sched_runtime = cputime.sum_exec_runtime;
1084 maxfire = 20;
1085 prof_expires = 0;
1086 while (!list_empty(timers)) {
1087 struct cpu_timer_list *tl = list_first_entry(timers,
1088 struct cpu_timer_list,
1089 entry);
1090 if (!--maxfire || ptime < tl->expires.cpu) {
1091 prof_expires = tl->expires.cpu;
1092 break;
1093 }
1094 tl->firing = 1;
1095 list_move_tail(&tl->entry, firing);
1096 }
1097 1007
1098 ++timers; 1008 prof_expires = check_timers_list(timers, firing, ptime);
1099 maxfire = 20; 1009 virt_expires = check_timers_list(++timers, firing, utime);
1100 virt_expires = 0; 1010 sched_expires = check_timers_list(++timers, firing, sum_sched_runtime);
1101 while (!list_empty(timers)) {
1102 struct cpu_timer_list *tl = list_first_entry(timers,
1103 struct cpu_timer_list,
1104 entry);
1105 if (!--maxfire || utime < tl->expires.cpu) {
1106 virt_expires = tl->expires.cpu;
1107 break;
1108 }
1109 tl->firing = 1;
1110 list_move_tail(&tl->entry, firing);
1111 }
1112
1113 ++timers;
1114 maxfire = 20;
1115 sched_expires = 0;
1116 while (!list_empty(timers)) {
1117 struct cpu_timer_list *tl = list_first_entry(timers,
1118 struct cpu_timer_list,
1119 entry);
1120 if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
1121 sched_expires = tl->expires.sched;
1122 break;
1123 }
1124 tl->firing = 1;
1125 list_move_tail(&tl->entry, firing);
1126 }
1127 1011
1128 /* 1012 /*
1129 * Check for the special case process timers. 1013 * Check for the special case process timers.
@@ -1162,8 +1046,8 @@ static void check_process_timers(struct task_struct *tsk,
1162 } 1046 }
1163 } 1047 }
1164 1048
1165 sig->cputime_expires.prof_exp = prof_expires; 1049 sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires);
1166 sig->cputime_expires.virt_exp = virt_expires; 1050 sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires);
1167 sig->cputime_expires.sched_exp = sched_expires; 1051 sig->cputime_expires.sched_exp = sched_expires;
1168 if (task_cputime_zero(&sig->cputime_expires)) 1052 if (task_cputime_zero(&sig->cputime_expires))
1169 stop_process_timers(sig); 1053 stop_process_timers(sig);
@@ -1176,7 +1060,7 @@ static void check_process_timers(struct task_struct *tsk,
1176void posix_cpu_timer_schedule(struct k_itimer *timer) 1060void posix_cpu_timer_schedule(struct k_itimer *timer)
1177{ 1061{
1178 struct task_struct *p = timer->it.cpu.task; 1062 struct task_struct *p = timer->it.cpu.task;
1179 union cpu_time_count now; 1063 unsigned long long now;
1180 1064
1181 if (unlikely(p == NULL)) 1065 if (unlikely(p == NULL))
1182 /* 1066 /*
@@ -1205,7 +1089,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1205 */ 1089 */
1206 put_task_struct(p); 1090 put_task_struct(p);
1207 timer->it.cpu.task = p = NULL; 1091 timer->it.cpu.task = p = NULL;
1208 timer->it.cpu.expires.sched = 0; 1092 timer->it.cpu.expires = 0;
1209 goto out_unlock; 1093 goto out_unlock;
1210 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1094 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1211 /* 1095 /*
@@ -1213,6 +1097,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1213 * not yet reaped. Take this opportunity to 1097 * not yet reaped. Take this opportunity to
1214 * drop our task ref. 1098 * drop our task ref.
1215 */ 1099 */
1100 cpu_timer_sample_group(timer->it_clock, p, &now);
1216 clear_dead_task(timer, now); 1101 clear_dead_task(timer, now);
1217 goto out_unlock; 1102 goto out_unlock;
1218 } 1103 }
@@ -1387,7 +1272,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1387void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1272void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1388 cputime_t *newval, cputime_t *oldval) 1273 cputime_t *newval, cputime_t *oldval)
1389{ 1274{
1390 union cpu_time_count now; 1275 unsigned long long now;
1391 1276
1392 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1277 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1393 cpu_timer_sample_group(clock_idx, tsk, &now); 1278 cpu_timer_sample_group(clock_idx, tsk, &now);
@@ -1399,17 +1284,17 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1399 * it to be absolute. 1284 * it to be absolute.
1400 */ 1285 */
1401 if (*oldval) { 1286 if (*oldval) {
1402 if (*oldval <= now.cpu) { 1287 if (*oldval <= now) {
1403 /* Just about to fire. */ 1288 /* Just about to fire. */
1404 *oldval = cputime_one_jiffy; 1289 *oldval = cputime_one_jiffy;
1405 } else { 1290 } else {
1406 *oldval -= now.cpu; 1291 *oldval -= now;
1407 } 1292 }
1408 } 1293 }
1409 1294
1410 if (!*newval) 1295 if (!*newval)
1411 goto out; 1296 goto out;
1412 *newval += now.cpu; 1297 *newval += now;
1413 } 1298 }
1414 1299
1415 /* 1300 /*
@@ -1459,7 +1344,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1459 } 1344 }
1460 1345
1461 while (!signal_pending(current)) { 1346 while (!signal_pending(current)) {
1462 if (timer.it.cpu.expires.sched == 0) { 1347 if (timer.it.cpu.expires == 0) {
1463 /* 1348 /*
1464 * Our timer fired and was reset, below 1349 * Our timer fired and was reset, below
1465 * deletion can not fail. 1350 * deletion can not fail.
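The three near-identical maxfire loops removed from check_thread_timers() and check_process_timers() are folded into the single check_timers_list() helper added earlier in this file: walk the expiry-sorted list, move everything that has already expired onto the firing list (bounded by a small per-pass budget), and return the expiry of the first timer still in the future, or 0 once the list is drained. A stand-alone sketch of that scan over a plain sorted array (illustrative only, no kernel list primitives):

#include <stdio.h>

#define MAXFIRE 20	/* per-pass budget, as in the kernel helper */

/*
 * 'expires' is sorted ascending.  Count every entry <= curr as firing, up to
 * MAXFIRE of them, and return the next pending expiry (0 if none is left).
 */
static unsigned long long scan_timers(const unsigned long long *expires,
				      size_t n, unsigned long long curr,
				      size_t *nr_firing)
{
	size_t fired = 0;

	while (fired < n) {
		if (fired == MAXFIRE || curr < expires[fired]) {
			*nr_firing = fired;
			return expires[fired];
		}
		fired++;	/* stands in for list_move_tail() onto the firing list */
	}
	*nr_firing = fired;
	return 0;
}

int main(void)
{
	unsigned long long timers[] = { 10, 20, 30, 250, 900 };
	size_t nr;
	unsigned long long next = scan_timers(timers, 5, 100, &nr);

	/* Prints: 3 timers fire now, next expiry 250 */
	printf("%zu timers fire now, next expiry %llu\n", nr, next);
	return 0;
}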
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index c6422ffeda9a..9012ecf7b814 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work)
32 32
33 mutex_lock(&autosleep_lock); 33 mutex_lock(&autosleep_lock);
34 34
35 if (!pm_save_wakeup_count(initial_count)) { 35 if (!pm_save_wakeup_count(initial_count) ||
36 system_state != SYSTEM_RUNNING) {
36 mutex_unlock(&autosleep_lock); 37 mutex_unlock(&autosleep_lock);
37 goto out; 38 goto out;
38 } 39 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 8212c1aef125..69b0890ed7e5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1369,9 +1369,9 @@ static int console_trylock_for_printk(unsigned int cpu)
1369 } 1369 }
1370 } 1370 }
1371 logbuf_cpu = UINT_MAX; 1371 logbuf_cpu = UINT_MAX;
1372 raw_spin_unlock(&logbuf_lock);
1372 if (wake) 1373 if (wake)
1373 up(&console_sem); 1374 up(&console_sem);
1374 raw_spin_unlock(&logbuf_lock);
1375 return retval; 1375 return retval;
1376} 1376}
1377 1377
@@ -1921,7 +1921,7 @@ void resume_console(void)
1921 * called when a new CPU comes online (or fails to come up), and ensures 1921 * called when a new CPU comes online (or fails to come up), and ensures
1922 * that any such output gets printed. 1922 * that any such output gets printed.
1923 */ 1923 */
1924static int __cpuinit console_cpu_notify(struct notifier_block *self, 1924static int console_cpu_notify(struct notifier_block *self,
1925 unsigned long action, void *hcpu) 1925 unsigned long action, void *hcpu)
1926{ 1926{
1927 switch (action) { 1927 switch (action) {
diff --git a/kernel/profile.c b/kernel/profile.c
index 0bf400737660..6631e1ef55ab 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -331,7 +331,7 @@ out:
331 put_cpu(); 331 put_cpu();
332} 332}
333 333
334static int __cpuinit profile_cpu_callback(struct notifier_block *info, 334static int profile_cpu_callback(struct notifier_block *info,
335 unsigned long action, void *__cpu) 335 unsigned long action, void *__cpu)
336{ 336{
337 int node, cpu = (unsigned long)__cpu; 337 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ba5e6cea181a..4041f5747e73 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -469,6 +469,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
469 /* Architecture-specific hardware disable .. */ 469 /* Architecture-specific hardware disable .. */
470 ptrace_disable(child); 470 ptrace_disable(child);
471 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); 471 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
472 flush_ptrace_hw_breakpoint(child);
472 473
473 write_lock_irq(&tasklist_lock); 474 write_lock_irq(&tasklist_lock);
474 /* 475 /*
@@ -1221,19 +1222,3 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1221 return ret; 1222 return ret;
1222} 1223}
1223#endif /* CONFIG_COMPAT */ 1224#endif /* CONFIG_COMPAT */
1224
1225#ifdef CONFIG_HAVE_HW_BREAKPOINT
1226int ptrace_get_breakpoints(struct task_struct *tsk)
1227{
1228 if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
1229 return 0;
1230
1231 return -1;
1232}
1233
1234void ptrace_put_breakpoints(struct task_struct *tsk)
1235{
1236 if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
1237 flush_ptrace_hw_breakpoint(tsk);
1238}
1239#endif /* CONFIG_HAVE_HW_BREAKPOINT */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b1fa5510388d..f4871e52c546 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1476,7 +1476,7 @@ rcu_torture_shutdown(void *arg)
1476 * Execute random CPU-hotplug operations at the interval specified 1476 * Execute random CPU-hotplug operations at the interval specified
1477 * by the onoff_interval. 1477 * by the onoff_interval.
1478 */ 1478 */
1479static int __cpuinit 1479static int
1480rcu_torture_onoff(void *arg) 1480rcu_torture_onoff(void *arg)
1481{ 1481{
1482 int cpu; 1482 int cpu;
@@ -1558,7 +1558,7 @@ rcu_torture_onoff(void *arg)
1558 return 0; 1558 return 0;
1559} 1559}
1560 1560
1561static int __cpuinit 1561static int
1562rcu_torture_onoff_init(void) 1562rcu_torture_onoff_init(void)
1563{ 1563{
1564 int ret; 1564 int ret;
@@ -1601,7 +1601,7 @@ static void rcu_torture_onoff_cleanup(void)
1601 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then 1601 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1602 * induces a CPU stall for the time specified by stall_cpu. 1602 * induces a CPU stall for the time specified by stall_cpu.
1603 */ 1603 */
1604static int __cpuinit rcu_torture_stall(void *args) 1604static int rcu_torture_stall(void *args)
1605{ 1605{
1606 unsigned long stop_at; 1606 unsigned long stop_at;
1607 1607
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e08abb9461ac..068de3a93606 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -2910,7 +2910,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2910 * can accept some slop in the rsp->completed access due to the fact 2910 * can accept some slop in the rsp->completed access due to the fact
2911 * that this CPU cannot possibly have any RCU callbacks in flight yet. 2911 * that this CPU cannot possibly have any RCU callbacks in flight yet.
2912 */ 2912 */
2913static void __cpuinit 2913static void
2914rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) 2914rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2915{ 2915{
2916 unsigned long flags; 2916 unsigned long flags;
@@ -2962,7 +2962,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2962 mutex_unlock(&rsp->onoff_mutex); 2962 mutex_unlock(&rsp->onoff_mutex);
2963} 2963}
2964 2964
2965static void __cpuinit rcu_prepare_cpu(int cpu) 2965static void rcu_prepare_cpu(int cpu)
2966{ 2966{
2967 struct rcu_state *rsp; 2967 struct rcu_state *rsp;
2968 2968
@@ -2974,7 +2974,7 @@ static void __cpuinit rcu_prepare_cpu(int cpu)
2974/* 2974/*
2975 * Handle CPU online/offline notification events. 2975 * Handle CPU online/offline notification events.
2976 */ 2976 */
2977static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 2977static int rcu_cpu_notify(struct notifier_block *self,
2978 unsigned long action, void *hcpu) 2978 unsigned long action, void *hcpu)
2979{ 2979{
2980 long cpu = (long)hcpu; 2980 long cpu = (long)hcpu;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a39d364493c..b3832581043c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -521,10 +521,10 @@ static void invoke_rcu_callbacks_kthread(void);
521static bool rcu_is_callbacks_kthread(void); 521static bool rcu_is_callbacks_kthread(void);
522#ifdef CONFIG_RCU_BOOST 522#ifdef CONFIG_RCU_BOOST
523static void rcu_preempt_do_callbacks(void); 523static void rcu_preempt_do_callbacks(void);
524static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 524static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
525 struct rcu_node *rnp); 525 struct rcu_node *rnp);
526#endif /* #ifdef CONFIG_RCU_BOOST */ 526#endif /* #ifdef CONFIG_RCU_BOOST */
527static void __cpuinit rcu_prepare_kthreads(int cpu); 527static void rcu_prepare_kthreads(int cpu);
528static void rcu_cleanup_after_idle(int cpu); 528static void rcu_cleanup_after_idle(int cpu);
529static void rcu_prepare_for_idle(int cpu); 529static void rcu_prepare_for_idle(int cpu);
530static void rcu_idle_count_callbacks_posted(void); 530static void rcu_idle_count_callbacks_posted(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 63098a59216e..769e12e3151b 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1352,7 +1352,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1352 * already exist. We only create this kthread for preemptible RCU. 1352 * already exist. We only create this kthread for preemptible RCU.
1353 * Returns zero if all is well, a negated errno otherwise. 1353 * Returns zero if all is well, a negated errno otherwise.
1354 */ 1354 */
1355static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1355static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1356 struct rcu_node *rnp) 1356 struct rcu_node *rnp)
1357{ 1357{
1358 int rnp_index = rnp - &rsp->node[0]; 1358 int rnp_index = rnp - &rsp->node[0];
@@ -1507,7 +1507,7 @@ static int __init rcu_spawn_kthreads(void)
1507} 1507}
1508early_initcall(rcu_spawn_kthreads); 1508early_initcall(rcu_spawn_kthreads);
1509 1509
1510static void __cpuinit rcu_prepare_kthreads(int cpu) 1510static void rcu_prepare_kthreads(int cpu)
1511{ 1511{
1512 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1512 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1513 struct rcu_node *rnp = rdp->mynode; 1513 struct rcu_node *rnp = rdp->mynode;
@@ -1549,7 +1549,7 @@ static int __init rcu_scheduler_really_started(void)
1549} 1549}
1550early_initcall(rcu_scheduler_really_started); 1550early_initcall(rcu_scheduler_really_started);
1551 1551
1552static void __cpuinit rcu_prepare_kthreads(int cpu) 1552static void rcu_prepare_kthreads(int cpu)
1553{ 1553{
1554} 1554}
1555 1555
diff --git a/kernel/reboot.c b/kernel/reboot.c
new file mode 100644
index 000000000000..269ed9384cc4
--- /dev/null
+++ b/kernel/reboot.c
@@ -0,0 +1,419 @@
1/*
2 * linux/kernel/reboot.c
3 *
4 * Copyright (C) 2013 Linus Torvalds
5 */
6
7#define pr_fmt(fmt) "reboot: " fmt
8
9#include <linux/ctype.h>
10#include <linux/export.h>
11#include <linux/kexec.h>
12#include <linux/kmod.h>
13#include <linux/kmsg_dump.h>
14#include <linux/reboot.h>
15#include <linux/suspend.h>
16#include <linux/syscalls.h>
17#include <linux/syscore_ops.h>
18#include <linux/uaccess.h>
19
20/*
21 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
22 */
23
24int C_A_D = 1;
25struct pid *cad_pid;
26EXPORT_SYMBOL(cad_pid);
27
28#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
29#define DEFAULT_REBOOT_MODE = REBOOT_HARD
30#else
31#define DEFAULT_REBOOT_MODE
32#endif
33enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
34
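/*
 * Editorial aside (not part of this file): DEFAULT_REBOOT_MODE above expands
 * to "= REBOOT_HARD" on ARM/unicore32 and to nothing elsewhere, so the single
 * declaration "enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;" is either an
 * explicitly initialized or a zero-initialized definition.  The same idiom,
 * stripped down and with made-up names:
 */
#ifdef USE_VERBOSE_DEFAULT			/* hypothetical config switch */
#define DEFAULT_LOG_LEVEL = 3			/* expands to an initializer */
#else
#define DEFAULT_LOG_LEVEL			/* expands to nothing */
#endif
static int log_level DEFAULT_LOG_LEVEL;		/* "int log_level = 3;" or plain "int log_level;" */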
35int reboot_default;
36int reboot_cpu;
37enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force;
39
40/*
41 * If set, this is used for preparing the system to power off.
42 */
43
44void (*pm_power_off_prepare)(void);
45
46/**
47 * emergency_restart - reboot the system
48 *
49 * Without shutting down any hardware or taking any locks
50 * reboot the system. This is called when we know we are in
51 * trouble so this is our best effort to reboot. This is
52 * safe to call in interrupt context.
53 */
54void emergency_restart(void)
55{
56 kmsg_dump(KMSG_DUMP_EMERG);
57 machine_emergency_restart();
58}
59EXPORT_SYMBOL_GPL(emergency_restart);
60
61void kernel_restart_prepare(char *cmd)
62{
63 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
64 system_state = SYSTEM_RESTART;
65 usermodehelper_disable();
66 device_shutdown();
67}
68
69/**
70 * register_reboot_notifier - Register function to be called at reboot time
71 * @nb: Info about notifier function to be called
72 *
73 * Registers a function with the list of functions
74 * to be called at reboot time.
75 *
76 * Currently always returns zero, as blocking_notifier_chain_register()
77 * always returns zero.
78 */
79int register_reboot_notifier(struct notifier_block *nb)
80{
81 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
82}
83EXPORT_SYMBOL(register_reboot_notifier);
84
85/**
86 * unregister_reboot_notifier - Unregister previously registered reboot notifier
87 * @nb: Hook to be unregistered
88 *
89 * Unregisters a previously registered reboot
90 * notifier function.
91 *
92 * Returns zero on success, or %-ENOENT on failure.
93 */
94int unregister_reboot_notifier(struct notifier_block *nb)
95{
96 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
97}
98EXPORT_SYMBOL(unregister_reboot_notifier);
99
100static void migrate_to_reboot_cpu(void)
101{
102 /* The boot cpu is always logical cpu 0 */
103 int cpu = reboot_cpu;
104
105 cpu_hotplug_disable();
106
107 /* Make certain the cpu I'm about to reboot on is online */
108 if (!cpu_online(cpu))
109 cpu = cpumask_first(cpu_online_mask);
110
111 /* Prevent races with other tasks migrating this task */
112 current->flags |= PF_NO_SETAFFINITY;
113
114 /* Make certain I only run on the appropriate processor */
115 set_cpus_allowed_ptr(current, cpumask_of(cpu));
116}
117
118/**
119 * kernel_restart - reboot the system
120 * @cmd: pointer to buffer containing command to execute for restart
121 * or %NULL
122 *
123 * Shutdown everything and perform a clean reboot.
124 * This is not safe to call in interrupt context.
125 */
126void kernel_restart(char *cmd)
127{
128 kernel_restart_prepare(cmd);
129 migrate_to_reboot_cpu();
130 syscore_shutdown();
131 if (!cmd)
132 pr_emerg("Restarting system\n");
133 else
134 pr_emerg("Restarting system with command '%s'\n", cmd);
135 kmsg_dump(KMSG_DUMP_RESTART);
136 machine_restart(cmd);
137}
138EXPORT_SYMBOL_GPL(kernel_restart);
139
140static void kernel_shutdown_prepare(enum system_states state)
141{
142 blocking_notifier_call_chain(&reboot_notifier_list,
143 (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
144 system_state = state;
145 usermodehelper_disable();
146 device_shutdown();
147}
148/**
149 * kernel_halt - halt the system
150 *
151 * Shutdown everything and perform a clean system halt.
152 */
153void kernel_halt(void)
154{
155 kernel_shutdown_prepare(SYSTEM_HALT);
156 migrate_to_reboot_cpu();
157 syscore_shutdown();
158 pr_emerg("System halted\n");
159 kmsg_dump(KMSG_DUMP_HALT);
160 machine_halt();
161}
162EXPORT_SYMBOL_GPL(kernel_halt);
163
164/**
165 * kernel_power_off - power_off the system
166 *
167 * Shutdown everything and perform a clean system power_off.
168 */
169void kernel_power_off(void)
170{
171 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
172 if (pm_power_off_prepare)
173 pm_power_off_prepare();
174 migrate_to_reboot_cpu();
175 syscore_shutdown();
176 pr_emerg("Power down\n");
177 kmsg_dump(KMSG_DUMP_POWEROFF);
178 machine_power_off();
179}
180EXPORT_SYMBOL_GPL(kernel_power_off);
181
182static DEFINE_MUTEX(reboot_mutex);
183
184/*
185 * Reboot system call: for obvious reasons only root may call it,
186 * and even root needs to set up some magic numbers in the registers
187 * so that some mistake won't make this reboot the whole machine.
188 * You can also set the meaning of the ctrl-alt-del-key here.
189 *
190 * reboot doesn't sync: do that yourself before calling this.
191 */
192SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
193 void __user *, arg)
194{
195 struct pid_namespace *pid_ns = task_active_pid_ns(current);
196 char buffer[256];
197 int ret = 0;
198
199 /* We only trust the superuser with rebooting the system. */
200 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
201 return -EPERM;
202
203 /* For safety, we require "magic" arguments. */
204 if (magic1 != LINUX_REBOOT_MAGIC1 ||
205 (magic2 != LINUX_REBOOT_MAGIC2 &&
206 magic2 != LINUX_REBOOT_MAGIC2A &&
207 magic2 != LINUX_REBOOT_MAGIC2B &&
208 magic2 != LINUX_REBOOT_MAGIC2C))
209 return -EINVAL;
210
211 /*
212 * If pid namespaces are enabled and the current task is in a child
213 * pid_namespace, the command is handled by reboot_pid_ns() which will
214 * call do_exit().
215 */
216 ret = reboot_pid_ns(pid_ns, cmd);
217 if (ret)
218 return ret;
219
220 /* Instead of trying to make the power_off code look like
221 * halt when pm_power_off is not set do it the easy way.
222 */
223 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
224 cmd = LINUX_REBOOT_CMD_HALT;
225
226 mutex_lock(&reboot_mutex);
227 switch (cmd) {
228 case LINUX_REBOOT_CMD_RESTART:
229 kernel_restart(NULL);
230 break;
231
232 case LINUX_REBOOT_CMD_CAD_ON:
233 C_A_D = 1;
234 break;
235
236 case LINUX_REBOOT_CMD_CAD_OFF:
237 C_A_D = 0;
238 break;
239
240 case LINUX_REBOOT_CMD_HALT:
241 kernel_halt();
242 do_exit(0);
243 panic("cannot halt");
244
245 case LINUX_REBOOT_CMD_POWER_OFF:
246 kernel_power_off();
247 do_exit(0);
248 break;
249
250 case LINUX_REBOOT_CMD_RESTART2:
251 ret = strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1);
252 if (ret < 0) {
253 ret = -EFAULT;
254 break;
255 }
256 buffer[sizeof(buffer) - 1] = '\0';
257
258 kernel_restart(buffer);
259 break;
260
261#ifdef CONFIG_KEXEC
262 case LINUX_REBOOT_CMD_KEXEC:
263 ret = kernel_kexec();
264 break;
265#endif
266
267#ifdef CONFIG_HIBERNATION
268 case LINUX_REBOOT_CMD_SW_SUSPEND:
269 ret = hibernate();
270 break;
271#endif
272
273 default:
274 ret = -EINVAL;
275 break;
276 }
277 mutex_unlock(&reboot_mutex);
278 return ret;
279}
280
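For illustration, the magic-number check above is what a direct user-space call has to satisfy; a hedged sketch of a test program (hypothetical, using the raw syscall so the command string for LINUX_REBOOT_CMD_RESTART2 can be passed):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

/* Hypothetical test program: needs CAP_SYS_BOOT, and syncs explicitly
 * because sys_reboot() deliberately does not. */
int main(void)
{
	sync();
	return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
		       LINUX_REBOOT_CMD_RESTART2, "with-command");
}

The glibc reboot(2) wrapper supplies the magic numbers itself but only takes the command code, so RESTART2's string argument is easiest to exercise via syscall().
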
281static void deferred_cad(struct work_struct *dummy)
282{
283 kernel_restart(NULL);
284}
285
286/*
287 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
288 * As it's called within an interrupt, it may NOT sync: the only choice
289 * is whether to reboot at once, or just ignore the ctrl-alt-del.
290 */
291void ctrl_alt_del(void)
292{
293 static DECLARE_WORK(cad_work, deferred_cad);
294
295 if (C_A_D)
296 schedule_work(&cad_work);
297 else
298 kill_cad_pid(SIGINT, 1);
299}
300
301char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
302
303static int __orderly_poweroff(bool force)
304{
305 char **argv;
306 static char *envp[] = {
307 "HOME=/",
308 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
309 NULL
310 };
311 int ret;
312
313 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
314 if (argv) {
315 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
316 argv_free(argv);
317 } else {
318 ret = -ENOMEM;
319 }
320
321 if (ret && force) {
322 pr_warn("Failed to start orderly shutdown: forcing the issue\n");
323 /*
324 * I guess this should try to kick off some daemon to sync and
325 * poweroff asap. Or not even bother syncing if we're doing an
326 * emergency shutdown?
327 */
328 emergency_sync();
329 kernel_power_off();
330 }
331
332 return ret;
333}
334
335static bool poweroff_force;
336
337static void poweroff_work_func(struct work_struct *work)
338{
339 __orderly_poweroff(poweroff_force);
340}
341
342static DECLARE_WORK(poweroff_work, poweroff_work_func);
343
344/**
345 * orderly_poweroff - Trigger an orderly system poweroff
346 * @force: force poweroff if command execution fails
347 *
348 * This may be called from any context to trigger a system shutdown.
349 * If the orderly shutdown fails, it will force an immediate shutdown.
350 */
351int orderly_poweroff(bool force)
352{
353 if (force) /* do not override the pending "true" */
354 poweroff_force = true;
355 schedule_work(&poweroff_work);
356 return 0;
357}
358EXPORT_SYMBOL_GPL(orderly_poweroff);
359
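orderly_poweroff() is the entry point the rest of the kernel (thermal shutdown paths, hardware-health drivers and the like) uses to request a clean, user-space-driven shutdown; a minimal hypothetical caller:

#include <linux/reboot.h>

/* Hypothetical critical-temperature handler: ask user space to run
 * poweroff_cmd, falling back to kernel_power_off() if that fails.
 * Callable from atomic context, since the real work is deferred to
 * poweroff_work above. */
static void example_critical_overheat(void)
{
	orderly_poweroff(true);
}
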
360static int __init reboot_setup(char *str)
361{
362 for (;;) {
363 /*
364 * Having anything passed on the command line via
365 * reboot= will cause us to disable DMI checking
366 * below.
367 */
368 reboot_default = 0;
369
370 switch (*str) {
371 case 'w':
372 reboot_mode = REBOOT_WARM;
373 break;
374
375 case 'c':
376 reboot_mode = REBOOT_COLD;
377 break;
378
379 case 'h':
380 reboot_mode = REBOOT_HARD;
381 break;
382
383 case 's':
384 if (isdigit(*(str+1)))
385 reboot_cpu = simple_strtoul(str+1, NULL, 0);
386 else if (str[1] == 'm' && str[2] == 'p' &&
387 isdigit(*(str+3)))
388 reboot_cpu = simple_strtoul(str+3, NULL, 0);
389 else
390 reboot_mode = REBOOT_SOFT;
391 break;
392
393 case 'g':
394 reboot_mode = REBOOT_GPIO;
395 break;
396
397 case 'b':
398 case 'a':
399 case 'k':
400 case 't':
401 case 'e':
402 case 'p':
403 reboot_type = *str;
404 break;
405
406 case 'f':
407 reboot_force = 1;
408 break;
409 }
410
411 str = strchr(str, ',');
412 if (str)
413 str++;
414 else
415 break;
416 }
417 return 1;
418}
419__setup("reboot=", reboot_setup);
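Read together, the parser above means a command line such as reboot=c,s2 selects REBOOT_COLD and makes CPU 2 the reboot CPU, reboot=smp3 likewise picks CPU 3, reboot=g selects REBOOT_GPIO, and reboot=f only sets reboot_force; any reboot= argument at all clears reboot_default, which the comment notes disables the DMI-based handling. The single letters b, a, k, t, e and p are stored in reboot_type for the architecture code to interpret (on x86 they traditionally select the BIOS, ACPI, keyboard, triple-fault, EFI and PCI reboot methods).
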
diff --git a/kernel/relay.c b/kernel/relay.c
index b91488ba2e5a..5001c9887db1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,
516 * 516 *
517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) 517 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
518 */ 518 */
519static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, 519static int relay_hotcpu_callback(struct notifier_block *nb,
520 unsigned long action, 520 unsigned long action,
521 void *hcpu) 521 void *hcpu)
522{ 522{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9b1f2e533b95..b7c32cb7bfeb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -370,13 +370,6 @@ static struct rq *this_rq_lock(void)
370#ifdef CONFIG_SCHED_HRTICK 370#ifdef CONFIG_SCHED_HRTICK
371/* 371/*
372 * Use HR-timers to deliver accurate preemption points. 372 * Use HR-timers to deliver accurate preemption points.
373 *
374 * Its all a bit involved since we cannot program an hrt while holding the
375 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
376 * reschedule event.
377 *
378 * When we get rescheduled we reprogram the hrtick_timer outside of the
379 * rq->lock.
380 */ 373 */
381 374
382static void hrtick_clear(struct rq *rq) 375static void hrtick_clear(struct rq *rq)
@@ -404,6 +397,15 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
404} 397}
405 398
406#ifdef CONFIG_SMP 399#ifdef CONFIG_SMP
400
401static int __hrtick_restart(struct rq *rq)
402{
403 struct hrtimer *timer = &rq->hrtick_timer;
404 ktime_t time = hrtimer_get_softexpires(timer);
405
406 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
407}
408
407/* 409/*
408 * called from hardirq (IPI) context 410 * called from hardirq (IPI) context
409 */ 411 */
@@ -412,7 +414,7 @@ static void __hrtick_start(void *arg)
412 struct rq *rq = arg; 414 struct rq *rq = arg;
413 415
414 raw_spin_lock(&rq->lock); 416 raw_spin_lock(&rq->lock);
415 hrtimer_restart(&rq->hrtick_timer); 417 __hrtick_restart(rq);
416 rq->hrtick_csd_pending = 0; 418 rq->hrtick_csd_pending = 0;
417 raw_spin_unlock(&rq->lock); 419 raw_spin_unlock(&rq->lock);
418} 420}
@@ -430,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay)
430 hrtimer_set_expires(timer, time); 432 hrtimer_set_expires(timer, time);
431 433
432 if (rq == this_rq()) { 434 if (rq == this_rq()) {
433 hrtimer_restart(timer); 435 __hrtick_restart(rq);
434 } else if (!rq->hrtick_csd_pending) { 436 } else if (!rq->hrtick_csd_pending) {
435 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 437 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
436 rq->hrtick_csd_pending = 1; 438 rq->hrtick_csd_pending = 1;
@@ -4131,7 +4133,7 @@ void show_state_filter(unsigned long state_filter)
4131 debug_show_all_locks(); 4133 debug_show_all_locks();
4132} 4134}
4133 4135
4134void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4136void init_idle_bootup_task(struct task_struct *idle)
4135{ 4137{
4136 idle->sched_class = &idle_sched_class; 4138 idle->sched_class = &idle_sched_class;
4137} 4139}
@@ -4144,7 +4146,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4144 * NOTE: this function does not set the idle thread's NEED_RESCHED 4146 * NOTE: this function does not set the idle thread's NEED_RESCHED
4145 * flag, to make booting more robust. 4147 * flag, to make booting more robust.
4146 */ 4148 */
4147void __cpuinit init_idle(struct task_struct *idle, int cpu) 4149void init_idle(struct task_struct *idle, int cpu)
4148{ 4150{
4149 struct rq *rq = cpu_rq(cpu); 4151 struct rq *rq = cpu_rq(cpu);
4150 unsigned long flags; 4152 unsigned long flags;
@@ -4628,7 +4630,7 @@ static void set_rq_offline(struct rq *rq)
4628 * migration_call - callback that gets triggered when a CPU is added. 4630 * migration_call - callback that gets triggered when a CPU is added.
4629 * Here we can start up the necessary migration thread for the new CPU. 4631 * Here we can start up the necessary migration thread for the new CPU.
4630 */ 4632 */
4631static int __cpuinit 4633static int
4632migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 4634migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4633{ 4635{
4634 int cpu = (long)hcpu; 4636 int cpu = (long)hcpu;
@@ -4682,12 +4684,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4682 * happens before everything else. This has to be lower priority than 4684 * happens before everything else. This has to be lower priority than
4683 * the notifier in the perf_event subsystem, though. 4685 * the notifier in the perf_event subsystem, though.
4684 */ 4686 */
4685static struct notifier_block __cpuinitdata migration_notifier = { 4687static struct notifier_block migration_notifier = {
4686 .notifier_call = migration_call, 4688 .notifier_call = migration_call,
4687 .priority = CPU_PRI_MIGRATION, 4689 .priority = CPU_PRI_MIGRATION,
4688}; 4690};
4689 4691
4690static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 4692static int sched_cpu_active(struct notifier_block *nfb,
4691 unsigned long action, void *hcpu) 4693 unsigned long action, void *hcpu)
4692{ 4694{
4693 switch (action & ~CPU_TASKS_FROZEN) { 4695 switch (action & ~CPU_TASKS_FROZEN) {
@@ -4700,7 +4702,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
4700 } 4702 }
4701} 4703}
4702 4704
4703static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 4705static int sched_cpu_inactive(struct notifier_block *nfb,
4704 unsigned long action, void *hcpu) 4706 unsigned long action, void *hcpu)
4705{ 4707{
4706 switch (action & ~CPU_TASKS_FROZEN) { 4708 switch (action & ~CPU_TASKS_FROZEN) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c527449..bb456f44b7b1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5506,7 +5506,7 @@ void nohz_balance_enter_idle(int cpu)
5506 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 5506 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5507} 5507}
5508 5508
5509static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 5509static int sched_ilb_notifier(struct notifier_block *nfb,
5510 unsigned long action, void *hcpu) 5510 unsigned long action, void *hcpu)
5511{ 5511{
5512 switch (action & ~CPU_TASKS_FROZEN) { 5512 switch (action & ~CPU_TASKS_FROZEN) {
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 17d7065c3872..5aef494fc8b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -162,6 +162,39 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
162 */ 162 */
163 163
164/** 164/**
165 * cputimer_running - return true if cputimer is running
166 *
167 * @tsk: Pointer to target task.
168 */
169static inline bool cputimer_running(struct task_struct *tsk)
170
171{
172 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
173
174 if (!cputimer->running)
175 return false;
176
177 /*
178 * After we flush the task's sum_exec_runtime to sig->sum_sched_runtime
179 * in __exit_signal(), we won't account to the signal struct further
180 * cputime consumed by that task, even though the task can still be
181 * ticking after __exit_signal().
182 *
183 * In order to keep a consistent behaviour between thread group cputime
184 * and thread group cputimer accounting, lets also ignore the cputime
185 * elapsing after __exit_signal() in any thread group timer running.
186 *
187 * This makes sure that POSIX CPU clocks and timers are synchronized, so
188 * that a POSIX CPU timer won't expire while the corresponding POSIX CPU
189 * clock delta is behind the expiring timer value.
190 */
191 if (unlikely(!tsk->sighand))
192 return false;
193
194 return true;
195}
196
197/**
165 * account_group_user_time - Maintain utime for a thread group. 198 * account_group_user_time - Maintain utime for a thread group.
166 * 199 *
167 * @tsk: Pointer to task structure. 200 * @tsk: Pointer to task structure.
@@ -176,7 +209,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
176{ 209{
177 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 210 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
178 211
179 if (!cputimer->running) 212 if (!cputimer_running(tsk))
180 return; 213 return;
181 214
182 raw_spin_lock(&cputimer->lock); 215 raw_spin_lock(&cputimer->lock);
@@ -199,7 +232,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
199{ 232{
200 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 233 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
201 234
202 if (!cputimer->running) 235 if (!cputimer_running(tsk))
203 return; 236 return;
204 237
205 raw_spin_lock(&cputimer->lock); 238 raw_spin_lock(&cputimer->lock);
@@ -222,7 +255,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
222{ 255{
223 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 256 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
224 257
225 if (!cputimer->running) 258 if (!cputimer_running(tsk))
226 return; 259 return;
227 260
228 raw_spin_lock(&cputimer->lock); 261 raw_spin_lock(&cputimer->lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72ad..fe9f773d7114 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
73 return NOTIFY_OK; 73 return NOTIFY_OK;
74} 74}
75 75
76static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { 76static struct notifier_block hotplug_cfd_notifier = {
77 .notifier_call = hotplug_cfd, 77 .notifier_call = hotplug_cfd,
78}; 78};
79 79
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 02fc5c933673..eb89e1807408 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -24,7 +24,7 @@
24 */ 24 */
25static DEFINE_PER_CPU(struct task_struct *, idle_threads); 25static DEFINE_PER_CPU(struct task_struct *, idle_threads);
26 26
27struct task_struct * __cpuinit idle_thread_get(unsigned int cpu) 27struct task_struct *idle_thread_get(unsigned int cpu)
28{ 28{
29 struct task_struct *tsk = per_cpu(idle_threads, cpu); 29 struct task_struct *tsk = per_cpu(idle_threads, cpu);
30 30
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ca25e6e704a2..be3d3514c325 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
699} 699}
700EXPORT_SYMBOL(send_remote_softirq); 700EXPORT_SYMBOL(send_remote_softirq);
701 701
702static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 702static int remote_softirq_cpu_notify(struct notifier_block *self,
703 unsigned long action, void *hcpu) 703 unsigned long action, void *hcpu)
704{ 704{
705 /* 705 /*
@@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
728 return NOTIFY_OK; 728 return NOTIFY_OK;
729} 729}
730 730
731static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 731static struct notifier_block remote_softirq_cpu_notifier = {
732 .notifier_call = remote_softirq_cpu_notify, 732 .notifier_call = remote_softirq_cpu_notify,
733}; 733};
734 734
@@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu)
830} 830}
831#endif /* CONFIG_HOTPLUG_CPU */ 831#endif /* CONFIG_HOTPLUG_CPU */
832 832
833static int __cpuinit cpu_callback(struct notifier_block *nfb, 833static int cpu_callback(struct notifier_block *nfb,
834 unsigned long action, 834 unsigned long action,
835 void *hcpu) 835 void *hcpu)
836{ 836{
@@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 return NOTIFY_OK; 845 return NOTIFY_OK;
846} 846}
847 847
848static struct notifier_block __cpuinitdata cpu_nfb = { 848static struct notifier_block cpu_nfb = {
849 .notifier_call = cpu_callback 849 .notifier_call = cpu_callback
850}; 850};
851 851
diff --git a/kernel/sys.c b/kernel/sys.c
index 071de900c824..771129b299f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -116,20 +116,6 @@ EXPORT_SYMBOL(fs_overflowuid);
116EXPORT_SYMBOL(fs_overflowgid); 116EXPORT_SYMBOL(fs_overflowgid);
117 117
118/* 118/*
119 * this indicates whether you can reboot with ctrl-alt-del: the default is yes
120 */
121
122int C_A_D = 1;
123struct pid *cad_pid;
124EXPORT_SYMBOL(cad_pid);
125
126/*
127 * If set, this is used for preparing the system to power off.
128 */
129
130void (*pm_power_off_prepare)(void);
131
132/*
133 * Returns true if current's euid is same as p's uid or euid, 119 * Returns true if current's euid is same as p's uid or euid,
134 * or has CAP_SYS_NICE to p's user_ns. 120 * or has CAP_SYS_NICE to p's user_ns.
135 * 121 *
@@ -308,266 +294,6 @@ out_unlock:
308 return retval; 294 return retval;
309} 295}
310 296
311/**
312 * emergency_restart - reboot the system
313 *
314 * Without shutting down any hardware or taking any locks
315 * reboot the system. This is called when we know we are in
316 * trouble so this is our best effort to reboot. This is
317 * safe to call in interrupt context.
318 */
319void emergency_restart(void)
320{
321 kmsg_dump(KMSG_DUMP_EMERG);
322 machine_emergency_restart();
323}
324EXPORT_SYMBOL_GPL(emergency_restart);
325
326void kernel_restart_prepare(char *cmd)
327{
328 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
329 system_state = SYSTEM_RESTART;
330 usermodehelper_disable();
331 device_shutdown();
332}
333
334/**
335 * register_reboot_notifier - Register function to be called at reboot time
336 * @nb: Info about notifier function to be called
337 *
338 * Registers a function with the list of functions
339 * to be called at reboot time.
340 *
341 * Currently always returns zero, as blocking_notifier_chain_register()
342 * always returns zero.
343 */
344int register_reboot_notifier(struct notifier_block *nb)
345{
346 return blocking_notifier_chain_register(&reboot_notifier_list, nb);
347}
348EXPORT_SYMBOL(register_reboot_notifier);
349
350/**
351 * unregister_reboot_notifier - Unregister previously registered reboot notifier
352 * @nb: Hook to be unregistered
353 *
354 * Unregisters a previously registered reboot
355 * notifier function.
356 *
357 * Returns zero on success, or %-ENOENT on failure.
358 */
359int unregister_reboot_notifier(struct notifier_block *nb)
360{
361 return blocking_notifier_chain_unregister(&reboot_notifier_list, nb);
362}
363EXPORT_SYMBOL(unregister_reboot_notifier);
364
365/* Add backwards compatibility for stable trees. */
366#ifndef PF_NO_SETAFFINITY
367#define PF_NO_SETAFFINITY PF_THREAD_BOUND
368#endif
369
370static void migrate_to_reboot_cpu(void)
371{
372 /* The boot cpu is always logical cpu 0 */
373 int cpu = 0;
374
375 cpu_hotplug_disable();
376
377 /* Make certain the cpu I'm about to reboot on is online */
378 if (!cpu_online(cpu))
379 cpu = cpumask_first(cpu_online_mask);
380
381 /* Prevent races with other tasks migrating this task */
382 current->flags |= PF_NO_SETAFFINITY;
383
384 /* Make certain I only run on the appropriate processor */
385 set_cpus_allowed_ptr(current, cpumask_of(cpu));
386}
387
388/**
389 * kernel_restart - reboot the system
390 * @cmd: pointer to buffer containing command to execute for restart
391 * or %NULL
392 *
393 * Shutdown everything and perform a clean reboot.
394 * This is not safe to call in interrupt context.
395 */
396void kernel_restart(char *cmd)
397{
398 kernel_restart_prepare(cmd);
399 migrate_to_reboot_cpu();
400 syscore_shutdown();
401 if (!cmd)
402 printk(KERN_EMERG "Restarting system.\n");
403 else
404 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
405 kmsg_dump(KMSG_DUMP_RESTART);
406 machine_restart(cmd);
407}
408EXPORT_SYMBOL_GPL(kernel_restart);
409
410static void kernel_shutdown_prepare(enum system_states state)
411{
412 blocking_notifier_call_chain(&reboot_notifier_list,
413 (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
414 system_state = state;
415 usermodehelper_disable();
416 device_shutdown();
417}
418/**
419 * kernel_halt - halt the system
420 *
421 * Shutdown everything and perform a clean system halt.
422 */
423void kernel_halt(void)
424{
425 kernel_shutdown_prepare(SYSTEM_HALT);
426 migrate_to_reboot_cpu();
427 syscore_shutdown();
428 printk(KERN_EMERG "System halted.\n");
429 kmsg_dump(KMSG_DUMP_HALT);
430 machine_halt();
431}
432
433EXPORT_SYMBOL_GPL(kernel_halt);
434
435/**
436 * kernel_power_off - power_off the system
437 *
438 * Shutdown everything and perform a clean system power_off.
439 */
440void kernel_power_off(void)
441{
442 kernel_shutdown_prepare(SYSTEM_POWER_OFF);
443 if (pm_power_off_prepare)
444 pm_power_off_prepare();
445 migrate_to_reboot_cpu();
446 syscore_shutdown();
447 printk(KERN_EMERG "Power down.\n");
448 kmsg_dump(KMSG_DUMP_POWEROFF);
449 machine_power_off();
450}
451EXPORT_SYMBOL_GPL(kernel_power_off);
452
453static DEFINE_MUTEX(reboot_mutex);
454
455/*
456 * Reboot system call: for obvious reasons only root may call it,
457 * and even root needs to set up some magic numbers in the registers
458 * so that some mistake won't make this reboot the whole machine.
459 * You can also set the meaning of the ctrl-alt-del-key here.
460 *
461 * reboot doesn't sync: do that yourself before calling this.
462 */
463SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
464 void __user *, arg)
465{
466 struct pid_namespace *pid_ns = task_active_pid_ns(current);
467 char buffer[256];
468 int ret = 0;
469
470 /* We only trust the superuser with rebooting the system. */
471 if (!ns_capable(pid_ns->user_ns, CAP_SYS_BOOT))
472 return -EPERM;
473
474 /* For safety, we require "magic" arguments. */
475 if (magic1 != LINUX_REBOOT_MAGIC1 ||
476 (magic2 != LINUX_REBOOT_MAGIC2 &&
477 magic2 != LINUX_REBOOT_MAGIC2A &&
478 magic2 != LINUX_REBOOT_MAGIC2B &&
479 magic2 != LINUX_REBOOT_MAGIC2C))
480 return -EINVAL;
481
482 /*
483 * If pid namespaces are enabled and the current task is in a child
484 * pid_namespace, the command is handled by reboot_pid_ns() which will
485 * call do_exit().
486 */
487 ret = reboot_pid_ns(pid_ns, cmd);
488 if (ret)
489 return ret;
490
491 /* Instead of trying to make the power_off code look like
492 * halt when pm_power_off is not set do it the easy way.
493 */
494 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
495 cmd = LINUX_REBOOT_CMD_HALT;
496
497 mutex_lock(&reboot_mutex);
498 switch (cmd) {
499 case LINUX_REBOOT_CMD_RESTART:
500 kernel_restart(NULL);
501 break;
502
503 case LINUX_REBOOT_CMD_CAD_ON:
504 C_A_D = 1;
505 break;
506
507 case LINUX_REBOOT_CMD_CAD_OFF:
508 C_A_D = 0;
509 break;
510
511 case LINUX_REBOOT_CMD_HALT:
512 kernel_halt();
513 do_exit(0);
514 panic("cannot halt.\n");
515
516 case LINUX_REBOOT_CMD_POWER_OFF:
517 kernel_power_off();
518 do_exit(0);
519 break;
520
521 case LINUX_REBOOT_CMD_RESTART2:
522 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
523 ret = -EFAULT;
524 break;
525 }
526 buffer[sizeof(buffer) - 1] = '\0';
527
528 kernel_restart(buffer);
529 break;
530
531#ifdef CONFIG_KEXEC
532 case LINUX_REBOOT_CMD_KEXEC:
533 ret = kernel_kexec();
534 break;
535#endif
536
537#ifdef CONFIG_HIBERNATION
538 case LINUX_REBOOT_CMD_SW_SUSPEND:
539 ret = hibernate();
540 break;
541#endif
542
543 default:
544 ret = -EINVAL;
545 break;
546 }
547 mutex_unlock(&reboot_mutex);
548 return ret;
549}
550
551static void deferred_cad(struct work_struct *dummy)
552{
553 kernel_restart(NULL);
554}
555
556/*
557 * This function gets called by ctrl-alt-del - ie the keyboard interrupt.
558 * As it's called within an interrupt, it may NOT sync: the only choice
559 * is whether to reboot at once, or just ignore the ctrl-alt-del.
560 */
561void ctrl_alt_del(void)
562{
563 static DECLARE_WORK(cad_work, deferred_cad);
564
565 if (C_A_D)
566 schedule_work(&cad_work);
567 else
568 kill_cad_pid(SIGINT, 1);
569}
570
571/* 297/*
572 * Unprivileged users may change the real gid to the effective gid 298 * Unprivileged users may change the real gid to the effective gid
573 * or vice versa. (BSD-style) 299 * or vice versa. (BSD-style)
@@ -2292,68 +2018,6 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2292 return err ? -EFAULT : 0; 2018 return err ? -EFAULT : 0;
2293} 2019}
2294 2020
2295char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2296
2297static int __orderly_poweroff(bool force)
2298{
2299 char **argv;
2300 static char *envp[] = {
2301 "HOME=/",
2302 "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
2303 NULL
2304 };
2305 int ret;
2306
2307 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2308 if (argv) {
2309 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2310 argv_free(argv);
2311 } else {
2312 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2313 __func__, poweroff_cmd);
2314 ret = -ENOMEM;
2315 }
2316
2317 if (ret && force) {
2318 printk(KERN_WARNING "Failed to start orderly shutdown: "
2319 "forcing the issue\n");
2320 /*
2321 * I guess this should try to kick off some daemon to sync and
2322 * poweroff asap. Or not even bother syncing if we're doing an
2323 * emergency shutdown?
2324 */
2325 emergency_sync();
2326 kernel_power_off();
2327 }
2328
2329 return ret;
2330}
2331
2332static bool poweroff_force;
2333
2334static void poweroff_work_func(struct work_struct *work)
2335{
2336 __orderly_poweroff(poweroff_force);
2337}
2338
2339static DECLARE_WORK(poweroff_work, poweroff_work_func);
2340
2341/**
2342 * orderly_poweroff - Trigger an orderly system poweroff
2343 * @force: force poweroff if command execution fails
2344 *
2345 * This may be called from any context to trigger a system shutdown.
2346 * If the orderly shutdown fails, it will force an immediate shutdown.
2347 */
2348int orderly_poweroff(bool force)
2349{
2350 if (force) /* do not override the pending "true" */
2351 poweroff_force = true;
2352 schedule_work(&poweroff_work);
2353 return 0;
2354}
2355EXPORT_SYMBOL_GPL(orderly_poweroff);
2356
2357/** 2021/**
2358 * do_sysinfo - fill in sysinfo struct 2022 * do_sysinfo - fill in sysinfo struct
2359 * @info: pointer to buffer to fill 2023 * @info: pointer to buffer to fill
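The blocks removed from sys.c above are the reboot, ctrl-alt-del and orderly-poweroff paths that now live in kernel/reboot.c earlier in this diff; note that the stable-tree PF_NO_SETAFFINITY compatibility define and the hard-coded boot CPU (cpu = 0) do not survive the move, since the new file uses reboot_cpu and the flag proper.
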
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ce13c3cedb9..ac09d98490aa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -599,6 +599,13 @@ static struct ctl_table kern_table[] = {
599 .mode = 0644, 599 .mode = 0644,
600 .proc_handler = proc_dointvec, 600 .proc_handler = proc_dointvec,
601 }, 601 },
602 {
603 .procname = "traceoff_on_warning",
604 .data = &__disable_trace_on_warning,
605 .maxlen = sizeof(__disable_trace_on_warning),
606 .mode = 0644,
607 .proc_handler = proc_dointvec,
608 },
602#endif 609#endif
603#ifdef CONFIG_MODULES 610#ifdef CONFIG_MODULES
604 { 611 {
@@ -800,7 +807,7 @@ static struct ctl_table kern_table[] = {
800#if defined(CONFIG_LOCKUP_DETECTOR) 807#if defined(CONFIG_LOCKUP_DETECTOR)
801 { 808 {
802 .procname = "watchdog", 809 .procname = "watchdog",
803 .data = &watchdog_enabled, 810 .data = &watchdog_user_enabled,
804 .maxlen = sizeof (int), 811 .maxlen = sizeof (int),
805 .mode = 0644, 812 .mode = 0644,
806 .proc_handler = proc_dowatchdog, 813 .proc_handler = proc_dowatchdog,
@@ -827,7 +834,7 @@ static struct ctl_table kern_table[] = {
827 }, 834 },
828 { 835 {
829 .procname = "nmi_watchdog", 836 .procname = "nmi_watchdog",
830 .data = &watchdog_enabled, 837 .data = &watchdog_user_enabled,
831 .maxlen = sizeof (int), 838 .maxlen = sizeof (int),
832 .mode = 0644, 839 .mode = 0644,
833 .proc_handler = proc_dowatchdog, 840 .proc_handler = proc_dowatchdog,
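Two things change in kern_table here: a new traceoff_on_warning entry exposes __disable_trace_on_warning, so writing 1 to /proc/sys/kernel/traceoff_on_warning stops tracing on the first warning and preserves the ring buffer leading up to it, and the watchdog and nmi_watchdog entries now point at the renamed watchdog_user_enabled flag.
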
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index aea4a9ea6fc8..b609213ca9a2 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -3,7 +3,6 @@
3#include "../fs/xfs/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h>
7#include <linux/syscalls.h> 6#include <linux/syscalls.h>
8#include <linux/namei.h> 7#include <linux/namei.h>
9#include <linux/mount.h> 8#include <linux/mount.h>
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ff7d9d2ab504..9250130646f5 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -4,6 +4,8 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 5obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o 6obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
7obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
7obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o 8obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
8obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o 9obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
9obj-$(CONFIG_TIMER_STATS) += timer_stats.o 10obj-$(CONFIG_TIMER_STATS) += timer_stats.o
11obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b12949..eec50fcef9e4 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -199,6 +199,13 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
199 199
200} 200}
201 201
202ktime_t alarm_expires_remaining(const struct alarm *alarm)
203{
204 struct alarm_base *base = &alarm_bases[alarm->type];
205 return ktime_sub(alarm->node.expires, base->gettime());
206}
207EXPORT_SYMBOL_GPL(alarm_expires_remaining);
208
202#ifdef CONFIG_RTC_CLASS 209#ifdef CONFIG_RTC_CLASS
203/** 210/**
204 * alarmtimer_suspend - Suspend time callback 211 * alarmtimer_suspend - Suspend time callback
@@ -303,9 +310,10 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
303 alarm->type = type; 310 alarm->type = type;
304 alarm->state = ALARMTIMER_STATE_INACTIVE; 311 alarm->state = ALARMTIMER_STATE_INACTIVE;
305} 312}
313EXPORT_SYMBOL_GPL(alarm_init);
306 314
307/** 315/**
308 * alarm_start - Sets an alarm to fire 316 * alarm_start - Sets an absolute alarm to fire
309 * @alarm: ptr to alarm to set 317 * @alarm: ptr to alarm to set
310 * @start: time to run the alarm 318 * @start: time to run the alarm
311 */ 319 */
@@ -323,6 +331,34 @@ int alarm_start(struct alarm *alarm, ktime_t start)
323 spin_unlock_irqrestore(&base->lock, flags); 331 spin_unlock_irqrestore(&base->lock, flags);
324 return ret; 332 return ret;
325} 333}
334EXPORT_SYMBOL_GPL(alarm_start);
335
336/**
337 * alarm_start_relative - Sets a relative alarm to fire
338 * @alarm: ptr to alarm to set
339 * @start: time relative to now to run the alarm
340 */
341int alarm_start_relative(struct alarm *alarm, ktime_t start)
342{
343 struct alarm_base *base = &alarm_bases[alarm->type];
344
345 start = ktime_add(start, base->gettime());
346 return alarm_start(alarm, start);
347}
348EXPORT_SYMBOL_GPL(alarm_start_relative);
349
350void alarm_restart(struct alarm *alarm)
351{
352 struct alarm_base *base = &alarm_bases[alarm->type];
353 unsigned long flags;
354
355 spin_lock_irqsave(&base->lock, flags);
356 hrtimer_set_expires(&alarm->timer, alarm->node.expires);
357 hrtimer_restart(&alarm->timer);
358 alarmtimer_enqueue(base, alarm);
359 spin_unlock_irqrestore(&base->lock, flags);
360}
361EXPORT_SYMBOL_GPL(alarm_restart);
326 362
327/** 363/**
328 * alarm_try_to_cancel - Tries to cancel an alarm timer 364 * alarm_try_to_cancel - Tries to cancel an alarm timer
@@ -344,6 +380,7 @@ int alarm_try_to_cancel(struct alarm *alarm)
344 spin_unlock_irqrestore(&base->lock, flags); 380 spin_unlock_irqrestore(&base->lock, flags);
345 return ret; 381 return ret;
346} 382}
383EXPORT_SYMBOL_GPL(alarm_try_to_cancel);
347 384
348 385
349/** 386/**
@@ -361,6 +398,7 @@ int alarm_cancel(struct alarm *alarm)
361 cpu_relax(); 398 cpu_relax();
362 } 399 }
363} 400}
401EXPORT_SYMBOL_GPL(alarm_cancel);
364 402
365 403
366u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) 404u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
@@ -393,8 +431,15 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
393 alarm->node.expires = ktime_add(alarm->node.expires, interval); 431 alarm->node.expires = ktime_add(alarm->node.expires, interval);
394 return overrun; 432 return overrun;
395} 433}
434EXPORT_SYMBOL_GPL(alarm_forward);
396 435
436u64 alarm_forward_now(struct alarm *alarm, ktime_t interval)
437{
438 struct alarm_base *base = &alarm_bases[alarm->type];
397 439
440 return alarm_forward(alarm, base->gettime(), interval);
441}
442EXPORT_SYMBOL_GPL(alarm_forward_now);
398 443
399 444
400/** 445/**
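The newly exported alarm API above is enough for a driver to arm a boottime or realtime alarm without going through the posix-timer layer; a minimal sketch (hypothetical module, names and timeout illustrative):

#include <linux/module.h>
#include <linux/alarmtimer.h>
#include <linux/ktime.h>

/* Hypothetical example: fire once, five seconds of ALARM_BOOTTIME from now. */
static struct alarm example_alarm;

static enum alarmtimer_restart example_alarm_fn(struct alarm *alarm, ktime_t now)
{
	pr_info("example: alarm fired\n");
	return ALARMTIMER_NORESTART;
}

static int __init example_init(void)
{
	alarm_init(&example_alarm, ALARM_BOOTTIME, example_alarm_fn);
	alarm_start_relative(&example_alarm, ktime_set(5, 0));
	return 0;
}

static void __exit example_exit(void)
{
	alarm_cancel(&example_alarm);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

alarm_expires_remaining() and alarm_forward_now() then give the usual remaining-time and periodic-rearm helpers without the caller touching alarm_bases directly.
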
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c6d6400ee137..38959c866789 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -15,20 +15,23 @@
15#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/notifier.h>
19#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/device.h>
20 20
21#include "tick-internal.h" 21#include "tick-internal.h"
22 22
23/* The registered clock event devices */ 23/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 25static LIST_HEAD(clockevents_released);
26
27/* Notification for clock events */
28static RAW_NOTIFIER_HEAD(clockevents_chain);
29
30/* Protection for the above */ 26/* Protection for the above */
31static DEFINE_RAW_SPINLOCK(clockevents_lock); 27static DEFINE_RAW_SPINLOCK(clockevents_lock);
28/* Protection for unbind operations */
29static DEFINE_MUTEX(clockevents_mutex);
30
31struct ce_unbind {
32 struct clock_event_device *ce;
33 int res;
34};
32 35
33/** 36/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -232,47 +235,107 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
232 return (rc && force) ? clockevents_program_min_delta(dev) : rc; 235 return (rc && force) ? clockevents_program_min_delta(dev) : rc;
233} 236}
234 237
235/** 238/*
236 * clockevents_register_notifier - register a clock events change listener 239 * Called after a notify add to make devices available which were
240 * released from the notifier call.
237 */ 241 */
238int clockevents_register_notifier(struct notifier_block *nb) 242static void clockevents_notify_released(void)
239{ 243{
240 unsigned long flags; 244 struct clock_event_device *dev;
241 int ret;
242 245
243 raw_spin_lock_irqsave(&clockevents_lock, flags); 246 while (!list_empty(&clockevents_released)) {
244 ret = raw_notifier_chain_register(&clockevents_chain, nb); 247 dev = list_entry(clockevents_released.next,
245 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 248 struct clock_event_device, list);
249 list_del(&dev->list);
250 list_add(&dev->list, &clockevent_devices);
251 tick_check_new_device(dev);
252 }
253}
246 254
247 return ret; 255/*
256 * Try to install a replacement clock event device
257 */
258static int clockevents_replace(struct clock_event_device *ced)
259{
260 struct clock_event_device *dev, *newdev = NULL;
261
262 list_for_each_entry(dev, &clockevent_devices, list) {
263 if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED)
264 continue;
265
266 if (!tick_check_replacement(newdev, dev))
267 continue;
268
269 if (!try_module_get(dev->owner))
270 continue;
271
272 if (newdev)
273 module_put(newdev->owner);
274 newdev = dev;
275 }
276 if (newdev) {
277 tick_install_replacement(newdev);
278 list_del_init(&ced->list);
279 }
280 return newdev ? 0 : -EBUSY;
248} 281}
249 282
250/* 283/*
251 * Notify about a clock event change. Called with clockevents_lock 284 * Called with clockevents_mutex and clockevents_lock held
252 * held.
253 */ 285 */
254static void clockevents_do_notify(unsigned long reason, void *dev) 286static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
255{ 287{
256 raw_notifier_call_chain(&clockevents_chain, reason, dev); 288 /* Fast track. Device is unused */
289 if (ced->mode == CLOCK_EVT_MODE_UNUSED) {
290 list_del_init(&ced->list);
291 return 0;
292 }
293
294 return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY;
257} 295}
258 296
259/* 297/*
260 * Called after a notify add to make devices available which were 298 * SMP function call to unbind a device
261 * released from the notifier call.
262 */ 299 */
263static void clockevents_notify_released(void) 300static void __clockevents_unbind(void *arg)
264{ 301{
265 struct clock_event_device *dev; 302 struct ce_unbind *cu = arg;
303 int res;
304
305 raw_spin_lock(&clockevents_lock);
306 res = __clockevents_try_unbind(cu->ce, smp_processor_id());
307 if (res == -EAGAIN)
308 res = clockevents_replace(cu->ce);
309 cu->res = res;
310 raw_spin_unlock(&clockevents_lock);
311}
266 312
267 while (!list_empty(&clockevents_released)) { 313/*
268 dev = list_entry(clockevents_released.next, 314 * Issues smp function call to unbind a per cpu device. Called with
269 struct clock_event_device, list); 315 * clockevents_mutex held.
270 list_del(&dev->list); 316 */
271 list_add(&dev->list, &clockevent_devices); 317static int clockevents_unbind(struct clock_event_device *ced, int cpu)
272 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 318{
273 } 319 struct ce_unbind cu = { .ce = ced, .res = -ENODEV };
320
321 smp_call_function_single(cpu, __clockevents_unbind, &cu, 1);
322 return cu.res;
274} 323}
275 324
325/*
326 * Unbind a clockevents device.
327 */
328int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
329{
330 int ret;
331
332 mutex_lock(&clockevents_mutex);
333 ret = clockevents_unbind(ced, cpu);
334 mutex_unlock(&clockevents_mutex);
335 return ret;
336}
337EXPORT_SYMBOL_GPL(clockevents_unbind);
338
276/** 339/**
277 * clockevents_register_device - register a clock event device 340 * clockevents_register_device - register a clock event device
278 * @dev: device to register 341 * @dev: device to register
@@ -290,7 +353,7 @@ void clockevents_register_device(struct clock_event_device *dev)
290 raw_spin_lock_irqsave(&clockevents_lock, flags); 353 raw_spin_lock_irqsave(&clockevents_lock, flags);
291 354
292 list_add(&dev->list, &clockevent_devices); 355 list_add(&dev->list, &clockevent_devices);
293 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 356 tick_check_new_device(dev);
294 clockevents_notify_released(); 357 clockevents_notify_released();
295 358
296 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 359 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
@@ -386,6 +449,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
386 * released list and do a notify add later. 449 * released list and do a notify add later.
387 */ 450 */
388 if (old) { 451 if (old) {
452 module_put(old->owner);
389 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 453 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
390 list_del(&old->list); 454 list_del(&old->list);
391 list_add(&old->list, &clockevents_released); 455 list_add(&old->list, &clockevents_released);
@@ -433,10 +497,36 @@ void clockevents_notify(unsigned long reason, void *arg)
433 int cpu; 497 int cpu;
434 498
435 raw_spin_lock_irqsave(&clockevents_lock, flags); 499 raw_spin_lock_irqsave(&clockevents_lock, flags);
436 clockevents_do_notify(reason, arg);
437 500
438 switch (reason) { 501 switch (reason) {
502 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
503 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
504 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
505 tick_broadcast_on_off(reason, arg);
506 break;
507
508 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
509 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
510 tick_broadcast_oneshot_control(reason);
511 break;
512
513 case CLOCK_EVT_NOTIFY_CPU_DYING:
514 tick_handover_do_timer(arg);
515 break;
516
517 case CLOCK_EVT_NOTIFY_SUSPEND:
518 tick_suspend();
519 tick_suspend_broadcast();
520 break;
521
522 case CLOCK_EVT_NOTIFY_RESUME:
523 tick_resume();
524 break;
525
439 case CLOCK_EVT_NOTIFY_CPU_DEAD: 526 case CLOCK_EVT_NOTIFY_CPU_DEAD:
527 tick_shutdown_broadcast_oneshot(arg);
528 tick_shutdown_broadcast(arg);
529 tick_shutdown(arg);
440 /* 530 /*
441 * Unregister the clock event devices which were 531 * Unregister the clock event devices which were
442 * released from the users in the notify chain. 532 * released from the users in the notify chain.
@@ -462,4 +552,123 @@ void clockevents_notify(unsigned long reason, void *arg)
462 raw_spin_unlock_irqrestore(&clockevents_lock, flags); 552 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
463} 553}
464EXPORT_SYMBOL_GPL(clockevents_notify); 554EXPORT_SYMBOL_GPL(clockevents_notify);
555
556#ifdef CONFIG_SYSFS
557struct bus_type clockevents_subsys = {
558 .name = "clockevents",
559 .dev_name = "clockevent",
560};
561
562static DEFINE_PER_CPU(struct device, tick_percpu_dev);
563static struct tick_device *tick_get_tick_dev(struct device *dev);
564
565static ssize_t sysfs_show_current_tick_dev(struct device *dev,
566 struct device_attribute *attr,
567 char *buf)
568{
569 struct tick_device *td;
570 ssize_t count = 0;
571
572 raw_spin_lock_irq(&clockevents_lock);
573 td = tick_get_tick_dev(dev);
574 if (td && td->evtdev)
575 count = snprintf(buf, PAGE_SIZE, "%s\n", td->evtdev->name);
576 raw_spin_unlock_irq(&clockevents_lock);
577 return count;
578}
579static DEVICE_ATTR(current_device, 0444, sysfs_show_current_tick_dev, NULL);
580
581/* We don't support the abomination of removable broadcast devices */
582static ssize_t sysfs_unbind_tick_dev(struct device *dev,
583 struct device_attribute *attr,
584 const char *buf, size_t count)
585{
586 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce;
589
590 if (ret < 0)
591 return ret;
592
593 ret = -ENODEV;
594 mutex_lock(&clockevents_mutex);
595 raw_spin_lock_irq(&clockevents_lock);
596 list_for_each_entry(ce, &clockevent_devices, list) {
597 if (!strcmp(ce->name, name)) {
598 ret = __clockevents_try_unbind(ce, dev->id);
599 break;
600 }
601 }
602 raw_spin_unlock_irq(&clockevents_lock);
603 /*
604 * We hold clockevents_mutex, so ce can't go away
605 */
606 if (ret == -EAGAIN)
607 ret = clockevents_unbind(ce, dev->id);
608 mutex_unlock(&clockevents_mutex);
609 return ret ? ret : count;
610}
611static DEVICE_ATTR(unbind_device, 0200, NULL, sysfs_unbind_tick_dev);
612
613#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
614static struct device tick_bc_dev = {
615 .init_name = "broadcast",
616 .id = 0,
617 .bus = &clockevents_subsys,
618};
619
620static struct tick_device *tick_get_tick_dev(struct device *dev)
621{
622 return dev == &tick_bc_dev ? tick_get_broadcast_device() :
623 &per_cpu(tick_cpu_device, dev->id);
624}
625
626static __init int tick_broadcast_init_sysfs(void)
627{
628 int err = device_register(&tick_bc_dev);
629
630 if (!err)
631 err = device_create_file(&tick_bc_dev, &dev_attr_current_device);
632 return err;
633}
634#else
635static struct tick_device *tick_get_tick_dev(struct device *dev)
636{
637 return &per_cpu(tick_cpu_device, dev->id);
638}
639static inline int tick_broadcast_init_sysfs(void) { return 0; }
465#endif 640#endif
641
642static int __init tick_init_sysfs(void)
643{
644 int cpu;
645
646 for_each_possible_cpu(cpu) {
647 struct device *dev = &per_cpu(tick_percpu_dev, cpu);
648 int err;
649
650 dev->id = cpu;
651 dev->bus = &clockevents_subsys;
652 err = device_register(dev);
653 if (!err)
654 err = device_create_file(dev, &dev_attr_current_device);
655 if (!err)
656 err = device_create_file(dev, &dev_attr_unbind_device);
657 if (err)
658 return err;
659 }
660 return tick_broadcast_init_sysfs();
661}
662
663static int __init clockevents_init_sysfs(void)
664{
665 int err = subsys_system_register(&clockevents_subsys, NULL);
666
667 if (!err)
668 err = tick_init_sysfs();
669 return err;
670}
671device_initcall(clockevents_init_sysfs);
672#endif /* SYSFS */
673
674#endif /* GENERIC_CLOCK_EVENTS */
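With this sysfs glue in place, each CPU should get a node under /sys/devices/system/clockevents/clockeventN carrying a read-only current_device attribute and a write-only unbind_device attribute (write a device name to attempt the unbind), plus a broadcast node exposing current_device only when GENERIC_CLOCKEVENTS_BROADCAST is set; in-kernel users get the same operation through clockevents_unbind_device(). The paths are inferred from the subsys and dev_name strings above rather than from documentation, so treat them as a best guess.
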
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c9583382141a..50a8736757f3 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,8 @@
31#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h> 32#include <linux/kthread.h>
33 33
34#include "tick-internal.h"
35
34void timecounter_init(struct timecounter *tc, 36void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 37 const struct cyclecounter *cc,
36 u64 start_tstamp) 38 u64 start_tstamp)
@@ -174,11 +176,12 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
174static struct clocksource *curr_clocksource; 176static struct clocksource *curr_clocksource;
175static LIST_HEAD(clocksource_list); 177static LIST_HEAD(clocksource_list);
176static DEFINE_MUTEX(clocksource_mutex); 178static DEFINE_MUTEX(clocksource_mutex);
177static char override_name[32]; 179static char override_name[CS_NAME_LEN];
178static int finished_booting; 180static int finished_booting;
179 181
180#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 182#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
181static void clocksource_watchdog_work(struct work_struct *work); 183static void clocksource_watchdog_work(struct work_struct *work);
184static void clocksource_select(void);
182 185
183static LIST_HEAD(watchdog_list); 186static LIST_HEAD(watchdog_list);
184static struct clocksource *watchdog; 187static struct clocksource *watchdog;
@@ -299,13 +302,30 @@ static void clocksource_watchdog(unsigned long data)
299 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 302 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
300 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 303 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
301 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { 304 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
305 /* Mark it valid for high-res. */
302 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 306 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
307
308 /*
309 * clocksource_done_booting() will sort it if
310 * finished_booting is not set yet.
311 */
312 if (!finished_booting)
313 continue;
314
303 /* 315 /*
304 * We just marked the clocksource as highres-capable, 316 * If this is not the current clocksource let
305 * notify the rest of the system as well so that we 317 * the watchdog thread reselect it. Due to the
306 * transition into high-res mode: 318 * change to high res this clocksource might
319 * be preferred now. If it is the current
320 * clocksource let the tick code know about
321 * that change.
307 */ 322 */
308 tick_clock_notify(); 323 if (cs != curr_clocksource) {
324 cs->flags |= CLOCK_SOURCE_RESELECT;
325 schedule_work(&watchdog_work);
326 } else {
327 tick_clock_notify();
328 }
309 } 329 }
310 } 330 }
311 331
@@ -388,44 +408,39 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
388 408
389static void clocksource_dequeue_watchdog(struct clocksource *cs) 409static void clocksource_dequeue_watchdog(struct clocksource *cs)
390{ 410{
391 struct clocksource *tmp;
392 unsigned long flags; 411 unsigned long flags;
393 412
394 spin_lock_irqsave(&watchdog_lock, flags); 413 spin_lock_irqsave(&watchdog_lock, flags);
395 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 414 if (cs != watchdog) {
396 /* cs is a watched clocksource. */ 415 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
397 list_del_init(&cs->wd_list); 416 /* cs is a watched clocksource. */
398 } else if (cs == watchdog) { 417 list_del_init(&cs->wd_list);
399 /* Reset watchdog cycles */ 418 /* Check if the watchdog timer needs to be stopped. */
400 clocksource_reset_watchdog(); 419 clocksource_stop_watchdog();
401 /* Current watchdog is removed. Find an alternative. */
402 watchdog = NULL;
403 list_for_each_entry(tmp, &clocksource_list, list) {
404 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
405 continue;
406 if (!watchdog || tmp->rating > watchdog->rating)
407 watchdog = tmp;
408 } 420 }
409 } 421 }
410 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
411 /* Check if the watchdog timer needs to be stopped. */
412 clocksource_stop_watchdog();
413 spin_unlock_irqrestore(&watchdog_lock, flags); 422 spin_unlock_irqrestore(&watchdog_lock, flags);
414} 423}
415 424
416static int clocksource_watchdog_kthread(void *data) 425static int __clocksource_watchdog_kthread(void)
417{ 426{
418 struct clocksource *cs, *tmp; 427 struct clocksource *cs, *tmp;
419 unsigned long flags; 428 unsigned long flags;
420 LIST_HEAD(unstable); 429 LIST_HEAD(unstable);
430 int select = 0;
421 431
422 mutex_lock(&clocksource_mutex);
423 spin_lock_irqsave(&watchdog_lock, flags); 432 spin_lock_irqsave(&watchdog_lock, flags);
424 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) 433 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
425 if (cs->flags & CLOCK_SOURCE_UNSTABLE) { 434 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
426 list_del_init(&cs->wd_list); 435 list_del_init(&cs->wd_list);
427 list_add(&cs->wd_list, &unstable); 436 list_add(&cs->wd_list, &unstable);
437 select = 1;
428 } 438 }
439 if (cs->flags & CLOCK_SOURCE_RESELECT) {
440 cs->flags &= ~CLOCK_SOURCE_RESELECT;
441 select = 1;
442 }
443 }
429 /* Check if the watchdog timer needs to be stopped. */ 444 /* Check if the watchdog timer needs to be stopped. */
430 clocksource_stop_watchdog(); 445 clocksource_stop_watchdog();
431 spin_unlock_irqrestore(&watchdog_lock, flags); 446 spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -435,10 +450,23 @@ static int clocksource_watchdog_kthread(void *data)
435 list_del_init(&cs->wd_list); 450 list_del_init(&cs->wd_list);
436 __clocksource_change_rating(cs, 0); 451 __clocksource_change_rating(cs, 0);
437 } 452 }
453 return select;
454}
455
456static int clocksource_watchdog_kthread(void *data)
457{
458 mutex_lock(&clocksource_mutex);
459 if (__clocksource_watchdog_kthread())
460 clocksource_select();
438 mutex_unlock(&clocksource_mutex); 461 mutex_unlock(&clocksource_mutex);
439 return 0; 462 return 0;
440} 463}
441 464
465static bool clocksource_is_watchdog(struct clocksource *cs)
466{
467 return cs == watchdog;
468}
469
442#else /* CONFIG_CLOCKSOURCE_WATCHDOG */ 470#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
443 471
444static void clocksource_enqueue_watchdog(struct clocksource *cs) 472static void clocksource_enqueue_watchdog(struct clocksource *cs)
@@ -449,7 +477,8 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
449 477
450static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } 478static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
451static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
452static inline int clocksource_watchdog_kthread(void *data) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
453 482
454#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
455 484
@@ -553,24 +582,42 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
553 582
554#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET 583#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
555 584
556/** 585static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
557 * clocksource_select - Select the best clocksource available
558 *
559 * Private function. Must hold clocksource_mutex when called.
560 *
561 * Select the clocksource with the best rating, or the clocksource,
562 * which is selected by userspace override.
563 */
564static void clocksource_select(void)
565{ 586{
566 struct clocksource *best, *cs; 587 struct clocksource *cs;
567 588
568 if (!finished_booting || list_empty(&clocksource_list)) 589 if (!finished_booting || list_empty(&clocksource_list))
590 return NULL;
591
592 /*
593 * We pick the clocksource with the highest rating. If oneshot
594 * mode is active, we pick the highres valid clocksource with
595 * the best rating.
596 */
597 list_for_each_entry(cs, &clocksource_list, list) {
598 if (skipcur && cs == curr_clocksource)
599 continue;
600 if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
601 continue;
602 return cs;
603 }
604 return NULL;
605}
606
607static void __clocksource_select(bool skipcur)
608{
609 bool oneshot = tick_oneshot_mode_active();
610 struct clocksource *best, *cs;
611
612 /* Find the best suitable clocksource */
613 best = clocksource_find_best(oneshot, skipcur);
614 if (!best)
569 return; 615 return;
570 /* First clocksource on the list has the best rating. */ 616
571 best = list_first_entry(&clocksource_list, struct clocksource, list);
572 /* Check for the override clocksource. */ 617 /* Check for the override clocksource. */
573 list_for_each_entry(cs, &clocksource_list, list) { 618 list_for_each_entry(cs, &clocksource_list, list) {
619 if (skipcur && cs == curr_clocksource)
620 continue;
574 if (strcmp(cs->name, override_name) != 0) 621 if (strcmp(cs->name, override_name) != 0)
575 continue; 622 continue;
576 /* 623 /*
@@ -578,8 +625,7 @@ static void clocksource_select(void)
578 * capable clocksource if the tick code is in oneshot 625 * capable clocksource if the tick code is in oneshot
579 * mode (highres or nohz) 626 * mode (highres or nohz)
580 */ 627 */
581 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && 628 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
582 tick_oneshot_mode_active()) {
583 /* Override clocksource cannot be used. */ 629 /* Override clocksource cannot be used. */
584 printk(KERN_WARNING "Override clocksource %s is not " 630 printk(KERN_WARNING "Override clocksource %s is not "
585 "HRT compatible. Cannot switch while in " 631 "HRT compatible. Cannot switch while in "
@@ -590,16 +636,35 @@ static void clocksource_select(void)
590 best = cs; 636 best = cs;
591 break; 637 break;
592 } 638 }
593 if (curr_clocksource != best) { 639
594 printk(KERN_INFO "Switching to clocksource %s\n", best->name); 640 if (curr_clocksource != best && !timekeeping_notify(best)) {
641 pr_info("Switched to clocksource %s\n", best->name);
595 curr_clocksource = best; 642 curr_clocksource = best;
596 timekeeping_notify(curr_clocksource);
597 } 643 }
598} 644}
599 645
646/**
647 * clocksource_select - Select the best clocksource available
648 *
649 * Private function. Must hold clocksource_mutex when called.
650 *
651 * Select the clocksource with the best rating, or the clocksource,
652 * which is selected by userspace override.
653 */
654static void clocksource_select(void)
655{
656 return __clocksource_select(false);
657}
658
659static void clocksource_select_fallback(void)
660{
661 return __clocksource_select(true);
662}
663
600#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ 664#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
601 665
602static inline void clocksource_select(void) { } 666static inline void clocksource_select(void) { }
667static inline void clocksource_select_fallback(void) { }
603 668
604#endif 669#endif
605 670
@@ -614,16 +679,11 @@ static int __init clocksource_done_booting(void)
614{ 679{
615 mutex_lock(&clocksource_mutex); 680 mutex_lock(&clocksource_mutex);
616 curr_clocksource = clocksource_default_clock(); 681 curr_clocksource = clocksource_default_clock();
617 mutex_unlock(&clocksource_mutex);
618
619 finished_booting = 1; 682 finished_booting = 1;
620
621 /* 683 /*
622 * Run the watchdog first to eliminate unstable clock sources 684 * Run the watchdog first to eliminate unstable clock sources
623 */ 685 */
624 clocksource_watchdog_kthread(NULL); 686 __clocksource_watchdog_kthread();
625
626 mutex_lock(&clocksource_mutex);
627 clocksource_select(); 687 clocksource_select();
628 mutex_unlock(&clocksource_mutex); 688 mutex_unlock(&clocksource_mutex);
629 return 0; 689 return 0;
@@ -756,7 +816,6 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
756 list_del(&cs->list); 816 list_del(&cs->list);
757 cs->rating = rating; 817 cs->rating = rating;
758 clocksource_enqueue(cs); 818 clocksource_enqueue(cs);
759 clocksource_select();
760} 819}
761 820
762/** 821/**
@@ -768,21 +827,47 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
768{ 827{
769 mutex_lock(&clocksource_mutex); 828 mutex_lock(&clocksource_mutex);
770 __clocksource_change_rating(cs, rating); 829 __clocksource_change_rating(cs, rating);
830 clocksource_select();
771 mutex_unlock(&clocksource_mutex); 831 mutex_unlock(&clocksource_mutex);
772} 832}
773EXPORT_SYMBOL(clocksource_change_rating); 833EXPORT_SYMBOL(clocksource_change_rating);
774 834
835/*
836 * Unbind clocksource @cs. Called with clocksource_mutex held
837 */
838static int clocksource_unbind(struct clocksource *cs)
839{
840 /*
841 * I really can't convince myself to support this on hardware
842 * designed by lobotomized monkeys.
843 */
844 if (clocksource_is_watchdog(cs))
845 return -EBUSY;
846
847 if (cs == curr_clocksource) {
848 /* Select and try to install a replacement clock source */
849 clocksource_select_fallback();
850 if (curr_clocksource == cs)
851 return -EBUSY;
852 }
853 clocksource_dequeue_watchdog(cs);
854 list_del_init(&cs->list);
855 return 0;
856}
857
775/** 858/**
776 * clocksource_unregister - remove a registered clocksource 859 * clocksource_unregister - remove a registered clocksource
777 * @cs: clocksource to be unregistered 860 * @cs: clocksource to be unregistered
778 */ 861 */
779void clocksource_unregister(struct clocksource *cs) 862int clocksource_unregister(struct clocksource *cs)
780{ 863{
864 int ret = 0;
865
781 mutex_lock(&clocksource_mutex); 866 mutex_lock(&clocksource_mutex);
782 clocksource_dequeue_watchdog(cs); 867 if (!list_empty(&cs->list))
783 list_del(&cs->list); 868 ret = clocksource_unbind(cs);
784 clocksource_select();
785 mutex_unlock(&clocksource_mutex); 869 mutex_unlock(&clocksource_mutex);
870 return ret;
786} 871}
787EXPORT_SYMBOL(clocksource_unregister); 872EXPORT_SYMBOL(clocksource_unregister);
788 873
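The unbind helper above only succeeds when the clocksource can actually be let go: the watchdog clock is refused outright, and the current clock is released only if clocksource_select_fallback() managed to install a replacement. A standalone user-space sketch of that decision follows; every struct, name and rating in it is an invented stand-in, not kernel API, and only the control flow mirrors the hunk.

/* User-space mock of the clocksource_unbind() decision logic above.
 * Everything here is a simplified stand-in. */
#include <stdio.h>

#define EBUSY 16

struct cs {
	const char *name;
	int rating;
	int is_watchdog;	/* stand-in for clocksource_is_watchdog() */
	int unbound;
};

static struct cs tsc  = { "tsc",  300, 0, 0 };
static struct cs hpet = { "hpet", 250, 0, 0 };
static struct cs *clock_list[] = { &tsc, &hpet };
static struct cs *curr = &tsc;

/* Stand-in for clocksource_select_fallback(): best rated clock that is
 * neither already unbound nor the one being dropped. */
static void select_fallback(struct cs *skip)
{
	struct cs *best = NULL;

	for (unsigned int i = 0; i < sizeof(clock_list) / sizeof(clock_list[0]); i++) {
		struct cs *cs = clock_list[i];

		if (cs == skip || cs->unbound)
			continue;
		if (!best || cs->rating > best->rating)
			best = cs;
	}
	if (best)
		curr = best;
}

static int unbind(struct cs *cs)
{
	if (cs->is_watchdog)		/* the watchdog clock cannot go away */
		return -EBUSY;

	if (cs == curr) {		/* install a replacement first */
		select_fallback(cs);
		if (curr == cs)
			return -EBUSY;	/* nothing left to fall back to */
	}
	cs->unbound = 1;		/* kernel: dequeue from watchdog + list */
	return 0;
}

int main(void)
{
	printf("unbind tsc : %d, current is now %s\n", unbind(&tsc), curr->name);
	printf("unbind hpet: %d, current stays  %s\n", unbind(&hpet), curr->name);
	return 0;
}

When no replacement exists, -EBUSY is all the caller gets back, which is why clocksource_unregister() now returns int as well.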
@@ -808,6 +893,23 @@ sysfs_show_current_clocksources(struct device *dev,
808 return count; 893 return count;
809} 894}
810 895
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{
898 size_t ret = cnt;
899
900 /* strings from sysfs write are not 0 terminated! */
901 if (!cnt || cnt >= CS_NAME_LEN)
902 return -EINVAL;
903
 904 /* strip off \n: */
905 if (buf[cnt-1] == '\n')
906 cnt--;
907 if (cnt > 0)
908 memcpy(dst, buf, cnt);
909 dst[cnt] = 0;
910 return ret;
911}
912
811/** 913/**
812 * sysfs_override_clocksource - interface for manually overriding clocksource 914 * sysfs_override_clocksource - interface for manually overriding clocksource
813 * @dev: unused 915 * @dev: unused
@@ -822,22 +924,13 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
822 struct device_attribute *attr, 924 struct device_attribute *attr,
823 const char *buf, size_t count) 925 const char *buf, size_t count)
824{ 926{
825 size_t ret = count; 927 size_t ret;
826
827 /* strings from sysfs write are not 0 terminated! */
828 if (count >= sizeof(override_name))
829 return -EINVAL;
830
831 /* strip of \n: */
832 if (buf[count-1] == '\n')
833 count--;
834 928
835 mutex_lock(&clocksource_mutex); 929 mutex_lock(&clocksource_mutex);
836 930
837 if (count > 0) 931 ret = sysfs_get_uname(buf, override_name, count);
838 memcpy(override_name, buf, count); 932 if (ret >= 0)
839 override_name[count] = 0; 933 clocksource_select();
840 clocksource_select();
841 934
842 mutex_unlock(&clocksource_mutex); 935 mutex_unlock(&clocksource_mutex);
843 936
@@ -845,6 +938,40 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
845} 938}
846 939
847/** 940/**
941 * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
942 * @dev: unused
943 * @attr: unused
944 * @buf: unused
945 * @count: length of buffer
946 *
947 * Takes input from sysfs interface for manually unbinding a clocksource.
948 */
949static ssize_t sysfs_unbind_clocksource(struct device *dev,
950 struct device_attribute *attr,
951 const char *buf, size_t count)
952{
953 struct clocksource *cs;
954 char name[CS_NAME_LEN];
955 size_t ret;
956
957 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0)
959 return ret;
960
961 ret = -ENODEV;
962 mutex_lock(&clocksource_mutex);
963 list_for_each_entry(cs, &clocksource_list, list) {
964 if (strcmp(cs->name, name))
965 continue;
966 ret = clocksource_unbind(cs);
967 break;
968 }
969 mutex_unlock(&clocksource_mutex);
970
971 return ret ? ret : count;
972}
973
974/**
848 * sysfs_show_available_clocksources - sysfs interface for listing clocksource 975 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
849 * @dev: unused 976 * @dev: unused
850 * @attr: unused 977 * @attr: unused
@@ -886,6 +1013,8 @@ sysfs_show_available_clocksources(struct device *dev,
886static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, 1013static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
887 sysfs_override_clocksource); 1014 sysfs_override_clocksource);
888 1015
1016static DEVICE_ATTR(unbind_clocksource, 0200, NULL, sysfs_unbind_clocksource);
1017
889static DEVICE_ATTR(available_clocksource, 0444, 1018static DEVICE_ATTR(available_clocksource, 0444,
890 sysfs_show_available_clocksources, NULL); 1019 sysfs_show_available_clocksources, NULL);
891 1020
@@ -910,6 +1039,9 @@ static int __init init_clocksource_sysfs(void)
910 &device_clocksource, 1039 &device_clocksource,
911 &dev_attr_current_clocksource); 1040 &dev_attr_current_clocksource);
912 if (!error) 1041 if (!error)
1042 error = device_create_file(&device_clocksource,
1043 &dev_attr_unbind_clocksource);
1044 if (!error)
913 error = device_create_file( 1045 error = device_create_file(
914 &device_clocksource, 1046 &device_clocksource,
915 &dev_attr_available_clocksource); 1047 &dev_attr_available_clocksource);
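sysfs_get_uname() above centralizes the handling that sysfs_override_clocksource() used to open-code: sysfs write buffers are not NUL-terminated, a single trailing newline from echo has to be stripped, and the copy must never overflow CS_NAME_LEN. A user-space approximation of that rule follows; the signed return type is a deliberate simplification so the -EINVAL case stays visible, and the buffer names are invented.

/* User-space approximation of the sysfs_get_uname() parsing shown above:
 * reject empty or oversized input, drop one trailing newline, and always
 * NUL-terminate the destination. */
#include <stdio.h>
#include <string.h>

#define CS_NAME_LEN 32		/* mirrors the define added to tick-internal.h */
#define EINVAL 22

static long get_uname(const char *buf, size_t cnt, char *dst)
{
	long ret = (long)cnt;	/* sysfs stores report the consumed length back */

	if (!cnt || cnt >= CS_NAME_LEN)
		return -EINVAL;
	if (buf[cnt - 1] == '\n')	/* "echo foo > ..." appends one */
		cnt--;
	if (cnt > 0)
		memcpy(dst, buf, cnt);
	dst[cnt] = '\0';
	return ret;
}

int main(void)
{
	char name[CS_NAME_LEN];
	const char *input = "acpi_pm\n";	/* e.g. echo acpi_pm > current_clocksource */
	long ret = get_uname(input, strlen(input), name);

	printf("ret=%ld name=\"%s\"\n", ret, name);	/* ret=8 name="acpi_pm" */
	return 0;
}

The same helper then feeds both the override store and the new unbind_clocksource store, which looks the parsed name up in clocksource_list before calling clocksource_unbind().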
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
new file mode 100644
index 000000000000..a326f27d7f09
--- /dev/null
+++ b/kernel/time/sched_clock.c
@@ -0,0 +1,212 @@
1/*
2 * sched_clock.c: support for extending counters to full 64-bit ns counter
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 */
8#include <linux/clocksource.h>
9#include <linux/init.h>
10#include <linux/jiffies.h>
11#include <linux/kernel.h>
12#include <linux/moduleparam.h>
13#include <linux/sched.h>
14#include <linux/syscore_ops.h>
15#include <linux/timer.h>
16#include <linux/sched_clock.h>
17
18struct clock_data {
19 u64 epoch_ns;
20 u32 epoch_cyc;
21 u32 epoch_cyc_copy;
22 unsigned long rate;
23 u32 mult;
24 u32 shift;
25 bool suspended;
26};
27
28static void sched_clock_poll(unsigned long wrap_ticks);
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1;
31
32core_param(irqtime, irqtime, int, 0400);
33
34static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ,
36};
37
38static u32 __read_mostly sched_clock_mask = 0xffffffff;
39
40static u32 notrace jiffy_sched_clock_read(void)
41{
42 return (u32)(jiffies - INITIAL_JIFFIES);
43}
44
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{
49 return (cyc * mult) >> shift;
50}
51
52static unsigned long long notrace sched_clock_32(void)
53{
54 u64 epoch_ns;
55 u32 epoch_cyc;
56 u32 cyc;
57
58 if (cd.suspended)
59 return cd.epoch_ns;
60
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do {
69 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns;
72 smp_rmb();
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74
75 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask;
77 return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
78}
79
80/*
81 * Atomically update the sched_clock epoch.
82 */
83static void notrace update_sched_clock(void)
84{
85 unsigned long flags;
86 u32 cyc;
87 u64 ns;
88
89 cyc = read_sched_clock();
90 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift);
93 /*
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc;
99 smp_wmb();
100 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc;
103 raw_local_irq_restore(flags);
104}
105
106static void sched_clock_poll(unsigned long wrap_ticks)
107{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock();
110}
111
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
113{
114 unsigned long r, w;
115 u64 res, wrap;
116 char r_unit;
117
118 if (cd.rate > rate)
119 return;
120
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled());
123 read_sched_clock = read;
124 sched_clock_mask = (1 << bits) - 1;
125 cd.rate = rate;
126
127 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0);
129
130 r = rate;
131 if (r >= 4000000) {
132 r /= 1000000;
133 r_unit = 'M';
134 } else if (r >= 1000) {
135 r /= 1000;
136 r_unit = 'k';
137 } else
138 r_unit = ' ';
139
140 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift);
142 do_div(wrap, NSEC_PER_MSEC);
143 w = wrap;
144
145 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n",
148 bits, r, r_unit, res, w);
149
150 /*
151 * Start the timer to keep sched_clock() properly updated and
 152 * set the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock();
156
157 /*
158 * Ensure that sched_clock() starts off at 0ns
159 */
160 cd.epoch_ns = 0;
161
162 /* Enable IRQ time accounting if we have a fast enough sched_clock */
163 if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
164 enable_sched_clock_irqtime();
165
166 pr_debug("Registered %pF as sched_clock source\n", read);
167}
168
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32;
170
171unsigned long long notrace sched_clock(void)
172{
173 return sched_clock_func();
174}
175
176void __init sched_clock_postinit(void)
177{
178 /*
179 * If no sched_clock function has been provided at that point,
 180 * make it the final one.
181 */
182 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ);
184
185 sched_clock_poll(sched_clock_timer.data);
186}
187
188static int sched_clock_suspend(void)
189{
190 sched_clock_poll(sched_clock_timer.data);
191 cd.suspended = true;
192 return 0;
193}
194
195static void sched_clock_resume(void)
196{
197 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false;
200}
201
202static struct syscore_ops sched_clock_ops = {
203 .suspend = sched_clock_suspend,
204 .resume = sched_clock_resume,
205};
206
207static int __init sched_clock_syscore_init(void)
208{
209 register_syscore_ops(&sched_clock_ops);
210 return 0;
211}
212device_initcall(sched_clock_syscore_init);
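Everything in the new file funnels through ns = (cyc * mult) >> shift, and setup_sched_clock() derives mult/shift from the counter rate before computing how long the hardware counter runs until it wraps. The standalone sketch below walks through that arithmetic for a made-up 24-bit, 24 MHz counter; the mult/shift search is a simplified stand-in for the kernel's clocks_calc_mult_shift().

/* Standalone illustration of the cyc_to_ns() fixed-point conversion and
 * the wrap-time computation done in setup_sched_clock(). */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC	1000000000ULL
#define NSEC_PER_MSEC	1000000ULL

static uint64_t cyc_to_ns(uint64_t cyc, uint32_t mult, uint32_t shift)
{
	return (cyc * mult) >> shift;
}

/* Largest shift whose mult = (NSEC_PER_SEC << shift) / rate still fits
 * in 32 bits; good enough for this demonstration. */
static void calc_mult_shift(uint32_t *mult, uint32_t *shift, uint64_t rate)
{
	for (uint32_t s = 32; s > 0; s--) {
		uint64_t m = (NSEC_PER_SEC << s) / rate;

		if (m <= 0xffffffffULL) {
			*mult = (uint32_t)m;
			*shift = s;
			return;
		}
	}
	*mult = (uint32_t)(NSEC_PER_SEC / rate);
	*shift = 0;
}

int main(void)
{
	int bits = 24;			/* made-up 24-bit counter ... */
	uint64_t rate = 24000000;	/* ... ticking at 24 MHz */
	uint32_t mult, shift;

	calc_mult_shift(&mult, &shift, rate);

	/* one full counter period, as computed for the pr_info() above */
	uint64_t wrap_ns = cyc_to_ns((1ULL << bits) - 1, mult, shift);

	printf("mult=%u shift=%u\n", mult, shift);
	printf("1000 cycles -> %llu ns\n",
	       (unsigned long long)cyc_to_ns(1000, mult, shift));
	printf("wraps every %llu ms\n",
	       (unsigned long long)(wrap_ns / NSEC_PER_MSEC));
	return 0;
}

The patch then arms sched_clock_timer at roughly 90% of that wrap time (w - w/10 milliseconds), so update_sched_clock() refreshes the epoch before the raw counter can roll over undetected.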
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 20d6fba70652..218bcb565fed 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -19,6 +19,7 @@
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/smp.h> 21#include <linux/smp.h>
22#include <linux/module.h>
22 23
23#include "tick-internal.h" 24#include "tick-internal.h"
24 25
@@ -29,6 +30,7 @@
29 30
30static struct tick_device tick_broadcast_device; 31static struct tick_device tick_broadcast_device;
31static cpumask_var_t tick_broadcast_mask; 32static cpumask_var_t tick_broadcast_mask;
33static cpumask_var_t tick_broadcast_on;
32static cpumask_var_t tmpmask; 34static cpumask_var_t tmpmask;
33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 35static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
34static int tick_broadcast_force; 36static int tick_broadcast_force;
@@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
64/* 66/*
65 * Check, if the device can be utilized as broadcast device: 67 * Check, if the device can be utilized as broadcast device:
66 */ 68 */
67int tick_check_broadcast_device(struct clock_event_device *dev) 69static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev)
71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false;
75
76 if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
77 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
78 return false;
79
80 return !curdev || newdev->rating > curdev->rating;
81}
82
83/*
84 * Conditionally install/replace broadcast device
85 */
86void tick_install_broadcast_device(struct clock_event_device *dev)
68{ 87{
69 struct clock_event_device *cur = tick_broadcast_device.evtdev; 88 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70 89
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || 90 if (!tick_check_broadcast_device(cur, dev))
72 (tick_broadcast_device.evtdev && 91 return;
73 tick_broadcast_device.evtdev->rating >= dev->rating) || 92
74 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 93 if (!try_module_get(dev->owner))
75 return 0; 94 return;
76 95
77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 96 clockevents_exchange_device(cur, dev);
78 if (cur) 97 if (cur)
79 cur->event_handler = clockevents_handle_noop; 98 cur->event_handler = clockevents_handle_noop;
80 tick_broadcast_device.evtdev = dev; 99 tick_broadcast_device.evtdev = dev;
@@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
90 */ 109 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT) 110 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify(); 111 tick_clock_notify();
93 return 1;
94} 112}
95 113
96/* 114/*
@@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
123 */ 141 */
124int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) 142int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125{ 143{
144 struct clock_event_device *bc = tick_broadcast_device.evtdev;
126 unsigned long flags; 145 unsigned long flags;
127 int ret = 0; 146 int ret;
128 147
129 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 148 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
130 149
@@ -138,20 +157,62 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
138 dev->event_handler = tick_handle_periodic; 157 dev->event_handler = tick_handle_periodic;
139 tick_device_setup_broadcast_func(dev); 158 tick_device_setup_broadcast_func(dev);
140 cpumask_set_cpu(cpu, tick_broadcast_mask); 159 cpumask_set_cpu(cpu, tick_broadcast_mask);
141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 160 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
161 tick_broadcast_start_periodic(bc);
162 else
163 tick_broadcast_setup_oneshot(bc);
142 ret = 1; 164 ret = 1;
143 } else { 165 } else {
144 /* 166 /*
145 * When the new device is not affected by the stop 167 * Clear the broadcast bit for this cpu if the
146 * feature and the cpu is marked in the broadcast mask 168 * device is not power state affected.
147 * then clear the broadcast bit.
148 */ 169 */
149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 170 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
150 int cpu = smp_processor_id();
151 cpumask_clear_cpu(cpu, tick_broadcast_mask); 171 cpumask_clear_cpu(cpu, tick_broadcast_mask);
152 tick_broadcast_clear_oneshot(cpu); 172 else
153 } else {
154 tick_device_setup_broadcast_func(dev); 173 tick_device_setup_broadcast_func(dev);
174
175 /*
176 * Clear the broadcast bit if the CPU is not in
177 * periodic broadcast on state.
178 */
179 if (!cpumask_test_cpu(cpu, tick_broadcast_on))
180 cpumask_clear_cpu(cpu, tick_broadcast_mask);
181
182 switch (tick_broadcast_device.mode) {
183 case TICKDEV_MODE_ONESHOT:
184 /*
185 * If the system is in oneshot mode we can
186 * unconditionally clear the oneshot mask bit,
187 * because the CPU is running and therefore
188 * not in an idle state which causes the power
189 * state affected device to stop. Let the
190 * caller initialize the device.
191 */
192 tick_broadcast_clear_oneshot(cpu);
193 ret = 0;
194 break;
195
196 case TICKDEV_MODE_PERIODIC:
197 /*
198 * If the system is in periodic mode, check
199 * whether the broadcast device can be
200 * switched off now.
201 */
202 if (cpumask_empty(tick_broadcast_mask) && bc)
203 clockevents_shutdown(bc);
204 /*
205 * If we kept the cpu in the broadcast mask,
206 * tell the caller to leave the per cpu device
207 * in shutdown state. The periodic interrupt
208 * is delivered by the broadcast device.
209 */
210 ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
211 break;
212 default:
213 /* Nothing to do */
214 ret = 0;
215 break;
155 } 216 }
156 } 217 }
157 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 218 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -281,6 +342,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
281 switch (*reason) { 342 switch (*reason) {
282 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 343 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 344 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
345 cpumask_set_cpu(cpu, tick_broadcast_on);
284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { 346 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
285 if (tick_broadcast_device.mode == 347 if (tick_broadcast_device.mode ==
286 TICKDEV_MODE_PERIODIC) 348 TICKDEV_MODE_PERIODIC)
@@ -290,8 +352,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
290 tick_broadcast_force = 1; 352 tick_broadcast_force = 1;
291 break; 353 break;
292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 354 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
293 if (!tick_broadcast_force && 355 if (tick_broadcast_force)
294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) { 356 break;
357 cpumask_clear_cpu(cpu, tick_broadcast_on);
358 if (!tick_device_is_functional(dev))
359 break;
360 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
295 if (tick_broadcast_device.mode == 361 if (tick_broadcast_device.mode ==
296 TICKDEV_MODE_PERIODIC) 362 TICKDEV_MODE_PERIODIC)
297 tick_setup_periodic(dev, 0); 363 tick_setup_periodic(dev, 0);
@@ -349,6 +415,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
349 415
350 bc = tick_broadcast_device.evtdev; 416 bc = tick_broadcast_device.evtdev;
351 cpumask_clear_cpu(cpu, tick_broadcast_mask); 417 cpumask_clear_cpu(cpu, tick_broadcast_mask);
418 cpumask_clear_cpu(cpu, tick_broadcast_on);
352 419
353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 420 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
354 if (bc && cpumask_empty(tick_broadcast_mask)) 421 if (bc && cpumask_empty(tick_broadcast_mask))
@@ -475,7 +542,15 @@ void tick_check_oneshot_broadcast(int cpu)
475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { 542 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 543 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
477 544
478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 545 /*
546 * We might be in the middle of switching over from
547 * periodic to oneshot. If the CPU has not yet
548 * switched over, leave the device alone.
549 */
550 if (td->mode == TICKDEV_MODE_ONESHOT) {
551 clockevents_set_mode(td->evtdev,
552 CLOCK_EVT_MODE_ONESHOT);
553 }
479 } 554 }
480} 555}
481 556
@@ -522,6 +597,13 @@ again:
522 cpumask_clear(tick_broadcast_force_mask); 597 cpumask_clear(tick_broadcast_force_mask);
523 598
524 /* 599 /*
600 * Sanity check. Catch the case where we try to broadcast to
601 * offline cpus.
602 */
603 if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
604 cpumask_and(tmpmask, tmpmask, cpu_online_mask);
605
606 /*
525 * Wakeup the cpus which have an expired event. 607 * Wakeup the cpus which have an expired event.
526 */ 608 */
527 tick_do_broadcast(tmpmask); 609 tick_do_broadcast(tmpmask);
@@ -761,10 +843,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
761 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 843 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
762 844
763 /* 845 /*
764 * Clear the broadcast mask flag for the dead cpu, but do not 846 * Clear the broadcast masks for the dead cpu, but do not stop
765 * stop the broadcast device! 847 * the broadcast device!
766 */ 848 */
767 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); 849 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
850 cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
851 cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
768 852
769 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 853 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
770} 854}
@@ -792,6 +876,7 @@ bool tick_broadcast_oneshot_available(void)
792void __init tick_broadcast_init(void) 876void __init tick_broadcast_init(void)
793{ 877{
794 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT); 878 zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
879 zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
795 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT); 880 zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
796#ifdef CONFIG_TICK_ONESHOT 881#ifdef CONFIG_TICK_ONESHOT
797 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT); 882 zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
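tick_check_broadcast_device() reduces the old open-coded test to a pure predicate: dummy devices and devices that stop in deep C-states (CLOCK_EVT_FEAT_C3STOP) can never serve as the broadcast device, oneshot mode additionally demands CLOCK_EVT_FEAT_ONESHOT, and otherwise the higher rating wins. Below is a user-space sketch of just that predicate; the flag values and sample devices are invented, only the decision logic mirrors the hunk.

/* User-space sketch of the tick_check_broadcast_device() predicate. */
#include <stdio.h>
#include <stdbool.h>

#define FEAT_ONESHOT	0x1	/* illustrative values, not the kernel's */
#define FEAT_C3STOP	0x2
#define FEAT_DUMMY	0x4

struct ced {			/* minimal clock_event_device stand-in */
	const char *name;
	int rating;
	unsigned int features;
};

static bool check_broadcast(const struct ced *cur, const struct ced *new,
			    bool oneshot_mode)
{
	if (new->features & (FEAT_DUMMY | FEAT_C3STOP))
		return false;
	if (oneshot_mode && !(new->features & FEAT_ONESHOT))
		return false;
	return !cur || new->rating > cur->rating;
}

int main(void)
{
	struct ced pit   = { "pit",   110, FEAT_ONESHOT };
	struct ced hpet  = { "hpet",  150, FEAT_ONESHOT };
	struct ced lapic = { "lapic", 400, FEAT_ONESHOT | FEAT_C3STOP };

	/* lapic is better rated but stops in deep C-states, so it loses */
	printf("replace pit with hpet  : %d\n", check_broadcast(&pit, &hpet, true));
	printf("replace hpet with lapic: %d\n", check_broadcast(&hpet, &lapic, true));
	return 0;
}

Only after the predicate passes does tick_install_broadcast_device() take try_module_get(dev->owner), so a module-backed broadcast device cannot be unloaded while it is installed.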
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 5d3fb100bc06..64522ecdfe0e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,6 +18,7 @@
18#include <linux/percpu.h> 18#include <linux/percpu.h>
19#include <linux/profile.h> 19#include <linux/profile.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/module.h>
21 22
22#include <asm/irq_regs.h> 23#include <asm/irq_regs.h>
23 24
@@ -33,7 +34,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33ktime_t tick_next_period; 34ktime_t tick_next_period;
34ktime_t tick_period; 35ktime_t tick_period;
35int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
36static DEFINE_RAW_SPINLOCK(tick_device_lock);
37 37
38/* 38/*
39 * Debugging: see timer_list.c 39 * Debugging: see timer_list.c
@@ -194,7 +194,8 @@ static void tick_setup_device(struct tick_device *td,
194 * When global broadcasting is active, check if the current 194 * When global broadcasting is active, check if the current
195 * device is registered as a placeholder for broadcast mode. 195 * device is registered as a placeholder for broadcast mode.
196 * This allows us to handle this x86 misfeature in a generic 196 * This allows us to handle this x86 misfeature in a generic
197 * way. 197 * way. This function also returns !=0 when we keep the
198 * current active broadcast state for this CPU.
198 */ 199 */
199 if (tick_device_uses_broadcast(newdev, cpu)) 200 if (tick_device_uses_broadcast(newdev, cpu))
200 return; 201 return;
@@ -205,17 +206,75 @@ static void tick_setup_device(struct tick_device *td,
205 tick_setup_oneshot(newdev, handler, next_event); 206 tick_setup_oneshot(newdev, handler, next_event);
206} 207}
207 208
209void tick_install_replacement(struct clock_event_device *newdev)
210{
211 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
212 int cpu = smp_processor_id();
213
214 clockevents_exchange_device(td->evtdev, newdev);
215 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
216 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
217 tick_oneshot_notify();
218}
219
220static bool tick_check_percpu(struct clock_event_device *curdev,
221 struct clock_event_device *newdev, int cpu)
222{
223 if (!cpumask_test_cpu(cpu, newdev->cpumask))
224 return false;
225 if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
226 return true;
227 /* Check if irq affinity can be set */
228 if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
229 return false;
230 /* Prefer an existing cpu local device */
231 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
232 return false;
233 return true;
234}
235
236static bool tick_check_preferred(struct clock_event_device *curdev,
237 struct clock_event_device *newdev)
238{
239 /* Prefer oneshot capable device */
240 if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
241 if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
242 return false;
243 if (tick_oneshot_mode_active())
244 return false;
245 }
246
247 /*
248 * Use the higher rated one, but prefer a CPU local device with a lower
249 * rating than a non-CPU local device
250 */
251 return !curdev ||
252 newdev->rating > curdev->rating ||
253 !cpumask_equal(curdev->cpumask, newdev->cpumask);
254}
255
256/*
257 * Check whether the new device is a better fit than curdev. curdev
258 * can be NULL !
259 */
260bool tick_check_replacement(struct clock_event_device *curdev,
261 struct clock_event_device *newdev)
262{
263 if (tick_check_percpu(curdev, newdev, smp_processor_id()))
264 return false;
265
266 return tick_check_preferred(curdev, newdev);
267}
268
208/* 269/*
209 * Check, if the new registered device should be used. 270 * Check, if the new registered device should be used. Called with
271 * clockevents_lock held and interrupts disabled.
210 */ 272 */
211static int tick_check_new_device(struct clock_event_device *newdev) 273void tick_check_new_device(struct clock_event_device *newdev)
212{ 274{
213 struct clock_event_device *curdev; 275 struct clock_event_device *curdev;
214 struct tick_device *td; 276 struct tick_device *td;
215 int cpu, ret = NOTIFY_OK; 277 int cpu;
216 unsigned long flags;
217
218 raw_spin_lock_irqsave(&tick_device_lock, flags);
219 278
220 cpu = smp_processor_id(); 279 cpu = smp_processor_id();
221 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 280 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -225,40 +284,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)
225 curdev = td->evtdev; 284 curdev = td->evtdev;
226 285
227 /* cpu local device ? */ 286 /* cpu local device ? */
228 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { 287 if (!tick_check_percpu(curdev, newdev, cpu))
229 288 goto out_bc;
230 /*
231 * If the cpu affinity of the device interrupt can not
232 * be set, ignore it.
233 */
234 if (!irq_can_set_affinity(newdev->irq))
235 goto out_bc;
236 289
237 /* 290 /* Preference decision */
238 * If we have a cpu local device already, do not replace it 291 if (!tick_check_preferred(curdev, newdev))
239 * by a non cpu local device 292 goto out_bc;
240 */
241 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
242 goto out_bc;
243 }
244 293
245 /* 294 if (!try_module_get(newdev->owner))
246 * If we have an active device, then check the rating and the oneshot 295 return;
247 * feature.
248 */
249 if (curdev) {
250 /*
251 * Prefer one shot capable devices !
252 */
253 if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
254 !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
255 goto out_bc;
256 /*
257 * Check the rating
258 */
259 if (curdev->rating >= newdev->rating)
260 goto out_bc;
261 }
262 296
263 /* 297 /*
264 * Replace the eventually existing device by the new 298 * Replace the eventually existing device by the new
@@ -273,20 +307,13 @@ static int tick_check_new_device(struct clock_event_device *newdev)
273 tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); 307 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
274 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 308 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
275 tick_oneshot_notify(); 309 tick_oneshot_notify();
276 310 return;
277 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
278 return NOTIFY_STOP;
279 311
280out_bc: 312out_bc:
281 /* 313 /*
282 * Can the new device be used as a broadcast device ? 314 * Can the new device be used as a broadcast device ?
283 */ 315 */
284 if (tick_check_broadcast_device(newdev)) 316 tick_install_broadcast_device(newdev);
285 ret = NOTIFY_STOP;
286
287 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
288
289 return ret;
290} 317}
291 318
292/* 319/*
@@ -294,7 +321,7 @@ out_bc:
294 * 321 *
295 * Called with interrupts disabled. 322 * Called with interrupts disabled.
296 */ 323 */
297static void tick_handover_do_timer(int *cpup) 324void tick_handover_do_timer(int *cpup)
298{ 325{
299 if (*cpup == tick_do_timer_cpu) { 326 if (*cpup == tick_do_timer_cpu) {
300 int cpu = cpumask_first(cpu_online_mask); 327 int cpu = cpumask_first(cpu_online_mask);
@@ -311,13 +338,11 @@ static void tick_handover_do_timer(int *cpup)
311 * access the hardware device itself. 338 * access the hardware device itself.
312 * We just set the mode and remove it from the lists. 339 * We just set the mode and remove it from the lists.
313 */ 340 */
314static void tick_shutdown(unsigned int *cpup) 341void tick_shutdown(unsigned int *cpup)
315{ 342{
316 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); 343 struct tick_device *td = &per_cpu(tick_cpu_device, *cpup);
317 struct clock_event_device *dev = td->evtdev; 344 struct clock_event_device *dev = td->evtdev;
318 unsigned long flags;
319 345
320 raw_spin_lock_irqsave(&tick_device_lock, flags);
321 td->mode = TICKDEV_MODE_PERIODIC; 346 td->mode = TICKDEV_MODE_PERIODIC;
322 if (dev) { 347 if (dev) {
323 /* 348 /*
@@ -329,26 +354,20 @@ static void tick_shutdown(unsigned int *cpup)
329 dev->event_handler = clockevents_handle_noop; 354 dev->event_handler = clockevents_handle_noop;
330 td->evtdev = NULL; 355 td->evtdev = NULL;
331 } 356 }
332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
333} 357}
334 358
335static void tick_suspend(void) 359void tick_suspend(void)
336{ 360{
337 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 361 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
338 unsigned long flags;
339 362
340 raw_spin_lock_irqsave(&tick_device_lock, flags);
341 clockevents_shutdown(td->evtdev); 363 clockevents_shutdown(td->evtdev);
342 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
343} 364}
344 365
345static void tick_resume(void) 366void tick_resume(void)
346{ 367{
347 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 368 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
348 unsigned long flags;
349 int broadcast = tick_resume_broadcast(); 369 int broadcast = tick_resume_broadcast();
350 370
351 raw_spin_lock_irqsave(&tick_device_lock, flags);
352 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 371 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
353 372
354 if (!broadcast) { 373 if (!broadcast) {
@@ -357,68 +376,12 @@ static void tick_resume(void)
357 else 376 else
358 tick_resume_oneshot(); 377 tick_resume_oneshot();
359 } 378 }
360 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
361} 379}
362 380
363/*
364 * Notification about clock event devices
365 */
366static int tick_notify(struct notifier_block *nb, unsigned long reason,
367 void *dev)
368{
369 switch (reason) {
370
371 case CLOCK_EVT_NOTIFY_ADD:
372 return tick_check_new_device(dev);
373
374 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
375 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
376 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
377 tick_broadcast_on_off(reason, dev);
378 break;
379
380 case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
381 case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
382 tick_broadcast_oneshot_control(reason);
383 break;
384
385 case CLOCK_EVT_NOTIFY_CPU_DYING:
386 tick_handover_do_timer(dev);
387 break;
388
389 case CLOCK_EVT_NOTIFY_CPU_DEAD:
390 tick_shutdown_broadcast_oneshot(dev);
391 tick_shutdown_broadcast(dev);
392 tick_shutdown(dev);
393 break;
394
395 case CLOCK_EVT_NOTIFY_SUSPEND:
396 tick_suspend();
397 tick_suspend_broadcast();
398 break;
399
400 case CLOCK_EVT_NOTIFY_RESUME:
401 tick_resume();
402 break;
403
404 default:
405 break;
406 }
407
408 return NOTIFY_OK;
409}
410
411static struct notifier_block tick_notifier = {
412 .notifier_call = tick_notify,
413};
414
415/** 381/**
416 * tick_init - initialize the tick control 382 * tick_init - initialize the tick control
417 *
418 * Register the notifier with the clockevents framework
419 */ 383 */
420void __init tick_init(void) 384void __init tick_init(void)
421{ 385{
422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init(); 386 tick_broadcast_init();
424} 387}
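tick_check_percpu() and tick_check_preferred() split the old monolithic checks in tick_check_new_device(): the first asks whether the device may serve this CPU at all, the second whether it should replace what is already installed. The sketch below models only the preference step; the cpu_local flag is a stand-in for the cpumask comparisons in the real code, and the sample devices are invented.

/* User-space sketch of the tick_check_preferred() decision shown above. */
#include <stdio.h>
#include <stdbool.h>

struct dev {
	const char *name;
	int rating;
	bool oneshot;		/* CLOCK_EVT_FEAT_ONESHOT */
	bool cpu_local;		/* cpumask == cpumask_of(this cpu) */
};

static bool check_preferred(const struct dev *cur, const struct dev *new,
			    bool oneshot_mode_active)
{
	/* A periodic-only device never replaces a oneshot capable one,
	 * and is refused outright once the system runs in oneshot mode. */
	if (!new->oneshot) {
		if (cur && cur->oneshot)
			return false;
		if (oneshot_mode_active)
			return false;
	}

	/* Higher rating wins, but a device with a different (per-CPU)
	 * cpumask may replace a lower rated global one. */
	return !cur ||
	       new->rating > cur->rating ||
	       cur->cpu_local != new->cpu_local;
}

int main(void)
{
	struct dev global = { "global-timer", 300, true, false };
	struct dev local  = { "percpu-timer", 200, true, true  };

	printf("replace global with per-cpu: %d\n",
	       check_preferred(&global, &local, true));	/* 1: locality wins */
	printf("replace per-cpu with global: %d\n",
	       check_preferred(&local, &global, true));	/* 1 here, but the kernel's
							   tick_check_percpu() rejects
							   this case before it gets here */
	return 0;
}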
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f0299eae4602..bc906cad709b 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,6 +6,8 @@
6 6
7extern seqlock_t jiffies_lock; 7extern seqlock_t jiffies_lock;
8 8
9#define CS_NAME_LEN 32
10
9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 11#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
10 12
11#define TICK_DO_TIMER_NONE -1 13#define TICK_DO_TIMER_NONE -1
@@ -18,9 +20,19 @@ extern int tick_do_timer_cpu __read_mostly;
18 20
19extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 21extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
20extern void tick_handle_periodic(struct clock_event_device *dev); 22extern void tick_handle_periodic(struct clock_event_device *dev);
23extern void tick_check_new_device(struct clock_event_device *dev);
24extern void tick_handover_do_timer(int *cpup);
25extern void tick_shutdown(unsigned int *cpup);
26extern void tick_suspend(void);
27extern void tick_resume(void);
28extern bool tick_check_replacement(struct clock_event_device *curdev,
29 struct clock_event_device *newdev);
30extern void tick_install_replacement(struct clock_event_device *dev);
21 31
22extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
23 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35
24/* 36/*
25 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
26 */ 38 */
@@ -90,7 +102,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }
90 */ 102 */
91#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 103#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
92extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 104extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
93extern int tick_check_broadcast_device(struct clock_event_device *dev); 105extern void tick_install_broadcast_device(struct clock_event_device *dev);
94extern int tick_is_broadcast_device(struct clock_event_device *dev); 106extern int tick_is_broadcast_device(struct clock_event_device *dev);
95extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); 107extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
96extern void tick_shutdown_broadcast(unsigned int *cpup); 108extern void tick_shutdown_broadcast(unsigned int *cpup);
@@ -102,9 +114,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
102 114
103#else /* !BROADCAST */ 115#else /* !BROADCAST */
104 116
105static inline int tick_check_broadcast_device(struct clock_event_device *dev) 117static inline void tick_install_broadcast_device(struct clock_event_device *dev)
106{ 118{
107 return 0;
108} 119}
109 120
110static inline int tick_is_broadcast_device(struct clock_event_device *dev) 121static inline int tick_is_broadcast_device(struct clock_event_device *dev)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0cf1c1453181..e80183f4a6c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -178,6 +178,11 @@ static bool can_stop_full_tick(void)
178 */ 178 */
179 if (!sched_clock_stable) { 179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n"); 180 trace_tick_stop(0, "unstable sched clock\n");
181 /*
182 * Don't allow the user to think they can get
183 * full NO_HZ with this machine.
184 */
185 WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock");
181 return false; 186 return false;
182 } 187 }
183#endif 188#endif
@@ -293,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str)
293} 298}
294__setup("nohz_full=", tick_nohz_full_setup); 299__setup("nohz_full=", tick_nohz_full_setup);
295 300
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, 301static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action, 302 unsigned long action,
298 void *hcpu) 303 void *hcpu)
299{ 304{
@@ -346,16 +351,6 @@ void __init tick_nohz_init(void)
346 } 351 }
347 352
348 cpu_notifier(tick_nohz_cpu_down_callback, 0); 353 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPU are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); 354 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); 355 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361} 356}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index baeeb5c87cf1..48b9fffabdc2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,11 @@
25 25
26#include "tick-internal.h" 26#include "tick-internal.h"
27#include "ntp_internal.h" 27#include "ntp_internal.h"
28#include "timekeeping_internal.h"
29
30#define TK_CLEAR_NTP (1 << 0)
31#define TK_MIRROR (1 << 1)
32#define TK_CLOCK_WAS_SET (1 << 2)
28 33
29static struct timekeeper timekeeper; 34static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock); 35static DEFINE_RAW_SPINLOCK(timekeeper_lock);
@@ -200,9 +205,9 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
200 205
201static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 206static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
202 207
203static void update_pvclock_gtod(struct timekeeper *tk) 208static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
204{ 209{
205 raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); 210 raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
206} 211}
207 212
208/** 213/**
@@ -216,7 +221,7 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
216 221
217 raw_spin_lock_irqsave(&timekeeper_lock, flags); 222 raw_spin_lock_irqsave(&timekeeper_lock, flags);
218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 223 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
219 update_pvclock_gtod(tk); 224 update_pvclock_gtod(tk, true);
220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 225 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
221 226
222 return ret; 227 return ret;
@@ -241,16 +246,16 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 246EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 247
243/* must hold timekeeper_lock */ 248/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror) 249static void timekeeping_update(struct timekeeper *tk, unsigned int action)
245{ 250{
246 if (clearntp) { 251 if (action & TK_CLEAR_NTP) {
247 tk->ntp_error = 0; 252 tk->ntp_error = 0;
248 ntp_clear(); 253 ntp_clear();
249 } 254 }
250 update_vsyscall(tk); 255 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 256 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
252 257
253 if (mirror) 258 if (action & TK_MIRROR)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); 259 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
255} 260}
256 261
@@ -508,7 +513,7 @@ int do_settimeofday(const struct timespec *tv)
508 513
509 tk_set_xtime(tk, tv); 514 tk_set_xtime(tk, tv);
510 515
511 timekeeping_update(tk, true, true); 516 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
512 517
513 write_seqcount_end(&timekeeper_seq); 518 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 519 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -552,7 +557,7 @@ int timekeeping_inject_offset(struct timespec *ts)
552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 557 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
553 558
554error: /* even if we error out, we forwarded the time, so call update */ 559error: /* even if we error out, we forwarded the time, so call update */
555 timekeeping_update(tk, true, true); 560 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
556 561
557 write_seqcount_end(&timekeeper_seq); 562 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 563 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -627,13 +632,22 @@ static int change_clocksource(void *data)
627 write_seqcount_begin(&timekeeper_seq); 632 write_seqcount_begin(&timekeeper_seq);
628 633
629 timekeeping_forward_now(tk); 634 timekeeping_forward_now(tk);
630 if (!new->enable || new->enable(new) == 0) { 635 /*
631 old = tk->clock; 636 * If the cs is in module, get a module reference. Succeeds
632 tk_setup_internals(tk, new); 637 * for built-in code (owner == NULL) as well.
633 if (old->disable) 638 */
634 old->disable(old); 639 if (try_module_get(new->owner)) {
640 if (!new->enable || new->enable(new) == 0) {
641 old = tk->clock;
642 tk_setup_internals(tk, new);
643 if (old->disable)
644 old->disable(old);
645 module_put(old->owner);
646 } else {
647 module_put(new->owner);
648 }
635 } 649 }
636 timekeeping_update(tk, true, true); 650 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
637 651
638 write_seqcount_end(&timekeeper_seq); 652 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 653 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
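The reworked change_clocksource() follows a simple ordering rule: pin the replacement's module first, switch only if its enable() callback succeeds, and drop the old module reference only after the switch (or back out the new reference on failure). A toy user-space sketch of that ordering with an explicit refcount; none of these names are kernel API.

/* Toy refcount demo of the ordering used in change_clocksource(). */
#include <stdio.h>
#include <stdbool.h>

struct provider {
	const char *name;
	int refs;
	bool enable_fails;	/* simulate a failing ->enable() */
};

static bool get(struct provider *p) { p->refs++; return true; }	/* try_module_get() stand-in */
static void put(struct provider *p) { p->refs--; }			/* module_put() stand-in */

static struct provider *current_prov;

static void switch_provider(struct provider *new)
{
	if (!get(new))			/* pin the replacement first */
		return;

	if (!new->enable_fails) {	/* ->enable() succeeded */
		struct provider *old = current_prov;

		current_prov = new;
		if (old)
			put(old);	/* release the previous owner */
	} else {
		put(new);		/* back out the reference we took */
	}
}

int main(void)
{
	struct provider a = { "builtin", 1, false };
	struct provider b = { "module",  0, false };

	current_prov = &a;
	switch_provider(&b);
	printf("current=%s  a.refs=%d  b.refs=%d\n",
	       current_prov->name, a.refs, b.refs);	/* module, 0, 1 */
	return 0;
}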
@@ -648,14 +662,15 @@ static int change_clocksource(void *data)
648 * This function is called from clocksource.c after a new, better clock 662 * This function is called from clocksource.c after a new, better clock
649 * source has been registered. The caller holds the clocksource_mutex. 663 * source has been registered. The caller holds the clocksource_mutex.
650 */ 664 */
651void timekeeping_notify(struct clocksource *clock) 665int timekeeping_notify(struct clocksource *clock)
652{ 666{
653 struct timekeeper *tk = &timekeeper; 667 struct timekeeper *tk = &timekeeper;
654 668
655 if (tk->clock == clock) 669 if (tk->clock == clock)
656 return; 670 return 0;
657 stop_machine(change_clocksource, clock, NULL); 671 stop_machine(change_clocksource, clock, NULL);
658 tick_clock_notify(); 672 tick_clock_notify();
673 return tk->clock == clock ? 0 : -1;
659} 674}
660 675
661/** 676/**
@@ -841,6 +856,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
841 tk_xtime_add(tk, delta); 856 tk_xtime_add(tk, delta);
842 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); 857 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
843 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); 858 tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
859 tk_debug_account_sleep_time(delta);
844} 860}
845 861
846/** 862/**
@@ -872,7 +888,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
872 888
873 __timekeeping_inject_sleeptime(tk, delta); 889 __timekeeping_inject_sleeptime(tk, delta);
874 890
875 timekeeping_update(tk, true, true); 891 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
876 892
877 write_seqcount_end(&timekeeper_seq); 893 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 894 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -954,7 +970,7 @@ static void timekeeping_resume(void)
954 tk->cycle_last = clock->cycle_last = cycle_now; 970 tk->cycle_last = clock->cycle_last = cycle_now;
955 tk->ntp_error = 0; 971 tk->ntp_error = 0;
956 timekeeping_suspended = 0; 972 timekeeping_suspended = 0;
957 timekeeping_update(tk, false, true); 973 timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
958 write_seqcount_end(&timekeeper_seq); 974 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 975 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
960 976
@@ -1236,9 +1252,10 @@ out_adjust:
1236 * It also calls into the NTP code to handle leapsecond processing. 1252 * It also calls into the NTP code to handle leapsecond processing.
1237 * 1253 *
1238 */ 1254 */
1239static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) 1255static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
1240{ 1256{
1241 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; 1257 u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
1258 unsigned int action = 0;
1242 1259
1243 while (tk->xtime_nsec >= nsecps) { 1260 while (tk->xtime_nsec >= nsecps) {
1244 int leap; 1261 int leap;
@@ -1261,8 +1278,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1261 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); 1278 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1262 1279
1263 clock_was_set_delayed(); 1280 clock_was_set_delayed();
1281 action = TK_CLOCK_WAS_SET;
1264 } 1282 }
1265 } 1283 }
1284 return action;
1266} 1285}
1267 1286
1268/** 1287/**
@@ -1347,6 +1366,7 @@ static void update_wall_time(void)
1347 struct timekeeper *tk = &shadow_timekeeper; 1366 struct timekeeper *tk = &shadow_timekeeper;
1348 cycle_t offset; 1367 cycle_t offset;
1349 int shift = 0, maxshift; 1368 int shift = 0, maxshift;
1369 unsigned int action;
1350 unsigned long flags; 1370 unsigned long flags;
1351 1371
1352 raw_spin_lock_irqsave(&timekeeper_lock, flags); 1372 raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1399,7 +1419,7 @@ static void update_wall_time(void)
1399 * Finally, make sure that after the rounding 1419 * Finally, make sure that after the rounding
1400 * xtime_nsec isn't larger than NSEC_PER_SEC 1420 * xtime_nsec isn't larger than NSEC_PER_SEC
1401 */ 1421 */
1402 accumulate_nsecs_to_secs(tk); 1422 action = accumulate_nsecs_to_secs(tk);
1403 1423
1404 write_seqcount_begin(&timekeeper_seq); 1424 write_seqcount_begin(&timekeeper_seq);
1405 /* Update clock->cycle_last with the new value */ 1425 /* Update clock->cycle_last with the new value */
@@ -1415,7 +1435,7 @@ static void update_wall_time(void)
1415 * updating. 1435 * updating.
1416 */ 1436 */
1417 memcpy(real_tk, tk, sizeof(*tk)); 1437 memcpy(real_tk, tk, sizeof(*tk));
1418 timekeeping_update(real_tk, false, false); 1438 timekeeping_update(real_tk, action);
1419 write_seqcount_end(&timekeeper_seq); 1439 write_seqcount_end(&timekeeper_seq);
1420out: 1440out:
1421 raw_spin_unlock_irqrestore(&timekeeper_lock, flags); 1441 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1677,6 +1697,7 @@ int do_adjtimex(struct timex *txc)
1677 1697
1678 if (tai != orig_tai) { 1698 if (tai != orig_tai) {
1679 __timekeeping_set_tai_offset(tk, tai); 1699 __timekeeping_set_tai_offset(tk, tai);
1700 update_pvclock_gtod(tk, true);
1680 clock_was_set_delayed(); 1701 clock_was_set_delayed();
1681 } 1702 }
1682 write_seqcount_end(&timekeeper_seq); 1703 write_seqcount_end(&timekeeper_seq);
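Replacing the two bool parameters of timekeeping_update() with an action bitmask lets each caller state exactly which side effects it needs and leaves room for new ones such as TK_CLOCK_WAS_SET without touching every call site. A minimal user-space sketch of the same style follows; the flag values are copied from the defines added above, the handler bodies are only placeholders.

/* Minimal sketch of the TK_* action-flag style introduced above:
 * callers OR together what needs to happen, the update function
 * tests individual bits. */
#include <stdio.h>

#define TK_CLEAR_NTP		(1 << 0)
#define TK_MIRROR		(1 << 1)
#define TK_CLOCK_WAS_SET	(1 << 2)

static void tk_update(unsigned int action)
{
	if (action & TK_CLEAR_NTP)
		printf("  clear NTP state\n");

	/* the gtod notifier chain always runs; the flag only tells the
	 * listeners whether the clock was actually set */
	printf("  notify listeners (was_set=%d)\n",
	       !!(action & TK_CLOCK_WAS_SET));

	if (action & TK_MIRROR)
		printf("  mirror into shadow timekeeper\n");
}

int main(void)
{
	printf("settimeofday-like path:\n");
	tk_update(TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);

	printf("periodic update path:\n");
	tk_update(0);		/* or TK_CLOCK_WAS_SET after a leap second */
	return 0;
}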
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
new file mode 100644
index 000000000000..802433a4f5eb
--- /dev/null
+++ b/kernel/time/timekeeping_debug.c
@@ -0,0 +1,72 @@
1/*
2 * debugfs file to track time spent in suspend
3 *
4 * Copyright (c) 2011, Google, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/debugfs.h>
18#include <linux/err.h>
19#include <linux/init.h>
20#include <linux/kernel.h>
21#include <linux/seq_file.h>
22#include <linux/time.h>
23
24static unsigned int sleep_time_bin[32] = {0};
25
26static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
27{
28 unsigned int bin;
29 seq_puts(s, " time (secs) count\n");
30 seq_puts(s, "------------------------------\n");
31 for (bin = 0; bin < 32; bin++) {
32 if (sleep_time_bin[bin] == 0)
33 continue;
34 seq_printf(s, "%10u - %-10u %4u\n",
35 bin ? 1 << (bin - 1) : 0, 1 << bin,
36 sleep_time_bin[bin]);
37 }
38 return 0;
39}
40
41static int tk_debug_sleep_time_open(struct inode *inode, struct file *file)
42{
43 return single_open(file, tk_debug_show_sleep_time, NULL);
44}
45
46static const struct file_operations tk_debug_sleep_time_fops = {
47 .open = tk_debug_sleep_time_open,
48 .read = seq_read,
49 .llseek = seq_lseek,
50 .release = single_release,
51};
52
53static int __init tk_debug_sleep_time_init(void)
54{
55 struct dentry *d;
56
57 d = debugfs_create_file("sleep_time", 0444, NULL, NULL,
58 &tk_debug_sleep_time_fops);
59 if (!d) {
60 pr_err("Failed to create sleep_time debug file\n");
61 return -ENOMEM;
62 }
63
64 return 0;
65}
66late_initcall(tk_debug_sleep_time_init);
67
68void tk_debug_account_sleep_time(struct timespec *t)
69{
70 sleep_time_bin[fls(t->tv_sec)]++;
71}
72
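The new debugfs file buckets every suspend interval by fls() of its length in seconds, i.e. into power-of-two ranges, which is exactly how the seq_printf() above labels the rows. Below is a standalone sketch of that binning; fls() is reimplemented because it is a kernel helper, and the sample durations are invented.

/* Standalone illustration of the sleep_time_bin[] bucketing used in
 * timekeeping_debug.c: each interval lands in bin fls(seconds), i.e.
 * the range [1 << (bin - 1), 1 << bin). */
#include <stdio.h>

static int fls(unsigned int x)		/* index of highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

static unsigned int sleep_time_bin[32];

static void account_sleep_time(unsigned int secs)
{
	sleep_time_bin[fls(secs)]++;
}

int main(void)
{
	unsigned int samples[] = { 0, 3, 45, 45, 3600 };	/* seconds asleep */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		account_sleep_time(samples[i]);

	printf(" time (secs)        count\n");
	printf("------------------------------\n");
	for (int bin = 0; bin < 32; bin++) {
		if (!sleep_time_bin[bin])
			continue;
		printf("%10u - %-10u %4u\n",
		       bin ? 1u << (bin - 1) : 0u, 1u << bin,
		       sleep_time_bin[bin]);
	}
	return 0;
}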
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
new file mode 100644
index 000000000000..13323ea08ffa
--- /dev/null
+++ b/kernel/time/timekeeping_internal.h
@@ -0,0 +1,14 @@
1#ifndef _TIMEKEEPING_INTERNAL_H
2#define _TIMEKEEPING_INTERNAL_H
3/*
4 * timekeeping debug functions
5 */
6#include <linux/time.h>
7
8#ifdef CONFIG_DEBUG_FS
9extern void tk_debug_account_sleep_time(struct timespec *t);
10#else
11#define tk_debug_account_sleep_time(x)
12#endif
13
14#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/timer.c b/kernel/timer.c
index 15ffdb3f1948..4296d13db3d1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
149 /* now that we have rounded, subtract the extra skew again */ 149 /* now that we have rounded, subtract the extra skew again */
150 j -= cpu * 3; 150 j -= cpu * 3;
151 151
152 if (j <= jiffies) /* rounding ate our timeout entirely; */ 152 /*
153 return original; 153 * Make sure j is still in the future. Otherwise return the
154 return j; 154 * unmodified value.
155 */
156 return time_is_after_jiffies(j) ? j : original;
155} 157}
156 158
157/** 159/**
@@ -1503,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1503} 1505}
1504EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1506EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1505 1507
1506static int __cpuinit init_timers_cpu(int cpu) 1508static int init_timers_cpu(int cpu)
1507{ 1509{
1508 int j; 1510 int j;
1509 struct tvec_base *base; 1511 struct tvec_base *base;
1510 static char __cpuinitdata tvec_base_done[NR_CPUS]; 1512 static char tvec_base_done[NR_CPUS];
1511 1513
1512 if (!tvec_base_done[cpu]) { 1514 if (!tvec_base_done[cpu]) {
1513 static char boot_done; 1515 static char boot_done;
@@ -1575,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1575 } 1577 }
1576} 1578}
1577 1579
1578static void __cpuinit migrate_timers(int cpu) 1580static void migrate_timers(int cpu)
1579{ 1581{
1580 struct tvec_base *old_base; 1582 struct tvec_base *old_base;
1581 struct tvec_base *new_base; 1583 struct tvec_base *new_base;
@@ -1608,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)
1608} 1610}
1609#endif /* CONFIG_HOTPLUG_CPU */ 1611#endif /* CONFIG_HOTPLUG_CPU */
1610 1612
1611static int __cpuinit timer_cpu_notify(struct notifier_block *self, 1613static int timer_cpu_notify(struct notifier_block *self,
1612 unsigned long action, void *hcpu) 1614 unsigned long action, void *hcpu)
1613{ 1615{
1614 long cpu = (long)hcpu; 1616 long cpu = (long)hcpu;
@@ -1633,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1633 return NOTIFY_OK; 1635 return NOTIFY_OK;
1634} 1636}
1635 1637
1636static struct notifier_block __cpuinitdata timers_nb = { 1638static struct notifier_block timers_nb = {
1637 .notifier_call = timer_cpu_notify, 1639 .notifier_call = timer_cpu_notify,
1638}; 1640};
1639 1641
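The round_jiffies_common() change swaps a plain j <= jiffies test for time_is_after_jiffies(j), the wrap-safe way of asking whether a timestamp is still in the future. The trick is comparing the signed difference rather than the raw values; the sketch below re-derives it with a 32-bit counter so the wrap is easy to provoke (kernel jiffies is an unsigned long, the idiom is the same).

/* Why the patch uses time_is_after_jiffies() instead of a plain compare:
 * the counter wraps, and a signed-difference comparison keeps working
 * across the wrap. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool time_after(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;	/* true if a is later than b */
}

int main(void)
{
	uint32_t jiffies = 0xfffffff0u;		/* counter about to wrap */
	uint32_t timeout = jiffies + 0x20;	/* lands at 0x10 after the wrap */

	printf("naive  timeout > jiffies: %d\n", timeout > jiffies);		/* 0: broken */
	printf("wrap-safe time_after()  : %d\n", time_after(timeout, jiffies));	/* 1: correct */
	return 0;
}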
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6c508ff33c62..67708f46baae 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -413,6 +413,17 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
413 return 0; 413 return 0;
414} 414}
415 415
416static void ftrace_sync(struct work_struct *work)
417{
418 /*
419 * This function is just a stub to implement a hard force
420 * of synchronize_sched(). This requires synchronizing
421 * tasks even in userspace and idle.
422 *
423 * Yes, function tracing is rude.
424 */
425}
426
416static int __unregister_ftrace_function(struct ftrace_ops *ops) 427static int __unregister_ftrace_function(struct ftrace_ops *ops)
417{ 428{
418 int ret; 429 int ret;
@@ -440,8 +451,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
440 * so there'll be no new users. We must ensure 451 * so there'll be no new users. We must ensure
441 * all current users are done before we free 452 * all current users are done before we free
442 * the control data. 453 * the control data.
454 * Note synchronize_sched() is not enough, as we
455 * use preempt_disable() to do RCU, but the function
456 * tracer can be called where RCU is not active
457 * (before user_exit()).
443 */ 458 */
444 synchronize_sched(); 459 schedule_on_each_cpu(ftrace_sync);
445 control_ops_free(ops); 460 control_ops_free(ops);
446 } 461 }
447 } else 462 } else
@@ -456,9 +471,13 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
456 /* 471 /*
457 * Dynamic ops may be freed, we must make sure that all 472 * Dynamic ops may be freed, we must make sure that all
458 * callers are done before leaving this function. 473 * callers are done before leaving this function.
474 *
475 * Again, normal synchronize_sched() is not good enough.
476 * We need to do a hard force of sched synchronization.
459 */ 477 */
460 if (ops->flags & FTRACE_OPS_FL_DYNAMIC) 478 if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
461 synchronize_sched(); 479 schedule_on_each_cpu(ftrace_sync);
480
462 481
463 return 0; 482 return 0;
464} 483}
@@ -622,12 +641,18 @@ static int function_stat_show(struct seq_file *m, void *v)
622 if (rec->counter <= 1) 641 if (rec->counter <= 1)
623 stddev = 0; 642 stddev = 0;
624 else { 643 else {
625 stddev = rec->time_squared - rec->counter * avg * avg; 644 /*
645 * Apply Welford's method:
646 * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
647 */
648 stddev = rec->counter * rec->time_squared -
649 rec->time * rec->time;
650
626 /* 651 /*
627 * Divide only 1000 for ns^2 -> us^2 conversion. 652 * Divide only 1000 for ns^2 -> us^2 conversion.
628 * trace_print_graph_duration will divide 1000 again. 653 * trace_print_graph_duration will divide 1000 again.
629 */ 654 */
630 do_div(stddev, (rec->counter - 1) * 1000); 655 do_div(stddev, rec->counter * (rec->counter - 1) * 1000);
631 } 656 }
632 657
633 trace_seq_init(&s); 658 trace_seq_init(&s);
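
The hunk above replaces the average-based form with the raw-sums identity s^2 = (n * \Sum x_i^2 - (\Sum x_i)^2) / (n * (n - 1)), so the truncated integer avg no longer feeds the variance; the extra factor of 1000 in the do_div() is only the ns^2 -> us^2 conversion and is left out of the sketch below. A standalone illustration of the same identity (sum, sum_sq and n are illustrative names, not ftrace fields):

#include <stdint.h>

/* Illustrative only: the variance identity applied above, in plain integer C.
 * Overflow of the intermediate products is not handled in this sketch. */
static uint64_t variance(uint64_t sum, uint64_t sum_sq, uint64_t n)
{
	if (n <= 1)
		return 0;
	/* n * sum_sq >= sum * sum (Cauchy-Schwarz), so the subtraction cannot underflow. */
	return (n * sum_sq - sum * sum) / (n * (n - 1));
}
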
@@ -3512,8 +3537,12 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
3512static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 3537static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
3513static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 3538static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3514 3539
3540/* Used by function selftest to not test if filter is set */
3541bool ftrace_filter_param __initdata;
3542
3515static int __init set_ftrace_notrace(char *str) 3543static int __init set_ftrace_notrace(char *str)
3516{ 3544{
3545 ftrace_filter_param = true;
3517 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3546 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3518 return 1; 3547 return 1;
3519} 3548}
@@ -3521,6 +3550,7 @@ __setup("ftrace_notrace=", set_ftrace_notrace);
3521 3550
3522static int __init set_ftrace_filter(char *str) 3551static int __init set_ftrace_filter(char *str)
3523{ 3552{
3553 ftrace_filter_param = true;
3524 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3554 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3525 return 1; 3555 return 1;
3526} 3556}
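
ftrace_filter_param, added above, simply records that ftrace_filter= or ftrace_notrace= was given on the kernel command line; per the comment it lets the function selftest skip its filter test rather than fight a user-supplied filter. A guess at how the consumer side might look (the real check lives in the selftest code, likely trace_selftest.c, and is not shown in this excerpt):

/* Hypothetical sketch, not taken from this patch. */
if (ftrace_filter_param) {
	pr_info("ftrace: filter set on boot, skipping filter selftest\n");
	return 0;
}
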
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e444ff88f0a4..cc2f66f68dc5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -36,11 +36,11 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
36{ 36{
37 int ret; 37 int ret;
38 38
39 ret = trace_seq_printf(s, "# compressed entry header\n"); 39 ret = trace_seq_puts(s, "# compressed entry header\n");
40 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); 40 ret = trace_seq_puts(s, "\ttype_len : 5 bits\n");
41 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); 41 ret = trace_seq_puts(s, "\ttime_delta : 27 bits\n");
42 ret = trace_seq_printf(s, "\tarray : 32 bits\n"); 42 ret = trace_seq_puts(s, "\tarray : 32 bits\n");
43 ret = trace_seq_printf(s, "\n"); 43 ret = trace_seq_putc(s, '\n');
44 ret = trace_seq_printf(s, "\tpadding : type == %d\n", 44 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
45 RINGBUF_TYPE_PADDING); 45 RINGBUF_TYPE_PADDING);
46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", 46 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
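
The conversions above are mechanical: when the format string is a constant there is nothing to format, so trace_seq_puts() (plain copy) or trace_seq_putc() (single character) does the same job without running the printf parser. Side by side, on the same trace_seq the hunk operates on:

/* Illustrative only: equivalent output, decreasing cost. */
trace_seq_printf(s, "fixed text\n");	/* parses a format string for a constant */
trace_seq_puts(s, "fixed text\n");	/* copies the literal string directly */
trace_seq_putc(s, '\n');		/* emits a single character */
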
@@ -1066,7 +1066,7 @@ static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
1066} 1066}
1067 1067
1068/** 1068/**
1069 * check_pages - integrity check of buffer pages 1069 * rb_check_pages - integrity check of buffer pages
1070 * @cpu_buffer: CPU buffer with pages to test 1070 * @cpu_buffer: CPU buffer with pages to test
1071 * 1071 *
1072 * As a safety measure we check to make sure the data pages have not 1072 * As a safety measure we check to make sure the data pages have not
@@ -1258,7 +1258,7 @@ static int rb_cpu_notify(struct notifier_block *self,
1258#endif 1258#endif
1259 1259
1260/** 1260/**
1261 * ring_buffer_alloc - allocate a new ring_buffer 1261 * __ring_buffer_alloc - allocate a new ring_buffer
1262 * @size: the size in bytes per cpu that is needed. 1262 * @size: the size in bytes per cpu that is needed.
1263 * @flags: attributes to set for the ring buffer. 1263 * @flags: attributes to set for the ring buffer.
1264 * 1264 *
@@ -1607,6 +1607,7 @@ static void update_pages_handler(struct work_struct *work)
1607 * ring_buffer_resize - resize the ring buffer 1607 * ring_buffer_resize - resize the ring buffer
1608 * @buffer: the buffer to resize. 1608 * @buffer: the buffer to resize.
1609 * @size: the new size. 1609 * @size: the new size.
1610 * @cpu_id: the cpu buffer to resize
1610 * 1611 *
1611 * Minimum size is 2 * BUF_PAGE_SIZE. 1612 * Minimum size is 2 * BUF_PAGE_SIZE.
1612 * 1613 *
@@ -3956,11 +3957,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
3956 * expected. 3957 * expected.
3957 * 3958 *
3958 * After a sequence of ring_buffer_read_prepare calls, the user is 3959 * After a sequence of ring_buffer_read_prepare calls, the user is
3959 * expected to make at least one call to ring_buffer_prepare_sync. 3960 * expected to make at least one call to ring_buffer_read_prepare_sync.
3960 * Afterwards, ring_buffer_read_start is invoked to get things going 3961 * Afterwards, ring_buffer_read_start is invoked to get things going
3961 * for real. 3962 * for real.
3962 * 3963 *
3963 * This overall must be paired with ring_buffer_finish. 3964 * This overall must be paired with ring_buffer_read_finish.
3964 */ 3965 */
3965struct ring_buffer_iter * 3966struct ring_buffer_iter *
3966ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) 3967ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
@@ -4009,7 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
4009 * an intervening ring_buffer_read_prepare_sync must have been 4010 * an intervening ring_buffer_read_prepare_sync must have been
4010 * performed. 4011 * performed.
4011 * 4012 *
4012 * Must be paired with ring_buffer_finish. 4013 * Must be paired with ring_buffer_read_finish.
4013 */ 4014 */
4014void 4015void
4015ring_buffer_read_start(struct ring_buffer_iter *iter) 4016ring_buffer_read_start(struct ring_buffer_iter *iter)
@@ -4031,7 +4032,7 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
4031EXPORT_SYMBOL_GPL(ring_buffer_read_start); 4032EXPORT_SYMBOL_GPL(ring_buffer_read_start);
4032 4033
4033/** 4034/**
4034 * ring_buffer_finish - finish reading the iterator of the buffer 4035 * ring_buffer_read_finish - finish reading the iterator of the buffer
4035 * @iter: The iterator retrieved by ring_buffer_start 4036 * @iter: The iterator retrieved by ring_buffer_start
4036 * 4037 *
4037 * This re-enables the recording to the buffer, and frees the 4038 * This re-enables the recording to the buffer, and frees the
@@ -4346,6 +4347,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
4346/** 4347/**
4347 * ring_buffer_alloc_read_page - allocate a page to read from buffer 4348 * ring_buffer_alloc_read_page - allocate a page to read from buffer
4348 * @buffer: the buffer to allocate for. 4349 * @buffer: the buffer to allocate for.
4350 * @cpu: the cpu buffer to allocate.
4349 * 4351 *
4350 * This function is used in conjunction with ring_buffer_read_page. 4352 * This function is used in conjunction with ring_buffer_read_page.
4351 * When reading a full page from the ring buffer, these functions 4353 * When reading a full page from the ring buffer, these functions
@@ -4403,7 +4405,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
4403 * to swap with a page in the ring buffer. 4405 * to swap with a page in the ring buffer.
4404 * 4406 *
4405 * for example: 4407 * for example:
4406 * rpage = ring_buffer_alloc_read_page(buffer); 4408 * rpage = ring_buffer_alloc_read_page(buffer, cpu);
4407 * if (!rpage) 4409 * if (!rpage)
4408 * return error; 4410 * return error;
4409 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); 4411 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e71a8be4a6ee..3f2477713aca 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -115,6 +115,9 @@ cpumask_var_t __read_mostly tracing_buffer_mask;
115 115
116enum ftrace_dump_mode ftrace_dump_on_oops; 116enum ftrace_dump_mode ftrace_dump_on_oops;
117 117
118/* When set, tracing will stop when a WARN*() is hit */
119int __disable_trace_on_warning;
120
118static int tracing_set_tracer(const char *buf); 121static int tracing_set_tracer(const char *buf);
119 122
120#define MAX_TRACER_SIZE 100 123#define MAX_TRACER_SIZE 100
@@ -149,6 +152,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
149} 152}
150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 153__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
151 154
155static int __init stop_trace_on_warning(char *str)
156{
157 __disable_trace_on_warning = 1;
158 return 1;
159}
160__setup("traceoff_on_warning=", stop_trace_on_warning);
161
152static int __init boot_alloc_snapshot(char *str) 162static int __init boot_alloc_snapshot(char *str)
153{ 163{
154 allocate_snapshot = true; 164 allocate_snapshot = true;
@@ -170,6 +180,7 @@ static int __init set_trace_boot_options(char *str)
170} 180}
171__setup("trace_options=", set_trace_boot_options); 181__setup("trace_options=", set_trace_boot_options);
172 182
183
173unsigned long long ns2usecs(cycle_t nsec) 184unsigned long long ns2usecs(cycle_t nsec)
174{ 185{
175 nsec += 500; 186 nsec += 500;
@@ -193,6 +204,37 @@ static struct trace_array global_trace;
193 204
194LIST_HEAD(ftrace_trace_arrays); 205LIST_HEAD(ftrace_trace_arrays);
195 206
207int trace_array_get(struct trace_array *this_tr)
208{
209 struct trace_array *tr;
210 int ret = -ENODEV;
211
212 mutex_lock(&trace_types_lock);
213 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
214 if (tr == this_tr) {
215 tr->ref++;
216 ret = 0;
217 break;
218 }
219 }
220 mutex_unlock(&trace_types_lock);
221
222 return ret;
223}
224
225static void __trace_array_put(struct trace_array *this_tr)
226{
227 WARN_ON(!this_tr->ref);
228 this_tr->ref--;
229}
230
231void trace_array_put(struct trace_array *this_tr)
232{
233 mutex_lock(&trace_types_lock);
234 __trace_array_put(this_tr);
235 mutex_unlock(&trace_types_lock);
236}
237
196int filter_current_check_discard(struct ring_buffer *buffer, 238int filter_current_check_discard(struct ring_buffer *buffer,
197 struct ftrace_event_call *call, void *rec, 239 struct ftrace_event_call *call, void *rec,
198 struct ring_buffer_event *event) 240 struct ring_buffer_event *event)
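
trace_array_get() above succeeds only while the instance is still on ftrace_trace_arrays, so an open handler can pin a trace_array against removal and the matching release drops the pin under trace_types_lock; the later hunks (tracing_open_generic_tr() and friends) wire in exactly this. Reduced to its essentials, with hypothetical names:

/* Illustrative only: the open/release pairing the helpers are built for.
 * Assumes the declarations from kernel/trace/trace.h. */
static int example_open(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;

	if (trace_array_get(tr) < 0)
		return -ENODEV;		/* instance is gone or being removed */
	filp->private_data = tr;
	return 0;
}

static int example_release(struct inode *inode, struct file *filp)
{
	trace_array_put(filp->private_data);	/* drop the reference taken at open */
	return 0;
}
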
@@ -215,9 +257,24 @@ cycle_t ftrace_now(int cpu)
215 return ts; 257 return ts;
216} 258}
217 259
260/**
261 * tracing_is_enabled - Show if global_trace has been disabled
262 *
263 * Shows if the global trace has been enabled or not. It uses the
264 * mirror flag "buffer_disabled" to be used in fast paths such as for
265 * the irqsoff tracer. But it may be inaccurate due to races. If you
266 * need to know the accurate state, use tracing_is_on() which is a little
267 * slower, but accurate.
268 */
218int tracing_is_enabled(void) 269int tracing_is_enabled(void)
219{ 270{
220 return tracing_is_on(); 271 /*
272 * For quick access (irqsoff uses this in fast path), just
273 * return the mirror variable of the state of the ring buffer.
274 * It's a little racy, but we don't really care.
275 */
276 smp_rmb();
277 return !global_trace.buffer_disabled;
221} 278}
222 279
223/* 280/*
@@ -240,7 +297,7 @@ static struct tracer *trace_types __read_mostly;
240/* 297/*
241 * trace_types_lock is used to protect the trace_types list. 298 * trace_types_lock is used to protect the trace_types list.
242 */ 299 */
243static DEFINE_MUTEX(trace_types_lock); 300DEFINE_MUTEX(trace_types_lock);
244 301
245/* 302/*
246 * serialize the access of the ring buffer 303 * serialize the access of the ring buffer
@@ -330,6 +387,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 387 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; 388 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
332 389
390static void tracer_tracing_on(struct trace_array *tr)
391{
392 if (tr->trace_buffer.buffer)
393 ring_buffer_record_on(tr->trace_buffer.buffer);
394 /*
395 * This flag is looked at when buffers haven't been allocated
396 * yet, or by some tracers (like irqsoff), that just want to
397 * know if the ring buffer has been disabled, but it can handle
398 * races of where it gets disabled but we still do a record.
399 * As the check is in the fast path of the tracers, it is more
400 * important to be fast than accurate.
401 */
402 tr->buffer_disabled = 0;
403 /* Make the flag seen by readers */
404 smp_wmb();
405}
406
333/** 407/**
334 * tracing_on - enable tracing buffers 408 * tracing_on - enable tracing buffers
335 * 409 *
@@ -338,15 +412,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
338 */ 412 */
339void tracing_on(void) 413void tracing_on(void)
340{ 414{
341 if (global_trace.trace_buffer.buffer) 415 tracer_tracing_on(&global_trace);
342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
343 /*
344 * This flag is only looked at when buffers haven't been
345 * allocated yet. We don't really care about the race
346 * between setting this flag and actually turning
347 * on the buffer.
348 */
349 global_trace.buffer_disabled = 0;
350} 416}
351EXPORT_SYMBOL_GPL(tracing_on); 417EXPORT_SYMBOL_GPL(tracing_on);
352 418
@@ -540,6 +606,23 @@ void tracing_snapshot_alloc(void)
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); 606EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */ 607#endif /* CONFIG_TRACER_SNAPSHOT */
542 608
609static void tracer_tracing_off(struct trace_array *tr)
610{
611 if (tr->trace_buffer.buffer)
612 ring_buffer_record_off(tr->trace_buffer.buffer);
613 /*
614 * This flag is looked at when buffers haven't been allocated
615 * yet, or by some tracers (like irqsoff), that just want to
616 * know if the ring buffer has been disabled, but it can handle
617 * races of where it gets disabled but we still do a record.
618 * As the check is in the fast path of the tracers, it is more
619 * important to be fast than accurate.
620 */
621 tr->buffer_disabled = 1;
622 /* Make the flag seen by readers */
623 smp_wmb();
624}
625
543/** 626/**
544 * tracing_off - turn off tracing buffers 627 * tracing_off - turn off tracing buffers
545 * 628 *
@@ -550,26 +633,35 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
550 */ 633 */
551void tracing_off(void) 634void tracing_off(void)
552{ 635{
553 if (global_trace.trace_buffer.buffer) 636 tracer_tracing_off(&global_trace);
554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
555 /*
556 * This flag is only looked at when buffers haven't been
557 * allocated yet. We don't really care about the race
558 * between setting this flag and actually turning
559 * on the buffer.
560 */
561 global_trace.buffer_disabled = 1;
562} 637}
563EXPORT_SYMBOL_GPL(tracing_off); 638EXPORT_SYMBOL_GPL(tracing_off);
564 639
640void disable_trace_on_warning(void)
641{
642 if (__disable_trace_on_warning)
643 tracing_off();
644}
645
646/**
647 * tracer_tracing_is_on - show real state of ring buffer enabled
648 * @tr : the trace array to know if ring buffer is enabled
649 *
650 * Shows real state of the ring buffer if it is enabled or not.
651 */
652static int tracer_tracing_is_on(struct trace_array *tr)
653{
654 if (tr->trace_buffer.buffer)
655 return ring_buffer_record_is_on(tr->trace_buffer.buffer);
656 return !tr->buffer_disabled;
657}
658
565/** 659/**
566 * tracing_is_on - show state of ring buffers enabled 660 * tracing_is_on - show state of ring buffers enabled
567 */ 661 */
568int tracing_is_on(void) 662int tracing_is_on(void)
569{ 663{
570 if (global_trace.trace_buffer.buffer) 664 return tracer_tracing_is_on(&global_trace);
571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
572 return !global_trace.buffer_disabled;
573} 665}
574EXPORT_SYMBOL_GPL(tracing_is_on); 666EXPORT_SYMBOL_GPL(tracing_is_on);
575 667
@@ -1543,15 +1635,6 @@ trace_function(struct trace_array *tr,
1543 __buffer_unlock_commit(buffer, event); 1635 __buffer_unlock_commit(buffer, event);
1544} 1636}
1545 1637
1546void
1547ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1548 unsigned long ip, unsigned long parent_ip, unsigned long flags,
1549 int pc)
1550{
1551 if (likely(!atomic_read(&data->disabled)))
1552 trace_function(tr, ip, parent_ip, flags, pc);
1553}
1554
1555#ifdef CONFIG_STACKTRACE 1638#ifdef CONFIG_STACKTRACE
1556 1639
1557#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) 1640#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long))
@@ -2768,10 +2851,9 @@ static const struct seq_operations tracer_seq_ops = {
2768}; 2851};
2769 2852
2770static struct trace_iterator * 2853static struct trace_iterator *
2771__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2854__tracing_open(struct trace_array *tr, struct trace_cpu *tc,
2855 struct inode *inode, struct file *file, bool snapshot)
2772{ 2856{
2773 struct trace_cpu *tc = inode->i_private;
2774 struct trace_array *tr = tc->tr;
2775 struct trace_iterator *iter; 2857 struct trace_iterator *iter;
2776 int cpu; 2858 int cpu;
2777 2859
@@ -2850,8 +2932,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2850 tracing_iter_reset(iter, cpu); 2932 tracing_iter_reset(iter, cpu);
2851 } 2933 }
2852 2934
2853 tr->ref++;
2854
2855 mutex_unlock(&trace_types_lock); 2935 mutex_unlock(&trace_types_lock);
2856 2936
2857 return iter; 2937 return iter;
@@ -2874,6 +2954,43 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2874 return 0; 2954 return 0;
2875} 2955}
2876 2956
2957/*
2958 * Open and update trace_array ref count.
2959 * Must have the current trace_array passed to it.
2960 */
2961static int tracing_open_generic_tr(struct inode *inode, struct file *filp)
2962{
2963 struct trace_array *tr = inode->i_private;
2964
2965 if (tracing_disabled)
2966 return -ENODEV;
2967
2968 if (trace_array_get(tr) < 0)
2969 return -ENODEV;
2970
2971 filp->private_data = inode->i_private;
2972
2973 return 0;
2974
2975}
2976
2977static int tracing_open_generic_tc(struct inode *inode, struct file *filp)
2978{
2979 struct trace_cpu *tc = inode->i_private;
2980 struct trace_array *tr = tc->tr;
2981
2982 if (tracing_disabled)
2983 return -ENODEV;
2984
2985 if (trace_array_get(tr) < 0)
2986 return -ENODEV;
2987
2988 filp->private_data = inode->i_private;
2989
2990 return 0;
2991
2992}
2993
2877static int tracing_release(struct inode *inode, struct file *file) 2994static int tracing_release(struct inode *inode, struct file *file)
2878{ 2995{
2879 struct seq_file *m = file->private_data; 2996 struct seq_file *m = file->private_data;
@@ -2881,17 +2998,19 @@ static int tracing_release(struct inode *inode, struct file *file)
2881 struct trace_array *tr; 2998 struct trace_array *tr;
2882 int cpu; 2999 int cpu;
2883 3000
2884 if (!(file->f_mode & FMODE_READ)) 3001 /* Writes do not use seq_file, need to grab tr from inode */
3002 if (!(file->f_mode & FMODE_READ)) {
3003 struct trace_cpu *tc = inode->i_private;
3004
3005 trace_array_put(tc->tr);
2885 return 0; 3006 return 0;
3007 }
2886 3008
2887 iter = m->private; 3009 iter = m->private;
2888 tr = iter->tr; 3010 tr = iter->tr;
2889 3011
2890 mutex_lock(&trace_types_lock); 3012 mutex_lock(&trace_types_lock);
2891 3013
2892 WARN_ON(!tr->ref);
2893 tr->ref--;
2894
2895 for_each_tracing_cpu(cpu) { 3014 for_each_tracing_cpu(cpu) {
2896 if (iter->buffer_iter[cpu]) 3015 if (iter->buffer_iter[cpu])
2897 ring_buffer_read_finish(iter->buffer_iter[cpu]); 3016 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2903,6 +3022,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2903 if (!iter->snapshot) 3022 if (!iter->snapshot)
2904 /* reenable tracing if it was previously enabled */ 3023 /* reenable tracing if it was previously enabled */
2905 tracing_start_tr(tr); 3024 tracing_start_tr(tr);
3025
3026 __trace_array_put(tr);
3027
2906 mutex_unlock(&trace_types_lock); 3028 mutex_unlock(&trace_types_lock);
2907 3029
2908 mutex_destroy(&iter->mutex); 3030 mutex_destroy(&iter->mutex);
@@ -2910,20 +3032,49 @@ static int tracing_release(struct inode *inode, struct file *file)
2910 kfree(iter->trace); 3032 kfree(iter->trace);
2911 kfree(iter->buffer_iter); 3033 kfree(iter->buffer_iter);
2912 seq_release_private(inode, file); 3034 seq_release_private(inode, file);
3035
2913 return 0; 3036 return 0;
2914} 3037}
2915 3038
3039static int tracing_release_generic_tr(struct inode *inode, struct file *file)
3040{
3041 struct trace_array *tr = inode->i_private;
3042
3043 trace_array_put(tr);
3044 return 0;
3045}
3046
3047static int tracing_release_generic_tc(struct inode *inode, struct file *file)
3048{
3049 struct trace_cpu *tc = inode->i_private;
3050 struct trace_array *tr = tc->tr;
3051
3052 trace_array_put(tr);
3053 return 0;
3054}
3055
3056static int tracing_single_release_tr(struct inode *inode, struct file *file)
3057{
3058 struct trace_array *tr = inode->i_private;
3059
3060 trace_array_put(tr);
3061
3062 return single_release(inode, file);
3063}
3064
2916static int tracing_open(struct inode *inode, struct file *file) 3065static int tracing_open(struct inode *inode, struct file *file)
2917{ 3066{
3067 struct trace_cpu *tc = inode->i_private;
3068 struct trace_array *tr = tc->tr;
2918 struct trace_iterator *iter; 3069 struct trace_iterator *iter;
2919 int ret = 0; 3070 int ret = 0;
2920 3071
3072 if (trace_array_get(tr) < 0)
3073 return -ENODEV;
3074
2921 /* If this file was open for write, then erase contents */ 3075 /* If this file was open for write, then erase contents */
2922 if ((file->f_mode & FMODE_WRITE) && 3076 if ((file->f_mode & FMODE_WRITE) &&
2923 (file->f_flags & O_TRUNC)) { 3077 (file->f_flags & O_TRUNC)) {
2924 struct trace_cpu *tc = inode->i_private;
2925 struct trace_array *tr = tc->tr;
2926
2927 if (tc->cpu == RING_BUFFER_ALL_CPUS) 3078 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2928 tracing_reset_online_cpus(&tr->trace_buffer); 3079 tracing_reset_online_cpus(&tr->trace_buffer);
2929 else 3080 else
@@ -2931,12 +3082,16 @@ static int tracing_open(struct inode *inode, struct file *file)
2931 } 3082 }
2932 3083
2933 if (file->f_mode & FMODE_READ) { 3084 if (file->f_mode & FMODE_READ) {
2934 iter = __tracing_open(inode, file, false); 3085 iter = __tracing_open(tr, tc, inode, file, false);
2935 if (IS_ERR(iter)) 3086 if (IS_ERR(iter))
2936 ret = PTR_ERR(iter); 3087 ret = PTR_ERR(iter);
2937 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 3088 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2938 iter->iter_flags |= TRACE_FILE_LAT_FMT; 3089 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2939 } 3090 }
3091
3092 if (ret < 0)
3093 trace_array_put(tr);
3094
2940 return ret; 3095 return ret;
2941} 3096}
2942 3097
@@ -3293,17 +3448,27 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
3293 3448
3294static int tracing_trace_options_open(struct inode *inode, struct file *file) 3449static int tracing_trace_options_open(struct inode *inode, struct file *file)
3295{ 3450{
3451 struct trace_array *tr = inode->i_private;
3452 int ret;
3453
3296 if (tracing_disabled) 3454 if (tracing_disabled)
3297 return -ENODEV; 3455 return -ENODEV;
3298 3456
3299 return single_open(file, tracing_trace_options_show, inode->i_private); 3457 if (trace_array_get(tr) < 0)
3458 return -ENODEV;
3459
3460 ret = single_open(file, tracing_trace_options_show, inode->i_private);
3461 if (ret < 0)
3462 trace_array_put(tr);
3463
3464 return ret;
3300} 3465}
3301 3466
3302static const struct file_operations tracing_iter_fops = { 3467static const struct file_operations tracing_iter_fops = {
3303 .open = tracing_trace_options_open, 3468 .open = tracing_trace_options_open,
3304 .read = seq_read, 3469 .read = seq_read,
3305 .llseek = seq_lseek, 3470 .llseek = seq_lseek,
3306 .release = single_release, 3471 .release = tracing_single_release_tr,
3307 .write = tracing_trace_options_write, 3472 .write = tracing_trace_options_write,
3308}; 3473};
3309 3474
@@ -3379,14 +3544,14 @@ static const char readme_msg[] =
3379 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" 3544 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3380 "\t\t\t Read the contents for more information\n" 3545 "\t\t\t Read the contents for more information\n"
3381#endif 3546#endif
3382#ifdef CONFIG_STACKTRACE 3547#ifdef CONFIG_STACK_TRACER
3383 " stack_trace\t\t- Shows the max stack trace when active\n" 3548 " stack_trace\t\t- Shows the max stack trace when active\n"
3384 " stack_max_size\t- Shows current max stack size that was traced\n" 3549 " stack_max_size\t- Shows current max stack size that was traced\n"
3385 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" 3550 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3386#ifdef CONFIG_DYNAMIC_FTRACE 3551#ifdef CONFIG_DYNAMIC_FTRACE
3387 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" 3552 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3388#endif 3553#endif
3389#endif /* CONFIG_STACKTRACE */ 3554#endif /* CONFIG_STACK_TRACER */
3390; 3555;
3391 3556
3392static ssize_t 3557static ssize_t
@@ -3791,12 +3956,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3791 if (tracing_disabled) 3956 if (tracing_disabled)
3792 return -ENODEV; 3957 return -ENODEV;
3793 3958
3959 if (trace_array_get(tr) < 0)
3960 return -ENODEV;
3961
3794 mutex_lock(&trace_types_lock); 3962 mutex_lock(&trace_types_lock);
3795 3963
3796 /* create a buffer to store the information to pass to userspace */ 3964 /* create a buffer to store the information to pass to userspace */
3797 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 3965 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3798 if (!iter) { 3966 if (!iter) {
3799 ret = -ENOMEM; 3967 ret = -ENOMEM;
3968 __trace_array_put(tr);
3800 goto out; 3969 goto out;
3801 } 3970 }
3802 3971
@@ -3843,6 +4012,7 @@ out:
3843fail: 4012fail:
3844 kfree(iter->trace); 4013 kfree(iter->trace);
3845 kfree(iter); 4014 kfree(iter);
4015 __trace_array_put(tr);
3846 mutex_unlock(&trace_types_lock); 4016 mutex_unlock(&trace_types_lock);
3847 return ret; 4017 return ret;
3848} 4018}
@@ -3850,6 +4020,8 @@ fail:
3850static int tracing_release_pipe(struct inode *inode, struct file *file) 4020static int tracing_release_pipe(struct inode *inode, struct file *file)
3851{ 4021{
3852 struct trace_iterator *iter = file->private_data; 4022 struct trace_iterator *iter = file->private_data;
4023 struct trace_cpu *tc = inode->i_private;
4024 struct trace_array *tr = tc->tr;
3853 4025
3854 mutex_lock(&trace_types_lock); 4026 mutex_lock(&trace_types_lock);
3855 4027
@@ -3863,6 +4035,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3863 kfree(iter->trace); 4035 kfree(iter->trace);
3864 kfree(iter); 4036 kfree(iter);
3865 4037
4038 trace_array_put(tr);
4039
3866 return 0; 4040 return 0;
3867} 4041}
3868 4042
@@ -3939,7 +4113,7 @@ static int tracing_wait_pipe(struct file *filp)
3939 * 4113 *
3940 * iter->pos will be 0 if we haven't read anything. 4114 * iter->pos will be 0 if we haven't read anything.
3941 */ 4115 */
3942 if (!tracing_is_enabled() && iter->pos) 4116 if (!tracing_is_on() && iter->pos)
3943 break; 4117 break;
3944 } 4118 }
3945 4119
@@ -4320,6 +4494,8 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
4320 /* resize the ring buffer to 0 */ 4494 /* resize the ring buffer to 0 */
4321 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); 4495 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
4322 4496
4497 trace_array_put(tr);
4498
4323 return 0; 4499 return 0;
4324} 4500}
4325 4501
@@ -4328,6 +4504,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4328 size_t cnt, loff_t *fpos) 4504 size_t cnt, loff_t *fpos)
4329{ 4505{
4330 unsigned long addr = (unsigned long)ubuf; 4506 unsigned long addr = (unsigned long)ubuf;
4507 struct trace_array *tr = filp->private_data;
4331 struct ring_buffer_event *event; 4508 struct ring_buffer_event *event;
4332 struct ring_buffer *buffer; 4509 struct ring_buffer *buffer;
4333 struct print_entry *entry; 4510 struct print_entry *entry;
@@ -4387,7 +4564,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4387 4564
4388 local_save_flags(irq_flags); 4565 local_save_flags(irq_flags);
4389 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4566 size = sizeof(*entry) + cnt + 2; /* possible \n added */
4390 buffer = global_trace.trace_buffer.buffer; 4567 buffer = tr->trace_buffer.buffer;
4391 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4568 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4392 irq_flags, preempt_count()); 4569 irq_flags, preempt_count());
4393 if (!event) { 4570 if (!event) {
@@ -4495,10 +4672,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4495 4672
4496static int tracing_clock_open(struct inode *inode, struct file *file) 4673static int tracing_clock_open(struct inode *inode, struct file *file)
4497{ 4674{
4675 struct trace_array *tr = inode->i_private;
4676 int ret;
4677
4498 if (tracing_disabled) 4678 if (tracing_disabled)
4499 return -ENODEV; 4679 return -ENODEV;
4500 4680
4501 return single_open(file, tracing_clock_show, inode->i_private); 4681 if (trace_array_get(tr))
4682 return -ENODEV;
4683
4684 ret = single_open(file, tracing_clock_show, inode->i_private);
4685 if (ret < 0)
4686 trace_array_put(tr);
4687
4688 return ret;
4502} 4689}
4503 4690
4504struct ftrace_buffer_info { 4691struct ftrace_buffer_info {
@@ -4511,30 +4698,40 @@ struct ftrace_buffer_info {
4511static int tracing_snapshot_open(struct inode *inode, struct file *file) 4698static int tracing_snapshot_open(struct inode *inode, struct file *file)
4512{ 4699{
4513 struct trace_cpu *tc = inode->i_private; 4700 struct trace_cpu *tc = inode->i_private;
4701 struct trace_array *tr = tc->tr;
4514 struct trace_iterator *iter; 4702 struct trace_iterator *iter;
4515 struct seq_file *m; 4703 struct seq_file *m;
4516 int ret = 0; 4704 int ret = 0;
4517 4705
4706 if (trace_array_get(tr) < 0)
4707 return -ENODEV;
4708
4518 if (file->f_mode & FMODE_READ) { 4709 if (file->f_mode & FMODE_READ) {
4519 iter = __tracing_open(inode, file, true); 4710 iter = __tracing_open(tr, tc, inode, file, true);
4520 if (IS_ERR(iter)) 4711 if (IS_ERR(iter))
4521 ret = PTR_ERR(iter); 4712 ret = PTR_ERR(iter);
4522 } else { 4713 } else {
4523 /* Writes still need the seq_file to hold the private data */ 4714 /* Writes still need the seq_file to hold the private data */
4715 ret = -ENOMEM;
4524 m = kzalloc(sizeof(*m), GFP_KERNEL); 4716 m = kzalloc(sizeof(*m), GFP_KERNEL);
4525 if (!m) 4717 if (!m)
4526 return -ENOMEM; 4718 goto out;
4527 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 4719 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4528 if (!iter) { 4720 if (!iter) {
4529 kfree(m); 4721 kfree(m);
4530 return -ENOMEM; 4722 goto out;
4531 } 4723 }
4532 iter->tr = tc->tr; 4724 ret = 0;
4725
4726 iter->tr = tr;
4533 iter->trace_buffer = &tc->tr->max_buffer; 4727 iter->trace_buffer = &tc->tr->max_buffer;
4534 iter->cpu_file = tc->cpu; 4728 iter->cpu_file = tc->cpu;
4535 m->private = iter; 4729 m->private = iter;
4536 file->private_data = m; 4730 file->private_data = m;
4537 } 4731 }
4732out:
4733 if (ret < 0)
4734 trace_array_put(tr);
4538 4735
4539 return ret; 4736 return ret;
4540} 4737}
@@ -4616,9 +4813,12 @@ out:
4616static int tracing_snapshot_release(struct inode *inode, struct file *file) 4813static int tracing_snapshot_release(struct inode *inode, struct file *file)
4617{ 4814{
4618 struct seq_file *m = file->private_data; 4815 struct seq_file *m = file->private_data;
4816 int ret;
4817
4818 ret = tracing_release(inode, file);
4619 4819
4620 if (file->f_mode & FMODE_READ) 4820 if (file->f_mode & FMODE_READ)
4621 return tracing_release(inode, file); 4821 return ret;
4622 4822
4623 /* If write only, the seq_file is just a stub */ 4823 /* If write only, the seq_file is just a stub */
4624 if (m) 4824 if (m)
@@ -4684,34 +4884,38 @@ static const struct file_operations tracing_pipe_fops = {
4684}; 4884};
4685 4885
4686static const struct file_operations tracing_entries_fops = { 4886static const struct file_operations tracing_entries_fops = {
4687 .open = tracing_open_generic, 4887 .open = tracing_open_generic_tc,
4688 .read = tracing_entries_read, 4888 .read = tracing_entries_read,
4689 .write = tracing_entries_write, 4889 .write = tracing_entries_write,
4690 .llseek = generic_file_llseek, 4890 .llseek = generic_file_llseek,
4891 .release = tracing_release_generic_tc,
4691}; 4892};
4692 4893
4693static const struct file_operations tracing_total_entries_fops = { 4894static const struct file_operations tracing_total_entries_fops = {
4694 .open = tracing_open_generic, 4895 .open = tracing_open_generic_tr,
4695 .read = tracing_total_entries_read, 4896 .read = tracing_total_entries_read,
4696 .llseek = generic_file_llseek, 4897 .llseek = generic_file_llseek,
4898 .release = tracing_release_generic_tr,
4697}; 4899};
4698 4900
4699static const struct file_operations tracing_free_buffer_fops = { 4901static const struct file_operations tracing_free_buffer_fops = {
4902 .open = tracing_open_generic_tr,
4700 .write = tracing_free_buffer_write, 4903 .write = tracing_free_buffer_write,
4701 .release = tracing_free_buffer_release, 4904 .release = tracing_free_buffer_release,
4702}; 4905};
4703 4906
4704static const struct file_operations tracing_mark_fops = { 4907static const struct file_operations tracing_mark_fops = {
4705 .open = tracing_open_generic, 4908 .open = tracing_open_generic_tr,
4706 .write = tracing_mark_write, 4909 .write = tracing_mark_write,
4707 .llseek = generic_file_llseek, 4910 .llseek = generic_file_llseek,
4911 .release = tracing_release_generic_tr,
4708}; 4912};
4709 4913
4710static const struct file_operations trace_clock_fops = { 4914static const struct file_operations trace_clock_fops = {
4711 .open = tracing_clock_open, 4915 .open = tracing_clock_open,
4712 .read = seq_read, 4916 .read = seq_read,
4713 .llseek = seq_lseek, 4917 .llseek = seq_lseek,
4714 .release = single_release, 4918 .release = tracing_single_release_tr,
4715 .write = tracing_clock_write, 4919 .write = tracing_clock_write,
4716}; 4920};
4717 4921
@@ -4739,18 +4943,22 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4739 struct trace_cpu *tc = inode->i_private; 4943 struct trace_cpu *tc = inode->i_private;
4740 struct trace_array *tr = tc->tr; 4944 struct trace_array *tr = tc->tr;
4741 struct ftrace_buffer_info *info; 4945 struct ftrace_buffer_info *info;
4946 int ret;
4742 4947
4743 if (tracing_disabled) 4948 if (tracing_disabled)
4744 return -ENODEV; 4949 return -ENODEV;
4745 4950
4951 if (trace_array_get(tr) < 0)
4952 return -ENODEV;
4953
4746 info = kzalloc(sizeof(*info), GFP_KERNEL); 4954 info = kzalloc(sizeof(*info), GFP_KERNEL);
4747 if (!info) 4955 if (!info) {
4956 trace_array_put(tr);
4748 return -ENOMEM; 4957 return -ENOMEM;
4958 }
4749 4959
4750 mutex_lock(&trace_types_lock); 4960 mutex_lock(&trace_types_lock);
4751 4961
4752 tr->ref++;
4753
4754 info->iter.tr = tr; 4962 info->iter.tr = tr;
4755 info->iter.cpu_file = tc->cpu; 4963 info->iter.cpu_file = tc->cpu;
4756 info->iter.trace = tr->current_trace; 4964 info->iter.trace = tr->current_trace;
@@ -4763,7 +4971,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4763 4971
4764 mutex_unlock(&trace_types_lock); 4972 mutex_unlock(&trace_types_lock);
4765 4973
4766 return nonseekable_open(inode, filp); 4974 ret = nonseekable_open(inode, filp);
4975 if (ret < 0)
4976 trace_array_put(tr);
4977
4978 return ret;
4767} 4979}
4768 4980
4769static unsigned int 4981static unsigned int
@@ -4863,8 +5075,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
4863 5075
4864 mutex_lock(&trace_types_lock); 5076 mutex_lock(&trace_types_lock);
4865 5077
4866 WARN_ON(!iter->tr->ref); 5078 __trace_array_put(iter->tr);
4867 iter->tr->ref--;
4868 5079
4869 if (info->spare) 5080 if (info->spare)
4870 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); 5081 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
@@ -5126,9 +5337,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
5126} 5337}
5127 5338
5128static const struct file_operations tracing_stats_fops = { 5339static const struct file_operations tracing_stats_fops = {
5129 .open = tracing_open_generic, 5340 .open = tracing_open_generic_tc,
5130 .read = tracing_stats_read, 5341 .read = tracing_stats_read,
5131 .llseek = generic_file_llseek, 5342 .llseek = generic_file_llseek,
5343 .release = tracing_release_generic_tc,
5132}; 5344};
5133 5345
5134#ifdef CONFIG_DYNAMIC_FTRACE 5346#ifdef CONFIG_DYNAMIC_FTRACE
@@ -5612,15 +5824,10 @@ rb_simple_read(struct file *filp, char __user *ubuf,
5612 size_t cnt, loff_t *ppos) 5824 size_t cnt, loff_t *ppos)
5613{ 5825{
5614 struct trace_array *tr = filp->private_data; 5826 struct trace_array *tr = filp->private_data;
5615 struct ring_buffer *buffer = tr->trace_buffer.buffer;
5616 char buf[64]; 5827 char buf[64];
5617 int r; 5828 int r;
5618 5829
5619 if (buffer) 5830 r = tracer_tracing_is_on(tr);
5620 r = ring_buffer_record_is_on(buffer);
5621 else
5622 r = 0;
5623
5624 r = sprintf(buf, "%d\n", r); 5831 r = sprintf(buf, "%d\n", r);
5625 5832
5626 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 5833 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -5642,11 +5849,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5642 if (buffer) { 5849 if (buffer) {
5643 mutex_lock(&trace_types_lock); 5850 mutex_lock(&trace_types_lock);
5644 if (val) { 5851 if (val) {
5645 ring_buffer_record_on(buffer); 5852 tracer_tracing_on(tr);
5646 if (tr->current_trace->start) 5853 if (tr->current_trace->start)
5647 tr->current_trace->start(tr); 5854 tr->current_trace->start(tr);
5648 } else { 5855 } else {
5649 ring_buffer_record_off(buffer); 5856 tracer_tracing_off(tr);
5650 if (tr->current_trace->stop) 5857 if (tr->current_trace->stop)
5651 tr->current_trace->stop(tr); 5858 tr->current_trace->stop(tr);
5652 } 5859 }
@@ -5659,9 +5866,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
5659} 5866}
5660 5867
5661static const struct file_operations rb_simple_fops = { 5868static const struct file_operations rb_simple_fops = {
5662 .open = tracing_open_generic, 5869 .open = tracing_open_generic_tr,
5663 .read = rb_simple_read, 5870 .read = rb_simple_read,
5664 .write = rb_simple_write, 5871 .write = rb_simple_write,
5872 .release = tracing_release_generic_tr,
5665 .llseek = default_llseek, 5873 .llseek = default_llseek,
5666}; 5874};
5667 5875
@@ -5775,8 +5983,10 @@ static int new_instance_create(const char *name)
5775 goto out_free_tr; 5983 goto out_free_tr;
5776 5984
5777 ret = event_trace_add_tracer(tr->dir, tr); 5985 ret = event_trace_add_tracer(tr->dir, tr);
5778 if (ret) 5986 if (ret) {
5987 debugfs_remove_recursive(tr->dir);
5779 goto out_free_tr; 5988 goto out_free_tr;
5989 }
5780 5990
5781 init_tracer_debugfs(tr, tr->dir); 5991 init_tracer_debugfs(tr, tr->dir);
5782 5992
@@ -5933,7 +6143,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5933 trace_create_file("buffer_total_size_kb", 0444, d_tracer, 6143 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5934 tr, &tracing_total_entries_fops); 6144 tr, &tracing_total_entries_fops);
5935 6145
5936 trace_create_file("free_buffer", 0644, d_tracer, 6146 trace_create_file("free_buffer", 0200, d_tracer,
5937 tr, &tracing_free_buffer_fops); 6147 tr, &tracing_free_buffer_fops);
5938 6148
5939 trace_create_file("trace_marker", 0220, d_tracer, 6149 trace_create_file("trace_marker", 0220, d_tracer,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 20572ed88c5c..e7d643b8a907 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -214,7 +214,6 @@ struct trace_array {
214 struct dentry *event_dir; 214 struct dentry *event_dir;
215 struct list_head systems; 215 struct list_head systems;
216 struct list_head events; 216 struct list_head events;
217 struct task_struct *waiter;
218 int ref; 217 int ref;
219}; 218};
220 219
@@ -224,6 +223,11 @@ enum {
224 223
225extern struct list_head ftrace_trace_arrays; 224extern struct list_head ftrace_trace_arrays;
226 225
226extern struct mutex trace_types_lock;
227
228extern int trace_array_get(struct trace_array *tr);
229extern void trace_array_put(struct trace_array *tr);
230
227/* 231/*
228 * The global tracer (top) should be the first trace array added, 232 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway. 233 * but we check the flag anyway.
@@ -554,11 +558,6 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu);
554 558
555void poll_wait_pipe(struct trace_iterator *iter); 559void poll_wait_pipe(struct trace_iterator *iter);
556 560
557void ftrace(struct trace_array *tr,
558 struct trace_array_cpu *data,
559 unsigned long ip,
560 unsigned long parent_ip,
561 unsigned long flags, int pc);
562void tracing_sched_switch_trace(struct trace_array *tr, 561void tracing_sched_switch_trace(struct trace_array *tr,
563 struct task_struct *prev, 562 struct task_struct *prev,
564 struct task_struct *next, 563 struct task_struct *next,
@@ -680,6 +679,15 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
680 struct trace_array *tr); 679 struct trace_array *tr);
681extern int trace_selftest_startup_branch(struct tracer *trace, 680extern int trace_selftest_startup_branch(struct tracer *trace,
682 struct trace_array *tr); 681 struct trace_array *tr);
682/*
683 * Tracer data references selftest functions that only occur
684 * on boot up. These can be __init functions. Thus, when selftests
685 * are enabled, then the tracers need to reference __init functions.
686 */
687#define __tracer_data __refdata
688#else
689/* Tracers are seldom changed. Optimize when selftests are disabled. */
690#define __tracer_data __read_mostly
683#endif /* CONFIG_FTRACE_STARTUP_TEST */ 691#endif /* CONFIG_FTRACE_STARTUP_TEST */
684 692
685extern void *head_page(struct trace_array_cpu *data); 693extern void *head_page(struct trace_array_cpu *data);
@@ -774,6 +782,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
774extern struct list_head ftrace_pids; 782extern struct list_head ftrace_pids;
775 783
776#ifdef CONFIG_FUNCTION_TRACER 784#ifdef CONFIG_FUNCTION_TRACER
785extern bool ftrace_filter_param __initdata;
777static inline int ftrace_trace_task(struct task_struct *task) 786static inline int ftrace_trace_task(struct task_struct *task)
778{ 787{
779 if (list_empty(&ftrace_pids)) 788 if (list_empty(&ftrace_pids))
@@ -899,12 +908,6 @@ static inline void trace_branch_disable(void)
899/* set ring buffers to default size if not already done so */ 908/* set ring buffers to default size if not already done so */
900int tracing_update_buffers(void); 909int tracing_update_buffers(void);
901 910
902/* trace event type bit fields, not numeric */
903enum {
904 TRACE_EVENT_TYPE_PRINTF = 1,
905 TRACE_EVENT_TYPE_RAW = 2,
906};
907
908struct ftrace_event_field { 911struct ftrace_event_field {
909 struct list_head link; 912 struct list_head link;
910 const char *name; 913 const char *name;
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045faba..80c36bcf66e8 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -236,6 +236,10 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
236 236
237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 237 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
238 238
239 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
240 "perf buffer not large enough"))
241 return NULL;
242
239 pc = preempt_count(); 243 pc = preempt_count();
240 244
241 *rctxp = perf_swevent_get_recursion_context(); 245 *rctxp = perf_swevent_get_recursion_context();
@@ -266,6 +270,10 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
266 struct pt_regs regs; 270 struct pt_regs regs;
267 int rctx; 271 int rctx;
268 272
273 head = this_cpu_ptr(event_function.perf_events);
274 if (hlist_empty(head))
275 return;
276
269#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ 277#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
270 sizeof(u64)) - sizeof(u32)) 278 sizeof(u64)) - sizeof(u32))
271 279
@@ -279,8 +287,6 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
279 287
280 entry->ip = ip; 288 entry->ip = ip;
281 entry->parent_ip = parent_ip; 289 entry->parent_ip = parent_ip;
282
283 head = this_cpu_ptr(event_function.perf_events);
284 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, 290 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
285 1, &regs, head, NULL); 291 1, &regs, head, NULL);
286 292
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 27963e2bf4bf..898f868833f2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -41,6 +41,23 @@ static LIST_HEAD(ftrace_common_fields);
41static struct kmem_cache *field_cachep; 41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep; 42static struct kmem_cache *file_cachep;
43 43
44#define SYSTEM_FL_FREE_NAME (1 << 31)
45
46static inline int system_refcount(struct event_subsystem *system)
47{
48 return system->ref_count & ~SYSTEM_FL_FREE_NAME;
49}
50
51static int system_refcount_inc(struct event_subsystem *system)
52{
53 return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
54}
55
56static int system_refcount_dec(struct event_subsystem *system)
57{
58 return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
59}
60
44/* Double loops, do not use break, only goto's work */ 61/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \ 62#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ 63 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
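
The helpers above fold a flag into the top bit of ref_count: SYSTEM_FL_FREE_NAME marks that system->name was allocated and must be kfree'd, while system_refcount()/_inc()/_dec() mask the bit off so the low 31 bits keep behaving as a plain counter (the __put_system() hunk below tests the bit before freeing the name). A small illustration of the arithmetic, with made-up values:

/* Illustrative only. */
system->ref_count = 2 | SYSTEM_FL_FREE_NAME;	/* two refs, name owned by the subsystem */
system_refcount(system);	/* -> 2: flag masked off                        */
system_refcount_dec(system);	/* -> 1                                         */
system_refcount_dec(system);	/* -> 0, but SYSTEM_FL_FREE_NAME is still set,  */
				/*    so __put_system() knows to kfree the name */
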
@@ -97,7 +114,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
97 114
98 field = kmem_cache_alloc(field_cachep, GFP_TRACE); 115 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
99 if (!field) 116 if (!field)
100 goto err; 117 return -ENOMEM;
101 118
102 field->name = name; 119 field->name = name;
103 field->type = type; 120 field->type = type;
@@ -114,11 +131,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
114 list_add(&field->link, head); 131 list_add(&field->link, head);
115 132
116 return 0; 133 return 0;
117
118err:
119 kmem_cache_free(field_cachep, field);
120
121 return -ENOMEM;
122} 134}
123 135
124int trace_define_field(struct ftrace_event_call *call, const char *type, 136int trace_define_field(struct ftrace_event_call *call, const char *type,
@@ -279,9 +291,11 @@ static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
279 } 291 }
280 call->class->reg(call, TRACE_REG_UNREGISTER, file); 292 call->class->reg(call, TRACE_REG_UNREGISTER, file);
281 } 293 }
282 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ 294 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
283 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 295 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
284 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); 296 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
297 else
298 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
285 break; 299 break;
286 case 1: 300 case 1:
287 /* 301 /*
@@ -349,8 +363,8 @@ static void __put_system(struct event_subsystem *system)
349{ 363{
350 struct event_filter *filter = system->filter; 364 struct event_filter *filter = system->filter;
351 365
352 WARN_ON_ONCE(system->ref_count == 0); 366 WARN_ON_ONCE(system_refcount(system) == 0);
353 if (--system->ref_count) 367 if (system_refcount_dec(system))
354 return; 368 return;
355 369
356 list_del(&system->list); 370 list_del(&system->list);
@@ -359,13 +373,15 @@ static void __put_system(struct event_subsystem *system)
359 kfree(filter->filter_string); 373 kfree(filter->filter_string);
360 kfree(filter); 374 kfree(filter);
361 } 375 }
376 if (system->ref_count & SYSTEM_FL_FREE_NAME)
377 kfree(system->name);
362 kfree(system); 378 kfree(system);
363} 379}
364 380
365static void __get_system(struct event_subsystem *system) 381static void __get_system(struct event_subsystem *system)
366{ 382{
367 WARN_ON_ONCE(system->ref_count == 0); 383 WARN_ON_ONCE(system_refcount(system) == 0);
368 system->ref_count++; 384 system_refcount_inc(system);
369} 385}
370 386
371static void __get_system_dir(struct ftrace_subsystem_dir *dir) 387static void __get_system_dir(struct ftrace_subsystem_dir *dir)
@@ -379,7 +395,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
379{ 395{
380 WARN_ON_ONCE(dir->ref_count == 0); 396 WARN_ON_ONCE(dir->ref_count == 0);
381 /* If the subsystem is about to be freed, the dir must be too */ 397 /* If the subsystem is about to be freed, the dir must be too */
382 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); 398 WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
383 399
384 __put_system(dir->subsystem); 400 __put_system(dir->subsystem);
385 if (!--dir->ref_count) 401 if (!--dir->ref_count)
@@ -394,16 +410,45 @@ static void put_system(struct ftrace_subsystem_dir *dir)
394} 410}
395 411
396/* 412/*
413 * Open and update trace_array ref count.
414 * Must have the current trace_array passed to it.
415 */
416static int tracing_open_generic_file(struct inode *inode, struct file *filp)
417{
418 struct ftrace_event_file *file = inode->i_private;
419 struct trace_array *tr = file->tr;
420 int ret;
421
422 if (trace_array_get(tr) < 0)
423 return -ENODEV;
424
425 ret = tracing_open_generic(inode, filp);
426 if (ret < 0)
427 trace_array_put(tr);
428 return ret;
429}
430
431static int tracing_release_generic_file(struct inode *inode, struct file *filp)
432{
433 struct ftrace_event_file *file = inode->i_private;
434 struct trace_array *tr = file->tr;
435
436 trace_array_put(tr);
437
438 return 0;
439}
440
441/*
397 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 442 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
398 */ 443 */
399static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, 444static int
400 const char *sub, const char *event, int set) 445__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
446 const char *sub, const char *event, int set)
401{ 447{
402 struct ftrace_event_file *file; 448 struct ftrace_event_file *file;
403 struct ftrace_event_call *call; 449 struct ftrace_event_call *call;
404 int ret = -EINVAL; 450 int ret = -EINVAL;
405 451
406 mutex_lock(&event_mutex);
407 list_for_each_entry(file, &tr->events, list) { 452 list_for_each_entry(file, &tr->events, list) {
408 453
409 call = file->event_call; 454 call = file->event_call;
@@ -429,6 +474,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
429 474
430 ret = 0; 475 ret = 0;
431 } 476 }
477
478 return ret;
479}
480
481static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
482 const char *sub, const char *event, int set)
483{
484 int ret;
485
486 mutex_lock(&event_mutex);
487 ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
432 mutex_unlock(&event_mutex); 488 mutex_unlock(&event_mutex);
433 489
434 return ret; 490 return ret;
@@ -624,17 +680,17 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
624 loff_t *ppos) 680 loff_t *ppos)
625{ 681{
626 struct ftrace_event_file *file = filp->private_data; 682 struct ftrace_event_file *file = filp->private_data;
627 char *buf; 683 char buf[4] = "0";
628 684
629 if (file->flags & FTRACE_EVENT_FL_ENABLED) { 685 if (file->flags & FTRACE_EVENT_FL_ENABLED &&
630 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) 686 !(file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
631 buf = "0*\n"; 687 strcpy(buf, "1");
632 else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) 688
633 buf = "1*\n"; 689 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED ||
634 else 690 file->flags & FTRACE_EVENT_FL_SOFT_MODE)
635 buf = "1\n"; 691 strcat(buf, "*");
636 } else 692
637 buf = "0\n"; 693 strcat(buf, "\n");
638 694
639 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); 695 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
640} 696}
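
The rewrite above assembles the status string in a small stack buffer instead of choosing between string literals. The longest output, "1*\n" plus its NUL terminator, is exactly four bytes, which is why char buf[4] is enough:

/* Worked example: the four strings event_enable_read() can emit.
 * A trailing '*' means SOFT_DISABLED or SOFT_MODE is set on the event file. */
"0\n"	"0*\n"	"1\n"	"1*\n"
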
@@ -770,59 +826,33 @@ enum {
770static void *f_next(struct seq_file *m, void *v, loff_t *pos) 826static void *f_next(struct seq_file *m, void *v, loff_t *pos)
771{ 827{
772 struct ftrace_event_call *call = m->private; 828 struct ftrace_event_call *call = m->private;
773 struct ftrace_event_field *field;
774 struct list_head *common_head = &ftrace_common_fields; 829 struct list_head *common_head = &ftrace_common_fields;
775 struct list_head *head = trace_get_fields(call); 830 struct list_head *head = trace_get_fields(call);
831 struct list_head *node = v;
776 832
777 (*pos)++; 833 (*pos)++;
778 834
779 switch ((unsigned long)v) { 835 switch ((unsigned long)v) {
780 case FORMAT_HEADER: 836 case FORMAT_HEADER:
781 if (unlikely(list_empty(common_head))) 837 node = common_head;
782 return NULL; 838 break;
783
784 field = list_entry(common_head->prev,
785 struct ftrace_event_field, link);
786 return field;
787 839
788 case FORMAT_FIELD_SEPERATOR: 840 case FORMAT_FIELD_SEPERATOR:
789 if (unlikely(list_empty(head))) 841 node = head;
790 return NULL; 842 break;
791
792 field = list_entry(head->prev, struct ftrace_event_field, link);
793 return field;
794 843
795 case FORMAT_PRINTFMT: 844 case FORMAT_PRINTFMT:
796 /* all done */ 845 /* all done */
797 return NULL; 846 return NULL;
798 } 847 }
799 848
800 field = v; 849 node = node->prev;
801 if (field->link.prev == common_head) 850 if (node == common_head)
802 return (void *)FORMAT_FIELD_SEPERATOR; 851 return (void *)FORMAT_FIELD_SEPERATOR;
803 else if (field->link.prev == head) 852 else if (node == head)
804 return (void *)FORMAT_PRINTFMT; 853 return (void *)FORMAT_PRINTFMT;
805 854 else
806 field = list_entry(field->link.prev, struct ftrace_event_field, link); 855 return node;
807
808 return field;
809}
810
811static void *f_start(struct seq_file *m, loff_t *pos)
812{
813 loff_t l = 0;
814 void *p;
815
816 /* Start by showing the header */
817 if (!*pos)
818 return (void *)FORMAT_HEADER;
819
820 p = (void *)FORMAT_HEADER;
821 do {
822 p = f_next(m, p, &l);
823 } while (p && l < *pos);
824
825 return p;
826} 856}
827 857
828static int f_show(struct seq_file *m, void *v) 858static int f_show(struct seq_file *m, void *v)
@@ -848,8 +878,7 @@ static int f_show(struct seq_file *m, void *v)
848 return 0; 878 return 0;
849 } 879 }
850 880
851 field = v; 881 field = list_entry(v, struct ftrace_event_field, link);
852
853 /* 882 /*
854 * Smartly shows the array type(except dynamic array). 883 * Smartly shows the array type(except dynamic array).
855 * Normal: 884 * Normal:
@@ -876,6 +905,17 @@ static int f_show(struct seq_file *m, void *v)
876 return 0; 905 return 0;
877} 906}
878 907
908static void *f_start(struct seq_file *m, loff_t *pos)
909{
910 void *p = (void *)FORMAT_HEADER;
911 loff_t l = 0;
912
913 while (l < *pos && p)
914 p = f_next(m, p, &l);
915
916 return p;
917}
918
879static void f_stop(struct seq_file *m, void *p) 919static void f_stop(struct seq_file *m, void *p)
880{ 920{
881} 921}
@@ -907,23 +947,14 @@ static ssize_t
907event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) 947event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
908{ 948{
909 struct ftrace_event_call *call = filp->private_data; 949 struct ftrace_event_call *call = filp->private_data;
910 struct trace_seq *s; 950 char buf[32];
911 int r; 951 int len;
912 952
913 if (*ppos) 953 if (*ppos)
914 return 0; 954 return 0;
915 955
916 s = kmalloc(sizeof(*s), GFP_KERNEL); 956 len = sprintf(buf, "%d\n", call->event.type);
917 if (!s) 957 return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
918 return -ENOMEM;
919
920 trace_seq_init(s);
921 trace_seq_printf(s, "%d\n", call->event.type);
922
923 r = simple_read_from_buffer(ubuf, cnt, ppos,
924 s->buffer, s->len);
925 kfree(s);
926 return r;
927} 958}
928 959
929static ssize_t 960static ssize_t
@@ -992,6 +1023,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
992 int ret; 1023 int ret;
993 1024
994 /* Make sure the system still exists */ 1025 /* Make sure the system still exists */
1026 mutex_lock(&trace_types_lock);
995 mutex_lock(&event_mutex); 1027 mutex_lock(&event_mutex);
996 list_for_each_entry(tr, &ftrace_trace_arrays, list) { 1028 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
997 list_for_each_entry(dir, &tr->systems, list) { 1029 list_for_each_entry(dir, &tr->systems, list) {
@@ -1007,6 +1039,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1007 } 1039 }
1008 exit_loop: 1040 exit_loop:
1009 mutex_unlock(&event_mutex); 1041 mutex_unlock(&event_mutex);
1042 mutex_unlock(&trace_types_lock);
1010 1043
1011 if (!system) 1044 if (!system)
1012 return -ENODEV; 1045 return -ENODEV;
@@ -1014,9 +1047,17 @@ static int subsystem_open(struct inode *inode, struct file *filp)
1014 /* Some versions of gcc think dir can be uninitialized here */ 1047 /* Some versions of gcc think dir can be uninitialized here */
1015 WARN_ON(!dir); 1048 WARN_ON(!dir);
1016 1049
1050 /* Still need to increment the ref count of the system */
1051 if (trace_array_get(tr) < 0) {
1052 put_system(dir);
1053 return -ENODEV;
1054 }
1055
1017 ret = tracing_open_generic(inode, filp); 1056 ret = tracing_open_generic(inode, filp);
1018 if (ret < 0) 1057 if (ret < 0) {
1058 trace_array_put(tr);
1019 put_system(dir); 1059 put_system(dir);
1060 }
1020 1061
1021 return ret; 1062 return ret;
1022} 1063}
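subsystem_open() now pins the trace_array with trace_array_get() before calling tracing_open_generic() and gives the reference back on every failure path; the matching trace_array_put() for the success case is added to subsystem_release() a couple of hunks below. Stripped of the subsystem-specific details, the pattern is a strict get/put balance — a simplified sketch, not the full function:

static int example_open(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;
	int ret;

	if (trace_array_get(tr) < 0)	/* instance may be going away */
		return -ENODEV;

	ret = tracing_open_generic(inode, filp);
	if (ret < 0)
		trace_array_put(tr);	/* failure: drop the reference */

	return ret;			/* success: release() drops it later */
}

static int example_release(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;

	trace_array_put(tr);		/* pairs with the get in open() */
	return 0;
}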
@@ -1027,16 +1068,23 @@ static int system_tr_open(struct inode *inode, struct file *filp)
1027 struct trace_array *tr = inode->i_private; 1068 struct trace_array *tr = inode->i_private;
1028 int ret; 1069 int ret;
1029 1070
1071 if (trace_array_get(tr) < 0)
1072 return -ENODEV;
1073
1030 /* Make a temporary dir that has no system but points to tr */ 1074 /* Make a temporary dir that has no system but points to tr */
1031 dir = kzalloc(sizeof(*dir), GFP_KERNEL); 1075 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1032 if (!dir) 1076 if (!dir) {
1077 trace_array_put(tr);
1033 return -ENOMEM; 1078 return -ENOMEM;
1079 }
1034 1080
1035 dir->tr = tr; 1081 dir->tr = tr;
1036 1082
1037 ret = tracing_open_generic(inode, filp); 1083 ret = tracing_open_generic(inode, filp);
1038 if (ret < 0) 1084 if (ret < 0) {
1085 trace_array_put(tr);
1039 kfree(dir); 1086 kfree(dir);
1087 }
1040 1088
1041 filp->private_data = dir; 1089 filp->private_data = dir;
1042 1090
@@ -1047,6 +1095,8 @@ static int subsystem_release(struct inode *inode, struct file *file)
1047{ 1095{
1048 struct ftrace_subsystem_dir *dir = file->private_data; 1096 struct ftrace_subsystem_dir *dir = file->private_data;
1049 1097
1098 trace_array_put(dir->tr);
1099
1050 /* 1100 /*
1051 * If dir->subsystem is NULL, then this is a temporary 1101 * If dir->subsystem is NULL, then this is a temporary
1052 * descriptor that was made for a trace_array to enable 1102 * descriptor that was made for a trace_array to enable
@@ -1143,6 +1193,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
1143 1193
1144static int ftrace_event_avail_open(struct inode *inode, struct file *file); 1194static int ftrace_event_avail_open(struct inode *inode, struct file *file);
1145static int ftrace_event_set_open(struct inode *inode, struct file *file); 1195static int ftrace_event_set_open(struct inode *inode, struct file *file);
1196static int ftrace_event_release(struct inode *inode, struct file *file);
1146 1197
1147static const struct seq_operations show_event_seq_ops = { 1198static const struct seq_operations show_event_seq_ops = {
1148 .start = t_start, 1199 .start = t_start,
@@ -1170,13 +1221,14 @@ static const struct file_operations ftrace_set_event_fops = {
1170 .read = seq_read, 1221 .read = seq_read,
1171 .write = ftrace_event_write, 1222 .write = ftrace_event_write,
1172 .llseek = seq_lseek, 1223 .llseek = seq_lseek,
1173 .release = seq_release, 1224 .release = ftrace_event_release,
1174}; 1225};
1175 1226
1176static const struct file_operations ftrace_enable_fops = { 1227static const struct file_operations ftrace_enable_fops = {
1177 .open = tracing_open_generic, 1228 .open = tracing_open_generic_file,
1178 .read = event_enable_read, 1229 .read = event_enable_read,
1179 .write = event_enable_write, 1230 .write = event_enable_write,
1231 .release = tracing_release_generic_file,
1180 .llseek = default_llseek, 1232 .llseek = default_llseek,
1181}; 1233};
1182 1234
@@ -1247,6 +1299,15 @@ ftrace_event_open(struct inode *inode, struct file *file,
1247 return ret; 1299 return ret;
1248} 1300}
1249 1301
1302static int ftrace_event_release(struct inode *inode, struct file *file)
1303{
1304 struct trace_array *tr = inode->i_private;
1305
1306 trace_array_put(tr);
1307
1308 return seq_release(inode, file);
1309}
1310
1250static int 1311static int
1251ftrace_event_avail_open(struct inode *inode, struct file *file) 1312ftrace_event_avail_open(struct inode *inode, struct file *file)
1252{ 1313{
@@ -1260,12 +1321,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
1260{ 1321{
1261 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1322 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1262 struct trace_array *tr = inode->i_private; 1323 struct trace_array *tr = inode->i_private;
1324 int ret;
1325
1326 if (trace_array_get(tr) < 0)
1327 return -ENODEV;
1263 1328
1264 if ((file->f_mode & FMODE_WRITE) && 1329 if ((file->f_mode & FMODE_WRITE) &&
1265 (file->f_flags & O_TRUNC)) 1330 (file->f_flags & O_TRUNC))
1266 ftrace_clear_events(tr); 1331 ftrace_clear_events(tr);
1267 1332
1268 return ftrace_event_open(inode, file, seq_ops); 1333 ret = ftrace_event_open(inode, file, seq_ops);
1334 if (ret < 0)
1335 trace_array_put(tr);
1336 return ret;
1269} 1337}
1270 1338
1271static struct event_subsystem * 1339static struct event_subsystem *
@@ -1279,7 +1347,15 @@ create_new_subsystem(const char *name)
1279 return NULL; 1347 return NULL;
1280 1348
1281 system->ref_count = 1; 1349 system->ref_count = 1;
1282 system->name = name; 1350
1351 /* Only allocate if dynamic (kprobes and modules) */
1352 if (!core_kernel_data((unsigned long)name)) {
1353 system->ref_count |= SYSTEM_FL_FREE_NAME;
1354 system->name = kstrdup(name, GFP_KERNEL);
1355 if (!system->name)
1356 goto out_free;
1357 } else
1358 system->name = name;
1283 1359
1284 system->filter = NULL; 1360 system->filter = NULL;
1285 1361
@@ -1292,6 +1368,8 @@ create_new_subsystem(const char *name)
1292 return system; 1368 return system;
1293 1369
1294 out_free: 1370 out_free:
1371 if (system->ref_count & SYSTEM_FL_FREE_NAME)
1372 kfree(system->name);
1295 kfree(system); 1373 kfree(system);
1296 return NULL; 1374 return NULL;
1297} 1375}
@@ -1591,6 +1669,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call,
1591int trace_add_event_call(struct ftrace_event_call *call) 1669int trace_add_event_call(struct ftrace_event_call *call)
1592{ 1670{
1593 int ret; 1671 int ret;
1672 mutex_lock(&trace_types_lock);
1594 mutex_lock(&event_mutex); 1673 mutex_lock(&event_mutex);
1595 1674
1596 ret = __register_event(call, NULL); 1675 ret = __register_event(call, NULL);
@@ -1598,11 +1677,13 @@ int trace_add_event_call(struct ftrace_event_call *call)
1598 __add_event_to_tracers(call, NULL); 1677 __add_event_to_tracers(call, NULL);
1599 1678
1600 mutex_unlock(&event_mutex); 1679 mutex_unlock(&event_mutex);
1680 mutex_unlock(&trace_types_lock);
1601 return ret; 1681 return ret;
1602} 1682}
1603 1683
1604/* 1684/*
1605 * Must be called under locking both of event_mutex and trace_event_sem. 1685 * Must be called under locking of trace_types_lock, event_mutex and
1686 * trace_event_sem.
1606 */ 1687 */
1607static void __trace_remove_event_call(struct ftrace_event_call *call) 1688static void __trace_remove_event_call(struct ftrace_event_call *call)
1608{ 1689{
@@ -1614,11 +1695,13 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
1614/* Remove an event_call */ 1695/* Remove an event_call */
1615void trace_remove_event_call(struct ftrace_event_call *call) 1696void trace_remove_event_call(struct ftrace_event_call *call)
1616{ 1697{
1698 mutex_lock(&trace_types_lock);
1617 mutex_lock(&event_mutex); 1699 mutex_lock(&event_mutex);
1618 down_write(&trace_event_sem); 1700 down_write(&trace_event_sem);
1619 __trace_remove_event_call(call); 1701 __trace_remove_event_call(call);
1620 up_write(&trace_event_sem); 1702 up_write(&trace_event_sem);
1621 mutex_unlock(&event_mutex); 1703 mutex_unlock(&event_mutex);
1704 mutex_unlock(&trace_types_lock);
1622} 1705}
1623 1706
1624#define for_each_event(event, start, end) \ 1707#define for_each_event(event, start, end) \
@@ -1762,6 +1845,7 @@ static int trace_module_notify(struct notifier_block *self,
1762{ 1845{
1763 struct module *mod = data; 1846 struct module *mod = data;
1764 1847
1848 mutex_lock(&trace_types_lock);
1765 mutex_lock(&event_mutex); 1849 mutex_lock(&event_mutex);
1766 switch (val) { 1850 switch (val) {
1767 case MODULE_STATE_COMING: 1851 case MODULE_STATE_COMING:
@@ -1772,6 +1856,7 @@ static int trace_module_notify(struct notifier_block *self,
1772 break; 1856 break;
1773 } 1857 }
1774 mutex_unlock(&event_mutex); 1858 mutex_unlock(&event_mutex);
1859 mutex_unlock(&trace_types_lock);
1775 1860
1776 return 0; 1861 return 0;
1777} 1862}
@@ -2011,10 +2096,7 @@ event_enable_func(struct ftrace_hash *hash,
2011 int ret; 2096 int ret;
2012 2097
2013 /* hash funcs only work with set_ftrace_filter */ 2098 /* hash funcs only work with set_ftrace_filter */
2014 if (!enabled) 2099 if (!enabled || !param)
2015 return -EINVAL;
2016
2017 if (!param)
2018 return -EINVAL; 2100 return -EINVAL;
2019 2101
2020 system = strsep(&param, ":"); 2102 system = strsep(&param, ":");
@@ -2329,11 +2411,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2329 2411
2330int event_trace_del_tracer(struct trace_array *tr) 2412int event_trace_del_tracer(struct trace_array *tr)
2331{ 2413{
2332 /* Disable any running events */
2333 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2334
2335 mutex_lock(&event_mutex); 2414 mutex_lock(&event_mutex);
2336 2415
2416 /* Disable any running events */
2417 __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
2418
2337 down_write(&trace_event_sem); 2419 down_write(&trace_event_sem);
2338 __trace_remove_event_dirs(tr); 2420 __trace_remove_event_dirs(tr);
2339 debugfs_remove_recursive(tr->event_dir); 2421 debugfs_remove_recursive(tr->event_dir);
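Several hunks in this file wrap existing event_mutex sections with trace_types_lock. The ordering is the same everywhere — trace_types_lock outside, event_mutex inside — so instance creation/removal and event registration always nest the two locks identically and cannot deadlock against each other. Schematically (critical section elided):

mutex_lock(&trace_types_lock);		/* always the outer lock */
mutex_lock(&event_mutex);		/* always the inner lock */

/* ... walk ftrace_trace_arrays, add/remove event calls ... */

mutex_unlock(&event_mutex);
mutex_unlock(&trace_types_lock);	/* release in reverse order */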
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e1b653f7e1ca..0c7b75a8acc8 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -44,6 +44,7 @@ enum filter_op_ids
44 OP_LE, 44 OP_LE,
45 OP_GT, 45 OP_GT,
46 OP_GE, 46 OP_GE,
47 OP_BAND,
47 OP_NONE, 48 OP_NONE,
48 OP_OPEN_PAREN, 49 OP_OPEN_PAREN,
49}; 50};
@@ -54,6 +55,7 @@ struct filter_op {
54 int precedence; 55 int precedence;
55}; 56};
56 57
58/* Order must be the same as enum filter_op_ids above */
57static struct filter_op filter_ops[] = { 59static struct filter_op filter_ops[] = {
58 { OP_OR, "||", 1 }, 60 { OP_OR, "||", 1 },
59 { OP_AND, "&&", 2 }, 61 { OP_AND, "&&", 2 },
@@ -64,6 +66,7 @@ static struct filter_op filter_ops[] = {
64 { OP_LE, "<=", 5 }, 66 { OP_LE, "<=", 5 },
65 { OP_GT, ">", 5 }, 67 { OP_GT, ">", 5 },
66 { OP_GE, ">=", 5 }, 68 { OP_GE, ">=", 5 },
69 { OP_BAND, "&", 6 },
67 { OP_NONE, "OP_NONE", 0 }, 70 { OP_NONE, "OP_NONE", 0 },
68 { OP_OPEN_PAREN, "(", 0 }, 71 { OP_OPEN_PAREN, "(", 0 },
69}; 72};
@@ -156,6 +159,9 @@ static int filter_pred_##type(struct filter_pred *pred, void *event) \
156 case OP_GE: \ 159 case OP_GE: \
157 match = (*addr >= val); \ 160 match = (*addr >= val); \
158 break; \ 161 break; \
162 case OP_BAND: \
163 match = (*addr & val); \
164 break; \
159 default: \ 165 default: \
160 break; \ 166 break; \
161 } \ 167 } \
@@ -640,7 +646,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
640 if (filter && filter->filter_string) 646 if (filter && filter->filter_string)
641 trace_seq_printf(s, "%s\n", filter->filter_string); 647 trace_seq_printf(s, "%s\n", filter->filter_string);
642 else 648 else
643 trace_seq_printf(s, "none\n"); 649 trace_seq_puts(s, "none\n");
644 mutex_unlock(&event_mutex); 650 mutex_unlock(&event_mutex);
645} 651}
646 652
@@ -654,7 +660,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
654 if (filter && filter->filter_string) 660 if (filter && filter->filter_string)
655 trace_seq_printf(s, "%s\n", filter->filter_string); 661 trace_seq_printf(s, "%s\n", filter->filter_string);
656 else 662 else
657 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); 663 trace_seq_puts(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
658 mutex_unlock(&event_mutex); 664 mutex_unlock(&event_mutex);
659} 665}
660 666
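The new OP_BAND entry lets event filters test individual bits: the parser stores the numeric operand as val, and the generated predicate treats any shared bit as a match, as in the new case above. Reduced to its core (the field and mask values are illustrative):

/* For a filter such as "common_flags & 1", val holds the mask 1. */
static int example_pred_band(unsigned long long field_value,
			     unsigned long long val)
{
	return (field_value & val) != 0;	/* non-zero AND => match */
}

So writing something like 'common_flags & 1' into an event's filter file should now be accepted, where previously only the comparison and equality operators were.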
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c4d6d7191988..38fe1483c508 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -199,7 +199,7 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
199 return 0; 199 return 0;
200} 200}
201 201
202static struct tracer function_trace __read_mostly = 202static struct tracer function_trace __tracer_data =
203{ 203{
204 .name = "function", 204 .name = "function",
205 .init = function_trace_init, 205 .init = function_trace_init,
@@ -290,6 +290,21 @@ ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
290 trace_dump_stack(STACK_SKIP); 290 trace_dump_stack(STACK_SKIP);
291} 291}
292 292
293static void
294ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data)
295{
296 if (update_count(data))
297 ftrace_dump(DUMP_ALL);
298}
299
300/* Only dump the current CPU buffer. */
301static void
302ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data)
303{
304 if (update_count(data))
305 ftrace_dump(DUMP_ORIG);
306}
307
293static int 308static int
294ftrace_probe_print(const char *name, struct seq_file *m, 309ftrace_probe_print(const char *name, struct seq_file *m,
295 unsigned long ip, void *data) 310 unsigned long ip, void *data)
@@ -327,6 +342,20 @@ ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
327 return ftrace_probe_print("stacktrace", m, ip, data); 342 return ftrace_probe_print("stacktrace", m, ip, data);
328} 343}
329 344
345static int
346ftrace_dump_print(struct seq_file *m, unsigned long ip,
347 struct ftrace_probe_ops *ops, void *data)
348{
349 return ftrace_probe_print("dump", m, ip, data);
350}
351
352static int
353ftrace_cpudump_print(struct seq_file *m, unsigned long ip,
354 struct ftrace_probe_ops *ops, void *data)
355{
356 return ftrace_probe_print("cpudump", m, ip, data);
357}
358
330static struct ftrace_probe_ops traceon_count_probe_ops = { 359static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count, 360 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print, 361 .print = ftrace_traceon_print,
@@ -342,6 +371,16 @@ static struct ftrace_probe_ops stacktrace_count_probe_ops = {
342 .print = ftrace_stacktrace_print, 371 .print = ftrace_stacktrace_print,
343}; 372};
344 373
374static struct ftrace_probe_ops dump_probe_ops = {
375 .func = ftrace_dump_probe,
376 .print = ftrace_dump_print,
377};
378
379static struct ftrace_probe_ops cpudump_probe_ops = {
380 .func = ftrace_cpudump_probe,
381 .print = ftrace_cpudump_print,
382};
383
345static struct ftrace_probe_ops traceon_probe_ops = { 384static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon, 385 .func = ftrace_traceon,
347 .print = ftrace_traceon_print, 386 .print = ftrace_traceon_print,
@@ -425,6 +464,32 @@ ftrace_stacktrace_callback(struct ftrace_hash *hash,
425 param, enable); 464 param, enable);
426} 465}
427 466
467static int
468ftrace_dump_callback(struct ftrace_hash *hash,
469 char *glob, char *cmd, char *param, int enable)
470{
471 struct ftrace_probe_ops *ops;
472
473 ops = &dump_probe_ops;
474
475 /* Only dump once. */
476 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
477 "1", enable);
478}
479
480static int
481ftrace_cpudump_callback(struct ftrace_hash *hash,
482 char *glob, char *cmd, char *param, int enable)
483{
484 struct ftrace_probe_ops *ops;
485
486 ops = &cpudump_probe_ops;
487
488 /* Only dump once. */
489 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
490 "1", enable);
491}
492
428static struct ftrace_func_command ftrace_traceon_cmd = { 493static struct ftrace_func_command ftrace_traceon_cmd = {
429 .name = "traceon", 494 .name = "traceon",
430 .func = ftrace_trace_onoff_callback, 495 .func = ftrace_trace_onoff_callback,
@@ -440,6 +505,16 @@ static struct ftrace_func_command ftrace_stacktrace_cmd = {
440 .func = ftrace_stacktrace_callback, 505 .func = ftrace_stacktrace_callback,
441}; 506};
442 507
508static struct ftrace_func_command ftrace_dump_cmd = {
509 .name = "dump",
510 .func = ftrace_dump_callback,
511};
512
513static struct ftrace_func_command ftrace_cpudump_cmd = {
514 .name = "cpudump",
515 .func = ftrace_cpudump_callback,
516};
517
443static int __init init_func_cmd_traceon(void) 518static int __init init_func_cmd_traceon(void)
444{ 519{
445 int ret; 520 int ret;
@@ -450,13 +525,31 @@ static int __init init_func_cmd_traceon(void)
450 525
451 ret = register_ftrace_command(&ftrace_traceon_cmd); 526 ret = register_ftrace_command(&ftrace_traceon_cmd);
452 if (ret) 527 if (ret)
453 unregister_ftrace_command(&ftrace_traceoff_cmd); 528 goto out_free_traceoff;
454 529
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd); 530 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) { 531 if (ret)
457 unregister_ftrace_command(&ftrace_traceoff_cmd); 532 goto out_free_traceon;
458 unregister_ftrace_command(&ftrace_traceon_cmd); 533
459 } 534 ret = register_ftrace_command(&ftrace_dump_cmd);
535 if (ret)
536 goto out_free_stacktrace;
537
538 ret = register_ftrace_command(&ftrace_cpudump_cmd);
539 if (ret)
540 goto out_free_dump;
541
542 return 0;
543
544 out_free_dump:
545 unregister_ftrace_command(&ftrace_dump_cmd);
546 out_free_stacktrace:
547 unregister_ftrace_command(&ftrace_stacktrace_cmd);
548 out_free_traceon:
549 unregister_ftrace_command(&ftrace_traceon_cmd);
550 out_free_traceoff:
551 unregister_ftrace_command(&ftrace_traceoff_cmd);
552
460 return ret; 553 return ret;
461} 554}
462#else 555#else
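init_func_cmd_traceon() above switches to a goto-based unwind: each register_ftrace_command() failure jumps to a label that unregisters everything registered so far, in reverse order, so adding the new "dump" and "cpudump" commands only costs one extra label instead of another copy of the cleanup calls. The generic shape, with hypothetical register_a/b/c helpers standing in for the real commands:

static int __init example_init(void)
{
	int ret;

	ret = register_a();
	if (ret)
		return ret;		/* nothing to undo yet */

	ret = register_b();
	if (ret)
		goto out_unreg_a;

	ret = register_c();
	if (ret)
		goto out_unreg_b;

	return 0;

 out_unreg_b:
	unregister_b();
 out_unreg_a:
	unregister_a();			/* undo in reverse registration order */
	return ret;
}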
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8388bc99f2ee..b5c09242683d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -446,7 +446,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
446 446
447 /* First spaces to align center */ 447 /* First spaces to align center */
448 for (i = 0; i < spaces / 2; i++) { 448 for (i = 0; i < spaces / 2; i++) {
449 ret = trace_seq_printf(s, " "); 449 ret = trace_seq_putc(s, ' ');
450 if (!ret) 450 if (!ret)
451 return TRACE_TYPE_PARTIAL_LINE; 451 return TRACE_TYPE_PARTIAL_LINE;
452 } 452 }
@@ -457,7 +457,7 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
457 457
458 /* Last spaces to align center */ 458 /* Last spaces to align center */
459 for (i = 0; i < spaces - (spaces / 2); i++) { 459 for (i = 0; i < spaces - (spaces / 2); i++) {
460 ret = trace_seq_printf(s, " "); 460 ret = trace_seq_putc(s, ' ');
461 if (!ret) 461 if (!ret)
462 return TRACE_TYPE_PARTIAL_LINE; 462 return TRACE_TYPE_PARTIAL_LINE;
463 } 463 }
@@ -503,7 +503,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
503 ------------------------------------------ 503 ------------------------------------------
504 504
505 */ 505 */
506 ret = trace_seq_printf(s, 506 ret = trace_seq_puts(s,
507 " ------------------------------------------\n"); 507 " ------------------------------------------\n");
508 if (!ret) 508 if (!ret)
509 return TRACE_TYPE_PARTIAL_LINE; 509 return TRACE_TYPE_PARTIAL_LINE;
@@ -516,7 +516,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
516 if (ret == TRACE_TYPE_PARTIAL_LINE) 516 if (ret == TRACE_TYPE_PARTIAL_LINE)
517 return TRACE_TYPE_PARTIAL_LINE; 517 return TRACE_TYPE_PARTIAL_LINE;
518 518
519 ret = trace_seq_printf(s, " => "); 519 ret = trace_seq_puts(s, " => ");
520 if (!ret) 520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE; 521 return TRACE_TYPE_PARTIAL_LINE;
522 522
@@ -524,7 +524,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
524 if (ret == TRACE_TYPE_PARTIAL_LINE) 524 if (ret == TRACE_TYPE_PARTIAL_LINE)
525 return TRACE_TYPE_PARTIAL_LINE; 525 return TRACE_TYPE_PARTIAL_LINE;
526 526
527 ret = trace_seq_printf(s, 527 ret = trace_seq_puts(s,
528 "\n ------------------------------------------\n\n"); 528 "\n ------------------------------------------\n\n");
529 if (!ret) 529 if (!ret)
530 return TRACE_TYPE_PARTIAL_LINE; 530 return TRACE_TYPE_PARTIAL_LINE;
@@ -645,7 +645,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
645 ret = print_graph_proc(s, pid); 645 ret = print_graph_proc(s, pid);
646 if (ret == TRACE_TYPE_PARTIAL_LINE) 646 if (ret == TRACE_TYPE_PARTIAL_LINE)
647 return TRACE_TYPE_PARTIAL_LINE; 647 return TRACE_TYPE_PARTIAL_LINE;
648 ret = trace_seq_printf(s, " | "); 648 ret = trace_seq_puts(s, " | ");
649 if (!ret) 649 if (!ret)
650 return TRACE_TYPE_PARTIAL_LINE; 650 return TRACE_TYPE_PARTIAL_LINE;
651 } 651 }
@@ -657,9 +657,9 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
657 return ret; 657 return ret;
658 658
659 if (type == TRACE_GRAPH_ENT) 659 if (type == TRACE_GRAPH_ENT)
660 ret = trace_seq_printf(s, "==========>"); 660 ret = trace_seq_puts(s, "==========>");
661 else 661 else
662 ret = trace_seq_printf(s, "<=========="); 662 ret = trace_seq_puts(s, "<==========");
663 663
664 if (!ret) 664 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 665 return TRACE_TYPE_PARTIAL_LINE;
@@ -668,7 +668,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
668 if (ret != TRACE_TYPE_HANDLED) 668 if (ret != TRACE_TYPE_HANDLED)
669 return ret; 669 return ret;
670 670
671 ret = trace_seq_printf(s, "\n"); 671 ret = trace_seq_putc(s, '\n');
672 672
673 if (!ret) 673 if (!ret)
674 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
@@ -705,13 +705,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
705 len += strlen(nsecs_str); 705 len += strlen(nsecs_str);
706 } 706 }
707 707
708 ret = trace_seq_printf(s, " us "); 708 ret = trace_seq_puts(s, " us ");
709 if (!ret) 709 if (!ret)
710 return TRACE_TYPE_PARTIAL_LINE; 710 return TRACE_TYPE_PARTIAL_LINE;
711 711
712 /* Print remaining spaces to fit the row's width */ 712 /* Print remaining spaces to fit the row's width */
713 for (i = len; i < 7; i++) { 713 for (i = len; i < 7; i++) {
714 ret = trace_seq_printf(s, " "); 714 ret = trace_seq_putc(s, ' ');
715 if (!ret) 715 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 716 return TRACE_TYPE_PARTIAL_LINE;
717 } 717 }
@@ -731,13 +731,13 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
 731 /* No real data, just filling the column with spaces */ 731 /* No real data, just filling the column with spaces */
732 switch (duration) { 732 switch (duration) {
733 case DURATION_FILL_FULL: 733 case DURATION_FILL_FULL:
734 ret = trace_seq_printf(s, " | "); 734 ret = trace_seq_puts(s, " | ");
735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 735 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
736 case DURATION_FILL_START: 736 case DURATION_FILL_START:
737 ret = trace_seq_printf(s, " "); 737 ret = trace_seq_puts(s, " ");
738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 738 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
739 case DURATION_FILL_END: 739 case DURATION_FILL_END:
740 ret = trace_seq_printf(s, " |"); 740 ret = trace_seq_puts(s, " |");
741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 741 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
742 } 742 }
743 743
@@ -745,10 +745,10 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { 745 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
746 /* Duration exceeded 100 msecs */ 746 /* Duration exceeded 100 msecs */
747 if (duration > 100000ULL) 747 if (duration > 100000ULL)
748 ret = trace_seq_printf(s, "! "); 748 ret = trace_seq_puts(s, "! ");
749 /* Duration exceeded 10 msecs */ 749 /* Duration exceeded 10 msecs */
750 else if (duration > 10000ULL) 750 else if (duration > 10000ULL)
751 ret = trace_seq_printf(s, "+ "); 751 ret = trace_seq_puts(s, "+ ");
752 } 752 }
753 753
754 /* 754 /*
@@ -757,7 +757,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
757 * to fill out the space. 757 * to fill out the space.
758 */ 758 */
759 if (ret == -1) 759 if (ret == -1)
760 ret = trace_seq_printf(s, " "); 760 ret = trace_seq_puts(s, " ");
761 761
 762 /* Catching here any failure happened above */ 762 /* Catching here any failure happened above */
763 if (!ret) 763 if (!ret)
@@ -767,7 +767,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s,
767 if (ret != TRACE_TYPE_HANDLED) 767 if (ret != TRACE_TYPE_HANDLED)
768 return ret; 768 return ret;
769 769
770 ret = trace_seq_printf(s, "| "); 770 ret = trace_seq_puts(s, "| ");
771 if (!ret) 771 if (!ret)
772 return TRACE_TYPE_PARTIAL_LINE; 772 return TRACE_TYPE_PARTIAL_LINE;
773 773
@@ -817,7 +817,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
817 817
818 /* Function */ 818 /* Function */
819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 819 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
820 ret = trace_seq_printf(s, " "); 820 ret = trace_seq_putc(s, ' ');
821 if (!ret) 821 if (!ret)
822 return TRACE_TYPE_PARTIAL_LINE; 822 return TRACE_TYPE_PARTIAL_LINE;
823 } 823 }
@@ -858,7 +858,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
858 858
859 /* Function */ 859 /* Function */
860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 860 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
861 ret = trace_seq_printf(s, " "); 861 ret = trace_seq_putc(s, ' ');
862 if (!ret) 862 if (!ret)
863 return TRACE_TYPE_PARTIAL_LINE; 863 return TRACE_TYPE_PARTIAL_LINE;
864 } 864 }
@@ -917,7 +917,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
917 if (ret == TRACE_TYPE_PARTIAL_LINE) 917 if (ret == TRACE_TYPE_PARTIAL_LINE)
918 return TRACE_TYPE_PARTIAL_LINE; 918 return TRACE_TYPE_PARTIAL_LINE;
919 919
920 ret = trace_seq_printf(s, " | "); 920 ret = trace_seq_puts(s, " | ");
921 if (!ret) 921 if (!ret)
922 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
923 } 923 }
@@ -1117,7 +1117,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1117 1117
1118 /* Closing brace */ 1118 /* Closing brace */
1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 1119 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
1120 ret = trace_seq_printf(s, " "); 1120 ret = trace_seq_putc(s, ' ');
1121 if (!ret) 1121 if (!ret)
1122 return TRACE_TYPE_PARTIAL_LINE; 1122 return TRACE_TYPE_PARTIAL_LINE;
1123 } 1123 }
@@ -1129,7 +1129,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
1129 * belongs to, write out the function name. 1129 * belongs to, write out the function name.
1130 */ 1130 */
1131 if (func_match) { 1131 if (func_match) {
1132 ret = trace_seq_printf(s, "}\n"); 1132 ret = trace_seq_puts(s, "}\n");
1133 if (!ret) 1133 if (!ret)
1134 return TRACE_TYPE_PARTIAL_LINE; 1134 return TRACE_TYPE_PARTIAL_LINE;
1135 } else { 1135 } else {
@@ -1179,13 +1179,13 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1179 /* Indentation */ 1179 /* Indentation */
1180 if (depth > 0) 1180 if (depth > 0)
1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { 1181 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
1182 ret = trace_seq_printf(s, " "); 1182 ret = trace_seq_putc(s, ' ');
1183 if (!ret) 1183 if (!ret)
1184 return TRACE_TYPE_PARTIAL_LINE; 1184 return TRACE_TYPE_PARTIAL_LINE;
1185 } 1185 }
1186 1186
1187 /* The comment */ 1187 /* The comment */
1188 ret = trace_seq_printf(s, "/* "); 1188 ret = trace_seq_puts(s, "/* ");
1189 if (!ret) 1189 if (!ret)
1190 return TRACE_TYPE_PARTIAL_LINE; 1190 return TRACE_TYPE_PARTIAL_LINE;
1191 1191
@@ -1216,7 +1216,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1216 s->len--; 1216 s->len--;
1217 } 1217 }
1218 1218
1219 ret = trace_seq_printf(s, " */\n"); 1219 ret = trace_seq_puts(s, " */\n");
1220 if (!ret) 1220 if (!ret)
1221 return TRACE_TYPE_PARTIAL_LINE; 1221 return TRACE_TYPE_PARTIAL_LINE;
1222 1222
@@ -1448,7 +1448,7 @@ static struct trace_event graph_trace_ret_event = {
1448 .funcs = &graph_functions 1448 .funcs = &graph_functions
1449}; 1449};
1450 1450
1451static struct tracer graph_trace __read_mostly = { 1451static struct tracer graph_trace __tracer_data = {
1452 .name = "function_graph", 1452 .name = "function_graph",
1453 .open = graph_trace_open, 1453 .open = graph_trace_open,
1454 .pipe_open = graph_trace_open, 1454 .pipe_open = graph_trace_open,
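The bulk of this file's diff is a mechanical substitution: trace_seq_printf() with a constant format string becomes trace_seq_puts(), and a single-character format becomes trace_seq_putc(), avoiding an unnecessary vsnprintf() pass on hot output paths. In short (fragment; s is a struct trace_seq *, delta is illustrative):

trace_seq_printf(s, "%llu us\n", delta);	/* real formatting: keep printf */
trace_seq_puts(s, "==========>");		/* fixed string: puts is cheaper */
trace_seq_putc(s, '\n');			/* single character: putc */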
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b19d065a28cb..2aefbee93a6d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
373 struct trace_array_cpu *data; 373 struct trace_array_cpu *data;
374 unsigned long flags; 374 unsigned long flags;
375 375
376 if (likely(!tracer_enabled)) 376 if (!tracer_enabled || !tracing_is_enabled())
377 return; 377 return;
378 378
379 cpu = raw_smp_processor_id(); 379 cpu = raw_smp_processor_id();
@@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
416 else 416 else
417 return; 417 return;
418 418
419 if (!tracer_enabled) 419 if (!tracer_enabled || !tracing_is_enabled())
420 return; 420 return;
421 421
422 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9f46e98ba8f2..3811487e7a7a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,12 +35,17 @@ struct trace_probe {
35 const char *symbol; /* symbol name */ 35 const char *symbol; /* symbol name */
36 struct ftrace_event_class class; 36 struct ftrace_event_class class;
37 struct ftrace_event_call call; 37 struct ftrace_event_call call;
38 struct ftrace_event_file * __rcu *files; 38 struct list_head files;
39 ssize_t size; /* trace entry size */ 39 ssize_t size; /* trace entry size */
40 unsigned int nr_args; 40 unsigned int nr_args;
41 struct probe_arg args[]; 41 struct probe_arg args[];
42}; 42};
43 43
44struct event_file_link {
45 struct ftrace_event_file *file;
46 struct list_head list;
47};
48
44#define SIZEOF_TRACE_PROBE(n) \ 49#define SIZEOF_TRACE_PROBE(n) \
45 (offsetof(struct trace_probe, args) + \ 50 (offsetof(struct trace_probe, args) + \
46 (sizeof(struct probe_arg) * (n))) 51 (sizeof(struct probe_arg) * (n)))
@@ -150,6 +155,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
150 goto error; 155 goto error;
151 156
152 INIT_LIST_HEAD(&tp->list); 157 INIT_LIST_HEAD(&tp->list);
158 INIT_LIST_HEAD(&tp->files);
153 return tp; 159 return tp;
154error: 160error:
155 kfree(tp->call.name); 161 kfree(tp->call.name);
@@ -183,25 +189,6 @@ static struct trace_probe *find_trace_probe(const char *event,
183 return NULL; 189 return NULL;
184} 190}
185 191
186static int trace_probe_nr_files(struct trace_probe *tp)
187{
188 struct ftrace_event_file **file;
189 int ret = 0;
190
191 /*
192 * Since all tp->files updater is protected by probe_enable_lock,
193 * we don't need to lock an rcu_read_lock.
194 */
195 file = rcu_dereference_raw(tp->files);
196 if (file)
197 while (*(file++))
198 ret++;
199
200 return ret;
201}
202
203static DEFINE_MUTEX(probe_enable_lock);
204
205/* 192/*
206 * Enable trace_probe 193 * Enable trace_probe
207 * if the file is NULL, enable "perf" handler, or enable "trace" handler. 194 * if the file is NULL, enable "perf" handler, or enable "trace" handler.
@@ -211,67 +198,42 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
211{ 198{
212 int ret = 0; 199 int ret = 0;
213 200
214 mutex_lock(&probe_enable_lock);
215
216 if (file) { 201 if (file) {
217 struct ftrace_event_file **new, **old; 202 struct event_file_link *link;
218 int n = trace_probe_nr_files(tp); 203
219 204 link = kmalloc(sizeof(*link), GFP_KERNEL);
220 old = rcu_dereference_raw(tp->files); 205 if (!link) {
221 /* 1 is for new one and 1 is for stopper */
222 new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
223 GFP_KERNEL);
224 if (!new) {
225 ret = -ENOMEM; 206 ret = -ENOMEM;
226 goto out_unlock; 207 goto out;
227 } 208 }
228 memcpy(new, old, n * sizeof(struct ftrace_event_file *));
229 new[n] = file;
230 /* The last one keeps a NULL */
231 209
232 rcu_assign_pointer(tp->files, new); 210 link->file = file;
233 tp->flags |= TP_FLAG_TRACE; 211 list_add_tail_rcu(&link->list, &tp->files);
234 212
235 if (old) { 213 tp->flags |= TP_FLAG_TRACE;
236 /* Make sure the probe is done with old files */
237 synchronize_sched();
238 kfree(old);
239 }
240 } else 214 } else
241 tp->flags |= TP_FLAG_PROFILE; 215 tp->flags |= TP_FLAG_PROFILE;
242 216
243 if (trace_probe_is_enabled(tp) && trace_probe_is_registered(tp) && 217 if (trace_probe_is_registered(tp) && !trace_probe_has_gone(tp)) {
244 !trace_probe_has_gone(tp)) {
245 if (trace_probe_is_return(tp)) 218 if (trace_probe_is_return(tp))
246 ret = enable_kretprobe(&tp->rp); 219 ret = enable_kretprobe(&tp->rp);
247 else 220 else
248 ret = enable_kprobe(&tp->rp.kp); 221 ret = enable_kprobe(&tp->rp.kp);
249 } 222 }
250 223 out:
251 out_unlock:
252 mutex_unlock(&probe_enable_lock);
253
254 return ret; 224 return ret;
255} 225}
256 226
257static int 227static struct event_file_link *
258trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file) 228find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file)
259{ 229{
260 struct ftrace_event_file **files; 230 struct event_file_link *link;
261 int i;
262 231
263 /* 232 list_for_each_entry(link, &tp->files, list)
264 * Since all tp->files updater is protected by probe_enable_lock, 233 if (link->file == file)
265 * we don't need to lock an rcu_read_lock. 234 return link;
266 */
267 files = rcu_dereference_raw(tp->files);
268 if (files) {
269 for (i = 0; files[i]; i++)
270 if (files[i] == file)
271 return i;
272 }
273 235
274 return -1; 236 return NULL;
275} 237}
276 238
277/* 239/*
@@ -281,43 +243,23 @@ trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
281static int 243static int
282disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file) 244disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
283{ 245{
246 struct event_file_link *link = NULL;
247 int wait = 0;
284 int ret = 0; 248 int ret = 0;
285 249
286 mutex_lock(&probe_enable_lock);
287
288 if (file) { 250 if (file) {
289 struct ftrace_event_file **new, **old; 251 link = find_event_file_link(tp, file);
290 int n = trace_probe_nr_files(tp); 252 if (!link) {
291 int i, j;
292
293 old = rcu_dereference_raw(tp->files);
294 if (n == 0 || trace_probe_file_index(tp, file) < 0) {
295 ret = -EINVAL; 253 ret = -EINVAL;
296 goto out_unlock; 254 goto out;
297 }
298
299 if (n == 1) { /* Remove the last file */
300 tp->flags &= ~TP_FLAG_TRACE;
301 new = NULL;
302 } else {
303 new = kzalloc(n * sizeof(struct ftrace_event_file *),
304 GFP_KERNEL);
305 if (!new) {
306 ret = -ENOMEM;
307 goto out_unlock;
308 }
309
310 /* This copy & check loop copies the NULL stopper too */
311 for (i = 0, j = 0; j < n && i < n + 1; i++)
312 if (old[i] != file)
313 new[j++] = old[i];
314 } 255 }
315 256
316 rcu_assign_pointer(tp->files, new); 257 list_del_rcu(&link->list);
258 wait = 1;
259 if (!list_empty(&tp->files))
260 goto out;
317 261
318 /* Make sure the probe is done with old files */ 262 tp->flags &= ~TP_FLAG_TRACE;
319 synchronize_sched();
320 kfree(old);
321 } else 263 } else
322 tp->flags &= ~TP_FLAG_PROFILE; 264 tp->flags &= ~TP_FLAG_PROFILE;
323 265
@@ -326,10 +268,21 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
326 disable_kretprobe(&tp->rp); 268 disable_kretprobe(&tp->rp);
327 else 269 else
328 disable_kprobe(&tp->rp.kp); 270 disable_kprobe(&tp->rp.kp);
271 wait = 1;
272 }
273 out:
274 if (wait) {
275 /*
276 * Synchronize with kprobe_trace_func/kretprobe_trace_func
277 * to ensure disabled (all running handlers are finished).
278 * This is not only for kfree(), but also the caller,
279 * trace_remove_event_call() supposes it for releasing
280 * event_call related objects, which will be accessed in
281 * the kprobe_trace_func/kretprobe_trace_func.
282 */
283 synchronize_sched();
284 kfree(link); /* Ignored if link == NULL */
329 } 285 }
330
331 out_unlock:
332 mutex_unlock(&probe_enable_lock);
333 286
334 return ret; 287 return ret;
335} 288}
@@ -885,20 +838,10 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
885static __kprobes void 838static __kprobes void
886kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs) 839kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
887{ 840{
888 /* 841 struct event_file_link *link;
889 * Note: preempt is already disabled around the kprobe handler.
890 * However, we still need an smp_read_barrier_depends() corresponding
891 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
892 */
893 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
894
895 if (unlikely(!file))
896 return;
897 842
898 while (*file) { 843 list_for_each_entry_rcu(link, &tp->files, list)
899 __kprobe_trace_func(tp, regs, *file); 844 __kprobe_trace_func(tp, regs, link->file);
900 file++;
901 }
902} 845}
903 846
904/* Kretprobe handler */ 847/* Kretprobe handler */
@@ -945,20 +888,10 @@ static __kprobes void
945kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri, 888kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
946 struct pt_regs *regs) 889 struct pt_regs *regs)
947{ 890{
948 /* 891 struct event_file_link *link;
949 * Note: preempt is already disabled around the kprobe handler.
950 * However, we still need an smp_read_barrier_depends() corresponding
951 * to smp_wmb() in rcu_assign_pointer() to access the pointer.
952 */
953 struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
954
955 if (unlikely(!file))
956 return;
957 892
958 while (*file) { 893 list_for_each_entry_rcu(link, &tp->files, list)
959 __kretprobe_trace_func(tp, ri, regs, *file); 894 __kretprobe_trace_func(tp, ri, regs, link->file);
960 file++;
961 }
962} 895}
963 896
964/* Event entry printers */ 897/* Event entry printers */
@@ -1157,13 +1090,14 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1157 int size, __size, dsize; 1090 int size, __size, dsize;
1158 int rctx; 1091 int rctx;
1159 1092
1093 head = this_cpu_ptr(call->perf_events);
1094 if (hlist_empty(head))
1095 return;
1096
1160 dsize = __get_data_size(tp, regs); 1097 dsize = __get_data_size(tp, regs);
1161 __size = sizeof(*entry) + tp->size + dsize; 1098 __size = sizeof(*entry) + tp->size + dsize;
1162 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1099 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1163 size -= sizeof(u32); 1100 size -= sizeof(u32);
1164 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1165 "profile buffer not large enough"))
1166 return;
1167 1101
1168 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1102 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1169 if (!entry) 1103 if (!entry)
@@ -1172,10 +1106,7 @@ kprobe_perf_func(struct trace_probe *tp, struct pt_regs *regs)
1172 entry->ip = (unsigned long)tp->rp.kp.addr; 1106 entry->ip = (unsigned long)tp->rp.kp.addr;
1173 memset(&entry[1], 0, dsize); 1107 memset(&entry[1], 0, dsize);
1174 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1108 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1175 1109 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1176 head = this_cpu_ptr(call->perf_events);
1177 perf_trace_buf_submit(entry, size, rctx,
1178 entry->ip, 1, regs, head, NULL);
1179} 1110}
1180 1111
1181/* Kretprobe profile handler */ 1112/* Kretprobe profile handler */
@@ -1189,13 +1120,14 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1189 int size, __size, dsize; 1120 int size, __size, dsize;
1190 int rctx; 1121 int rctx;
1191 1122
1123 head = this_cpu_ptr(call->perf_events);
1124 if (hlist_empty(head))
1125 return;
1126
1192 dsize = __get_data_size(tp, regs); 1127 dsize = __get_data_size(tp, regs);
1193 __size = sizeof(*entry) + tp->size + dsize; 1128 __size = sizeof(*entry) + tp->size + dsize;
1194 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1129 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1195 size -= sizeof(u32); 1130 size -= sizeof(u32);
1196 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1197 "profile buffer not large enough"))
1198 return;
1199 1131
1200 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1132 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1201 if (!entry) 1133 if (!entry)
@@ -1204,13 +1136,16 @@ kretprobe_perf_func(struct trace_probe *tp, struct kretprobe_instance *ri,
1204 entry->func = (unsigned long)tp->rp.kp.addr; 1136 entry->func = (unsigned long)tp->rp.kp.addr;
1205 entry->ret_ip = (unsigned long)ri->ret_addr; 1137 entry->ret_ip = (unsigned long)ri->ret_addr;
1206 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); 1138 store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
1207 1139 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
1208 head = this_cpu_ptr(call->perf_events);
1209 perf_trace_buf_submit(entry, size, rctx,
1210 entry->ret_ip, 1, regs, head, NULL);
1211} 1140}
1212#endif /* CONFIG_PERF_EVENTS */ 1141#endif /* CONFIG_PERF_EVENTS */
1213 1142
1143/*
1144 * called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
1145 *
1146 * kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
1147 * lockless, but we can't race with this __init function.
1148 */
1214static __kprobes 1149static __kprobes
1215int kprobe_register(struct ftrace_event_call *event, 1150int kprobe_register(struct ftrace_event_call *event,
1216 enum trace_reg type, void *data) 1151 enum trace_reg type, void *data)
@@ -1376,6 +1311,10 @@ find_trace_probe_file(struct trace_probe *tp, struct trace_array *tr)
1376 return NULL; 1311 return NULL;
1377} 1312}
1378 1313
1314/*
1315 * Nobody but us can call enable_trace_probe/disable_trace_probe at this
1316 * stage, we can do this lockless.
1317 */
1379static __init int kprobe_trace_self_tests_init(void) 1318static __init int kprobe_trace_self_tests_init(void)
1380{ 1319{
1381 int ret, warn = 0; 1320 int ret, warn = 0;
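The trace_kprobe.c changes above retire the reallocated, NULL-terminated ftrace_event_file pointer array (and its private probe_enable_lock) in favour of an RCU-protected list of event_file_link entries, relying on the caller already holding event_mutex. Condensed to the three sides of the pattern, using the same helpers the diff introduces (fragment, surrounding declarations as in the diff):

/* Writer: attach a file (event_mutex held by the caller). */
struct event_file_link *link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link)
	return -ENOMEM;
link->file = file;
list_add_tail_rcu(&link->list, &tp->files);

/* Reader: kprobe handler, preemption already disabled. */
list_for_each_entry_rcu(link, &tp->files, list)
	__kprobe_trace_func(tp, regs, link->file);

/* Writer: detach, wait out any running handlers, then free. */
list_del_rcu(&link->list);
synchronize_sched();			/* no handler still sees the link */
kfree(link);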
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index a5e8f4878bfa..b3dcfb2f0fef 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -90,7 +90,7 @@ static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
90 if (drv) 90 if (drv)
91 ret += trace_seq_printf(s, " %s\n", drv->name); 91 ret += trace_seq_printf(s, " %s\n", drv->name);
92 else 92 else
93 ret += trace_seq_printf(s, " \n"); 93 ret += trace_seq_puts(s, " \n");
94 return ret; 94 return ret;
95} 95}
96 96
@@ -107,7 +107,7 @@ static void mmio_pipe_open(struct trace_iterator *iter)
107 struct header_iter *hiter; 107 struct header_iter *hiter;
108 struct trace_seq *s = &iter->seq; 108 struct trace_seq *s = &iter->seq;
109 109
110 trace_seq_printf(s, "VERSION 20070824\n"); 110 trace_seq_puts(s, "VERSION 20070824\n");
111 111
112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); 112 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
113 if (!hiter) 113 if (!hiter)
@@ -209,7 +209,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
209 (rw->value >> 0) & 0xff, rw->pc, 0); 209 (rw->value >> 0) & 0xff, rw->pc, 0);
210 break; 210 break;
211 default: 211 default:
212 ret = trace_seq_printf(s, "rw what?\n"); 212 ret = trace_seq_puts(s, "rw what?\n");
213 break; 213 break;
214 } 214 }
215 if (ret) 215 if (ret)
@@ -245,7 +245,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
245 secs, usec_rem, m->map_id, 0UL, 0); 245 secs, usec_rem, m->map_id, 0UL, 0);
246 break; 246 break;
247 default: 247 default:
248 ret = trace_seq_printf(s, "map what?\n"); 248 ret = trace_seq_puts(s, "map what?\n");
249 break; 249 break;
250 } 250 }
251 if (ret) 251 if (ret)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bb922d9ee51b..34e7cbac0c9c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -78,7 +78,7 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
78 78
79 trace_assign_type(field, entry); 79 trace_assign_type(field, entry);
80 80
81 ret = trace_seq_printf(s, "%s", field->buf); 81 ret = trace_seq_puts(s, field->buf);
82 if (!ret) 82 if (!ret)
83 return TRACE_TYPE_PARTIAL_LINE; 83 return TRACE_TYPE_PARTIAL_LINE;
84 84
@@ -558,14 +558,14 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
558 if (ret) 558 if (ret)
559 ret = trace_seq_puts(s, "??"); 559 ret = trace_seq_puts(s, "??");
560 if (ret) 560 if (ret)
561 ret = trace_seq_puts(s, "\n"); 561 ret = trace_seq_putc(s, '\n');
562 continue; 562 continue;
563 } 563 }
564 if (!ret) 564 if (!ret)
565 break; 565 break;
566 if (ret) 566 if (ret)
567 ret = seq_print_user_ip(s, mm, ip, sym_flags); 567 ret = seq_print_user_ip(s, mm, ip, sym_flags);
568 ret = trace_seq_puts(s, "\n"); 568 ret = trace_seq_putc(s, '\n');
569 } 569 }
570 570
571 if (mm) 571 if (mm)
@@ -579,7 +579,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
579 int ret; 579 int ret;
580 580
581 if (!ip) 581 if (!ip)
582 return trace_seq_printf(s, "0"); 582 return trace_seq_putc(s, '0');
583 583
584 if (sym_flags & TRACE_ITER_SYM_OFFSET) 584 if (sym_flags & TRACE_ITER_SYM_OFFSET)
585 ret = seq_print_sym_offset(s, "%s", ip); 585 ret = seq_print_sym_offset(s, "%s", ip);
@@ -964,14 +964,14 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
964 goto partial; 964 goto partial;
965 965
966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { 966 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
967 if (!trace_seq_printf(s, " <-")) 967 if (!trace_seq_puts(s, " <-"))
968 goto partial; 968 goto partial;
969 if (!seq_print_ip_sym(s, 969 if (!seq_print_ip_sym(s,
970 field->parent_ip, 970 field->parent_ip,
971 flags)) 971 flags))
972 goto partial; 972 goto partial;
973 } 973 }
974 if (!trace_seq_printf(s, "\n")) 974 if (!trace_seq_putc(s, '\n'))
975 goto partial; 975 goto partial;
976 976
977 return TRACE_TYPE_HANDLED; 977 return TRACE_TYPE_HANDLED;
@@ -1210,7 +1210,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1210 1210
1211 if (!seq_print_ip_sym(s, *p, flags)) 1211 if (!seq_print_ip_sym(s, *p, flags))
1212 goto partial; 1212 goto partial;
1213 if (!trace_seq_puts(s, "\n")) 1213 if (!trace_seq_putc(s, '\n'))
1214 goto partial; 1214 goto partial;
1215 } 1215 }
1216 1216
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 2901e3b88590..a7329b7902f8 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -640,13 +640,20 @@ out:
640 * Enable ftrace, sleep 1/10 second, and then read the trace 640 * Enable ftrace, sleep 1/10 second, and then read the trace
641 * buffer to see if all is in order. 641 * buffer to see if all is in order.
642 */ 642 */
643int 643__init int
644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 644trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
645{ 645{
646 int save_ftrace_enabled = ftrace_enabled; 646 int save_ftrace_enabled = ftrace_enabled;
647 unsigned long count; 647 unsigned long count;
648 int ret; 648 int ret;
649 649
650#ifdef CONFIG_DYNAMIC_FTRACE
651 if (ftrace_filter_param) {
652 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
653 return 0;
654 }
655#endif
656
650 /* make sure msleep has been recorded */ 657 /* make sure msleep has been recorded */
651 msleep(1); 658 msleep(1);
652 659
@@ -727,13 +734,20 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
 727 * Pretty much the same as for the function tracer from which the selftest 734 * Pretty much the same as for the function tracer from which the selftest
728 * has been borrowed. 735 * has been borrowed.
729 */ 736 */
730int 737__init int
731trace_selftest_startup_function_graph(struct tracer *trace, 738trace_selftest_startup_function_graph(struct tracer *trace,
732 struct trace_array *tr) 739 struct trace_array *tr)
733{ 740{
734 int ret; 741 int ret;
735 unsigned long count; 742 unsigned long count;
736 743
744#ifdef CONFIG_DYNAMIC_FTRACE
745 if (ftrace_filter_param) {
746 printk(KERN_CONT " ... kernel command line filter set: force PASS ... ");
747 return 0;
748 }
749#endif
750
737 /* 751 /*
738 * Simulate the init() callback but we attach a watchdog callback 752 * Simulate the init() callback but we attach a watchdog callback
739 * to detect and recover from possible hangs 753 * to detect and recover from possible hangs
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8f2ac73c7a5f..8fd03657bc7d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -175,7 +175,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
175 entry = syscall_nr_to_meta(syscall); 175 entry = syscall_nr_to_meta(syscall);
176 176
177 if (!entry) { 177 if (!entry) {
178 trace_seq_printf(s, "\n"); 178 trace_seq_putc(s, '\n');
179 return TRACE_TYPE_HANDLED; 179 return TRACE_TYPE_HANDLED;
180 } 180 }
181 181
@@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
306 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
307 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
308 struct ring_buffer *buffer; 308 struct ring_buffer *buffer;
309 unsigned long irq_flags;
310 int pc;
309 int syscall_nr; 311 int syscall_nr;
310 int size; 312 int size;
311 313
@@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
321 323
322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 324 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
323 325
326 local_save_flags(irq_flags);
327 pc = preempt_count();
328
324 buffer = tr->trace_buffer.buffer; 329 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer, 330 event = trace_buffer_lock_reserve(buffer,
326 sys_data->enter_event->event.type, size, 0, 0); 331 sys_data->enter_event->event.type, size, irq_flags, pc);
327 if (!event) 332 if (!event)
328 return; 333 return;
329 334
@@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
333 338
334 if (!filter_current_check_discard(buffer, sys_data->enter_event, 339 if (!filter_current_check_discard(buffer, sys_data->enter_event,
335 entry, event)) 340 entry, event))
336 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 341 trace_current_buffer_unlock_commit(buffer, event,
342 irq_flags, pc);
337} 343}
338 344
339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) 345static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
343 struct syscall_metadata *sys_data; 349 struct syscall_metadata *sys_data;
344 struct ring_buffer_event *event; 350 struct ring_buffer_event *event;
345 struct ring_buffer *buffer; 351 struct ring_buffer *buffer;
352 unsigned long irq_flags;
353 int pc;
346 int syscall_nr; 354 int syscall_nr;
347 355
348 syscall_nr = trace_get_syscall_nr(current, regs); 356 syscall_nr = trace_get_syscall_nr(current, regs);
@@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
355 if (!sys_data) 363 if (!sys_data)
356 return; 364 return;
357 365
366 local_save_flags(irq_flags);
367 pc = preempt_count();
368
358 buffer = tr->trace_buffer.buffer; 369 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer, 370 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 371 sys_data->exit_event->event.type, sizeof(*entry),
372 irq_flags, pc);
361 if (!event) 373 if (!event)
362 return; 374 return;
363 375
@@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
367 379
368 if (!filter_current_check_discard(buffer, sys_data->exit_event, 380 if (!filter_current_check_discard(buffer, sys_data->exit_event,
369 entry, event)) 381 entry, event))
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 382 trace_current_buffer_unlock_commit(buffer, event,
383 irq_flags, pc);
371} 384}
372 385
373static int reg_event_syscall_enter(struct ftrace_event_file *file, 386static int reg_event_syscall_enter(struct ftrace_event_file *file,
@@ -553,15 +566,15 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
553 if (!sys_data) 566 if (!sys_data)
554 return; 567 return;
555 568
569 head = this_cpu_ptr(sys_data->enter_event->perf_events);
570 if (hlist_empty(head))
571 return;
572
556 /* get the size after alignment with the u32 buffer size field */ 573 /* get the size after alignment with the u32 buffer size field */
557 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 574 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
558 size = ALIGN(size + sizeof(u32), sizeof(u64)); 575 size = ALIGN(size + sizeof(u32), sizeof(u64));
559 size -= sizeof(u32); 576 size -= sizeof(u32);
560 577
561 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
562 "perf buffer not large enough"))
563 return;
564
565 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 578 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
566 sys_data->enter_event->event.type, regs, &rctx); 579 sys_data->enter_event->event.type, regs, &rctx);
567 if (!rec) 580 if (!rec)
@@ -570,8 +583,6 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
570 rec->nr = syscall_nr; 583 rec->nr = syscall_nr;
571 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 584 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
572 (unsigned long *)&rec->args); 585 (unsigned long *)&rec->args);
573
574 head = this_cpu_ptr(sys_data->enter_event->perf_events);
575 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 586 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
576} 587}
577 588
@@ -629,18 +640,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
629 if (!sys_data) 640 if (!sys_data)
630 return; 641 return;
631 642
643 head = this_cpu_ptr(sys_data->exit_event->perf_events);
644 if (hlist_empty(head))
645 return;
646
632 /* We can probably do that at build time */ 647 /* We can probably do that at build time */
633 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 648 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
634 size -= sizeof(u32); 649 size -= sizeof(u32);
635 650
636 /*
637 * Impossible, but be paranoid with the future
638 * How to put this check outside runtime?
639 */
640 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
641 "exit event has grown above perf buffer size"))
642 return;
643
644 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 651 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
645 sys_data->exit_event->event.type, regs, &rctx); 652 sys_data->exit_event->event.type, regs, &rctx);
646 if (!rec) 653 if (!rec)
@@ -648,8 +655,6 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
648 655
649 rec->nr = syscall_nr; 656 rec->nr = syscall_nr;
650 rec->ret = syscall_get_return_value(current, regs); 657 rec->ret = syscall_get_return_value(current, regs);
651
652 head = this_cpu_ptr(sys_data->exit_event->perf_events);
653 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 658 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
654} 659}
655 660
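ftrace_syscall_enter()/exit() above stop passing hard-coded zeros for the interrupt flags and preemption count, so the irqs-off and preempt-depth columns in the trace output now reflect the real context of the syscall event. The change reduces to capture once, pass to both reserve and commit (fragment; buffer, type and size as in the diff, filter check omitted):

unsigned long irq_flags;
int pc;

local_save_flags(irq_flags);		/* snapshot IRQ state */
pc = preempt_count();			/* snapshot preemption depth */

event = trace_buffer_lock_reserve(buffer, type, size, irq_flags, pc);
if (event) {
	/* ... fill in the syscall entry ... */
	trace_current_buffer_unlock_commit(buffer, event, irq_flags, pc);
}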
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 32494fb0ee64..a23d2d71188e 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -283,8 +283,10 @@ static int create_trace_uprobe(int argc, char **argv)
283 return -EINVAL; 283 return -EINVAL;
284 } 284 }
285 arg = strchr(argv[1], ':'); 285 arg = strchr(argv[1], ':');
286 if (!arg) 286 if (!arg) {
287 ret = -EINVAL;
287 goto fail_address_parse; 288 goto fail_address_parse;
289 }
288 290
289 *arg++ = '\0'; 291 *arg++ = '\0';
290 filename = argv[1]; 292 filename = argv[1];
@@ -816,8 +818,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu,
816 818
817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 819 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); 820 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
820 return;
821 821
822 preempt_disable(); 822 preempt_disable();
823 head = this_cpu_ptr(call->perf_events); 823 head = this_cpu_ptr(call->perf_events);
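The create_trace_uprobe() hunk fixes a classic goto-cleanup bug: the jump to fail_address_parse previously happened with ret still holding whatever value it last had, so a probe spec with no ':' separator could be reported as success. A minimal userspace sketch of the same idiom, with parse_spec() and the spec format invented for illustration:

/*
 * Minimal sketch of the error-path fix in create_trace_uprobe() above:
 * jumping to a shared cleanup label is only safe if the error code has
 * been set first, otherwise the function can "fail" with ret == 0.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int parse_spec(char *spec)
{
    int ret = 0;
    char *arg = strchr(spec, ':');

    if (!arg) {
        ret = -EINVAL;      /* the fix: set the code before the goto */
        goto fail;
    }
    *arg++ = '\0';
    printf("file='%s' offset='%s'\n", spec, arg);
    return 0;

fail:
    fprintf(stderr, "failed to parse '%s'\n", spec);
    return ret;
}

int main(void)
{
    char good[] = "/bin/true:0x4710";   /* invented example spec */
    char bad[]  = "/bin/true";

    printf("good: %d\n", parse_spec(good));
    printf("bad:  %d\n", parse_spec(bad));
    return 0;
}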
diff --git a/kernel/wait.c b/kernel/wait.c
index ce0daa320a26..dec68bd4e9d8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -333,7 +333,8 @@ int __wait_on_atomic_t(wait_queue_head_t *wq, struct wait_bit_queue *q,
333 prepare_to_wait(wq, &q->wait, mode); 333 prepare_to_wait(wq, &q->wait, mode);
334 val = q->key.flags; 334 val = q->key.flags;
335 if (atomic_read(val) == 0) 335 if (atomic_read(val) == 0)
336 ret = (*action)(val); 336 break;
337 ret = (*action)(val);
337 } while (!ret && atomic_read(val) != 0); 338 } while (!ret && atomic_read(val) != 0);
338 finish_wait(wq, &q->wait); 339 finish_wait(wq, &q->wait);
339 return ret; 340 return ret;
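The __wait_on_atomic_t() change is a control-flow fix: the waiter should return as soon as the atomic count is already zero and only call the caller-supplied action (which normally sleeps) while the count is still non-zero, whereas the old code only invoked the action once the count had already dropped to zero. A userspace simulation of the corrected loop, with fake_sleep() standing in for the caller-supplied action:

/*
 * Userspace simulation of the control flow fixed in __wait_on_atomic_t()
 * above: break out as soon as the counter is zero, otherwise run the
 * wait action.  fake_sleep() pretends another thread drops the count.
 */
#include <stdio.h>

static int counter = 3;

static int fake_sleep(int *val)
{
    printf("  action: counter=%d, sleeping...\n", *val);
    (*val)--;           /* pretend someone else released a reference */
    return 0;           /* 0 = not interrupted, keep waiting */
}

static int wait_on_counter(int *val, int (*action)(int *))
{
    int ret = 0;

    do {
        /* prepare_to_wait() would go here in the kernel version */
        if (*val == 0)
            break;      /* the fix: condition already met, stop */
        ret = (*action)(val);
    } while (!ret && *val != 0);
    /* finish_wait() would go here */
    return ret;
}

int main(void)
{
    int ret = wait_on_counter(&counter, fake_sleep);

    printf("done: ret=%d counter=%d\n", ret, counter);
    return 0;
}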
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 05039e348f07..1241d8c91d5e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,9 +29,9 @@
29#include <linux/kvm_para.h> 29#include <linux/kvm_para.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31 31
32int watchdog_enabled = 1; 32int watchdog_user_enabled = 1;
33int __read_mostly watchdog_thresh = 10; 33int __read_mostly watchdog_thresh = 10;
34static int __read_mostly watchdog_disabled; 34static int __read_mostly watchdog_running;
35static u64 __read_mostly sample_period; 35static u64 __read_mostly sample_period;
36 36
37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 37static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str)
63 else if (!strncmp(str, "nopanic", 7)) 63 else if (!strncmp(str, "nopanic", 7))
64 hardlockup_panic = 0; 64 hardlockup_panic = 0;
65 else if (!strncmp(str, "0", 1)) 65 else if (!strncmp(str, "0", 1))
66 watchdog_enabled = 0; 66 watchdog_user_enabled = 0;
67 return 1; 67 return 1;
68} 68}
69__setup("nmi_watchdog=", hardlockup_panic_setup); 69__setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
82 82
83static int __init nowatchdog_setup(char *str) 83static int __init nowatchdog_setup(char *str)
84{ 84{
85 watchdog_enabled = 0; 85 watchdog_user_enabled = 0;
86 return 1; 86 return 1;
87} 87}
88__setup("nowatchdog", nowatchdog_setup); 88__setup("nowatchdog", nowatchdog_setup);
@@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup);
90/* deprecated */ 90/* deprecated */
91static int __init nosoftlockup_setup(char *str) 91static int __init nosoftlockup_setup(char *str)
92{ 92{
93 watchdog_enabled = 0; 93 watchdog_user_enabled = 0;
94 return 1; 94 return 1;
95} 95}
96__setup("nosoftlockup", nosoftlockup_setup); 96__setup("nosoftlockup", nosoftlockup_setup);
@@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void)
158#ifdef CONFIG_HARDLOCKUP_DETECTOR 158#ifdef CONFIG_HARDLOCKUP_DETECTOR
159void touch_nmi_watchdog(void) 159void touch_nmi_watchdog(void)
160{ 160{
161 if (watchdog_enabled) { 161 if (watchdog_user_enabled) {
162 unsigned cpu; 162 unsigned cpu;
163 163
164 for_each_present_cpu(cpu) { 164 for_each_present_cpu(cpu) {
@@ -347,11 +347,6 @@ static void watchdog_enable(unsigned int cpu)
347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 347 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
348 hrtimer->function = watchdog_timer_fn; 348 hrtimer->function = watchdog_timer_fn;
349 349
350 if (!watchdog_enabled) {
351 kthread_park(current);
352 return;
353 }
354
355 /* Enable the perf event */ 350 /* Enable the perf event */
356 watchdog_nmi_enable(cpu); 351 watchdog_nmi_enable(cpu);
357 352
@@ -374,6 +369,11 @@ static void watchdog_disable(unsigned int cpu)
374 watchdog_nmi_disable(cpu); 369 watchdog_nmi_disable(cpu);
375} 370}
376 371
372static void watchdog_cleanup(unsigned int cpu, bool online)
373{
374 watchdog_disable(cpu);
375}
376
377static int watchdog_should_run(unsigned int cpu) 377static int watchdog_should_run(unsigned int cpu)
378{ 378{
379 return __this_cpu_read(hrtimer_interrupts) != 379 return __this_cpu_read(hrtimer_interrupts) !=
@@ -475,28 +475,40 @@ static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
475static void watchdog_nmi_disable(unsigned int cpu) { return; } 475static void watchdog_nmi_disable(unsigned int cpu) { return; }
476#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 476#endif /* CONFIG_HARDLOCKUP_DETECTOR */
477 477
478/* prepare/enable/disable routines */ 478static struct smp_hotplug_thread watchdog_threads = {
479/* sysctl functions */ 479 .store = &softlockup_watchdog,
480#ifdef CONFIG_SYSCTL 480 .thread_should_run = watchdog_should_run,
481static void watchdog_enable_all_cpus(void) 481 .thread_fn = watchdog,
482 .thread_comm = "watchdog/%u",
483 .setup = watchdog_enable,
484 .cleanup = watchdog_cleanup,
485 .park = watchdog_disable,
486 .unpark = watchdog_enable,
487};
488
489static int watchdog_enable_all_cpus(void)
482{ 490{
483 unsigned int cpu; 491 int err = 0;
484 492
485 if (watchdog_disabled) { 493 if (!watchdog_running) {
486 watchdog_disabled = 0; 494 err = smpboot_register_percpu_thread(&watchdog_threads);
487 for_each_online_cpu(cpu) 495 if (err)
488 kthread_unpark(per_cpu(softlockup_watchdog, cpu)); 496 pr_err("Failed to create watchdog threads, disabled\n");
497 else
498 watchdog_running = 1;
489 } 499 }
500
501 return err;
490} 502}
491 503
504/* prepare/enable/disable routines */
505/* sysctl functions */
506#ifdef CONFIG_SYSCTL
492static void watchdog_disable_all_cpus(void) 507static void watchdog_disable_all_cpus(void)
493{ 508{
494 unsigned int cpu; 509 if (watchdog_running) {
495 510 watchdog_running = 0;
496 if (!watchdog_disabled) { 511 smpboot_unregister_percpu_thread(&watchdog_threads);
497 watchdog_disabled = 1;
498 for_each_online_cpu(cpu)
499 kthread_park(per_cpu(softlockup_watchdog, cpu));
500 } 512 }
501} 513}
502 514
@@ -507,45 +519,48 @@ static void watchdog_disable_all_cpus(void)
507int proc_dowatchdog(struct ctl_table *table, int write, 519int proc_dowatchdog(struct ctl_table *table, int write,
508 void __user *buffer, size_t *lenp, loff_t *ppos) 520 void __user *buffer, size_t *lenp, loff_t *ppos)
509{ 521{
510 int ret; 522 int err, old_thresh, old_enabled;
511 523
512 if (watchdog_disabled < 0) 524 old_thresh = ACCESS_ONCE(watchdog_thresh);
513 return -ENODEV; 525 old_enabled = ACCESS_ONCE(watchdog_user_enabled);
514 526
515 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 527 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
516 if (ret || !write) 528 if (err || !write)
517 return ret; 529 return err;
518 530
519 set_sample_period(); 531 set_sample_period();
520 /* 532 /*
521 * Watchdog threads shouldn't be enabled if they are 533 * Watchdog threads shouldn't be enabled if they are
522 * disabled. The 'watchdog_disabled' variable check in 534 * disabled. The 'watchdog_running' variable check in
523 * watchdog_*_all_cpus() function takes care of this. 535 * watchdog_*_all_cpus() function takes care of this.
524 */ 536 */
525 if (watchdog_enabled && watchdog_thresh) 537 if (watchdog_user_enabled && watchdog_thresh)
526 watchdog_enable_all_cpus(); 538 err = watchdog_enable_all_cpus();
527 else 539 else
528 watchdog_disable_all_cpus(); 540 watchdog_disable_all_cpus();
529 541
530 return ret; 542 /* Restore old values on failure */
543 if (err) {
544 watchdog_thresh = old_thresh;
545 watchdog_user_enabled = old_enabled;
546 }
547
548 return err;
531} 549}
532#endif /* CONFIG_SYSCTL */ 550#endif /* CONFIG_SYSCTL */
533 551
534static struct smp_hotplug_thread watchdog_threads = {
535 .store = &softlockup_watchdog,
536 .thread_should_run = watchdog_should_run,
537 .thread_fn = watchdog,
538 .thread_comm = "watchdog/%u",
539 .setup = watchdog_enable,
540 .park = watchdog_disable,
541 .unpark = watchdog_enable,
542};
543
544void __init lockup_detector_init(void) 552void __init lockup_detector_init(void)
545{ 553{
546 set_sample_period(); 554 set_sample_period();
547 if (smpboot_register_percpu_thread(&watchdog_threads)) { 555
548 pr_err("Failed to create watchdog threads, disabled\n"); 556#ifdef CONFIG_NO_HZ_FULL
549 watchdog_disabled = -ENODEV; 557 if (watchdog_user_enabled) {
558 watchdog_user_enabled = 0;
559 pr_warning("Disabled lockup detectors by default for full dynticks\n");
560 pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n");
550 } 561 }
562#endif
563
564 if (watchdog_user_enabled)
565 watchdog_enable_all_cpus();
551} 566}
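proc_dowatchdog() now snapshots watchdog_thresh and watchdog_user_enabled before the write and restores them if re-enabling the threads fails, so the sysctl view never advertises a configuration that did not take effect; the tail of the hunk also has lockup_detector_init() turn the detector off by default under CONFIG_NO_HZ_FULL and only register the threads if it is still enabled. A standalone sketch of the rollback pattern, with apply_write() and reconfigure() as invented stand-ins for proc_dointvec_minmax() and watchdog_enable_all_cpus()/watchdog_disable_all_cpus():

/*
 * Sketch of the restore-on-failure path added to proc_dowatchdog():
 * snapshot the tunables, apply the write, try to reconfigure, and roll
 * back the snapshots if reconfiguration fails.
 */
#include <stdio.h>

static int thresh  = 10;
static int enabled = 1;

static int apply_write(int new_thresh, int new_enabled)
{
    thresh  = new_thresh;
    enabled = new_enabled;
    return 0;
}

/* pretend thread creation fails so the rollback path is exercised */
static int reconfigure(void)
{
    return (enabled && thresh) ? -1 : 0;
}

static int handle_write(int new_thresh, int new_enabled)
{
    int old_thresh  = thresh;
    int old_enabled = enabled;
    int err;

    err = apply_write(new_thresh, new_enabled);
    if (err)
        return err;

    err = reconfigure();
    if (err) {              /* restore old values on failure */
        thresh  = old_thresh;
        enabled = old_enabled;
    }
    return err;
}

int main(void)
{
    int err = handle_write(20, 1);

    printf("err=%d thresh=%d enabled=%d\n", err, thresh, enabled);
    return 0;
}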
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4a0c3c..0b72e816b8d0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4644,7 +4644,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4644 * Workqueues should be brought up before normal priority CPU notifiers. 4644 * Workqueues should be brought up before normal priority CPU notifiers.
4645 * This will be registered high priority CPU notifier. 4645 * This will be registered high priority CPU notifier.
4646 */ 4646 */
4647static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, 4647static int workqueue_cpu_up_callback(struct notifier_block *nfb,
4648 unsigned long action, 4648 unsigned long action,
4649 void *hcpu) 4649 void *hcpu)
4650{ 4650{
@@ -4697,7 +4697,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
4697 * Workqueues should be brought down after normal priority CPU notifiers. 4697 * Workqueues should be brought down after normal priority CPU notifiers.
4698 * This will be registered as low priority CPU notifier. 4698 * This will be registered as low priority CPU notifier.
4699 */ 4699 */
4700static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, 4700static int workqueue_cpu_down_callback(struct notifier_block *nfb,
4701 unsigned long action, 4701 unsigned long action,
4702 void *hcpu) 4702 void *hcpu)
4703{ 4703{