Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/async.c | 40
-rw-r--r--  kernel/audit.c | 12
-rw-r--r--  kernel/audit.h | 3
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/auditsc.c | 14
-rw-r--r--  kernel/cgroup.c | 780
-rw-r--r--  kernel/compat.c | 84
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpu/idle.c | 116
-rw-r--r--  kernel/cpuset.c | 143
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/events/core.c | 56
-rw-r--r--  kernel/events/uprobes.c | 300
-rw-r--r--  kernel/exit.c | 5
-rw-r--r--  kernel/extable.c | 6
-rw-r--r--  kernel/fork.c | 7
-rw-r--r--  kernel/hrtimer.c | 26
-rw-r--r--  kernel/kexec.c | 43
-rw-r--r--  kernel/kmod.c | 98
-rw-r--r--  kernel/kthread.c | 65
-rw-r--r--  kernel/lockdep.c | 29
-rw-r--r--  kernel/mutex.c | 151
-rw-r--r--  kernel/panic.c | 6
-rw-r--r--  kernel/pid.c | 11
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/posix-timers.c | 121
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/suspend.c | 22
-rw-r--r--  kernel/printk.c | 97
-rw-r--r--  kernel/ptrace.c | 80
-rw-r--r--  kernel/range.c | 3
-rw-r--r--  kernel/rcutree.c | 260
-rw-r--r--  kernel/rcutree.h | 41
-rw-r--r--  kernel/rcutree_plugin.h | 601
-rw-r--r--  kernel/rcutree_trace.c | 2
-rw-r--r--  kernel/relay.c | 14
-rw-r--r--  kernel/resource.c | 198
-rw-r--r--  kernel/rtmutex-tester.c | 5
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/core.c | 314
-rw-r--r--  kernel/sched/cpuacct.c | 296
-rw-r--r--  kernel/sched/cpuacct.h | 17
-rw-r--r--  kernel/sched/cputime.c | 214
-rw-r--r--  kernel/sched/fair.c | 148
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle_task.c | 16
-rw-r--r--  kernel/sched/sched.h | 219
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/semaphore.c | 8
-rw-r--r--  kernel/signal.c | 9
-rw-r--r--  kernel/smp.c | 91
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/sys.c | 235
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/test_kprobes.c | 2
-rw-r--r--  kernel/time.c | 11
-rw-r--r--  kernel/time/ntp.c | 105
-rw-r--r--  kernel/time/ntp_internal.h | 12
-rw-r--r--  kernel/time/tick-broadcast.c | 239
-rw-r--r--  kernel/time/tick-common.c | 2
-rw-r--r--  kernel/time/tick-internal.h | 5
-rw-r--r--  kernel/time/tick-sched.c | 4
-rw-r--r--  kernel/time/timekeeping.c | 396
-rw-r--r--  kernel/time/timer_list.c | 104
-rw-r--r--  kernel/timer.c | 143
-rw-r--r--  kernel/trace/Kconfig | 49
-rw-r--r--  kernel/trace/blktrace.c | 4
-rw-r--r--  kernel/trace/ftrace.c | 98
-rw-r--r--  kernel/trace/ring_buffer.c | 500
-rw-r--r--  kernel/trace/trace.c | 2204
-rw-r--r--  kernel/trace/trace.h | 149
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 10
-rw-r--r--  kernel/trace/trace_entries.h | 23
-rw-r--r--  kernel/trace/trace_events.c | 1397
-rw-r--r--  kernel/trace/trace_events_filter.c | 34
-rw-r--r--  kernel/trace/trace_export.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 207
-rw-r--r--  kernel/trace/trace_functions_graph.c | 12
-rw-r--r--  kernel/trace/trace_irqsoff.c | 85
-rw-r--r--  kernel/trace/trace_kdb.c | 12
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 12
-rw-r--r--  kernel/trace/trace_output.c | 119
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 87
-rw-r--r--  kernel/trace/trace_selftest.c | 51
-rw-r--r--  kernel/trace/trace_stack.c | 76
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 90
-rw-r--r--  kernel/trace/trace_uprobe.c | 203
-rw-r--r--  kernel/tracepoint.c | 21
-rw-r--r--  kernel/uid16.c | 55
-rw-r--r--  kernel/watchdog.c | 5
-rw-r--r--  kernel/workqueue.c | 2907
-rw-r--r--  kernel/workqueue_internal.h | 19
100 files changed, 9893 insertions(+), 4640 deletions(-)
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
 config_data.h
 config_data.gz
 timeconst.h
+hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index bbde5f1a4486..d1574d47cf27 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ endif
 
 obj-y += sched/
 obj-y += power/
+obj-y += cpu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
diff --git a/kernel/async.c b/kernel/async.c
index 8ddee2c3e5b0..61f023ce0228 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -73,7 +73,7 @@ struct async_entry {
 	struct list_head	global_list;
 	struct work_struct	work;
 	async_cookie_t		cookie;
-	async_func_ptr		*func;
+	async_func_t		func;
 	void			*data;
 	struct async_domain	*domain;
 };
@@ -84,24 +84,20 @@ static atomic_t entry_count
 
 static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
-	struct async_entry *first = NULL;
+	struct list_head *pending;
 	async_cookie_t ret = ASYNC_COOKIE_MAX;
 	unsigned long flags;
 
 	spin_lock_irqsave(&async_lock, flags);
 
-	if (domain) {
-		if (!list_empty(&domain->pending))
-			first = list_first_entry(&domain->pending,
-						 struct async_entry, domain_list);
-	} else {
-		if (!list_empty(&async_global_pending))
-			first = list_first_entry(&async_global_pending,
-						 struct async_entry, global_list);
-	}
+	if (domain)
+		pending = &domain->pending;
+	else
+		pending = &async_global_pending;
 
-	if (first)
-		ret = first->cookie;
+	if (!list_empty(pending))
+		ret = list_first_entry(pending, struct async_entry,
+				       domain_list)->cookie;
 
 	spin_unlock_irqrestore(&async_lock, flags);
 	return ret;
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 		spin_unlock_irqrestore(&async_lock, flags);
 
 		/* low on memory.. run synchronously */
-		ptr(data, newcookie);
+		func(data, newcookie);
 		return newcookie;
 	}
 	INIT_LIST_HEAD(&entry->domain_list);
 	INIT_LIST_HEAD(&entry->global_list);
 	INIT_WORK(&entry->work, async_run_entry_fn);
-	entry->func = ptr;
+	entry->func = func;
 	entry->data = data;
 	entry->domain = domain;
 
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 
 /**
  * async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
 * Note: This function may be called from atomic or non-atomic contexts.
 */
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
 {
-	return __async_schedule(ptr, data, &async_dfl_domain);
+	return __async_schedule(func, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
 /**
 * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
 * @data: data pointer to pass to the function
 * @domain: the domain
 *
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);
 * synchronization domain is specified via @domain. Note: This function
 * may be called from atomic or non-atomic contexts.
 */
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
 				     struct async_domain *domain)
 {
-	return __async_schedule(ptr, data, domain);
+	return __async_schedule(func, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 488f85f76335..0b084fa44b1f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	/* As soon as there's any sign of userspace auditd,
 	 * start kauditd to talk to it */
-	if (!kauditd_task)
+	if (!kauditd_task) {
 		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
-	if (IS_ERR(kauditd_task)) {
-		err = PTR_ERR(kauditd_task);
-		kauditd_task = NULL;
-		return err;
+		if (IS_ERR(kauditd_task)) {
+			err = PTR_ERR(kauditd_task);
+			kauditd_task = NULL;
+			return err;
+		}
 	}
-
 	loginuid = audit_get_loginuid(current);
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1b..11468d99dad0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -59,10 +59,7 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
 extern int audit_ever_enabled;
-#endif
 
 extern int audit_pid;
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
-		put_tree(tree);
 		drop_collected_mounts(root_mnt);
 skip_it:
+		put_tree(tree);
 		mutex_lock(&audit_filter_mutex);
 	}
 	list_del(&cursor);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06f..267436826c3b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -594,6 +594,10 @@ exit_nofree:
 	return entry;
 
 exit_free:
+	if (entry->rule.watch)
+		audit_put_watch(entry->rule.watch); /* matches initial get */
+	if (entry->rule.tree)
+		audit_put_tree(entry->rule.tree); /* that's the temporary one */
 	audit_free_rule(entry);
 	return ERR_PTR(err);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a9..c68229411a7c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)
 	}
 }
 
-static inline void audit_zero_context(struct audit_context *context,
-				      enum audit_state state)
-{
-	memset(context, 0, sizeof(*context));
-	context->state = state;
-	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
 static inline struct audit_context *audit_alloc_context(enum audit_state state)
 {
 	struct audit_context *context;
 
-	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
 		return NULL;
-	audit_zero_context(context, state);
+	context->state = state;
+	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 	INIT_LIST_HEAD(&context->killed_trees);
 	INIT_LIST_HEAD(&context->names_list);
 	return context;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a32f9432666c..d3abce2d6455 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
30#include <linux/cred.h> 30#include <linux/cred.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/errno.h> 32#include <linux/errno.h>
33#include <linux/fs.h>
34#include <linux/init_task.h> 33#include <linux/init_task.h>
35#include <linux/kernel.h> 34#include <linux/kernel.h>
36#include <linux/list.h> 35#include <linux/list.h>
@@ -59,7 +58,7 @@
59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h> 59#include <linux/eventfd.h>
61#include <linux/poll.h> 60#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
63#include <linux/kthread.h> 62#include <linux/kthread.h>
64 63
65#include <linux/atomic.h> 64#include <linux/atomic.h>
@@ -83,7 +82,13 @@
83 * B happens only through cgroup_show_options() and using cgroup_root_mutex 82 * B happens only through cgroup_show_options() and using cgroup_root_mutex
84 * breaks it. 83 * breaks it.
85 */ 84 */
85#ifdef CONFIG_PROVE_RCU
86DEFINE_MUTEX(cgroup_mutex);
87EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
88#else
86static DEFINE_MUTEX(cgroup_mutex); 89static DEFINE_MUTEX(cgroup_mutex);
90#endif
91
87static DEFINE_MUTEX(cgroup_root_mutex); 92static DEFINE_MUTEX(cgroup_root_mutex);
88 93
89/* 94/*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 103#include <linux/cgroup_subsys.h>
99}; 104};
100 105
101#define MAX_CGROUP_ROOT_NAMELEN 64
102
103/*
104 * A cgroupfs_root represents the root of a cgroup hierarchy,
105 * and may be associated with a superblock to form an active
106 * hierarchy
107 */
108struct cgroupfs_root {
109 struct super_block *sb;
110
111 /*
112 * The bitmask of subsystems intended to be attached to this
113 * hierarchy
114 */
115 unsigned long subsys_mask;
116
117 /* Unique id for this hierarchy. */
118 int hierarchy_id;
119
120 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask;
122
123 /* A list running through the attached subsystems */
124 struct list_head subsys_list;
125
126 /* The root cgroup for this hierarchy */
127 struct cgroup top_cgroup;
128
129 /* Tracks how many cgroups are currently defined in hierarchy.*/
130 int number_of_cgroups;
131
132 /* A list running through the active hierarchies */
133 struct list_head root_list;
134
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */
139 unsigned long flags;
140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX];
146
147 /* The name for this hierarchy - may be empty */
148 char name[MAX_CGROUP_ROOT_NAMELEN];
149};
150
151/* 106/*
152 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
153 * subsystems that are otherwise unattached - it never has more than a 108 * subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
162 struct list_head node; 117 struct list_head node;
163 struct dentry *dentry; 118 struct dentry *dentry;
164 struct cftype *type; 119 struct cftype *type;
120
121 /* file xattrs */
122 struct simple_xattrs xattrs;
165}; 123};
166 124
167/* 125/*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
238/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
239#define dummytop (&rootnode.top_cgroup) 197#define dummytop (&rootnode.top_cgroup)
240 198
199static struct cgroup_name root_cgroup_name = { .name = "/" };
200
241/* This flag indicates whether tasks in the fork and exit paths should 201/* This flag indicates whether tasks in the fork and exit paths should
242 * check for fork/exit handlers to call. This avoids us having to do 202 * check for fork/exit handlers to call. This avoids us having to do
243 * extra work in the fork/exit path if none of the subsystems need to 203 * extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add); 210 struct cftype cfts[], bool is_add);
251 211
252#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void)
254{
255 return lockdep_is_held(&cgroup_mutex);
256}
257#else /* #ifdef CONFIG_PROVE_LOCKING */
258int cgroup_lock_is_held(void)
259{
260 return mutex_is_locked(&cgroup_mutex);
261}
262#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
263
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265
266static int css_unbias_refcnt(int refcnt) 212static int css_unbias_refcnt(int refcnt)
267{ 213{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; 214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
282 return test_bit(CGRP_REMOVED, &cgrp->flags); 228 return test_bit(CGRP_REMOVED, &cgrp->flags);
283} 229}
284 230
285/* bits in struct cgroupfs_root flags field */ 231/**
286enum { 232 * cgroup_is_descendant - test ancestry
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 * @cgrp: the cgroup to be tested
288 ROOT_XATTR, /* supports extended attributes */ 234 * @ancestor: possible ancestor of @cgrp
289}; 235 *
236 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
237 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
238 * and @ancestor are accessible.
239 */
240bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
241{
242 while (cgrp) {
243 if (cgrp == ancestor)
244 return true;
245 cgrp = cgrp->parent;
246 }
247 return false;
248}
249EXPORT_SYMBOL_GPL(cgroup_is_descendant);
290 250
291static int cgroup_is_releasable(const struct cgroup *cgrp) 251static int cgroup_is_releasable(const struct cgroup *cgrp)
292{ 252{
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
327 return __d_cfe(dentry)->type; 287 return __d_cfe(dentry)->type;
328} 288}
329 289
290/**
291 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
292 * @cgrp: the cgroup to be checked for liveness
293 *
294 * On success, returns true; the mutex should be later unlocked. On
295 * failure returns false with no lock held.
296 */
297static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{
299 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) {
301 mutex_unlock(&cgroup_mutex);
302 return false;
303 }
304 return true;
305}
306
330/* the list of cgroups eligible for automatic release. Protected by 307/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */ 308 * release_list_lock */
332static LIST_HEAD(release_list); 309static LIST_HEAD(release_list);
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
800 * update of a tasks cgroup pointer by cgroup_attach_task() 777 * update of a tasks cgroup pointer by cgroup_attach_task()
801 */ 778 */
802 779
803/**
804 * cgroup_lock - lock out any changes to cgroup structures
805 *
806 */
807void cgroup_lock(void)
808{
809 mutex_lock(&cgroup_mutex);
810}
811EXPORT_SYMBOL_GPL(cgroup_lock);
812
813/**
814 * cgroup_unlock - release lock on cgroup changes
815 *
816 * Undo the lock taken in a previous cgroup_lock() call.
817 */
818void cgroup_unlock(void)
819{
820 mutex_unlock(&cgroup_mutex);
821}
822EXPORT_SYMBOL_GPL(cgroup_unlock);
823
824/* 780/*
825 * A couple of forward declarations required, due to cyclic reference loop: 781 * A couple of forward declarations required, due to cyclic reference loop:
826 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 782 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
859 return inode; 815 return inode;
860} 816}
861 817
818static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
819{
820 struct cgroup_name *name;
821
822 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
823 if (!name)
824 return NULL;
825 strcpy(name->name, dentry->d_name.name);
826 return name;
827}
828
862static void cgroup_free_fn(struct work_struct *work) 829static void cgroup_free_fn(struct work_struct *work)
863{ 830{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)
875 mutex_unlock(&cgroup_mutex); 842 mutex_unlock(&cgroup_mutex);
876 843
877 /* 844 /*
845 * We get a ref to the parent's dentry, and put the ref when
846 * this cgroup is being freed, so it's guaranteed that the
847 * parent won't be destroyed before its children.
848 */
849 dput(cgrp->parent->dentry);
850
851 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
852
853 /*
878 * Drop the active superblock reference that we took when we 854 * Drop the active superblock reference that we took when we
879 * created the cgroup 855 * created the cgroup. This will free cgrp->root, if we are
856 * holding the last reference to @sb.
880 */ 857 */
881 deactivate_super(cgrp->root->sb); 858 deactivate_super(cgrp->root->sb);
882 859
@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)
888 865
889 simple_xattrs_free(&cgrp->xattrs); 866 simple_xattrs_free(&cgrp->xattrs);
890 867
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 868 kfree(rcu_dereference_raw(cgrp->name));
892 kfree(cgrp); 869 kfree(cgrp);
893} 870}
894 871
@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
910 } else { 887 } else {
911 struct cfent *cfe = __d_cfe(dentry); 888 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 889 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913 struct cftype *cft = cfe->type;
914 890
915 WARN_ONCE(!list_empty(&cfe->node) && 891 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup, 892 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name); 893 "cfe still linked for %s\n", cfe->type->name);
894 simple_xattrs_free(&cfe->xattrs);
918 kfree(cfe); 895 kfree(cfe);
919 simple_xattrs_free(&cft->xattrs);
920 } 896 }
921 iput(inode); 897 iput(inode);
922} 898}
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1108 mutex_lock(&cgroup_root_mutex); 1084 mutex_lock(&cgroup_root_mutex);
1109 for_each_subsys(root, ss) 1085 for_each_subsys(root, ss)
1110 seq_printf(seq, ",%s", ss->name); 1086 seq_printf(seq, ",%s", ss->name);
1111 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior");
1089 if (root->flags & CGRP_ROOT_NOPREFIX)
1112 seq_puts(seq, ",noprefix"); 1090 seq_puts(seq, ",noprefix");
1113 if (test_bit(ROOT_XATTR, &root->flags)) 1091 if (root->flags & CGRP_ROOT_XATTR)
1114 seq_puts(seq, ",xattr"); 1092 seq_puts(seq, ",xattr");
1115 if (strlen(root->release_agent_path)) 1093 if (strlen(root->release_agent_path))
1116 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1094 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1172 all_ss = true; 1150 all_ss = true;
1173 continue; 1151 continue;
1174 } 1152 }
1153 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1154 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1155 continue;
1156 }
1175 if (!strcmp(token, "noprefix")) { 1157 if (!strcmp(token, "noprefix")) {
1176 set_bit(ROOT_NOPREFIX, &opts->flags); 1158 opts->flags |= CGRP_ROOT_NOPREFIX;
1177 continue; 1159 continue;
1178 } 1160 }
1179 if (!strcmp(token, "clone_children")) { 1161 if (!strcmp(token, "clone_children")) {
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1181 continue; 1163 continue;
1182 } 1164 }
1183 if (!strcmp(token, "xattr")) { 1165 if (!strcmp(token, "xattr")) {
1184 set_bit(ROOT_XATTR, &opts->flags); 1166 opts->flags |= CGRP_ROOT_XATTR;
1185 continue; 1167 continue;
1186 } 1168 }
1187 if (!strncmp(token, "release_agent=", 14)) { 1169 if (!strncmp(token, "release_agent=", 14)) {
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1259 1241
1260 /* Consistency checks */ 1242 /* Consistency checks */
1261 1243
1244 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1245 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1246
1247 if (opts->flags & CGRP_ROOT_NOPREFIX) {
1248 pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1249 return -EINVAL;
1250 }
1251
1252 if (opts->cpuset_clone_children) {
1253 pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1254 return -EINVAL;
1255 }
1256 }
1257
1262 /* 1258 /*
1263 * Option noprefix was introduced just for backward compatibility 1259 * Option noprefix was introduced just for backward compatibility
1264 * with the old cpuset, so we allow noprefix only if mounting just 1260 * with the old cpuset, so we allow noprefix only if mounting just
1265 * the cpuset subsystem. 1261 * the cpuset subsystem.
1266 */ 1262 */
1267 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1263 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1268 (opts->subsys_mask & mask))
1269 return -EINVAL; 1264 return -EINVAL;
1270 1265
1271 1266
@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1336 struct cgroup_sb_opts opts; 1331 struct cgroup_sb_opts opts;
1337 unsigned long added_mask, removed_mask; 1332 unsigned long added_mask, removed_mask;
1338 1333
1334 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1335 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1336 return -EINVAL;
1337 }
1338
1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1340 mutex_lock(&cgroup_mutex); 1340 mutex_lock(&cgroup_mutex);
1341 mutex_lock(&cgroup_root_mutex); 1341 mutex_lock(&cgroup_root_mutex);
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1421 INIT_LIST_HEAD(&root->allcg_list); 1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1423 cgrp->root = root;
1424 cgrp->top_cgroup = cgrp; 1424 cgrp->name = &root_cgroup_name;
1425 init_cgroup_housekeeping(cgrp); 1425 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1427}
@@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1685 * any) is not needed 1685 * any) is not needed
1686 */ 1686 */
1687 cgroup_drop_root(opts.new_root); 1687 cgroup_drop_root(opts.new_root);
1688
1689 if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
1690 root->flags != opts.flags) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL;
1693 goto drop_new_super;
1694 }
1695
1688 /* no subsys rebinding, so refcounts don't change */ 1696 /* no subsys rebinding, so refcounts don't change */
1689 drop_parsed_module_refcounts(opts.subsys_mask); 1697 drop_parsed_module_refcounts(opts.subsys_mask);
1690 } 1698 }
@@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj;
1769 * @buf: the buffer to write the path into 1777 * @buf: the buffer to write the path into
1770 * @buflen: the length of the buffer 1778 * @buflen: the length of the buffer
1771 * 1779 *
1772 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1780 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1773 * reference. Writes path of cgroup into buf. Returns 0 on success, 1781 *
1774 * -errno on error. 1782 * We can't generate cgroup path using dentry->d_name, as accessing
1783 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1784 * inode's i_mutex, while on the other hand cgroup_path() can be called
1785 * with some irq-safe spinlocks held.
1775 */ 1786 */
1776int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1787int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1777{ 1788{
1778 struct dentry *dentry = cgrp->dentry; 1789 int ret = -ENAMETOOLONG;
1779 char *start; 1790 char *start;
1780 1791
1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1792 if (!cgrp->parent) {
1782 "cgroup_path() called without proper locking"); 1793 if (strlcpy(buf, "/", buflen) >= buflen)
1783 1794 return -ENAMETOOLONG;
1784 if (cgrp == dummytop) {
1785 /*
1786 * Inactive subsystems have no dentry for their root
1787 * cgroup
1788 */
1789 strcpy(buf, "/");
1790 return 0; 1795 return 0;
1791 } 1796 }
1792 1797
1793 start = buf + buflen - 1; 1798 start = buf + buflen - 1;
1794
1795 *start = '\0'; 1799 *start = '\0';
1796 for (;;) {
1797 int len = dentry->d_name.len;
1798 1800
1801 rcu_read_lock();
1802 do {
1803 const char *name = cgroup_name(cgrp);
1804 int len;
1805
1806 len = strlen(name);
1799 if ((start -= len) < buf) 1807 if ((start -= len) < buf)
1800 return -ENAMETOOLONG; 1808 goto out;
1801 memcpy(start, dentry->d_name.name, len); 1809 memcpy(start, name, len);
1802 cgrp = cgrp->parent;
1803 if (!cgrp)
1804 break;
1805 1810
1806 dentry = cgrp->dentry;
1807 if (!cgrp->parent)
1808 continue;
1809 if (--start < buf) 1811 if (--start < buf)
1810 return -ENAMETOOLONG; 1812 goto out;
1811 *start = '/'; 1813 *start = '/';
1812 } 1814
1815 cgrp = cgrp->parent;
1816 } while (cgrp->parent);
1817 ret = 0;
1813 memmove(buf, start, buf + buflen - start); 1818 memmove(buf, start, buf + buflen - start);
1814 return 0; 1819out:
1820 rcu_read_unlock();
1821 return ret;
1815} 1822}
1816EXPORT_SYMBOL_GPL(cgroup_path); 1823EXPORT_SYMBOL_GPL(cgroup_path);
1817 1824
@@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1900 * 1907 *
1901 * Must be called with cgroup_mutex and threadgroup locked. 1908 * Must be called with cgroup_mutex and threadgroup locked.
1902 */ 1909 */
1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1910static void cgroup_task_migrate(struct cgroup *oldcgrp,
1904 struct task_struct *tsk, struct css_set *newcg) 1911 struct task_struct *tsk, struct css_set *newcg)
1905{ 1912{
1906 struct css_set *oldcg; 1913 struct css_set *oldcg;
@@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1933} 1940}
1934 1941
1935/** 1942/**
1936 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1943 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
1937 * @cgrp: the cgroup the task is attaching to
1938 * @tsk: the task to be attached
1939 *
1940 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1941 * @tsk during call.
1942 */
1943int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1944{
1945 int retval = 0;
1946 struct cgroup_subsys *ss, *failed_ss = NULL;
1947 struct cgroup *oldcgrp;
1948 struct cgroupfs_root *root = cgrp->root;
1949 struct cgroup_taskset tset = { };
1950 struct css_set *newcg;
1951
1952 /* @tsk either already exited or can't exit until the end */
1953 if (tsk->flags & PF_EXITING)
1954 return -ESRCH;
1955
1956 /* Nothing to do if the task is already in that cgroup */
1957 oldcgrp = task_cgroup_from_root(tsk, root);
1958 if (cgrp == oldcgrp)
1959 return 0;
1960
1961 tset.single.task = tsk;
1962 tset.single.cgrp = oldcgrp;
1963
1964 for_each_subsys(root, ss) {
1965 if (ss->can_attach) {
1966 retval = ss->can_attach(cgrp, &tset);
1967 if (retval) {
1968 /*
1969 * Remember on which subsystem the can_attach()
1970 * failed, so that we only call cancel_attach()
1971 * against the subsystems whose can_attach()
1972 * succeeded. (See below)
1973 */
1974 failed_ss = ss;
1975 goto out;
1976 }
1977 }
1978 }
1979
1980 newcg = find_css_set(tsk->cgroups, cgrp);
1981 if (!newcg) {
1982 retval = -ENOMEM;
1983 goto out;
1984 }
1985
1986 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1987
1988 for_each_subsys(root, ss) {
1989 if (ss->attach)
1990 ss->attach(cgrp, &tset);
1991 }
1992
1993out:
1994 if (retval) {
1995 for_each_subsys(root, ss) {
1996 if (ss == failed_ss)
1997 /*
1998 * This subsystem was the one that failed the
1999 * can_attach() check earlier, so we don't need
2000 * to call cancel_attach() against it or any
2001 * remaining subsystems.
2002 */
2003 break;
2004 if (ss->cancel_attach)
2005 ss->cancel_attach(cgrp, &tset);
2006 }
2007 }
2008 return retval;
2009}
2010
2011/**
2012 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2013 * @from: attach to all cgroups of a given task
2014 * @tsk: the task to be attached
2015 */
2016int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2017{
2018 struct cgroupfs_root *root;
2019 int retval = 0;
2020
2021 cgroup_lock();
2022 for_each_active_root(root) {
2023 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2024
2025 retval = cgroup_attach_task(from_cg, tsk);
2026 if (retval)
2027 break;
2028 }
2029 cgroup_unlock();
2030
2031 return retval;
2032}
2033EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2034
2035/**
2036 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2037 * @cgrp: the cgroup to attach to 1944 * @cgrp: the cgroup to attach to
2038 * @leader: the threadgroup leader task_struct of the group to be attached 1945 * @tsk: the task or the leader of the threadgroup to be attached
1946 * @threadgroup: attach the whole threadgroup?
2039 * 1947 *
2040 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1948 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
2041 * task_lock of each thread in leader's threadgroup individually in turn. 1949 * task_lock of @tsk or each thread in the threadgroup individually in turn.
2042 */ 1950 */
2043static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1951static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1952 bool threadgroup)
2044{ 1953{
2045 int retval, i, group_size; 1954 int retval, i, group_size;
2046 struct cgroup_subsys *ss, *failed_ss = NULL; 1955 struct cgroup_subsys *ss, *failed_ss = NULL;
2047 /* guaranteed to be initialized later, but the compiler needs this */
2048 struct cgroupfs_root *root = cgrp->root; 1956 struct cgroupfs_root *root = cgrp->root;
2049 /* threadgroup list cursor and array */ 1957 /* threadgroup list cursor and array */
2050 struct task_struct *tsk; 1958 struct task_struct *leader = tsk;
2051 struct task_and_cgroup *tc; 1959 struct task_and_cgroup *tc;
2052 struct flex_array *group; 1960 struct flex_array *group;
2053 struct cgroup_taskset tset = { }; 1961 struct cgroup_taskset tset = { };
@@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2059 * group - group_rwsem prevents new threads from appearing, and if 1967 * group - group_rwsem prevents new threads from appearing, and if
2060 * threads exit, this will just be an over-estimate. 1968 * threads exit, this will just be an over-estimate.
2061 */ 1969 */
2062 group_size = get_nr_threads(leader); 1970 if (threadgroup)
1971 group_size = get_nr_threads(tsk);
1972 else
1973 group_size = 1;
2063 /* flex_array supports very large thread-groups better than kmalloc. */ 1974 /* flex_array supports very large thread-groups better than kmalloc. */
2064 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1975 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2065 if (!group) 1976 if (!group)
2066 return -ENOMEM; 1977 return -ENOMEM;
2067 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1978 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2068 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 1979 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
2069 if (retval) 1980 if (retval)
2070 goto out_free_group_list; 1981 goto out_free_group_list;
2071 1982
2072 tsk = leader;
2073 i = 0; 1983 i = 0;
2074 /* 1984 /*
2075 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1985 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2098 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2008 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2099 BUG_ON(retval != 0); 2009 BUG_ON(retval != 0);
2100 i++; 2010 i++;
2011
2012 if (!threadgroup)
2013 break;
2101 } while_each_thread(leader, tsk); 2014 } while_each_thread(leader, tsk);
2102 rcu_read_unlock(); 2015 rcu_read_unlock();
2103 /* remember the number of threads in the array for later. */ 2016 /* remember the number of threads in the array for later. */
@@ -2143,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2143 */ 2056 */
2144 for (i = 0; i < group_size; i++) { 2057 for (i = 0; i < group_size; i++) {
2145 tc = flex_array_get(group, i); 2058 tc = flex_array_get(group, i);
2146 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2059 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
2147 } 2060 }
2148 /* nothing is sensitive to fork() after this point. */ 2061 /* nothing is sensitive to fork() after this point. */
2149 2062
@@ -2224,11 +2137,11 @@ retry_find_task:
2224 tsk = tsk->group_leader; 2137 tsk = tsk->group_leader;
2225 2138
2226 /* 2139 /*
2227 * Workqueue threads may acquire PF_THREAD_BOUND and become 2140 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2228 * trapped in a cpuset, or RT worker may be born in a cgroup 2141 * trapped in a cpuset, or RT worker may be born in a cgroup
2229 * with no rt_runtime allocated. Just say no. 2142 * with no rt_runtime allocated. Just say no.
2230 */ 2143 */
2231 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { 2144 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2232 ret = -EINVAL; 2145 ret = -EINVAL;
2233 rcu_read_unlock(); 2146 rcu_read_unlock();
2234 goto out_unlock_cgroup; 2147 goto out_unlock_cgroup;
@@ -2251,17 +2164,42 @@ retry_find_task:
2251 put_task_struct(tsk); 2164 put_task_struct(tsk);
2252 goto retry_find_task; 2165 goto retry_find_task;
2253 } 2166 }
2254 ret = cgroup_attach_proc(cgrp, tsk); 2167 }
2255 } else 2168
2256 ret = cgroup_attach_task(cgrp, tsk); 2169 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2170
2257 threadgroup_unlock(tsk); 2171 threadgroup_unlock(tsk);
2258 2172
2259 put_task_struct(tsk); 2173 put_task_struct(tsk);
2260out_unlock_cgroup: 2174out_unlock_cgroup:
2261 cgroup_unlock(); 2175 mutex_unlock(&cgroup_mutex);
2262 return ret; 2176 return ret;
2263} 2177}
2264 2178
2179/**
2180 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2181 * @from: attach to all cgroups of a given task
2182 * @tsk: the task to be attached
2183 */
2184int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2185{
2186 struct cgroupfs_root *root;
2187 int retval = 0;
2188
2189 mutex_lock(&cgroup_mutex);
2190 for_each_active_root(root) {
2191 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2192
2193 retval = cgroup_attach_task(from_cg, tsk, false);
2194 if (retval)
2195 break;
2196 }
2197 mutex_unlock(&cgroup_mutex);
2198
2199 return retval;
2200}
2201EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2202
2265static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2203static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2266{ 2204{
2267 return attach_task_by_pid(cgrp, pid, false); 2205 return attach_task_by_pid(cgrp, pid, false);
@@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272 return attach_task_by_pid(cgrp, tgid, true); 2210 return attach_task_by_pid(cgrp, tgid, true);
2273} 2211}
2274 2212
2275/**
2276 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2277 * @cgrp: the cgroup to be checked for liveness
2278 *
2279 * On success, returns true; the lock should be later released with
2280 * cgroup_unlock(). On failure returns false with no lock held.
2281 */
2282bool cgroup_lock_live_group(struct cgroup *cgrp)
2283{
2284 mutex_lock(&cgroup_mutex);
2285 if (cgroup_is_removed(cgrp)) {
2286 mutex_unlock(&cgroup_mutex);
2287 return false;
2288 }
2289 return true;
2290}
2291EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2292
2293static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2213static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2294 const char *buffer) 2214 const char *buffer)
2295{ 2215{
@@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301 mutex_lock(&cgroup_root_mutex); 2221 mutex_lock(&cgroup_root_mutex);
2302 strcpy(cgrp->root->release_agent_path, buffer); 2222 strcpy(cgrp->root->release_agent_path, buffer);
2303 mutex_unlock(&cgroup_root_mutex); 2223 mutex_unlock(&cgroup_root_mutex);
2304 cgroup_unlock(); 2224 mutex_unlock(&cgroup_mutex);
2305 return 0; 2225 return 0;
2306} 2226}
2307 2227
@@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2312 return -ENODEV; 2232 return -ENODEV;
2313 seq_puts(seq, cgrp->root->release_agent_path); 2233 seq_puts(seq, cgrp->root->release_agent_path);
2314 seq_putc(seq, '\n'); 2234 seq_putc(seq, '\n');
2315 cgroup_unlock(); 2235 mutex_unlock(&cgroup_mutex);
2236 return 0;
2237}
2238
2239static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
2240 struct seq_file *seq)
2241{
2242 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2316 return 0; 2243 return 0;
2317} 2244}
2318 2245
@@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2537static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2464static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2538 struct inode *new_dir, struct dentry *new_dentry) 2465 struct inode *new_dir, struct dentry *new_dentry)
2539{ 2466{
2467 int ret;
2468 struct cgroup_name *name, *old_name;
2469 struct cgroup *cgrp;
2470
2471 /*
2472 * It's convinient to use parent dir's i_mutex to protected
2473 * cgrp->name.
2474 */
2475 lockdep_assert_held(&old_dir->i_mutex);
2476
2540 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2477 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2541 return -ENOTDIR; 2478 return -ENOTDIR;
2542 if (new_dentry->d_inode) 2479 if (new_dentry->d_inode)
2543 return -EEXIST; 2480 return -EEXIST;
2544 if (old_dir != new_dir) 2481 if (old_dir != new_dir)
2545 return -EIO; 2482 return -EIO;
2546 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2483
2484 cgrp = __d_cgrp(old_dentry);
2485
2486 name = cgroup_alloc_name(new_dentry);
2487 if (!name)
2488 return -ENOMEM;
2489
2490 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2491 if (ret) {
2492 kfree(name);
2493 return ret;
2494 }
2495
2496 old_name = cgrp->name;
2497 rcu_assign_pointer(cgrp->name, name);
2498
2499 kfree_rcu(old_name, rcu_head);
2500 return 0;
2547} 2501}
2548 2502
2549static struct simple_xattrs *__d_xattrs(struct dentry *dentry) 2503static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2551 if (S_ISDIR(dentry->d_inode->i_mode)) 2505 if (S_ISDIR(dentry->d_inode->i_mode))
2552 return &__d_cgrp(dentry)->xattrs; 2506 return &__d_cgrp(dentry)->xattrs;
2553 else 2507 else
2554 return &__d_cft(dentry)->xattrs; 2508 return &__d_cfe(dentry)->xattrs;
2555} 2509}
2556 2510
2557static inline int xattr_enabled(struct dentry *dentry) 2511static inline int xattr_enabled(struct dentry *dentry)
2558{ 2512{
2559 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 2513 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2560 return test_bit(ROOT_XATTR, &root->flags); 2514 return root->flags & CGRP_ROOT_XATTR;
2561} 2515}
2562 2516
2563static bool is_valid_xattr(const char *name) 2517static bool is_valid_xattr(const char *name)
@@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2727 umode_t mode; 2681 umode_t mode;
2728 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2729 2683
2730 simple_xattrs_init(&cft->xattrs); 2684 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2731
2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2733 strcpy(name, subsys->name); 2685 strcpy(name, subsys->name);
2734 strcat(name, "."); 2686 strcat(name, ".");
2735 } 2687 }
@@ -2753,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2753 cfe->type = (void *)cft; 2705 cfe->type = (void *)cft;
2754 cfe->dentry = dentry; 2706 cfe->dentry = dentry;
2755 dentry->d_fsdata = cfe; 2707 dentry->d_fsdata = cfe;
2708 simple_xattrs_init(&cfe->xattrs);
2756 list_add_tail(&cfe->node, &parent->files); 2709 list_add_tail(&cfe->node, &parent->files);
2757 cfe = NULL; 2710 cfe = NULL;
2758 } 2711 }
@@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2770 2723
2771 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2724 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2772 /* does cft->flags tell us to skip this file on @cgrp? */ 2725 /* does cft->flags tell us to skip this file on @cgrp? */
2726 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2727 continue;
2773 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2728 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2774 continue; 2729 continue;
2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2730 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3300 return 0; 3255 return 0;
3301} 3256}
3302 3257
3258static void cgroup_transfer_one_task(struct task_struct *task,
3259 struct cgroup_scanner *scan)
3260{
3261 struct cgroup *new_cgroup = scan->data;
3262
3263 mutex_lock(&cgroup_mutex);
3264 cgroup_attach_task(new_cgroup, task, false);
3265 mutex_unlock(&cgroup_mutex);
3266}
3267
3268/**
3269 * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3270 * @to: cgroup to which the tasks will be moved
3271 * @from: cgroup in which the tasks currently reside
3272 */
3273int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3274{
3275 struct cgroup_scanner scan;
3276
3277 scan.cg = from;
3278 scan.test_task = NULL; /* select all tasks in cgroup */
3279 scan.process_task = cgroup_transfer_one_task;
3280 scan.heap = NULL;
3281 scan.data = to;
3282
3283 return cgroup_scan_tasks(&scan);
3284}
3285
3303/* 3286/*
3304 * Stuff for reading the 'tasks'/'procs' files. 3287 * Stuff for reading the 'tasks'/'procs' files.
3305 * 3288 *
@@ -3362,35 +3345,14 @@ static void pidlist_free(void *p)
3362 else 3345 else
3363 kfree(p); 3346 kfree(p);
3364} 3347}
3365static void *pidlist_resize(void *p, int newcount)
3366{
3367 void *newlist;
3368 /* note: if new alloc fails, old p will still be valid either way */
3369 if (is_vmalloc_addr(p)) {
3370 newlist = vmalloc(newcount * sizeof(pid_t));
3371 if (!newlist)
3372 return NULL;
3373 memcpy(newlist, p, newcount * sizeof(pid_t));
3374 vfree(p);
3375 } else {
3376 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3377 }
3378 return newlist;
3379}
3380 3348
3381/* 3349/*
3382 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3350 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3383 * If the new stripped list is sufficiently smaller and there's enough memory 3351 * Returns the number of unique elements.
3384 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3385 * number of unique elements.
3386 */ 3352 */
3387/* is the size difference enough that we should re-allocate the array? */ 3353static int pidlist_uniq(pid_t *list, int length)
3388#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3389static int pidlist_uniq(pid_t **p, int length)
3390{ 3354{
3391 int src, dest = 1; 3355 int src, dest = 1;
3392 pid_t *list = *p;
3393 pid_t *newlist;
3394 3356
3395 /* 3357 /*
3396 * we presume the 0th element is unique, so i starts at 1. trivial 3358 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)
3411 dest++; 3373 dest++;
3412 } 3374 }
3413after: 3375after:
3414 /*
3415 * if the length difference is large enough, we want to allocate a
3416 * smaller buffer to save memory. if this fails due to out of memory,
3417 * we'll just stay with what we've got.
3418 */
3419 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3420 newlist = pidlist_resize(list, dest);
3421 if (newlist)
3422 *p = newlist;
3423 }
3424 return dest; 3376 return dest;
3425} 3377}
3426 3378
@@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3516 /* now sort & (if procs) strip out duplicates */ 3468 /* now sort & (if procs) strip out duplicates */
3517 sort(array, length, sizeof(pid_t), cmppid, NULL); 3469 sort(array, length, sizeof(pid_t), cmppid, NULL);
3518 if (type == CGROUP_FILE_PROCS) 3470 if (type == CGROUP_FILE_PROCS)
3519 length = pidlist_uniq(&array, length); 3471 length = pidlist_uniq(array, length);
3520 l = cgroup_pidlist_find(cgrp, type); 3472 l = cgroup_pidlist_find(cgrp, type);
3521 if (!l) { 3473 if (!l) {
3522 pidlist_free(array); 3474 pidlist_free(array);
@@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3930 if (ret) 3882 if (ret)
3931 goto fail; 3883 goto fail;
3932 3884
3933 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3885 efile->f_op->poll(efile, &event->pt);
3934 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3935 ret = 0;
3936 goto fail;
3937 }
3938 3886
3939 /* 3887 /*
3940 * Events should be removed after rmdir of cgroup directory, but before 3888 * Events should be removed after rmdir of cgroup directory, but before
@@ -4016,10 +3964,16 @@ static struct cftype files[] = {
4016 }, 3964 },
4017 { 3965 {
4018 .name = "cgroup.clone_children", 3966 .name = "cgroup.clone_children",
3967 .flags = CFTYPE_INSANE,
4019 .read_u64 = cgroup_clone_children_read, 3968 .read_u64 = cgroup_clone_children_read,
4020 .write_u64 = cgroup_clone_children_write, 3969 .write_u64 = cgroup_clone_children_write,
4021 }, 3970 },
4022 { 3971 {
3972 .name = "cgroup.sane_behavior",
3973 .flags = CFTYPE_ONLY_ON_ROOT,
3974 .read_seq_string = cgroup_sane_behavior_show,
3975 },
3976 {
4023 .name = "release_agent", 3977 .name = "release_agent",
4024 .flags = CFTYPE_ONLY_ON_ROOT, 3978 .flags = CFTYPE_ONLY_ON_ROOT,
4025 .read_seq_string = cgroup_release_agent_show, 3979 .read_seq_string = cgroup_release_agent_show,
@@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4131 if (!(css->flags & CSS_ONLINE)) 4085 if (!(css->flags & CSS_ONLINE))
4132 return; 4086 return;
4133 4087
4134 /* 4088 if (ss->css_offline)
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp); 4089 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145 4090
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4091 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4147} 4092}
@@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4158 umode_t mode) 4103 umode_t mode)
4159{ 4104{
4160 struct cgroup *cgrp; 4105 struct cgroup *cgrp;
4106 struct cgroup_name *name;
4161 struct cgroupfs_root *root = parent->root; 4107 struct cgroupfs_root *root = parent->root;
4162 int err = 0; 4108 int err = 0;
4163 struct cgroup_subsys *ss; 4109 struct cgroup_subsys *ss;
@@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4168 if (!cgrp) 4114 if (!cgrp)
4169 return -ENOMEM; 4115 return -ENOMEM;
4170 4116
4117 name = cgroup_alloc_name(dentry);
4118 if (!name)
4119 goto err_free_cgrp;
4120 rcu_assign_pointer(cgrp->name, name);
4121
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4122 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0) 4123 if (cgrp->id < 0)
4173 goto err_free_cgrp; 4124 goto err_free_name;
4174 4125
4175 /* 4126 /*
4176 * Only live parents can have children. Note that the liveliness 4127 * Only live parents can have children. Note that the liveliness
@@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4198 4149
4199 cgrp->parent = parent; 4150 cgrp->parent = parent;
4200 cgrp->root = parent->root; 4151 cgrp->root = parent->root;
4201 cgrp->top_cgroup = parent->top_cgroup;
4202 4152
4203 if (notify_on_release(parent)) 4153 if (notify_on_release(parent))
4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4154 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4241 for_each_subsys(root, ss) 4191 for_each_subsys(root, ss)
4242 dget(dentry); 4192 dget(dentry);
4243 4193
4194 /* hold a ref to the parent's dentry */
4195 dget(parent->dentry);
4196
4244 /* creation succeeded, notify subsystems */ 4197 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) { 4198 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp); 4199 err = online_css(ss, cgrp);
@@ -4276,6 +4229,8 @@ err_free_all:
4276 deactivate_super(sb); 4229 deactivate_super(sb);
4277err_free_id: 4230err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4231 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4232err_free_name:
4233 kfree(rcu_dereference_raw(cgrp->name));
4279err_free_cgrp: 4234err_free_cgrp:
4280 kfree(cgrp); 4235 kfree(cgrp);
4281 return err; 4236 return err;
@@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4295 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4250 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4296} 4251}
4297 4252
4298/*
4299 * Check the reference count on each subsystem. Since we already
4300 * established that there are no tasks in the cgroup, if the css refcount
4301 * is also 1, then there should be no outstanding references, so the
4302 * subsystem is safe to destroy. We scan across all subsystems rather than
4303 * using the per-hierarchy linked list of mounted subsystems since we can
4304 * be called via check_for_release() with no synchronization other than
4305 * RCU, and the subsystem linked list isn't RCU-safe.
4306 */
4307static int cgroup_has_css_refs(struct cgroup *cgrp)
4308{
4309 int i;
4310
4311 /*
4312 * We won't need to lock the subsys array, because the subsystems
4313 * we're concerned about aren't going anywhere since our cgroup root
4314 * has a reference on them.
4315 */
4316 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4317 struct cgroup_subsys *ss = subsys[i];
4318 struct cgroup_subsys_state *css;
4319
4320 /* Skip subsystems not present or not in this hierarchy */
4321 if (ss == NULL || ss->root != cgrp->root)
4322 continue;
4323
4324 css = cgrp->subsys[ss->subsys_id];
4325 /*
4326 * When called from check_for_release() it's possible
4327 * that by this point the cgroup has been removed
4328 * and the css deleted. But a false-positive doesn't
4329 * matter, since it can only happen if the cgroup
4330 * has been deleted and hence no longer needs the
4331 * release agent to be called anyway.
4332 */
4333 if (css && css_refcnt(css) > 1)
4334 return 1;
4335 }
4336 return 0;
4337}
4338
4339static int cgroup_destroy_locked(struct cgroup *cgrp) 4253static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4254 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4341{ 4255{
4342 struct dentry *d = cgrp->dentry; 4256 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent; 4257 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp; 4258 struct cgroup_event *event, *tmp;
4346 struct cgroup_subsys *ss; 4259 struct cgroup_subsys *ss;
4347 LIST_HEAD(tmp_list);
4348 4260
4349 lockdep_assert_held(&d->d_inode->i_mutex); 4261 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex); 4262 lockdep_assert_held(&cgroup_mutex);
@@ -4468,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4468 * need to invoke fork callbacks here. */ 4380 * need to invoke fork callbacks here. */
4469 BUG_ON(!list_empty(&init_task.tasks)); 4381 BUG_ON(!list_empty(&init_task.tasks));
4470 4382
4471 ss->active = 1;
4472 BUG_ON(online_css(ss, dummytop)); 4383 BUG_ON(online_css(ss, dummytop));
4473 4384
4474 mutex_unlock(&cgroup_mutex); 4385 mutex_unlock(&cgroup_mutex);
@@ -4573,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4573 } 4484 }
4574 write_unlock(&css_set_lock); 4485 write_unlock(&css_set_lock);
4575 4486
4576 ss->active = 1;
4577 ret = online_css(ss, dummytop); 4487 ret = online_css(ss, dummytop);
4578 if (ret) 4488 if (ret)
4579 goto err_unload; 4489 goto err_unload;
@@ -4614,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4614 mutex_lock(&cgroup_mutex); 4524 mutex_lock(&cgroup_mutex);
4615 4525
4616 offline_css(ss, dummytop); 4526 offline_css(ss, dummytop);
4617 ss->active = 0;
4618 4527
4619 if (ss->use_id) 4528 if (ss->use_id)
4620 idr_destroy(&ss->idr); 4529 idr_destroy(&ss->idr);
@@ -4935,17 +4844,17 @@ void cgroup_post_fork(struct task_struct *child)
4935 * and addition to css_set. 4844 * and addition to css_set.
4936 */ 4845 */
4937 if (need_forkexit_callback) { 4846 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4847 /*
4848 * fork/exit callbacks are supported only for builtin
4849 * subsystems, and the builtin section of the subsys
4850 * array is immutable, so we don't need to lock the
4851 * subsys array here. On the other hand, modular section
4852 * of the array can be freed at module unload, so we
4853 * can't touch that.
4854 */
4855 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i]; 4856 struct cgroup_subsys *ss = subsys[i];
4940 4857
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork) 4858 if (ss->fork)
4950 ss->fork(child); 4859 ss->fork(child);
4951 } 4860 }
@@ -5010,13 +4919,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5010 tsk->cgroups = &init_css_set; 4919 tsk->cgroups = &init_css_set;
5011 4920
5012 if (run_callbacks && need_forkexit_callback) { 4921 if (run_callbacks && need_forkexit_callback) {
5013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4922 /*
4923 * fork/exit callbacks are supported only for builtin
4924 * subsystems, see cgroup_post_fork() for details.
4925 */
4926 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5014 struct cgroup_subsys *ss = subsys[i]; 4927 struct cgroup_subsys *ss = subsys[i];
5015 4928
5016 /* modular subsystems can't use callbacks */
5017 if (!ss || ss->module)
5018 continue;
5019
5020 if (ss->exit) { 4929 if (ss->exit) {
5021 struct cgroup *old_cgrp = 4930 struct cgroup *old_cgrp =
5022 rcu_dereference_raw(cg->subsys[i])->cgroup; 4931 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5030,44 +4939,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5030 put_css_set_taskexit(cg); 4939 put_css_set_taskexit(cg);
5031} 4940}
5032 4941
5033/**
5034 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
5035 * @cgrp: the cgroup in question
5036 * @task: the task in question
5037 *
5038 * See if @cgrp is a descendant of @task's cgroup in the appropriate
5039 * hierarchy.
5040 *
5041 * If we are sending in dummytop, then presumably we are creating
5042 * the top cgroup in the subsystem.
5043 *
5044 * Called only by the ns (nsproxy) cgroup.
5045 */
5046int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
5047{
5048 int ret;
5049 struct cgroup *target;
5050
5051 if (cgrp == dummytop)
5052 return 1;
5053
5054 target = task_cgroup_from_root(task, cgrp->root);
5055 while (cgrp != target && cgrp!= cgrp->top_cgroup)
5056 cgrp = cgrp->parent;
5057 ret = (cgrp == target);
5058 return ret;
5059}
5060
5061static void check_for_release(struct cgroup *cgrp) 4942static void check_for_release(struct cgroup *cgrp)
5062{ 4943{
5063 /* All of these checks rely on RCU to keep the cgroup 4944 /* All of these checks rely on RCU to keep the cgroup
5064 * structure alive */ 4945 * structure alive */
5065 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4946 if (cgroup_is_releasable(cgrp) &&
5066 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4947 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
5067 /* Control Group is currently removeable. If it's not 4948 /*
4949 * Control Group is currently removeable. If it's not
5068 * already queued for a userspace notification, queue 4950 * already queued for a userspace notification, queue
5069 * it now */ 4951 * it now
4952 */
5070 int need_schedule_work = 0; 4953 int need_schedule_work = 0;
4954
5071 raw_spin_lock(&release_list_lock); 4955 raw_spin_lock(&release_list_lock);
5072 if (!cgroup_is_removed(cgrp) && 4956 if (!cgroup_is_removed(cgrp) &&
5073 list_empty(&cgrp->release_list)) { 4957 list_empty(&cgrp->release_list)) {
@@ -5100,24 +4984,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
5100/* Caller must verify that the css is not for root cgroup */ 4984/* Caller must verify that the css is not for root cgroup */
5101void __css_put(struct cgroup_subsys_state *css) 4985void __css_put(struct cgroup_subsys_state *css)
5102{ 4986{
5103 struct cgroup *cgrp = css->cgroup;
5104 int v; 4987 int v;
5105 4988
5106 rcu_read_lock();
5107 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4989 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5108 4990 if (v == 0)
5109 switch (v) {
5110 case 1:
5111 if (notify_on_release(cgrp)) {
5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5113 check_for_release(cgrp);
5114 }
5115 break;
5116 case 0:
5117 schedule_work(&css->dput_work); 4991 schedule_work(&css->dput_work);
5118 break;
5119 }
5120 rcu_read_unlock();
5121} 4992}
5122EXPORT_SYMBOL_GPL(__css_put); 4993EXPORT_SYMBOL_GPL(__css_put);
5123 4994
@@ -5416,55 +5287,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5416} 5287}
5417EXPORT_SYMBOL_GPL(css_lookup); 5288EXPORT_SYMBOL_GPL(css_lookup);
5418 5289
5419/**
5420 * css_get_next - lookup next cgroup under specified hierarchy.
5421 * @ss: pointer to subsystem
5422 * @id: current position of iteration.
5423 * @root: pointer to css. search tree under this.
5424 * @foundid: position of found object.
5425 *
5426 * Search next css under the specified hierarchy of rootid. Calling under
5427 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
5428 */
5429struct cgroup_subsys_state *
5430css_get_next(struct cgroup_subsys *ss, int id,
5431 struct cgroup_subsys_state *root, int *foundid)
5432{
5433 struct cgroup_subsys_state *ret = NULL;
5434 struct css_id *tmp;
5435 int tmpid;
5436 int rootid = css_id(root);
5437 int depth = css_depth(root);
5438
5439 if (!rootid)
5440 return NULL;
5441
5442 BUG_ON(!ss->use_id);
5443 WARN_ON_ONCE(!rcu_read_lock_held());
5444
5445 /* fill start point for scan */
5446 tmpid = id;
5447 while (1) {
5448 /*
5449 * scan next entry from bitmap(tree), tmpid is updated after
5450 * idr_get_next().
5451 */
5452 tmp = idr_get_next(&ss->idr, &tmpid);
5453 if (!tmp)
5454 break;
5455 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5456 ret = rcu_dereference(tmp->css);
5457 if (ret) {
5458 *foundid = tmpid;
5459 break;
5460 }
5461 }
5462 /* continue to scan from next id */
5463 tmpid = tmpid + 1;
5464 }
5465 return ret;
5466}
5467
5468/* 5290/*
5469 * get corresponding css from file open on cgroupfs directory 5291 * get corresponding css from file open on cgroupfs directory
5470 */ 5292 */
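
The cgroup_create() hunks above attach a kmalloc'ed name to every new cgroup, publish it with rcu_assign_pointer(), and unwind it on the new err_free_name path. For context, a minimal sketch of the reader side this enables, assuming the cgroup_name() accessor that the cpuset hunks further down in this diff already use under rcu_read_lock() (the function name here is illustrative, not part of the diff):

    /* Sketch: report a cgroup's current name from controller code.
     * cgroup_name() dereferences the RCU-published cgrp->name, so the
     * returned string is only valid inside the RCU read-side section. */
    static void report_cgroup(struct cgroup *cgrp)
    {
            rcu_read_lock();
            pr_info("cgroup %s\n", cgroup_name(cgrp));
            rcu_read_unlock();
    }
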
diff --git a/kernel/compat.c b/kernel/compat.c
index 19971d8c7299..0a09e481b70b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
516 return 0; 516 return 0;
517} 517}
518 518
519asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
520{
521 struct rusage r;
522 int ret;
523 mm_segment_t old_fs = get_fs();
524
525 set_fs(KERNEL_DS);
526 ret = sys_getrusage(who, (struct rusage __user *) &r);
527 set_fs(old_fs);
528
529 if (ret)
530 return ret;
531
532 if (put_compat_rusage(&r, ru))
533 return -EFAULT;
534
535 return 0;
536}
537
538COMPAT_SYSCALL_DEFINE4(wait4, 519COMPAT_SYSCALL_DEFINE4(wait4,
539 compat_pid_t, pid, 520 compat_pid_t, pid,
540 compat_uint_t __user *, stat_addr, 521 compat_uint_t __user *, stat_addr,
@@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
1138} 1119}
1139#endif 1120#endif
1140 1121
1141struct compat_sysinfo {
1142 s32 uptime;
1143 u32 loads[3];
1144 u32 totalram;
1145 u32 freeram;
1146 u32 sharedram;
1147 u32 bufferram;
1148 u32 totalswap;
1149 u32 freeswap;
1150 u16 procs;
1151 u16 pad;
1152 u32 totalhigh;
1153 u32 freehigh;
1154 u32 mem_unit;
1155 char _f[20-2*sizeof(u32)-sizeof(int)];
1156};
1157
1158asmlinkage long
1159compat_sys_sysinfo(struct compat_sysinfo __user *info)
1160{
1161 struct sysinfo s;
1162
1163 do_sysinfo(&s);
1164
1165 /* Check to see if any memory value is too large for 32-bit and scale
1166 * down if needed
1167 */
1168 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
1169 int bitcount = 0;
1170
1171 while (s.mem_unit < PAGE_SIZE) {
1172 s.mem_unit <<= 1;
1173 bitcount++;
1174 }
1175
1176 s.totalram >>= bitcount;
1177 s.freeram >>= bitcount;
1178 s.sharedram >>= bitcount;
1179 s.bufferram >>= bitcount;
1180 s.totalswap >>= bitcount;
1181 s.freeswap >>= bitcount;
1182 s.totalhigh >>= bitcount;
1183 s.freehigh >>= bitcount;
1184 }
1185
1186 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
1187 __put_user (s.uptime, &info->uptime) ||
1188 __put_user (s.loads[0], &info->loads[0]) ||
1189 __put_user (s.loads[1], &info->loads[1]) ||
1190 __put_user (s.loads[2], &info->loads[2]) ||
1191 __put_user (s.totalram, &info->totalram) ||
1192 __put_user (s.freeram, &info->freeram) ||
1193 __put_user (s.sharedram, &info->sharedram) ||
1194 __put_user (s.bufferram, &info->bufferram) ||
1195 __put_user (s.totalswap, &info->totalswap) ||
1196 __put_user (s.freeswap, &info->freeswap) ||
1197 __put_user (s.procs, &info->procs) ||
1198 __put_user (s.totalhigh, &info->totalhigh) ||
1199 __put_user (s.freehigh, &info->freehigh) ||
1200 __put_user (s.mem_unit, &info->mem_unit))
1201 return -EFAULT;
1202
1203 return 0;
1204}
1205
1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, 1122COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1207 compat_pid_t, pid, 1123 compat_pid_t, pid,
1208 struct compat_timespec __user *, interval) 1124 struct compat_timespec __user *, interval)
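
Most of the compat_sys_sysinfo() deleted above is field-by-field copying; the one piece of real logic is the 32-bit overflow handling, which doubles mem_unit until it reaches PAGE_SIZE and shifts every counter right by the same number of bits, so that counter * mem_unit (the byte count) is preserved. A freestanding sketch of that step for two of the fields, mirroring the deleted code (hypothetical helper, userspace types):

    #include <stdint.h>

    /* Scale 64-bit counters so they fit 32-bit compat_sysinfo fields. */
    static void scale_for_compat(uint64_t *totalram, uint64_t *totalswap,
                                 uint32_t *mem_unit, uint32_t page_size)
    {
            int bitcount = 0;

            if (!(*totalram >> 32) && !(*totalswap >> 32))
                    return;                 /* everything already fits */

            while (*mem_unit < page_size) {
                    *mem_unit <<= 1;
                    bitcount++;
            }
            *totalram >>= bitcount;
            *totalswap >>= bitcount;
    }
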
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
new file mode 100644
index 000000000000..59ab052ef7a0
--- /dev/null
+++ b/kernel/cpu/Makefile
@@ -0,0 +1 @@
1obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
new file mode 100644
index 000000000000..8b86c0c68edf
--- /dev/null
+++ b/kernel/cpu/idle.c
@@ -0,0 +1,116 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/tick.h>
7#include <linux/mm.h>
8
9#include <asm/tlb.h>
10
11#include <trace/events/power.h>
12
13static int __read_mostly cpu_idle_force_poll;
14
15void cpu_idle_poll_ctrl(bool enable)
16{
17 if (enable) {
18 cpu_idle_force_poll++;
19 } else {
20 cpu_idle_force_poll--;
21 WARN_ON_ONCE(cpu_idle_force_poll < 0);
22 }
23}
24
25#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
26static int __init cpu_idle_poll_setup(char *__unused)
27{
28 cpu_idle_force_poll = 1;
29 return 1;
30}
31__setup("nohlt", cpu_idle_poll_setup);
32
33static int __init cpu_idle_nopoll_setup(char *__unused)
34{
35 cpu_idle_force_poll = 0;
36 return 1;
37}
38__setup("hlt", cpu_idle_nopoll_setup);
39#endif
40
41static inline int cpu_idle_poll(void)
42{
43 trace_cpu_idle_rcuidle(0, smp_processor_id());
44 local_irq_enable();
45 while (!need_resched())
46 cpu_relax();
47 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
48 return 1;
49}
50
51/* Weak implementations for optional arch specific functions */
52void __weak arch_cpu_idle_prepare(void) { }
53void __weak arch_cpu_idle_enter(void) { }
54void __weak arch_cpu_idle_exit(void) { }
55void __weak arch_cpu_idle_dead(void) { }
56void __weak arch_cpu_idle(void)
57{
58 cpu_idle_force_poll = 1;
59}
60
61/*
62 * Generic idle loop implementation
63 */
64static void cpu_idle_loop(void)
65{
66 while (1) {
67 tick_nohz_idle_enter();
68
69 while (!need_resched()) {
70 check_pgt_cache();
71 rmb();
72
73 if (cpu_is_offline(smp_processor_id()))
74 arch_cpu_idle_dead();
75
76 local_irq_disable();
77 arch_cpu_idle_enter();
78
79 /*
80 * In poll mode we reenable interrupts and spin.
81 *
82 * Also if we detected in the wakeup from idle
83 * path that the tick broadcast device expired
84 * for us, we don't want to go deep idle as we
85 * know that the IPI is going to arrive right
86 * away
87 */
88 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
89 cpu_idle_poll();
90 } else {
91 current_clr_polling();
92 if (!need_resched()) {
93 stop_critical_timings();
94 rcu_idle_enter();
95 arch_cpu_idle();
96 WARN_ON_ONCE(irqs_disabled());
97 rcu_idle_exit();
98 start_critical_timings();
99 } else {
100 local_irq_enable();
101 }
102 current_set_polling();
103 }
104 arch_cpu_idle_exit();
105 }
106 tick_nohz_idle_exit();
107 schedule_preempt_disabled();
108 }
109}
110
111void cpu_startup_entry(enum cpuhp_state state)
112{
113 current_set_polling();
114 arch_cpu_idle_prepare();
115 cpu_idle_loop();
116}
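
The new file keeps all of the idle policy in the generic loop and leaves the architecture only the weak arch_cpu_idle*() hooks; note that the loop disables interrupts before calling arch_cpu_idle() and warns if they are still disabled on return. A hedged sketch of what an architecture override might look like (wait_for_interrupt() is a stand-in, not a real helper):

    /* Hypothetical architecture hook; a sketch, not taken from any real port.
     * The generic loop calls this with interrupts disabled and expects them
     * to be enabled again on return, typically as part of the wait itself. */
    void arch_cpu_idle(void)
    {
            wait_for_interrupt();   /* stand-in for hlt/wfi/mwait */
            local_irq_enable();
    }

Code that needs to keep CPUs out of deep idle for a while can instead bracket the window with cpu_idle_poll_ctrl(true) / cpu_idle_poll_ctrl(false), which is what the reference-counted cpu_idle_force_poll above provides.
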
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe43ecbd..12331120767c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);
265static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
266 266
267/* 267/*
268 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
269 * buffers. They are statically allocated to prevent using excess stack
270 * when calling cpuset_print_task_mems_allowed().
271 */
272#define CPUSET_NAME_LEN (128)
273#define CPUSET_NODELIST_LEN (256)
274static char cpuset_name[CPUSET_NAME_LEN];
275static char cpuset_nodelist[CPUSET_NODELIST_LEN];
276static DEFINE_SPINLOCK(cpuset_buffer_lock);
277
278/*
279 * CPU / memory hotplug is handled asynchronously. 268 * CPU / memory hotplug is handled asynchronously.
280 */ 269 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq; 270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
780 lockdep_assert_held(&cpuset_mutex); 769 lockdep_assert_held(&cpuset_mutex);
781 get_online_cpus(); 770 get_online_cpus();
782 771
772 /*
773 * We have raced with CPU hotplug. Don't do anything to avoid
774 * passing doms with offlined cpu to partition_sched_domains().
775 * Anyways, hotplug work item will rebuild sched domains.
776 */
777 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
778 goto out;
779
783 /* Generate domain masks and attrs */ 780 /* Generate domain masks and attrs */
784 ndoms = generate_sched_domains(&doms, &attr); 781 ndoms = generate_sched_domains(&doms, &attr);
785 782
786 /* Have scheduler rebuild the domains */ 783 /* Have scheduler rebuild the domains */
787 partition_sched_domains(ndoms, doms, attr); 784 partition_sched_domains(ndoms, doms, attr);
788 785out:
789 put_online_cpus(); 786 put_online_cpus();
790} 787}
791#else /* !CONFIG_SMP */ 788#else /* !CONFIG_SMP */
792static void rebuild_sched_domains_locked(void) 789static void rebuild_sched_domains_locked(void)
793{ 790{
794} 791}
795
796static int generate_sched_domains(cpumask_var_t **domains,
797 struct sched_domain_attr **attributes)
798{
799 *domains = NULL;
800 return 1;
801}
802#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
803 793
804void rebuild_sched_domains(void) 794void rebuild_sched_domains(void)
@@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1388 1378
1389 cgroup_taskset_for_each(task, cgrp, tset) { 1379 cgroup_taskset_for_each(task, cgrp, tset) {
1390 /* 1380 /*
1391 * Kthreads bound to specific cpus cannot be moved to a new 1381 * Kthreads which disallow setaffinity shouldn't be moved
1392 * cpuset; we cannot change their cpu affinity and 1382 * to a new cpuset; we don't want to change their cpu
1393 * isolating such threads by their set of allowed nodes is 1383 * affinity and isolating such threads by their set of
1394 * unnecessary. Thus, cpusets are not applicable for such 1384 * allowed nodes is unnecessary. Thus, cpusets are not
1395 * threads. This prevents checking for success of 1385 * applicable for such threads. This prevents checking for
1396 * set_cpus_allowed_ptr() on all attached tasks before 1386 * success of set_cpus_allowed_ptr() on all attached tasks
1397 * cpus_allowed may be changed. 1387 * before cpus_allowed may be changed.
1398 */ 1388 */
1399 ret = -EINVAL; 1389 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1390 if (task->flags & PF_NO_SETAFFINITY)
1401 goto out_unlock; 1391 goto out_unlock;
1402 ret = security_task_setscheduler(task); 1392 ret = security_task_setscheduler(task);
1403 if (ret) 1393 if (ret)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
2005 return 0; 1995 return 0;
2006} 1996}
2007 1997
2008/**
2009 * cpuset_do_move_task - move a given task to another cpuset
2010 * @tsk: pointer to task_struct the task to move
2011 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
2012 *
2013 * Called by cgroup_scan_tasks() for each task in a cgroup.
2014 * Return nonzero to stop the walk through the tasks.
2015 */
2016static void cpuset_do_move_task(struct task_struct *tsk,
2017 struct cgroup_scanner *scan)
2018{
2019 struct cgroup *new_cgroup = scan->data;
2020
2021 cgroup_lock();
2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
2024}
2025
2026/**
2027 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
2028 * @from: cpuset in which the tasks currently reside
2029 * @to: cpuset to which the tasks will be moved
2030 *
2031 * Called with cpuset_mutex held
2032 * callback_mutex must not be held, as cpuset_attach() will take it.
2033 *
2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
2035 * calling callback functions for each.
2036 */
2037static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2038{
2039 struct cgroup_scanner scan;
2040
2041 scan.cg = from->css.cgroup;
2042 scan.test_task = NULL; /* select all tasks in cgroup */
2043 scan.process_task = cpuset_do_move_task;
2044 scan.heap = NULL;
2045 scan.data = to->css.cgroup;
2046
2047 if (cgroup_scan_tasks(&scan))
2048 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2049 "cgroup_scan_tasks failed\n");
2050}
2051
2052/* 1998/*
2053 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 1999 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2054 * or memory nodes, we need to walk over the cpuset hierarchy, 2000 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2069 nodes_empty(parent->mems_allowed)) 2015 nodes_empty(parent->mems_allowed))
2070 parent = parent_cs(parent); 2016 parent = parent_cs(parent);
2071 2017
2072 move_member_tasks_to_cpuset(cs, parent); 2018 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2019 rcu_read_lock();
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
2021 cgroup_name(cs->css.cgroup));
2022 rcu_read_unlock();
2023 }
2073} 2024}
2074 2025
2075/** 2026/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2222 flush_workqueue(cpuset_propagate_hotplug_wq); 2173 flush_workqueue(cpuset_propagate_hotplug_wq);
2223 2174
2224 /* rebuild sched domains if cpus_allowed has changed */ 2175 /* rebuild sched domains if cpus_allowed has changed */
2225 if (cpus_updated) { 2176 if (cpus_updated)
2226 struct sched_domain_attr *attr; 2177 rebuild_sched_domains();
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2235 }
2236} 2178}
2237 2179
2238void cpuset_update_active_cpus(bool cpu_online) 2180void cpuset_update_active_cpus(bool cpu_online)
@@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)
2251 schedule_work(&cpuset_hotplug_work); 2193 schedule_work(&cpuset_hotplug_work);
2252} 2194}
2253 2195
2254#ifdef CONFIG_MEMORY_HOTPLUG
2255/* 2196/*
2256 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2197 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2257 * Call this routine anytime after node_states[N_MEMORY] changes. 2198 * Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2263 schedule_work(&cpuset_hotplug_work); 2204 schedule_work(&cpuset_hotplug_work);
2264 return NOTIFY_OK; 2205 return NOTIFY_OK;
2265} 2206}
2266#endif 2207
2208static struct notifier_block cpuset_track_online_nodes_nb = {
2209 .notifier_call = cpuset_track_online_nodes,
2210 .priority = 10, /* ??! */
2211};
2267 2212
2268/** 2213/**
2269 * cpuset_init_smp - initialize cpus_allowed 2214 * cpuset_init_smp - initialize cpus_allowed
2270 * 2215 *
2271 * Description: Finish top cpuset after cpu, node maps are initialized 2216 * Description: Finish top cpuset after cpu, node maps are initialized
2272 **/ 2217 */
2273
2274void __init cpuset_init_smp(void) 2218void __init cpuset_init_smp(void)
2275{ 2219{
2276 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2277 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2221 top_cpuset.mems_allowed = node_states[N_MEMORY];
2278 2222
2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2280 2224
2281 cpuset_propagate_hotplug_wq = 2225 cpuset_propagate_hotplug_wq =
2282 alloc_ordered_workqueue("cpuset_hotplug", 0); 2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
@@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2592 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2536 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2593} 2537}
2594 2538
2539#define CPUSET_NODELIST_LEN (256)
2540
2595/** 2541/**
2596 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2542 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2597 * @task: pointer to task_struct of some task. 2543 * @task: pointer to task_struct of some task.
@@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2602 */ 2548 */
2603void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2549void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2604{ 2550{
2605 struct dentry *dentry; 2551 /* Statically allocated to prevent using excess stack. */
2552 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2553 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2606 2554
2607 dentry = task_cs(tsk)->css.cgroup->dentry; 2555 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2608 spin_lock(&cpuset_buffer_lock);
2609 2556
2610 if (!dentry) { 2557 rcu_read_lock();
2611 strcpy(cpuset_name, "/"); 2558 spin_lock(&cpuset_buffer_lock);
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618 2559
2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2560 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2620 tsk->mems_allowed); 2561 tsk->mems_allowed);
2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2562 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2622 tsk->comm, cpuset_name, cpuset_nodelist); 2563 tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
2564
2623 spin_unlock(&cpuset_buffer_lock); 2565 spin_unlock(&cpuset_buffer_lock);
2566 rcu_read_unlock();
2624} 2567}
2625 2568
2626/* 2569/*
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index c26278fd4851..0506d447aed2 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key)
775 775
776static struct sysrq_key_op sysrq_dbg_op = { 776static struct sysrq_key_op sysrq_dbg_op = {
777 .handler = sysrq_handle_dbg, 777 .handler = sysrq_handle_dbg,
778 .help_msg = "debug(G)", 778 .help_msg = "debug(g)",
779 .action_msg = "DEBUG", 779 .action_msg = "DEBUG",
780}; 780};
781#endif 781#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d3124b39277..3820e3cefbae 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -37,6 +37,7 @@
37#include <linux/ftrace_event.h> 37#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 38#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h> 39#include <linux/mm_types.h>
40#include <linux/cgroup.h>
40 41
41#include "internal.h" 42#include "internal.h"
42 43
@@ -234,6 +235,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
234#ifdef CONFIG_CGROUP_PERF 235#ifdef CONFIG_CGROUP_PERF
235 236
236/* 237/*
238 * perf_cgroup_info keeps track of time_enabled for a cgroup.
239 * This is a per-cpu dynamically allocated data structure.
240 */
241struct perf_cgroup_info {
242 u64 time;
243 u64 timestamp;
244};
245
246struct perf_cgroup {
247 struct cgroup_subsys_state css;
248 struct perf_cgroup_info __percpu *info;
249};
250
251/*
237 * Must ensure cgroup is pinned (css_get) before calling 252 * Must ensure cgroup is pinned (css_get) before calling
238 * this function. In other words, we cannot call this function 253 * this function. In other words, we cannot call this function
239 * if there is no cgroup event for the current CPU context. 254 * if there is no cgroup event for the current CPU context.
@@ -251,7 +266,22 @@ perf_cgroup_match(struct perf_event *event)
251 struct perf_event_context *ctx = event->ctx; 266 struct perf_event_context *ctx = event->ctx;
252 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 267 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
253 268
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 269 /* @event doesn't care about cgroup */
270 if (!event->cgrp)
271 return true;
272
273 /* wants specific cgroup scope but @cpuctx isn't associated with any */
274 if (!cpuctx->cgrp)
275 return false;
276
277 /*
278 * Cgroup scoping is recursive. An event enabled for a cgroup is
279 * also enabled for all its descendant cgroups. If @cpuctx's
280 * cgroup is a descendant of @event's (the test covers identity
281 * case), it's a match.
282 */
283 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
284 event->cgrp->css.cgroup);
255} 285}
256 286
257static inline bool perf_tryget_cgroup(struct perf_event *event) 287static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -961,9 +991,15 @@ static void perf_event__header_size(struct perf_event *event)
961 if (sample_type & PERF_SAMPLE_PERIOD) 991 if (sample_type & PERF_SAMPLE_PERIOD)
962 size += sizeof(data->period); 992 size += sizeof(data->period);
963 993
994 if (sample_type & PERF_SAMPLE_WEIGHT)
995 size += sizeof(data->weight);
996
964 if (sample_type & PERF_SAMPLE_READ) 997 if (sample_type & PERF_SAMPLE_READ)
965 size += event->read_size; 998 size += event->read_size;
966 999
1000 if (sample_type & PERF_SAMPLE_DATA_SRC)
1001 size += sizeof(data->data_src.val);
1002
967 event->header_size = size; 1003 event->header_size = size;
968} 1004}
969 1005
@@ -4178,6 +4214,12 @@ void perf_output_sample(struct perf_output_handle *handle,
4178 perf_output_sample_ustack(handle, 4214 perf_output_sample_ustack(handle,
4179 data->stack_user_size, 4215 data->stack_user_size,
4180 data->regs_user.regs); 4216 data->regs_user.regs);
4217
4218 if (sample_type & PERF_SAMPLE_WEIGHT)
4219 perf_output_put(handle, data->weight);
4220
4221 if (sample_type & PERF_SAMPLE_DATA_SRC)
4222 perf_output_put(handle, data->data_src.val);
4181} 4223}
4182 4224
4183void perf_prepare_sample(struct perf_event_header *header, 4225void perf_prepare_sample(struct perf_event_header *header,
@@ -4596,6 +4638,7 @@ void perf_event_comm(struct task_struct *task)
4596 struct perf_event_context *ctx; 4638 struct perf_event_context *ctx;
4597 int ctxn; 4639 int ctxn;
4598 4640
4641 rcu_read_lock();
4599 for_each_task_context_nr(ctxn) { 4642 for_each_task_context_nr(ctxn) {
4600 ctx = task->perf_event_ctxp[ctxn]; 4643 ctx = task->perf_event_ctxp[ctxn];
4601 if (!ctx) 4644 if (!ctx)
@@ -4603,6 +4646,7 @@ void perf_event_comm(struct task_struct *task)
4603 4646
4604 perf_event_enable_on_exec(ctx); 4647 perf_event_enable_on_exec(ctx);
4605 } 4648 }
4649 rcu_read_unlock();
4606 4650
4607 if (!atomic_read(&nr_comm_events)) 4651 if (!atomic_read(&nr_comm_events))
4608 return; 4652 return;
@@ -4765,6 +4809,9 @@ got_name:
4765 mmap_event->file_name = name; 4809 mmap_event->file_name = name;
4766 mmap_event->file_size = size; 4810 mmap_event->file_size = size;
4767 4811
4812 if (!(vma->vm_flags & VM_EXEC))
4813 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
4814
4768 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4815 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4769 4816
4770 rcu_read_lock(); 4817 rcu_read_lock();
@@ -7515,12 +7562,5 @@ struct cgroup_subsys perf_subsys = {
7515 .css_free = perf_cgroup_css_free, 7562 .css_free = perf_cgroup_css_free,
7516 .exit = perf_cgroup_exit, 7563 .exit = perf_cgroup_exit,
7517 .attach = perf_cgroup_attach, 7564 .attach = perf_cgroup_attach,
7518
7519 /*
7520 * perf_event cgroup doesn't handle nesting correctly.
7521 * ctx->nr_cgroups adjustments should be propagated through the
7522 * cgroup hierarchy. Fix it and remove the following.
7523 */
7524 .broken_hierarchy = true,
7525}; 7565};
7526#endif /* CONFIG_CGROUP_PERF */ 7566#endif /* CONFIG_CGROUP_PERF */
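
The rewritten perf_cgroup_match() above, together with the dropped broken_hierarchy flag, makes cgroup-scoped events recursive: an event attached to cgroup A also counts while the CPU is running a task in any descendant of A. A compact restatement of that rule as a hypothetical helper (the authoritative logic is the hunk above; cgroup_is_descendant() is the cgroup-level API it calls through css.cgroup):

    /* Hypothetical distillation of the matching rule, not kernel code. */
    static bool cgroup_event_matches(struct cgroup *event_cgrp,
                                     struct cgroup *cpu_cgrp)
    {
            if (!event_cgrp)                /* event is not cgroup scoped */
                    return true;
            if (!cpu_cgrp)                  /* CPU context has no cgroup */
                    return false;
            /* identity counts: a cgroup is its own descendant here */
            return cgroup_is_descendant(cpu_cgrp, event_cgrp);
    }
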
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a567c8c7ef31..f3569747d629 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -75,6 +75,15 @@ struct uprobe {
75 struct arch_uprobe arch; 75 struct arch_uprobe arch;
76}; 76};
77 77
78struct return_instance {
79 struct uprobe *uprobe;
80 unsigned long func;
81 unsigned long orig_ret_vaddr; /* original return address */
82 bool chained; /* true, if instance is nested */
83
84 struct return_instance *next; /* keep as stack */
85};
86
78/* 87/*
79 * valid_vma: Verify if the specified vma is an executable vma 88 * valid_vma: Verify if the specified vma is an executable vma
80 * Relax restrictions while unregistering: vm_flags might have 89 * Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
173 return *insn == UPROBE_SWBP_INSN; 182 return *insn == UPROBE_SWBP_INSN;
174} 183}
175 184
176static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) 185/**
186 * is_trap_insn - check if instruction is breakpoint instruction.
187 * @insn: instruction to be checked.
188 * Default implementation of is_trap_insn
189 * Returns true if @insn is a breakpoint instruction.
190 *
191 * This function is needed for the case where an architecture has multiple
192 * trap instructions (like powerpc).
193 */
194bool __weak is_trap_insn(uprobe_opcode_t *insn)
195{
196 return is_swbp_insn(insn);
197}
198
199static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
177{ 200{
178 void *kaddr = kmap_atomic(page); 201 void *kaddr = kmap_atomic(page);
179 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); 202 memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
203 kunmap_atomic(kaddr);
204}
205
206static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
207{
208 void *kaddr = kmap_atomic(page);
209 memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
180 kunmap_atomic(kaddr); 210 kunmap_atomic(kaddr);
181} 211}
182 212
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
185 uprobe_opcode_t old_opcode; 215 uprobe_opcode_t old_opcode;
186 bool is_swbp; 216 bool is_swbp;
187 217
188 copy_opcode(page, vaddr, &old_opcode); 218 /*
219 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
220 * We do not check if it is any other 'trap variant' which could
221 * be conditional trap instruction such as the one powerpc supports.
222 *
223 * The logic is that we do not care if the underlying instruction
224 * is a trap variant; uprobes always wins over any other (gdb)
225 * breakpoint.
226 */
227 copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
189 is_swbp = is_swbp_insn(&old_opcode); 228 is_swbp = is_swbp_insn(&old_opcode);
190 229
191 if (is_swbp_insn(new_opcode)) { 230 if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
204 * Expect the breakpoint instruction to be the smallest size instruction for 243 * Expect the breakpoint instruction to be the smallest size instruction for
205 * the architecture. If an arch has variable length instruction and the 244 * the architecture. If an arch has variable length instruction and the
206 * breakpoint instruction is not of the smallest length instruction 245 * breakpoint instruction is not of the smallest length instruction
207 * supported by that architecture then we need to modify is_swbp_at_addr and 246 * supported by that architecture then we need to modify is_trap_at_addr and
208 * write_opcode accordingly. This would never be a problem for archs that 247 * write_opcode accordingly. This would never be a problem for archs that
209 * have fixed length instructions. 248 * have fixed length instructions.
210 */ 249 */
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
225 uprobe_opcode_t opcode) 264 uprobe_opcode_t opcode)
226{ 265{
227 struct page *old_page, *new_page; 266 struct page *old_page, *new_page;
228 void *vaddr_old, *vaddr_new;
229 struct vm_area_struct *vma; 267 struct vm_area_struct *vma;
230 int ret; 268 int ret;
231 269
@@ -246,15 +284,8 @@ retry:
246 284
247 __SetPageUptodate(new_page); 285 __SetPageUptodate(new_page);
248 286
249 /* copy the page now that we've got it stable */ 287 copy_highpage(new_page, old_page);
250 vaddr_old = kmap_atomic(old_page); 288 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
251 vaddr_new = kmap_atomic(new_page);
252
253 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
254 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
255
256 kunmap_atomic(vaddr_new);
257 kunmap_atomic(vaddr_old);
258 289
259 ret = anon_vma_prepare(vma); 290 ret = anon_vma_prepare(vma);
260 if (ret) 291 if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
477 unsigned long nbytes, loff_t offset) 508 unsigned long nbytes, loff_t offset)
478{ 509{
479 struct page *page; 510 struct page *page;
480 void *vaddr;
481 unsigned long off;
482 pgoff_t idx;
483
484 if (!filp)
485 return -EINVAL;
486 511
487 if (!mapping->a_ops->readpage) 512 if (!mapping->a_ops->readpage)
488 return -EIO; 513 return -EIO;
489
490 idx = offset >> PAGE_CACHE_SHIFT;
491 off = offset & ~PAGE_MASK;
492
493 /* 514 /*
494 * Ensure that the page that has the original instruction is 515 * Ensure that the page that has the original instruction is
495 * populated and in page-cache. 516 * populated and in page-cache.
496 */ 517 */
497 page = read_mapping_page(mapping, idx, filp); 518 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
498 if (IS_ERR(page)) 519 if (IS_ERR(page))
499 return PTR_ERR(page); 520 return PTR_ERR(page);
500 521
501 vaddr = kmap_atomic(page); 522 copy_from_page(page, offset, insn, nbytes);
502 memcpy(insn, vaddr + off, nbytes);
503 kunmap_atomic(vaddr);
504 page_cache_release(page); 523 page_cache_release(page);
505 524
506 return 0; 525 return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
550 goto out; 569 goto out;
551 570
552 ret = -ENOTSUPP; 571 ret = -ENOTSUPP;
553 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
554 goto out; 573 goto out;
555 574
556 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
758 down_write(&mm->mmap_sem); 777 down_write(&mm->mmap_sem);
759 vma = find_vma(mm, info->vaddr); 778 vma = find_vma(mm, info->vaddr);
760 if (!vma || !valid_vma(vma, is_register) || 779 if (!vma || !valid_vma(vma, is_register) ||
761 vma->vm_file->f_mapping->host != uprobe->inode) 780 file_inode(vma->vm_file) != uprobe->inode)
762 goto unlock; 781 goto unlock;
763 782
764 if (vma->vm_start > info->vaddr || 783 if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
828 struct uprobe *uprobe; 847 struct uprobe *uprobe;
829 int ret; 848 int ret;
830 849
850 /* Uprobe must have at least one set consumer */
851 if (!uc->handler && !uc->ret_handler)
852 return -EINVAL;
853
831 /* Racy, just to catch the obvious mistakes */ 854 /* Racy, just to catch the obvious mistakes */
832 if (offset > i_size_read(inode)) 855 if (offset > i_size_read(inode))
833 return -EINVAL; 856 return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
917 loff_t offset; 940 loff_t offset;
918 941
919 if (!valid_vma(vma, false) || 942 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode) 943 file_inode(vma->vm_file) != uprobe->inode)
921 continue; 944 continue;
922 945
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 946 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
1010 if (no_uprobe_events() || !valid_vma(vma, true)) 1033 if (no_uprobe_events() || !valid_vma(vma, true))
1011 return 0; 1034 return 0;
1012 1035
1013 inode = vma->vm_file->f_mapping->host; 1036 inode = file_inode(vma->vm_file);
1014 if (!inode) 1037 if (!inode)
1015 return 0; 1038 return 0;
1016 1039
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1041 struct inode *inode; 1064 struct inode *inode;
1042 struct rb_node *n; 1065 struct rb_node *n;
1043 1066
1044 inode = vma->vm_file->f_mapping->host; 1067 inode = file_inode(vma->vm_file);
1045 1068
1046 min = vaddr_to_offset(vma, start); 1069 min = vaddr_to_offset(vma, start);
1047 max = min + (end - start) - 1; 1070 max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
1114{ 1137{
1115 struct mm_struct *mm = current->mm; 1138 struct mm_struct *mm = current->mm;
1116 struct xol_area *area; 1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1117 1141
1118 area = mm->uprobes_state.xol_area; 1142 area = mm->uprobes_state.xol_area;
1119 if (area) 1143 if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
1131 if (!area->page) 1155 if (!area->page)
1132 goto free_bitmap; 1156 goto free_bitmap;
1133 1157
1158 /* allocate first slot of task's xol_area for the return probes */
1159 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1);
1134 init_waitqueue_head(&area->wq); 1162 init_waitqueue_head(&area->wq);
1163
1135 if (!xol_add_vma(area)) 1164 if (!xol_add_vma(area))
1136 return area; 1165 return area;
1137 1166
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe) 1245static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1217{ 1246{
1218 struct xol_area *area; 1247 struct xol_area *area;
1219 unsigned long offset;
1220 unsigned long xol_vaddr; 1248 unsigned long xol_vaddr;
1221 void *vaddr;
1222 1249
1223 area = get_xol_area(); 1250 area = get_xol_area();
1224 if (!area) 1251 if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1229 return 0; 1256 return 0;
1230 1257
1231 /* Initialize the slot */ 1258 /* Initialize the slot */
1232 offset = xol_vaddr & ~PAGE_MASK; 1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
1233 vaddr = kmap_atomic(area->page);
1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1235 kunmap_atomic(vaddr);
1236 /* 1260 /*
1237 * We probably need flush_icache_user_range() but it needs vma. 1261 * We probably need flush_icache_user_range() but it needs vma.
1238 * This should work on supported architectures too. 1262 * This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1298void uprobe_free_utask(struct task_struct *t) 1322void uprobe_free_utask(struct task_struct *t)
1299{ 1323{
1300 struct uprobe_task *utask = t->utask; 1324 struct uprobe_task *utask = t->utask;
1325 struct return_instance *ri, *tmp;
1301 1326
1302 if (!utask) 1327 if (!utask)
1303 return; 1328 return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
1305 if (utask->active_uprobe) 1330 if (utask->active_uprobe)
1306 put_uprobe(utask->active_uprobe); 1331 put_uprobe(utask->active_uprobe);
1307 1332
1333 ri = utask->return_instances;
1334 while (ri) {
1335 tmp = ri;
1336 ri = ri->next;
1337
1338 put_uprobe(tmp->uprobe);
1339 kfree(tmp);
1340 }
1341
1308 xol_free_insn_slot(t); 1342 xol_free_insn_slot(t);
1309 kfree(utask); 1343 kfree(utask);
1310 t->utask = NULL; 1344 t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
1333 return current->utask; 1367 return current->utask;
1334} 1368}
1335 1369
1370/*
1371 * Current area->vaddr notion assume the trampoline address is always
1372 * equal area->vaddr.
1373 *
1374 * Returns -1 in case the xol_area is not allocated.
1375 */
1376static unsigned long get_trampoline_vaddr(void)
1377{
1378 struct xol_area *area;
1379 unsigned long trampoline_vaddr = -1;
1380
1381 area = current->mm->uprobes_state.xol_area;
1382 smp_read_barrier_depends();
1383 if (area)
1384 trampoline_vaddr = area->vaddr;
1385
1386 return trampoline_vaddr;
1387}
1388
1389static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1390{
1391 struct return_instance *ri;
1392 struct uprobe_task *utask;
1393 unsigned long orig_ret_vaddr, trampoline_vaddr;
1394 bool chained = false;
1395
1396 if (!get_xol_area())
1397 return;
1398
1399 utask = get_utask();
1400 if (!utask)
1401 return;
1402
1403 if (utask->depth >= MAX_URETPROBE_DEPTH) {
1404 printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1405 " nestedness limit pid/tgid=%d/%d\n",
1406 current->pid, current->tgid);
1407 return;
1408 }
1409
1410 ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
1411 if (!ri)
1412 goto fail;
1413
1414 trampoline_vaddr = get_trampoline_vaddr();
1415 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1416 if (orig_ret_vaddr == -1)
1417 goto fail;
1418
1419 /*
1420 * We don't want to keep trampoline address in stack, rather keep the
1421 * original return address of first caller thru all the consequent
1422 * instances. This also makes breakpoint unwrapping easier.
1423 */
1424 if (orig_ret_vaddr == trampoline_vaddr) {
1425 if (!utask->return_instances) {
1426 /*
1427 * This situation is not possible. Likely we have an
1428 * attack from user-space.
1429 */
1430 pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
1431 current->pid, current->tgid);
1432 goto fail;
1433 }
1434
1435 chained = true;
1436 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1437 }
1438
1439 atomic_inc(&uprobe->ref);
1440 ri->uprobe = uprobe;
1441 ri->func = instruction_pointer(regs);
1442 ri->orig_ret_vaddr = orig_ret_vaddr;
1443 ri->chained = chained;
1444
1445 utask->depth++;
1446
1447 /* add instance to the stack */
1448 ri->next = utask->return_instances;
1449 utask->return_instances = ri;
1450
1451 return;
1452
1453 fail:
1454 kfree(ri);
1455}
1456
1336/* Prepare to single-step probed instruction out of line. */ 1457/* Prepare to single-step probed instruction out of line. */
1337static int 1458static int
1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) 1459pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1431 clear_bit(MMF_HAS_UPROBES, &mm->flags); 1552 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1432} 1553}
1433 1554
1434static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) 1555static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1435{ 1556{
1436 struct page *page; 1557 struct page *page;
1437 uprobe_opcode_t opcode; 1558 uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1449 if (result < 0) 1570 if (result < 0)
1450 return result; 1571 return result;
1451 1572
1452 copy_opcode(page, vaddr, &opcode); 1573 copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1453 put_page(page); 1574 put_page(page);
1454 out: 1575 out:
1455 return is_swbp_insn(&opcode); 1576 /* This needs to return true for any variant of the trap insn */
1577 return is_trap_insn(&opcode);
1456} 1578}
1457 1579
1458static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1580static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1465 vma = find_vma(mm, bp_vaddr); 1587 vma = find_vma(mm, bp_vaddr);
1466 if (vma && vma->vm_start <= bp_vaddr) { 1588 if (vma && vma->vm_start <= bp_vaddr) {
1467 if (valid_vma(vma, false)) { 1589 if (valid_vma(vma, false)) {
1468 struct inode *inode = vma->vm_file->f_mapping->host; 1590 struct inode *inode = file_inode(vma->vm_file);
1469 loff_t offset = vaddr_to_offset(vma, bp_vaddr); 1591 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1470 1592
1471 uprobe = find_uprobe(inode, offset); 1593 uprobe = find_uprobe(inode, offset);
1472 } 1594 }
1473 1595
1474 if (!uprobe) 1596 if (!uprobe)
1475 *is_swbp = is_swbp_at_addr(mm, bp_vaddr); 1597 *is_swbp = is_trap_at_addr(mm, bp_vaddr);
1476 } else { 1598 } else {
1477 *is_swbp = -EFAULT; 1599 *is_swbp = -EFAULT;
1478 } 1600 }
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{ 1610{
1489 struct uprobe_consumer *uc; 1611 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE; 1612 int remove = UPROBE_HANDLER_REMOVE;
1613 bool need_prep = false; /* prepare return uprobe, when needed */
1491 1614
1492 down_read(&uprobe->register_rwsem); 1615 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) { 1616 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs); 1617 int rc = 0;
1618
1619 if (uc->handler) {
1620 rc = uc->handler(uc, regs);
1621 WARN(rc & ~UPROBE_HANDLER_MASK,
1622 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1623 }
1624
1625 if (uc->ret_handler)
1626 need_prep = true;
1495 1627
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc; 1628 remove &= rc;
1499 } 1629 }
1500 1630
1631 if (need_prep && !remove)
1632 prepare_uretprobe(uprobe, regs); /* put bp at return */
1633
1501 if (remove && uprobe->consumers) { 1634 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe)); 1635 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm); 1636 unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1505 up_read(&uprobe->register_rwsem); 1638 up_read(&uprobe->register_rwsem);
1506} 1639}
1507 1640
1641static void
1642handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1643{
1644 struct uprobe *uprobe = ri->uprobe;
1645 struct uprobe_consumer *uc;
1646
1647 down_read(&uprobe->register_rwsem);
1648 for (uc = uprobe->consumers; uc; uc = uc->next) {
1649 if (uc->ret_handler)
1650 uc->ret_handler(uc, ri->func, regs);
1651 }
1652 up_read(&uprobe->register_rwsem);
1653}
1654
1655static bool handle_trampoline(struct pt_regs *regs)
1656{
1657 struct uprobe_task *utask;
1658 struct return_instance *ri, *tmp;
1659 bool chained;
1660
1661 utask = current->utask;
1662 if (!utask)
1663 return false;
1664
1665 ri = utask->return_instances;
1666 if (!ri)
1667 return false;
1668
1669 /*
1670 * TODO: we should throw out return_instance's invalidated by
1671 * longjmp(), currently we assume that the probed function always
1672 * returns.
1673 */
1674 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1675
1676 for (;;) {
1677 handle_uretprobe_chain(ri, regs);
1678
1679 chained = ri->chained;
1680 put_uprobe(ri->uprobe);
1681
1682 tmp = ri;
1683 ri = ri->next;
1684 kfree(tmp);
1685
1686 if (!chained)
1687 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri);
1692 }
1693
1694 utask->return_instances = ri;
1695
1696 return true;
1697}
1698
1508/* 1699/*
1509 * Run handler and ask thread to singlestep. 1700 * Run handler and ask thread to singlestep.
1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1701 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
1516 int uninitialized_var(is_swbp); 1707 int uninitialized_var(is_swbp);
1517 1708
1518 bp_vaddr = uprobe_get_swbp_addr(regs); 1709 bp_vaddr = uprobe_get_swbp_addr(regs);
1519 uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 1710 if (bp_vaddr == get_trampoline_vaddr()) {
1711 if (handle_trampoline(regs))
1712 return;
1713
1714 pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1715 current->pid, current->tgid);
1716 }
1520 1717
1718 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1521 if (!uprobe) { 1719 if (!uprobe) {
1522 if (is_swbp > 0) { 1720 if (is_swbp > 0) {
1523 /* No matching uprobe; signal SIGTRAP. */ 1721 /* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
1616 */ 1814 */
1617int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1815int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1618{ 1816{
1619 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) 1817 if (!current->mm)
1818 return 0;
1819
1820 if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
1821 (!current->utask || !current->utask->return_instances))
1620 return 0; 1822 return 0;
1621 1823
1622 set_thread_flag(TIF_UPROBE); 1824 set_thread_flag(TIF_UPROBE);
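
The uretprobe support above is driven entirely through the existing consumer interface: uprobe_register() now insists on at least one of ->handler / ->ret_handler, and a non-NULL ->ret_handler is what makes handler_chain() queue a return_instance for the trampoline. A sketch of a consumer using both hooks; the prototypes follow the calls visible in handler_chain() and handle_uretprobe_chain() above, the names are illustrative, and the inode/offset lookup for the probed function is elided:

    static int my_entry(struct uprobe_consumer *uc, struct pt_regs *regs)
    {
            pr_info("entry, ip=%lx\n", instruction_pointer(regs));
            return 0;                       /* keep the probe installed */
    }

    static int my_return(struct uprobe_consumer *uc, unsigned long func,
                         struct pt_regs *regs)
    {
            pr_info("return from function at %lx\n", func);
            return 0;
    }

    static struct uprobe_consumer my_consumer = {
            .handler        = my_entry,
            .ret_handler    = my_return,
    };

    /* err = uprobe_register(inode, offset, &my_consumer); */
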
diff --git a/kernel/exit.c b/kernel/exit.c
index 60bc027c61c3..6e3151ec900f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1629 } 1629 }
1630 1630
1631 put_pid(pid); 1631 put_pid(pid);
1632
1633 /* avoid REGPARM breakage on x86: */
1634 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1635 return ret; 1632 return ret;
1636} 1633}
1637 1634
@@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1669 ret = do_wait(&wo); 1666 ret = do_wait(&wo);
1670 put_pid(pid); 1667 put_pid(pid);
1671 1668
1672 /* avoid REGPARM breakage on x86: */
1673 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1674 return ret; 1669 return ret;
1675} 1670}
1676 1671
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf76..67460b93b1a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) 44 if (main_extable_sort_needed) {
45 pr_notice("Sorting __ex_table...\n");
45 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
46 else 47 }
47 pr_notice("__ex_table already sorted, skipping sort\n");
48} 48}
49 49
50/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d324d5e3..7d40687b1434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 1233
1234 p->utime = p->stime = p->gtime = 0; 1234 p->utime = p->stime = p->gtime = 0;
1235 p->utimescaled = p->stimescaled = 0; 1235 p->utimescaled = p->stimescaled = 0;
1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1237 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1238#endif 1238#endif
1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1677,10 +1677,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1677 int, tls_val) 1677 int, tls_val)
1678#endif 1678#endif
1679{ 1679{
1680 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); 1680 return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1681 asmlinkage_protect(5, ret, clone_flags, newsp,
1682 parent_tidptr, child_tidptr, tls_val);
1683 return ret;
1684} 1681}
1685#endif 1682#endif
1686 1683
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14be27feda49..609d8ff38b74 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -84,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
84 .get_time = &ktime_get_boottime, 84 .get_time = &ktime_get_boottime,
85 .resolution = KTIME_LOW_RES, 85 .resolution = KTIME_LOW_RES,
86 }, 86 },
87 {
88 .index = HRTIMER_BASE_TAI,
89 .clockid = CLOCK_TAI,
90 .get_time = &ktime_get_clocktai,
91 .resolution = KTIME_LOW_RES,
92 },
87 } 93 }
88}; 94};
89 95
@@ -91,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
91 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 97 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
92 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 98 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
93 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 99 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
100 [CLOCK_TAI] = HRTIMER_BASE_TAI,
94}; 101};
95 102
96static inline int hrtimer_clockid_to_base(clockid_t clock_id) 103static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -107,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
107{ 114{
108 ktime_t xtim, mono, boot; 115 ktime_t xtim, mono, boot;
109 struct timespec xts, tom, slp; 116 struct timespec xts, tom, slp;
117 s32 tai_offset;
110 118
111 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 119 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
120 tai_offset = timekeeping_get_tai_offset();
112 121
113 xtim = timespec_to_ktime(xts); 122 xtim = timespec_to_ktime(xts);
114 mono = ktime_add(xtim, timespec_to_ktime(tom)); 123 mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -116,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
116 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 125 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
117 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 126 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
118 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 127 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
128 base->clock_base[HRTIMER_BASE_TAI].softirq_time =
129 ktime_add(xtim, ktime_set(tai_offset, 0));
119} 130}
120 131
121/* 132/*
@@ -276,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
276 } else { 287 } else {
277 unsigned long rem = do_div(nsec, NSEC_PER_SEC); 288 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
278 289
290 /* Make sure nsec fits into long */
291 if (unlikely(nsec > KTIME_SEC_MAX))
292 return (ktime_t){ .tv64 = KTIME_MAX };
293
279 tmp = ktime_set((long)nsec, rem); 294 tmp = ktime_set((long)nsec, rem);
280 } 295 }
281 296
@@ -652,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
652{ 667{
653 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 668 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
654 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 669 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
670 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
655 671
656 return ktime_get_update_offsets(offs_real, offs_boot); 672 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
657} 673}
658 674
659/* 675/*
@@ -1011,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1011 * @timer: the timer to be added 1027 * @timer: the timer to be added
1012 * @tim: expiry time 1028 * @tim: expiry time
1013 * @delta_ns: "slack" range for the timer 1029 * @delta_ns: "slack" range for the timer
1014 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1030 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1031 * relative (HRTIMER_MODE_REL)
1015 * 1032 *
1016 * Returns: 1033 * Returns:
1017 * 0 on success 1034 * 0 on success
@@ -1028,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1028 * hrtimer_start - (re)start an hrtimer on the current CPU 1045 * hrtimer_start - (re)start an hrtimer on the current CPU
1029 * @timer: the timer to be added 1046 * @timer: the timer to be added
1030 * @tim: expiry time 1047 * @tim: expiry time
1031 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1048 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1049 * relative (HRTIMER_MODE_REL)
1032 * 1050 *
1033 * Returns: 1051 * Returns:
1034 * 0 on success 1052 * 0 on success
@@ -1310,6 +1328,8 @@ retry:
1310 1328
1311 expires = ktime_sub(hrtimer_get_expires(timer), 1329 expires = ktime_sub(hrtimer_get_expires(timer),
1312 base->offset); 1330 base->offset);
1331 if (expires.tv64 < 0)
1332 expires.tv64 = KTIME_MAX;
1313 if (expires.tv64 < expires_next.tv64) 1333 if (expires.tv64 < expires_next.tv64)
1314 expires_next = expires; 1334 expires_next = expires;
1315 break; 1335 break;
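The hrtimer hunks above wire in a fourth clock base, HRTIMER_BASE_TAI, backed by the new CLOCK_TAI clock id and the timekeeping TAI offset. As a quick illustration (not part of the patch), userspace can read the new clock on a kernel carrying these changes; CLOCK_TAI may be missing from older libc headers, so the fallback define of 11 below is an assumption about the build environment, and older glibc needs -lrt for clock_gettime.

/* Illustrative only: compare CLOCK_TAI and CLOCK_REALTIME readings. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TAI
#define CLOCK_TAI 11		/* Linux clock id, absent from old headers */
#endif

int main(void)
{
	struct timespec tai, real;

	if (clock_gettime(CLOCK_TAI, &tai) != 0) {
		perror("clock_gettime(CLOCK_TAI)");
		return 1;
	}
	clock_gettime(CLOCK_REALTIME, &real);

	/* With a TAI offset configured, tai - real is the current
	 * leap-second count; with no offset set the clocks agree. */
	printf("TAI-UTC offset: %ld s\n",
	       (long)(tai.tv_sec - real.tv_sec));
	return 0;
}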
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ffd4e111fd67..59f7b55ba745 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image,
786 struct kexec_segment *segment) 786 struct kexec_segment *segment)
787{ 787{
788 unsigned long maddr; 788 unsigned long maddr;
789 unsigned long ubytes, mbytes; 789 size_t ubytes, mbytes;
790 int result; 790 int result;
791 unsigned char __user *buf; 791 unsigned char __user *buf;
792 792
@@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image,
819 /* Start with a clear page */ 819 /* Start with a clear page */
820 clear_page(ptr); 820 clear_page(ptr);
821 ptr += maddr & ~PAGE_MASK; 821 ptr += maddr & ~PAGE_MASK;
822 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 822 mchunk = min_t(size_t, mbytes,
823 if (mchunk > mbytes) 823 PAGE_SIZE - (maddr & ~PAGE_MASK));
824 mchunk = mbytes; 824 uchunk = min(ubytes, mchunk);
825
826 uchunk = mchunk;
827 if (uchunk > ubytes)
828 uchunk = ubytes;
829 825
830 result = copy_from_user(ptr, buf, uchunk); 826 result = copy_from_user(ptr, buf, uchunk);
831 kunmap(page); 827 kunmap(page);
@@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
850 * We do things a page at a time for the sake of kmap. 846 * We do things a page at a time for the sake of kmap.
851 */ 847 */
852 unsigned long maddr; 848 unsigned long maddr;
853 unsigned long ubytes, mbytes; 849 size_t ubytes, mbytes;
854 int result; 850 int result;
855 unsigned char __user *buf; 851 unsigned char __user *buf;
856 852
@@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image,
871 } 867 }
872 ptr = kmap(page); 868 ptr = kmap(page);
873 ptr += maddr & ~PAGE_MASK; 869 ptr += maddr & ~PAGE_MASK;
874 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 870 mchunk = min_t(size_t, mbytes,
875 if (mchunk > mbytes) 871 PAGE_SIZE - (maddr & ~PAGE_MASK));
876 mchunk = mbytes; 872 uchunk = min(ubytes, mchunk);
877 873 if (mchunk > uchunk) {
878 uchunk = mchunk;
879 if (uchunk > ubytes) {
880 uchunk = ubytes;
881 /* Zero the trailing part of the page */ 874 /* Zero the trailing part of the page */
882 memset(ptr + uchunk, 0, mchunk - uchunk); 875 memset(ptr + uchunk, 0, mchunk - uchunk);
883 } 876 }
@@ -1118,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
1118{ 1111{
1119 unsigned long addr; 1112 unsigned long addr;
1120 1113
1121 for (addr = begin; addr < end; addr += PAGE_SIZE) { 1114 for (addr = begin; addr < end; addr += PAGE_SIZE)
1122 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); 1115 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1123 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1124 free_page((unsigned long)__va(addr));
1125 totalram_pages++;
1126 }
1127} 1116}
1128 1117
1129int crash_shrink_memory(unsigned long new_size) 1118int crash_shrink_memory(unsigned long new_size)
@@ -1544,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
1544{ 1533{
1545 va_list args; 1534 va_list args;
1546 char buf[0x50]; 1535 char buf[0x50];
1547 int r; 1536 size_t r;
1548 1537
1549 va_start(args, fmt); 1538 va_start(args, fmt);
1550 r = vsnprintf(buf, sizeof(buf), fmt, args); 1539 r = vsnprintf(buf, sizeof(buf), fmt, args);
1551 va_end(args); 1540 va_end(args);
1552 1541
1553 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1542 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1554 r = vmcoreinfo_max_size - vmcoreinfo_size;
1555 1543
1556 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1544 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1557 1545
@@ -1581,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1581 VMCOREINFO_SYMBOL(swapper_pg_dir); 1569 VMCOREINFO_SYMBOL(swapper_pg_dir);
1582#endif 1570#endif
1583 VMCOREINFO_SYMBOL(_stext); 1571 VMCOREINFO_SYMBOL(_stext);
1584 VMCOREINFO_SYMBOL(vmlist); 1572 VMCOREINFO_SYMBOL(vmap_area_list);
1585 1573
1586#ifndef CONFIG_NEED_MULTIPLE_NODES 1574#ifndef CONFIG_NEED_MULTIPLE_NODES
1587 VMCOREINFO_SYMBOL(mem_map); 1575 VMCOREINFO_SYMBOL(mem_map);
@@ -1619,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1619 VMCOREINFO_OFFSET(free_area, free_list); 1607 VMCOREINFO_OFFSET(free_area, free_list);
1620 VMCOREINFO_OFFSET(list_head, next); 1608 VMCOREINFO_OFFSET(list_head, next);
1621 VMCOREINFO_OFFSET(list_head, prev); 1609 VMCOREINFO_OFFSET(list_head, prev);
1622 VMCOREINFO_OFFSET(vm_struct, addr); 1610 VMCOREINFO_OFFSET(vmap_area, va_start);
1611 VMCOREINFO_OFFSET(vmap_area, list);
1623 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1612 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1624 log_buf_kexec_setup(); 1613 log_buf_kexec_setup();
1625 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1614 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
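The two segment-load loops above replace the open-coded chunk clamping with min()/min_t() and switch the byte counters to size_t. The arithmetic is easy to check in isolation; the following standalone sketch (plain userspace C with local stand-ins for PAGE_SIZE and min_t(), not kernel code) walks a misaligned destination page by page the same way.

/* Illustrative page-chunking arithmetic mirroring the min_t() clamping. */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define min_t(t, a, b) ((t)(a) < (t)(b) ? (t)(a) : (t)(b))

int main(void)
{
	unsigned long maddr = 0x1000f00;	/* not page aligned */
	size_t mbytes = 10000, ubytes = 9000;

	while (mbytes) {
		size_t mchunk = min_t(size_t, mbytes,
				      PAGE_SIZE - (maddr & ~PAGE_MASK));
		size_t uchunk = min_t(size_t, ubytes, mchunk);

		printf("copy %zu of %zu bytes at %#lx\n",
		       uchunk, mchunk, maddr);
		maddr  += mchunk;
		mbytes -= mchunk;
		ubytes -= uchunk;	/* never underflows: uchunk <= ubytes */
	}
	return 0;
}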
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 56dd34976d7b..1296e72e4161 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
77 77
78static int call_modprobe(char *module_name, int wait) 78static int call_modprobe(char *module_name, int wait)
79{ 79{
80 struct subprocess_info *info;
80 static char *envp[] = { 81 static char *envp[] = {
81 "HOME=/", 82 "HOME=/",
82 "TERM=linux", 83 "TERM=linux",
@@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait)
98 argv[3] = module_name; /* check free_modprobe_argv() */ 99 argv[3] = module_name; /* check free_modprobe_argv() */
99 argv[4] = NULL; 100 argv[4] = NULL;
100 101
101 return call_usermodehelper_fns(modprobe_path, argv, envp, 102 info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
102 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); 103 NULL, free_modprobe_argv, NULL);
104 if (!info)
105 goto free_module_name;
106
107 return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
108
109free_module_name:
110 kfree(module_name);
103free_argv: 111free_argv:
104 kfree(argv); 112 kfree(argv);
105out: 113out:
@@ -502,14 +510,28 @@ static void helper_unlock(void)
502 * @argv: arg vector for process 510 * @argv: arg vector for process
503 * @envp: environment for process 511 * @envp: environment for process
504 * @gfp_mask: gfp mask for memory allocation 512 * @gfp_mask: gfp mask for memory allocation
513 * @cleanup: a cleanup function
514 * @init: an init function
515 * @data: arbitrary context sensitive data
505 * 516 *
506 * Returns either %NULL on allocation failure, or a subprocess_info 517 * Returns either %NULL on allocation failure, or a subprocess_info
507 * structure. This should be passed to call_usermodehelper_exec to 518 * structure. This should be passed to call_usermodehelper_exec to
508 * exec the process and free the structure. 519 * exec the process and free the structure.
520 *
521 * The init function is used to customize the helper process prior to
522 * exec. A non-zero return code causes the process to error out, exit,
523 * and return the failure to the calling process
524 *
525 * The cleanup function is called just before the subprocess_info is about to
526 * be freed. This can be used for freeing the argv and envp. The
527 * Function must be runnable in either a process context or the
528 * context in which call_usermodehelper_exec is called.
509 */ 529 */
510static
511struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 530struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
512 char **envp, gfp_t gfp_mask) 531 char **envp, gfp_t gfp_mask,
532 int (*init)(struct subprocess_info *info, struct cred *new),
533 void (*cleanup)(struct subprocess_info *info),
534 void *data)
513{ 535{
514 struct subprocess_info *sub_info; 536 struct subprocess_info *sub_info;
515 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); 537 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
@@ -520,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
520 sub_info->path = path; 542 sub_info->path = path;
521 sub_info->argv = argv; 543 sub_info->argv = argv;
522 sub_info->envp = envp; 544 sub_info->envp = envp;
545
546 sub_info->cleanup = cleanup;
547 sub_info->init = init;
548 sub_info->data = data;
523 out: 549 out:
524 return sub_info; 550 return sub_info;
525} 551}
526 552EXPORT_SYMBOL(call_usermodehelper_setup);
527/**
528 * call_usermodehelper_setfns - set a cleanup/init function
529 * @info: a subprocess_info returned by call_usermodehelper_setup
530 * @cleanup: a cleanup function
531 * @init: an init function
532 * @data: arbitrary context sensitive data
533 *
534 * The init function is used to customize the helper process prior to
535 * exec. A non-zero return code causes the process to error out, exit,
536 * and return the failure to the calling process
537 *
538 * The cleanup function is just before ethe subprocess_info is about to
539 * be freed. This can be used for freeing the argv and envp. The
540 * Function must be runnable in either a process context or the
541 * context in which call_usermodehelper_exec is called.
542 */
543static
544void call_usermodehelper_setfns(struct subprocess_info *info,
545 int (*init)(struct subprocess_info *info, struct cred *new),
546 void (*cleanup)(struct subprocess_info *info),
547 void *data)
548{
549 info->cleanup = cleanup;
550 info->init = init;
551 info->data = data;
552}
553 553
554/** 554/**
555 * call_usermodehelper_exec - start a usermode application 555 * call_usermodehelper_exec - start a usermode application
556 * @sub_info: information about the subprocess 556 * @sub_info: information about the subprocess
557 * @wait: wait for the application to finish and return status. 557 * @wait: wait for the application to finish and return status.
558 * when -1 don't wait at all, but you get no useful error back when 558 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
559 * the program couldn't be exec'ed. This makes it safe to call 559 * when the program couldn't be exec'ed. This makes it safe to call
560 * from interrupt context. 560 * from interrupt context.
561 * 561 *
562 * Runs a user-space application. The application is started 562 * Runs a user-space application. The application is started
563 * asynchronously if wait is not set, and runs as a child of keventd. 563 * asynchronously if wait is not set, and runs as a child of keventd.
564 * (ie. it runs with full root capabilities). 564 * (ie. it runs with full root capabilities).
565 */ 565 */
566static
567int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 566int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
568{ 567{
569 DECLARE_COMPLETION_ONSTACK(done); 568 DECLARE_COMPLETION_ONSTACK(done);
@@ -615,31 +614,34 @@ unlock:
615 helper_unlock(); 614 helper_unlock();
616 return retval; 615 return retval;
617} 616}
617EXPORT_SYMBOL(call_usermodehelper_exec);
618 618
619/* 619/**
620 * call_usermodehelper_fns() will not run the caller-provided cleanup function 620 * call_usermodehelper() - prepare and start a usermode application
621 * if a memory allocation failure is experienced. So the caller might need to 621 * @path: path to usermode executable
622 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform 622 * @argv: arg vector for process
623 * the necessaary cleanup within the caller. 623 * @envp: environment for process
624 * @wait: wait for the application to finish and return status.
625 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
626 * when the program couldn't be exec'ed. This makes it safe to call
627 * from interrupt context.
628 *
629 * This function is the equivalent to use call_usermodehelper_setup() and
630 * call_usermodehelper_exec().
624 */ 631 */
625int call_usermodehelper_fns( 632int call_usermodehelper(char *path, char **argv, char **envp, int wait)
626 char *path, char **argv, char **envp, int wait,
627 int (*init)(struct subprocess_info *info, struct cred *new),
628 void (*cleanup)(struct subprocess_info *), void *data)
629{ 633{
630 struct subprocess_info *info; 634 struct subprocess_info *info;
631 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; 635 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
632 636
633 info = call_usermodehelper_setup(path, argv, envp, gfp_mask); 637 info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
634 638 NULL, NULL, NULL);
635 if (info == NULL) 639 if (info == NULL)
636 return -ENOMEM; 640 return -ENOMEM;
637 641
638 call_usermodehelper_setfns(info, init, cleanup, data);
639
640 return call_usermodehelper_exec(info, wait); 642 return call_usermodehelper_exec(info, wait);
641} 643}
642EXPORT_SYMBOL(call_usermodehelper_fns); 644EXPORT_SYMBOL(call_usermodehelper);
643 645
644static int proc_cap_handler(struct ctl_table *table, int write, 646static int proc_cap_handler(struct ctl_table *table, int write,
645 void __user *buffer, size_t *lenp, loff_t *ppos) 647 void __user *buffer, size_t *lenp, loff_t *ppos)
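With call_usermodehelper_fns() and call_usermodehelper_setfns() gone, callers that need init/cleanup hooks now pass them straight to call_usermodehelper_setup() and then invoke call_usermodehelper_exec(), exactly as call_modprobe() does above. A hedged sketch of that calling convention follows; kernel context is assumed, and run_hotplug_helper() and my_cleanup() are made-up names, not kernel functions.

/* Illustrative caller of the reworked API (assumes <linux/kmod.h> and
 * <linux/slab.h>); the helper name and my_cleanup() are hypothetical. */
static void my_cleanup(struct subprocess_info *info)
{
	kfree(info->argv);		/* free whatever setup allocated */
}

static int run_hotplug_helper(char **argv, char **envp)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
					 NULL /* init */, my_cleanup,
					 NULL /* data */);
	if (!info)
		return -ENOMEM;

	/* UMH_WAIT_PROC: wait for the helper and return its exit status. */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}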
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9eb7fed0bbaa..760e86df8c20 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h> 19#include <linux/ptrace.h>
20#include <linux/uaccess.h>
20#include <trace/events/sched.h> 21#include <trace/events/sched.h>
21 22
22static DEFINE_SPINLOCK(kthread_create_lock); 23static DEFINE_SPINLOCK(kthread_create_lock);
@@ -52,8 +53,21 @@ enum KTHREAD_BITS {
52 KTHREAD_IS_PARKED, 53 KTHREAD_IS_PARKED,
53}; 54};
54 55
55#define to_kthread(tsk) \ 56#define __to_kthread(vfork) \
56 container_of((tsk)->vfork_done, struct kthread, exited) 57 container_of(vfork, struct kthread, exited)
58
59static inline struct kthread *to_kthread(struct task_struct *k)
60{
61 return __to_kthread(k->vfork_done);
62}
63
64static struct kthread *to_live_kthread(struct task_struct *k)
65{
66 struct completion *vfork = ACCESS_ONCE(k->vfork_done);
67 if (likely(vfork))
68 return __to_kthread(vfork);
69 return NULL;
70}
57 71
58/** 72/**
59 * kthread_should_stop - should this kthread return now? 73 * kthread_should_stop - should this kthread return now?
@@ -122,6 +136,24 @@ void *kthread_data(struct task_struct *task)
122 return to_kthread(task)->data; 136 return to_kthread(task)->data;
123} 137}
124 138
139/**
140 * probe_kthread_data - speculative version of kthread_data()
141 * @task: possible kthread task in question
142 *
143 * @task could be a kthread task. Return the data value specified when it
144 * was created if accessible. If @task isn't a kthread task or its data is
145 * inaccessible for any reason, %NULL is returned. This function requires
146 * that @task itself is safe to dereference.
147 */
148void *probe_kthread_data(struct task_struct *task)
149{
150 struct kthread *kthread = to_kthread(task);
151 void *data = NULL;
152
153 probe_kernel_read(&data, &kthread->data, sizeof(data));
154 return data;
155}
156
125static void __kthread_parkme(struct kthread *self) 157static void __kthread_parkme(struct kthread *self)
126{ 158{
127 __set_current_state(TASK_PARKED); 159 __set_current_state(TASK_PARKED);
@@ -265,7 +297,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
265 } 297 }
266 /* It's safe because the task is inactive. */ 298 /* It's safe because the task is inactive. */
267 do_set_cpus_allowed(p, cpumask_of(cpu)); 299 do_set_cpus_allowed(p, cpumask_of(cpu));
268 p->flags |= PF_THREAD_BOUND; 300 p->flags |= PF_NO_SETAFFINITY;
269} 301}
270 302
271/** 303/**
@@ -311,19 +343,6 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
311 return p; 343 return p;
312} 344}
313 345
314static struct kthread *task_get_live_kthread(struct task_struct *k)
315{
316 struct kthread *kthread;
317
318 get_task_struct(k);
319 kthread = to_kthread(k);
320 /* It might have exited */
321 barrier();
322 if (k->vfork_done != NULL)
323 return kthread;
324 return NULL;
325}
326
327static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) 346static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
328{ 347{
329 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 348 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
@@ -350,11 +369,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
350 */ 369 */
351void kthread_unpark(struct task_struct *k) 370void kthread_unpark(struct task_struct *k)
352{ 371{
353 struct kthread *kthread = task_get_live_kthread(k); 372 struct kthread *kthread = to_live_kthread(k);
354 373
355 if (kthread) 374 if (kthread)
356 __kthread_unpark(k, kthread); 375 __kthread_unpark(k, kthread);
357 put_task_struct(k);
358} 376}
359 377
360/** 378/**
@@ -371,7 +389,7 @@ void kthread_unpark(struct task_struct *k)
371 */ 389 */
372int kthread_park(struct task_struct *k) 390int kthread_park(struct task_struct *k)
373{ 391{
374 struct kthread *kthread = task_get_live_kthread(k); 392 struct kthread *kthread = to_live_kthread(k);
375 int ret = -ENOSYS; 393 int ret = -ENOSYS;
376 394
377 if (kthread) { 395 if (kthread) {
@@ -384,7 +402,6 @@ int kthread_park(struct task_struct *k)
384 } 402 }
385 ret = 0; 403 ret = 0;
386 } 404 }
387 put_task_struct(k);
388 return ret; 405 return ret;
389} 406}
390 407
@@ -405,10 +422,13 @@ int kthread_park(struct task_struct *k)
405 */ 422 */
406int kthread_stop(struct task_struct *k) 423int kthread_stop(struct task_struct *k)
407{ 424{
408 struct kthread *kthread = task_get_live_kthread(k); 425 struct kthread *kthread;
409 int ret; 426 int ret;
410 427
411 trace_sched_kthread_stop(k); 428 trace_sched_kthread_stop(k);
429
430 get_task_struct(k);
431 kthread = to_live_kthread(k);
412 if (kthread) { 432 if (kthread) {
413 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 433 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
414 __kthread_unpark(k, kthread); 434 __kthread_unpark(k, kthread);
@@ -416,10 +436,9 @@ int kthread_stop(struct task_struct *k)
416 wait_for_completion(&kthread->exited); 436 wait_for_completion(&kthread->exited);
417 } 437 }
418 ret = k->exit_code; 438 ret = k->exit_code;
419
420 put_task_struct(k); 439 put_task_struct(k);
421 trace_sched_kthread_stop_ret(ret);
422 440
441 trace_sched_kthread_stop_ret(ret);
423 return ret; 442 return ret;
424} 443}
425EXPORT_SYMBOL(kthread_stop); 444EXPORT_SYMBOL(kthread_stop);
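to_live_kthread() drops the get_task_struct()/put_task_struct() pair that task_get_live_kthread() imposed on every park/unpark (only kthread_stop() still takes a reference), and probe_kthread_data() gives debug paths a fault-safe way to read a kthread's data pointer. A small sketch of the latter, kernel context assumed; print_my_debug_info() is a hypothetical caller such as a crash-dump path.

/* Sketch: dump a task's kthread data pointer from a debug path where
 * the task may not be a kthread at all. */
static void print_my_debug_info(struct task_struct *task)
{
	void *data = probe_kthread_data(task);

	/* probe_kthread_data() returns NULL instead of faulting if @task
	 * is not a kthread or its struct kthread is unreadable. */
	pr_info("%s: kthread data %p\n", task->comm, data);
}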
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8a0efac4f99d..6a3bccba7e7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
380unsigned long nr_stack_trace_entries; 380unsigned long nr_stack_trace_entries;
381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
382 382
383static void print_lockdep_off(const char *bug_msg)
384{
385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
388}
389
383static int save_trace(struct stack_trace *trace) 390static int save_trace(struct stack_trace *trace)
384{ 391{
385 trace->nr_entries = 0; 392 trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
409 if (!debug_locks_off_graph_unlock()) 416 if (!debug_locks_off_graph_unlock())
410 return 0; 417 return 0;
411 418
412 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 419 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
413 printk("turning off the locking correctness validator.\n");
414 dump_stack(); 420 dump_stack();
415 421
416 return 0; 422 return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
763 } 769 }
764 raw_local_irq_restore(flags); 770 raw_local_irq_restore(flags);
765 771
766 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 772 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
767 printk("turning off the locking correctness validator.\n");
768 dump_stack(); 773 dump_stack();
769 return NULL; 774 return NULL;
770 } 775 }
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
834 if (!debug_locks_off_graph_unlock()) 839 if (!debug_locks_off_graph_unlock())
835 return NULL; 840 return NULL;
836 841
837 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 842 print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
838 printk("turning off the locking correctness validator.\n");
839 dump_stack(); 843 dump_stack();
840 return NULL; 844 return NULL;
841 } 845 }
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2000 struct lock_class *class = hlock_class(hlock); 2004 struct lock_class *class = hlock_class(hlock);
2001 struct list_head *hash_head = chainhashentry(chain_key); 2005 struct list_head *hash_head = chainhashentry(chain_key);
2002 struct lock_chain *chain; 2006 struct lock_chain *chain;
2003 struct held_lock *hlock_curr, *hlock_next; 2007 struct held_lock *hlock_curr;
2004 int i, j; 2008 int i, j;
2005 2009
2006 /* 2010 /*
@@ -2048,8 +2052,7 @@ cache_hit:
2048 if (!debug_locks_off_graph_unlock()) 2052 if (!debug_locks_off_graph_unlock())
2049 return 0; 2053 return 0;
2050 2054
2051 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 2055 print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
2052 printk("turning off the locking correctness validator.\n");
2053 dump_stack(); 2056 dump_stack();
2054 return 0; 2057 return 0;
2055 } 2058 }
@@ -2057,12 +2060,10 @@ cache_hit:
2057 chain->chain_key = chain_key; 2060 chain->chain_key = chain_key;
2058 chain->irq_context = hlock->irq_context; 2061 chain->irq_context = hlock->irq_context;
2059 /* Find the first held_lock of current chain */ 2062 /* Find the first held_lock of current chain */
2060 hlock_next = hlock;
2061 for (i = curr->lockdep_depth - 1; i >= 0; i--) { 2063 for (i = curr->lockdep_depth - 1; i >= 0; i--) {
2062 hlock_curr = curr->held_locks + i; 2064 hlock_curr = curr->held_locks + i;
2063 if (hlock_curr->irq_context != hlock_next->irq_context) 2065 if (hlock_curr->irq_context != hlock->irq_context)
2064 break; 2066 break;
2065 hlock_next = hlock;
2066 } 2067 }
2067 i++; 2068 i++;
2068 chain->depth = curr->lockdep_depth + 1 - i; 2069 chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3191#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3192 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3193 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", 3194 print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
3195 printk(KERN_DEBUG "depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH); 3196 curr->lockdep_depth, MAX_LOCK_DEPTH);
3195 printk("turning off the locking correctness validator.\n");
3196 3197
3197 lockdep_print_held_locks(current); 3198 lockdep_print_held_locks(current);
3198 debug_show_all_locks(); 3199 debug_show_all_locks();
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
37# include <asm/mutex.h> 37# include <asm/mutex.h>
38#endif 38#endif
39 39
40/*
41 * A negative mutex count indicates that waiters are sleeping waiting for the
42 * mutex.
43 */
44#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
45
40void 46void
41__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 47__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
42{ 48{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
44 spin_lock_init(&lock->wait_lock); 50 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list); 51 INIT_LIST_HEAD(&lock->wait_list);
46 mutex_clear_owner(lock); 52 mutex_clear_owner(lock);
53#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
54 lock->spin_mlock = NULL;
55#endif
47 56
48 debug_mutex_init(lock, name, key); 57 debug_mutex_init(lock, name, key);
49} 58}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
95EXPORT_SYMBOL(mutex_lock); 104EXPORT_SYMBOL(mutex_lock);
96#endif 105#endif
97 106
107#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
108/*
109 * In order to avoid a stampede of mutex spinners from acquiring the mutex
110 * more or less simultaneously, the spinners need to acquire a MCS lock
111 * first before spinning on the owner field.
112 *
113 * We don't inline mspin_lock() so that perf can correctly account for the
114 * time spent in this lock function.
115 */
116struct mspin_node {
117 struct mspin_node *next ;
118 int locked; /* 1 if lock acquired */
119};
120#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
121
122static noinline
123void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
124{
125 struct mspin_node *prev;
126
127 /* Init node */
128 node->locked = 0;
129 node->next = NULL;
130
131 prev = xchg(lock, node);
132 if (likely(prev == NULL)) {
133 /* Lock acquired */
134 node->locked = 1;
135 return;
136 }
137 ACCESS_ONCE(prev->next) = node;
138 smp_wmb();
139 /* Wait until the lock holder passes the lock down */
140 while (!ACCESS_ONCE(node->locked))
141 arch_mutex_cpu_relax();
142}
143
144static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
145{
146 struct mspin_node *next = ACCESS_ONCE(node->next);
147
148 if (likely(!next)) {
149 /*
150 * Release the lock by setting it to NULL
151 */
152 if (cmpxchg(lock, node, NULL) == node)
153 return;
154 /* Wait until the next pointer is set */
155 while (!(next = ACCESS_ONCE(node->next)))
156 arch_mutex_cpu_relax();
157 }
158 ACCESS_ONCE(next->locked) = 1;
159 smp_wmb();
160}
161
162/*
163 * Mutex spinning code migrated from kernel/sched/core.c
164 */
165
166static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
167{
168 if (lock->owner != owner)
169 return false;
170
171 /*
172 * Ensure we emit the owner->on_cpu, dereference _after_ checking
173 * lock->owner still matches owner, if that fails, owner might
174 * point to free()d memory, if it still matches, the rcu_read_lock()
175 * ensures the memory stays valid.
176 */
177 barrier();
178
179 return owner->on_cpu;
180}
181
182/*
183 * Look out! "owner" is an entirely speculative pointer
184 * access and not reliable.
185 */
186static noinline
187int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
188{
189 rcu_read_lock();
190 while (owner_running(lock, owner)) {
191 if (need_resched())
192 break;
193
194 arch_mutex_cpu_relax();
195 }
196 rcu_read_unlock();
197
198 /*
199 * We break out the loop above on need_resched() and when the
200 * owner changed, which is a sign for heavy contention. Return
201 * success only when lock->owner is NULL.
202 */
203 return lock->owner == NULL;
204}
205
206/*
207 * Initial check for entering the mutex spinning loop
208 */
209static inline int mutex_can_spin_on_owner(struct mutex *lock)
210{
211 int retval = 1;
212
213 rcu_read_lock();
214 if (lock->owner)
215 retval = lock->owner->on_cpu;
216 rcu_read_unlock();
217 /*
218 * if lock->owner is not set, the mutex owner may have just acquired
219 * it and not set the owner yet or the mutex has been released.
220 */
221 return retval;
222}
223#endif
224
98static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 225static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
99 226
100/** 227/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * 285 *
159 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock 286 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
160 * to serialize everything. 287 * to serialize everything.
288 *
289 * The mutex spinners are queued up using MCS lock so that only one
290 * spinner can compete for the mutex. However, if mutex spinning isn't
291 * going to happen, there is no point in going through the lock/unlock
292 * overhead.
161 */ 293 */
294 if (!mutex_can_spin_on_owner(lock))
295 goto slowpath;
162 296
163 for (;;) { 297 for (;;) {
164 struct task_struct *owner; 298 struct task_struct *owner;
299 struct mspin_node node;
165 300
166 /* 301 /*
167 * If there's an owner, wait for it to either 302 * If there's an owner, wait for it to either
168 * release the lock or go to sleep. 303 * release the lock or go to sleep.
169 */ 304 */
305 mspin_lock(MLOCK(lock), &node);
170 owner = ACCESS_ONCE(lock->owner); 306 owner = ACCESS_ONCE(lock->owner);
171 if (owner && !mutex_spin_on_owner(lock, owner)) 307 if (owner && !mutex_spin_on_owner(lock, owner)) {
308 mspin_unlock(MLOCK(lock), &node);
172 break; 309 break;
310 }
173 311
174 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { 312 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
175 lock_acquired(&lock->dep_map, ip); 314 lock_acquired(&lock->dep_map, ip);
176 mutex_set_owner(lock); 315 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node);
177 preempt_enable(); 317 preempt_enable();
178 return 0; 318 return 0;
179 } 319 }
320 mspin_unlock(MLOCK(lock), &node);
180 321
181 /* 322 /*
182 * When there's no owner, we might have preempted between the 323 * When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
195 */ 336 */
196 arch_mutex_cpu_relax(); 337 arch_mutex_cpu_relax();
197 } 338 }
339slowpath:
198#endif 340#endif
199 spin_lock_mutex(&lock->wait_lock, flags); 341 spin_lock_mutex(&lock->wait_lock, flags);
200 342
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
205 list_add_tail(&waiter.list, &lock->wait_list); 347 list_add_tail(&waiter.list, &lock->wait_list);
206 waiter.task = task; 348 waiter.task = task;
207 349
208 if (atomic_xchg(&lock->count, -1) == 1) 350 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
209 goto done; 351 goto done;
210 352
211 lock_contended(&lock->dep_map, ip); 353 lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
220 * that when we release the lock, we properly wake up the 362 * that when we release the lock, we properly wake up the
221 * other waiters: 363 * other waiters:
222 */ 364 */
223 if (atomic_xchg(&lock->count, -1) == 1) 365 if (MUTEX_SHOW_NO_WAITER(lock) &&
366 (atomic_xchg(&lock->count, -1) == 1))
224 break; 367 break;
225 368
226 /* 369 /*
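The new optimistic-spin path queues would-be spinners on an MCS-style lock (mspin_lock()/mspin_unlock()) so that only one task at a time spins on lock->owner, and MUTEX_SHOW_NO_WAITER() skips the atomic xchg once waiters are queued. For readers unfamiliar with MCS handoff, here is a conceptual userspace re-creation using C11 atomics; it mirrors the structure above but is not the kernel implementation (no cpu_relax(), only the default seq_cst ordering).

/* Conceptual MCS queue lock: each CPU/task spins on its own node. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;
};

static void mcs_lock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store(&node->next, NULL);
	atomic_store(&node->locked, false);

	prev = atomic_exchange(lock, node);	/* like the xchg() above */
	if (!prev)
		return;				/* queue was empty: acquired */

	atomic_store(&prev->next, node);
	while (!atomic_load(&node->locked))	/* spin on our own flag */
		;
}

static void mcs_unlock(_Atomic(struct mcs_node *) *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load(&node->next);

	if (!next) {
		struct mcs_node *expected = node;

		/* No successor visible: try to reset the tail to NULL. */
		if (atomic_compare_exchange_strong(lock, &expected, NULL))
			return;
		/* A successor is enqueueing; wait for its next pointer. */
		while (!(next = atomic_load(&node->next)))
			;
	}
	atomic_store(&next->locked, true);	/* pass the lock down */
}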
diff --git a/kernel/panic.c b/kernel/panic.c
index 7c57cc9eee2c..167ec097ce8b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -22,7 +22,6 @@
22#include <linux/sysrq.h> 22#include <linux/sysrq.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h>
26 25
27#define PANIC_TIMER_STEP 100 26#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 27#define PANIC_BLINK_SPD 18
@@ -400,13 +399,8 @@ struct slowpath_args {
400static void warn_slowpath_common(const char *file, int line, void *caller, 399static void warn_slowpath_common(const char *file, int line, void *caller,
401 unsigned taint, struct slowpath_args *args) 400 unsigned taint, struct slowpath_args *args)
402{ 401{
403 const char *board;
404
405 printk(KERN_WARNING "------------[ cut here ]------------\n"); 402 printk(KERN_WARNING "------------[ cut here ]------------\n");
406 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
407 board = dmi_get_system_info(DMI_PRODUCT_NAME);
408 if (board)
409 printk(KERN_WARNING "Hardware name: %s\n", board);
410 404
411 if (args) 405 if (args)
412 vprintk(args->fmt, args->args); 406 vprintk(args->fmt, args->args);
diff --git a/kernel/pid.c b/kernel/pid.c
index 047dc6264638..6283d6412aff 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -51,9 +51,6 @@ int pid_max = PID_MAX_DEFAULT;
51int pid_max_min = RESERVED_PIDS + 1; 51int pid_max_min = RESERVED_PIDS + 1;
52int pid_max_max = PID_MAX_LIMIT; 52int pid_max_max = PID_MAX_LIMIT;
53 53
54#define BITS_PER_PAGE (PAGE_SIZE*8)
55#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
56
57static inline int mk_pid(struct pid_namespace *pid_ns, 54static inline int mk_pid(struct pid_namespace *pid_ns,
58 struct pidmap *map, int off) 55 struct pidmap *map, int off)
59{ 56{
@@ -183,15 +180,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
183 break; 180 break;
184 } 181 }
185 if (likely(atomic_read(&map->nr_free))) { 182 if (likely(atomic_read(&map->nr_free))) {
186 do { 183 for ( ; ; ) {
187 if (!test_and_set_bit(offset, map->page)) { 184 if (!test_and_set_bit(offset, map->page)) {
188 atomic_dec(&map->nr_free); 185 atomic_dec(&map->nr_free);
189 set_last_pid(pid_ns, last, pid); 186 set_last_pid(pid_ns, last, pid);
190 return pid; 187 return pid;
191 } 188 }
192 offset = find_next_offset(map, offset); 189 offset = find_next_offset(map, offset);
190 if (offset >= BITS_PER_PAGE)
191 break;
193 pid = mk_pid(pid_ns, map, offset); 192 pid = mk_pid(pid_ns, map, offset);
194 } while (offset < BITS_PER_PAGE && pid < pid_max); 193 if (pid >= pid_max)
194 break;
195 }
195 } 196 }
196 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 197 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
197 ++map; 198 ++map;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index bea15bdf82b0..69473c4a653f 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -19,8 +19,6 @@
19#include <linux/reboot.h> 19#include <linux/reboot.h>
20#include <linux/export.h> 20#include <linux/export.h>
21 21
22#define BITS_PER_PAGE (PAGE_SIZE*8)
23
24struct pid_cache { 22struct pid_cache {
25 int nr_ids; 23 int nr_ids;
26 char name[16]; 24 char name[16];
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 6edbb2c55c22..424c2d4265c9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
40#include <linux/list.h> 40#include <linux/list.h>
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/hash.h>
44#include <linux/posix-clock.h> 44#include <linux/posix-clock.h>
45#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h>
50 51
51/* 52/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 53 * Management arrays for POSIX timers. Timers are now kept in static hash table
53 * Timer ids are allocated by an external routine that keeps track of the 54 * with 512 entries.
54 * id and the timer. The external interface is: 55 * Timer ids are allocated by local routine, which selects proper hash head by
55 * 56 * key, constructed from current->signal address and per signal struct counter.
56 * void *idr_find(struct idr *idp, int id); to find timer_id <id> 57 * This keeps timer ids unique per process, but now they can intersect between
57 * int idr_get_new(struct idr *idp, void *ptr); to get a new id and 58 * processes.
58 * related it to <ptr>
59 * void idr_remove(struct idr *idp, int id); to release <id>
60 * void idr_init(struct idr *idp); to initialize <idp>
61 * which we supply.
62 * The idr_get_new *may* call slab for more memory so it must not be
63 * called under a spin lock. Likewise idr_remore may release memory
64 * (but it may be ok to do this under a lock...).
65 * idr_find is just a memory look up and is quite fast. A -1 return
66 * indicates that the requested id does not exist.
67 */ 59 */
68 60
69/* 61/*
70 * Lets keep our timers in a slab cache :-) 62 * Lets keep our timers in a slab cache :-)
71 */ 63 */
72static struct kmem_cache *posix_timers_cache; 64static struct kmem_cache *posix_timers_cache;
73static struct idr posix_timers_id; 65
74static DEFINE_SPINLOCK(idr_lock); 66static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
67static DEFINE_SPINLOCK(hash_lock);
75 68
76/* 69/*
77 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 70 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
152 __timr; \ 145 __timr; \
153}) 146})
154 147
148static int hash(struct signal_struct *sig, unsigned int nr)
149{
150 return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
151}
152
153static struct k_itimer *__posix_timers_find(struct hlist_head *head,
154 struct signal_struct *sig,
155 timer_t id)
156{
157 struct k_itimer *timer;
158
159 hlist_for_each_entry_rcu(timer, head, t_hash) {
160 if ((timer->it_signal == sig) && (timer->it_id == id))
161 return timer;
162 }
163 return NULL;
164}
165
166static struct k_itimer *posix_timer_by_id(timer_t id)
167{
168 struct signal_struct *sig = current->signal;
169 struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
170
171 return __posix_timers_find(head, sig, id);
172}
173
174static int posix_timer_add(struct k_itimer *timer)
175{
176 struct signal_struct *sig = current->signal;
177 int first_free_id = sig->posix_timer_id;
178 struct hlist_head *head;
179 int ret = -ENOENT;
180
181 do {
182 spin_lock(&hash_lock);
183 head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
184 if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
185 hlist_add_head_rcu(&timer->t_hash, head);
186 ret = sig->posix_timer_id;
187 }
188 if (++sig->posix_timer_id < 0)
189 sig->posix_timer_id = 0;
190 if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
191 /* Loop over all possible ids completed */
192 ret = -EAGAIN;
193 spin_unlock(&hash_lock);
194 } while (ret == -ENOENT);
195 return ret;
196}
197
155static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 198static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
156{ 199{
157 spin_unlock_irqrestore(&timr->it_lock, flags); 200 spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
221 return 0; 264 return 0;
222} 265}
223 266
267static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
268{
269 timekeeping_clocktai(tp);
270 return 0;
271}
224 272
225/* 273/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 274 * Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
261 .clock_getres = posix_get_coarse_res, 309 .clock_getres = posix_get_coarse_res,
262 .clock_get = posix_get_monotonic_coarse, 310 .clock_get = posix_get_monotonic_coarse,
263 }; 311 };
312 struct k_clock clock_tai = {
313 .clock_getres = hrtimer_get_res,
314 .clock_get = posix_get_tai,
315 .nsleep = common_nsleep,
316 .nsleep_restart = hrtimer_nanosleep_restart,
317 .timer_create = common_timer_create,
318 .timer_set = common_timer_set,
319 .timer_get = common_timer_get,
320 .timer_del = common_timer_del,
321 };
264 struct k_clock clock_boottime = { 322 struct k_clock clock_boottime = {
265 .clock_getres = hrtimer_get_res, 323 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime, 324 .clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 336 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 337 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); 338 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
339 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
281 340
282 posix_timers_cache = kmem_cache_create("posix_timers_cache", 341 posix_timers_cache = kmem_cache_create("posix_timers_cache",
283 sizeof (struct k_itimer), 0, SLAB_PANIC, 342 sizeof (struct k_itimer), 0, SLAB_PANIC,
284 NULL); 343 NULL);
285 idr_init(&posix_timers_id);
286 return 0; 344 return 0;
287} 345}
288 346
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
504{ 562{
505 if (it_id_set) { 563 if (it_id_set) {
506 unsigned long flags; 564 unsigned long flags;
507 spin_lock_irqsave(&idr_lock, flags); 565 spin_lock_irqsave(&hash_lock, flags);
508 idr_remove(&posix_timers_id, tmr->it_id); 566 hlist_del_rcu(&tmr->t_hash);
509 spin_unlock_irqrestore(&idr_lock, flags); 567 spin_unlock_irqrestore(&hash_lock, flags);
510 } 568 }
511 put_pid(tmr->it_pid); 569 put_pid(tmr->it_pid);
512 sigqueue_free(tmr->sigq); 570 sigqueue_free(tmr->sigq);
@@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 610 return -EAGAIN;
553 611
554 spin_lock_init(&new_timer->it_lock); 612 spin_lock_init(&new_timer->it_lock);
555 613 new_timer_id = posix_timer_add(new_timer);
556 idr_preload(GFP_KERNEL); 614 if (new_timer_id < 0) {
557 spin_lock_irq(&idr_lock); 615 error = new_timer_id;
558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
559 spin_unlock_irq(&idr_lock);
560 idr_preload_end();
561 if (error < 0) {
562 /*
563 * Weird looking, but we return EAGAIN if the IDR is
564 * full (proper POSIX return value for this)
565 */
566 if (error == -ENOSPC)
567 error = -EAGAIN;
568 goto out; 616 goto out;
569 } 617 }
570 new_timer_id = error;
571 618
572 it_id_set = IT_ID_SET; 619 it_id_set = IT_ID_SET;
573 new_timer->it_id = (timer_t) new_timer_id; 620 new_timer->it_id = (timer_t) new_timer_id;
@@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
645 return NULL; 692 return NULL;
646 693
647 rcu_read_lock(); 694 rcu_read_lock();
648 timr = idr_find(&posix_timers_id, (int)timer_id); 695 timr = posix_timer_by_id(timer_id);
649 if (timr) { 696 if (timr) {
650 spin_lock_irqsave(&timr->it_lock, *flags); 697 spin_lock_irqsave(&timr->it_lock, *flags);
651 if (timr->it_signal == current->signal) { 698 if (timr->it_signal == current->signal) {
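After this change timer ids come from the per-signal-struct counter hashed above rather than a global IDR, so they are only unique within a process and may repeat across processes. From userspace nothing changes except the id values handed back; a minimal check (assuming glibc, link with -lrt on older toolchains):

/* Create two POSIX timers and print the timer_t ids the kernel returns. */
#include <stdio.h>
#include <signal.h>
#include <time.h>

int main(void)
{
	struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
				.sigev_signo  = SIGRTMIN };
	timer_t t1, t2;

	if (timer_create(CLOCK_MONOTONIC, &sev, &t1) ||
	    timer_create(CLOCK_MONOTONIC, &sev, &t2)) {
		perror("timer_create");
		return 1;
	}
	printf("timer ids: %ld %ld\n", (long)t1, (long)t2);

	timer_delete(t1);
	timer_delete(t2);
	return 0;
}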
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc9..7ef6866b521d 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,7 +32,7 @@ static void handle_poweroff(int key)
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "poweroff(o)",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d4feda084a3a..bef86d121eb2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
76 76
77bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
78{ 78{
79 if (state == PM_SUSPEND_FREEZE) 79 if (state == PM_SUSPEND_FREEZE) {
80 return true; 80#ifdef CONFIG_PM_DEBUG
81 if (pm_test_level != TEST_NONE &&
82 pm_test_level != TEST_FREEZER &&
83 pm_test_level != TEST_DEVICES &&
84 pm_test_level != TEST_PLATFORM) {
85 printk(KERN_WARNING "Unsupported pm_test mode for "
86 "freeze state, please choose "
87 "none/freezer/devices/platform.\n");
88 return false;
89 }
90#endif
91 return true;
92 }
81 /* 93 /*
82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel 94 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel 95 * support and need to be valid to the lowlevel
@@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
184 goto Platform_wake; 196 goto Platform_wake;
185 } 197 }
186 198
199 if (suspend_test(TEST_PLATFORM))
200 goto Platform_wake;
201
187 /* 202 /*
188 * PM_SUSPEND_FREEZE equals 203 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors. 204 * frozen processes + suspended devices + idle processors.
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
195 goto Platform_wake; 210 goto Platform_wake;
196 } 211 }
197 212
198 if (suspend_test(TEST_PLATFORM))
199 goto Platform_wake;
200
201 error = disable_nonboot_cpus(); 213 error = disable_nonboot_cpus();
202 if (error || suspend_test(TEST_CPUS)) 214 if (error || suspend_test(TEST_CPUS))
203 goto Enable_cpus; 215 goto Enable_cpus;
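valid_state() now refuses PM_SUSPEND_FREEZE when /sys/power/pm_test is set to a level the freeze path cannot exercise ("processors" or "core"), while "none", "freezer", "devices" and "platform" stay accepted, and the TEST_PLATFORM check is moved ahead of the freeze shortcut in suspend_enter(). A small userspace sketch of driving this from sysfs (standard paths, root required, CONFIG_PM_DEBUG needed for pm_test to exist):

/* Pick a pm_test level the freeze state accepts, then request freeze. */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* "freezer" (or "none"/"devices"/"platform") passes the new
	 * valid_state() check; "processors"/"core" would be rejected. */
	if (write_str("/sys/power/pm_test", "freezer"))
		perror("pm_test");
	if (write_str("/sys/power/state", "freeze"))
		perror("state");
	return 0;
}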
diff --git a/kernel/printk.c b/kernel/printk.c
index abbdd9e2ac82..96dcfcd9a2d4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -43,19 +43,13 @@
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h> 45#include <linux/irq_work.h>
46#include <linux/utsname.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/printk.h> 51#include <trace/events/printk.h>
51 52
52/*
53 * Architectures can override it:
54 */
55void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
56{
57}
58
59/* printk's without a loglevel use this.. */ 53/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 54#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 55
@@ -608,7 +602,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
608 /* return error when data has vanished underneath us */ 602 /* return error when data has vanished underneath us */
609 if (user->seq < log_first_seq) 603 if (user->seq < log_first_seq)
610 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 604 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
611 ret = POLLIN|POLLRDNORM; 605 else
606 ret = POLLIN|POLLRDNORM;
612 } 607 }
613 raw_spin_unlock_irq(&logbuf_lock); 608 raw_spin_unlock_irq(&logbuf_lock);
614 609
@@ -1265,7 +1260,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
1265{ 1260{
1266 struct console *con; 1261 struct console *con;
1267 1262
1268 trace_console(text, 0, len, len); 1263 trace_console(text, len);
1269 1264
1270 if (level >= console_loglevel && !ignore_loglevel) 1265 if (level >= console_loglevel && !ignore_loglevel)
1271 return; 1266 return;
@@ -1723,6 +1718,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
1723 1718
1724#endif /* CONFIG_PRINTK */ 1719#endif /* CONFIG_PRINTK */
1725 1720
1721#ifdef CONFIG_EARLY_PRINTK
1722struct console *early_console;
1723
1724void early_vprintk(const char *fmt, va_list ap)
1725{
1726 if (early_console) {
1727 char buf[512];
1728 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1729
1730 early_console->write(early_console, buf, n);
1731 }
1732}
1733
1734asmlinkage void early_printk(const char *fmt, ...)
1735{
1736 va_list ap;
1737
1738 va_start(ap, fmt);
1739 early_vprintk(fmt, ap);
1740 va_end(ap);
1741}
1742#endif
1743
1726static int __add_preferred_console(char *name, int idx, char *options, 1744static int __add_preferred_console(char *name, int idx, char *options,
1727 char *brl_options) 1745 char *brl_options)
1728{ 1746{
@@ -2832,4 +2850,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2832 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2850 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2833} 2851}
2834EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2852EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2853
2854static char dump_stack_arch_desc_str[128];
2855
2856/**
2857 * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
2858 * @fmt: printf-style format string
2859 * @...: arguments for the format string
2860 *
2861 * The configured string will be printed right after utsname during task
2862 * dumps. Usually used to add arch-specific system identifiers. If an
2863 * arch wants to make use of such an ID string, it should initialize this
2864 * as soon as possible during boot.
2865 */
2866void __init dump_stack_set_arch_desc(const char *fmt, ...)
2867{
2868 va_list args;
2869
2870 va_start(args, fmt);
2871 vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
2872 fmt, args);
2873 va_end(args);
2874}
2875
2876/**
2877 * dump_stack_print_info - print generic debug info for dump_stack()
2878 * @log_lvl: log level
2879 *
2880 * Arch-specific dump_stack() implementations can use this function to
2881 * print out the same debug information as the generic dump_stack().
2882 */
2883void dump_stack_print_info(const char *log_lvl)
2884{
2885 printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
2886 log_lvl, raw_smp_processor_id(), current->pid, current->comm,
2887 print_tainted(), init_utsname()->release,
2888 (int)strcspn(init_utsname()->version, " "),
2889 init_utsname()->version);
2890
2891 if (dump_stack_arch_desc_str[0] != '\0')
2892 printk("%sHardware name: %s\n",
2893 log_lvl, dump_stack_arch_desc_str);
2894
2895 print_worker_info(log_lvl, current);
2896}
2897
2898/**
2899 * show_regs_print_info - print generic debug info for show_regs()
2900 * @log_lvl: log level
2901 *
2902 * show_regs() implementations can use this function to print out generic
2903 * debug information.
2904 */
2905void show_regs_print_info(const char *log_lvl)
2906{
2907 dump_stack_print_info(log_lvl);
2908
2909 printk("%stask: %p ti: %p task.ti: %p\n",
2910 log_lvl, current, current_thread_info(),
2911 task_thread_info(current));
2912}
2913
2835#endif 2914#endif
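Two things land in printk.c: a generic early_printk()/early_vprintk() pair that writes through whatever early_console an architecture installs (replacing the weak no-op stub), and dump_stack_set_arch_desc()/dump_stack_print_info()/show_regs_print_info(), which print the "Hardware name:" line centrally instead of the per-WARN DMI lookup removed from panic.c above. A sketch of the arch-side hookup, kernel context assumed; my_early_write(), my_early_console and my_setup_arch() are hypothetical names.

/* Illustrative arch boot code wiring up the new generic helpers. */
static void my_early_write(struct console *con, const char *s, unsigned int n)
{
	/* poke characters at the platform's boot UART */
}

static struct console my_early_console = {
	.name	= "earlymy",
	.write	= my_early_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,
	.index	= -1,
};

void __init my_setup_arch(void)
{
	/* Make early_printk() work: the generic helper simply writes
	 * through whatever early_console points at. */
	early_console = &my_early_console;
	early_printk("early console up\n");

	/* Shown by dump_stack_print_info() as "Hardware name: ...". */
	dump_stack_set_arch_desc("MyBoard rev %d", 2);
}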
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index acbd28424d81..17ae54da0ec2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -24,6 +24,7 @@
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h> 26#include <linux/cn_proc.h>
27#include <linux/compat.h>
27 28
28 29
29static int ptrace_trapping_sleep_fn(void *flags) 30static int ptrace_trapping_sleep_fn(void *flags)
@@ -618,6 +619,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
618 return error; 619 return error;
619} 620}
620 621
622static int ptrace_peek_siginfo(struct task_struct *child,
623 unsigned long addr,
624 unsigned long data)
625{
626 struct ptrace_peeksiginfo_args arg;
627 struct sigpending *pending;
628 struct sigqueue *q;
629 int ret, i;
630
631 ret = copy_from_user(&arg, (void __user *) addr,
632 sizeof(struct ptrace_peeksiginfo_args));
633 if (ret)
634 return -EFAULT;
635
636 if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
637 return -EINVAL; /* unknown flags */
638
639 if (arg.nr < 0)
640 return -EINVAL;
641
642 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
643 pending = &child->signal->shared_pending;
644 else
645 pending = &child->pending;
646
647 for (i = 0; i < arg.nr; ) {
648 siginfo_t info;
649 s32 off = arg.off + i;
650
651 spin_lock_irq(&child->sighand->siglock);
652 list_for_each_entry(q, &pending->list, list) {
653 if (!off--) {
654 copy_siginfo(&info, &q->info);
655 break;
656 }
657 }
658 spin_unlock_irq(&child->sighand->siglock);
659
660 if (off >= 0) /* beyond the end of the list */
661 break;
662
663#ifdef CONFIG_COMPAT
664 if (unlikely(is_compat_task())) {
665 compat_siginfo_t __user *uinfo = compat_ptr(data);
666
667 ret = copy_siginfo_to_user32(uinfo, &info);
668 ret |= __put_user(info.si_code, &uinfo->si_code);
669 } else
670#endif
671 {
672 siginfo_t __user *uinfo = (siginfo_t __user *) data;
673
674 ret = copy_siginfo_to_user(uinfo, &info);
675 ret |= __put_user(info.si_code, &uinfo->si_code);
676 }
677
678 if (ret) {
679 ret = -EFAULT;
680 break;
681 }
682
683 data += sizeof(siginfo_t);
684 i++;
685
686 if (signal_pending(current))
687 break;
688
689 cond_resched();
690 }
691
692 if (i > 0)
693 return i;
694
695 return ret;
696}
621 697
622#ifdef PTRACE_SINGLESTEP 698#ifdef PTRACE_SINGLESTEP
623#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) 699#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -748,6 +824,10 @@ int ptrace_request(struct task_struct *child, long request,
748 ret = put_user(child->ptrace_message, datalp); 824 ret = put_user(child->ptrace_message, datalp);
749 break; 825 break;
750 826
827 case PTRACE_PEEKSIGINFO:
828 ret = ptrace_peek_siginfo(child, addr, data);
829 break;
830
751 case PTRACE_GETSIGINFO: 831 case PTRACE_GETSIGINFO:
752 ret = ptrace_getsiginfo(child, &siginfo); 832 ret = ptrace_getsiginfo(child, &siginfo);
753 if (!ret) 833 if (!ret)
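From user space, the new request is driven by the ptrace_peeksiginfo_args structure that ptrace_peek_siginfo() reads above. A hedged sketch of a tracer using it, assuming the PTRACE_PEEKSIGINFO constant and the args structure added by this series are visible through the updated uapi headers:

    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <signal.h>
    #include <stdio.h>

    /* Print up to 16 signals pending on the tracee without dequeueing them. */
    static int peek_pending(pid_t pid)
    {
        struct ptrace_peeksiginfo_args args = {
            .off   = 0,    /* start at the head of the queue */
            .flags = 0,    /* per-thread queue; or PTRACE_PEEKSIGINFO_SHARED */
            .nr    = 16,   /* copy at most 16 siginfo entries */
        };
        siginfo_t buf[16];
        long n, i;

        n = ptrace(PTRACE_PEEKSIGINFO, pid, &args, buf);
        if (n < 0)
            return -1;     /* errno describes the failure */
        for (i = 0; i < n; i++)
            printf("pending: signo=%d code=%d\n",
                   buf[i].si_signo, buf[i].si_code);
        return 0;          /* n entries were copied out */
    }

The return value mirrors the kernel side: the number of siginfo entries copied, or an error when the arguments are malformed or the buffers are inaccessible.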
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed68..071b0ab455cb 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
97 range[i].end = range[j].end; 97 range[i].end = range[j].end;
98 range[i].start = end; 98 range[i].start = end;
99 } else { 99 } else {
100 printk(KERN_ERR "run of slot in ranges\n"); 100 pr_err("%s: run out of slot in ranges\n",
101 __func__);
101 } 102 }
102 range[j].end = start; 103 range[j].end = start;
103 continue; 104 continue;
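With __func__ in the format string, the failure message now reads "subtract_range: run out of slot in ranges" rather than the old, garbled "run of slot in ranges", so it can be grepped back to its caller.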
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd86..d8534308fd05 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 78 .name = #sname, \
79 .abbr = sabbr, \
79} 80}
80 81
81struct rcu_state rcu_sched_state = 82struct rcu_state rcu_sched_state =
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88
88static struct rcu_state *rcu_state; 89static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
223module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
224module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
225 226
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp);
226static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227static void force_quiescent_state(struct rcu_state *rsp); 230static void force_quiescent_state(struct rcu_state *rsp);
228static int rcu_pending(int cpu); 231static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
310 313
311 if (rcu_gp_in_progress(rsp)) 314 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */ 315 return 0; /* No, a grace period is already in progress. */
316 if (rcu_nocb_needs_gp(rsp))
317 return 1; /* Yes, a no-CBs CPU needs one. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL]) 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */ 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp)
1035{ 1040{
1036 int i; 1041 int i;
1037 1042
1043 if (init_nocb_callback_list(rdp))
1044 return;
1038 rdp->nxtlist = NULL; 1045 rdp->nxtlist = NULL;
1039 for (i = 0; i < RCU_NEXT_SIZE; i++) 1046 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 rdp->nxttail[i] = &rdp->nxtlist; 1047 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042} 1048}
1043 1049
1044/* 1050/*
@@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1071} 1077}
1072 1078
1073/* 1079/*
1080 * Trace-event helper function for rcu_start_future_gp() and
1081 * rcu_nocb_wait_gp().
1082 */
1083static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1084 unsigned long c, char *s)
1085{
1086 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1087 rnp->completed, c, rnp->level,
1088 rnp->grplo, rnp->grphi, s);
1089}
1090
1091/*
1092 * Start some future grace period, as needed to handle newly arrived
1093 * callbacks. The required future grace periods are recorded in each
1094 * rcu_node structure's ->need_future_gp field.
1095 *
1096 * The caller must hold the specified rcu_node structure's ->lock.
1097 */
1098static unsigned long __maybe_unused
1099rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1100{
1101 unsigned long c;
1102 int i;
1103 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1104
1105 /*
1106 * Pick up grace-period number for new callbacks. If this
1107 * grace period is already marked as needed, return to the caller.
1108 */
1109 c = rcu_cbs_completed(rdp->rsp, rnp);
1110 trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
1111 if (rnp->need_future_gp[c & 0x1]) {
1112 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
1113 return c;
1114 }
1115
1116 /*
1117 * If either this rcu_node structure or the root rcu_node structure
1118 * believe that a grace period is in progress, then we must wait
1119 * for the one following, which is in "c". Because our request
1120 * will be noticed at the end of the current grace period, we don't
1121 * need to explicitly start one.
1122 */
1123 if (rnp->gpnum != rnp->completed ||
1124     ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
1125 rnp->need_future_gp[c & 0x1]++;
1126 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
1127 return c;
1128 }
1129
1130 /*
1131 * There might be no grace period in progress. If we don't already
1132 * hold it, acquire the root rcu_node structure's lock in order to
1133 * start one (if needed).
1134 */
1135 if (rnp != rnp_root)
1136 raw_spin_lock(&rnp_root->lock);
1137
1138 /*
1139 * Get a new grace-period number. If there really is no grace
1140 * period in progress, it will be smaller than the one we obtained
1141 * earlier. Adjust callbacks as needed. Note that even no-CBs
1142 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1143 */
1144 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1145 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
1146 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
1147 rdp->nxtcompleted[i] = c;
1148
1149 /*
1150 * If the need for the required grace period is already
1151 * recorded, trace and leave.
1152 */
1153 if (rnp_root->need_future_gp[c & 0x1]) {
1154 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
1155 goto unlock_out;
1156 }
1157
1158 /* Record the need for the future grace period. */
1159 rnp_root->need_future_gp[c & 0x1]++;
1160
1161 /* If a grace period is not already in progress, start one. */
1162 if (rnp_root->gpnum != rnp_root->completed) {
1163 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
1164 } else {
1165 trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
1166 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1167 }
1168unlock_out:
1169 if (rnp != rnp_root)
1170 raw_spin_unlock(&rnp_root->lock);
1171 return c;
1172}
1173
1174/*
1175 * Clean up any old requests for the just-ended grace period. Also return
1176 * whether any additional grace periods have been requested, and invoke
1177 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1178 * waiting for this grace period to complete.
1179 */
1180static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1181{
1182 int c = rnp->completed;
1183 int needmore;
1184 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1185
1186 rcu_nocb_gp_cleanup(rsp, rnp);
1187 rnp->need_future_gp[c & 0x1] = 0;
1188 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1189 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
1190 return needmore;
1191}
1192
1193/*
1074 * If there is room, assign a ->completed number to any callbacks on 1194 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any 1195 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has 1196 * callbacks that were previously assigned a ->completed number that has
@@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1249 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c; 1250 rdp->nxtcompleted[i] = c;
1131 } 1251 }
1252 /* Record any needed additional grace periods. */
1253 rcu_start_future_gp(rnp, rdp);
1132 1254
1133 /* Trace depending on how much we were able to accelerate. */ 1255 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1256 if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
1308 rdp = this_cpu_ptr(rsp->rda); 1430 rdp = this_cpu_ptr(rsp->rda);
1309 rcu_preempt_check_blocked_tasks(rnp); 1431 rcu_preempt_check_blocked_tasks(rnp);
1310 rnp->qsmask = rnp->qsmaskinit; 1432 rnp->qsmask = rnp->qsmaskinit;
1311 rnp->gpnum = rsp->gpnum; 1433 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 WARN_ON_ONCE(rnp->completed != rsp->completed); 1434 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313 rnp->completed = rsp->completed; 1435 ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 if (rnp == rdp->mynode) 1436 if (rnp == rdp->mynode)
1315 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1437 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 rcu_preempt_boost_start_gp(rnp); 1438 rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1441,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1319 rnp->grphi, rnp->qsmask); 1441 rnp->grphi, rnp->qsmask);
1320 raw_spin_unlock_irq(&rnp->lock); 1442 raw_spin_unlock_irq(&rnp->lock);
1321#ifdef CONFIG_PROVE_RCU_DELAY 1443#ifdef CONFIG_PROVE_RCU_DELAY
1322 if ((random32() % (rcu_num_nodes * 8)) == 0) 1444 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
1445 system_state == SYSTEM_RUNNING)
1323 schedule_timeout_uninterruptible(2); 1446 schedule_timeout_uninterruptible(2);
1324#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1447#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 cond_resched(); 1448 cond_resched();
@@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1361static void rcu_gp_cleanup(struct rcu_state *rsp) 1484static void rcu_gp_cleanup(struct rcu_state *rsp)
1362{ 1485{
1363 unsigned long gp_duration; 1486 unsigned long gp_duration;
1487 int nocb = 0;
1364 struct rcu_data *rdp; 1488 struct rcu_data *rdp;
1365 struct rcu_node *rnp = rcu_get_root(rsp); 1489 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1490
@@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1390 */ 1514 */
1391 rcu_for_each_node_breadth_first(rsp, rnp) { 1515 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 raw_spin_lock_irq(&rnp->lock); 1516 raw_spin_lock_irq(&rnp->lock);
1393 rnp->completed = rsp->gpnum; 1517 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1518 rdp = this_cpu_ptr(rsp->rda);
1519 if (rnp == rdp->mynode)
1520 __rcu_process_gp_end(rsp, rnp, rdp);
1521 nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 raw_spin_unlock_irq(&rnp->lock); 1522 raw_spin_unlock_irq(&rnp->lock);
1395 cond_resched(); 1523 cond_resched();
1396 } 1524 }
1397 rnp = rcu_get_root(rsp); 1525 rnp = rcu_get_root(rsp);
1398 raw_spin_lock_irq(&rnp->lock); 1526 raw_spin_lock_irq(&rnp->lock);
1527 rcu_nocb_gp_set(rnp, nocb);
1399 1528
1400 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1529 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1530 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 rsp->fqs_state = RCU_GP_IDLE; 1531 rsp->fqs_state = RCU_GP_IDLE;
1403 rdp = this_cpu_ptr(rsp->rda); 1532 rdp = this_cpu_ptr(rsp->rda);
1533 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 if (cpu_needs_another_gp(rsp, rdp)) 1534 if (cpu_needs_another_gp(rsp, rdp))
1405 rsp->gp_flags = 1; 1535 rsp->gp_flags = 1;
1406 raw_spin_unlock_irq(&rnp->lock); 1536 raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
1476/* 1606/*
1477 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1607 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 * in preparation for detecting the next grace period. The caller must hold 1608 * in preparation for detecting the next grace period. The caller must hold
1479 * the root node's ->lock, which is released before return. Hard irqs must 1609 * the root node's ->lock and hard irqs must be disabled.
1480 * be disabled.
1481 * 1610 *
1482 * Note that it is legal for a dying CPU (which is marked as offline) to 1611 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 * invoke this function. This can happen when the dying CPU reports its 1612 * invoke this function. This can happen when the dying CPU reports its
1484 * quiescent state. 1613 * quiescent state.
1485 */ 1614 */
1486static void 1615static void
1487rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1616rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1488 __releases(rcu_get_root(rsp)->lock) 1617 struct rcu_data *rdp)
1489{ 1618{
1490 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1619 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1491 struct rcu_node *rnp = rcu_get_root(rsp);
1492
1493 if (!rsp->gp_kthread ||
1494 !cpu_needs_another_gp(rsp, rdp)) {
1495 /* 1620 /*
1496 * Either we have not yet spawned the grace-period 1621 * Either we have not yet spawned the grace-period
1497 * task, this CPU does not need another grace period, 1622 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress. 1623 * or a grace period is already in progress.
1499 * Either way, don't start a new grace period. 1624 * Either way, don't start a new grace period.
1500 */ 1625 */
1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 return; 1626 return;
1503 } 1627 }
1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1513 rsp->gp_flags = RCU_GP_FLAG_INIT; 1628 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519 1629
1520 /* Wake up rcu_gp_kthread() to start the grace period. */ 1630 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 wake_up(&rsp->gp_wq); 1631 wake_up(&rsp->gp_wq);
1522} 1632}
1523 1633
1524/* 1634/*
1635 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
1636 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
1637 * is invoked indirectly from rcu_advance_cbs(), which would result in
1638 * endless recursion -- or would do so if it wasn't for the self-deadlock
1639 * that is encountered beforehand.
1640 */
1641static void
1642rcu_start_gp(struct rcu_state *rsp)
1643{
1644 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1645 struct rcu_node *rnp = rcu_get_root(rsp);
1646
1647 /*
1648 * If there is no grace period in progress right now, any
1649 * callbacks we have up to this point will be satisfied by the
1650 * next grace period. Also, advancing the callbacks reduces the
1651 * probability of false positives from cpu_needs_another_gp()
1652 * resulting in pointless grace periods. So, advance callbacks
1653 * then start the grace period!
1654 */
1655 rcu_advance_cbs(rsp, rnp, rdp);
1656 rcu_start_gp_advanced(rsp, rnp, rdp);
1657}
1658
1659/*
1525 * Report a full set of quiescent states to the specified rcu_state 1660 * Report a full set of quiescent states to the specified rcu_state
1526 * data structure. This involves cleaning up after the prior grace 1661 * data structure. This involves cleaning up after the prior grace
1527 * period and letting rcu_start_gp() start up the next grace period 1662 * period and letting rcu_start_gp() start up the next grace period
1528 * if one is needed. Note that the caller must hold rnp->lock, as 1663 * if one is needed. Note that the caller must hold rnp->lock, which
1529 * required by rcu_start_gp(), which will release it. 1664 * is released before return.
1530 */ 1665 */
1531static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1666static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 __releases(rcu_get_root(rsp)->lock) 1667 __releases(rcu_get_root(rsp)->lock)
@@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2124 local_irq_save(flags); 2259 local_irq_save(flags);
2125 if (cpu_needs_another_gp(rsp, rdp)) { 2260 if (cpu_needs_another_gp(rsp, rdp)) {
2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2261 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127 rcu_start_gp(rsp, flags); /* releases above lock */ 2262 rcu_start_gp(rsp);
2263 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 } else { 2264 } else {
2129 local_irq_restore(flags); 2265 local_irq_restore(flags);
2130 } 2266 }
@@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2169 2305
2170static void invoke_rcu_core(void) 2306static void invoke_rcu_core(void)
2171{ 2307{
2172 raise_softirq(RCU_SOFTIRQ); 2308 if (cpu_online(smp_processor_id()))
2309 raise_softirq(RCU_SOFTIRQ);
2173} 2310}
2174 2311
2175/* 2312/*
@@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2204 2341
2205 /* Start a new grace period if one not already started. */ 2342 /* Start a new grace period if one not already started. */
2206 if (!rcu_gp_in_progress(rsp)) { 2343 if (!rcu_gp_in_progress(rsp)) {
2207 unsigned long nestflag;
2208 struct rcu_node *rnp_root = rcu_get_root(rsp); 2344 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2345
2210 raw_spin_lock_irqsave(&rnp_root->lock, nestflag); 2346 raw_spin_lock(&rnp_root->lock);
2211 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ 2347 rcu_start_gp(rsp);
2348 raw_spin_unlock(&rnp_root->lock);
2212 } else { 2349 } else {
2213 /* Give the grace period a kick. */ 2350 /* Give the grace period a kick. */
2214 rdp->blimit = LONG_MAX; 2351 rdp->blimit = LONG_MAX;
@@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu)
2628} 2765}
2629 2766
2630/* 2767/*
2631 * Check to see if any future RCU-related work will need to be done 2768 * Return true if the specified CPU has any callback. If all_lazy is
2632 * by the current CPU, even if none need be done immediately, returning 2769 * non-NULL, store an indication of whether all callbacks are lazy.
2633 * 1 if so. 2770 * (If there are no callbacks, all of them are deemed to be lazy.)
2634 */ 2771 */
2635static int rcu_cpu_has_callbacks(int cpu) 2772static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636{ 2773{
2774 bool al = true;
2775 bool hc = false;
2776 struct rcu_data *rdp;
2637 struct rcu_state *rsp; 2777 struct rcu_state *rsp;
2638 2778
2639 /* RCU callbacks either ready or pending? */ 2779 for_each_rcu_flavor(rsp) {
2640 for_each_rcu_flavor(rsp) 2780 rdp = per_cpu_ptr(rsp->rda, cpu);
2641 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 2781 if (rdp->qlen != rdp->qlen_lazy)
2642 return 1; 2782 al = false;
2643 return 0; 2783 if (rdp->nxtlist)
2784 hc = true;
2785 }
2786 if (all_lazy)
2787 *all_lazy = al;
2788 return hc;
2644} 2789}
2645 2790
2646/* 2791/*
@@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2859 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3004 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 atomic_set(&rdp->dynticks->dynticks, 3005 atomic_set(&rdp->dynticks->dynticks,
2861 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3006 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862 rcu_prepare_for_idle_init(cpu);
2863 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3007 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3008
2865 /* Add CPU to rcu_node bitmasks. */ 3009 /* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3053 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 struct rcu_node *rnp = rdp->mynode; 3054 struct rcu_node *rnp = rdp->mynode;
2911 struct rcu_state *rsp; 3055 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2913 3056
2914 trace_rcu_utilization("Start CPU hotplug"); 3057 trace_rcu_utilization("Start CPU hotplug");
2915 switch (action) { 3058 switch (action) {
@@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2923 rcu_boost_kthread_setaffinity(rnp, -1); 3066 rcu_boost_kthread_setaffinity(rnp, -1);
2924 break; 3067 break;
2925 case CPU_DOWN_PREPARE: 3068 case CPU_DOWN_PREPARE:
2926 if (nocb_cpu_expendable(cpu)) 3069 rcu_boost_kthread_setaffinity(rnp, cpu);
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2930 break; 3070 break;
2931 case CPU_DYING: 3071 case CPU_DYING:
2932 case CPU_DYING_FROZEN: 3072 case CPU_DYING_FROZEN:
2933 /*
2934 * The whole machine is "stopped" except this CPU, so we can
2935 * touch any data without introducing corruption. We send the
2936 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937 */
2938 for_each_rcu_flavor(rsp) 3073 for_each_rcu_flavor(rsp)
2939 rcu_cleanup_dying_cpu(rsp); 3074 rcu_cleanup_dying_cpu(rsp);
2940 rcu_cleanup_after_idle(cpu);
2941 break; 3075 break;
2942 case CPU_DEAD: 3076 case CPU_DEAD:
2943 case CPU_DEAD_FROZEN: 3077 case CPU_DEAD_FROZEN:
@@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2950 break; 3084 break;
2951 } 3085 }
2952 trace_rcu_utilization("End CPU hotplug"); 3086 trace_rcu_utilization("End CPU hotplug");
2953 return ret; 3087 return NOTIFY_OK;
2954} 3088}
2955 3089
2956/* 3090/*
@@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3085 } 3219 }
3086 rnp->level = i; 3220 rnp->level = i;
3087 INIT_LIST_HEAD(&rnp->blkd_tasks); 3221 INIT_LIST_HEAD(&rnp->blkd_tasks);
3222 rcu_init_one_nocb(rnp);
3088 } 3223 }
3089 } 3224 }
3090 3225
@@ -3170,8 +3305,7 @@ void __init rcu_init(void)
3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3305 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3306 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 __rcu_init_preempt(); 3307 __rcu_init_preempt();
3173 rcu_init_nocb(); 3308 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3309
3176 /* 3310 /*
3177 * We don't need protection against CPU-hotplug here because 3311 * We don't need protection against CPU-hotplug here because
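The ->need_future_gp[] bookkeeping introduced above keys requests by the bottom bit of the grace-period number, so requests for the grace period that is just completing and for the one after it can be tracked at the same time. A toy model of the indexing used by rcu_start_future_gp() and rcu_future_gp_cleanup(); the names and types here are illustrative only, not kernel API:

    struct toy_rnp {
        unsigned long completed;     /* number of the last completed GP */
        int need_future_gp[2];       /* requests for GP "c" live in slot c & 0x1 */
    };

    static void toy_request_gp(struct toy_rnp *rnp, unsigned long c)
    {
        rnp->need_future_gp[c & 0x1]++;            /* record a request for GP "c" */
    }

    static int toy_cleanup_gp(struct toy_rnp *rnp)
    {
        unsigned long c = rnp->completed;          /* GP "c" has just ended */

        rnp->need_future_gp[c & 0x1] = 0;          /* its requests are now satisfied */
        return rnp->need_future_gp[(c + 1) & 0x1]; /* anything already queued for c + 1? */
    }

Two slots suffice because grace periods complete in order: by the time a request for grace period c + 2 can be recorded, the slot that held the requests for c has already been zeroed by the cleanup pass.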
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c896b5045d9d..14ee40795d6f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,18 +88,13 @@ struct rcu_dynticks {
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 atomic_t dynticks; /* Even value for idle, else odd. */ 89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ 90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */ 91 bool all_lazy; /* Are all CPU's CBs lazy? */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted; 92 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */ 93 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap; 94 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */ 95 /* idle-period nonlazy_posted snapshot. */
96 unsigned long last_accelerate;
97 /* Last jiffy CBs were accelerated. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 99#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105}; 100};
@@ -134,9 +129,6 @@ struct rcu_node {
134 /* elements that need to drain to allow the */ 129 /* elements that need to drain to allow the */
135 /* current expedited grace period to */ 130 /* current expedited grace period to */
136 /* complete (only for TREE_PREEMPT_RCU). */ 131 /* complete (only for TREE_PREEMPT_RCU). */
137 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138 /* Since this has meaning only for leaf */
139 /* rcu_node structures, 32 bits suffices. */
140 unsigned long qsmaskinit; 132 unsigned long qsmaskinit;
141 /* Per-GP initial value for qsmask & expmask. */ 133 /* Per-GP initial value for qsmask & expmask. */
142 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 188 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 189 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 190#endif /* #ifdef CONFIG_RCU_BOOST */
191#ifdef CONFIG_RCU_NOCB_CPU
192 wait_queue_head_t nocb_gp_wq[2];
193 /* Place for rcu_nocb_kthread() to wait GP. */
194#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
195 int need_future_gp[2];
196 /* Counts of upcoming no-CB GP requests. */
199 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200} ____cacheline_internodealigned_in_smp; 198} ____cacheline_internodealigned_in_smp;
201 199
@@ -328,6 +326,11 @@ struct rcu_data {
328 struct task_struct *nocb_kthread; 326 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 327#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328
329 /* 8) RCU CPU stall data. */
330#ifdef CONFIG_RCU_CPU_STALL_INFO
331 unsigned int softirq_snap; /* Snapshot of softirq activity. */
332#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
333
331 int cpu; 334 int cpu;
332 struct rcu_state *rsp; 335 struct rcu_state *rsp;
333}; 336};
@@ -375,12 +378,6 @@ struct rcu_state {
375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 void (*func)(struct rcu_head *head)); 380 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381
385 /* The following fields are guarded by the root rcu_node's lock. */ 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383
@@ -443,6 +440,7 @@ struct rcu_state {
443 unsigned long gp_max; /* Maximum GP duration in */ 440 unsigned long gp_max; /* Maximum GP duration in */
444 /* jiffies. */ 441 /* jiffies. */
445 char *name; /* Name of structure. */ 442 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */
446 struct list_head flavors; /* List of RCU flavors. */ 444 struct list_head flavors; /* List of RCU flavors. */
447}; 445};
448 446
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 518 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 519#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 520static void __cpuinit rcu_prepare_kthreads(int cpu);
523static void rcu_prepare_for_idle_init(int cpu);
524static void rcu_cleanup_after_idle(int cpu); 521static void rcu_cleanup_after_idle(int cpu);
525static void rcu_prepare_for_idle(int cpu); 522static void rcu_prepare_for_idle(int cpu);
526static void rcu_idle_count_callbacks_posted(void); 523static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
529static void print_cpu_stall_info_end(void); 526static void print_cpu_stall_info_end(void);
530static void zero_cpu_stall_ticks(struct rcu_data *rdp); 527static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531static void increment_cpu_stall_ticks(void); 528static void increment_cpu_stall_ticks(void);
529static int rcu_nocb_needs_gp(struct rcu_state *rsp);
530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
532static void rcu_init_one_nocb(struct rcu_node *rnp);
532static bool is_nocb_cpu(int cpu); 533static bool is_nocb_cpu(int cpu);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 534static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy); 535 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 536static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp); 537 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp); 540static bool init_nocb_callback_list(struct rcu_data *rdp);
541static void __init rcu_init_nocb(void);
542 541
543#endif /* #ifndef RCU_TREE_NONCORE */ 542#endif /* #ifndef RCU_TREE_NONCORE */
544 543
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..d084ae3f281c 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -85,11 +85,21 @@ static void __init rcu_bootup_announce_oddness(void)
85 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU 87#ifdef CONFIG_RCU_NOCB_CPU
88#ifndef CONFIG_RCU_NOCB_CPU_NONE
89 if (!have_rcu_nocb_mask) {
90 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
91 have_rcu_nocb_mask = true;
92 }
93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
94 pr_info("\tExperimental no-CBs CPU 0\n");
95 cpumask_set_cpu(0, rcu_nocb_mask);
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tExperimental no-CBs for all CPUs\n");
99 cpumask_setall(rcu_nocb_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 104 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll) 105 if (rcu_nocb_poll)
@@ -101,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void)
101#ifdef CONFIG_TREE_PREEMPT_RCU 111#ifdef CONFIG_TREE_PREEMPT_RCU
102 112
103struct rcu_state rcu_preempt_state = 113struct rcu_state rcu_preempt_state =
104 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); 114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106static struct rcu_state *rcu_state = &rcu_preempt_state; 116static struct rcu_state *rcu_state = &rcu_preempt_state;
107 117
@@ -1533,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1533int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1543int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534{ 1544{
1535 *delta_jiffies = ULONG_MAX; 1545 *delta_jiffies = ULONG_MAX;
1536 return rcu_cpu_has_callbacks(cpu); 1546 return rcu_cpu_has_callbacks(cpu, NULL);
1537}
1538
1539/*
1540 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541 */
1542static void rcu_prepare_for_idle_init(int cpu)
1543{
1544} 1547}
1545 1548
1546/* 1549/*
@@ -1577,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void)
1577 * 1580 *
1578 * The following three preprocessor symbols control this state machine: 1581 *
1579 * 1582 *
1580 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581 * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582 * scheduling-clock interrupt than to loop through the state machine
1583 * at full power.
1584 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585 * optional if RCU does not need anything immediately from this
1586 * CPU, even if this CPU still has RCU callbacks queued. The first
1587 * times through the state machine are mandatory: we need to give
1588 * the state machine a chance to communicate a quiescent state
1589 * to the RCU core.
1590 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1583 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1584 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 * is sized to be roughly one RCU grace period. Those energy-efficiency 1585 * is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1595,108 @@ static void rcu_idle_count_callbacks_posted(void)
1602 * adjustment, they can be converted into kernel config parameters, though 1595 * adjustment, they can be converted into kernel config parameters, though
1603 * making the state machine smarter might be a better option. 1596 * making the state machine smarter might be a better option.
1604 */ 1597 */
1605#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1598#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1599#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1600
1610extern int tick_nohz_enabled; 1601static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1611 1602module_param(rcu_idle_gp_delay, int, 0644);
1612/* 1603static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1613 * Does the specified flavor of RCU have non-lazy callbacks pending on 1604module_param(rcu_idle_lazy_gp_delay, int, 0644);
1614 * the specified CPU? Both RCU flavor and CPU are specified by the
1615 * rcu_data structure.
1616 */
1617static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
1618{
1619 return rdp->qlen != rdp->qlen_lazy;
1620}
1621 1605
1622#ifdef CONFIG_TREE_PREEMPT_RCU 1606extern int tick_nohz_enabled;
1623 1607
1624/* 1608/*
1625 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1609 * Try to advance callbacks for all flavors of RCU on the current CPU.
1626 * is no RCU-preempt in the kernel.) 1610 * Afterwards, if there are any callbacks ready for immediate invocation,
1611 * return true.
1627 */ 1612 */
1628static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1613static bool rcu_try_advance_all_cbs(void)
1629{ 1614{
1630 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1615 bool cbs_ready = false;
1631 1616 struct rcu_data *rdp;
1632 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1617 struct rcu_node *rnp;
1633} 1618 struct rcu_state *rsp;
1634
1635#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636 1619
1637static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1620 for_each_rcu_flavor(rsp) {
1638{ 1621 rdp = this_cpu_ptr(rsp->rda);
1639 return 0; 1622 rnp = rdp->mynode;
1640}
1641 1623
1642#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1624 /*
1625 * Don't bother checking unless a grace period has
1626 * completed since we last checked and there are
1627 * callbacks not yet ready to invoke.
1628 */
1629 if (rdp->completed != rnp->completed &&
1630 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1631 rcu_process_gp_end(rsp, rdp);
1643 1632
1644/* 1633 if (cpu_has_callbacks_ready_to_invoke(rdp))
1645 * Does any flavor of RCU have non-lazy callbacks on the specified CPU? 1634 cbs_ready = true;
1646 */ 1635 }
1647static bool rcu_cpu_has_nonlazy_callbacks(int cpu) 1636 return cbs_ready;
1648{
1649 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652} 1637}
1653 1638
1654/* 1639/*
1655 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1640 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1656 * callbacks on this CPU, (2) this CPU has not yet attempted to enter 1641 * to invoke. If the CPU has callbacks, try to advance them. Tell the
1657 * dyntick-idle mode, or (3) this CPU is in the process of attempting to 1642 * caller to set the timeout based on whether or not there are non-lazy
1658 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed 1643 * callbacks.
1659 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660 * it is better to incur scheduling-clock interrupts than to spin
1661 * continuously for the same time duration!
1662 * 1644 *
1663 * The delta_jiffies argument is used to store the time when RCU is 1645 * The caller must have disabled interrupts.
1664 * going to need the CPU again if it still has callbacks. The reason
1665 * for this is that rcu_prepare_for_idle() might need to post a timer,
1666 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667 * the wakeup time for this CPU. This means that RCU's timer can be
1668 * delayed until the wakeup time, which defeats the purpose of posting
1669 * a timer.
1670 */ 1646 */
1671int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1647int rcu_needs_cpu(int cpu, unsigned long *dj)
1672{ 1648{
1673 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1649 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1650
1675 /* Flag a new idle sojourn to the idle-entry state machine. */ 1651 /* Snapshot to detect later posting of non-lazy callback. */
1676 rdtp->idle_first_pass = 1; 1652 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1653
1677 /* If no callbacks, RCU doesn't need the CPU. */ 1654 /* If no callbacks, RCU doesn't need the CPU. */
1678 if (!rcu_cpu_has_callbacks(cpu)) { 1655 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1679 *delta_jiffies = ULONG_MAX; 1656 *dj = ULONG_MAX;
1680 return 0; 1657 return 0;
1681 } 1658 }
1682 if (rdtp->dyntick_holdoff == jiffies) { 1659
1683 /* RCU recently tried and failed, so don't try again. */ 1660 /* Attempt to advance callbacks. */
1684 *delta_jiffies = 1; 1661 if (rcu_try_advance_all_cbs()) {
1662 /* Some ready to invoke, so initiate later invocation. */
1663 invoke_rcu_core();
1685 return 1; 1664 return 1;
1686 } 1665 }
1687 /* Set up for the possibility that RCU will post a timer. */ 1666 rdtp->last_accelerate = jiffies;
1688 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1667
1689 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, 1668 /* Request timer delay depending on laziness, and round. */
1690 RCU_IDLE_GP_DELAY) - jiffies; 1669 if (!rdtp->all_lazy) {
1670 *dj = round_up(rcu_idle_gp_delay + jiffies,
1671 rcu_idle_gp_delay) - jiffies;
1691 } else { 1672 } else {
1692 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; 1673 *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1693 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1694 } 1674 }
1695 return 0; 1675 return 0;
1696} 1676}
1697 1677
1698/* 1678/*
1699 * Handler for smp_call_function_single(). The only point of this 1679 * Prepare a CPU for idle from an RCU perspective. The first major task
1700 * handler is to wake the CPU up, so the handler does only tracing. 1680 * is to sense whether nohz mode has been enabled or disabled via sysfs.
1701 */ 1681 * The second major task is to check to see if a non-lazy callback has
1702void rcu_idle_demigrate(void *unused) 1682 * arrived at a CPU that previously had only lazy callbacks. The third
1703{ 1683 * major task is to accelerate (that is, assign grace-period numbers to)
1704 trace_rcu_prep_idle("Demigrate"); 1684 * any recently arrived callbacks.
1705}
1706
1707/*
1708 * Timer handler used to force CPU to start pushing its remaining RCU
1709 * callbacks in the case where it entered dyntick-idle mode with callbacks
1710 * pending. The hander doesn't really need to do anything because the
1711 * real work is done upon re-entry to idle, or by the next scheduling-clock
1712 * interrupt should idle not be re-entered.
1713 *
1714 * One special case: the timer gets migrated without awakening the CPU
1715 * on which the timer was scheduled on. In this case, we must wake up
1716 * that CPU. We do so with smp_call_function_single().
1717 */
1718static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719{
1720 int cpu = (int)cpu_in;
1721
1722 trace_rcu_prep_idle("Timer");
1723 if (cpu != smp_processor_id())
1724 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725 else
1726 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727}
1728
1729/*
1730 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731 */
1732static void rcu_prepare_for_idle_init(int cpu)
1733{
1734 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735
1736 rdtp->dyntick_holdoff = jiffies - 1;
1737 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738 rdtp->idle_gp_timer_expires = jiffies - 1;
1739 rdtp->idle_first_pass = 1;
1740}
1741
1742/*
1743 * Clean up for exit from idle. Because we are exiting from idle, there
1744 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745 * do nothing if this timer is not active, so just cancel it unconditionally.
1746 */
1747static void rcu_cleanup_after_idle(int cpu)
1748{
1749 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750
1751 del_timer(&rdtp->idle_gp_timer);
1752 trace_rcu_prep_idle("Cleanup after idle");
1753 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754}
1755
1756/*
1757 * Check to see if any RCU-related work can be done by the current CPU,
1758 * and if so, schedule a softirq to get it done. This function is part
1759 * of the RCU implementation; it is -not- an exported member of the RCU API.
1760 *
1761 * The idea is for the current CPU to clear out all work required by the
1762 * RCU core for the current grace period, so that this CPU can be permitted
1763 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764 * at the end of the grace period by whatever CPU ends the grace period.
1765 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766 * number of wakeups by a modest integer factor.
1767 *
1768 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769 * disabled, we do one pass of force_quiescent_state(), then do a
1770 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771 * later. The ->dyntick_drain field controls the sequencing.
1772 * 1685 *
1773 * The caller must have disabled interrupts. 1686 * The caller must have disabled interrupts.
1774 */ 1687 */
1775static void rcu_prepare_for_idle(int cpu) 1688static void rcu_prepare_for_idle(int cpu)
1776{ 1689{
1777 struct timer_list *tp; 1690 struct rcu_data *rdp;
1778 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1691 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1692 struct rcu_node *rnp;
1693 struct rcu_state *rsp;
1779 int tne; 1694 int tne;
1780 1695
1781 /* Handle nohz enablement switches conservatively. */ 1696 /* Handle nohz enablement switches conservatively. */
1782 tne = ACCESS_ONCE(tick_nohz_enabled); 1697 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 if (tne != rdtp->tick_nohz_enabled_snap) { 1698 if (tne != rdtp->tick_nohz_enabled_snap) {
1784 if (rcu_cpu_has_callbacks(cpu)) 1699 if (rcu_cpu_has_callbacks(cpu, NULL))
1785 invoke_rcu_core(); /* force nohz to see update. */ 1700 invoke_rcu_core(); /* force nohz to see update. */
1786 rdtp->tick_nohz_enabled_snap = tne; 1701 rdtp->tick_nohz_enabled_snap = tne;
1787 return; 1702 return;
@@ -1789,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu)
1789 if (!tne) 1704 if (!tne)
1790 return; 1705 return;
1791 1706
1792 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1707 /* If this is a no-CBs CPU, no callbacks, just return. */
1793 if (!is_idle_task(current)) { 1708 if (is_nocb_cpu(cpu))
1794 rdtp->dyntick_holdoff = jiffies - 1;
1795 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796 trace_rcu_prep_idle("User dyntick with callbacks");
1797 rdtp->idle_gp_timer_expires =
1798 round_up(jiffies + RCU_IDLE_GP_DELAY,
1799 RCU_IDLE_GP_DELAY);
1800 } else if (rcu_cpu_has_callbacks(cpu)) {
1801 rdtp->idle_gp_timer_expires =
1802 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804 } else {
1805 return;
1806 }
1807 tp = &rdtp->idle_gp_timer;
1808 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1809 return; 1709 return;
1810 }
1811 1710
1812 /* 1711 /*
1813 * If this is an idle re-entry, for example, due to use of 1712 * If a non-lazy callback arrived at a CPU having only lazy
1814 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1713 * callbacks, invoke RCU core for the side-effect of recalculating
1815 * loop, then don't take any state-machine actions, unless the 1714 * idle duration on re-entry to idle.
1816 * momentary exit from idle queued additional non-lazy callbacks.
1817 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818 * pending.
1819 */ 1715 */
1820 if (!rdtp->idle_first_pass && 1716 if (rdtp->all_lazy &&
1821 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { 1717 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1822 if (rcu_cpu_has_callbacks(cpu)) { 1718 invoke_rcu_core();
1823 tp = &rdtp->idle_gp_timer;
1824 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825 }
1826 return; 1719 return;
1827 } 1720 }
1828 rdtp->idle_first_pass = 0;
1829 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1721
1831 /* 1722 /*
1832 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1723 * If we have not yet accelerated this jiffy, accelerate all
1833 * Also reset state to avoid prejudicing later attempts. 1724 * callbacks on this CPU.
1834 */ 1725 */
1835 if (!rcu_cpu_has_callbacks(cpu)) { 1726 if (rdtp->last_accelerate == jiffies)
1836 rdtp->dyntick_holdoff = jiffies - 1;
1837 rdtp->dyntick_drain = 0;
1838 trace_rcu_prep_idle("No callbacks");
1839 return; 1727 return;
1728 rdtp->last_accelerate = jiffies;
1729 for_each_rcu_flavor(rsp) {
1730 rdp = per_cpu_ptr(rsp->rda, cpu);
1731 if (!*rdp->nxttail[RCU_DONE_TAIL])
1732 continue;
1733 rnp = rdp->mynode;
1734 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1735 rcu_accelerate_cbs(rsp, rnp, rdp);
1736 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 } 1737 }
1738}
1841 1739
1842 /* 1740/*
1843 * If in holdoff mode, just return. We will presumably have 1741 * Clean up for exit from idle. Attempt to advance callbacks based on
1844 * refrained from disabling the scheduling-clock tick. 1742 * any grace periods that elapsed while the CPU was idle, and if any
1845 */ 1743 * callbacks are now ready to invoke, initiate invocation.
1846 if (rdtp->dyntick_holdoff == jiffies) { 1744 */
1847 trace_rcu_prep_idle("In holdoff"); 1745static void rcu_cleanup_after_idle(int cpu)
1848 return; 1746{
1849 } 1747 struct rcu_data *rdp;
1748 struct rcu_state *rsp;
1850 1749
1851 /* Check and update the ->dyntick_drain sequencing. */ 1750 if (is_nocb_cpu(cpu))
1852 if (rdtp->dyntick_drain <= 0) {
1853 /* First time through, initialize the counter. */
1854 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856 !rcu_pending(cpu) &&
1857 !local_softirq_pending()) {
1858 /* Can we go dyntick-idle despite still having callbacks? */
1859 rdtp->dyntick_drain = 0;
1860 rdtp->dyntick_holdoff = jiffies;
1861 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862 trace_rcu_prep_idle("Dyntick with callbacks");
1863 rdtp->idle_gp_timer_expires =
1864 round_up(jiffies + RCU_IDLE_GP_DELAY,
1865 RCU_IDLE_GP_DELAY);
1866 } else {
1867 rdtp->idle_gp_timer_expires =
1868 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870 }
1871 tp = &rdtp->idle_gp_timer;
1872 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874 return; /* Nothing more to do immediately. */
1875 } else if (--(rdtp->dyntick_drain) <= 0) {
1876 /* We have hit the limit, so time to give up. */
1877 rdtp->dyntick_holdoff = jiffies;
1878 trace_rcu_prep_idle("Begin holdoff");
1879 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1880 return; 1751 return;
1881 } 1752 rcu_try_advance_all_cbs();
1882 1753 for_each_rcu_flavor(rsp) {
1883 /* 1754 rdp = per_cpu_ptr(rsp->rda, cpu);
1884 * Do one step of pushing the remaining RCU callbacks through 1755 if (cpu_has_callbacks_ready_to_invoke(rdp))
1885 * the RCU core state machine. 1756 invoke_rcu_core();
1886 */
1887#ifdef CONFIG_TREE_PREEMPT_RCU
1888 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889 rcu_preempt_qs(cpu);
1890 force_quiescent_state(&rcu_preempt_state);
1891 }
1892#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894 rcu_sched_qs(cpu);
1895 force_quiescent_state(&rcu_sched_state);
1896 }
1897 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898 rcu_bh_qs(cpu);
1899 force_quiescent_state(&rcu_bh_state);
1900 }
1901
1902 /*
1903 * If RCU callbacks are still pending, RCU still needs this CPU.
1904 * So try forcing the callbacks through the grace period.
1905 */
1906 if (rcu_cpu_has_callbacks(cpu)) {
1907 trace_rcu_prep_idle("More callbacks");
1908 invoke_rcu_core();
1909 } else {
1910 trace_rcu_prep_idle("Callbacks drained");
1911 } 1757 }
1912} 1758}
1913 1759
@@ -2015,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier);
2015static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1861static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016{ 1862{
2017 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1863 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018 struct timer_list *tltp = &rdtp->idle_gp_timer; 1864 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2019 char c;
2020 1865
2021 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; 1866 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
2022 if (timer_pending(tltp)) 1867 rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
2023 sprintf(cp, "drain=%d %c timer=%lu", 1868 ulong2long(nlpd),
2024 rdtp->dyntick_drain, c, tltp->expires - jiffies); 1869 rdtp->all_lazy ? 'L' : '.',
2025 else 1870 rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2026 sprintf(cp, "drain=%d %c timer not pending",
2027 rdtp->dyntick_drain, c);
2028} 1871}
2029 1872
2030#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1873#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1913,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2070 ticks_value = rsp->gpnum - rdp->gpnum; 1913 ticks_value = rsp->gpnum - rdp->gpnum;
2071 } 1914 }
2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1915 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", 1916 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 cpu, ticks_value, ticks_title, 1917 cpu, ticks_value, ticks_title,
2075 atomic_read(&rdtp->dynticks) & 0xfff, 1918 atomic_read(&rdtp->dynticks) & 0xfff,
2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1919 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1920 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 fast_no_hz); 1921 fast_no_hz);
2078} 1922}
2079 1923
@@ -2087,6 +1931,7 @@ static void print_cpu_stall_info_end(void)
2087static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1931static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088{ 1932{
2089 rdp->ticks_this_gp = 0; 1933 rdp->ticks_this_gp = 0;
1934 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090} 1935}
2091 1936
2092/* Increment ->ticks_this_gp for all flavors of RCU. */ 1937/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,6 +2010,47 @@ static int __init parse_rcu_nocb_poll(char *arg)
2165} 2010}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2011early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2012
2013/*
2014 * Do any no-CBs CPUs need another grace period?
2015 *
2016 * Interrupts must be disabled. If the caller does not hold the root
2017 * rcu_node structure's ->lock, the results are advisory only.
2018 */
2019static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2020{
2021 struct rcu_node *rnp = rcu_get_root(rsp);
2022
2023 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2024}
2025
2026/*
2027 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2028 * grace period.
2029 */
2030static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2031{
2032 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2033}
2034
2035/*
2036 * Set the root rcu_node structure's ->need_future_gp field
2037 * based on the sum of those of all rcu_node structures. This does
2038 * double-count the root rcu_node structure's requests, but this
2039 * is necessary to handle the possibility of a rcu_nocb_kthread()
2040 * having awakened during the time that the rcu_node structures
2041 * were being updated for the end of the previous grace period.
2042 */
2043static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2044{
2045 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2046}
2047
2048static void rcu_init_one_nocb(struct rcu_node *rnp)
2049{
2050 init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2051 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2052}
2053
2168/* Is the specified CPU a no-CBs CPU? */ 2054/* Is the specified CPU a no-CBs CPU? */
2169static bool is_nocb_cpu(int cpu) 2055static bool is_nocb_cpu(int cpu)
2170{ 2056{
@@ -2227,6 +2113,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2227 if (!is_nocb_cpu(rdp->cpu)) 2113 if (!is_nocb_cpu(rdp->cpu))
2228 return 0; 2114 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2115 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2116 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2117 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2118 (unsigned long)rhp->func,
2119 rdp->qlen_lazy, rdp->qlen);
2120 else
2121 trace_rcu_callback(rdp->rsp->name, rhp,
2122 rdp->qlen_lazy, rdp->qlen);
2230 return 1; 2123 return 1;
2231} 2124}
2232 2125
@@ -2265,95 +2158,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2265} 2158}
2266 2159
2267/* 2160/*
2268 * There must be at least one non-no-CBs CPU in operation at any given 2161 * If necessary, kick off a new grace period, and either way wait
2269 * time, because no-CBs CPUs are not capable of initiating grace periods 2162 * for a subsequent grace period to complete.
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */ 2163 */
2275static bool nocb_cpu_expendable(int cpu) 2164static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276{ 2165{
2277 cpumask_var_t non_nocb_cpus; 2166 unsigned long c;
2278 int ret; 2167 bool d;
2168 unsigned long flags;
2169 struct rcu_node *rnp = rdp->mynode;
2170
2171 raw_spin_lock_irqsave(&rnp->lock, flags);
2172 c = rcu_start_future_gp(rnp, rdp);
2173 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2279 2174
2280 /* 2175 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, 2176 * Wait for the grace period. Do so interruptibly to avoid messing
2282 * then offlining this CPU is harmless. Let it happen. 2177 * up the load average.
2283 */ 2178 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) 2179 trace_rcu_future_gp(rnp, rdp, c, "StartWait");
2285 return 1; 2180 for (;;) {
2286 2181 wait_event_interruptible(
2287 /* If no memory, play it safe and keep the CPU around. */ 2182 rnp->nocb_gp_wq[c & 0x1],
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) 2183 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2289 return 0; 2184 if (likely(d))
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); 2185 break;
2291 cpumask_clear_cpu(cpu, non_nocb_cpus); 2186 flush_signals(current);
2292 ret = !cpumask_empty(non_nocb_cpus); 2187 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
2293 free_cpumask_var(non_nocb_cpus); 2188 }
2294 return ret; 2189 trace_rcu_future_gp(rnp, rdp, c, "EndWait");
2295} 2190 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and the invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357} 2191}
2358 2192
2359/* 2193/*
@@ -2390,7 +2224,7 @@ static int rcu_nocb_kthread(void *arg)
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2224 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c; 2225 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; 2226 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote); 2227 rcu_nocb_wait_gp(rdp);
2394 2228
2395 /* Each pass through the following loop invokes a callback. */ 2229 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2230 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,32 +2270,41 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2436 return; 2270 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) { 2271 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu); 2272 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); 2273 t = kthread_run(rcu_nocb_kthread, rdp,
2274 "rcuo%c/%d", rsp->abbr, cpu);
2440 BUG_ON(IS_ERR(t)); 2275 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t; 2276 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 } 2277 }
2443} 2278}
2444 2279
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2280/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp) 2281static bool init_nocb_callback_list(struct rcu_data *rdp)
2447{ 2282{
2448 if (rcu_nocb_mask == NULL || 2283 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) 2284 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return; 2285 return false;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2286 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2287 return true;
2288}
2289
2290#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2291
2292static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2293{
2294 return 0;
2452} 2295}
2453 2296
2454/* Initialize the ->call_remote fields in the rcu_state structures. */ 2297static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2455static void __init rcu_init_nocb(void)
2456{ 2298{
2457#ifdef CONFIG_PREEMPT_RCU
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462} 2299}
2463 2300
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2301static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2302{
2303}
2304
2305static void rcu_init_one_nocb(struct rcu_node *rnp)
2306{
2307}
2465 2308
2466static bool is_nocb_cpu(int cpu) 2309static bool is_nocb_cpu(int cpu)
2467{ 2310{
@@ -2480,11 +2323,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2480 return 0; 2323 return 0;
2481} 2324}
2482 2325
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2326static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{ 2327{
2490} 2328}
@@ -2493,12 +2331,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{ 2331{
2494} 2332}
2495 2333
2496static void init_nocb_callback_list(struct rcu_data *rdp) 2334static bool init_nocb_callback_list(struct rcu_data *rdp)
2497{
2498}
2499
2500static void __init rcu_init_nocb(void)
2501{ 2335{
2336 return false;
2502} 2337}
2503 2338
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2339#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
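With CONFIG_RCU_NOCB_CPU disabled, the #else branch above supplies empty or constant-returning stubs with the same signatures, so the callers in rcutree.c stay free of #ifdefs. A minimal illustration of this compile-time stub pattern (hypothetical feature name, not kernel code):

#include <stdio.h>

/* #define CONFIG_FEATURE_X 1 */	/* flip this to build the real code */

#ifdef CONFIG_FEATURE_X

static int feature_x_needs_work(void)
{
	return 1;			/* real implementation would live here */
}

#else /* #ifdef CONFIG_FEATURE_X */

/* Stub keeps callers free of #ifdef clutter when the feature is off. */
static int feature_x_needs_work(void)
{
	return 0;
}

#endif /* #else #ifdef CONFIG_FEATURE_X */

int main(void)
{
	printf("needs work: %d\n", feature_x_needs_work());
	return 0;
}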
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa670..49099e81c87b 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op) 50 const struct seq_operations *op)
53{ 51{
diff --git a/kernel/relay.c b/kernel/relay.c
index 01ab081ac53a..eef0d113b79e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
588 chan->version = RELAYFS_CHANNEL_VERSION; 588 chan->version = RELAYFS_CHANNEL_VERSION;
589 chan->n_subbufs = n_subbufs; 589 chan->n_subbufs = n_subbufs;
590 chan->subbuf_size = subbuf_size; 590 chan->subbuf_size = subbuf_size;
591 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 591 chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
592 chan->parent = parent; 592 chan->parent = parent;
593 chan->private_data = private_data; 593 chan->private_data = private_data;
594 if (base_filename) { 594 if (base_filename) {
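relay_open() now rounds the total buffer size with the generic PAGE_ALIGN() instead of the local FIX_SIZE() macro. For a power-of-two page size this is the usual round-up-and-mask trick; a standalone sketch (the 4096-byte page size and the macro body are illustrative assumptions, not taken from this patch):

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* illustrative; the kernel's is per-arch */
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long subbuf_size = 7000, n_subbufs = 4;

	/* 28000 bytes rounded up to the next page multiple: 28672 */
	printf("%lu\n", PAGE_ALIGN(subbuf_size * n_subbufs));
	return 0;
}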
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
1099static int subbuf_read_actor(size_t read_start, 1099static int subbuf_read_actor(size_t read_start,
1100 struct rchan_buf *buf, 1100 struct rchan_buf *buf,
1101 size_t avail, 1101 size_t avail,
1102 read_descriptor_t *desc, 1102 read_descriptor_t *desc)
1103 read_actor_t actor)
1104{ 1103{
1105 void *from; 1104 void *from;
1106 int ret = 0; 1105 int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
1121typedef int (*subbuf_actor_t) (size_t read_start, 1120typedef int (*subbuf_actor_t) (size_t read_start,
1122 struct rchan_buf *buf, 1121 struct rchan_buf *buf,
1123 size_t avail, 1122 size_t avail,
1124 read_descriptor_t *desc, 1123 read_descriptor_t *desc);
1125 read_actor_t actor);
1126 1124
1127/* 1125/*
1128 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 1126 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
1129 */ 1127 */
1130static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, 1128static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1131 subbuf_actor_t subbuf_actor, 1129 subbuf_actor_t subbuf_actor,
1132 read_actor_t actor,
1133 read_descriptor_t *desc) 1130 read_descriptor_t *desc)
1134{ 1131{
1135 struct rchan_buf *buf = filp->private_data; 1132 struct rchan_buf *buf = filp->private_data;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1150 break; 1147 break;
1151 1148
1152 avail = min(desc->count, avail); 1149 avail = min(desc->count, avail);
1153 ret = subbuf_actor(read_start, buf, avail, desc, actor); 1150 ret = subbuf_actor(read_start, buf, avail, desc);
1154 if (desc->error < 0) 1151 if (desc->error < 0)
1155 break; 1152 break;
1156 1153
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
1174 desc.count = count; 1171 desc.count = count;
1175 desc.arg.buf = buffer; 1172 desc.arg.buf = buffer;
1176 desc.error = 0; 1173 desc.error = 0;
1177 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, 1174 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
1178 NULL, &desc);
1179} 1175}
1180 1176
1181static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1177static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h>
24#include <asm/io.h> 25#include <asm/io.h>
25 26
26 27
@@ -50,6 +51,14 @@ struct resource_constraint {
50 51
51static DEFINE_RWLOCK(resource_lock); 52static DEFINE_RWLOCK(resource_lock);
52 53
54/*
55 * For memory hotplug, there is no way to free resource entries allocated
56 * by boot mem after the system is up. So for reusing the resource entry
57 * we need to remember the resource.
58 */
59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock);
61
53static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54{ 63{
55 struct resource *p = v; 64 struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
151 160
152#endif /* CONFIG_PROC_FS */ 161#endif /* CONFIG_PROC_FS */
153 162
163static void free_resource(struct resource *res)
164{
165 if (!res)
166 return;
167
168 if (!PageSlab(virt_to_head_page(res))) {
169 spin_lock(&bootmem_resource_lock);
170 res->sibling = bootmem_resource_free;
171 bootmem_resource_free = res;
172 spin_unlock(&bootmem_resource_lock);
173 } else {
174 kfree(res);
175 }
176}
177
178static struct resource *alloc_resource(gfp_t flags)
179{
180 struct resource *res = NULL;
181
182 spin_lock(&bootmem_resource_lock);
183 if (bootmem_resource_free) {
184 res = bootmem_resource_free;
185 bootmem_resource_free = res->sibling;
186 }
187 spin_unlock(&bootmem_resource_lock);
188
189 if (res)
190 memset(res, 0, sizeof(struct resource));
191 else
192 res = kzalloc(sizeof(struct resource), flags);
193
194 return res;
195}
196
154/* Return the conflict entry if you can't request it */ 197/* Return the conflict entry if you can't request it */
155static struct resource * __request_resource(struct resource *root, struct resource *new) 198static struct resource * __request_resource(struct resource *root, struct resource *new)
156{ 199{
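alloc_resource() and free_resource() above park entries that did not come from the slab on a free list chained through ->sibling and guarded by bootmem_resource_lock, so boot-time resource entries can be reused after memory hot-remove instead of being leaked or handed to kfree(). A userspace sketch of the same reuse scheme (simplified, single-threaded, hypothetical names):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct entry {
	int from_bootmem;		/* stands in for the PageSlab() test */
	struct entry *next;		/* plays the role of ->sibling */
};

static struct entry *free_list;		/* entries we may not free() */

static void put_entry(struct entry *e)
{
	if (!e)
		return;
	if (e->from_bootmem) {		/* cannot go back to the allocator */
		e->next = free_list;
		free_list = e;
	} else {
		free(e);
	}
}

static struct entry *get_entry(void)
{
	struct entry *e = free_list;

	if (e) {			/* reuse a parked boot-time entry */
		free_list = e->next;
		memset(e, 0, sizeof(*e));
		e->from_bootmem = 1;	/* everything on the list came from bootmem */
	} else {
		e = calloc(1, sizeof(*e));
	}
	return e;
}

int main(void)
{
	static struct entry boot_entry = { .from_bootmem = 1 };

	put_entry(&boot_entry);				/* parked, not freed */
	printf("reused: %d\n", get_entry() == &boot_entry);	/* 1 */
	return 0;
}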
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
706 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
707} 750}
708 751
709/** 752static int __adjust_resource(struct resource *res, resource_size_t start,
710 * adjust_resource - modify a resource's start and size 753 resource_size_t size)
711 * @res: resource to modify
712 * @start: new start value
713 * @size: new size
714 *
715 * Given an existing resource, change its start and size to match the
716 * arguments. Returns 0 on success, -EBUSY if it can't fit.
717 * Existing children of the resource are assumed to be immutable.
718 */
719int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
720{ 754{
721 struct resource *tmp, *parent = res->parent; 755 struct resource *tmp, *parent = res->parent;
722 resource_size_t end = start + size - 1; 756 resource_size_t end = start + size - 1;
723 int result = -EBUSY; 757 int result = -EBUSY;
724 758
725 write_lock(&resource_lock);
726
727 if (!parent) 759 if (!parent)
728 goto skip; 760 goto skip;
729 761
@@ -751,6 +783,26 @@ skip:
751 result = 0; 783 result = 0;
752 784
753 out: 785 out:
786 return result;
787}
788
789/**
790 * adjust_resource - modify a resource's start and size
791 * @res: resource to modify
792 * @start: new start value
793 * @size: new size
794 *
795 * Given an existing resource, change its start and size to match the
796 * arguments. Returns 0 on success, -EBUSY if it can't fit.
797 * Existing children of the resource are assumed to be immutable.
798 */
799int adjust_resource(struct resource *res, resource_size_t start,
800 resource_size_t size)
801{
802 int result;
803
804 write_lock(&resource_lock);
805 result = __adjust_resource(res, start, size);
754 write_unlock(&resource_lock); 806 write_unlock(&resource_lock);
755 return result; 807 return result;
756} 808}
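The hunk above factors adjust_resource() into __adjust_resource(), which expects the caller to hold resource_lock, plus a thin wrapper that takes the lock itself; the unlocked helper is what lets release_mem_region_adjustable() adjust entries while already holding the lock. A generic sketch of this locked-wrapper convention (pthread mutex standing in for the rwlock, hypothetical names):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int table_value;

/* Caller must hold table_lock. */
static int __set_value(int v)
{
	if (v < 0)
		return -1;		/* would be -EBUSY/-EINVAL in the kernel */
	table_value = v;
	return 0;
}

/* Locked wrapper for callers that do not hold the lock yet. */
static int set_value(int v)
{
	int ret;

	pthread_mutex_lock(&table_lock);
	ret = __set_value(v);
	pthread_mutex_unlock(&table_lock);
	return ret;
}

/* A larger operation that already holds the lock calls __set_value()
 * directly, avoiding a deadlock on a second acquire. */
static int set_two_values(int a, int b)
{
	int ret;

	pthread_mutex_lock(&table_lock);
	ret = __set_value(a);
	if (!ret)
		ret = __set_value(b);
	pthread_mutex_unlock(&table_lock);
	return ret;
}

int main(void)
{
	printf("%d %d\n", set_value(1), set_two_values(2, 3));
	return 0;
}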
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
762{ 814{
763 struct resource *parent = root; 815 struct resource *parent = root;
764 struct resource *conflict; 816 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 817 struct resource *res = alloc_resource(GFP_ATOMIC);
766 struct resource *next_res = NULL; 818 struct resource *next_res = NULL;
767 819
768 if (!res) 820 if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
787 /* conflict covered whole area */ 839 /* conflict covered whole area */
788 if (conflict->start <= res->start && 840 if (conflict->start <= res->start &&
789 conflict->end >= res->end) { 841 conflict->end >= res->end) {
790 kfree(res); 842 free_resource(res);
791 WARN_ON(next_res); 843 WARN_ON(next_res);
792 break; 844 break;
793 } 845 }
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
797 end = res->end; 849 end = res->end;
798 res->end = conflict->start - 1; 850 res->end = conflict->start - 1;
799 if (conflict->end < end) { 851 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res), 852 next_res = alloc_resource(GFP_ATOMIC);
801 GFP_ATOMIC);
802 if (!next_res) { 853 if (!next_res) {
803 kfree(res); 854 free_resource(res);
804 break; 855 break;
805 } 856 }
806 next_res->name = name; 857 next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
890 const char *name, int flags) 941 const char *name, int flags)
891{ 942{
892 DECLARE_WAITQUEUE(wait, current); 943 DECLARE_WAITQUEUE(wait, current);
893 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 944 struct resource *res = alloc_resource(GFP_KERNEL);
894 945
895 if (!res) 946 if (!res)
896 return NULL; 947 return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
924 continue; 975 continue;
925 } 976 }
926 /* Uhhuh, that didn't work out.. */ 977 /* Uhhuh, that didn't work out.. */
927 kfree(res); 978 free_resource(res);
928 res = NULL; 979 res = NULL;
929 break; 980 break;
930 } 981 }
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
958 return -EBUSY; 1009 return -EBUSY;
959 1010
960 release_resource(res); 1011 release_resource(res);
961 kfree(res); 1012 free_resource(res);
962 return 0; 1013 return 0;
963} 1014}
964EXPORT_SYMBOL(__check_region); 1015EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
998 write_unlock(&resource_lock); 1049 write_unlock(&resource_lock);
999 if (res->flags & IORESOURCE_MUXED) 1050 if (res->flags & IORESOURCE_MUXED)
1000 wake_up(&muxed_resource_wait); 1051 wake_up(&muxed_resource_wait);
1001 kfree(res); 1052 free_resource(res);
1002 return; 1053 return;
1003 } 1054 }
1004 p = &res->sibling; 1055 p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
1012} 1063}
1013EXPORT_SYMBOL(__release_region); 1064EXPORT_SYMBOL(__release_region);
1014 1065
1066#ifdef CONFIG_MEMORY_HOTREMOVE
1067/**
1068 * release_mem_region_adjustable - release a previously reserved memory region
1069 * @parent: parent resource descriptor
1070 * @start: resource start address
1071 * @size: resource region size
1072 *
1073 * This interface is intended for memory hot-delete. The requested region
1074 * is released from a currently busy memory resource. The requested region
1075 * must either match exactly or fit into a single busy resource entry. In
1076 * the latter case, the remaining resource is adjusted accordingly.
1077 * Existing children of the busy memory resource must be immutable in the
1078 * request.
1079 *
1080 * Note:
1081 * - Additional release conditions, such as overlapping region, can be
1082 * supported after they are confirmed as valid cases.
1083 * - When a busy memory resource gets split into two entries, the code
1084 * assumes that all children remain in the lower address entry for
1085 * simplicity. Enhance this logic when necessary.
1086 */
1087int release_mem_region_adjustable(struct resource *parent,
1088 resource_size_t start, resource_size_t size)
1089{
1090 struct resource **p;
1091 struct resource *res;
1092 struct resource *new_res;
1093 resource_size_t end;
1094 int ret = -EINVAL;
1095
1096 end = start + size - 1;
1097 if ((start < parent->start) || (end > parent->end))
1098 return ret;
1099
1100 /* The alloc_resource() result gets checked later */
1101 new_res = alloc_resource(GFP_KERNEL);
1102
1103 p = &parent->child;
1104 write_lock(&resource_lock);
1105
1106 while ((res = *p)) {
1107 if (res->start >= end)
1108 break;
1109
1110		/* look for the next resource if this one does not fit */
1111 if (res->start > start || res->end < end) {
1112 p = &res->sibling;
1113 continue;
1114 }
1115
1116 if (!(res->flags & IORESOURCE_MEM))
1117 break;
1118
1119 if (!(res->flags & IORESOURCE_BUSY)) {
1120 p = &res->child;
1121 continue;
1122 }
1123
1124 /* found the target resource; let's adjust accordingly */
1125 if (res->start == start && res->end == end) {
1126 /* free the whole entry */
1127 *p = res->sibling;
1128 free_resource(res);
1129 ret = 0;
1130 } else if (res->start == start && res->end != end) {
1131 /* adjust the start */
1132 ret = __adjust_resource(res, end + 1,
1133 res->end - end);
1134 } else if (res->start != start && res->end == end) {
1135 /* adjust the end */
1136 ret = __adjust_resource(res, res->start,
1137 start - res->start);
1138 } else {
1139 /* split into two entries */
1140 if (!new_res) {
1141 ret = -ENOMEM;
1142 break;
1143 }
1144 new_res->name = res->name;
1145 new_res->start = end + 1;
1146 new_res->end = res->end;
1147 new_res->flags = res->flags;
1148 new_res->parent = res->parent;
1149 new_res->sibling = res->sibling;
1150 new_res->child = NULL;
1151
1152 ret = __adjust_resource(res, res->start,
1153 start - res->start);
1154 if (ret)
1155 break;
1156 res->sibling = new_res;
1157 new_res = NULL;
1158 }
1159
1160 break;
1161 }
1162
1163 write_unlock(&resource_lock);
1164 free_resource(new_res);
1165 return ret;
1166}
1167#endif /* CONFIG_MEMORY_HOTREMOVE */
1168
1015/* 1169/*
1016 * Managed region resource 1170 * Managed region resource
1017 */ 1171 */
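Once release_mem_region_adjustable() finds a busy entry that fully covers [start, end], it handles four geometries: an exact match frees the entry, a request flush with either end shrinks the entry from that side, and an interior request splits the entry in two using the pre-allocated new_res. A plain-C model of that case analysis on a single interval (illustrative only; it assumes, as the kernel loop already guarantees, that the entry contains the request):

#include <stdio.h>

struct range { unsigned long start, end; };

/*
 * Remove [start, end] from *res.  Returns the number of ranges left
 * (0, 1, or 2); on a split the upper half is written to *split.
 */
static int remove_range(struct range *res, unsigned long start,
			unsigned long end, struct range *split)
{
	if (start == res->start && end == res->end)
		return 0;				/* exact match: drop it */
	if (start == res->start) {
		res->start = end + 1;			/* trim the front */
		return 1;
	}
	if (end == res->end) {
		res->end = start - 1;			/* trim the back */
		return 1;
	}
	split->start = end + 1;				/* interior: split in two */
	split->end = res->end;
	res->end = start - 1;
	return 2;
}

int main(void)
{
	struct range r = { 0x1000, 0x8fff }, hi;
	int n = remove_range(&r, 0x3000, 0x3fff, &hi);

	printf("%d: [%#lx-%#lx] [%#lx-%#lx]\n", n, r.start, r.end, hi.start, hi.end);
	return 0;
}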
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/stat.h>
17 18
18#include "rtmutex.h" 19#include "rtmutex.h"
19 20
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
366 return curr - buf; 367 return curr - buf;
367} 368}
368 369
369static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); 370static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
370static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); 371static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
371 372
372static struct bus_type rttest_subsys = { 373static struct bus_type rttest_subsys = {
373 .name = "rttest", 374 .name = "rttest",
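Replacing the raw 0600 modes with S_IRUSR and S_IWUSR also narrows them: the read-only status attribute ends up 0400 and the write-only command attribute 0200, each granting only the bit its handler actually needs. A quick userspace check of the constants:

#include <sys/stat.h>
#include <stdio.h>

int main(void)
{
	/* Owner read and owner write permission bits. */
	printf("S_IRUSR=%04o S_IWUSR=%04o combined=%04o\n",
	       S_IRUSR, S_IWUSR, S_IRUSR | S_IWUSR);	/* 0400 0200 0600 */
	return 0;
}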
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67d04651f44b..5662f58f0b69 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
512 * the target CPU. 512 * the target CPU.
513 */ 513 */
514#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
520void resched_task(struct task_struct *p) 515void resched_task(struct task_struct *p)
521{ 516{
522 int cpu; 517 int cpu;
@@ -1288,8 +1283,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1288static void 1283static void
1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1284ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1290{ 1285{
1291 trace_sched_wakeup(p, true);
1292 check_preempt_curr(rq, p, wake_flags); 1286 check_preempt_curr(rq, p, wake_flags);
1287 trace_sched_wakeup(p, true);
1293 1288
1294 p->state = TASK_RUNNING; 1289 p->state = TASK_RUNNING;
1295#ifdef CONFIG_SMP 1290#ifdef CONFIG_SMP
@@ -2999,51 +2994,6 @@ void __sched schedule_preempt_disabled(void)
2999 preempt_disable(); 2994 preempt_disable();
3000} 2995}
3001 2996
3002#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3003
3004static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3005{
3006 if (lock->owner != owner)
3007 return false;
3008
3009 /*
3010 * Ensure we emit the owner->on_cpu, dereference _after_ checking
3011 * lock->owner still matches owner, if that fails, owner might
3012 * point to free()d memory, if it still matches, the rcu_read_lock()
3013 * ensures the memory stays valid.
3014 */
3015 barrier();
3016
3017 return owner->on_cpu;
3018}
3019
3020/*
3021 * Look out! "owner" is an entirely speculative pointer
3022 * access and not reliable.
3023 */
3024int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3025{
3026 if (!sched_feat(OWNER_SPIN))
3027 return 0;
3028
3029 rcu_read_lock();
3030 while (owner_running(lock, owner)) {
3031 if (need_resched())
3032 break;
3033
3034 arch_mutex_cpu_relax();
3035 }
3036 rcu_read_unlock();
3037
3038 /*
3039 * We break out the loop above on need_resched() and when the
3040 * owner changed, which is a sign for heavy contention. Return
3041 * success only when lock->owner is NULL.
3042 */
3043 return lock->owner == NULL;
3044}
3045#endif
3046
3047#ifdef CONFIG_PREEMPT 2997#ifdef CONFIG_PREEMPT
3048/* 2998/*
3049 * this is the entry point to schedule() from in-kernel preemption 2999 * this is the entry point to schedule() from in-kernel preemption
@@ -3084,11 +3034,13 @@ EXPORT_SYMBOL(preempt_schedule);
3084asmlinkage void __sched preempt_schedule_irq(void) 3034asmlinkage void __sched preempt_schedule_irq(void)
3085{ 3035{
3086 struct thread_info *ti = current_thread_info(); 3036 struct thread_info *ti = current_thread_info();
3037 enum ctx_state prev_state;
3087 3038
3088 /* Catch callers which need to be fixed */ 3039 /* Catch callers which need to be fixed */
3089 BUG_ON(ti->preempt_count || !irqs_disabled()); 3040 BUG_ON(ti->preempt_count || !irqs_disabled());
3090 3041
3091 user_exit(); 3042 prev_state = exception_enter();
3043
3092 do { 3044 do {
3093 add_preempt_count(PREEMPT_ACTIVE); 3045 add_preempt_count(PREEMPT_ACTIVE);
3094 local_irq_enable(); 3046 local_irq_enable();
@@ -3102,6 +3054,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
3102 */ 3054 */
3103 barrier(); 3055 barrier();
3104 } while (need_resched()); 3056 } while (need_resched());
3057
3058 exception_exit(prev_state);
3105} 3059}
3106 3060
3107#endif /* CONFIG_PREEMPT */ 3061#endif /* CONFIG_PREEMPT */
@@ -4128,6 +4082,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4128 get_task_struct(p); 4082 get_task_struct(p);
4129 rcu_read_unlock(); 4083 rcu_read_unlock();
4130 4084
4085 if (p->flags & PF_NO_SETAFFINITY) {
4086 retval = -EINVAL;
4087 goto out_put_task;
4088 }
4131 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4089 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4132 retval = -ENOMEM; 4090 retval = -ENOMEM;
4133 goto out_put_task; 4091 goto out_put_task;
@@ -4628,6 +4586,7 @@ void sched_show_task(struct task_struct *p)
4628 task_pid_nr(p), ppid, 4586 task_pid_nr(p), ppid,
4629 (unsigned long)task_thread_info(p)->flags); 4587 (unsigned long)task_thread_info(p)->flags);
4630 4588
4589 print_worker_info(KERN_INFO, p);
4631 show_stack(p, NULL); 4590 show_stack(p, NULL);
4632} 4591}
4633 4592
@@ -4775,11 +4734,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4775 goto out; 4734 goto out;
4776 } 4735 }
4777 4736
4778 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4779 ret = -EINVAL;
4780 goto out;
4781 }
4782
4783 do_set_cpus_allowed(p, new_mask); 4737 do_set_cpus_allowed(p, new_mask);
4784 4738
4785 /* Can the task run on the task's current CPU? If so, we're done */ 4739 /* Can the task run on the task's current CPU? If so, we're done */
@@ -6250,7 +6204,7 @@ static void sched_init_numa(void)
6250 * 'level' contains the number of unique distances, excluding the 6204 * 'level' contains the number of unique distances, excluding the
6251 * identity distance node_distance(i,i). 6205 * identity distance node_distance(i,i).
6252 * 6206 *
6253 * The sched_domains_nume_distance[] array includes the actual distance 6207 * The sched_domains_numa_distance[] array includes the actual distance
6254 * numbers. 6208 * numbers.
6255 */ 6209 */
6256 6210
@@ -6863,11 +6817,15 @@ int in_sched_functions(unsigned long addr)
6863} 6817}
6864 6818
6865#ifdef CONFIG_CGROUP_SCHED 6819#ifdef CONFIG_CGROUP_SCHED
6820/*
6821 * Default task group.
6822 * Every task in system belongs to this group at bootup.
6823 */
6866struct task_group root_task_group; 6824struct task_group root_task_group;
6867LIST_HEAD(task_groups); 6825LIST_HEAD(task_groups);
6868#endif 6826#endif
6869 6827
6870DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6828DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6871 6829
6872void __init sched_init(void) 6830void __init sched_init(void)
6873{ 6831{
@@ -6904,7 +6862,7 @@ void __init sched_init(void)
6904#endif /* CONFIG_RT_GROUP_SCHED */ 6862#endif /* CONFIG_RT_GROUP_SCHED */
6905#ifdef CONFIG_CPUMASK_OFFSTACK 6863#ifdef CONFIG_CPUMASK_OFFSTACK
6906 for_each_possible_cpu(i) { 6864 for_each_possible_cpu(i) {
6907 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6865 per_cpu(load_balance_mask, i) = (void *)ptr;
6908 ptr += cpumask_size(); 6866 ptr += cpumask_size();
6909 } 6867 }
6910#endif /* CONFIG_CPUMASK_OFFSTACK */ 6868#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6930,12 +6888,6 @@ void __init sched_init(void)
6930 6888
6931#endif /* CONFIG_CGROUP_SCHED */ 6889#endif /* CONFIG_CGROUP_SCHED */
6932 6890
6933#ifdef CONFIG_CGROUP_CPUACCT
6934 root_cpuacct.cpustat = &kernel_cpustat;
6935 root_cpuacct.cpuusage = alloc_percpu(u64);
6936 /* Too early, not expected to fail */
6937 BUG_ON(!root_cpuacct.cpuusage);
6938#endif
6939 for_each_possible_cpu(i) { 6891 for_each_possible_cpu(i) {
6940 struct rq *rq; 6892 struct rq *rq;
6941 6893
@@ -7457,7 +7409,7 @@ unlock:
7457 return err; 7409 return err;
7458} 7410}
7459 7411
7460int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7412static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7461{ 7413{
7462 u64 rt_runtime, rt_period; 7414 u64 rt_runtime, rt_period;
7463 7415
@@ -7469,7 +7421,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7469 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7421 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7470} 7422}
7471 7423
7472long sched_group_rt_runtime(struct task_group *tg) 7424static long sched_group_rt_runtime(struct task_group *tg)
7473{ 7425{
7474 u64 rt_runtime_us; 7426 u64 rt_runtime_us;
7475 7427
@@ -7481,7 +7433,7 @@ long sched_group_rt_runtime(struct task_group *tg)
7481 return rt_runtime_us; 7433 return rt_runtime_us;
7482} 7434}
7483 7435
7484int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7436static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7485{ 7437{
7486 u64 rt_runtime, rt_period; 7438 u64 rt_runtime, rt_period;
7487 7439
@@ -7494,7 +7446,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7494 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7446 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7495} 7447}
7496 7448
7497long sched_group_rt_period(struct task_group *tg) 7449static long sched_group_rt_period(struct task_group *tg)
7498{ 7450{
7499 u64 rt_period_us; 7451 u64 rt_period_us;
7500 7452
@@ -7529,7 +7481,7 @@ static int sched_rt_global_constraints(void)
7529 return ret; 7481 return ret;
7530} 7482}
7531 7483
7532int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7484static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7533{ 7485{
7534 /* Don't accept realtime tasks when there is no way for them to run */ 7486 /* Don't accept realtime tasks when there is no way for them to run */
7535 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7487 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -8037,226 +7989,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8037 7989
8038#endif /* CONFIG_CGROUP_SCHED */ 7990#endif /* CONFIG_CGROUP_SCHED */
8039 7991
8040#ifdef CONFIG_CGROUP_CPUACCT
8041
8042/*
8043 * CPU accounting code for task groups.
8044 *
8045 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8046 * (balbir@in.ibm.com).
8047 */
8048
8049struct cpuacct root_cpuacct;
8050
8051/* create a new cpu accounting group */
8052static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8053{
8054 struct cpuacct *ca;
8055
8056 if (!cgrp->parent)
8057 return &root_cpuacct.css;
8058
8059 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8060 if (!ca)
8061 goto out;
8062
8063 ca->cpuusage = alloc_percpu(u64);
8064 if (!ca->cpuusage)
8065 goto out_free_ca;
8066
8067 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8068 if (!ca->cpustat)
8069 goto out_free_cpuusage;
8070
8071 return &ca->css;
8072
8073out_free_cpuusage:
8074 free_percpu(ca->cpuusage);
8075out_free_ca:
8076 kfree(ca);
8077out:
8078 return ERR_PTR(-ENOMEM);
8079}
8080
8081/* destroy an existing cpu accounting group */
8082static void cpuacct_css_free(struct cgroup *cgrp)
8083{
8084 struct cpuacct *ca = cgroup_ca(cgrp);
8085
8086 free_percpu(ca->cpustat);
8087 free_percpu(ca->cpuusage);
8088 kfree(ca);
8089}
8090
8091static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8092{
8093 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8094 u64 data;
8095
8096#ifndef CONFIG_64BIT
8097 /*
8098 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8099 */
8100 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8101 data = *cpuusage;
8102 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8103#else
8104 data = *cpuusage;
8105#endif
8106
8107 return data;
8108}
8109
8110static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8111{
8112 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8113
8114#ifndef CONFIG_64BIT
8115 /*
8116 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8117 */
8118 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8119 *cpuusage = val;
8120 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8121#else
8122 *cpuusage = val;
8123#endif
8124}
8125
8126/* return total cpu usage (in nanoseconds) of a group */
8127static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8128{
8129 struct cpuacct *ca = cgroup_ca(cgrp);
8130 u64 totalcpuusage = 0;
8131 int i;
8132
8133 for_each_present_cpu(i)
8134 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8135
8136 return totalcpuusage;
8137}
8138
8139static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8140 u64 reset)
8141{
8142 struct cpuacct *ca = cgroup_ca(cgrp);
8143 int err = 0;
8144 int i;
8145
8146 if (reset) {
8147 err = -EINVAL;
8148 goto out;
8149 }
8150
8151 for_each_present_cpu(i)
8152 cpuacct_cpuusage_write(ca, i, 0);
8153
8154out:
8155 return err;
8156}
8157
8158static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8159 struct seq_file *m)
8160{
8161 struct cpuacct *ca = cgroup_ca(cgroup);
8162 u64 percpu;
8163 int i;
8164
8165 for_each_present_cpu(i) {
8166 percpu = cpuacct_cpuusage_read(ca, i);
8167 seq_printf(m, "%llu ", (unsigned long long) percpu);
8168 }
8169 seq_printf(m, "\n");
8170 return 0;
8171}
8172
8173static const char *cpuacct_stat_desc[] = {
8174 [CPUACCT_STAT_USER] = "user",
8175 [CPUACCT_STAT_SYSTEM] = "system",
8176};
8177
8178static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8179 struct cgroup_map_cb *cb)
8180{
8181 struct cpuacct *ca = cgroup_ca(cgrp);
8182 int cpu;
8183 s64 val = 0;
8184
8185 for_each_online_cpu(cpu) {
8186 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8187 val += kcpustat->cpustat[CPUTIME_USER];
8188 val += kcpustat->cpustat[CPUTIME_NICE];
8189 }
8190 val = cputime64_to_clock_t(val);
8191 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8192
8193 val = 0;
8194 for_each_online_cpu(cpu) {
8195 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8196 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8197 val += kcpustat->cpustat[CPUTIME_IRQ];
8198 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8199 }
8200
8201 val = cputime64_to_clock_t(val);
8202 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8203
8204 return 0;
8205}
8206
8207static struct cftype files[] = {
8208 {
8209 .name = "usage",
8210 .read_u64 = cpuusage_read,
8211 .write_u64 = cpuusage_write,
8212 },
8213 {
8214 .name = "usage_percpu",
8215 .read_seq_string = cpuacct_percpu_seq_read,
8216 },
8217 {
8218 .name = "stat",
8219 .read_map = cpuacct_stats_show,
8220 },
8221 { } /* terminate */
8222};
8223
8224/*
8225 * charge this task's execution time to its accounting group.
8226 *
8227 * called with rq->lock held.
8228 */
8229void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8230{
8231 struct cpuacct *ca;
8232 int cpu;
8233
8234 if (unlikely(!cpuacct_subsys.active))
8235 return;
8236
8237 cpu = task_cpu(tsk);
8238
8239 rcu_read_lock();
8240
8241 ca = task_ca(tsk);
8242
8243 for (; ca; ca = parent_ca(ca)) {
8244 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8245 *cpuusage += cputime;
8246 }
8247
8248 rcu_read_unlock();
8249}
8250
8251struct cgroup_subsys cpuacct_subsys = {
8252 .name = "cpuacct",
8253 .css_alloc = cpuacct_css_alloc,
8254 .css_free = cpuacct_css_free,
8255 .subsys_id = cpuacct_subsys_id,
8256 .base_cftypes = files,
8257};
8258#endif /* CONFIG_CGROUP_CPUACCT */
8259
8260void dump_cpu_task(int cpu) 7992void dump_cpu_task(int cpu)
8261{ 7993{
8262 pr_info("Task dump for CPU %d:\n", cpu); 7994 pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
1#include <linux/cgroup.h>
2#include <linux/slab.h>
3#include <linux/percpu.h>
4#include <linux/spinlock.h>
5#include <linux/cpumask.h>
6#include <linux/seq_file.h>
7#include <linux/rcupdate.h>
8#include <linux/kernel_stat.h>
9#include <linux/err.h>
10
11#include "sched.h"
12
13/*
14 * CPU accounting code for task groups.
15 *
16 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
17 * (balbir@in.ibm.com).
18 */
19
20/* Time spent by the tasks of the cpu accounting group executing in ... */
21enum cpuacct_stat_index {
22 CPUACCT_STAT_USER, /* ... user mode */
23 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
24
25 CPUACCT_STAT_NSTATS,
26};
27
28/* track cpu usage of a group of tasks and its child groups */
29struct cpuacct {
30 struct cgroup_subsys_state css;
31 /* cpuusage holds pointer to a u64-type object on every cpu */
32 u64 __percpu *cpuusage;
33 struct kernel_cpustat __percpu *cpustat;
34};
35
36/* return cpu accounting group corresponding to this container */
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
40 struct cpuacct, css);
41}
42
43/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53}
54
55static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{
57 if (!ca->css.cgroup->parent)
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60}
61
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
63static struct cpuacct root_cpuacct = {
64 .cpustat = &kernel_cpustat,
65 .cpuusage = &root_cpuacct_cpuusage,
66};
67
68/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
70{
71 struct cpuacct *ca;
72
73 if (!cgrp->parent)
74 return &root_cpuacct.css;
75
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
77 if (!ca)
78 goto out;
79
80 ca->cpuusage = alloc_percpu(u64);
81 if (!ca->cpuusage)
82 goto out_free_ca;
83
84 ca->cpustat = alloc_percpu(struct kernel_cpustat);
85 if (!ca->cpustat)
86 goto out_free_cpuusage;
87
88 return &ca->css;
89
90out_free_cpuusage:
91 free_percpu(ca->cpuusage);
92out_free_ca:
93 kfree(ca);
94out:
95 return ERR_PTR(-ENOMEM);
96}
97
98/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp)
100{
101 struct cpuacct *ca = cgroup_ca(cgrp);
102
103 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage);
105 kfree(ca);
106}
107
108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
109{
110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
111 u64 data;
112
113#ifndef CONFIG_64BIT
114 /*
115 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
116 */
117 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
118 data = *cpuusage;
119 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
120#else
121 data = *cpuusage;
122#endif
123
124 return data;
125}
126
127static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
128{
129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
130
131#ifndef CONFIG_64BIT
132 /*
133 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
134 */
135 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
136 *cpuusage = val;
137 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
138#else
139 *cpuusage = val;
140#endif
141}
142
143/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
145{
146 struct cpuacct *ca = cgroup_ca(cgrp);
147 u64 totalcpuusage = 0;
148 int i;
149
150 for_each_present_cpu(i)
151 totalcpuusage += cpuacct_cpuusage_read(ca, i);
152
153 return totalcpuusage;
154}
155
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
157 u64 reset)
158{
159 struct cpuacct *ca = cgroup_ca(cgrp);
160 int err = 0;
161 int i;
162
163 if (reset) {
164 err = -EINVAL;
165 goto out;
166 }
167
168 for_each_present_cpu(i)
169 cpuacct_cpuusage_write(ca, i, 0);
170
171out:
172 return err;
173}
174
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
176 struct seq_file *m)
177{
178 struct cpuacct *ca = cgroup_ca(cgroup);
179 u64 percpu;
180 int i;
181
182 for_each_present_cpu(i) {
183 percpu = cpuacct_cpuusage_read(ca, i);
184 seq_printf(m, "%llu ", (unsigned long long) percpu);
185 }
186 seq_printf(m, "\n");
187 return 0;
188}
189
190static const char * const cpuacct_stat_desc[] = {
191 [CPUACCT_STAT_USER] = "user",
192 [CPUACCT_STAT_SYSTEM] = "system",
193};
194
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
196 struct cgroup_map_cb *cb)
197{
198 struct cpuacct *ca = cgroup_ca(cgrp);
199 int cpu;
200 s64 val = 0;
201
202 for_each_online_cpu(cpu) {
203 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
204 val += kcpustat->cpustat[CPUTIME_USER];
205 val += kcpustat->cpustat[CPUTIME_NICE];
206 }
207 val = cputime64_to_clock_t(val);
208 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
209
210 val = 0;
211 for_each_online_cpu(cpu) {
212 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
213 val += kcpustat->cpustat[CPUTIME_SYSTEM];
214 val += kcpustat->cpustat[CPUTIME_IRQ];
215 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
216 }
217
218 val = cputime64_to_clock_t(val);
219 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
220
221 return 0;
222}
223
224static struct cftype files[] = {
225 {
226 .name = "usage",
227 .read_u64 = cpuusage_read,
228 .write_u64 = cpuusage_write,
229 },
230 {
231 .name = "usage_percpu",
232 .read_seq_string = cpuacct_percpu_seq_read,
233 },
234 {
235 .name = "stat",
236 .read_map = cpuacct_stats_show,
237 },
238 { } /* terminate */
239};
240
241/*
242 * charge this task's execution time to its accounting group.
243 *
244 * called with rq->lock held.
245 */
246void cpuacct_charge(struct task_struct *tsk, u64 cputime)
247{
248 struct cpuacct *ca;
249 int cpu;
250
251 cpu = task_cpu(tsk);
252
253 rcu_read_lock();
254
255 ca = task_ca(tsk);
256
257 while (true) {
258 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
259 *cpuusage += cputime;
260
261 ca = parent_ca(ca);
262 if (!ca)
263 break;
264 }
265
266 rcu_read_unlock();
267}
268
269/*
270 * Add user/system time to cpuacct.
271 *
272 * Note: it's the caller that updates the account of the root cgroup.
273 */
274void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275{
276 struct kernel_cpustat *kcpustat;
277 struct cpuacct *ca;
278
279 rcu_read_lock();
280 ca = task_ca(p);
281 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca);
285 }
286 rcu_read_unlock();
287}
288
289struct cgroup_subsys cpuacct_subsys = {
290 .name = "cpuacct",
291 .css_alloc = cpuacct_css_alloc,
292 .css_free = cpuacct_css_free,
293 .subsys_id = cpuacct_subsys_id,
294 .base_cftypes = files,
295 .early_init = 1,
296};
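cpuacct_charge() above walks from the task's group up through parent_ca() and adds the same delta at every level, so a parent's usage always includes its children's; cpuacct_account_field() does the same walk but stops before the root, which its caller updates directly. A standalone sketch of the upward accumulation (illustrative two-level tree, no RCU or per-CPU data):

#include <stdio.h>

struct group {
	const char *name;
	unsigned long long usage;
	struct group *parent;		/* NULL at the root */
};

/* Charge @delta to @g and every ancestor, root included. */
static void charge(struct group *g, unsigned long long delta)
{
	for (; g; g = g->parent)
		g->usage += delta;
}

int main(void)
{
	struct group root  = { "root",  0, NULL };
	struct group child = { "child", 0, &root };

	charge(&child, 1000);		/* 1000 ns of CPU time */
	printf("%s=%llu %s=%llu\n", child.name, child.usage,
	       root.name, root.usage);	/* both show 1000 */
	return 0;
}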
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
1#ifdef CONFIG_CGROUP_CPUACCT
2
3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
4extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
5
6#else
7
8static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9{
10}
11
12static inline void
13cpuacct_account_field(struct task_struct *p, int index, u64 val)
14{
15}
16
17#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e93cca92f38b..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
115static inline void task_group_account_field(struct task_struct *p, int index, 115static inline void task_group_account_field(struct task_struct *p, int index,
116 u64 tmp) 116 u64 tmp)
117{ 117{
118#ifdef CONFIG_CGROUP_CPUACCT
119 struct kernel_cpustat *kcpustat;
120 struct cpuacct *ca;
121#endif
122 /* 118 /*
123 * Since all updates are sure to touch the root cgroup, we 119 * Since all updates are sure to touch the root cgroup, we
124 * get ourselves ahead and touch it first. If the root cgroup 120 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
127 */ 123 */
128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
129 125
130#ifdef CONFIG_CGROUP_CPUACCT 126 cpuacct_account_field(p, index, tmp);
131 if (unlikely(!cpuacct_subsys.active))
132 return;
133
134 rcu_read_lock();
135 ca = task_ca(p);
136 while (ca && (ca != &root_cpuacct)) {
137 kcpustat = this_cpu_ptr(ca->cpustat);
138 kcpustat->cpustat[index] += tmp;
139 ca = parent_ca(ca);
140 }
141 rcu_read_unlock();
142#endif
143} 127}
144 128
145/* 129/*
@@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
388 struct rq *rq) {} 372 struct rq *rq) {}
389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 373#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
390 374
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
392/*
393 * Account a single tick of cpu time.
394 * @p: the process that the cpu time gets accounted to
395 * @user_tick: indicates if the tick is a user or a system tick
396 */
397void account_process_tick(struct task_struct *p, int user_tick)
398{
399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
400 struct rq *rq = this_rq();
401
402 if (vtime_accounting_enabled())
403 return;
404
405 if (sched_clock_irqtime) {
406 irqtime_account_process_tick(p, user_tick, rq);
407 return;
408 }
409
410 if (steal_account_process_tick())
411 return;
412
413 if (user_tick)
414 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
415 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
416 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
417 one_jiffy_scaled);
418 else
419 account_idle_time(cputime_one_jiffy);
420}
421
422/*
423 * Account multiple ticks of steal time.
424 * @p: the process from which the cpu time has been stolen
425 * @ticks: number of stolen ticks
426 */
427void account_steal_ticks(unsigned long ticks)
428{
429 account_steal_time(jiffies_to_cputime(ticks));
430}
431
432/*
433 * Account multiple ticks of idle time.
434 * @ticks: number of stolen ticks
435 */
436void account_idle_ticks(unsigned long ticks)
437{
438
439 if (sched_clock_irqtime) {
440 irqtime_account_idle_ticks(ticks);
441 return;
442 }
443
444 account_idle_time(jiffies_to_cputime(ticks));
445}
446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
447
448/* 375/*
449 * Use precise platform statistics if available: 376 * Use precise platform statistics if available:
450 */ 377 */
451#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
453{
454 *ut = p->utime;
455 *st = p->stime;
456}
457
458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
459{
460 struct task_cputime cputime;
461
462 thread_group_cputime(p, &cputime);
463
464 *ut = cputime.utime;
465 *st = cputime.stime;
466}
467 379
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev) 381void vtime_task_switch(struct task_struct *prev)
@@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk)
518} 430}
519EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 431EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
520#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 432#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434
435
436#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
437void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
438{
439 *ut = p->utime;
440 *st = p->stime;
441}
521 442
522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 443void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444{
445 struct task_cputime cputime;
523 446
524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 447 thread_group_cputime(p, &cputime);
448
449 *ut = cputime.utime;
450 *st = cputime.stime;
451}
452#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
453/*
454 * Account a single tick of cpu time.
455 * @p: the process that the cpu time gets accounted to
456 * @user_tick: indicates if the tick is a user or a system tick
457 */
458void account_process_tick(struct task_struct *p, int user_tick)
525{ 459{
526 u64 temp = (__force u64) rtime; 460 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
461 struct rq *rq = this_rq();
527 462
528 temp *= (__force u64) stime; 463 if (vtime_accounting_enabled())
464 return;
465
466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq);
468 return;
469 }
470
471 if (steal_account_process_tick())
472 return;
529 473
530 if (sizeof(cputime_t) == 4) 474 if (user_tick)
531 temp = div_u64(temp, (__force u32) total); 475 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
476 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
477 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
478 one_jiffy_scaled);
532 else 479 else
533 temp = div64_u64(temp, (__force u64) total); 480 account_idle_time(cputime_one_jiffy);
481}
534 482
535 return (__force cputime_t) temp; 483/*
484 * Account multiple ticks of steal time.
485 * @p: the process from which the cpu time has been stolen
486 * @ticks: number of stolen ticks
487 */
488void account_steal_ticks(unsigned long ticks)
489{
490 account_steal_time(jiffies_to_cputime(ticks));
491}
492
493/*
494 * Account multiple ticks of idle time.
495 * @ticks: number of stolen ticks
496 */
497void account_idle_ticks(unsigned long ticks)
498{
499
500 if (sched_clock_irqtime) {
501 irqtime_account_idle_ticks(ticks);
502 return;
503 }
504
505 account_idle_time(jiffies_to_cputime(ticks));
506}
507
508/*
509 * Perform (stime * rtime) / total with reduced chances
510 * of multiplication overflows by using smaller factors
511 * like quotient and remainders of divisions between
512 * rtime and total.
513 */
514static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515{
516 u64 rem, res, scaled;
517
518 if (rtime >= total) {
519 /*
520 * Scale up to rtime / total then add
521 * the remainder scaled to stime / total.
522 */
523 res = div64_u64_rem(rtime, total, &rem);
524 scaled = stime * res;
525 scaled += div64_u64(stime * rem, total);
526 } else {
527 /*
528 * Same in reverse: scale down to total / rtime
529 * then subtract that result scaled to
530 * the remaining part.
531 */
532 res = div64_u64_rem(total, rtime, &rem);
533 scaled = div64_u64(stime, res);
534 scaled -= div64_u64(scaled * rem, total);
535 }
536
537 return (__force cputime_t) scaled;
536} 538}
537 539
538/* 540/*
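scale_stime() avoids a 128-bit multiply: when rtime >= total it splits rtime into the quotient and remainder of the division by total (div64_u64_rem) so each intermediate product stays within 64 bits, and it applies the symmetric, slightly approximate formula when total is larger. A userspace version of the same arithmetic (plain / and %, which is what the div64_u64 helpers reduce to on 64-bit; the sample values are made up):

#include <stdio.h>
#include <stdint.h>

/* Approximate stime * rtime / total while limiting 64-bit overflow. */
static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t res, rem, scaled;

	if (rtime >= total) {
		/* stime * (q + r/total)  ==  stime*q + stime*r/total */
		res = rtime / total;
		rem = rtime % total;
		scaled = stime * res + stime * rem / total;
	} else {
		/* stime / (q + r/rtime)  ~=  stime/q - (stime/q)*r/total */
		res = total / rtime;
		rem = total % rtime;
		scaled = stime / res;
		scaled -= scaled * rem / total;
	}
	return scaled;
}

int main(void)
{
	/* 1s of system time scaled onto 8s of scheduler runtime vs 4s of ticks. */
	printf("%llu\n", (unsigned long long)
	       scale_stime(1000000000ULL, 8000000000ULL, 4000000000ULL));
	return 0;	/* prints 2000000000 */
}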
@@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr,
545{ 547{
546 cputime_t rtime, stime, total; 548 cputime_t rtime, stime, total;
547 549
550 if (vtime_accounting_enabled()) {
551 *ut = curr->utime;
552 *st = curr->stime;
553 return;
554 }
555
548 stime = curr->stime; 556 stime = curr->stime;
549 total = stime + curr->utime; 557 total = stime + curr->utime;
550 558
@@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr,
560 */ 568 */
561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 569 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
562 570
563 if (total) 571 if (!rtime) {
564 stime = scale_stime(stime, rtime, total); 572 stime = 0;
565 else 573 } else if (!total) {
566 stime = rtime; 574 stime = rtime;
575 } else {
576 stime = scale_stime((__force u64)stime,
577 (__force u64)rtime, (__force u64)total);
578 }
567 579
568 /* 580 /*
569 * If the tick based count grows faster than the scheduler one, 581 * If the tick based count grows faster than the scheduler one,
@@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
597 thread_group_cputime(p, &cputime); 609 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 610 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599} 611}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 612#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
601 613
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 614#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk) 615static unsigned long long vtime_delta(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..8bf7081b1ec5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
431 * Scheduling class tree data structure manipulation methods: 431 * Scheduling class tree data structure manipulation methods:
432 */ 432 */
433 433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435{ 435{
436 s64 delta = (s64)(vruntime - min_vruntime); 436 s64 delta = (s64)(vruntime - max_vruntime);
437 if (delta > 0) 437 if (delta > 0)
438 min_vruntime = vruntime; 438 max_vruntime = vruntime;
439 439
440 return min_vruntime; 440 return max_vruntime;
441} 441}
442 442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
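The max_vruntime() fix matters because vruntime is a free-running u64 that may wrap; comparing through a signed delta keeps the logically later value winning across the wrap, where a plain unsigned comparison would not. A minimal demonstration of the trick (standalone C, same idiom as the hunk above):

#include <stdio.h>
#include <stdint.h>

/* Wraparound-safe max of two free-running u64 counters. */
static uint64_t max_vruntime(uint64_t max_vruntime, uint64_t vruntime)
{
	int64_t delta = (int64_t)(vruntime - max_vruntime);

	if (delta > 0)
		max_vruntime = vruntime;
	return max_vruntime;
}

int main(void)
{
	uint64_t a = UINT64_MAX - 5;	/* just before the wrap */
	uint64_t b = 10;		/* logically later, numerically smaller */

	printf("%llu\n", (unsigned long long)max_vruntime(a, b));	/* 10 */
	printf("%d\n", b > a);		/* 0: a naive unsigned compare loses */
	return 0;
}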
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
473 vruntime = min_vruntime(vruntime, se->vruntime); 473 vruntime = min_vruntime(vruntime, se->vruntime);
474 } 474 }
475 475
476 /* ensure we never gain time by being placed backwards. */
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT 478#ifndef CONFIG_64BIT
478 smp_wmb(); 479 smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652} 653}
653 654
654/* 655/*
655 * We calculate the vruntime slice of a to be inserted task 656 * We calculate the vruntime slice of a to-be-inserted task.
656 * 657 *
657 * vs = s/w 658 * vs = s/w
658 */ 659 */
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1563 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1564 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564} 1565}
1566
1567/*
1568 * Update the rq's load with the elapsed running time before entering
 1569 * idle. If the last scheduled task is not a CFS task, idle_enter will
 1570 * be the only way to update the runnable statistic.
1571 */
1572void idle_enter_fair(struct rq *this_rq)
1573{
1574 update_rq_runnable_avg(this_rq, 1);
1575}
1576
1577/*
1578 * Update the rq's load with the elapsed idle time before a task is
 1579 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
 1580 * be the only way to update the runnable statistic.
1581 */
1582void idle_exit_fair(struct rq *this_rq)
1583{
1584 update_rq_runnable_avg(this_rq, 0);
1585}
1586
1565#else 1587#else
1566static inline void update_entity_load_avg(struct sched_entity *se, 1588static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {} 1589 int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3874 int tsk_cache_hot = 0; 3896 int tsk_cache_hot = 0;
3875 /* 3897 /*
3876 * We do not migrate tasks that are: 3898 * We do not migrate tasks that are:
3877 * 1) running (obviously), or 3899 * 1) throttled_lb_pair, or
3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3900 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 * 3) are cache-hot on their current CPU. 3901 * 3) running (obviously), or
3902 * 4) are cache-hot on their current CPU.
3880 */ 3903 */
3904 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 return 0;
3906
3881 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3907 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 int new_dst_cpu; 3908 int cpu;
3883 3909
3884 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3910 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 3911
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3920 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 return 0; 3921 return 0;
3896 3922
 3897 new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3923 /* Prevent re-selecting dst_cpu via env's cpus */
3898 tsk_cpus_allowed(p)); 3924 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3899 if (new_dst_cpu < nr_cpu_ids) { 3925 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3900 env->flags |= LBF_SOME_PINNED; 3926 env->flags |= LBF_SOME_PINNED;
3901 env->new_dst_cpu = new_dst_cpu; 3927 env->new_dst_cpu = cpu;
3928 break;
3929 }
3902 } 3930 }
3931
3903 return 0; 3932 return 0;
3904 } 3933 }
3905 3934
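
    With the reordered checks, throttled cfs_rq pairs are rejected before any
    per-cpu work, and the LBF_SOME_PINNED fallback now scans only CPUs still
    present in env->cpus, so a destination that load_balance() later removes
    from env.cpus (see the hunk further below) cannot be proposed again. A toy
    version of that selection, with plain bitmasks standing in for
    struct cpumask (names and masks are illustrative):

    #include <stdio.h>

    /* pick the first cpu that is in the destination group, still under
     * consideration by this balance pass, and allowed for the task;
     * returns -1 if the task stays pinned */
    static int pick_new_dst_cpu(unsigned long dst_grpmask,
                                unsigned long env_cpus,
                                unsigned long task_allowed)
    {
            unsigned long candidates = dst_grpmask & env_cpus & task_allowed;
            int cpu;

            for (cpu = 0; cpu < (int)(8 * sizeof(candidates)); cpu++)
                    if (candidates & (1UL << cpu))
                            return cpu;
            return -1;
    }

    int main(void)
    {
            /* group {0,1,2}, cpu 1 already ruled out this pass, task allows {1,2} */
            printf("%d\n", pick_new_dst_cpu(0x7UL, 0x7UL & ~0x2UL, 0x6UL));
            return 0;
    }
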
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 if (!tsk_cache_hot || 3950 if (!tsk_cache_hot ||
3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923#ifdef CONFIG_SCHEDSTATS 3952
3924 if (tsk_cache_hot) { 3953 if (tsk_cache_hot) {
3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3954 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3955 schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 } 3956 }
3928#endif 3957
3929 return 1; 3958 return 1;
3930 } 3959 }
3931 3960
3932 if (tsk_cache_hot) { 3961 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3933 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3962 return 0;
3934 return 0;
3935 }
3936 return 1;
3937} 3963}
3938 3964
3939/* 3965/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
3948 struct task_struct *p, *n; 3974 struct task_struct *p, *n;
3949 3975
3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3976 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 continue;
3953
3954 if (!can_migrate_task(p, env)) 3977 if (!can_migrate_task(p, env))
3955 continue; 3978 continue;
3956 3979
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
4002 break; 4025 break;
4003 } 4026 }
4004 4027
4005 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4028 if (!can_migrate_task(p, env))
4006 goto next; 4029 goto next;
4007 4030
4008 load = task_h_load(p); 4031 load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
4013 if ((load / 2) > env->imbalance) 4036 if ((load / 2) > env->imbalance)
4014 goto next; 4037 goto next;
4015 4038
4016 if (!can_migrate_task(p, env))
4017 goto next;
4018
4019 move_task(p, env); 4039 move_task(p, env);
4020 pulled++; 4040 pulled++;
4021 env->imbalance -= load; 4041 env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
4245 return load_idx; 4265 return load_idx;
4246} 4266}
4247 4267
4248unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4268static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249{ 4269{
4250 return SCHED_POWER_SCALE; 4270 return SCHED_POWER_SCALE;
4251} 4271}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4255 return default_scale_freq_power(sd, cpu); 4275 return default_scale_freq_power(sd, cpu);
4256} 4276}
4257 4277
4258unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4278static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259{ 4279{
4260 unsigned long weight = sd->span_weight; 4280 unsigned long weight = sd->span_weight;
4261 unsigned long smt_gain = sd->smt_gain; 4281 unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4270 return default_scale_smt_power(sd, cpu); 4290 return default_scale_smt_power(sd, cpu);
4271} 4291}
4272 4292
4273unsigned long scale_rt_power(int cpu) 4293static unsigned long scale_rt_power(int cpu)
4274{ 4294{
4275 struct rq *rq = cpu_rq(cpu); 4295 struct rq *rq = cpu_rq(cpu);
4276 u64 total, available, age_stamp, avg; 4296 u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4960#define MAX_PINNED_INTERVAL 512 4980#define MAX_PINNED_INTERVAL 512
4961 4981
4962/* Working cpumask for load_balance and load_balance_newidle. */ 4982/* Working cpumask for load_balance and load_balance_newidle. */
4963DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4983DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4964 4984
4965static int need_active_balance(struct lb_env *env) 4985static int need_active_balance(struct lb_env *env)
4966{ 4986{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4991 int *balance) 5011 int *balance)
4992{ 5012{
4993 int ld_moved, cur_ld_moved, active_balance = 0; 5013 int ld_moved, cur_ld_moved, active_balance = 0;
4994 int lb_iterations, max_lb_iterations;
4995 struct sched_group *group; 5014 struct sched_group *group;
4996 struct rq *busiest; 5015 struct rq *busiest;
4997 unsigned long flags; 5016 unsigned long flags;
4998 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 5017 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
4999 5018
5000 struct lb_env env = { 5019 struct lb_env env = {
5001 .sd = sd, 5020 .sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5007 .cpus = cpus, 5026 .cpus = cpus,
5008 }; 5027 };
5009 5028
5029 /*
5030 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 * other cpus in our group
5032 */
5033 if (idle == CPU_NEWLY_IDLE)
5034 env.dst_grpmask = NULL;
5035
5010 cpumask_copy(cpus, cpu_active_mask); 5036 cpumask_copy(cpus, cpu_active_mask);
5011 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 5037
5013 schedstat_inc(sd, lb_count[idle]); 5038 schedstat_inc(sd, lb_count[idle]);
5014 5039
@@ -5034,7 +5059,6 @@ redo:
5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5059 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 5060
5036 ld_moved = 0; 5061 ld_moved = 0;
5037 lb_iterations = 1;
5038 if (busiest->nr_running > 1) { 5062 if (busiest->nr_running > 1) {
5039 /* 5063 /*
5040 * Attempt to move tasks. If find_busiest_group has found 5064 * Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
5061 double_rq_unlock(env.dst_rq, busiest); 5085 double_rq_unlock(env.dst_rq, busiest);
5062 local_irq_restore(flags); 5086 local_irq_restore(flags);
5063 5087
5064 if (env.flags & LBF_NEED_BREAK) {
5065 env.flags &= ~LBF_NEED_BREAK;
5066 goto more_balance;
5067 }
5068
5069 /* 5088 /*
5070 * some other cpu did the load balance for us. 5089 * some other cpu did the load balance for us.
5071 */ 5090 */
5072 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5091 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 resched_cpu(env.dst_cpu); 5092 resched_cpu(env.dst_cpu);
5074 5093
5094 if (env.flags & LBF_NEED_BREAK) {
5095 env.flags &= ~LBF_NEED_BREAK;
5096 goto more_balance;
5097 }
5098
5075 /* 5099 /*
5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to 5100 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 * us and move them to an alternate dst_cpu in our sched_group 5101 * us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
5091 * moreover subsequent load balance cycles should correct the 5115 * moreover subsequent load balance cycles should correct the
5092 * excess load moved. 5116 * excess load moved.
5093 */ 5117 */
5094 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5118 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5095 lb_iterations++ < max_lb_iterations) {
5096 5119
5097 env.dst_rq = cpu_rq(env.new_dst_cpu); 5120 env.dst_rq = cpu_rq(env.new_dst_cpu);
5098 env.dst_cpu = env.new_dst_cpu; 5121 env.dst_cpu = env.new_dst_cpu;
5099 env.flags &= ~LBF_SOME_PINNED; 5122 env.flags &= ~LBF_SOME_PINNED;
5100 env.loop = 0; 5123 env.loop = 0;
5101 env.loop_break = sched_nr_migrate_break; 5124 env.loop_break = sched_nr_migrate_break;
5125
 5126 /* Prevent re-selecting dst_cpu via env's cpus */
5127 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128
5102 /* 5129 /*
5103 * Go back to "more_balance" rather than "redo" since we 5130 * Go back to "more_balance" rather than "redo" since we
5104 * need to continue with same src_cpu. 5131 * need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5246 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 return; 5247 return;
5221 5248
5222 update_rq_runnable_avg(this_rq, 1);
5223
5224 /* 5249 /*
5225 * Drop the rq->lock, but keep IRQ/preempt disabled. 5250 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 */ 5251 */
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
5395 struct sched_domain *sd; 5420 struct sched_domain *sd;
5396 int cpu = smp_processor_id(); 5421 int cpu = smp_processor_id();
5397 5422
5398 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 return;
5400 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401
5402 rcu_read_lock(); 5423 rcu_read_lock();
5403 for_each_domain(cpu, sd) 5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425
5426 if (!sd || !sd->nohz_idle)
5427 goto unlock;
5428 sd->nohz_idle = 0;
5429
5430 for (; sd; sd = sd->parent)
5404 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5431 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432unlock:
5405 rcu_read_unlock(); 5433 rcu_read_unlock();
5406} 5434}
5407 5435
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
5410 struct sched_domain *sd; 5438 struct sched_domain *sd;
5411 int cpu = smp_processor_id(); 5439 int cpu = smp_processor_id();
5412 5440
5413 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 return;
5415 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416
5417 rcu_read_lock(); 5441 rcu_read_lock();
5418 for_each_domain(cpu, sd) 5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443
5444 if (!sd || sd->nohz_idle)
5445 goto unlock;
5446 sd->nohz_idle = 1;
5447
5448 for (; sd; sd = sd->parent)
5419 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5449 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450unlock:
5420 rcu_read_unlock(); 5451 rcu_read_unlock();
5421} 5452}
5422 5453
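
    The old per-cpu NOHZ_IDLE bit could fall out of step with a freshly
    rebuilt domain tree; the guard now lives in the lowest sched_domain and
    the busy counters are walked up through the parents. A toy model of that
    shape, with illustrative types rather than the kernel's sched_domain and
    sched_group structures:

    #include <stdio.h>

    struct domain {
            struct domain *parent;
            int nohz_idle;          /* guard, meaningful at the base level */
            int nr_busy_cpus;       /* per-group busy count in the kernel */
    };

    /* mark this cpu busy: the guard sits in the base domain, so a rebuilt
     * tree starts consistent instead of trusting a stale per-cpu bit */
    static void set_busy(struct domain *sd)
    {
            if (!sd || !sd->nohz_idle)
                    return;                 /* already accounted as busy */
            sd->nohz_idle = 0;
            for (; sd; sd = sd->parent)
                    sd->nr_busy_cpus++;
    }

    int main(void)
    {
            struct domain top = { NULL, 0, 0 };
            struct domain base = { &top, 1, 0 };

            set_busy(&base);
            set_busy(&base);        /* second call is a no-op */
            printf("%d %d\n", base.nr_busy_cpus, top.nr_busy_cpus);  /* 1 1 */
            return 0;
    }
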
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
5468 * It checks each scheduling domain to see if it is due to be balanced, 5499 * It checks each scheduling domain to see if it is due to be balanced,
5469 * and initiates a balancing operation if so. 5500 * and initiates a balancing operation if so.
5470 * 5501 *
5471 * Balancing parameters are set up in arch_init_sched_domains. 5502 * Balancing parameters are set up in init_sched_domains.
5472 */ 5503 */
5473static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5504static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474{ 5505{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5537 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5538 if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 /* 5539 /*
5509 * We've pulled tasks over so either we're no 5540 * The LBF_SOME_PINNED logic could have changed
5510 * longer idle. 5541 * env->dst_cpu, so we can't know our idle
5542 * state even if we migrated tasks. Update it.
5511 */ 5543 */
5512 idle = CPU_NOT_IDLE; 5544 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5513 } 5545 }
5514 sd->last_balance = jiffies; 5546 sd->last_balance = jiffies;
5515 } 5547 }
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Spin-wait on mutex acquisition when the mutex owner is running on
50 * another cpu -- assumes that when the owner is running, it will soon
51 * release the lock. Decreases scheduling overhead.
52 */
53SCHED_FEAT(OWNER_SPIN, true)
54
55/*
56 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU power based on time not spent running tasks
57 */ 50 */
58SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20}
21
22static void post_schedule_idle(struct rq *rq)
23{
24 idle_enter_fair(rq);
25}
16#endif /* CONFIG_SMP */ 26#endif /* CONFIG_SMP */
17/* 27/*
18 * Idle tasks are unconditionally rescheduled: 28 * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 35static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 36{
27 schedstat_inc(rq, sched_goidle); 37 schedstat_inc(rq, sched_goidle);
38#ifdef CONFIG_SMP
39 /* Trigger the post schedule to do an idle_enter for CFS */
40 rq->post_schedule = 1;
41#endif
28 return rq->idle; 42 return rq->idle;
29} 43}
30 44
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
86 100
87#ifdef CONFIG_SMP 101#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle, 102 .select_task_rq = select_task_rq_idle,
103 .pre_schedule = pre_schedule_idle,
104 .post_schedule = post_schedule_idle,
89#endif 105#endif
90 106
91 .set_curr_task = set_curr_task_idle, 107 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469f..4c225c4c7111 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -7,6 +7,7 @@
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8 8
9#include "cpupri.h" 9#include "cpupri.h"
10#include "cpuacct.h"
10 11
11extern __read_mostly int scheduler_running; 12extern __read_mostly int scheduler_running;
12 13
@@ -33,6 +34,31 @@ extern __read_mostly int scheduler_running;
33 */ 34 */
34#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 35#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
35 36
37/*
38 * Increase resolution of nice-level calculations for 64-bit architectures.
39 * The extra resolution improves shares distribution and load balancing of
40 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
41 * hierarchies, especially on larger systems. This is not a user-visible change
42 * and does not change the user-interface for setting shares/weights.
43 *
44 * We increase resolution only if we have enough bits to allow this increased
45 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
46 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
47 * increased costs.
48 */
49#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
50# define SCHED_LOAD_RESOLUTION 10
51# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
52# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
53#else
54# define SCHED_LOAD_RESOLUTION 0
55# define scale_load(w) (w)
56# define scale_load_down(w) (w)
57#endif
58
59#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
60#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
61
36#define NICE_0_LOAD SCHED_LOAD_SCALE 62#define NICE_0_LOAD SCHED_LOAD_SCALE
37#define NICE_0_SHIFT SCHED_LOAD_SHIFT 63#define NICE_0_SHIFT SCHED_LOAD_SHIFT
38 64
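
    For reference, the arithmetic these macros encode: with the
    high-resolution branch enabled (SCHED_LOAD_RESOLUTION = 10),
    SCHED_LOAD_SHIFT = 10 + 10 = 20, SCHED_LOAD_SCALE = NICE_0_LOAD =
    1 << 20 = 1048576, and scale_load(1024) = 1024 << 10 = 1048576; with
    the branch compiled out via the #if 0 above, the shift stays at 10,
    NICE_0_LOAD = 1024, and scale_load()/scale_load_down() are the identity.
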
@@ -154,11 +180,6 @@ struct task_group {
154#define MAX_SHARES (1UL << 18) 180#define MAX_SHARES (1UL << 18)
155#endif 181#endif
156 182
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *); 183typedef int (*tg_visitor)(struct task_group *, void *);
163 184
164extern int walk_tg_tree_from(struct task_group *from, 185extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +217,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu, 217 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent); 218 struct sched_rt_entity *parent);
198 219
220extern struct task_group *sched_create_group(struct task_group *parent);
221extern void sched_online_group(struct task_group *tg,
222 struct task_group *parent);
223extern void sched_destroy_group(struct task_group *tg);
224extern void sched_offline_group(struct task_group *tg);
225
226extern void sched_move_task(struct task_struct *tsk);
227
228#ifdef CONFIG_FAIR_GROUP_SCHED
229extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
230#endif
231
199#else /* CONFIG_CGROUP_SCHED */ 232#else /* CONFIG_CGROUP_SCHED */
200 233
201struct cfs_bandwidth { }; 234struct cfs_bandwidth { };
@@ -547,6 +580,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
547DECLARE_PER_CPU(struct sched_domain *, sd_llc); 580DECLARE_PER_CPU(struct sched_domain *, sd_llc);
548DECLARE_PER_CPU(int, sd_llc_id); 581DECLARE_PER_CPU(int, sd_llc_id);
549 582
583struct sched_group_power {
584 atomic_t ref;
585 /*
586 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
587 * single CPU.
588 */
589 unsigned int power, power_orig;
590 unsigned long next_update;
591 /*
592 * Number of busy cpus in this group.
593 */
594 atomic_t nr_busy_cpus;
595
596 unsigned long cpumask[0]; /* iteration mask */
597};
598
599struct sched_group {
600 struct sched_group *next; /* Must be a circular list */
601 atomic_t ref;
602
603 unsigned int group_weight;
604 struct sched_group_power *sgp;
605
606 /*
607 * The CPUs this group covers.
608 *
609 * NOTE: this field is variable length. (Allocated dynamically
610 * by attaching extra space to the end of the structure,
611 * depending on how many CPUs the kernel has booted up with)
612 */
613 unsigned long cpumask[0];
614};
615
616static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
617{
618 return to_cpumask(sg->cpumask);
619}
620
621/*
622 * cpumask masking which cpus in the group are allowed to iterate up the domain
623 * tree.
624 */
625static inline struct cpumask *sched_group_mask(struct sched_group *sg)
626{
627 return to_cpumask(sg->sgp->cpumask);
628}
629
630/**
631 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
632 * @group: The group whose first cpu is to be returned.
633 */
634static inline unsigned int group_first_cpu(struct sched_group *group)
635{
636 return cpumask_first(sched_group_cpus(group));
637}
638
550extern int group_balance_cpu(struct sched_group *sg); 639extern int group_balance_cpu(struct sched_group *sg);
551 640
552#endif /* CONFIG_SMP */ 641#endif /* CONFIG_SMP */
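
    Both structures end in a zero-length cpumask[0], so a single allocation
    has to reserve extra trailing space for however many CPUs the system
    booted with. A minimal userspace sketch of that allocation pattern, using
    a C99 flexible array and malloc in place of the kernel's allocator and
    cpumask helpers:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPU_LONGS 2          /* enough mask words for this example */

    struct group {
            unsigned int weight;
            unsigned long cpumask[];        /* flexible trailing storage */
    };

    static struct group *alloc_group(void)
    {
            /* one allocation carries the header plus the variable-size mask */
            return calloc(1, sizeof(struct group) +
                             NR_CPU_LONGS * sizeof(unsigned long));
    }

    int main(void)
    {
            struct group *g = alloc_group();

            if (!g)
                    return 1;
            g->cpumask[0] |= 1UL << 3;      /* mark cpu 3 as a member */
            printf("cpu 3 set: %d\n", !!(g->cpumask[0] & (1UL << 3)));
            free(g);
            return 0;
    }
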
@@ -784,6 +873,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
784} 873}
785#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 874#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
786 875
876/*
877 * wake flags
878 */
879#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
880#define WF_FORK 0x02 /* child wakeup after fork */
881#define WF_MIGRATED 0x4 /* internal use, task got migrated */
787 882
788static inline void update_load_add(struct load_weight *lw, unsigned long inc) 883static inline void update_load_add(struct load_weight *lw, unsigned long inc)
789{ 884{
@@ -856,14 +951,61 @@ static const u32 prio_to_wmult[40] = {
856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 951 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
857}; 952};
858 953
859/* Time spent by the tasks of the cpu accounting group executing in ... */ 954#define ENQUEUE_WAKEUP 1
860enum cpuacct_stat_index { 955#define ENQUEUE_HEAD 2
861 CPUACCT_STAT_USER, /* ... user mode */ 956#ifdef CONFIG_SMP
862 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 957#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
958#else
959#define ENQUEUE_WAKING 0
960#endif
863 961
864 CPUACCT_STAT_NSTATS, 962#define DEQUEUE_SLEEP 1
865}; 963
964struct sched_class {
965 const struct sched_class *next;
966
967 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
968 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
969 void (*yield_task) (struct rq *rq);
970 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
971
972 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
973
974 struct task_struct * (*pick_next_task) (struct rq *rq);
975 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
976
977#ifdef CONFIG_SMP
978 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
979 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
980
981 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
982 void (*post_schedule) (struct rq *this_rq);
983 void (*task_waking) (struct task_struct *task);
984 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
985
986 void (*set_cpus_allowed)(struct task_struct *p,
987 const struct cpumask *newmask);
866 988
989 void (*rq_online)(struct rq *rq);
990 void (*rq_offline)(struct rq *rq);
991#endif
992
993 void (*set_curr_task) (struct rq *rq);
994 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
995 void (*task_fork) (struct task_struct *p);
996
997 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
998 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
999 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1000 int oldprio);
1001
1002 unsigned int (*get_rr_interval) (struct rq *rq,
1003 struct task_struct *task);
1004
1005#ifdef CONFIG_FAIR_GROUP_SCHED
1006 void (*task_move_group) (struct task_struct *p, int on_rq);
1007#endif
1008};
867 1009
868#define sched_class_highest (&stop_sched_class) 1010#define sched_class_highest (&stop_sched_class)
869#define for_each_class(class) \ 1011#define for_each_class(class) \
@@ -877,9 +1019,23 @@ extern const struct sched_class idle_sched_class;
877 1019
878#ifdef CONFIG_SMP 1020#ifdef CONFIG_SMP
879 1021
1022extern void update_group_power(struct sched_domain *sd, int cpu);
1023
880extern void trigger_load_balance(struct rq *rq, int cpu); 1024extern void trigger_load_balance(struct rq *rq, int cpu);
881extern void idle_balance(int this_cpu, struct rq *this_rq); 1025extern void idle_balance(int this_cpu, struct rq *this_rq);
882 1026
1027/*
 1028 * Only depends on SMP; FAIR_GROUP_SCHED may be removed when runnable_avg
 1029 * becomes useful for load balancing
1030 */
1031#if defined(CONFIG_FAIR_GROUP_SCHED)
1032extern void idle_enter_fair(struct rq *this_rq);
1033extern void idle_exit_fair(struct rq *this_rq);
1034#else
1035static inline void idle_enter_fair(struct rq *this_rq) {}
1036static inline void idle_exit_fair(struct rq *this_rq) {}
1037#endif
1038
883#else /* CONFIG_SMP */ 1039#else /* CONFIG_SMP */
884 1040
885static inline void idle_balance(int cpu, struct rq *rq) 1041static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1047,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
891extern void sysrq_sched_debug_show(void); 1047extern void sysrq_sched_debug_show(void);
892extern void sched_init_granularity(void); 1048extern void sched_init_granularity(void);
893extern void update_max_interval(void); 1049extern void update_max_interval(void);
894extern void update_group_power(struct sched_domain *sd, int cpu);
895extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1050extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
896extern void init_sched_rt_class(void); 1051extern void init_sched_rt_class(void);
897extern void init_sched_fair_class(void); 1052extern void init_sched_fair_class(void);
@@ -904,45 +1059,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
904 1059
905extern void update_idle_cpu_load(struct rq *this_rq); 1060extern void update_idle_cpu_load(struct rq *this_rq);
906 1061
907#ifdef CONFIG_CGROUP_CPUACCT
908#include <linux/cgroup.h>
909/* track cpu usage of a group of tasks and its child groups */
910struct cpuacct {
911 struct cgroup_subsys_state css;
912 /* cpuusage holds pointer to a u64-type object on every cpu */
913 u64 __percpu *cpuusage;
914 struct kernel_cpustat __percpu *cpustat;
915};
916
917extern struct cgroup_subsys cpuacct_subsys;
918extern struct cpuacct root_cpuacct;
919
920/* return cpu accounting group corresponding to this container */
921static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
922{
923 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
924 struct cpuacct, css);
925}
926
927/* return cpu accounting group to which this task belongs */
928static inline struct cpuacct *task_ca(struct task_struct *tsk)
929{
930 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
931 struct cpuacct, css);
932}
933
934static inline struct cpuacct *parent_ca(struct cpuacct *ca)
935{
936 if (!ca || !ca->css.cgroup->parent)
937 return NULL;
938 return cgroup_ca(ca->css.cgroup->parent);
939}
940
941extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
942#else
943static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
944#endif
945
946#ifdef CONFIG_PARAVIRT 1062#ifdef CONFIG_PARAVIRT
947static inline u64 steal_ticks(u64 steal) 1063static inline u64 steal_ticks(u64 steal)
948{ 1064{
@@ -1187,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1187enum rq_nohz_flag_bits { 1303enum rq_nohz_flag_bits {
1188 NOHZ_TICK_STOPPED, 1304 NOHZ_TICK_STOPPED,
1189 NOHZ_BALANCE_KICK, 1305 NOHZ_BALANCE_KICK,
1190 NOHZ_IDLE,
1191}; 1306};
1192 1307
1193#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1308#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b593770..b7a10048a32c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
160 case BPF_S_ALU_AND_X: 160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K: 161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X: 162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_XOR_K:
164 case BPF_S_ALU_XOR_X:
163 case BPF_S_ALU_LSH_K: 165 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X: 166 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K: 167 case BPF_S_ALU_RSH_K:
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe3..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up);
193struct semaphore_waiter { 193struct semaphore_waiter {
194 struct list_head list; 194 struct list_head list;
195 struct task_struct *task; 195 struct task_struct *task;
196 int up; 196 bool up;
197}; 197};
198 198
199/* 199/*
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
209 209
210 list_add_tail(&waiter.list, &sem->wait_list); 210 list_add_tail(&waiter.list, &sem->wait_list);
211 waiter.task = task; 211 waiter.task = task;
212 waiter.up = 0; 212 waiter.up = false;
213 213
214 for (;;) { 214 for (;;) {
215 if (signal_pending_state(state, task)) 215 if (signal_pending_state(state, task))
216 goto interrupted; 216 goto interrupted;
217 if (timeout <= 0) 217 if (unlikely(timeout <= 0))
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 raw_spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem)
258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, 258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
259 struct semaphore_waiter, list); 259 struct semaphore_waiter, list);
260 list_del(&waiter->list); 260 list_del(&waiter->list);
261 waiter->up = 1; 261 waiter->up = true;
262 wake_up_process(waiter->task); 262 wake_up_process(waiter->task);
263} 263}
diff --git a/kernel/signal.c b/kernel/signal.c
index 06ff7764ab7c..113411bfe8b1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -855,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 856 * it should be dropped.
857 */ 857 */
858static int prepare_signal(int sig, struct task_struct *p, bool force) 858static bool prepare_signal(int sig, struct task_struct *p, bool force)
859{ 859{
860 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 861 struct task_struct *t;
862 862
863 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { 863 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
864 if (signal->flags & SIGNAL_GROUP_COREDUMP)
865 return sig == SIGKILL;
864 /* 866 /*
865 * The process is in the middle of dying, nothing to do. 867 * The process is in the middle of dying, nothing to do.
866 */ 868 */
@@ -1161,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1161static void print_fatal_signal(int signr) 1163static void print_fatal_signal(int signr)
1162{ 1164{
1163 struct pt_regs *regs = signal_pt_regs(); 1165 struct pt_regs *regs = signal_pt_regs();
1164 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", 1166 printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
1165 current->comm, task_pid_nr(current), signr);
1166 1167
1167#if defined(__i386__) && !defined(__arch_um__) 1168#if defined(__i386__) && !defined(__arch_um__)
1168 printk(KERN_INFO "code at %08lx: ", regs->ip); 1169 printk(KERN_INFO "code at %08lx: ", regs->ip);
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e451f3ff51b..4dba0f7b72ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -100,16 +100,16 @@ void __init call_function_init(void)
100 * previous function call. For multi-cpu calls its even more interesting 100 * previous function call. For multi-cpu calls its even more interesting
101 * as we'll have to ensure no other cpu is observing our csd. 101 * as we'll have to ensure no other cpu is observing our csd.
102 */ 102 */
103static void csd_lock_wait(struct call_single_data *data) 103static void csd_lock_wait(struct call_single_data *csd)
104{ 104{
105 while (data->flags & CSD_FLAG_LOCK) 105 while (csd->flags & CSD_FLAG_LOCK)
106 cpu_relax(); 106 cpu_relax();
107} 107}
108 108
109static void csd_lock(struct call_single_data *data) 109static void csd_lock(struct call_single_data *csd)
110{ 110{
111 csd_lock_wait(data); 111 csd_lock_wait(csd);
112 data->flags = CSD_FLAG_LOCK; 112 csd->flags |= CSD_FLAG_LOCK;
113 113
114 /* 114 /*
115 * prevent CPU from reordering the above assignment 115 * prevent CPU from reordering the above assignment
@@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data)
119 smp_mb(); 119 smp_mb();
120} 120}
121 121
122static void csd_unlock(struct call_single_data *data) 122static void csd_unlock(struct call_single_data *csd)
123{ 123{
124 WARN_ON(!(data->flags & CSD_FLAG_LOCK)); 124 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
125 125
126 /* 126 /*
127 * ensure we're all done before releasing data: 127 * ensure we're all done before releasing data:
128 */ 128 */
129 smp_mb(); 129 smp_mb();
130 130
131 data->flags &= ~CSD_FLAG_LOCK; 131 csd->flags &= ~CSD_FLAG_LOCK;
132} 132}
133 133
134/* 134/*
@@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data)
137 * ->func, ->info, and ->flags set. 137 * ->func, ->info, and ->flags set.
138 */ 138 */
139static 139static
140void generic_exec_single(int cpu, struct call_single_data *data, int wait) 140void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
141{ 141{
142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
143 unsigned long flags; 143 unsigned long flags;
@@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
145 145
146 raw_spin_lock_irqsave(&dst->lock, flags); 146 raw_spin_lock_irqsave(&dst->lock, flags);
147 ipi = list_empty(&dst->list); 147 ipi = list_empty(&dst->list);
148 list_add_tail(&data->list, &dst->list); 148 list_add_tail(&csd->list, &dst->list);
149 raw_spin_unlock_irqrestore(&dst->lock, flags); 149 raw_spin_unlock_irqrestore(&dst->lock, flags);
150 150
151 /* 151 /*
@@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
163 arch_send_call_function_single_ipi(cpu); 163 arch_send_call_function_single_ipi(cpu);
164 164
165 if (wait) 165 if (wait)
166 csd_lock_wait(data); 166 csd_lock_wait(csd);
167} 167}
168 168
169/* 169/*
@@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
173void generic_smp_call_function_single_interrupt(void) 173void generic_smp_call_function_single_interrupt(void)
174{ 174{
175 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 175 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
176 unsigned int data_flags;
177 LIST_HEAD(list); 176 LIST_HEAD(list);
178 177
179 /* 178 /*
@@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
186 raw_spin_unlock(&q->lock); 185 raw_spin_unlock(&q->lock);
187 186
188 while (!list_empty(&list)) { 187 while (!list_empty(&list)) {
189 struct call_single_data *data; 188 struct call_single_data *csd;
189 unsigned int csd_flags;
190 190
191 data = list_entry(list.next, struct call_single_data, list); 191 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&data->list); 192 list_del(&csd->list);
193 193
194 /* 194 /*
195 * 'data' can be invalid after this call if flags == 0 195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()), 196 * (when called through generic_exec_single()),
197 * so save them away before making the call: 197 * so save them away before making the call:
198 */ 198 */
199 data_flags = data->flags; 199 csd_flags = csd->flags;
200 200
201 data->func(data->info); 201 csd->func(csd->info);
202 202
203 /* 203 /*
204 * Unlocked CSDs are valid through generic_exec_single(): 204 * Unlocked CSDs are valid through generic_exec_single():
205 */ 205 */
206 if (data_flags & CSD_FLAG_LOCK) 206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(data); 207 csd_unlock(csd);
208 } 208 }
209} 209}
210 210
@@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
249 local_irq_restore(flags); 249 local_irq_restore(flags);
250 } else { 250 } else {
251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
252 struct call_single_data *data = &d; 252 struct call_single_data *csd = &d;
253 253
254 if (!wait) 254 if (!wait)
255 data = &__get_cpu_var(csd_data); 255 csd = &__get_cpu_var(csd_data);
256 256
257 csd_lock(data); 257 csd_lock(csd);
258 258
259 data->func = func; 259 csd->func = func;
260 data->info = info; 260 csd->info = info;
261 generic_exec_single(cpu, data, wait); 261 generic_exec_single(cpu, csd, wait);
262 } else { 262 } else {
263 err = -ENXIO; /* CPU not online */ 263 err = -ENXIO; /* CPU not online */
264 } 264 }
@@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
325 * pre-allocated data structure. Useful for embedding @data inside 325 * pre-allocated data structure. Useful for embedding @data inside
326 * other structures, for instance. 326 * other structures, for instance.
327 */ 327 */
328void __smp_call_function_single(int cpu, struct call_single_data *data, 328void __smp_call_function_single(int cpu, struct call_single_data *csd,
329 int wait) 329 int wait)
330{ 330{
331 unsigned int this_cpu; 331 unsigned int this_cpu;
@@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
343 343
344 if (cpu == this_cpu) { 344 if (cpu == this_cpu) {
345 local_irq_save(flags); 345 local_irq_save(flags);
346 data->func(data->info); 346 csd->func(csd->info);
347 local_irq_restore(flags); 347 local_irq_restore(flags);
348 } else { 348 } else {
349 csd_lock(data); 349 csd_lock(csd);
350 generic_exec_single(cpu, data, wait); 350 generic_exec_single(cpu, csd, wait);
351 } 351 }
352 put_cpu(); 352 put_cpu();
353} 353}
@@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
369void smp_call_function_many(const struct cpumask *mask, 369void smp_call_function_many(const struct cpumask *mask,
370 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
371{ 371{
372 struct call_function_data *data; 372 struct call_function_data *cfd;
373 int cpu, next_cpu, this_cpu = smp_processor_id(); 373 int cpu, next_cpu, this_cpu = smp_processor_id();
374 374
375 /* 375 /*
@@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask,
401 return; 401 return;
402 } 402 }
403 403
404 data = &__get_cpu_var(cfd_data); 404 cfd = &__get_cpu_var(cfd_data);
405 405
406 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
407 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, cfd->cpumask);
408 408
409 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
410 if (unlikely(!cpumask_weight(data->cpumask))) 410 if (unlikely(!cpumask_weight(cfd->cpumask)))
411 return; 411 return;
412 412
413 /* 413 /*
414 * After we put an entry into the list, data->cpumask 414 * After we put an entry into the list, cfd->cpumask may be cleared
415 * may be cleared again when another CPU sends another IPI for 415 * again when another CPU sends another IPI for a SMP function call, so
416 * a SMP function call, so data->cpumask will be zero. 416 * cfd->cpumask will be zero.
417 */ 417 */
418 cpumask_copy(data->cpumask_ipi, data->cpumask); 418 cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
419 419
420 for_each_cpu(cpu, data->cpumask) { 420 for_each_cpu(cpu, cfd->cpumask) {
421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); 421 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
422 struct call_single_queue *dst = 422 struct call_single_queue *dst =
423 &per_cpu(call_single_queue, cpu); 423 &per_cpu(call_single_queue, cpu);
424 unsigned long flags; 424 unsigned long flags;
@@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask,
433 } 433 }
434 434
435 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
436 arch_send_call_function_ipi_mask(data->cpumask_ipi); 436 arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
437 437
438 if (wait) { 438 if (wait) {
439 for_each_cpu(cpu, data->cpumask) { 439 for_each_cpu(cpu, cfd->cpumask) {
440 struct call_single_data *csd = 440 struct call_single_data *csd;
441 per_cpu_ptr(data->csd, cpu); 441
442 csd = per_cpu_ptr(cfd->csd, cpu);
442 csd_lock_wait(csd); 443 csd_lock_wait(csd);
443 } 444 }
444 } 445 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14d7758074aa..aa82723c7202 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -620,8 +620,7 @@ static void remote_softirq_receive(void *data)
620 unsigned long flags; 620 unsigned long flags;
621 int softirq; 621 int softirq;
622 622
623 softirq = cp->priv; 623 softirq = *(int *)cp->info;
624
625 local_irq_save(flags); 624 local_irq_save(flags);
626 __local_trigger(cp, softirq); 625 __local_trigger(cp, softirq);
627 local_irq_restore(flags); 626 local_irq_restore(flags);
@@ -631,9 +630,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
631{ 630{
632 if (cpu_online(cpu)) { 631 if (cpu_online(cpu)) {
633 cp->func = remote_softirq_receive; 632 cp->func = remote_softirq_receive;
634 cp->info = cp; 633 cp->info = &softirq;
635 cp->flags = 0; 634 cp->flags = 0;
636 cp->priv = softirq;
637 635
638 __smp_call_function_single(cpu, cp, 0); 636 __smp_call_function_single(cpu, cp, 0);
639 return 0; 637 return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 0da73cf73e60..b95d3c72ba21 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,11 @@
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h> 50#include <linux/binfmts.h>
51 51
52#include <linux/sched.h>
53#include <linux/rcupdate.h>
54#include <linux/uidgid.h>
55#include <linux/cred.h>
56
52#include <linux/kmsg_dump.h> 57#include <linux/kmsg_dump.h>
53/* Move somewhere else to avoid recompiling? */ 58/* Move somewhere else to avoid recompiling? */
54#include <generated/utsrelease.h> 59#include <generated/utsrelease.h>
@@ -1044,6 +1049,67 @@ change_okay:
1044 return old_fsgid; 1049 return old_fsgid;
1045} 1050}
1046 1051
1052/**
1053 * sys_getpid - return the thread group id of the current process
1054 *
1055 * Note, despite the name, this returns the tgid not the pid. The tgid and
1056 * the pid are identical unless CLONE_THREAD was specified on clone() in
1057 * which case the tgid is the same in all threads of the same group.
1058 *
1059 * This is SMP safe as current->tgid does not change.
1060 */
1061SYSCALL_DEFINE0(getpid)
1062{
1063 return task_tgid_vnr(current);
1064}
1065
1066/* Thread ID - the internal kernel "pid" */
1067SYSCALL_DEFINE0(gettid)
1068{
1069 return task_pid_vnr(current);
1070}
1071
1072/*
1073 * Accessing ->real_parent is not SMP-safe, it could
1074 * change from under us. However, we can use a stale
1075 * value of ->real_parent under rcu_read_lock(), see
1076 * release_task()->call_rcu(delayed_put_task_struct).
1077 */
1078SYSCALL_DEFINE0(getppid)
1079{
1080 int pid;
1081
1082 rcu_read_lock();
1083 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1084 rcu_read_unlock();
1085
1086 return pid;
1087}
1088
1089SYSCALL_DEFINE0(getuid)
1090{
1091 /* Only we change this so SMP safe */
1092 return from_kuid_munged(current_user_ns(), current_uid());
1093}
1094
1095SYSCALL_DEFINE0(geteuid)
1096{
1097 /* Only we change this so SMP safe */
1098 return from_kuid_munged(current_user_ns(), current_euid());
1099}
1100
1101SYSCALL_DEFINE0(getgid)
1102{
1103 /* Only we change this so SMP safe */
1104 return from_kgid_munged(current_user_ns(), current_gid());
1105}
1106
1107SYSCALL_DEFINE0(getegid)
1108{
1109 /* Only we change this so SMP safe */
1110 return from_kgid_munged(current_user_ns(), current_egid());
1111}
1112
1047void do_sys_times(struct tms *tms) 1113void do_sys_times(struct tms *tms)
1048{ 1114{
1049 cputime_t tgutime, tgstime, cutime, cstime; 1115 cputime_t tgutime, tgstime, cutime, cstime;
@@ -1785,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1785 return getrusage(current, who, ru); 1851 return getrusage(current, who, ru);
1786} 1852}
1787 1853
1854#ifdef CONFIG_COMPAT
1855COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1856{
1857 struct rusage r;
1858
1859 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1860 who != RUSAGE_THREAD)
1861 return -EINVAL;
1862
1863 k_getrusage(current, who, &r);
1864 return put_compat_rusage(&r, ru);
1865}
1866#endif
1867
1788SYSCALL_DEFINE1(umask, int, mask) 1868SYSCALL_DEFINE1(umask, int, mask)
1789{ 1869{
1790 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1870 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1791 return mask; 1871 return mask;
1792} 1872}
1793 1873
1794#ifdef CONFIG_CHECKPOINT_RESTORE
1795static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1874static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1796{ 1875{
1797 struct fd exe; 1876 struct fd exe;
@@ -1985,17 +2064,12 @@ out:
1985 return error; 2064 return error;
1986} 2065}
1987 2066
2067#ifdef CONFIG_CHECKPOINT_RESTORE
1988static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2068static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1989{ 2069{
1990 return put_user(me->clear_child_tid, tid_addr); 2070 return put_user(me->clear_child_tid, tid_addr);
1991} 2071}
1992 2072#else
1993#else /* CONFIG_CHECKPOINT_RESTORE */
1994static int prctl_set_mm(int opt, unsigned long addr,
1995 unsigned long arg4, unsigned long arg5)
1996{
1997 return -EINVAL;
1998}
1999static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2073static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
2000{ 2074{
2001 return -EINVAL; 2075 return -EINVAL;
@@ -2245,3 +2319,148 @@ int orderly_poweroff(bool force)
2245 return 0; 2319 return 0;
2246} 2320}
2247EXPORT_SYMBOL_GPL(orderly_poweroff); 2321EXPORT_SYMBOL_GPL(orderly_poweroff);
2322
2323/**
2324 * do_sysinfo - fill in sysinfo struct
2325 * @info: pointer to buffer to fill
2326 */
2327static int do_sysinfo(struct sysinfo *info)
2328{
2329 unsigned long mem_total, sav_total;
2330 unsigned int mem_unit, bitcount;
2331 struct timespec tp;
2332
2333 memset(info, 0, sizeof(struct sysinfo));
2334
2335 ktime_get_ts(&tp);
2336 monotonic_to_bootbased(&tp);
2337 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2338
2339 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
2340
2341 info->procs = nr_threads;
2342
2343 si_meminfo(info);
2344 si_swapinfo(info);
2345
2346 /*
2347 * If the sum of all the available memory (i.e. ram + swap)
2348 * is less than can be stored in a 32 bit unsigned long then
2349 * we can be binary compatible with 2.2.x kernels. If not,
2350 * well, in that case 2.2.x was broken anyways...
2351 *
2352 * -Erik Andersen <andersee@debian.org>
2353 */
2354
2355 mem_total = info->totalram + info->totalswap;
2356 if (mem_total < info->totalram || mem_total < info->totalswap)
2357 goto out;
2358 bitcount = 0;
2359 mem_unit = info->mem_unit;
2360 while (mem_unit > 1) {
2361 bitcount++;
2362 mem_unit >>= 1;
2363 sav_total = mem_total;
2364 mem_total <<= 1;
2365 if (mem_total < sav_total)
2366 goto out;
2367 }
2368
2369 /*
2370 * If mem_total did not overflow, multiply all memory values by
2371 * info->mem_unit and set it to 1. This leaves things compatible
2372 * with 2.2.x, and also retains compatibility with earlier 2.4.x
2373 * kernels...
2374 */
2375
2376 info->mem_unit = 1;
2377 info->totalram <<= bitcount;
2378 info->freeram <<= bitcount;
2379 info->sharedram <<= bitcount;
2380 info->bufferram <<= bitcount;
2381 info->totalswap <<= bitcount;
2382 info->freeswap <<= bitcount;
2383 info->totalhigh <<= bitcount;
2384 info->freehigh <<= bitcount;
2385
2386out:
2387 return 0;
2388}
2389
2390SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
2391{
2392 struct sysinfo val;
2393
2394 do_sysinfo(&val);
2395
2396 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
2397 return -EFAULT;
2398
2399 return 0;
2400}
2401
2402#ifdef CONFIG_COMPAT
2403struct compat_sysinfo {
2404 s32 uptime;
2405 u32 loads[3];
2406 u32 totalram;
2407 u32 freeram;
2408 u32 sharedram;
2409 u32 bufferram;
2410 u32 totalswap;
2411 u32 freeswap;
2412 u16 procs;
2413 u16 pad;
2414 u32 totalhigh;
2415 u32 freehigh;
2416 u32 mem_unit;
2417 char _f[20-2*sizeof(u32)-sizeof(int)];
2418};
2419
2420COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2421{
2422 struct sysinfo s;
2423
2424 do_sysinfo(&s);
2425
2426 /* Check to see if any memory value is too large for 32-bit and scale
2427 * down if needed
2428 */
2429 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
2430 int bitcount = 0;
2431
2432 while (s.mem_unit < PAGE_SIZE) {
2433 s.mem_unit <<= 1;
2434 bitcount++;
2435 }
2436
2437 s.totalram >>= bitcount;
2438 s.freeram >>= bitcount;
2439 s.sharedram >>= bitcount;
2440 s.bufferram >>= bitcount;
2441 s.totalswap >>= bitcount;
2442 s.freeswap >>= bitcount;
2443 s.totalhigh >>= bitcount;
2444 s.freehigh >>= bitcount;
2445 }
2446
2447 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
2448 __put_user(s.uptime, &info->uptime) ||
2449 __put_user(s.loads[0], &info->loads[0]) ||
2450 __put_user(s.loads[1], &info->loads[1]) ||
2451 __put_user(s.loads[2], &info->loads[2]) ||
2452 __put_user(s.totalram, &info->totalram) ||
2453 __put_user(s.freeram, &info->freeram) ||
2454 __put_user(s.sharedram, &info->sharedram) ||
2455 __put_user(s.bufferram, &info->bufferram) ||
2456 __put_user(s.totalswap, &info->totalswap) ||
2457 __put_user(s.freeswap, &info->freeswap) ||
2458 __put_user(s.procs, &info->procs) ||
2459 __put_user(s.totalhigh, &info->totalhigh) ||
2460 __put_user(s.freehigh, &info->freehigh) ||
2461 __put_user(s.mem_unit, &info->mem_unit))
2462 return -EFAULT;
2463
2464 return 0;
2465}
2466#endif /* CONFIG_COMPAT */
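
    The two scalings added above run in opposite directions: do_sysinfo()
    folds mem_unit into the counters when the combined totals fit, while the
    compat path grows mem_unit toward the page size and shifts the counters
    down until they fit in 32 bits. A small sketch of the compat
    down-scaling, with illustrative numbers (4 KiB page, 16 GiB of RAM
    counted in bytes):

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SIZE_SKETCH 4096UL

    int main(void)
    {
            uint64_t totalram = 16ULL << 30;        /* 16 GiB ... */
            uint32_t mem_unit = 1;                  /* ... counted in bytes */
            int bitcount = 0;

            if (totalram >> 32) {
                    /* grow the unit up to the page size, shrinking the
                     * counter by the same factor */
                    while (mem_unit < PAGE_SIZE_SKETCH) {
                            mem_unit <<= 1;
                            bitcount++;
                    }
                    totalram >>= bitcount;
            }
            /* 16 GiB / 4096 = 4194304 units, which fits in 32 bits */
            printf("unit=%u totalram=%llu\n", mem_unit,
                   (unsigned long long)totalram);
            return 0;
    }
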
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce16..bfd6787b355a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -20,6 +20,7 @@ cond_syscall(sys_quotactl);
20cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
21cond_syscall(sys_acct); 21cond_syscall(sys_acct);
22cond_syscall(sys_lookup_dcookie); 22cond_syscall(sys_lookup_dcookie);
23cond_syscall(compat_sys_lookup_dcookie);
23cond_syscall(sys_swapon); 24cond_syscall(sys_swapon);
24cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
@@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev);
155cond_syscall(sys_pciconfig_read); 156cond_syscall(sys_pciconfig_read);
156cond_syscall(sys_pciconfig_write); 157cond_syscall(sys_pciconfig_write);
157cond_syscall(sys_pciconfig_iobase); 158cond_syscall(sys_pciconfig_iobase);
158cond_syscall(sys32_ipc); 159cond_syscall(compat_sys_s390_ipc);
159cond_syscall(ppc_rtas); 160cond_syscall(ppc_rtas);
160cond_syscall(sys_spu_run); 161cond_syscall(sys_spu_run);
161cond_syscall(sys_spu_create); 162cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc60f3f8..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
106#endif 106#endif
107extern int pid_max; 107extern int pid_max;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 109extern int percpu_pagelist_fraction;
111extern int compat_log; 110extern int compat_log;
112extern int latencytop_enabled; 111extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
1430 .extra2 = &one, 1429 .extra2 = &one,
1431 }, 1430 },
1432#endif 1431#endif
1432 {
1433 .procname = "user_reserve_kbytes",
1434 .data = &sysctl_user_reserve_kbytes,
1435 .maxlen = sizeof(sysctl_user_reserve_kbytes),
1436 .mode = 0644,
1437 .proc_handler = proc_doulongvec_minmax,
1438 },
1439 {
1440 .procname = "admin_reserve_kbytes",
1441 .data = &sysctl_admin_reserve_kbytes,
1442 .maxlen = sizeof(sysctl_admin_reserve_kbytes),
1443 .mode = 0644,
1444 .proc_handler = proc_doulongvec_minmax,
1445 },
1433 { } 1446 { }
1434}; 1447};
1435 1448
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index f8b11a283171..12d6ebbfdd83 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -365,7 +365,7 @@ int init_test_probes(void)
365 target2 = kprobe_target2; 365 target2 = kprobe_target2;
366 366
367 do { 367 do {
368 rand1 = random32(); 368 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 369 } while (rand1 <= div_factor);
370 370
371 printk(KERN_INFO "Kprobe smoke test started\n"); 371 printk(KERN_INFO "Kprobe smoke test started\n");
diff --git a/kernel/time.c b/kernel/time.c
index f8342a41efa6..d3617dbd3dca 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -138,13 +138,14 @@ int persistent_clock_is_local;
138 */ 138 */
139static inline void warp_clock(void) 139static inline void warp_clock(void)
140{ 140{
141 struct timespec adjust; 141 if (sys_tz.tz_minuteswest != 0) {
142 struct timespec adjust;
142 143
143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1; 144 persistent_clock_is_local = 1;
146 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 145 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
147 do_settimeofday(&adjust); 146 adjust.tv_nsec = 0;
147 timekeeping_inject_offset(&adjust);
148 }
148} 149}
149 150
150/* 151/*
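
    Worked example of the reworked warp, with an illustrative timezone: for
    sys_tz.tz_minuteswest = 300 (five hours west of UTC), the injected offset
    is adjust.tv_sec = 300 * 60 = 18000 seconds, shifting a hardware clock
    that keeps local time onto UTC; expressing this through
    timekeeping_inject_offset() applies the delta as a single offset instead
    of reading current_kernel_time() and writing back a recomputed absolute
    time via do_settimeofday().
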
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 072bb066bb7d..12ff13a838c6 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,13 +18,14 @@
18#include <linux/rtc.h> 18#include <linux/rtc.h>
19 19
20#include "tick-internal.h" 20#include "tick-internal.h"
21#include "ntp_internal.h"
21 22
22/* 23/*
23 * NTP timekeeping variables: 24 * NTP timekeeping variables:
25 *
26 * Note: All of the NTP state is protected by the timekeeping locks.
24 */ 27 */
25 28
26DEFINE_RAW_SPINLOCK(ntp_lock);
27
28 29
29/* USER_HZ period (usecs): */ 30/* USER_HZ period (usecs): */
30unsigned long tick_usec = TICK_USEC; 31unsigned long tick_usec = TICK_USEC;
@@ -53,9 +54,6 @@ static int time_state = TIME_OK;
53/* clock status bits: */ 54/* clock status bits: */
54static int time_status = STA_UNSYNC; 55static int time_status = STA_UNSYNC;
55 56
56/* TAI offset (secs): */
57static long time_tai;
58
59/* time adjustment (nsecs): */ 57/* time adjustment (nsecs): */
60static s64 time_offset; 58static s64 time_offset;
61 59
@@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void)
134 132
135/** 133/**
136 * pps_clear - Clears the PPS state variables 134 * pps_clear - Clears the PPS state variables
137 *
138 * Must be called while holding a write on the ntp_lock
139 */ 135 */
140static inline void pps_clear(void) 136static inline void pps_clear(void)
141{ 137{
@@ -150,8 +146,6 @@ static inline void pps_clear(void)
150/* Decrease pps_valid to indicate that another second has passed since 146/* Decrease pps_valid to indicate that another second has passed since
151 * the last PPS signal. When it reaches 0, indicate that PPS signal is 147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
152 * missing. 148 * missing.
153 *
154 * Must be called while holding a write on the ntp_lock
155 */ 149 */
156static inline void pps_dec_valid(void) 150static inline void pps_dec_valid(void)
157{ 151{
@@ -346,10 +340,6 @@ static void ntp_update_offset(long offset)
346 */ 340 */
347void ntp_clear(void) 341void ntp_clear(void)
348{ 342{
349 unsigned long flags;
350
351 raw_spin_lock_irqsave(&ntp_lock, flags);
352
353 time_adjust = 0; /* stop active adjtime() */ 343 time_adjust = 0; /* stop active adjtime() */
354 time_status |= STA_UNSYNC; 344 time_status |= STA_UNSYNC;
355 time_maxerror = NTP_PHASE_LIMIT; 345 time_maxerror = NTP_PHASE_LIMIT;
@@ -362,20 +352,12 @@ void ntp_clear(void)
362 352
363 /* Clear PPS state variables */ 353 /* Clear PPS state variables */
364 pps_clear(); 354 pps_clear();
365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
366
367} 355}
368 356
369 357
370u64 ntp_tick_length(void) 358u64 ntp_tick_length(void)
371{ 359{
372 unsigned long flags; 360 return tick_length;
373 s64 ret;
374
375 raw_spin_lock_irqsave(&ntp_lock, flags);
376 ret = tick_length;
377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
378 return ret;
379} 361}
380 362
381 363
@@ -393,9 +375,6 @@ int second_overflow(unsigned long secs)
393{ 375{
394 s64 delta; 376 s64 delta;
395 int leap = 0; 377 int leap = 0;
396 unsigned long flags;
397
398 raw_spin_lock_irqsave(&ntp_lock, flags);
399 378
400 /* 379 /*
401 * Leap second processing. If in leap-insert state at the end of the 380 * Leap second processing. If in leap-insert state at the end of the
@@ -415,7 +394,6 @@ int second_overflow(unsigned long secs)
415 else if (secs % 86400 == 0) { 394 else if (secs % 86400 == 0) {
416 leap = -1; 395 leap = -1;
417 time_state = TIME_OOP; 396 time_state = TIME_OOP;
418 time_tai++;
419 printk(KERN_NOTICE 397 printk(KERN_NOTICE
420 "Clock: inserting leap second 23:59:60 UTC\n"); 398 "Clock: inserting leap second 23:59:60 UTC\n");
421 } 399 }
@@ -425,7 +403,6 @@ int second_overflow(unsigned long secs)
425 time_state = TIME_OK; 403 time_state = TIME_OK;
426 else if ((secs + 1) % 86400 == 0) { 404 else if ((secs + 1) % 86400 == 0) {
427 leap = 1; 405 leap = 1;
428 time_tai--;
429 time_state = TIME_WAIT; 406 time_state = TIME_WAIT;
430 printk(KERN_NOTICE 407 printk(KERN_NOTICE
431 "Clock: deleting leap second 23:59:59 UTC\n"); 408 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -479,8 +456,6 @@ int second_overflow(unsigned long secs)
479 time_adjust = 0; 456 time_adjust = 0;
480 457
481out: 458out:
482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
483
484 return leap; 459 return leap;
485} 460}
486 461
@@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
575 time_status |= txc->status & ~STA_RONLY; 550 time_status |= txc->status & ~STA_RONLY;
576} 551}
577 552
578/* 553
579 * Called with ntp_lock held, so we can access and modify 554static inline void process_adjtimex_modes(struct timex *txc,
580 * all the global NTP state: 555 struct timespec *ts,
581 */ 556 s32 *time_tai)
582static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
583{ 557{
584 if (txc->modes & ADJ_STATUS) 558 if (txc->modes & ADJ_STATUS)
585 process_adj_status(txc, ts); 559 process_adj_status(txc, ts);
@@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
613 } 587 }
614 588
615 if (txc->modes & ADJ_TAI && txc->constant > 0) 589 if (txc->modes & ADJ_TAI && txc->constant > 0)
616 time_tai = txc->constant; 590 *time_tai = txc->constant;
617 591
618 if (txc->modes & ADJ_OFFSET) 592 if (txc->modes & ADJ_OFFSET)
619 ntp_update_offset(txc->offset); 593 ntp_update_offset(txc->offset);
@@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
625 ntp_update_frequency(); 599 ntp_update_frequency();
626} 600}
627 601
628/* 602
629 * adjtimex mainly allows reading (and writing, if superuser) of 603
630 * kernel time-keeping variables. used by xntpd. 604/**
605 * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
631 */ 606 */
632int do_adjtimex(struct timex *txc) 607int ntp_validate_timex(struct timex *txc)
633{ 608{
634 struct timespec ts;
635 int result;
636
637 /* Validate the data before disabling interrupts */
638 if (txc->modes & ADJ_ADJTIME) { 609 if (txc->modes & ADJ_ADJTIME) {
639 /* singleshot must not be used with any other mode bits */ 610 /* singleshot must not be used with any other mode bits */
640 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) 611 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
@@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc)
646 /* In order to modify anything, you gotta be super-user! */ 617 /* In order to modify anything, you gotta be super-user! */
647 if (txc->modes && !capable(CAP_SYS_TIME)) 618 if (txc->modes && !capable(CAP_SYS_TIME))
648 return -EPERM; 619 return -EPERM;
649
650 /* 620 /*
651 * if the quartz is off by more than 10% then 621 * if the quartz is off by more than 10% then
652 * something is VERY wrong! 622 * something is VERY wrong!
@@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc)
657 return -EINVAL; 627 return -EINVAL;
658 } 628 }
659 629
660 if (txc->modes & ADJ_SETOFFSET) { 630 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
661 struct timespec delta; 631 return -EPERM;
662 delta.tv_sec = txc->time.tv_sec;
663 delta.tv_nsec = txc->time.tv_usec;
664 if (!capable(CAP_SYS_TIME))
665 return -EPERM;
666 if (!(txc->modes & ADJ_NANO))
667 delta.tv_nsec *= 1000;
668 result = timekeeping_inject_offset(&delta);
669 if (result)
670 return result;
671 }
672 632
673 getnstimeofday(&ts); 633 return 0;
634}
674 635
675 raw_spin_lock_irq(&ntp_lock); 636
637/*
638 * adjtimex mainly allows reading (and writing, if superuser) of
639 * kernel time-keeping variables. used by xntpd.
640 */
641int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
642{
643 int result;
676 644
677 if (txc->modes & ADJ_ADJTIME) { 645 if (txc->modes & ADJ_ADJTIME) {
678 long save_adjust = time_adjust; 646 long save_adjust = time_adjust;
@@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc)
687 655
688 /* If there are input parameters, then process them: */ 656 /* If there are input parameters, then process them: */
689 if (txc->modes) 657 if (txc->modes)
690 process_adjtimex_modes(txc, &ts); 658 process_adjtimex_modes(txc, ts, time_tai);
691 659
692 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 660 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
693 NTP_SCALE_SHIFT); 661 NTP_SCALE_SHIFT);
@@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc)
709 txc->precision = 1; 677 txc->precision = 1;
710 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; 678 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
711 txc->tick = tick_usec; 679 txc->tick = tick_usec;
712 txc->tai = time_tai; 680 txc->tai = *time_tai;
713 681
714 /* fill PPS status fields */ 682 /* fill PPS status fields */
715 pps_fill_timex(txc); 683 pps_fill_timex(txc);
716 684
717 raw_spin_unlock_irq(&ntp_lock); 685 txc->time.tv_sec = ts->tv_sec;
718 686 txc->time.tv_usec = ts->tv_nsec;
719 txc->time.tv_sec = ts.tv_sec;
720 txc->time.tv_usec = ts.tv_nsec;
721 if (!(time_status & STA_NANO)) 687 if (!(time_status & STA_NANO))
722 txc->time.tv_usec /= NSEC_PER_USEC; 688 txc->time.tv_usec /= NSEC_PER_USEC;
723 689
@@ -894,7 +860,7 @@ static void hardpps_update_phase(long error)
894} 860}
895 861
896/* 862/*
897 * hardpps() - discipline CPU clock oscillator to external PPS signal 863 * __hardpps() - discipline CPU clock oscillator to external PPS signal
898 * 864 *
899 * This routine is called at each PPS signal arrival in order to 865 * This routine is called at each PPS signal arrival in order to
900 * discipline the CPU clock oscillator to the PPS signal. It takes two 866 * discipline the CPU clock oscillator to the PPS signal. It takes two
@@ -905,15 +871,13 @@ static void hardpps_update_phase(long error)
905 * This code is based on David Mills's reference nanokernel 871 * This code is based on David Mills's reference nanokernel
906 * implementation. It was mostly rewritten but keeps the same idea. 872 * implementation. It was mostly rewritten but keeps the same idea.
907 */ 873 */
908void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
909{ 875{
910 struct pps_normtime pts_norm, freq_norm; 876 struct pps_normtime pts_norm, freq_norm;
911 unsigned long flags; 877 unsigned long flags;
912 878
913 pts_norm = pps_normalize_ts(*phase_ts); 879 pts_norm = pps_normalize_ts(*phase_ts);
914 880
915 raw_spin_lock_irqsave(&ntp_lock, flags);
916
917 /* clear the error bits, they will be set again if needed */ 881 /* clear the error bits, they will be set again if needed */
918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 882 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
919 883
@@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
925 * just start the frequency interval */ 889 * just start the frequency interval */
926 if (unlikely(pps_fbase.tv_sec == 0)) { 890 if (unlikely(pps_fbase.tv_sec == 0)) {
927 pps_fbase = *raw_ts; 891 pps_fbase = *raw_ts;
928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
929 return; 892 return;
930 } 893 }
931 894
@@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
940 time_status |= STA_PPSJITTER; 903 time_status |= STA_PPSJITTER;
941 /* restart the frequency calibration interval */ 904 /* restart the frequency calibration interval */
942 pps_fbase = *raw_ts; 905 pps_fbase = *raw_ts;
943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
944 pr_err("hardpps: PPSJITTER: bad pulse\n"); 906 pr_err("hardpps: PPSJITTER: bad pulse\n");
945 return; 907 return;
946 } 908 }
@@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
957 919
958 hardpps_update_phase(pts_norm.nsec); 920 hardpps_update_phase(pts_norm.nsec);
959 921
960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
961} 922}
962EXPORT_SYMBOL(hardpps);
963
964#endif /* CONFIG_NTP_PPS */ 923#endif /* CONFIG_NTP_PPS */
965 924
966static int __init ntp_tick_adj_setup(char *str) 925static int __init ntp_tick_adj_setup(char *str)
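
With ntp_lock gone, do_adjtimex() no longer lives in ntp.c at all: validation moves to ntp_validate_timex(), the state update to __do_adjtimex(), and serialization to the timekeeping locks. The sketch below approximates how a caller in kernel/time/timekeeping.c can tie these pieces back together; the real do_adjtimex() there also handles ADJ_SETOFFSET via timekeeping_inject_offset() and may differ in detail, so read this as an outline rather than the patch itself.

/* Approximate outline of the relocated do_adjtimex(); not the exact patch. */
int do_adjtimex(struct timex *txc)
{
	struct timekeeper *tk = &timekeeper;
	unsigned long flags;
	struct timespec ts;
	s32 orig_tai, tai;
	int ret;

	/* Validate user input before taking any locks. */
	ret = ntp_validate_timex(txc);
	if (ret)
		return ret;

	/* ADJ_SETOFFSET handling (timekeeping_inject_offset) omitted here. */

	getnstimeofday(&ts);

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	orig_tai = tai = tk->tai_offset;
	ret = __do_adjtimex(txc, &ts, &tai);	/* NTP state updated under the locks */

	if (tai != orig_tai)
		__timekeeping_set_tai_offset(tk, tai);

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);

	if (tai != orig_tai)
		clock_was_set();

	return ret;
}
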
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 000000000000..1950cb4ca2a4
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
1#ifndef _LINUX_NTP_INTERNAL_H
2#define _LINUX_NTP_INTERNAL_H
3
4extern void ntp_init(void);
5extern void ntp_clear(void);
6/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */
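
Since hardpps() is renamed to __hardpps() and loses both its ntp_lock usage and its EXPORT_SYMBOL, the exported entry point presumably becomes a thin wrapper in kernel/time/timekeeping.c that supplies the locking introduced in the hunks further down. A sketch of such a wrapper, under that assumption; the real implementation may differ in detail.

/* Sketch of the exported hardpps() wrapper assumed to live in timekeeping.c. */
#ifdef CONFIG_NTP_PPS
void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	__hardpps(phase_ts, raw_ts);	/* PPS/NTP state now updated under
					 * the timekeeping locks */

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
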
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7f32fe0e52cd..61d00a8cdf2f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,9 +28,8 @@
28 */ 28 */
29 29
30static struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31static cpumask_var_t tick_broadcast_mask;
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static cpumask_var_t tmpmask;
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 34static int tick_broadcast_force;
36 35
@@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void)
50 49
51struct cpumask *tick_get_broadcast_mask(void) 50struct cpumask *tick_get_broadcast_mask(void)
52{ 51{
53 return to_cpumask(tick_broadcast_mask); 52 return tick_broadcast_mask;
54} 53}
55 54
56/* 55/*
@@ -67,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
67 */ 66 */
68int tick_check_broadcast_device(struct clock_event_device *dev) 67int tick_check_broadcast_device(struct clock_event_device *dev)
69{ 68{
69 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70
70 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || 71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
71 (tick_broadcast_device.evtdev && 72 (tick_broadcast_device.evtdev &&
72 tick_broadcast_device.evtdev->rating >= dev->rating) || 73 tick_broadcast_device.evtdev->rating >= dev->rating) ||
@@ -74,9 +75,21 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
74 return 0; 75 return 0;
75 76
76 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
78 if (cur)
79 cur->event_handler = clockevents_handle_noop;
77 tick_broadcast_device.evtdev = dev; 80 tick_broadcast_device.evtdev = dev;
78 if (!cpumask_empty(tick_get_broadcast_mask())) 81 if (!cpumask_empty(tick_broadcast_mask))
79 tick_broadcast_start_periodic(dev); 82 tick_broadcast_start_periodic(dev);
83 /*
84 * Inform all cpus about this. We might be in a situation
85 * where we did not switch to oneshot mode because the per cpu
86 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
87 * of a oneshot capable broadcast device. Without that
88 * notification the system stays stuck in periodic mode
89 * forever.
90 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify();
80 return 1; 93 return 1;
81} 94}
82 95
@@ -124,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
124 if (!tick_device_is_functional(dev)) { 137 if (!tick_device_is_functional(dev)) {
125 dev->event_handler = tick_handle_periodic; 138 dev->event_handler = tick_handle_periodic;
126 tick_device_setup_broadcast_func(dev); 139 tick_device_setup_broadcast_func(dev);
127 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 140 cpumask_set_cpu(cpu, tick_broadcast_mask);
128 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
129 ret = 1; 142 ret = 1;
130 } else { 143 } else {
@@ -135,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
135 */ 148 */
136 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
137 int cpu = smp_processor_id(); 150 int cpu = smp_processor_id();
138 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 151 cpumask_clear_cpu(cpu, tick_broadcast_mask);
139 tick_broadcast_clear_oneshot(cpu); 152 tick_broadcast_clear_oneshot(cpu);
140 } else { 153 } else {
141 tick_device_setup_broadcast_func(dev); 154 tick_device_setup_broadcast_func(dev);
@@ -199,9 +212,8 @@ static void tick_do_periodic_broadcast(void)
199{ 212{
200 raw_spin_lock(&tick_broadcast_lock); 213 raw_spin_lock(&tick_broadcast_lock);
201 214
202 cpumask_and(to_cpumask(tmpmask), 215 cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
203 cpu_online_mask, tick_get_broadcast_mask()); 216 tick_do_broadcast(tmpmask);
204 tick_do_broadcast(to_cpumask(tmpmask));
205 217
206 raw_spin_unlock(&tick_broadcast_lock); 218 raw_spin_unlock(&tick_broadcast_lock);
207} 219}
@@ -264,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
264 if (!tick_device_is_functional(dev)) 276 if (!tick_device_is_functional(dev))
265 goto out; 277 goto out;
266 278
267 bc_stopped = cpumask_empty(tick_get_broadcast_mask()); 279 bc_stopped = cpumask_empty(tick_broadcast_mask);
268 280
269 switch (*reason) { 281 switch (*reason) {
270 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 282 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
271 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
272 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
273 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
274 if (tick_broadcast_device.mode == 285 if (tick_broadcast_device.mode ==
275 TICKDEV_MODE_PERIODIC) 286 TICKDEV_MODE_PERIODIC)
276 clockevents_shutdown(dev); 287 clockevents_shutdown(dev);
@@ -280,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
280 break; 291 break;
281 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
282 if (!tick_broadcast_force && 293 if (!tick_broadcast_force &&
283 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
284 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
285 if (tick_broadcast_device.mode == 295 if (tick_broadcast_device.mode ==
286 TICKDEV_MODE_PERIODIC) 296 TICKDEV_MODE_PERIODIC)
287 tick_setup_periodic(dev, 0); 297 tick_setup_periodic(dev, 0);
@@ -289,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
289 break; 299 break;
290 } 300 }
291 301
292 if (cpumask_empty(tick_get_broadcast_mask())) { 302 if (cpumask_empty(tick_broadcast_mask)) {
293 if (!bc_stopped) 303 if (!bc_stopped)
294 clockevents_shutdown(bc); 304 clockevents_shutdown(bc);
295 } else if (bc_stopped) { 305 } else if (bc_stopped) {
@@ -338,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
338 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 348 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
339 349
340 bc = tick_broadcast_device.evtdev; 350 bc = tick_broadcast_device.evtdev;
341 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 351 cpumask_clear_cpu(cpu, tick_broadcast_mask);
342 352
343 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
344 if (bc && cpumask_empty(tick_get_broadcast_mask())) 354 if (bc && cpumask_empty(tick_broadcast_mask))
345 clockevents_shutdown(bc); 355 clockevents_shutdown(bc);
346 } 356 }
347 357
@@ -377,13 +387,13 @@ int tick_resume_broadcast(void)
377 387
378 switch (tick_broadcast_device.mode) { 388 switch (tick_broadcast_device.mode) {
379 case TICKDEV_MODE_PERIODIC: 389 case TICKDEV_MODE_PERIODIC:
380 if (!cpumask_empty(tick_get_broadcast_mask())) 390 if (!cpumask_empty(tick_broadcast_mask))
381 tick_broadcast_start_periodic(bc); 391 tick_broadcast_start_periodic(bc);
382 broadcast = cpumask_test_cpu(smp_processor_id(), 392 broadcast = cpumask_test_cpu(smp_processor_id(),
383 tick_get_broadcast_mask()); 393 tick_broadcast_mask);
384 break; 394 break;
385 case TICKDEV_MODE_ONESHOT: 395 case TICKDEV_MODE_ONESHOT:
386 if (!cpumask_empty(tick_get_broadcast_mask())) 396 if (!cpumask_empty(tick_broadcast_mask))
387 broadcast = tick_resume_broadcast_oneshot(bc); 397 broadcast = tick_resume_broadcast_oneshot(bc);
388 break; 398 break;
389 } 399 }
@@ -396,25 +406,58 @@ int tick_resume_broadcast(void)
396 406
397#ifdef CONFIG_TICK_ONESHOT 407#ifdef CONFIG_TICK_ONESHOT
398 408
399/* FIXME: use cpumask_var_t. */ 409static cpumask_var_t tick_broadcast_oneshot_mask;
400static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); 410static cpumask_var_t tick_broadcast_pending_mask;
411static cpumask_var_t tick_broadcast_force_mask;
401 412
402/* 413/*
403 * Exposed for debugging: see timer_list.c 414 * Exposed for debugging: see timer_list.c
404 */ 415 */
405struct cpumask *tick_get_broadcast_oneshot_mask(void) 416struct cpumask *tick_get_broadcast_oneshot_mask(void)
406{ 417{
407 return to_cpumask(tick_broadcast_oneshot_mask); 418 return tick_broadcast_oneshot_mask;
408} 419}
409 420
410static int tick_broadcast_set_event(ktime_t expires, int force) 421/*
422 * Called before going idle with interrupts disabled. Checks whether a
423 * broadcast event from the other core is about to happen. We detected
424 * that in tick_broadcast_oneshot_control(). The callsite can use this
425 * to avoid a deep idle transition as we are about to get the
426 * broadcast IPI right away.
427 */
428int tick_check_broadcast_expired(void)
411{ 429{
412 struct clock_event_device *bc = tick_broadcast_device.evtdev; 430 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
431}
432
433/*
434 * Set broadcast interrupt affinity
435 */
436static void tick_broadcast_set_affinity(struct clock_event_device *bc,
437 const struct cpumask *cpumask)
438{
439 if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
440 return;
441
442 if (cpumask_equal(bc->cpumask, cpumask))
443 return;
444
445 bc->cpumask = cpumask;
446 irq_set_affinity(bc->irq, bc->cpumask);
447}
448
449static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
450 ktime_t expires, int force)
451{
452 int ret;
413 453
414 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 454 if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
415 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 455 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
416 456
417 return clockevents_program_event(bc, expires, force); 457 ret = clockevents_program_event(bc, expires, force);
458 if (!ret)
459 tick_broadcast_set_affinity(bc, cpumask_of(cpu));
460 return ret;
418} 461}
419 462
420int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 463int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -429,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
429 */ 472 */
430void tick_check_oneshot_broadcast(int cpu) 473void tick_check_oneshot_broadcast(int cpu)
431{ 474{
432 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { 475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
433 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
434 477
435 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -443,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
443{ 486{
444 struct tick_device *td; 487 struct tick_device *td;
445 ktime_t now, next_event; 488 ktime_t now, next_event;
446 int cpu; 489 int cpu, next_cpu = 0;
447 490
448 raw_spin_lock(&tick_broadcast_lock); 491 raw_spin_lock(&tick_broadcast_lock);
449again: 492again:
450 dev->next_event.tv64 = KTIME_MAX; 493 dev->next_event.tv64 = KTIME_MAX;
451 next_event.tv64 = KTIME_MAX; 494 next_event.tv64 = KTIME_MAX;
452 cpumask_clear(to_cpumask(tmpmask)); 495 cpumask_clear(tmpmask);
453 now = ktime_get(); 496 now = ktime_get();
454 /* Find all expired events */ 497 /* Find all expired events */
455 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { 498 for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
456 td = &per_cpu(tick_cpu_device, cpu); 499 td = &per_cpu(tick_cpu_device, cpu);
457 if (td->evtdev->next_event.tv64 <= now.tv64) 500 if (td->evtdev->next_event.tv64 <= now.tv64) {
458 cpumask_set_cpu(cpu, to_cpumask(tmpmask)); 501 cpumask_set_cpu(cpu, tmpmask);
459 else if (td->evtdev->next_event.tv64 < next_event.tv64) 502 /*
503 * Mark the remote cpu in the pending mask, so
504 * it can avoid reprogramming the cpu local
505 * timer in tick_broadcast_oneshot_control().
506 */
507 cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
508 } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
460 next_event.tv64 = td->evtdev->next_event.tv64; 509 next_event.tv64 = td->evtdev->next_event.tv64;
510 next_cpu = cpu;
511 }
461 } 512 }
462 513
514 /* Take care of enforced broadcast requests */
515 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
516 cpumask_clear(tick_broadcast_force_mask);
517
463 /* 518 /*
464 * Wakeup the cpus which have an expired event. 519 * Wakeup the cpus which have an expired event.
465 */ 520 */
466 tick_do_broadcast(to_cpumask(tmpmask)); 521 tick_do_broadcast(tmpmask);
467 522
468 /* 523 /*
469 * Two reasons for reprogram: 524 * Two reasons for reprogram:
@@ -480,7 +535,7 @@ again:
480 * Rearm the broadcast device. If event expired, 535 * Rearm the broadcast device. If event expired,
481 * repeat the above 536 * repeat the above
482 */ 537 */
483 if (tick_broadcast_set_event(next_event, 0)) 538 if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
484 goto again; 539 goto again;
485 } 540 }
486 raw_spin_unlock(&tick_broadcast_lock); 541 raw_spin_unlock(&tick_broadcast_lock);
@@ -495,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
495 struct clock_event_device *bc, *dev; 550 struct clock_event_device *bc, *dev;
496 struct tick_device *td; 551 struct tick_device *td;
497 unsigned long flags; 552 unsigned long flags;
553 ktime_t now;
498 int cpu; 554 int cpu;
499 555
500 /* 556 /*
@@ -519,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason)
519 575
520 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 576 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
521 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 577 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
522 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 578 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
523 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 579 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
524 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 580 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
525 if (dev->next_event.tv64 < bc->next_event.tv64) 581 /*
526 tick_broadcast_set_event(dev->next_event, 1); 582 * We only reprogram the broadcast timer if we
583 * did not mark ourselves in the force mask and
584 * if the cpu local event is earlier than the
585 * broadcast event. If the current CPU is in
586 * the force mask, then we are going to be
587 * woken by the IPI right away.
588 */
589 if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
590 dev->next_event.tv64 < bc->next_event.tv64)
591 tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
527 } 592 }
528 } else { 593 } else {
529 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 594 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
530 cpumask_clear_cpu(cpu,
531 tick_get_broadcast_oneshot_mask());
532 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 595 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
533 if (dev->next_event.tv64 != KTIME_MAX) 596 if (dev->next_event.tv64 == KTIME_MAX)
534 tick_program_event(dev->next_event, 1); 597 goto out;
598 /*
599 * The cpu which was handling the broadcast
600 * timer marked this cpu in the broadcast
601 * pending mask and fired the broadcast
602 * IPI. So we are going to handle the expired
603 * event anyway via the broadcast IPI
604 * handler. No need to reprogram the timer
605 * with an already expired event.
606 */
607 if (cpumask_test_and_clear_cpu(cpu,
608 tick_broadcast_pending_mask))
609 goto out;
610
611 /*
612 * If the pending bit is not set, then we are
613 * either the CPU handling the broadcast
614 * interrupt or we got woken by something else.
615 *
616 * We are no longer in the broadcast mask, so
617 * if the cpu local expiry time is already
618 * reached, we would reprogram the cpu local
619 * timer with an already expired event.
620 *
621 * This can lead to a ping-pong when we return
622 * to idle and therefore rearm the broadcast
623 * timer before the cpu local timer was able
624 * to fire. This happens because the forced
625 * reprogramming makes sure that the event
626 * will happen in the future and depending on
627 * the min_delta setting this might be far
628 * enough out that the ping-pong starts.
629 *
630 * If the cpu local next_event has expired
631 * then we know that the broadcast timer
632 * next_event has expired as well and
633 * broadcast is about to be handled. So we
634 * avoid reprogramming and enforce that the
635 * broadcast handler, which did not run yet,
636 * will invoke the cpu local handler.
637 *
638 * We cannot call the handler directly from
639 * here, because we might be in a NOHZ phase
640 * and we did not go through the irq_enter()
641 * nohz fixups.
642 */
643 now = ktime_get();
644 if (dev->next_event.tv64 <= now.tv64) {
645 cpumask_set_cpu(cpu, tick_broadcast_force_mask);
646 goto out;
647 }
648 /*
649 * We got woken by something else. Reprogram
650 * the cpu local timer device.
651 */
652 tick_program_event(dev->next_event, 1);
535 } 653 }
536 } 654 }
655out:
537 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 656 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
538} 657}
539 658
@@ -544,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
544 */ 663 */
545static void tick_broadcast_clear_oneshot(int cpu) 664static void tick_broadcast_clear_oneshot(int cpu)
546{ 665{
547 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 666 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
548} 667}
549 668
550static void tick_broadcast_init_next_event(struct cpumask *mask, 669static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -582,17 +701,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
582 * oneshot_mask bits for those and program the 701 * oneshot_mask bits for those and program the
583 * broadcast device to fire. 702 * broadcast device to fire.
584 */ 703 */
585 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); 704 cpumask_copy(tmpmask, tick_broadcast_mask);
586 cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); 705 cpumask_clear_cpu(cpu, tmpmask);
587 cpumask_or(tick_get_broadcast_oneshot_mask(), 706 cpumask_or(tick_broadcast_oneshot_mask,
588 tick_get_broadcast_oneshot_mask(), 707 tick_broadcast_oneshot_mask, tmpmask);
589 to_cpumask(tmpmask));
590 708
591 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 709 if (was_periodic && !cpumask_empty(tmpmask)) {
592 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 710 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
593 tick_broadcast_init_next_event(to_cpumask(tmpmask), 711 tick_broadcast_init_next_event(tmpmask,
594 tick_next_period); 712 tick_next_period);
595 tick_broadcast_set_event(tick_next_period, 1); 713 tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
596 } else 714 } else
597 bc->next_event.tv64 = KTIME_MAX; 715 bc->next_event.tv64 = KTIME_MAX;
598 } else { 716 } else {
@@ -640,7 +758,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
640 * Clear the broadcast mask flag for the dead cpu, but do not 758 * Clear the broadcast mask flag for the dead cpu, but do not
641 * stop the broadcast device! 759 * stop the broadcast device!
642 */ 760 */
643 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 761 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
644 762
645 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 763 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
646} 764}
@@ -664,3 +782,14 @@ bool tick_broadcast_oneshot_available(void)
664} 782}
665 783
666#endif 784#endif
785
786void __init tick_broadcast_init(void)
787{
788 alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
789 alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
790#ifdef CONFIG_TICK_ONESHOT
791 alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
792 alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
793 alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
794#endif
795}
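
The comment above tick_check_broadcast_expired() notes that the idle path can use it to skip a deep C-state when the broadcast IPI for this CPU is already on the way. A hypothetical caller to make that concrete: cpu_enter_idle(), enter_shallow_idle_state() and enter_deep_idle_state() are invented names for illustration, only tick_check_broadcast_expired() comes from the patch.

/* Hypothetical idle-entry sketch; only tick_check_broadcast_expired() is real. */
static void cpu_enter_idle(void)
{
	local_irq_disable();

	if (tick_check_broadcast_expired()) {
		/*
		 * The broadcast IPI for this CPU is imminent; a deep state
		 * would be pointless and only add exit latency.
		 */
		enter_shallow_idle_state();
	} else {
		enter_deep_idle_state();
	}

	local_irq_enable();
}
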
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f4..6176a3e45709 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -323,6 +323,7 @@ static void tick_shutdown(unsigned int *cpup)
323 */ 323 */
324 dev->mode = CLOCK_EVT_MODE_UNUSED; 324 dev->mode = CLOCK_EVT_MODE_UNUSED;
325 clockevents_exchange_device(dev, NULL); 325 clockevents_exchange_device(dev, NULL);
326 dev->event_handler = clockevents_handle_noop;
326 td->evtdev = NULL; 327 td->evtdev = NULL;
327 } 328 }
328 raw_spin_unlock_irqrestore(&tick_device_lock, flags); 329 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
@@ -416,4 +417,5 @@ static struct notifier_block tick_notifier = {
416void __init tick_init(void) 417void __init tick_init(void)
417{ 418{
418 clockevents_register_notifier(&tick_notifier); 419 clockevents_register_notifier(&tick_notifier);
420 tick_broadcast_init();
419} 421}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc0..f0299eae4602 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7extern seqlock_t jiffies_lock;
8
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
8 10
9#define TICK_DO_TIMER_NONE -1 11#define TICK_DO_TIMER_NONE -1
@@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
94extern void tick_shutdown_broadcast(unsigned int *cpup); 96extern void tick_shutdown_broadcast(unsigned int *cpup);
95extern void tick_suspend_broadcast(void); 97extern void tick_suspend_broadcast(void);
96extern int tick_resume_broadcast(void); 98extern int tick_resume_broadcast(void);
97 99extern void tick_broadcast_init(void);
98extern void 100extern void
99tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); 101tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
100 102
@@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
119static inline void tick_shutdown_broadcast(unsigned int *cpup) { } 121static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
120static inline void tick_suspend_broadcast(void) { } 122static inline void tick_suspend_broadcast(void) { }
121static inline int tick_resume_broadcast(void) { return 0; } 123static inline int tick_resume_broadcast(void) { return 0; }
124static inline void tick_broadcast_init(void) { }
122 125
123/* 126/*
124 * Set the periodic handler in non broadcast mode 127 * Set the periodic handler in non broadcast mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a19a39952c1b..225f8bf19095 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -482,8 +482,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
482 482
483 if (ratelimit < 10 && 483 if (ratelimit < 10 &&
484 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { 484 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
485 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 485 pr_warn("NOHZ: local_softirq_pending %02x\n",
486 (unsigned int) local_softirq_pending()); 486 (unsigned int) local_softirq_pending());
487 ratelimit++; 487 ratelimit++;
488 } 488 }
489 return false; 489 return false;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9a0bc98fbe1d..98cd470bbe49 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -23,8 +23,13 @@
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h> 24#include <linux/pvclock_gtod.h>
25 25
26#include "tick-internal.h"
27#include "ntp_internal.h"
26 28
27static struct timekeeper timekeeper; 29static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock);
31static seqcount_t timekeeper_seq;
32static struct timekeeper shadow_timekeeper;
28 33
29/* flag for if timekeeping is suspended */ 34/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 35int __read_mostly timekeeping_suspended;
@@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
67 tk->wall_to_monotonic = wtm; 72 tk->wall_to_monotonic = wtm;
68 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 73 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
69 tk->offs_real = timespec_to_ktime(tmp); 74 tk->offs_real = timespec_to_ktime(tmp);
75 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
70} 76}
71 77
72static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 78static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
96 102
97 old_clock = tk->clock; 103 old_clock = tk->clock;
98 tk->clock = clock; 104 tk->clock = clock;
99 clock->cycle_last = clock->read(clock); 105 tk->cycle_last = clock->cycle_last = clock->read(clock);
100 106
101 /* Do the ns -> cycle conversion first, using original mult */ 107 /* Do the ns -> cycle conversion first, using original mult */
102 tmp = NTP_INTERVAL_LENGTH; 108 tmp = NTP_INTERVAL_LENGTH;
@@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk)
201 207
202/** 208/**
203 * pvclock_gtod_register_notifier - register a pvclock timedata update listener 209 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
204 *
205 * Must hold write on timekeeper.lock
206 */ 210 */
207int pvclock_gtod_register_notifier(struct notifier_block *nb) 211int pvclock_gtod_register_notifier(struct notifier_block *nb)
208{ 212{
@@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
210 unsigned long flags; 214 unsigned long flags;
211 int ret; 215 int ret;
212 216
213 write_seqlock_irqsave(&tk->lock, flags); 217 raw_spin_lock_irqsave(&timekeeper_lock, flags);
214 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
215 /* update timekeeping data */
216 update_pvclock_gtod(tk); 219 update_pvclock_gtod(tk);
217 write_sequnlock_irqrestore(&tk->lock, flags); 220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
218 221
219 return ret; 222 return ret;
220} 223}
@@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
223/** 226/**
224 * pvclock_gtod_unregister_notifier - unregister a pvclock 227 * pvclock_gtod_unregister_notifier - unregister a pvclock
225 * timedata update listener 228 * timedata update listener
226 *
227 * Must hold write on timekeeper.lock
228 */ 229 */
229int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 230int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
230{ 231{
231 struct timekeeper *tk = &timekeeper;
232 unsigned long flags; 232 unsigned long flags;
233 int ret; 233 int ret;
234 234
235 write_seqlock_irqsave(&tk->lock, flags); 235 raw_spin_lock_irqsave(&timekeeper_lock, flags);
236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
237 write_sequnlock_irqrestore(&tk->lock, flags); 237 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
238 238
239 return ret; 239 return ret;
240} 240}
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 242
243/* must hold write on timekeeper.lock */ 243/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp) 244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)
245{ 245{
246 if (clearntp) { 246 if (clearntp) {
247 tk->ntp_error = 0; 247 tk->ntp_error = 0;
@@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
249 } 249 }
250 update_vsyscall(tk); 250 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 251 update_pvclock_gtod(tk);
252
253 if (mirror)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
252} 255}
253 256
254/** 257/**
@@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
267 clock = tk->clock; 270 clock = tk->clock;
268 cycle_now = clock->read(clock); 271 cycle_now = clock->read(clock);
269 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 272 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
270 clock->cycle_last = cycle_now; 273 tk->cycle_last = clock->cycle_last = cycle_now;
271 274
272 tk->xtime_nsec += cycle_delta * tk->mult; 275 tk->xtime_nsec += cycle_delta * tk->mult;
273 276
@@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts)
294 s64 nsecs = 0; 297 s64 nsecs = 0;
295 298
296 do { 299 do {
297 seq = read_seqbegin(&tk->lock); 300 seq = read_seqcount_begin(&timekeeper_seq);
298 301
299 ts->tv_sec = tk->xtime_sec; 302 ts->tv_sec = tk->xtime_sec;
300 nsecs = timekeeping_get_ns(tk); 303 nsecs = timekeeping_get_ns(tk);
301 304
302 } while (read_seqretry(&tk->lock, seq)); 305 } while (read_seqcount_retry(&timekeeper_seq, seq));
303 306
304 ts->tv_nsec = 0; 307 ts->tv_nsec = 0;
305 timespec_add_ns(ts, nsecs); 308 timespec_add_ns(ts, nsecs);
@@ -335,11 +338,11 @@ ktime_t ktime_get(void)
335 WARN_ON(timekeeping_suspended); 338 WARN_ON(timekeeping_suspended);
336 339
337 do { 340 do {
338 seq = read_seqbegin(&tk->lock); 341 seq = read_seqcount_begin(&timekeeper_seq);
339 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 342 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
340 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 343 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
341 344
342 } while (read_seqretry(&tk->lock, seq)); 345 } while (read_seqcount_retry(&timekeeper_seq, seq));
343 /* 346 /*
344 * Use ktime_set/ktime_add_ns to create a proper ktime on 347 * Use ktime_set/ktime_add_ns to create a proper ktime on
345 * 32-bit architectures without CONFIG_KTIME_SCALAR. 348 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts)
366 WARN_ON(timekeeping_suspended); 369 WARN_ON(timekeeping_suspended);
367 370
368 do { 371 do {
369 seq = read_seqbegin(&tk->lock); 372 seq = read_seqcount_begin(&timekeeper_seq);
370 ts->tv_sec = tk->xtime_sec; 373 ts->tv_sec = tk->xtime_sec;
371 nsec = timekeeping_get_ns(tk); 374 nsec = timekeeping_get_ns(tk);
372 tomono = tk->wall_to_monotonic; 375 tomono = tk->wall_to_monotonic;
373 376
374 } while (read_seqretry(&tk->lock, seq)); 377 } while (read_seqcount_retry(&timekeeper_seq, seq));
375 378
376 ts->tv_sec += tomono.tv_sec; 379 ts->tv_sec += tomono.tv_sec;
377 ts->tv_nsec = 0; 380 ts->tv_nsec = 0;
@@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts)
379} 382}
380EXPORT_SYMBOL_GPL(ktime_get_ts); 383EXPORT_SYMBOL_GPL(ktime_get_ts);
381 384
385
386/**
387 * timekeeping_clocktai - Returns the TAI time of day in a timespec
388 * @ts: pointer to the timespec to be set
389 *
390 * Returns the time of day in a timespec.
391 */
392void timekeeping_clocktai(struct timespec *ts)
393{
394 struct timekeeper *tk = &timekeeper;
395 unsigned long seq;
396 u64 nsecs;
397
398 WARN_ON(timekeeping_suspended);
399
400 do {
401 seq = read_seqcount_begin(&timekeeper_seq);
402
403 ts->tv_sec = tk->xtime_sec + tk->tai_offset;
404 nsecs = timekeeping_get_ns(tk);
405
406 } while (read_seqcount_retry(&timekeeper_seq, seq));
407
408 ts->tv_nsec = 0;
409 timespec_add_ns(ts, nsecs);
410
411}
412EXPORT_SYMBOL(timekeeping_clocktai);
413
414
415/**
416 * ktime_get_clocktai - Returns the TAI time of day in a ktime
417 *
418 * Returns the time of day in a ktime.
419 */
420ktime_t ktime_get_clocktai(void)
421{
422 struct timespec ts;
423
424 timekeeping_clocktai(&ts);
425 return timespec_to_ktime(ts);
426}
427EXPORT_SYMBOL(ktime_get_clocktai);
428
382#ifdef CONFIG_NTP_PPS 429#ifdef CONFIG_NTP_PPS
383 430
384/** 431/**
@@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
399 WARN_ON_ONCE(timekeeping_suspended); 446 WARN_ON_ONCE(timekeeping_suspended);
400 447
401 do { 448 do {
402 seq = read_seqbegin(&tk->lock); 449 seq = read_seqcount_begin(&timekeeper_seq);
403 450
404 *ts_raw = tk->raw_time; 451 *ts_raw = tk->raw_time;
405 ts_real->tv_sec = tk->xtime_sec; 452 ts_real->tv_sec = tk->xtime_sec;
@@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
408 nsecs_raw = timekeeping_get_ns_raw(tk); 455 nsecs_raw = timekeeping_get_ns_raw(tk);
409 nsecs_real = timekeeping_get_ns(tk); 456 nsecs_real = timekeeping_get_ns(tk);
410 457
411 } while (read_seqretry(&tk->lock, seq)); 458 } while (read_seqcount_retry(&timekeeper_seq, seq));
412 459
413 timespec_add_ns(ts_raw, nsecs_raw); 460 timespec_add_ns(ts_raw, nsecs_raw);
414 timespec_add_ns(ts_real, nsecs_real); 461 timespec_add_ns(ts_real, nsecs_real);
@@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv)
448 if (!timespec_valid_strict(tv)) 495 if (!timespec_valid_strict(tv))
449 return -EINVAL; 496 return -EINVAL;
450 497
451 write_seqlock_irqsave(&tk->lock, flags); 498 raw_spin_lock_irqsave(&timekeeper_lock, flags);
499 write_seqcount_begin(&timekeeper_seq);
452 500
453 timekeeping_forward_now(tk); 501 timekeeping_forward_now(tk);
454 502
@@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv)
460 508
461 tk_set_xtime(tk, tv); 509 tk_set_xtime(tk, tv);
462 510
463 timekeeping_update(tk, true); 511 timekeeping_update(tk, true, true);
464 512
465 write_sequnlock_irqrestore(&tk->lock, flags); 513 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
466 515
467 /* signal hrtimers about time change */ 516 /* signal hrtimers about time change */
468 clock_was_set(); 517 clock_was_set();
@@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts)
487 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 536 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
488 return -EINVAL; 537 return -EINVAL;
489 538
490 write_seqlock_irqsave(&tk->lock, flags); 539 raw_spin_lock_irqsave(&timekeeper_lock, flags);
540 write_seqcount_begin(&timekeeper_seq);
491 541
492 timekeeping_forward_now(tk); 542 timekeeping_forward_now(tk);
493 543
@@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts)
502 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
503 553
504error: /* even if we error out, we forwarded the time, so call update */ 554error: /* even if we error out, we forwarded the time, so call update */
505 timekeeping_update(tk, true); 555 timekeeping_update(tk, true, true);
506 556
507 write_sequnlock_irqrestore(&tk->lock, flags); 557 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
508 559
509 /* signal hrtimers about time change */ 560 /* signal hrtimers about time change */
510 clock_was_set(); 561 clock_was_set();
@@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */
513} 564}
514EXPORT_SYMBOL(timekeeping_inject_offset); 565EXPORT_SYMBOL(timekeeping_inject_offset);
515 566
567
568/**
569 * timekeeping_get_tai_offset - Returns current TAI offset from UTC
570 *
571 */
572s32 timekeeping_get_tai_offset(void)
573{
574 struct timekeeper *tk = &timekeeper;
575 unsigned int seq;
576 s32 ret;
577
578 do {
579 seq = read_seqcount_begin(&timekeeper_seq);
580 ret = tk->tai_offset;
581 } while (read_seqcount_retry(&timekeeper_seq, seq));
582
583 return ret;
584}
585
586/**
587 * __timekeeping_set_tai_offset - Lock free worker function
588 *
589 */
590static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
591{
592 tk->tai_offset = tai_offset;
593 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
594}
595
596/**
597 * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
598 *
599 */
600void timekeeping_set_tai_offset(s32 tai_offset)
601{
602 struct timekeeper *tk = &timekeeper;
603 unsigned long flags;
604
605 raw_spin_lock_irqsave(&timekeeper_lock, flags);
606 write_seqcount_begin(&timekeeper_seq);
607 __timekeeping_set_tai_offset(tk, tai_offset);
608 write_seqcount_end(&timekeeper_seq);
609 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
610 clock_was_set();
611}
612
516/** 613/**
517 * change_clocksource - Swaps clocksources if a new one is available 614 * change_clocksource - Swaps clocksources if a new one is available
518 * 615 *
@@ -526,7 +623,8 @@ static int change_clocksource(void *data)
526 623
527 new = (struct clocksource *) data; 624 new = (struct clocksource *) data;
528 625
529 write_seqlock_irqsave(&tk->lock, flags); 626 raw_spin_lock_irqsave(&timekeeper_lock, flags);
627 write_seqcount_begin(&timekeeper_seq);
530 628
531 timekeeping_forward_now(tk); 629 timekeeping_forward_now(tk);
532 if (!new->enable || new->enable(new) == 0) { 630 if (!new->enable || new->enable(new) == 0) {
@@ -535,9 +633,10 @@ static int change_clocksource(void *data)
535 if (old->disable) 633 if (old->disable)
536 old->disable(old); 634 old->disable(old);
537 } 635 }
538 timekeeping_update(tk, true); 636 timekeeping_update(tk, true, true);
539 637
540 write_sequnlock_irqrestore(&tk->lock, flags); 638 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
541 640
542 return 0; 641 return 0;
543} 642}
@@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts)
587 s64 nsecs; 686 s64 nsecs;
588 687
589 do { 688 do {
590 seq = read_seqbegin(&tk->lock); 689 seq = read_seqcount_begin(&timekeeper_seq);
591 nsecs = timekeeping_get_ns_raw(tk); 690 nsecs = timekeeping_get_ns_raw(tk);
592 *ts = tk->raw_time; 691 *ts = tk->raw_time;
593 692
594 } while (read_seqretry(&tk->lock, seq)); 693 } while (read_seqcount_retry(&timekeeper_seq, seq));
595 694
596 timespec_add_ns(ts, nsecs); 695 timespec_add_ns(ts, nsecs);
597} 696}
@@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void)
607 int ret; 706 int ret;
608 707
609 do { 708 do {
610 seq = read_seqbegin(&tk->lock); 709 seq = read_seqcount_begin(&timekeeper_seq);
611 710
612 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 711 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
613 712
614 } while (read_seqretry(&tk->lock, seq)); 713 } while (read_seqcount_retry(&timekeeper_seq, seq));
615 714
616 return ret; 715 return ret;
617} 716}
@@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void)
626 u64 ret; 725 u64 ret;
627 726
628 do { 727 do {
629 seq = read_seqbegin(&tk->lock); 728 seq = read_seqcount_begin(&timekeeper_seq);
630 729
631 ret = tk->clock->max_idle_ns; 730 ret = tk->clock->max_idle_ns;
632 731
633 } while (read_seqretry(&tk->lock, seq)); 732 } while (read_seqcount_retry(&timekeeper_seq, seq));
634 733
635 return ret; 734 return ret;
636} 735}
@@ -693,11 +792,10 @@ void __init timekeeping_init(void)
693 boot.tv_nsec = 0; 792 boot.tv_nsec = 0;
694 } 793 }
695 794
696 seqlock_init(&tk->lock); 795 raw_spin_lock_irqsave(&timekeeper_lock, flags);
697 796 write_seqcount_begin(&timekeeper_seq);
698 ntp_init(); 797 ntp_init();
699 798
700 write_seqlock_irqsave(&tk->lock, flags);
701 clock = clocksource_default_clock(); 799 clock = clocksource_default_clock();
702 if (clock->enable) 800 if (clock->enable)
703 clock->enable(clock); 801 clock->enable(clock);
@@ -716,7 +814,10 @@ void __init timekeeping_init(void)
716 tmp.tv_nsec = 0; 814 tmp.tv_nsec = 0;
717 tk_set_sleep_time(tk, tmp); 815 tk_set_sleep_time(tk, tmp);
718 816
719 write_sequnlock_irqrestore(&tk->lock, flags); 817 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
818
819 write_seqcount_end(&timekeeper_seq);
820 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
720} 821}
721 822
722/* time in seconds when suspend began */ 823/* time in seconds when suspend began */
@@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
764 if (has_persistent_clock()) 865 if (has_persistent_clock())
765 return; 866 return;
766 867
767 write_seqlock_irqsave(&tk->lock, flags); 868 raw_spin_lock_irqsave(&timekeeper_lock, flags);
869 write_seqcount_begin(&timekeeper_seq);
768 870
769 timekeeping_forward_now(tk); 871 timekeeping_forward_now(tk);
770 872
771 __timekeeping_inject_sleeptime(tk, delta); 873 __timekeeping_inject_sleeptime(tk, delta);
772 874
773 timekeeping_update(tk, true); 875 timekeeping_update(tk, true, true);
774 876
775 write_sequnlock_irqrestore(&tk->lock, flags); 877 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
776 879
777 /* signal hrtimers about time change */ 880 /* signal hrtimers about time change */
778 clock_was_set(); 881 clock_was_set();
@@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
788static void timekeeping_resume(void) 891static void timekeeping_resume(void)
789{ 892{
790 struct timekeeper *tk = &timekeeper; 893 struct timekeeper *tk = &timekeeper;
894 struct clocksource *clock = tk->clock;
791 unsigned long flags; 895 unsigned long flags;
792 struct timespec ts; 896 struct timespec ts_new, ts_delta;
897 cycle_t cycle_now, cycle_delta;
898 bool suspendtime_found = false;
793 899
794 read_persistent_clock(&ts); 900 read_persistent_clock(&ts_new);
795 901
796 clockevents_resume(); 902 clockevents_resume();
797 clocksource_resume(); 903 clocksource_resume();
798 904
799 write_seqlock_irqsave(&tk->lock, flags); 905 raw_spin_lock_irqsave(&timekeeper_lock, flags);
906 write_seqcount_begin(&timekeeper_seq);
907
908 /*
909 * After system resumes, we need to calculate the suspended time and
910 * compensate it for the OS time. There are 3 sources that could be
911 * used: Nonstop clocksource during suspend, persistent clock and rtc
912 * device.
913 *
914 * One specific platform may have 1 or 2 or all of them, and the
915 * preference will be:
916 * suspend-nonstop clocksource -> persistent clock -> rtc
917 * The less preferred source will only be tried if there is no better
918 * usable source. The rtc part is handled separately in rtc core code.
919 */
920 cycle_now = clock->read(clock);
921 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
922 cycle_now > clock->cycle_last) {
923 u64 num, max = ULLONG_MAX;
924 u32 mult = clock->mult;
925 u32 shift = clock->shift;
926 s64 nsec = 0;
927
928 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
800 929
801 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 930 /*
802 ts = timespec_sub(ts, timekeeping_suspend_time); 931 * "cycle_delta * mult" may cause a 64-bit overflow if the
 803 __timekeeping_inject_sleeptime(tk, &ts); 932 * suspended time is too long. In that case we need to do the
 933 * 64-bit math carefully
934 */
935 do_div(max, mult);
936 if (cycle_delta > max) {
937 num = div64_u64(cycle_delta, max);
938 nsec = (((u64) max * mult) >> shift) * num;
939 cycle_delta -= num * max;
940 }
941 nsec += ((u64) cycle_delta * mult) >> shift;
942
943 ts_delta = ns_to_timespec(nsec);
944 suspendtime_found = true;
945 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
946 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
947 suspendtime_found = true;
804 } 948 }
805 /* re-base the last cycle value */ 949
806 tk->clock->cycle_last = tk->clock->read(tk->clock); 950 if (suspendtime_found)
951 __timekeeping_inject_sleeptime(tk, &ts_delta);
952
953 /* Re-base the last cycle value */
954 tk->cycle_last = clock->cycle_last = cycle_now;
807 tk->ntp_error = 0; 955 tk->ntp_error = 0;
808 timekeeping_suspended = 0; 956 timekeeping_suspended = 0;
809 timekeeping_update(tk, false); 957 timekeeping_update(tk, false, true);
810 write_sequnlock_irqrestore(&tk->lock, flags); 958 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
811 960
812 touch_softlockup_watchdog(); 961 touch_softlockup_watchdog();
813 962
@@ -826,7 +975,8 @@ static int timekeeping_suspend(void)
826 975
827 read_persistent_clock(&timekeeping_suspend_time); 976 read_persistent_clock(&timekeeping_suspend_time);
828 977
829 write_seqlock_irqsave(&tk->lock, flags); 978 raw_spin_lock_irqsave(&timekeeper_lock, flags);
979 write_seqcount_begin(&timekeeper_seq);
830 timekeeping_forward_now(tk); 980 timekeeping_forward_now(tk);
831 timekeeping_suspended = 1; 981 timekeeping_suspended = 1;
832 982
@@ -849,7 +999,8 @@ static int timekeeping_suspend(void)
849 timekeeping_suspend_time = 999 timekeeping_suspend_time =
850 timespec_add(timekeeping_suspend_time, delta_delta); 1000 timespec_add(timekeeping_suspend_time, delta_delta);
851 } 1001 }
852 write_sequnlock_irqrestore(&tk->lock, flags); 1002 write_seqcount_end(&timekeeper_seq);
1003 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
853 1004
854 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1005 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
855 clocksource_suspend(); 1006 clocksource_suspend();
@@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1099 tk_set_wall_to_mono(tk, 1250 tk_set_wall_to_mono(tk,
1100 timespec_sub(tk->wall_to_monotonic, ts)); 1251 timespec_sub(tk->wall_to_monotonic, ts));
1101 1252
1253 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1254
1102 clock_was_set_delayed(); 1255 clock_was_set_delayed();
1103 } 1256 }
1104 } 1257 }
@@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1116static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1269static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1117 u32 shift) 1270 u32 shift)
1118{ 1271{
1272 cycle_t interval = tk->cycle_interval << shift;
1119 u64 raw_nsecs; 1273 u64 raw_nsecs;
1120 1274
1121 /* If the offset is smaller than a shifted interval, do nothing */ 1275 /* If the offset is smaller than a shifted interval, do nothing */
1122 if (offset < tk->cycle_interval<<shift) 1276 if (offset < interval)
1123 return offset; 1277 return offset;
1124 1278
1125 /* Accumulate one shifted interval */ 1279 /* Accumulate one shifted interval */
1126 offset -= tk->cycle_interval << shift; 1280 offset -= interval;
1127 tk->clock->cycle_last += tk->cycle_interval << shift; 1281 tk->cycle_last += interval;
1128 1282
1129 tk->xtime_nsec += tk->xtime_interval << shift; 1283 tk->xtime_nsec += tk->xtime_interval << shift;
1130 accumulate_nsecs_to_secs(tk); 1284 accumulate_nsecs_to_secs(tk);
@@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1181static void update_wall_time(void) 1335static void update_wall_time(void)
1182{ 1336{
1183 struct clocksource *clock; 1337 struct clocksource *clock;
1184 struct timekeeper *tk = &timekeeper; 1338 struct timekeeper *real_tk = &timekeeper;
1339 struct timekeeper *tk = &shadow_timekeeper;
1185 cycle_t offset; 1340 cycle_t offset;
1186 int shift = 0, maxshift; 1341 int shift = 0, maxshift;
1187 unsigned long flags; 1342 unsigned long flags;
1188 1343
1189 write_seqlock_irqsave(&tk->lock, flags); 1344 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1190 1345
1191 /* Make sure we're fully resumed: */ 1346 /* Make sure we're fully resumed: */
1192 if (unlikely(timekeeping_suspended)) 1347 if (unlikely(timekeeping_suspended))
1193 goto out; 1348 goto out;
1194 1349
1195 clock = tk->clock; 1350 clock = real_tk->clock;
1196 1351
1197#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1352#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1198 offset = tk->cycle_interval; 1353 offset = real_tk->cycle_interval;
1199#else 1354#else
1200 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1355 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1201#endif 1356#endif
1202 1357
1203 /* Check if there's really nothing to do */ 1358 /* Check if there's really nothing to do */
1204 if (offset < tk->cycle_interval) 1359 if (offset < real_tk->cycle_interval)
1205 goto out; 1360 goto out;
1206 1361
1207 /* 1362 /*
@@ -1238,11 +1393,24 @@ static void update_wall_time(void)
1238 */ 1393 */
1239 accumulate_nsecs_to_secs(tk); 1394 accumulate_nsecs_to_secs(tk);
1240 1395
1241 timekeeping_update(tk, false); 1396 write_seqcount_begin(&timekeeper_seq);
1242 1397 /* Update clock->cycle_last with the new value */
1398 clock->cycle_last = tk->cycle_last;
1399 /*
1400 * Update the real timekeeper.
1401 *
1402 * We could avoid this memcpy by switching pointers, but that
1403 * requires changes to all other timekeeper usage sites as
1404 * well, i.e. move the timekeeper pointer getter into the
1405 * spinlocked/seqcount protected sections. And we trade this
1406 * memcpy under the timekeeper_seq against one before we start
1407 * updating.
1408 */
1409 memcpy(real_tk, tk, sizeof(*tk));
1410 timekeeping_update(real_tk, false, false);
1411 write_seqcount_end(&timekeeper_seq);
1243out: 1412out:
1244 write_sequnlock_irqrestore(&tk->lock, flags); 1413 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1245
1246} 1414}
1247 1415
1248/** 1416/**
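update_wall_time() above now does all of its accumulation on a shadow copy and only publishes the result to the live timekeeper inside the short seqcount write section. A rough userspace analogue of that prepare-outside, publish-briefly pattern is sketched below; a pthread rwlock stands in for the kernel's timekeeper_seq, which is an assumed simplification, not the patch's actual mechanism.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for struct timekeeper: only the fields this sketch needs. */
struct tk_state {
	long long xtime_sec;
	long long xtime_nsec;
};

static struct tk_state live_tk;     /* what readers see                  */
static struct tk_state shadow_tk;   /* updated outside the critical path */
static pthread_rwlock_t tk_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * Writer: do the expensive accumulation on the shadow copy, then hold the
 * lock only for the memcpy that makes the new state visible to readers.
 */
static void publish_update(long long nsec_delta)
{
	shadow_tk.xtime_nsec += nsec_delta;
	while (shadow_tk.xtime_nsec >= 1000000000LL) {
		shadow_tk.xtime_nsec -= 1000000000LL;
		shadow_tk.xtime_sec++;
	}

	pthread_rwlock_wrlock(&tk_lock);
	memcpy(&live_tk, &shadow_tk, sizeof(live_tk));
	pthread_rwlock_unlock(&tk_lock);
}

/* Reader: take a consistent snapshot of the published state. */
static struct tk_state read_time(void)
{
	struct tk_state snap;

	pthread_rwlock_rdlock(&tk_lock);
	snap = live_tk;
	pthread_rwlock_unlock(&tk_lock);
	return snap;
}

int main(void)
{
	publish_update(1500000000LL);   /* pretend 1.5 s elapsed */
	struct tk_state now = read_time();
	printf("%lld.%09lld\n", now.xtime_sec, now.xtime_nsec);
	return 0;
}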
@@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts)
1289 WARN_ON(timekeeping_suspended); 1457 WARN_ON(timekeeping_suspended);
1290 1458
1291 do { 1459 do {
1292 seq = read_seqbegin(&tk->lock); 1460 seq = read_seqcount_begin(&timekeeper_seq);
1293 ts->tv_sec = tk->xtime_sec; 1461 ts->tv_sec = tk->xtime_sec;
1294 nsec = timekeeping_get_ns(tk); 1462 nsec = timekeeping_get_ns(tk);
1295 tomono = tk->wall_to_monotonic; 1463 tomono = tk->wall_to_monotonic;
1296 sleep = tk->total_sleep_time; 1464 sleep = tk->total_sleep_time;
1297 1465
1298 } while (read_seqretry(&tk->lock, seq)); 1466 } while (read_seqcount_retry(&timekeeper_seq, seq));
1299 1467
1300 ts->tv_sec += tomono.tv_sec + sleep.tv_sec; 1468 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1301 ts->tv_nsec = 0; 1469 ts->tv_nsec = 0;
@@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void)
1354 unsigned long seq; 1522 unsigned long seq;
1355 1523
1356 do { 1524 do {
1357 seq = read_seqbegin(&tk->lock); 1525 seq = read_seqcount_begin(&timekeeper_seq);
1358 1526
1359 now = tk_xtime(tk); 1527 now = tk_xtime(tk);
1360 } while (read_seqretry(&tk->lock, seq)); 1528 } while (read_seqcount_retry(&timekeeper_seq, seq));
1361 1529
1362 return now; 1530 return now;
1363} 1531}
@@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void)
1370 unsigned long seq; 1538 unsigned long seq;
1371 1539
1372 do { 1540 do {
1373 seq = read_seqbegin(&tk->lock); 1541 seq = read_seqcount_begin(&timekeeper_seq);
1374 1542
1375 now = tk_xtime(tk); 1543 now = tk_xtime(tk);
1376 mono = tk->wall_to_monotonic; 1544 mono = tk->wall_to_monotonic;
1377 } while (read_seqretry(&tk->lock, seq)); 1545 } while (read_seqcount_retry(&timekeeper_seq, seq));
1378 1546
1379 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1547 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1380 now.tv_nsec + mono.tv_nsec); 1548 now.tv_nsec + mono.tv_nsec);
@@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1405 unsigned long seq; 1573 unsigned long seq;
1406 1574
1407 do { 1575 do {
1408 seq = read_seqbegin(&tk->lock); 1576 seq = read_seqcount_begin(&timekeeper_seq);
1409 *xtim = tk_xtime(tk); 1577 *xtim = tk_xtime(tk);
1410 *wtom = tk->wall_to_monotonic; 1578 *wtom = tk->wall_to_monotonic;
1411 *sleep = tk->total_sleep_time; 1579 *sleep = tk->total_sleep_time;
1412 } while (read_seqretry(&tk->lock, seq)); 1580 } while (read_seqcount_retry(&timekeeper_seq, seq));
1413} 1581}
1414 1582
1415#ifdef CONFIG_HIGH_RES_TIMERS 1583#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1421 * Returns current monotonic time and updates the offsets 1589 * Returns current monotonic time and updates the offsets
1422 * Called from hrtimer_interupt() or retrigger_next_event() 1590 * Called from hrtimer_interupt() or retrigger_next_event()
1423 */ 1591 */
1424ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1592ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1593 ktime_t *offs_tai)
1425{ 1594{
1426 struct timekeeper *tk = &timekeeper; 1595 struct timekeeper *tk = &timekeeper;
1427 ktime_t now; 1596 ktime_t now;
@@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1429 u64 secs, nsecs; 1598 u64 secs, nsecs;
1430 1599
1431 do { 1600 do {
1432 seq = read_seqbegin(&tk->lock); 1601 seq = read_seqcount_begin(&timekeeper_seq);
1433 1602
1434 secs = tk->xtime_sec; 1603 secs = tk->xtime_sec;
1435 nsecs = timekeeping_get_ns(tk); 1604 nsecs = timekeeping_get_ns(tk);
1436 1605
1437 *offs_real = tk->offs_real; 1606 *offs_real = tk->offs_real;
1438 *offs_boot = tk->offs_boot; 1607 *offs_boot = tk->offs_boot;
1439 } while (read_seqretry(&tk->lock, seq)); 1608 *offs_tai = tk->offs_tai;
1609 } while (read_seqcount_retry(&timekeeper_seq, seq));
1440 1610
1441 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1611 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1442 now = ktime_sub(now, *offs_real); 1612 now = ktime_sub(now, *offs_real);
@@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void)
1454 struct timespec wtom; 1624 struct timespec wtom;
1455 1625
1456 do { 1626 do {
1457 seq = read_seqbegin(&tk->lock); 1627 seq = read_seqcount_begin(&timekeeper_seq);
1458 wtom = tk->wall_to_monotonic; 1628 wtom = tk->wall_to_monotonic;
1459 } while (read_seqretry(&tk->lock, seq)); 1629 } while (read_seqcount_retry(&timekeeper_seq, seq));
1460 1630
1461 return timespec_to_ktime(wtom); 1631 return timespec_to_ktime(wtom);
1462} 1632}
1463EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1633EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1464 1634
1465/** 1635/**
1636 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1637 */
1638int do_adjtimex(struct timex *txc)
1639{
1640 struct timekeeper *tk = &timekeeper;
1641 unsigned long flags;
1642 struct timespec ts;
1643 s32 orig_tai, tai;
1644 int ret;
1645
1646 /* Validate the data before disabling interrupts */
1647 ret = ntp_validate_timex(txc);
1648 if (ret)
1649 return ret;
1650
1651 if (txc->modes & ADJ_SETOFFSET) {
1652 struct timespec delta;
1653 delta.tv_sec = txc->time.tv_sec;
1654 delta.tv_nsec = txc->time.tv_usec;
1655 if (!(txc->modes & ADJ_NANO))
1656 delta.tv_nsec *= 1000;
1657 ret = timekeeping_inject_offset(&delta);
1658 if (ret)
1659 return ret;
1660 }
1661
1662 getnstimeofday(&ts);
1663
1664 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1665 write_seqcount_begin(&timekeeper_seq);
1666
1667 orig_tai = tai = tk->tai_offset;
1668 ret = __do_adjtimex(txc, &ts, &tai);
1669
1670 if (tai != orig_tai) {
1671 __timekeeping_set_tai_offset(tk, tai);
1672 clock_was_set_delayed();
1673 }
1674 write_seqcount_end(&timekeeper_seq);
1675 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1676
1677 return ret;
1678}
1679
1680#ifdef CONFIG_NTP_PPS
1681/**
1682 * hardpps() - Accessor function to NTP __hardpps function
1683 */
1684void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1685{
1686 unsigned long flags;
1687
1688 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1689 write_seqcount_begin(&timekeeper_seq);
1690
1691 __hardpps(phase_ts, raw_ts);
1692
1693 write_seqcount_end(&timekeeper_seq);
1694 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1695}
1696EXPORT_SYMBOL(hardpps);
1697#endif
1698
1699/**
1466 * xtime_update() - advances the timekeeping infrastructure 1700 * xtime_update() - advances the timekeeping infrastructure
1467 * @ticks: number of ticks, that have elapsed since the last call. 1701 * @ticks: number of ticks, that have elapsed since the last call.
1468 * 1702 *
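The new do_adjtimex() wrapper above routes adjtimex(2) through the timekeeping core so the TAI offset can be read and, with CAP_SYS_TIME, updated. A small read-only userspace sketch that queries it; the reported offset is whatever the running kernel has been told, which may be zero if nothing ever set it.

#include <stdio.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx = { .modes = 0 };   /* modes == 0: read-only query */
	int state = adjtimex(&tx);

	if (state < 0) {
		perror("adjtimex");
		return 1;
	}
	printf("clock state: %d, TAI-UTC offset: %d s\n", state, tx.tai);
	return 0;
}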
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164b..3bdf28323012 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23
24struct timer_list_iter {
25 int cpu;
26 bool second_pass;
27 u64 now;
28};
29
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); 30typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24 31
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 32DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
133 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 140 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
134 int i; 141 int i;
135 142
136 SEQ_printf(m, "\n");
137 SEQ_printf(m, "cpu: %d\n", cpu); 143 SEQ_printf(m, "cpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 144 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i); 145 SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
187 193
188#undef P 194#undef P
189#undef P_ns 195#undef P_ns
196 SEQ_printf(m, "\n");
190} 197}
191 198
192#ifdef CONFIG_GENERIC_CLOCKEVENTS 199#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
195{ 202{
196 struct clock_event_device *dev = td->evtdev; 203 struct clock_event_device *dev = td->evtdev;
197 204
198 SEQ_printf(m, "\n");
199 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 205 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
200 if (cpu < 0) 206 if (cpu < 0)
201 SEQ_printf(m, "Broadcast device\n"); 207 SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
230 print_name_offset(m, dev->event_handler); 236 print_name_offset(m, dev->event_handler);
231 SEQ_printf(m, "\n"); 237 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries); 238 SEQ_printf(m, " retries: %lu\n", dev->retries);
239 SEQ_printf(m, "\n");
233} 240}
234 241
235static void timer_list_show_tickdevices(struct seq_file *m) 242static void timer_list_show_tickdevices_header(struct seq_file *m)
236{ 243{
237 int cpu;
238
239#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 244#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
240 print_tickdevice(m, tick_get_broadcast_device(), -1); 245 print_tickdevice(m, tick_get_broadcast_device(), -1);
241 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 246 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m)
246#endif 251#endif
247 SEQ_printf(m, "\n"); 252 SEQ_printf(m, "\n");
248#endif 253#endif
249 for_each_online_cpu(cpu)
250 print_tickdevice(m, tick_get_device(cpu), cpu);
251 SEQ_printf(m, "\n");
252} 254}
253#else
254static void timer_list_show_tickdevices(struct seq_file *m) { }
255#endif 255#endif
256 256
257static inline void timer_list_header(struct seq_file *m, u64 now)
258{
259 SEQ_printf(m, "Timer List Version: v0.7\n");
260 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
261 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
262 SEQ_printf(m, "\n");
263}
264
257static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
258{ 266{
267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269
270 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now);
272 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS
275 else if (iter->cpu == -1 && iter->second_pass)
276 timer_list_show_tickdevices_header(m);
277 else
278 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
279#endif
280 return 0;
281}
282
283void sysrq_timer_list_show(void)
284{
259 u64 now = ktime_to_ns(ktime_get()); 285 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 286 int cpu;
261 287
262 SEQ_printf(m, "Timer List Version: v0.7\n"); 288 timer_list_header(NULL, now);
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 289
266 for_each_online_cpu(cpu) 290 for_each_online_cpu(cpu)
267 print_cpu(m, cpu, now); 291 print_cpu(NULL, cpu, now);
268 292
269 SEQ_printf(m, "\n"); 293#ifdef CONFIG_GENERIC_CLOCKEVENTS
270 timer_list_show_tickdevices(m); 294 timer_list_show_tickdevices_header(NULL);
295 for_each_online_cpu(cpu)
296 print_tickdevice(NULL, tick_get_device(cpu), cpu);
297#endif
298 return;
299}
271 300
272 return 0; 301static void *timer_list_start(struct seq_file *file, loff_t *offset)
302{
303 struct timer_list_iter *iter = file->private;
304
305 if (!*offset) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) {
311 iter->cpu = -1;
312 iter->second_pass = true;
313 } else
314 return NULL;
315#else
316 return NULL;
317#endif
318 }
319 return iter;
273} 320}
274 321
275void sysrq_timer_list_show(void) 322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{
324 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset;
327 return timer_list_start(file, offset);
328}
329
330static void timer_list_stop(struct seq_file *seq, void *v)
276{ 331{
277 timer_list_show(NULL, NULL);
278} 332}
279 333
334static const struct seq_operations timer_list_sops = {
335 .start = timer_list_start,
336 .next = timer_list_next,
337 .stop = timer_list_stop,
338 .show = timer_list_show,
339};
340
280static int timer_list_open(struct inode *inode, struct file *filp) 341static int timer_list_open(struct inode *inode, struct file *filp)
281{ 342{
282 return single_open(filp, timer_list_show, NULL); 343 return seq_open_private(filp, &timer_list_sops,
344 sizeof(struct timer_list_iter));
283} 345}
284 346
285static const struct file_operations timer_list_fops = { 347static const struct file_operations timer_list_fops = {
286 .open = timer_list_open, 348 .open = timer_list_open,
287 .read = seq_read, 349 .read = seq_read,
288 .llseek = seq_lseek, 350 .llseek = seq_lseek,
289 .release = single_release, 351 .release = seq_release_private,
290}; 352};
291 353
292static int __init init_timer_list_procfs(void) 354static int __init init_timer_list_procfs(void)
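The timer_list conversion above replaces single_open() with a private iterator that the seq_file core walks in two passes: a header entry, one entry per online CPU, and then (when clockevents are configured) a second pass for the tick devices. The standalone sketch below emulates that start/next/show control flow with a fixed two-CPU array standing in for the online-CPU mask; it illustrates the iteration order, not the seq_file API itself.

#include <stdbool.h>
#include <stdio.h>

#define NR_FAKE_CPUS 2

struct iter {
	int cpu;            /* -1 means "emit this pass's header"          */
	bool second_pass;   /* false: per-CPU timers, true: tick devices   */
};

/* show(): print whatever the current iterator position represents. */
static void show(const struct iter *it)
{
	if (it->cpu == -1 && !it->second_pass)
		printf("Timer List Version: (header)\n");
	else if (!it->second_pass)
		printf("cpu: %d timers...\n", it->cpu);
	else if (it->cpu == -1)
		printf("Tick Device: (broadcast header)\n");
	else
		printf("Tick Device: cpu %d\n", it->cpu);
}

/* next(): advance to the next CPU, rolling over into the second pass. */
static bool next(struct iter *it)
{
	if (++it->cpu < NR_FAKE_CPUS)
		return true;
	if (!it->second_pass) {
		it->second_pass = true;
		it->cpu = -1;
		return true;
	}
	return false;   /* both passes finished */
}

int main(void)
{
	struct iter it = { .cpu = -1, .second_pass = false };

	do
		show(&it);
	while (next(&it));
	return 0;
}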
diff --git a/kernel/timer.c b/kernel/timer.c
index dbf7a78a1ef1..09bca8ce9771 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, basic process system calls 4 * Kernel internal timers
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h> 42#include <linux/sched/sysctl.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/compat.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/unistd.h> 47#include <asm/unistd.h>
@@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1395 1396
1396#endif 1397#endif
1397 1398
1398/**
1399 * sys_getpid - return the thread group id of the current process
1400 *
1401 * Note, despite the name, this returns the tgid not the pid. The tgid and
1402 * the pid are identical unless CLONE_THREAD was specified on clone() in
1403 * which case the tgid is the same in all threads of the same group.
1404 *
1405 * This is SMP safe as current->tgid does not change.
1406 */
1407SYSCALL_DEFINE0(getpid)
1408{
1409 return task_tgid_vnr(current);
1410}
1411
1412/*
1413 * Accessing ->real_parent is not SMP-safe, it could
1414 * change from under us. However, we can use a stale
1415 * value of ->real_parent under rcu_read_lock(), see
1416 * release_task()->call_rcu(delayed_put_task_struct).
1417 */
1418SYSCALL_DEFINE0(getppid)
1419{
1420 int pid;
1421
1422 rcu_read_lock();
1423 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1424 rcu_read_unlock();
1425
1426 return pid;
1427}
1428
1429SYSCALL_DEFINE0(getuid)
1430{
1431 /* Only we change this so SMP safe */
1432 return from_kuid_munged(current_user_ns(), current_uid());
1433}
1434
1435SYSCALL_DEFINE0(geteuid)
1436{
1437 /* Only we change this so SMP safe */
1438 return from_kuid_munged(current_user_ns(), current_euid());
1439}
1440
1441SYSCALL_DEFINE0(getgid)
1442{
1443 /* Only we change this so SMP safe */
1444 return from_kgid_munged(current_user_ns(), current_gid());
1445}
1446
1447SYSCALL_DEFINE0(getegid)
1448{
1449 /* Only we change this so SMP safe */
1450 return from_kgid_munged(current_user_ns(), current_egid());
1451}
1452
1453static void process_timeout(unsigned long __data) 1399static void process_timeout(unsigned long __data)
1454{ 1400{
1455 wake_up_process((struct task_struct *)__data); 1401 wake_up_process((struct task_struct *)__data);
@@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1557} 1503}
1558EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1504EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1559 1505
1560/* Thread ID - the internal kernel "pid" */
1561SYSCALL_DEFINE0(gettid)
1562{
1563 return task_pid_vnr(current);
1564}
1565
1566/**
1567 * do_sysinfo - fill in sysinfo struct
1568 * @info: pointer to buffer to fill
1569 */
1570int do_sysinfo(struct sysinfo *info)
1571{
1572 unsigned long mem_total, sav_total;
1573 unsigned int mem_unit, bitcount;
1574 struct timespec tp;
1575
1576 memset(info, 0, sizeof(struct sysinfo));
1577
1578 ktime_get_ts(&tp);
1579 monotonic_to_bootbased(&tp);
1580 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1581
1582 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1583
1584 info->procs = nr_threads;
1585
1586 si_meminfo(info);
1587 si_swapinfo(info);
1588
1589 /*
1590 * If the sum of all the available memory (i.e. ram + swap)
1591 * is less than can be stored in a 32 bit unsigned long then
1592 * we can be binary compatible with 2.2.x kernels. If not,
1593 * well, in that case 2.2.x was broken anyways...
1594 *
1595 * -Erik Andersen <andersee@debian.org>
1596 */
1597
1598 mem_total = info->totalram + info->totalswap;
1599 if (mem_total < info->totalram || mem_total < info->totalswap)
1600 goto out;
1601 bitcount = 0;
1602 mem_unit = info->mem_unit;
1603 while (mem_unit > 1) {
1604 bitcount++;
1605 mem_unit >>= 1;
1606 sav_total = mem_total;
1607 mem_total <<= 1;
1608 if (mem_total < sav_total)
1609 goto out;
1610 }
1611
1612 /*
1613 * If mem_total did not overflow, multiply all memory values by
1614 * info->mem_unit and set it to 1. This leaves things compatible
1615 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1616 * kernels...
1617 */
1618
1619 info->mem_unit = 1;
1620 info->totalram <<= bitcount;
1621 info->freeram <<= bitcount;
1622 info->sharedram <<= bitcount;
1623 info->bufferram <<= bitcount;
1624 info->totalswap <<= bitcount;
1625 info->freeswap <<= bitcount;
1626 info->totalhigh <<= bitcount;
1627 info->freehigh <<= bitcount;
1628
1629out:
1630 return 0;
1631}
1632
1633SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1634{
1635 struct sysinfo val;
1636
1637 do_sysinfo(&val);
1638
1639 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1640 return -EFAULT;
1641
1642 return 0;
1643}
1644
1645static int __cpuinit init_timers_cpu(int cpu) 1506static int __cpuinit init_timers_cpu(int cpu)
1646{ 1507{
1647 int j; 1508 int j;
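The do_sysinfo() block removed above (it moves to kernel/sys.c, per the diffstat) folds mem_unit into the memory totals so old userspace sees byte counts with mem_unit == 1. A tiny sketch of just that normalization step, with made-up page counts and without the overflow guard the kernel keeps:

#include <stdio.h>

int main(void)
{
	unsigned long totalram = 524288;   /* assumed: pages of RAM          */
	unsigned int mem_unit = 4096;      /* assumed: bytes per unit (page) */
	unsigned int bitcount = 0;

	/* Count how many doublings mem_unit represents... */
	while (mem_unit > 1) {
		bitcount++;
		mem_unit >>= 1;
	}

	/* ...then scale the totals by the same factor and report bytes. */
	totalram <<= bitcount;
	printf("totalram: %lu bytes (mem_unit is now 1)\n", totalram);
	return 0;
}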
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index fc382d6e2765..5e9efd4b83a4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
176 select GENERIC_TRACER 176 select GENERIC_TRACER
177 select TRACER_MAX_TRACE 177 select TRACER_MAX_TRACE
178 select RING_BUFFER_ALLOW_SWAP 178 select RING_BUFFER_ALLOW_SWAP
179 select TRACER_SNAPSHOT
180 select TRACER_SNAPSHOT_PER_CPU_SWAP
179 help 181 help
180 This option measures the time spent in irqs-off critical 182 This option measures the time spent in irqs-off critical
181 sections, with microsecond accuracy. 183 sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
198 select GENERIC_TRACER 200 select GENERIC_TRACER
199 select TRACER_MAX_TRACE 201 select TRACER_MAX_TRACE
200 select RING_BUFFER_ALLOW_SWAP 202 select RING_BUFFER_ALLOW_SWAP
203 select TRACER_SNAPSHOT
204 select TRACER_SNAPSHOT_PER_CPU_SWAP
201 help 205 help
202 This option measures the time spent in preemption-off critical 206 This option measures the time spent in preemption-off critical
203 sections, with microsecond accuracy. 207 sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
217 select GENERIC_TRACER 221 select GENERIC_TRACER
218 select CONTEXT_SWITCH_TRACER 222 select CONTEXT_SWITCH_TRACER
219 select TRACER_MAX_TRACE 223 select TRACER_MAX_TRACE
224 select TRACER_SNAPSHOT
220 help 225 help
221 This tracer tracks the latency of the highest priority task 226 This tracer tracks the latency of the highest priority task
222 to be scheduled in, starting from the point it has woken up. 227 to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
248 echo 1 > /sys/kernel/debug/tracing/snapshot 253 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot 254 cat snapshot
250 255
256config TRACER_SNAPSHOT_PER_CPU_SWAP
257 bool "Allow snapshot to swap per CPU"
258 depends on TRACER_SNAPSHOT
259 select RING_BUFFER_ALLOW_SWAP
260 help
261 Allow doing a snapshot of a single CPU buffer instead of a
262 full swap (all buffers). If this is set, then the following is
263 allowed:
264
265 echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
266
267 After this, only the tracing buffer for CPU 2 is swapped with
268 the main tracing buffer; the other CPU buffers remain unchanged.
269
270 When this is enabled, it adds a little more overhead to the
271 trace recording, as it needs extra checks to synchronize
272 recording with swaps. But this does not affect the performance
273 of the overall system. This is enabled by default when the preempt
274 or irq latency tracers are enabled, as those need to swap as well
275 and already add the overhead (plus a lot more).
276
251config TRACE_BRANCH_PROFILING 277config TRACE_BRANCH_PROFILING
252 bool 278 bool
253 select GENERIC_TRACER 279 select GENERIC_TRACER
@@ -524,6 +550,29 @@ config RING_BUFFER_BENCHMARK
524 550
525 If unsure, say N. 551 If unsure, say N.
526 552
553config RING_BUFFER_STARTUP_TEST
554 bool "Ring buffer startup self test"
555 depends on RING_BUFFER
556 help
557 Run a simple self test on the ring buffer on boot up. Late in the
558 kernel boot sequence, the test starts and kicks off
559 a thread per CPU. Each thread will write events of various sizes
560 into the ring buffer. Another thread is created to send IPIs
561 to each of the threads, where the IPI handler will also write
562 to the ring buffer, to test/stress the nesting ability.
563 If any anomalies are discovered, a warning will be displayed
564 and all ring buffers will be disabled.
565
566 The test runs for 10 seconds. This will slow your boot time
567 by at least 10 more seconds.
568
569 At the end of the test, statistics and more checks are done.
570 It will output the stats of each per-CPU buffer: what
571 was written, the sizes, what was read, what was lost, and
572 other similar details.
573
574 If unsure, say N.
575
527endif # FTRACE 576endif # FTRACE
528 577
529endif # TRACING_SUPPORT 578endif # TRACING_SUPPORT
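The per-CPU snapshot file mentioned in the TRACER_SNAPSHOT_PER_CPU_SWAP help above can be poked from a program as well as from the shell. A minimal sketch doing the equivalent of the echo; the path assumes debugfs is mounted at /sys/kernel/debug and that CPU 2 exists.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/per_cpu/cpu2/snapshot";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, "1", 1) != 1)   /* "1": swap this CPU's buffer into the snapshot */
		perror("write");
	close(fd);
	return 0;
}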
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5a0f781cd729..ed58a3216a6d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
72 bool blk_tracer = blk_tracer_enabled; 72 bool blk_tracer = blk_tracer_enabled;
73 73
74 if (blk_tracer) { 74 if (blk_tracer) {
75 buffer = blk_tr->buffer; 75 buffer = blk_tr->trace_buffer.buffer;
76 pc = preempt_count(); 76 pc = preempt_count();
77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
78 sizeof(*t) + len, 78 sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
218 if (blk_tracer) { 218 if (blk_tracer) {
219 tracing_record_cmdline(current); 219 tracing_record_cmdline(current);
220 220
221 buffer = blk_tr->buffer; 221 buffer = blk_tr->trace_buffer.buffer;
222 pc = preempt_count(); 222 pc = preempt_count();
223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
224 sizeof(*t) + pdu_len, 224 sizeof(*t) + pdu_len,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b3fde6d7b7fc..8a5c017bb50c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
486#define PROFILES_PER_PAGE \ 486#define PROFILES_PER_PAGE \
487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
488 488
489static int ftrace_profile_bits __read_mostly;
490static int ftrace_profile_enabled __read_mostly; 489static int ftrace_profile_enabled __read_mostly;
491 490
492/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 491/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
494 493
495static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 494static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
496 495
497#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 496#define FTRACE_PROFILE_HASH_BITS 10
497#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
498 498
499static void * 499static void *
500function_stat_next(void *v, int idx) 500function_stat_next(void *v, int idx)
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
676 676
677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
678 678
679 for (i = 0; i < pages; i++) { 679 for (i = 1; i < pages; i++) {
680 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 680 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
681 if (!pg->next) 681 if (!pg->next)
682 goto out_free; 682 goto out_free;
@@ -724,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)
724 if (!stat->hash) 724 if (!stat->hash)
725 return -ENOMEM; 725 return -ENOMEM;
726 726
727 if (!ftrace_profile_bits) {
728 size--;
729
730 for (; size; size >>= 1)
731 ftrace_profile_bits++;
732 }
733
734 /* Preallocate the function profiling pages */ 727 /* Preallocate the function profiling pages */
735 if (ftrace_profile_pages_init(stat) < 0) { 728 if (ftrace_profile_pages_init(stat) < 0) {
736 kfree(stat->hash); 729 kfree(stat->hash);
@@ -763,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
763 struct hlist_head *hhd; 756 struct hlist_head *hhd;
764 unsigned long key; 757 unsigned long key;
765 758
766 key = hash_long(ip, ftrace_profile_bits); 759 key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
767 hhd = &stat->hash[key]; 760 hhd = &stat->hash[key];
768 761
769 if (hlist_empty(hhd)) 762 if (hlist_empty(hhd))
@@ -782,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
782{ 775{
783 unsigned long key; 776 unsigned long key;
784 777
785 key = hash_long(rec->ip, ftrace_profile_bits); 778 key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
786 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 779 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
787} 780}
788 781
@@ -1079,7 +1072,7 @@ struct ftrace_func_probe {
1079 unsigned long flags; 1072 unsigned long flags;
1080 unsigned long ip; 1073 unsigned long ip;
1081 void *data; 1074 void *data;
1082 struct rcu_head rcu; 1075 struct list_head free_list;
1083}; 1076};
1084 1077
1085struct ftrace_func_entry { 1078struct ftrace_func_entry {
@@ -1329,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1329 struct hlist_head *hhd; 1322 struct hlist_head *hhd;
1330 struct ftrace_hash *old_hash; 1323 struct ftrace_hash *old_hash;
1331 struct ftrace_hash *new_hash; 1324 struct ftrace_hash *new_hash;
1332 unsigned long key;
1333 int size = src->count; 1325 int size = src->count;
1334 int bits = 0; 1326 int bits = 0;
1335 int ret; 1327 int ret;
@@ -1372,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1372 for (i = 0; i < size; i++) { 1364 for (i = 0; i < size; i++) {
1373 hhd = &src->buckets[i]; 1365 hhd = &src->buckets[i];
1374 hlist_for_each_entry_safe(entry, tn, hhd, hlist) { 1366 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1375 if (bits > 0)
1376 key = hash_long(entry->ip, bits);
1377 else
1378 key = 0;
1379 remove_hash_entry(src, entry); 1367 remove_hash_entry(src, entry);
1380 __add_hash_entry(new_hash, entry); 1368 __add_hash_entry(new_hash, entry);
1381 } 1369 }
@@ -2973,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)
2973} 2961}
2974 2962
2975 2963
2976static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2964static void ftrace_free_entry(struct ftrace_func_probe *entry)
2977{ 2965{
2978 struct ftrace_func_probe *entry =
2979 container_of(rhp, struct ftrace_func_probe, rcu);
2980
2981 if (entry->ops->free) 2966 if (entry->ops->free)
2982 entry->ops->free(&entry->data); 2967 entry->ops->free(entry->ops, entry->ip, &entry->data);
2983 kfree(entry); 2968 kfree(entry);
2984} 2969}
2985 2970
2986
2987int 2971int
2988register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2972register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2989 void *data) 2973 void *data)
2990{ 2974{
2991 struct ftrace_func_probe *entry; 2975 struct ftrace_func_probe *entry;
2976 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
2977 struct ftrace_hash *hash;
2992 struct ftrace_page *pg; 2978 struct ftrace_page *pg;
2993 struct dyn_ftrace *rec; 2979 struct dyn_ftrace *rec;
2994 int type, len, not; 2980 int type, len, not;
2995 unsigned long key; 2981 unsigned long key;
2996 int count = 0; 2982 int count = 0;
2997 char *search; 2983 char *search;
2984 int ret;
2998 2985
2999 type = filter_parse_regex(glob, strlen(glob), &search, &not); 2986 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3000 len = strlen(search); 2987 len = strlen(search);
@@ -3005,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3005 2992
3006 mutex_lock(&ftrace_lock); 2993 mutex_lock(&ftrace_lock);
3007 2994
3008 if (unlikely(ftrace_disabled)) 2995 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2996 if (!hash) {
2997 count = -ENOMEM;
3009 goto out_unlock; 2998 goto out_unlock;
2999 }
3000
3001 if (unlikely(ftrace_disabled)) {
3002 count = -ENODEV;
3003 goto out_unlock;
3004 }
3010 3005
3011 do_for_each_ftrace_rec(pg, rec) { 3006 do_for_each_ftrace_rec(pg, rec) {
3012 3007
@@ -3030,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3030 * for each function we find. We call the callback 3025 * for each function we find. We call the callback
3031 * to give the caller an opportunity to do so. 3026 * to give the caller an opportunity to do so.
3032 */ 3027 */
3033 if (ops->callback) { 3028 if (ops->init) {
3034 if (ops->callback(rec->ip, &entry->data) < 0) { 3029 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3035 /* caller does not like this func */ 3030 /* caller does not like this func */
3036 kfree(entry); 3031 kfree(entry);
3037 continue; 3032 continue;
3038 } 3033 }
3039 } 3034 }
3040 3035
3036 ret = enter_record(hash, rec, 0);
3037 if (ret < 0) {
3038 kfree(entry);
3039 count = ret;
3040 goto out_unlock;
3041 }
3042
3041 entry->ops = ops; 3043 entry->ops = ops;
3042 entry->ip = rec->ip; 3044 entry->ip = rec->ip;
3043 3045
@@ -3045,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3045 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3047 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3046 3048
3047 } while_for_each_ftrace_rec(); 3049 } while_for_each_ftrace_rec();
3050
3051 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3052 if (ret < 0)
3053 count = ret;
3054
3048 __enable_ftrace_function_probe(); 3055 __enable_ftrace_function_probe();
3049 3056
3050 out_unlock: 3057 out_unlock:
3051 mutex_unlock(&ftrace_lock); 3058 mutex_unlock(&ftrace_lock);
3059 free_ftrace_hash(hash);
3052 3060
3053 return count; 3061 return count;
3054} 3062}
@@ -3062,7 +3070,12 @@ static void
3062__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3070__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3063 void *data, int flags) 3071 void *data, int flags)
3064{ 3072{
3073 struct ftrace_func_entry *rec_entry;
3065 struct ftrace_func_probe *entry; 3074 struct ftrace_func_probe *entry;
3075 struct ftrace_func_probe *p;
3076 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
3077 struct list_head free_list;
3078 struct ftrace_hash *hash;
3066 struct hlist_node *tmp; 3079 struct hlist_node *tmp;
3067 char str[KSYM_SYMBOL_LEN]; 3080 char str[KSYM_SYMBOL_LEN];
3068 int type = MATCH_FULL; 3081 int type = MATCH_FULL;
@@ -3083,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3083 } 3096 }
3084 3097
3085 mutex_lock(&ftrace_lock); 3098 mutex_lock(&ftrace_lock);
3099
3100 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3101 if (!hash)
3102 /* Hmm, should report this somehow */
3103 goto out_unlock;
3104
3105 INIT_LIST_HEAD(&free_list);
3106
3086 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3107 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3087 struct hlist_head *hhd = &ftrace_func_hash[i]; 3108 struct hlist_head *hhd = &ftrace_func_hash[i];
3088 3109
@@ -3103,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3103 continue; 3124 continue;
3104 } 3125 }
3105 3126
3127 rec_entry = ftrace_lookup_ip(hash, entry->ip);
3128 /* It is possible more than one entry had this ip */
3129 if (rec_entry)
3130 free_hash_entry(hash, rec_entry);
3131
3106 hlist_del_rcu(&entry->node); 3132 hlist_del_rcu(&entry->node);
3107 call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); 3133 list_add(&entry->free_list, &free_list);
3108 } 3134 }
3109 } 3135 }
3110 __disable_ftrace_function_probe(); 3136 __disable_ftrace_function_probe();
3137 /*
3138 * Remove after the disable is called. Otherwise, if the last
3139 * probe is removed, a null hash means *all enabled*.
3140 */
3141 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3142 synchronize_sched();
3143 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3144 list_del(&entry->free_list);
3145 ftrace_free_entry(entry);
3146 }
3147
3148 out_unlock:
3111 mutex_unlock(&ftrace_lock); 3149 mutex_unlock(&ftrace_lock);
3150 free_ftrace_hash(hash);
3112} 3151}
3113 3152
3114void 3153void
@@ -3736,7 +3775,8 @@ out:
3736 if (fail) 3775 if (fail)
3737 return -EINVAL; 3776 return -EINVAL;
3738 3777
3739 ftrace_graph_filter_enabled = 1; 3778 ftrace_graph_filter_enabled = !!(*idx);
3779
3740 return 0; 3780 return 0;
3741} 3781}
3742 3782
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6989df2ba194..b59aea2c48c2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,13 +8,16 @@
8#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h>
11#include <linux/debugfs.h> 12#include <linux/debugfs.h>
12#include <linux/uaccess.h> 13#include <linux/uaccess.h>
13#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */
14#include <linux/kmemcheck.h> 16#include <linux/kmemcheck.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/percpu.h> 18#include <linux/percpu.h>
17#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/delay.h>
18#include <linux/slab.h> 21#include <linux/slab.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/hash.h> 23#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
444 return ret; 447 return ret;
445} 448}
446 449
450struct rb_irq_work {
451 struct irq_work work;
452 wait_queue_head_t waiters;
453 bool waiters_pending;
454};
455
447/* 456/*
448 * head_page == tail_page && head == tail then buffer is empty. 457 * head_page == tail_page && head == tail then buffer is empty.
449 */ 458 */
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
478 struct list_head new_pages; /* new pages to add */ 487 struct list_head new_pages; /* new pages to add */
479 struct work_struct update_pages_work; 488 struct work_struct update_pages_work;
480 struct completion update_done; 489 struct completion update_done;
490
491 struct rb_irq_work irq_work;
481}; 492};
482 493
483struct ring_buffer { 494struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
497 struct notifier_block cpu_notify; 508 struct notifier_block cpu_notify;
498#endif 509#endif
499 u64 (*clock)(void); 510 u64 (*clock)(void);
511
512 struct rb_irq_work irq_work;
500}; 513};
501 514
502struct ring_buffer_iter { 515struct ring_buffer_iter {
@@ -508,6 +521,118 @@ struct ring_buffer_iter {
508 u64 read_stamp; 521 u64 read_stamp;
509}; 522};
510 523
524/*
525 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
526 *
527 * Schedules a delayed work to wake up any task that is blocked on the
528 * ring buffer waiters queue.
529 */
530static void rb_wake_up_waiters(struct irq_work *work)
531{
532 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
533
534 wake_up_all(&rbwork->waiters);
535}
536
537/**
538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on
541 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer.
545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{
548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait);
550 struct rb_irq_work *work;
551
552 /*
553 * Depending on what the caller is waiting for, either any
554 * data in any cpu buffer, or a specific buffer, put the
555 * caller on the appropriate wait queue.
556 */
557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work;
559 else {
560 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work;
562 }
563
564
565 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
566
567 /*
568 * The events can happen in critical sections where
569 * checking a work queue can cause deadlocks.
570 * After adding a task to the queue, this flag is set
571 * only to notify events to try to wake up the queue
572 * using irq_work.
573 *
574 * We don't clear it even if the buffer is no longer
575 * empty. The flag only causes the next event to run
577 * irq_work to do the work queue wake up. The worst
577 * that can happen if we race with !trace_empty() is that
578 * an event will cause an irq_work to try to wake up
579 * an empty queue.
580 *
581 * There's no reason to protect this flag either, as
582 * the work queue and irq_work logic will do the necessary
583 * synchronization for the wake ups. The only thing
584 * that is necessary is that the wake up happens after
585 * a task has been queued. It's OK for spurious wake ups.
586 */
587 work->waiters_pending = true;
588
589 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
590 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
591 schedule();
592
593 finish_wait(&work->waiters, &wait);
594}
595
596/**
597 * ring_buffer_poll_wait - poll on buffer input
598 * @buffer: buffer to wait on
599 * @cpu: the cpu buffer to wait on
600 * @filp: the file descriptor
601 * @poll_table: The poll descriptor
602 *
603 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
604 * as data is added to any of the @buffer's cpu buffers. Otherwise
605 * it will wait for data to be added to a specific cpu buffer.
606 *
607 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
608 * zero otherwise.
609 */
610int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
611 struct file *filp, poll_table *poll_table)
612{
613 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work;
615
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work;
622 else {
623 cpu_buffer = buffer->buffers[cpu];
624 work = &cpu_buffer->irq_work;
625 }
626
627 work->waiters_pending = true;
628 poll_wait(filp, &work->waiters, poll_table);
629
630 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
631 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
632 return POLLIN | POLLRDNORM;
633 return 0;
634}
635
511/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 636/* buffer may be either ring_buffer or ring_buffer_per_cpu */
512#define RB_WARN_ON(b, cond) \ 637#define RB_WARN_ON(b, cond) \
513 ({ \ 638 ({ \
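ring_buffer_wait() above lets a blocking reader sleep until a writer's commit path runs rb_wakeups() and queues the irq_work. A kernel-style fragment showing how a consumer thread might use it; handle_event() and the surrounding kthread setup are placeholders for illustration, not part of this patch.

/* Hypothetical consumer: sleep until CPU 0 has data, then drain it. */
static int example_reader(void *arg)
{
	struct ring_buffer *buffer = arg;
	struct ring_buffer_event *event;
	int cpu = 0;   /* assumed: watch a single CPU's buffer */

	while (!kthread_should_stop()) {
		/* Blocks until a commit on this CPU queues the wakeup irq_work. */
		ring_buffer_wait(buffer, cpu);

		while ((event = ring_buffer_consume(buffer, cpu, NULL, NULL)))
			handle_event(ring_buffer_event_data(event));   /* placeholder */
	}
	return 0;
}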
@@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1063 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1188 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1064 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); 1189 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1065 init_completion(&cpu_buffer->update_done); 1190 init_completion(&cpu_buffer->update_done);
1191 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1192 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1066 1193
1067 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1194 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1068 GFP_KERNEL, cpu_to_node(cpu)); 1195 GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1158 buffer->clock = trace_clock_local; 1285 buffer->clock = trace_clock_local;
1159 buffer->reader_lock_key = key; 1286 buffer->reader_lock_key = key;
1160 1287
1288 init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
1289 init_waitqueue_head(&buffer->irq_work.waiters);
1290
1161 /* need at least two pages */ 1291 /* need at least two pages */
1162 if (nr_pages < 2) 1292 if (nr_pages < 2)
1163 nr_pages = 2; 1293 nr_pages = 2;
@@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1553 if (!cpu_buffer->nr_pages_to_update) 1683 if (!cpu_buffer->nr_pages_to_update)
1554 continue; 1684 continue;
1555 1685
1556 if (cpu_online(cpu)) 1686 /* The update must run on the CPU that is being updated. */
1687 preempt_disable();
1688 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1689 rb_update_pages(cpu_buffer);
1690 cpu_buffer->nr_pages_to_update = 0;
1691 } else {
1692 /*
1693 * Can not disable preemption for schedule_work_on()
1694 * on PREEMPT_RT.
1695 */
1696 preempt_enable();
1557 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1558 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1559 else 1699 preempt_disable();
1560 rb_update_pages(cpu_buffer); 1700 }
1701 preempt_enable();
1561 } 1702 }
1562 1703
1563 /* wait for all the updates to complete */ 1704 /* wait for all the updates to complete */
@@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1595 1736
1596 get_online_cpus(); 1737 get_online_cpus();
1597 1738
1598 if (cpu_online(cpu_id)) { 1739 preempt_disable();
1740 /* The update must run on the CPU that is being updated. */
1741 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1742 rb_update_pages(cpu_buffer);
1743 else {
1744 /*
1745 * Can not disable preemption for schedule_work_on()
1746 * on PREEMPT_RT.
1747 */
1748 preempt_enable();
1599 schedule_work_on(cpu_id, 1749 schedule_work_on(cpu_id,
1600 &cpu_buffer->update_pages_work); 1750 &cpu_buffer->update_pages_work);
1601 wait_for_completion(&cpu_buffer->update_done); 1751 wait_for_completion(&cpu_buffer->update_done);
1602 } else 1752 preempt_disable();
1603 rb_update_pages(cpu_buffer); 1753 }
1754 preempt_enable();
1604 1755
1605 cpu_buffer->nr_pages_to_update = 0; 1756 cpu_buffer->nr_pages_to_update = 0;
1606 put_online_cpus(); 1757 put_online_cpus();
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2612 rb_end_commit(cpu_buffer); 2763 rb_end_commit(cpu_buffer);
2613} 2764}
2614 2765
2766static __always_inline void
2767rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2768{
2769 if (buffer->irq_work.waiters_pending) {
2770 buffer->irq_work.waiters_pending = false;
2771 /* irq_work_queue() supplies it's own memory barriers */
2772 irq_work_queue(&buffer->irq_work.work);
2773 }
2774
2775 if (cpu_buffer->irq_work.waiters_pending) {
2776 cpu_buffer->irq_work.waiters_pending = false;
2777 /* irq_work_queue() supplies it's own memory barriers */
2778 irq_work_queue(&cpu_buffer->irq_work.work);
2779 }
2780}
2781
2615/** 2782/**
2616 * ring_buffer_unlock_commit - commit a reserved 2783 * ring_buffer_unlock_commit - commit a reserved
2617 * @buffer: The buffer to commit to 2784 * @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2631 2798
2632 rb_commit(cpu_buffer, event); 2799 rb_commit(cpu_buffer, event);
2633 2800
2801 rb_wakeups(buffer, cpu_buffer);
2802
2634 trace_recursive_unlock(); 2803 trace_recursive_unlock();
2635 2804
2636 preempt_enable_notrace(); 2805 preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
2803 2972
2804 rb_commit(cpu_buffer, event); 2973 rb_commit(cpu_buffer, event);
2805 2974
2975 rb_wakeups(buffer, cpu_buffer);
2976
2806 ret = 0; 2977 ret = 0;
2807 out: 2978 out:
2808 preempt_enable_notrace(); 2979 preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
4467 return NOTIFY_OK; 4638 return NOTIFY_OK;
4468} 4639}
4469#endif 4640#endif
4641
4642#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
4643/*
4644 * This is a basic integrity check of the ring buffer.
4645 * Late in the boot cycle this test will run when configured in.
4646 * It will kick off a thread per CPU that will go into a loop
4647 * writing to the per cpu ring buffer various sizes of data.
4648 * Some of the data will be large items, some small.
4649 *
4650 * Another thread is created that goes into a spin, sending out
4651 * IPIs to the other CPUs to also write into the ring buffer.
4652 * this is to test the nesting ability of the buffer.
4653 *
4654 * Basic stats are recorded and reported. If something in the
4655 * ring buffer should happen that's not expected, a big warning
4656 * is displayed and all ring buffers are disabled.
4657 */
4658static struct task_struct *rb_threads[NR_CPUS] __initdata;
4659
4660struct rb_test_data {
4661 struct ring_buffer *buffer;
4662 unsigned long events;
4663 unsigned long bytes_written;
4664 unsigned long bytes_alloc;
4665 unsigned long bytes_dropped;
4666 unsigned long events_nested;
4667 unsigned long bytes_written_nested;
4668 unsigned long bytes_alloc_nested;
4669 unsigned long bytes_dropped_nested;
4670 int min_size_nested;
4671 int max_size_nested;
4672 int max_size;
4673 int min_size;
4674 int cpu;
4675 int cnt;
4676};
4677
4678static struct rb_test_data rb_data[NR_CPUS] __initdata;
4679
4680/* 1 meg per cpu */
4681#define RB_TEST_BUFFER_SIZE 1048576
4682
4683static char rb_string[] __initdata =
4684 "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
4685 "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
4686 "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
4687
4688static bool rb_test_started __initdata;
4689
4690struct rb_item {
4691 int size;
4692 char str[];
4693};
4694
4695static __init int rb_write_something(struct rb_test_data *data, bool nested)
4696{
4697 struct ring_buffer_event *event;
4698 struct rb_item *item;
4699 bool started;
4700 int event_len;
4701 int size;
4702 int len;
4703 int cnt;
4704
4705 /* Have nested writes different than what is written */
4706 cnt = data->cnt + (nested ? 27 : 0);
4707
4708 /* Multiply cnt by ~e, to make some unique increment */
4709 size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
4710
4711 len = size + sizeof(struct rb_item);
4712
4713 started = rb_test_started;
4714 /* read rb_test_started before checking buffer enabled */
4715 smp_rmb();
4716
4717 event = ring_buffer_lock_reserve(data->buffer, len);
4718 if (!event) {
4719 /* Ignore dropped events before test starts. */
4720 if (started) {
4721 if (nested)
4722 data->bytes_dropped_nested += len;
4723 else
4724 data->bytes_dropped += len;
4725 }
4726 return len;
4727 }
4728
4729 event_len = ring_buffer_event_length(event);
4730
4731 if (RB_WARN_ON(data->buffer, event_len < len))
4732 goto out;
4733
4734 item = ring_buffer_event_data(event);
4735 item->size = size;
4736 memcpy(item->str, rb_string, size);
4737
4738 if (nested) {
4739 data->bytes_alloc_nested += event_len;
4740 data->bytes_written_nested += len;
4741 data->events_nested++;
4742 if (!data->min_size_nested || len < data->min_size_nested)
4743 data->min_size_nested = len;
4744 if (len > data->max_size_nested)
4745 data->max_size_nested = len;
4746 } else {
4747 data->bytes_alloc += event_len;
4748 data->bytes_written += len;
4749 data->events++;
4750 if (!data->min_size || len < data->min_size)
4751 data->min_size = len;
4752 if (len > data->max_size)
4753 data->max_size = len;
4754 }
4755
4756 out:
4757 ring_buffer_unlock_commit(data->buffer, event);
4758
4759 return 0;
4760}
4761
4762static __init int rb_test(void *arg)
4763{
4764 struct rb_test_data *data = arg;
4765
4766 while (!kthread_should_stop()) {
4767 rb_write_something(data, false);
4768 data->cnt++;
4769
4770 set_current_state(TASK_INTERRUPTIBLE);
4771 /* Now sleep between a min of 100-300us and a max of 1ms */
4772 usleep_range(((data->cnt % 3) + 1) * 100, 1000);
4773 }
4774
4775 return 0;
4776}
4777
4778static __init void rb_ipi(void *ignore)
4779{
4780 struct rb_test_data *data;
4781 int cpu = smp_processor_id();
4782
4783 data = &rb_data[cpu];
4784 rb_write_something(data, true);
4785}
4786
4787static __init int rb_hammer_test(void *arg)
4788{
4789 while (!kthread_should_stop()) {
4790
4791 /* Send an IPI to all cpus to write data! */
4792 smp_call_function(rb_ipi, NULL, 1);
4793 /* No sleep, but for non preempt, let others run */
4794 schedule();
4795 }
4796
4797 return 0;
4798}
4799
4800static __init int test_ringbuffer(void)
4801{
4802 struct task_struct *rb_hammer;
4803 struct ring_buffer *buffer;
4804 int cpu;
4805 int ret = 0;
4806
4807 pr_info("Running ring buffer tests...\n");
4808
4809 buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
4810 if (WARN_ON(!buffer))
4811 return 0;
4812
4813 /* Disable buffer so that threads can't write to it yet */
4814 ring_buffer_record_off(buffer);
4815
4816 for_each_online_cpu(cpu) {
4817 rb_data[cpu].buffer = buffer;
4818 rb_data[cpu].cpu = cpu;
4819 rb_data[cpu].cnt = cpu;
4820 rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
4821 "rbtester/%d", cpu);
4822 if (WARN_ON(!rb_threads[cpu])) {
4823 pr_cont("FAILED\n");
4824 ret = -1;
4825 goto out_free;
4826 }
4827
4828 kthread_bind(rb_threads[cpu], cpu);
4829 wake_up_process(rb_threads[cpu]);
4830 }
4831
4832 /* Now create the rb hammer! */
4833 rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
4834 if (WARN_ON(!rb_hammer)) {
4835 pr_cont("FAILED\n");
4836 ret = -1;
4837 goto out_free;
4838 }
4839
4840 ring_buffer_record_on(buffer);
4841 /*
4842 * Show buffer is enabled before setting rb_test_started.
4843 * Yes there's a small race window where events could be
4844 * dropped and the thread won't catch it. But when a ring
4845 * buffer gets enabled, there will always be some kind of
4846 * delay before other CPUs see it. Thus, we don't care about
4847 * those dropped events. We care about events dropped after
4848 * the threads see that the buffer is active.
4849 */
4850 smp_wmb();
4851 rb_test_started = true;
4852
4853 set_current_state(TASK_INTERRUPTIBLE);
4854 /* Just run for 10 seconds */
4855 schedule_timeout(10 * HZ);
4856
4857 kthread_stop(rb_hammer);
4858
4859 out_free:
4860 for_each_online_cpu(cpu) {
4861 if (!rb_threads[cpu])
4862 break;
4863 kthread_stop(rb_threads[cpu]);
4864 }
4865 if (ret) {
4866 ring_buffer_free(buffer);
4867 return ret;
4868 }
4869
4870 /* Report! */
4871 pr_info("finished\n");
4872 for_each_online_cpu(cpu) {
4873 struct ring_buffer_event *event;
4874 struct rb_test_data *data = &rb_data[cpu];
4875 struct rb_item *item;
4876 unsigned long total_events;
4877 unsigned long total_dropped;
4878 unsigned long total_written;
4879 unsigned long total_alloc;
4880 unsigned long total_read = 0;
4881 unsigned long total_size = 0;
4882 unsigned long total_len = 0;
4883 unsigned long total_lost = 0;
4884 unsigned long lost;
4885 int big_event_size;
4886 int small_event_size;
4887
4888 ret = -1;
4889
4890 total_events = data->events + data->events_nested;
4891 total_written = data->bytes_written + data->bytes_written_nested;
4892 total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
4893 total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
4894
4895 big_event_size = data->max_size + data->max_size_nested;
4896 small_event_size = data->min_size + data->min_size_nested;
4897
4898 pr_info("CPU %d:\n", cpu);
4899 pr_info(" events: %ld\n", total_events);
4900 pr_info(" dropped bytes: %ld\n", total_dropped);
4901 pr_info(" alloced bytes: %ld\n", total_alloc);
4902 pr_info(" written bytes: %ld\n", total_written);
4903 pr_info(" biggest event: %d\n", big_event_size);
4904 pr_info(" smallest event: %d\n", small_event_size);
4905
4906 if (RB_WARN_ON(buffer, total_dropped))
4907 break;
4908
4909 ret = 0;
4910
4911 while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
4912 total_lost += lost;
4913 item = ring_buffer_event_data(event);
4914 total_len += ring_buffer_event_length(event);
4915 total_size += item->size + sizeof(struct rb_item);
4916 if (memcmp(&item->str[0], rb_string, item->size) != 0) {
4917 pr_info("FAILED!\n");
4918 pr_info("buffer had: %.*s\n", item->size, item->str);
4919 pr_info("expected: %.*s\n", item->size, rb_string);
4920 RB_WARN_ON(buffer, 1);
4921 ret = -1;
4922 break;
4923 }
4924 total_read++;
4925 }
4926 if (ret)
4927 break;
4928
4929 ret = -1;
4930
4931 pr_info(" read events: %ld\n", total_read);
4932 pr_info(" lost events: %ld\n", total_lost);
4933 pr_info(" total events: %ld\n", total_lost + total_read);
4934 pr_info(" recorded len bytes: %ld\n", total_len);
4935 pr_info(" recorded size bytes: %ld\n", total_size);
4936 if (total_lost)
4937 pr_info(" With dropped events, record len and size may not match\n"
4938 " alloced and written from above\n");
4939 if (!total_lost) {
4940 if (RB_WARN_ON(buffer, total_len != total_alloc ||
4941 total_size != total_written))
4942 break;
4943 }
4944 if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
4945 break;
4946
4947 ret = 0;
4948 }
4949 if (!ret)
4950 pr_info("Ring buffer PASSED!\n");
4951
4952 ring_buffer_free(buffer);
4953 return 0;
4954}
4955
4956late_initcall(test_ringbuffer);
4957#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
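The smp_wmb() in test_ringbuffer() publishes the enabled ring buffer before rb_test_started becomes visible; the writer side is expected to pair that with a read barrier before it starts charging dropped events against the test. A minimal sketch of that pairing, using the names from the test above (the two helper functions are illustrative, not the exact bodies of test_ringbuffer() or rb_write_something()):

/* rb_test_started stands in for the test's flag of the same name. */
static bool rb_test_started;

/* Publish side (sketch): enable the buffer, then let writers see the flag. */
static void rb_test_publish(struct ring_buffer *buffer)
{
	ring_buffer_record_on(buffer);
	smp_wmb();		/* buffer enabled before rb_test_started is set */
	rb_test_started = true;
}

/* Writer side (sketch): read the flag before looking at buffer state. */
static bool rb_test_may_count_drops(void)
{
	bool started = rb_test_started;

	smp_rmb();		/* pairs with the smp_wmb() on the publish side */
	return started;
}
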
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 66338c4f7f4b..ae6fa2d1cdf7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * ring buffer based function tracer 2 * ring buffer based function tracer
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally taken from the RT patch by: 7 * Originally taken from the RT patch by:
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -48,7 +47,7 @@
48 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
50 */ 49 */
51int ring_buffer_expanded; 50bool ring_buffer_expanded;
52 51
53/* 52/*
54 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
87static DEFINE_PER_CPU(bool, trace_cmdline_save); 86static DEFINE_PER_CPU(bool, trace_cmdline_save);
88 87
89/* 88/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
98 * Kill all tracing for good (never come back). 89 * Kill all tracing for good (never come back).
99 * It is initialized to 1 but will turn to zero if the initialization 90 * It is initialized to 1 but will turn to zero if the initialization
100 * of the tracer is successful. But that is the only place that sets 91 * of the tracer is successful. But that is the only place that sets
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
130static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 121static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
131static char *default_bootup_tracer; 122static char *default_bootup_tracer;
132 123
124static bool allocate_snapshot;
125
133static int __init set_cmdline_ftrace(char *str) 126static int __init set_cmdline_ftrace(char *str)
134{ 127{
135 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 128 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
136 default_bootup_tracer = bootup_tracer_buf; 129 default_bootup_tracer = bootup_tracer_buf;
137 /* We are using ftrace early, expand it */ 130 /* We are using ftrace early, expand it */
138 ring_buffer_expanded = 1; 131 ring_buffer_expanded = true;
139 return 1; 132 return 1;
140} 133}
141__setup("ftrace=", set_cmdline_ftrace); 134__setup("ftrace=", set_cmdline_ftrace);
@@ -156,6 +149,15 @@ static int __init set_ftrace_dump_on_oops(char *str)
156} 149}
157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
158 151
152static int __init boot_alloc_snapshot(char *str)
153{
154 allocate_snapshot = true;
155 /* We also need the main ring buffer expanded */
156 ring_buffer_expanded = true;
157 return 1;
158}
159__setup("alloc_snapshot", boot_alloc_snapshot);
160
159 161
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; 162static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata; 163static char *trace_boot_options __initdata;
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)
189 */ 191 */
190static struct trace_array global_trace; 192static struct trace_array global_trace;
191 193
192static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 194LIST_HEAD(ftrace_trace_arrays);
193 195
194int filter_current_check_discard(struct ring_buffer *buffer, 196int filter_current_check_discard(struct ring_buffer *buffer,
195 struct ftrace_event_call *call, void *rec, 197 struct ftrace_event_call *call, void *rec,
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)
204 u64 ts; 206 u64 ts;
205 207
206 /* Early boot up does not have a buffer yet */ 208 /* Early boot up does not have a buffer yet */
207 if (!global_trace.buffer) 209 if (!global_trace.trace_buffer.buffer)
208 return trace_clock_local(); 210 return trace_clock_local();
209 211
210 ts = ring_buffer_time_stamp(global_trace.buffer, cpu); 212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
211 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); 213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
212 214
213 return ts; 215 return ts;
214} 216}
215 217
216/*
217 * The max_tr is used to snapshot the global_trace when a maximum
218 * latency is reached. Some tracers will use this to store a maximum
219 * trace while it continues examining live traces.
220 *
221 * The buffers for the max_tr are set up the same as the global_trace.
222 * When a snapshot is taken, the link list of the max_tr is swapped
223 * with the link list of the global_trace and the buffers are reset for
224 * the global_trace so the tracing can continue.
225 */
226static struct trace_array max_tr;
227
228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
229
230int tracing_is_enabled(void) 218int tracing_is_enabled(void)
231{ 219{
232 return tracing_is_on(); 220 return tracing_is_on();
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249/* trace_types holds a link list of available tracers. */ 237/* trace_types holds a link list of available tracers. */
250static struct tracer *trace_types __read_mostly; 238static struct tracer *trace_types __read_mostly;
251 239
252/* current_trace points to the tracer that is currently active */
253static struct tracer *current_trace __read_mostly = &nop_trace;
254
255/* 240/*
256 * trace_types_lock is used to protect the trace_types list. 241 * trace_types_lock is used to protect the trace_types list.
257 */ 242 */
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
285 270
286static inline void trace_access_lock(int cpu) 271static inline void trace_access_lock(int cpu)
287{ 272{
288 if (cpu == TRACE_PIPE_ALL_CPU) { 273 if (cpu == RING_BUFFER_ALL_CPUS) {
289 /* gain it for accessing the whole ring buffer. */ 274 /* gain it for accessing the whole ring buffer. */
290 down_write(&all_cpu_access_lock); 275 down_write(&all_cpu_access_lock);
291 } else { 276 } else {
292 /* gain it for accessing a cpu ring buffer. */ 277 /* gain it for accessing a cpu ring buffer. */
293 278
294 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ 279 /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
295 down_read(&all_cpu_access_lock); 280 down_read(&all_cpu_access_lock);
296 281
297 /* Secondly block other access to this @cpu ring buffer. */ 282 /* Secondly block other access to this @cpu ring buffer. */
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)
301 286
302static inline void trace_access_unlock(int cpu) 287static inline void trace_access_unlock(int cpu)
303{ 288{
304 if (cpu == TRACE_PIPE_ALL_CPU) { 289 if (cpu == RING_BUFFER_ALL_CPUS) {
305 up_write(&all_cpu_access_lock); 290 up_write(&all_cpu_access_lock);
306 } else { 291 } else {
307 mutex_unlock(&per_cpu(cpu_access_lock, cpu)); 292 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)
339 324
340#endif 325#endif
341 326
342/* trace_wait is a waitqueue for tasks blocked on trace_poll */
343static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344
345/* trace_flags holds trace_options default values */ 327/* trace_flags holds trace_options default values */
346unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 328unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
347 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 329 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
348 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
349 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; 331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
350
351static int trace_stop_count;
352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
353
354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Schedules a delayed work to wake up any task that is blocked on the
358 * trace_wait queue. These is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
362{
363 wake_up_all(&trace_wait);
364
365}
366 332
367/** 333/**
368 * tracing_on - enable tracing buffers 334 * tracing_on - enable tracing buffers
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)
372 */ 338 */
373void tracing_on(void) 339void tracing_on(void)
374{ 340{
375 if (global_trace.buffer) 341 if (global_trace.trace_buffer.buffer)
376 ring_buffer_record_on(global_trace.buffer); 342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
377 /* 343 /*
378 * This flag is only looked at when buffers haven't been 344 * This flag is only looked at when buffers haven't been
379 * allocated yet. We don't really care about the race 345 * allocated yet. We don't really care about the race
@@ -385,6 +351,196 @@ void tracing_on(void)
385EXPORT_SYMBOL_GPL(tracing_on); 351EXPORT_SYMBOL_GPL(tracing_on);
386 352
387/** 353/**
354 * __trace_puts - write a constant string into the trace buffer.
355 * @ip: The address of the caller
356 * @str: The constant string to write
357 * @size: The size of the string.
358 */
359int __trace_puts(unsigned long ip, const char *str, int size)
360{
361 struct ring_buffer_event *event;
362 struct ring_buffer *buffer;
363 struct print_entry *entry;
364 unsigned long irq_flags;
365 int alloc;
366
367 alloc = sizeof(*entry) + size + 2; /* possible \n added */
368
369 local_save_flags(irq_flags);
370 buffer = global_trace.trace_buffer.buffer;
371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
372 irq_flags, preempt_count());
373 if (!event)
374 return 0;
375
376 entry = ring_buffer_event_data(event);
377 entry->ip = ip;
378
379 memcpy(&entry->buf, str, size);
380
381 /* Add a newline if necessary */
382 if (entry->buf[size - 1] != '\n') {
383 entry->buf[size] = '\n';
384 entry->buf[size + 1] = '\0';
385 } else
386 entry->buf[size] = '\0';
387
388 __buffer_unlock_commit(buffer, event);
389
390 return size;
391}
392EXPORT_SYMBOL_GPL(__trace_puts);
393
394/**
395 * __trace_bputs - write the pointer to a constant string into trace buffer
396 * @ip: The address of the caller
397 * @str: The constant string to write to the buffer to
398 */
399int __trace_bputs(unsigned long ip, const char *str)
400{
401 struct ring_buffer_event *event;
402 struct ring_buffer *buffer;
403 struct bputs_entry *entry;
404 unsigned long irq_flags;
405 int size = sizeof(struct bputs_entry);
406
407 local_save_flags(irq_flags);
408 buffer = global_trace.trace_buffer.buffer;
409 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
410 irq_flags, preempt_count());
411 if (!event)
412 return 0;
413
414 entry = ring_buffer_event_data(event);
415 entry->ip = ip;
416 entry->str = str;
417
418 __buffer_unlock_commit(buffer, event);
419
420 return 1;
421}
422EXPORT_SYMBOL_GPL(__trace_bputs);
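
__trace_puts() copies the whole string into the ring buffer, while __trace_bputs() stores only a pointer, so it is far cheaper but only safe for constant strings that outlive tracing. A hedged usage sketch calling the two exported helpers directly with _THIS_IP_ (the probe function and message text are made up for illustration):

/* Sketch: drop a couple of markers into the trace from a probe site. */
static void example_mark_slow_path(bool failed)
{
	static const char msg[] = "example: entered slow path\n";

	/* Full copy: works for any string, costs its length per event. */
	__trace_puts(_THIS_IP_, msg, sizeof(msg) - 1);

	/* Pointer only: the literal must stay valid for the buffer's lifetime. */
	if (failed)
		__trace_bputs(_THIS_IP_, "example: slow path failed\n");
}
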
423
424#ifdef CONFIG_TRACER_SNAPSHOT
425/**
 426 * tracing_snapshot - take a snapshot of the current buffer.
427 *
428 * This causes a swap between the snapshot buffer and the current live
429 * tracing buffer. You can use this to take snapshots of the live
430 * trace when some condition is triggered, but continue to trace.
431 *
 432 * Note: make sure the snapshot buffer is allocated beforehand, either
 433 * with tracing_snapshot_alloc() or manually with:
 434 * echo 1 > /sys/kernel/debug/tracing/snapshot
 435 *
 436 * If the snapshot buffer is not allocated, this will stop tracing,
 437 * effectively making a permanent snapshot.
438 */
439void tracing_snapshot(void)
440{
441 struct trace_array *tr = &global_trace;
442 struct tracer *tracer = tr->current_trace;
443 unsigned long flags;
444
445 if (in_nmi()) {
446 internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
447 internal_trace_puts("*** snapshot is being ignored ***\n");
448 return;
449 }
450
451 if (!tr->allocated_snapshot) {
452 internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
453 internal_trace_puts("*** stopping trace here! ***\n");
454 tracing_off();
455 return;
456 }
457
 458	/* Note: the snapshot cannot be used when the tracer itself uses it */
459 if (tracer->use_max_tr) {
460 internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
461 internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
462 return;
463 }
464
465 local_irq_save(flags);
466 update_max_tr(tr, current, smp_processor_id());
467 local_irq_restore(flags);
468}
469EXPORT_SYMBOL_GPL(tracing_snapshot);
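
As the comment above spells out, tracing_snapshot() swaps the live buffer with a pre-allocated spare, refuses to run from NMI context, and turns tracing off if the spare was never allocated. A sketch of a condition-triggered snapshot from atomic (non-NMI) context, assuming the spare buffer was set up beforehand (alloc_snapshot on the kernel command line, 'echo 1 > snapshot', or tracing_snapshot_alloc()); the watchdog function and threshold are hypothetical:

/* Sketch: freeze the trace the first time a latency threshold is crossed. */
static void example_latency_check(u64 latency_ns)
{
	static bool snapped;

	if (latency_ns > 500 * NSEC_PER_USEC && !snapped) {
		snapped = true;
		tracing_snapshot();	/* swap live and spare buffers, keep tracing */
	}
}
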
470
471static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
472 struct trace_buffer *size_buf, int cpu_id);
473static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
474
475static int alloc_snapshot(struct trace_array *tr)
476{
477 int ret;
478
479 if (!tr->allocated_snapshot) {
480
481 /* allocate spare buffer */
482 ret = resize_buffer_duplicate_size(&tr->max_buffer,
483 &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
484 if (ret < 0)
485 return ret;
486
487 tr->allocated_snapshot = true;
488 }
489
490 return 0;
491}
492
493void free_snapshot(struct trace_array *tr)
494{
495 /*
 496	 * We don't free the ring buffer; instead, we resize it because
 497	 * the max_tr ring buffer has some state (e.g. ring->clock) and
 498	 * we want to preserve it.
499 */
500 ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
501 set_buffer_entries(&tr->max_buffer, 1);
502 tracing_reset_online_cpus(&tr->max_buffer);
503 tr->allocated_snapshot = false;
504}
505
506/**
 507 * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
 508 *
 509 * This is similar to tracing_snapshot(), but it will allocate the
510 * snapshot buffer if it isn't already allocated. Use this only
511 * where it is safe to sleep, as the allocation may sleep.
512 *
513 * This causes a swap between the snapshot buffer and the current live
514 * tracing buffer. You can use this to take snapshots of the live
515 * trace when some condition is triggered, but continue to trace.
516 */
517void tracing_snapshot_alloc(void)
518{
519 struct trace_array *tr = &global_trace;
520 int ret;
521
522 ret = alloc_snapshot(tr);
523 if (WARN_ON(ret < 0))
524 return;
525
526 tracing_snapshot();
527}
528EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
529#else
530void tracing_snapshot(void)
531{
532 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
533}
534EXPORT_SYMBOL_GPL(tracing_snapshot);
535void tracing_snapshot_alloc(void)
536{
537 /* Give warning */
538 tracing_snapshot();
539}
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */
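
Where sleeping is allowed, tracing_snapshot_alloc() removes the pre-allocation requirement: it resizes the spare buffer on first use and then takes the snapshot. A minimal sketch from process context (the init function is hypothetical); once this has run, later triggers can call tracing_snapshot() directly, as in the sketch above:

/* Sketch: allocate the spare buffer and grab an initial snapshot at load time. */
static int __init example_snapshot_setup(void)
{
	tracing_snapshot_alloc();	/* may sleep while the spare buffer is resized */
	return 0;
}
late_initcall(example_snapshot_setup);
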
542
543/**
388 * tracing_off - turn off tracing buffers 544 * tracing_off - turn off tracing buffers
389 * 545 *
390 * This function stops the tracing buffers from recording data. 546 * This function stops the tracing buffers from recording data.
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
394 */ 550 */
395void tracing_off(void) 551void tracing_off(void)
396{ 552{
397 if (global_trace.buffer) 553 if (global_trace.trace_buffer.buffer)
398 ring_buffer_record_off(global_trace.buffer); 554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
399 /* 555 /*
400 * This flag is only looked at when buffers haven't been 556 * This flag is only looked at when buffers haven't been
401 * allocated yet. We don't really care about the race 557 * allocated yet. We don't really care about the race
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
411 */ 567 */
412int tracing_is_on(void) 568int tracing_is_on(void)
413{ 569{
414 if (global_trace.buffer) 570 if (global_trace.trace_buffer.buffer)
415 return ring_buffer_record_is_on(global_trace.buffer); 571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
416 return !global_trace.buffer_disabled; 572 return !global_trace.buffer_disabled;
417} 573}
418EXPORT_SYMBOL_GPL(tracing_is_on); 574EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -479,6 +635,7 @@ static const char *trace_options[] = {
479 "disable_on_free", 635 "disable_on_free",
480 "irq-info", 636 "irq-info",
481 "markers", 637 "markers",
638 "function-trace",
482 NULL 639 NULL
483}; 640};
484 641
@@ -490,6 +647,8 @@ static struct {
490 { trace_clock_local, "local", 1 }, 647 { trace_clock_local, "local", 1 },
491 { trace_clock_global, "global", 1 }, 648 { trace_clock_global, "global", 1 },
492 { trace_clock_counter, "counter", 0 }, 649 { trace_clock_counter, "counter", 0 },
650 { trace_clock_jiffies, "uptime", 1 },
651 { trace_clock, "perf", 1 },
493 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
494}; 653};
495 654
@@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency;
670static void 829static void
671__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 830__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
672{ 831{
673 struct trace_array_cpu *data = tr->data[cpu]; 832 struct trace_buffer *trace_buf = &tr->trace_buffer;
674 struct trace_array_cpu *max_data; 833 struct trace_buffer *max_buf = &tr->max_buffer;
834 struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
835 struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
675 836
676 max_tr.cpu = cpu; 837 max_buf->cpu = cpu;
677 max_tr.time_start = data->preempt_timestamp; 838 max_buf->time_start = data->preempt_timestamp;
678 839
679 max_data = max_tr.data[cpu];
680 max_data->saved_latency = tracing_max_latency; 840 max_data->saved_latency = tracing_max_latency;
681 max_data->critical_start = data->critical_start; 841 max_data->critical_start = data->critical_start;
682 max_data->critical_end = data->critical_end; 842 max_data->critical_end = data->critical_end;
@@ -706,22 +866,22 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
706{ 866{
707 struct ring_buffer *buf; 867 struct ring_buffer *buf;
708 868
709 if (trace_stop_count) 869 if (tr->stop_count)
710 return; 870 return;
711 871
712 WARN_ON_ONCE(!irqs_disabled()); 872 WARN_ON_ONCE(!irqs_disabled());
713 873
714 if (!current_trace->allocated_snapshot) { 874 if (!tr->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */ 875 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace); 876 WARN_ON_ONCE(tr->current_trace != &nop_trace);
717 return; 877 return;
718 } 878 }
719 879
720 arch_spin_lock(&ftrace_max_lock); 880 arch_spin_lock(&ftrace_max_lock);
721 881
722 buf = tr->buffer; 882 buf = tr->trace_buffer.buffer;
723 tr->buffer = max_tr.buffer; 883 tr->trace_buffer.buffer = tr->max_buffer.buffer;
724 max_tr.buffer = buf; 884 tr->max_buffer.buffer = buf;
725 885
726 __update_max_tr(tr, tsk, cpu); 886 __update_max_tr(tr, tsk, cpu);
727 arch_spin_unlock(&ftrace_max_lock); 887 arch_spin_unlock(&ftrace_max_lock);
@@ -740,19 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
740{ 900{
741 int ret; 901 int ret;
742 902
743 if (trace_stop_count) 903 if (tr->stop_count)
744 return; 904 return;
745 905
746 WARN_ON_ONCE(!irqs_disabled()); 906 WARN_ON_ONCE(!irqs_disabled());
747 if (!current_trace->allocated_snapshot) { 907 if (!tr->allocated_snapshot) {
748 /* Only the nop tracer should hit this when disabling */ 908 /* Only the nop tracer should hit this when disabling */
749 WARN_ON_ONCE(current_trace != &nop_trace); 909 WARN_ON_ONCE(tr->current_trace != &nop_trace);
750 return; 910 return;
751 } 911 }
752 912
753 arch_spin_lock(&ftrace_max_lock); 913 arch_spin_lock(&ftrace_max_lock);
754 914
755 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 915 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
756 916
757 if (ret == -EBUSY) { 917 if (ret == -EBUSY) {
758 /* 918 /*
@@ -761,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
761 * the max trace buffer (no one writes directly to it) 921 * the max trace buffer (no one writes directly to it)
762 * and flag that it failed. 922 * and flag that it failed.
763 */ 923 */
764 trace_array_printk(&max_tr, _THIS_IP_, 924 trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
765 "Failed to swap buffers due to commit in progress\n"); 925 "Failed to swap buffers due to commit in progress\n");
766 } 926 }
767 927
@@ -774,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
774 934
775static void default_wait_pipe(struct trace_iterator *iter) 935static void default_wait_pipe(struct trace_iterator *iter)
776{ 936{
777 DEFINE_WAIT(wait); 937 /* Iterators are static, they should be filled or empty */
938 if (trace_buffer_iter(iter, iter->cpu_file))
939 return;
940
941 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
942}
943
944#ifdef CONFIG_FTRACE_STARTUP_TEST
945static int run_tracer_selftest(struct tracer *type)
946{
947 struct trace_array *tr = &global_trace;
948 struct tracer *saved_tracer = tr->current_trace;
949 int ret;
778 950
779 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); 951 if (!type->selftest || tracing_selftest_disabled)
952 return 0;
780 953
781 /* 954 /*
782 * The events can happen in critical sections where 955 * Run a selftest on this tracer.
783 * checking a work queue can cause deadlocks. 956 * Here we reset the trace buffer, and set the current
784 * After adding a task to the queue, this flag is set 957 * tracer to be this tracer. The tracer can then run some
785 * only to notify events to try to wake up the queue 958 * internal tracing to verify that everything is in order.
786 * using irq_work. 959 * If we fail, we do not register this tracer.
787 *
788 * We don't clear it even if the buffer is no longer
789 * empty. The flag only causes the next event to run
790 * irq_work to do the work queue wake up. The worse
791 * that can happen if we race with !trace_empty() is that
792 * an event will cause an irq_work to try to wake up
793 * an empty queue.
794 *
795 * There's no reason to protect this flag either, as
796 * the work queue and irq_work logic will do the necessary
797 * synchronization for the wake ups. The only thing
798 * that is necessary is that the wake up happens after
799 * a task has been queued. It's OK for spurious wake ups.
800 */ 960 */
801 trace_wakeup_needed = true; 961 tracing_reset_online_cpus(&tr->trace_buffer);
802 962
803 if (trace_empty(iter)) 963 tr->current_trace = type;
804 schedule();
805 964
806 finish_wait(&trace_wait, &wait); 965#ifdef CONFIG_TRACER_MAX_TRACE
966 if (type->use_max_tr) {
967 /* If we expanded the buffers, make sure the max is expanded too */
968 if (ring_buffer_expanded)
969 ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
970 RING_BUFFER_ALL_CPUS);
971 tr->allocated_snapshot = true;
972 }
973#endif
974
975 /* the test is responsible for initializing and enabling */
976 pr_info("Testing tracer %s: ", type->name);
977 ret = type->selftest(type, tr);
978 /* the test is responsible for resetting too */
979 tr->current_trace = saved_tracer;
980 if (ret) {
981 printk(KERN_CONT "FAILED!\n");
982 /* Add the warning after printing 'FAILED' */
983 WARN_ON(1);
984 return -1;
985 }
986 /* Only reset on passing, to avoid touching corrupted buffers */
987 tracing_reset_online_cpus(&tr->trace_buffer);
988
989#ifdef CONFIG_TRACER_MAX_TRACE
990 if (type->use_max_tr) {
991 tr->allocated_snapshot = false;
992
993 /* Shrink the max buffer again */
994 if (ring_buffer_expanded)
995 ring_buffer_resize(tr->max_buffer.buffer, 1,
996 RING_BUFFER_ALL_CPUS);
997 }
998#endif
999
1000 printk(KERN_CONT "PASSED\n");
1001 return 0;
1002}
1003#else
1004static inline int run_tracer_selftest(struct tracer *type)
1005{
1006 return 0;
807} 1007}
1008#endif /* CONFIG_FTRACE_STARTUP_TEST */
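
run_tracer_selftest() factors the old inline self-test out of register_tracer(): it installs the candidate as tr->current_trace, lets the tracer's ->selftest() callback drive and verify itself, and only resets the buffers when the test passes. A hedged sketch of the shape of a tracer that opts in (only the fields relevant here are shown, and the trivial callback bodies are placeholders):

static int example_tracer_init(struct trace_array *tr)
{
	return 0;		/* start feeding events into tr->trace_buffer */
}

static void example_tracer_reset(struct trace_array *tr)
{
	/* stop tracing and undo whatever init set up */
}

#ifdef CONFIG_FTRACE_STARTUP_TEST
static int example_tracer_selftest(struct tracer *trace, struct trace_array *tr)
{
	/* drive the tracer, then verify what landed in the buffer */
	return 0;		/* non-zero makes register_tracer() reject the tracer */
}
#endif

static struct tracer example_tracer __read_mostly = {
	.name		= "example",
	.init		= example_tracer_init,
	.reset		= example_tracer_reset,
#ifdef CONFIG_FTRACE_STARTUP_TEST
	.selftest	= example_tracer_selftest,
#endif
};

Registration would then go through register_tracer(&example_tracer), which runs the self-test before linking the tracer into trace_types.
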
808 1009
809/** 1010/**
810 * register_tracer - register a tracer with the ftrace system. 1011 * register_tracer - register a tracer with the ftrace system.
@@ -851,57 +1052,9 @@ int register_tracer(struct tracer *type)
851 if (!type->wait_pipe) 1052 if (!type->wait_pipe)
852 type->wait_pipe = default_wait_pipe; 1053 type->wait_pipe = default_wait_pipe;
853 1054
854 1055 ret = run_tracer_selftest(type);
855#ifdef CONFIG_FTRACE_STARTUP_TEST 1056 if (ret < 0)
856 if (type->selftest && !tracing_selftest_disabled) { 1057 goto out;
857 struct tracer *saved_tracer = current_trace;
858 struct trace_array *tr = &global_trace;
859
860 /*
861 * Run a selftest on this tracer.
862 * Here we reset the trace buffer, and set the current
863 * tracer to be this tracer. The tracer can then run some
864 * internal tracing to verify that everything is in order.
865 * If we fail, we do not register this tracer.
866 */
867 tracing_reset_online_cpus(tr);
868
869 current_trace = type;
870
871 if (type->use_max_tr) {
872 /* If we expanded the buffers, make sure the max is expanded too */
873 if (ring_buffer_expanded)
874 ring_buffer_resize(max_tr.buffer, trace_buf_size,
875 RING_BUFFER_ALL_CPUS);
876 type->allocated_snapshot = true;
877 }
878
879 /* the test is responsible for initializing and enabling */
880 pr_info("Testing tracer %s: ", type->name);
881 ret = type->selftest(type, tr);
882 /* the test is responsible for resetting too */
883 current_trace = saved_tracer;
884 if (ret) {
885 printk(KERN_CONT "FAILED!\n");
886 /* Add the warning after printing 'FAILED' */
887 WARN_ON(1);
888 goto out;
889 }
890 /* Only reset on passing, to avoid touching corrupted buffers */
891 tracing_reset_online_cpus(tr);
892
893 if (type->use_max_tr) {
894 type->allocated_snapshot = false;
895
896 /* Shrink the max buffer again */
897 if (ring_buffer_expanded)
898 ring_buffer_resize(max_tr.buffer, 1,
899 RING_BUFFER_ALL_CPUS);
900 }
901
902 printk(KERN_CONT "PASSED\n");
903 }
904#endif
905 1058
906 type->next = trace_types; 1059 type->next = trace_types;
907 trace_types = type; 1060 trace_types = type;
@@ -921,7 +1074,7 @@ int register_tracer(struct tracer *type)
921 tracing_set_tracer(type->name); 1074 tracing_set_tracer(type->name);
922 default_bootup_tracer = NULL; 1075 default_bootup_tracer = NULL;
923 /* disable other selftests, since this will break it. */ 1076 /* disable other selftests, since this will break it. */
924 tracing_selftest_disabled = 1; 1077 tracing_selftest_disabled = true;
925#ifdef CONFIG_FTRACE_STARTUP_TEST 1078#ifdef CONFIG_FTRACE_STARTUP_TEST
926 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", 1079 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
927 type->name); 1080 type->name);
@@ -931,9 +1084,9 @@ int register_tracer(struct tracer *type)
931 return ret; 1084 return ret;
932} 1085}
933 1086
934void tracing_reset(struct trace_array *tr, int cpu) 1087void tracing_reset(struct trace_buffer *buf, int cpu)
935{ 1088{
936 struct ring_buffer *buffer = tr->buffer; 1089 struct ring_buffer *buffer = buf->buffer;
937 1090
938 if (!buffer) 1091 if (!buffer)
939 return; 1092 return;
@@ -947,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
947 ring_buffer_record_enable(buffer); 1100 ring_buffer_record_enable(buffer);
948} 1101}
949 1102
950void tracing_reset_online_cpus(struct trace_array *tr) 1103void tracing_reset_online_cpus(struct trace_buffer *buf)
951{ 1104{
952 struct ring_buffer *buffer = tr->buffer; 1105 struct ring_buffer *buffer = buf->buffer;
953 int cpu; 1106 int cpu;
954 1107
955 if (!buffer) 1108 if (!buffer)
@@ -960,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
960 /* Make sure all commits have finished */ 1113 /* Make sure all commits have finished */
961 synchronize_sched(); 1114 synchronize_sched();
962 1115
963 tr->time_start = ftrace_now(tr->cpu); 1116 buf->time_start = ftrace_now(buf->cpu);
964 1117
965 for_each_online_cpu(cpu) 1118 for_each_online_cpu(cpu)
966 ring_buffer_reset_cpu(buffer, cpu); 1119 ring_buffer_reset_cpu(buffer, cpu);
@@ -970,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)
970 1123
971void tracing_reset_current(int cpu) 1124void tracing_reset_current(int cpu)
972{ 1125{
973 tracing_reset(&global_trace, cpu); 1126 tracing_reset(&global_trace.trace_buffer, cpu);
974} 1127}
975 1128
976void tracing_reset_current_online_cpus(void) 1129void tracing_reset_all_online_cpus(void)
977{ 1130{
978 tracing_reset_online_cpus(&global_trace); 1131 struct trace_array *tr;
1132
1133 mutex_lock(&trace_types_lock);
1134 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1135 tracing_reset_online_cpus(&tr->trace_buffer);
1136#ifdef CONFIG_TRACER_MAX_TRACE
1137 tracing_reset_online_cpus(&tr->max_buffer);
1138#endif
1139 }
1140 mutex_unlock(&trace_types_lock);
979} 1141}
980 1142
981#define SAVED_CMDLINES 128 1143#define SAVED_CMDLINES 128
@@ -998,7 +1160,7 @@ static void trace_init_cmdlines(void)
998 1160
999int is_tracing_stopped(void) 1161int is_tracing_stopped(void)
1000{ 1162{
1001 return trace_stop_count; 1163 return global_trace.stop_count;
1002} 1164}
1003 1165
1004/** 1166/**
@@ -1030,12 +1192,12 @@ void tracing_start(void)
1030 if (tracing_disabled) 1192 if (tracing_disabled)
1031 return; 1193 return;
1032 1194
1033 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1195 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1034 if (--trace_stop_count) { 1196 if (--global_trace.stop_count) {
1035 if (trace_stop_count < 0) { 1197 if (global_trace.stop_count < 0) {
1036 /* Someone screwed up their debugging */ 1198 /* Someone screwed up their debugging */
1037 WARN_ON_ONCE(1); 1199 WARN_ON_ONCE(1);
1038 trace_stop_count = 0; 1200 global_trace.stop_count = 0;
1039 } 1201 }
1040 goto out; 1202 goto out;
1041 } 1203 }
@@ -1043,19 +1205,52 @@ void tracing_start(void)
1043 /* Prevent the buffers from switching */ 1205 /* Prevent the buffers from switching */
1044 arch_spin_lock(&ftrace_max_lock); 1206 arch_spin_lock(&ftrace_max_lock);
1045 1207
1046 buffer = global_trace.buffer; 1208 buffer = global_trace.trace_buffer.buffer;
1047 if (buffer) 1209 if (buffer)
1048 ring_buffer_record_enable(buffer); 1210 ring_buffer_record_enable(buffer);
1049 1211
1050 buffer = max_tr.buffer; 1212#ifdef CONFIG_TRACER_MAX_TRACE
1213 buffer = global_trace.max_buffer.buffer;
1051 if (buffer) 1214 if (buffer)
1052 ring_buffer_record_enable(buffer); 1215 ring_buffer_record_enable(buffer);
1216#endif
1053 1217
1054 arch_spin_unlock(&ftrace_max_lock); 1218 arch_spin_unlock(&ftrace_max_lock);
1055 1219
1056 ftrace_start(); 1220 ftrace_start();
1057 out: 1221 out:
1058 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1222 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1223}
1224
1225static void tracing_start_tr(struct trace_array *tr)
1226{
1227 struct ring_buffer *buffer;
1228 unsigned long flags;
1229
1230 if (tracing_disabled)
1231 return;
1232
1233 /* If global, we need to also start the max tracer */
1234 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1235 return tracing_start();
1236
1237 raw_spin_lock_irqsave(&tr->start_lock, flags);
1238
1239 if (--tr->stop_count) {
1240 if (tr->stop_count < 0) {
1241 /* Someone screwed up their debugging */
1242 WARN_ON_ONCE(1);
1243 tr->stop_count = 0;
1244 }
1245 goto out;
1246 }
1247
1248 buffer = tr->trace_buffer.buffer;
1249 if (buffer)
1250 ring_buffer_record_enable(buffer);
1251
1252 out:
1253 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1059} 1254}
1060 1255
1061/** 1256/**
@@ -1070,25 +1265,48 @@ void tracing_stop(void)
1070 unsigned long flags; 1265 unsigned long flags;
1071 1266
1072 ftrace_stop(); 1267 ftrace_stop();
1073 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1268 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1074 if (trace_stop_count++) 1269 if (global_trace.stop_count++)
1075 goto out; 1270 goto out;
1076 1271
1077 /* Prevent the buffers from switching */ 1272 /* Prevent the buffers from switching */
1078 arch_spin_lock(&ftrace_max_lock); 1273 arch_spin_lock(&ftrace_max_lock);
1079 1274
1080 buffer = global_trace.buffer; 1275 buffer = global_trace.trace_buffer.buffer;
1081 if (buffer) 1276 if (buffer)
1082 ring_buffer_record_disable(buffer); 1277 ring_buffer_record_disable(buffer);
1083 1278
1084 buffer = max_tr.buffer; 1279#ifdef CONFIG_TRACER_MAX_TRACE
1280 buffer = global_trace.max_buffer.buffer;
1085 if (buffer) 1281 if (buffer)
1086 ring_buffer_record_disable(buffer); 1282 ring_buffer_record_disable(buffer);
1283#endif
1087 1284
1088 arch_spin_unlock(&ftrace_max_lock); 1285 arch_spin_unlock(&ftrace_max_lock);
1089 1286
1090 out: 1287 out:
1091 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1288 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1289}
1290
1291static void tracing_stop_tr(struct trace_array *tr)
1292{
1293 struct ring_buffer *buffer;
1294 unsigned long flags;
1295
1296 /* If global, we need to also stop the max tracer */
1297 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1298 return tracing_stop();
1299
1300 raw_spin_lock_irqsave(&tr->start_lock, flags);
1301 if (tr->stop_count++)
1302 goto out;
1303
1304 buffer = tr->trace_buffer.buffer;
1305 if (buffer)
1306 ring_buffer_record_disable(buffer);
1307
1308 out:
1309 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1092} 1310}
1093 1311
1094void trace_stop_cmdline_recording(void); 1312void trace_stop_cmdline_recording(void);
@@ -1221,11 +1439,6 @@ void
1221__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 1439__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1222{ 1440{
1223 __this_cpu_write(trace_cmdline_save, true); 1441 __this_cpu_write(trace_cmdline_save, true);
1224 if (trace_wakeup_needed) {
1225 trace_wakeup_needed = false;
1226 /* irq_work_queue() supplies it's own memory barriers */
1227 irq_work_queue(&trace_work_wakeup);
1228 }
1229 ring_buffer_unlock_commit(buffer, event); 1442 ring_buffer_unlock_commit(buffer, event);
1230} 1443}
1231 1444
@@ -1249,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1249EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1462EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1250 1463
1251struct ring_buffer_event * 1464struct ring_buffer_event *
1465trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1466 struct ftrace_event_file *ftrace_file,
1467 int type, unsigned long len,
1468 unsigned long flags, int pc)
1469{
1470 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1471 return trace_buffer_lock_reserve(*current_rb,
1472 type, len, flags, pc);
1473}
1474EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1475
1476struct ring_buffer_event *
1252trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1477trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1253 int type, unsigned long len, 1478 int type, unsigned long len,
1254 unsigned long flags, int pc) 1479 unsigned long flags, int pc)
1255{ 1480{
1256 *current_rb = global_trace.buffer; 1481 *current_rb = global_trace.trace_buffer.buffer;
1257 return trace_buffer_lock_reserve(*current_rb, 1482 return trace_buffer_lock_reserve(*current_rb,
1258 type, len, flags, pc); 1483 type, len, flags, pc);
1259} 1484}
@@ -1292,7 +1517,7 @@ trace_function(struct trace_array *tr,
1292 int pc) 1517 int pc)
1293{ 1518{
1294 struct ftrace_event_call *call = &event_function; 1519 struct ftrace_event_call *call = &event_function;
1295 struct ring_buffer *buffer = tr->buffer; 1520 struct ring_buffer *buffer = tr->trace_buffer.buffer;
1296 struct ring_buffer_event *event; 1521 struct ring_buffer_event *event;
1297 struct ftrace_entry *entry; 1522 struct ftrace_entry *entry;
1298 1523
@@ -1433,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1433void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1658void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1434 int pc) 1659 int pc)
1435{ 1660{
1436 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); 1661 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
1437} 1662}
1438 1663
1439/** 1664/**
1440 * trace_dump_stack - record a stack back trace in the trace buffer 1665 * trace_dump_stack - record a stack back trace in the trace buffer
1666 * @skip: Number of functions to skip (helper handlers)
1441 */ 1667 */
1442void trace_dump_stack(void) 1668void trace_dump_stack(int skip)
1443{ 1669{
1444 unsigned long flags; 1670 unsigned long flags;
1445 1671
@@ -1448,8 +1674,13 @@ void trace_dump_stack(void)
1448 1674
1449 local_save_flags(flags); 1675 local_save_flags(flags);
1450 1676
1451 /* skipping 3 traces, seems to get us at the caller of this function */ 1677 /*
1452 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); 1678 * Skip 3 more, seems to get us at the caller of
1679 * this function.
1680 */
1681 skip += 3;
1682 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
1683 flags, skip, preempt_count(), NULL);
1453} 1684}
1454 1685
1455static DEFINE_PER_CPU(int, user_stack_count); 1686static DEFINE_PER_CPU(int, user_stack_count);
@@ -1619,7 +1850,7 @@ void trace_printk_init_buffers(void)
1619 * directly here. If the global_trace.buffer is already 1850 * directly here. If the global_trace.buffer is already
1620 * allocated here, then this was called by module code. 1851 * allocated here, then this was called by module code.
1621 */ 1852 */
1622 if (global_trace.buffer) 1853 if (global_trace.trace_buffer.buffer)
1623 tracing_start_cmdline_record(); 1854 tracing_start_cmdline_record();
1624} 1855}
1625 1856
@@ -1679,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1679 1910
1680 local_save_flags(flags); 1911 local_save_flags(flags);
1681 size = sizeof(*entry) + sizeof(u32) * len; 1912 size = sizeof(*entry) + sizeof(u32) * len;
1682 buffer = tr->buffer; 1913 buffer = tr->trace_buffer.buffer;
1683 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1914 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1684 flags, pc); 1915 flags, pc);
1685 if (!event) 1916 if (!event)
@@ -1702,27 +1933,12 @@ out:
1702} 1933}
1703EXPORT_SYMBOL_GPL(trace_vbprintk); 1934EXPORT_SYMBOL_GPL(trace_vbprintk);
1704 1935
1705int trace_array_printk(struct trace_array *tr, 1936static int
1706 unsigned long ip, const char *fmt, ...) 1937__trace_array_vprintk(struct ring_buffer *buffer,
1707{ 1938 unsigned long ip, const char *fmt, va_list args)
1708 int ret;
1709 va_list ap;
1710
1711 if (!(trace_flags & TRACE_ITER_PRINTK))
1712 return 0;
1713
1714 va_start(ap, fmt);
1715 ret = trace_array_vprintk(tr, ip, fmt, ap);
1716 va_end(ap);
1717 return ret;
1718}
1719
1720int trace_array_vprintk(struct trace_array *tr,
1721 unsigned long ip, const char *fmt, va_list args)
1722{ 1939{
1723 struct ftrace_event_call *call = &event_print; 1940 struct ftrace_event_call *call = &event_print;
1724 struct ring_buffer_event *event; 1941 struct ring_buffer_event *event;
1725 struct ring_buffer *buffer;
1726 int len = 0, size, pc; 1942 int len = 0, size, pc;
1727 struct print_entry *entry; 1943 struct print_entry *entry;
1728 unsigned long flags; 1944 unsigned long flags;
@@ -1750,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,
1750 1966
1751 local_save_flags(flags); 1967 local_save_flags(flags);
1752 size = sizeof(*entry) + len + 1; 1968 size = sizeof(*entry) + len + 1;
1753 buffer = tr->buffer;
1754 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1969 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1755 flags, pc); 1970 flags, pc);
1756 if (!event) 1971 if (!event)
@@ -1771,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,
1771 return len; 1986 return len;
1772} 1987}
1773 1988
1989int trace_array_vprintk(struct trace_array *tr,
1990 unsigned long ip, const char *fmt, va_list args)
1991{
1992 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
1993}
1994
1995int trace_array_printk(struct trace_array *tr,
1996 unsigned long ip, const char *fmt, ...)
1997{
1998 int ret;
1999 va_list ap;
2000
2001 if (!(trace_flags & TRACE_ITER_PRINTK))
2002 return 0;
2003
2004 va_start(ap, fmt);
2005 ret = trace_array_vprintk(tr, ip, fmt, ap);
2006 va_end(ap);
2007 return ret;
2008}
2009
2010int trace_array_printk_buf(struct ring_buffer *buffer,
2011 unsigned long ip, const char *fmt, ...)
2012{
2013 int ret;
2014 va_list ap;
2015
2016 if (!(trace_flags & TRACE_ITER_PRINTK))
2017 return 0;
2018
2019 va_start(ap, fmt);
2020 ret = __trace_array_vprintk(buffer, ip, fmt, ap);
2021 va_end(ap);
2022 return ret;
2023}
2024
1774int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 2025int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1775{ 2026{
1776 return trace_array_vprintk(&global_trace, ip, fmt, args); 2027 return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -1796,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1796 if (buf_iter) 2047 if (buf_iter)
1797 event = ring_buffer_iter_peek(buf_iter, ts); 2048 event = ring_buffer_iter_peek(buf_iter, ts);
1798 else 2049 else
1799 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 2050 event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
1800 lost_events); 2051 lost_events);
1801 2052
1802 if (event) { 2053 if (event) {
@@ -1811,7 +2062,7 @@ static struct trace_entry *
1811__find_next_entry(struct trace_iterator *iter, int *ent_cpu, 2062__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1812 unsigned long *missing_events, u64 *ent_ts) 2063 unsigned long *missing_events, u64 *ent_ts)
1813{ 2064{
1814 struct ring_buffer *buffer = iter->tr->buffer; 2065 struct ring_buffer *buffer = iter->trace_buffer->buffer;
1815 struct trace_entry *ent, *next = NULL; 2066 struct trace_entry *ent, *next = NULL;
1816 unsigned long lost_events = 0, next_lost = 0; 2067 unsigned long lost_events = 0, next_lost = 0;
1817 int cpu_file = iter->cpu_file; 2068 int cpu_file = iter->cpu_file;
@@ -1824,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1824 * If we are in a per_cpu trace file, don't bother by iterating over 2075 * If we are in a per_cpu trace file, don't bother by iterating over
1825 * all cpu and peek directly. 2076 * all cpu and peek directly.
1826 */ 2077 */
1827 if (cpu_file > TRACE_PIPE_ALL_CPU) { 2078 if (cpu_file > RING_BUFFER_ALL_CPUS) {
1828 if (ring_buffer_empty_cpu(buffer, cpu_file)) 2079 if (ring_buffer_empty_cpu(buffer, cpu_file))
1829 return NULL; 2080 return NULL;
1830 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); 2081 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1888,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1888 2139
1889static void trace_consume(struct trace_iterator *iter) 2140static void trace_consume(struct trace_iterator *iter)
1890{ 2141{
1891 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 2142 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
1892 &iter->lost_events); 2143 &iter->lost_events);
1893} 2144}
1894 2145
@@ -1921,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1921 2172
1922void tracing_iter_reset(struct trace_iterator *iter, int cpu) 2173void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1923{ 2174{
1924 struct trace_array *tr = iter->tr;
1925 struct ring_buffer_event *event; 2175 struct ring_buffer_event *event;
1926 struct ring_buffer_iter *buf_iter; 2176 struct ring_buffer_iter *buf_iter;
1927 unsigned long entries = 0; 2177 unsigned long entries = 0;
1928 u64 ts; 2178 u64 ts;
1929 2179
1930 tr->data[cpu]->skipped_entries = 0; 2180 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
1931 2181
1932 buf_iter = trace_buffer_iter(iter, cpu); 2182 buf_iter = trace_buffer_iter(iter, cpu);
1933 if (!buf_iter) 2183 if (!buf_iter)
@@ -1941,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1941 * by the timestamp being before the start of the buffer. 2191 * by the timestamp being before the start of the buffer.
1942 */ 2192 */
1943 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { 2193 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1944 if (ts >= iter->tr->time_start) 2194 if (ts >= iter->trace_buffer->time_start)
1945 break; 2195 break;
1946 entries++; 2196 entries++;
1947 ring_buffer_read(buf_iter, NULL); 2197 ring_buffer_read(buf_iter, NULL);
1948 } 2198 }
1949 2199
1950 tr->data[cpu]->skipped_entries = entries; 2200 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
1951} 2201}
1952 2202
1953/* 2203/*
@@ -1957,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1957static void *s_start(struct seq_file *m, loff_t *pos) 2207static void *s_start(struct seq_file *m, loff_t *pos)
1958{ 2208{
1959 struct trace_iterator *iter = m->private; 2209 struct trace_iterator *iter = m->private;
2210 struct trace_array *tr = iter->tr;
1960 int cpu_file = iter->cpu_file; 2211 int cpu_file = iter->cpu_file;
1961 void *p = NULL; 2212 void *p = NULL;
1962 loff_t l = 0; 2213 loff_t l = 0;
@@ -1969,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1969 * will point to the same string as current_trace->name. 2220 * will point to the same string as current_trace->name.
1970 */ 2221 */
1971 mutex_lock(&trace_types_lock); 2222 mutex_lock(&trace_types_lock);
1972 if (unlikely(current_trace && iter->trace->name != current_trace->name)) 2223 if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
1973 *iter->trace = *current_trace; 2224 *iter->trace = *tr->current_trace;
1974 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
1975 2226
2227#ifdef CONFIG_TRACER_MAX_TRACE
1976 if (iter->snapshot && iter->trace->use_max_tr) 2228 if (iter->snapshot && iter->trace->use_max_tr)
1977 return ERR_PTR(-EBUSY); 2229 return ERR_PTR(-EBUSY);
2230#endif
1978 2231
1979 if (!iter->snapshot) 2232 if (!iter->snapshot)
1980 atomic_inc(&trace_record_cmdline_disabled); 2233 atomic_inc(&trace_record_cmdline_disabled);
@@ -1984,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1984 iter->cpu = 0; 2237 iter->cpu = 0;
1985 iter->idx = -1; 2238 iter->idx = -1;
1986 2239
1987 if (cpu_file == TRACE_PIPE_ALL_CPU) { 2240 if (cpu_file == RING_BUFFER_ALL_CPUS) {
1988 for_each_tracing_cpu(cpu) 2241 for_each_tracing_cpu(cpu)
1989 tracing_iter_reset(iter, cpu); 2242 tracing_iter_reset(iter, cpu);
1990 } else 2243 } else
@@ -2016,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)
2016{ 2269{
2017 struct trace_iterator *iter = m->private; 2270 struct trace_iterator *iter = m->private;
2018 2271
2272#ifdef CONFIG_TRACER_MAX_TRACE
2019 if (iter->snapshot && iter->trace->use_max_tr) 2273 if (iter->snapshot && iter->trace->use_max_tr)
2020 return; 2274 return;
2275#endif
2021 2276
2022 if (!iter->snapshot) 2277 if (!iter->snapshot)
2023 atomic_dec(&trace_record_cmdline_disabled); 2278 atomic_dec(&trace_record_cmdline_disabled);
2279
2024 trace_access_unlock(iter->cpu_file); 2280 trace_access_unlock(iter->cpu_file);
2025 trace_event_read_unlock(); 2281 trace_event_read_unlock();
2026} 2282}
2027 2283
2028static void 2284static void
2029get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) 2285get_total_entries(struct trace_buffer *buf,
2286 unsigned long *total, unsigned long *entries)
2030{ 2287{
2031 unsigned long count; 2288 unsigned long count;
2032 int cpu; 2289 int cpu;
@@ -2035,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
2035 *entries = 0; 2292 *entries = 0;
2036 2293
2037 for_each_tracing_cpu(cpu) { 2294 for_each_tracing_cpu(cpu) {
2038 count = ring_buffer_entries_cpu(tr->buffer, cpu); 2295 count = ring_buffer_entries_cpu(buf->buffer, cpu);
2039 /* 2296 /*
2040 * If this buffer has skipped entries, then we hold all 2297 * If this buffer has skipped entries, then we hold all
2041 * entries for the trace and we need to ignore the 2298 * entries for the trace and we need to ignore the
2042 * ones before the time stamp. 2299 * ones before the time stamp.
2043 */ 2300 */
2044 if (tr->data[cpu]->skipped_entries) { 2301 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
2045 count -= tr->data[cpu]->skipped_entries; 2302 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
2046 /* total is the same as the entries */ 2303 /* total is the same as the entries */
2047 *total += count; 2304 *total += count;
2048 } else 2305 } else
2049 *total += count + 2306 *total += count +
2050 ring_buffer_overrun_cpu(tr->buffer, cpu); 2307 ring_buffer_overrun_cpu(buf->buffer, cpu);
2051 *entries += count; 2308 *entries += count;
2052 } 2309 }
2053} 2310}
@@ -2064,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)
2064 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2321 seq_puts(m, "# \\ / ||||| \\ | / \n");
2065} 2322}
2066 2323
2067static void print_event_info(struct trace_array *tr, struct seq_file *m) 2324static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2068{ 2325{
2069 unsigned long total; 2326 unsigned long total;
2070 unsigned long entries; 2327 unsigned long entries;
2071 2328
2072 get_total_entries(tr, &total, &entries); 2329 get_total_entries(buf, &total, &entries);
2073 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", 2330 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2074 entries, total, num_online_cpus()); 2331 entries, total, num_online_cpus());
2075 seq_puts(m, "#\n"); 2332 seq_puts(m, "#\n");
2076} 2333}
2077 2334
2078static void print_func_help_header(struct trace_array *tr, struct seq_file *m) 2335static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2079{ 2336{
2080 print_event_info(tr, m); 2337 print_event_info(buf, m);
2081 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2338 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2082 seq_puts(m, "# | | | | |\n"); 2339 seq_puts(m, "# | | | | |\n");
2083} 2340}
2084 2341
2085static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) 2342static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2086{ 2343{
2087 print_event_info(tr, m); 2344 print_event_info(buf, m);
2088 seq_puts(m, "# _-----=> irqs-off\n"); 2345 seq_puts(m, "# _-----=> irqs-off\n");
2089 seq_puts(m, "# / _----=> need-resched\n"); 2346 seq_puts(m, "# / _----=> need-resched\n");
2090 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2347 seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -2098,16 +2355,16 @@ void
2098print_trace_header(struct seq_file *m, struct trace_iterator *iter) 2355print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2099{ 2356{
2100 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 2357 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
2101 struct trace_array *tr = iter->tr; 2358 struct trace_buffer *buf = iter->trace_buffer;
2102 struct trace_array_cpu *data = tr->data[tr->cpu]; 2359 struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
2103 struct tracer *type = current_trace; 2360 struct tracer *type = iter->trace;
2104 unsigned long entries; 2361 unsigned long entries;
2105 unsigned long total; 2362 unsigned long total;
2106 const char *name = "preemption"; 2363 const char *name = "preemption";
2107 2364
2108 name = type->name; 2365 name = type->name;
2109 2366
2110 get_total_entries(tr, &total, &entries); 2367 get_total_entries(buf, &total, &entries);
2111 2368
2112 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 2369 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2113 name, UTS_RELEASE); 2370 name, UTS_RELEASE);
@@ -2118,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2118 nsecs_to_usecs(data->saved_latency), 2375 nsecs_to_usecs(data->saved_latency),
2119 entries, 2376 entries,
2120 total, 2377 total,
2121 tr->cpu, 2378 buf->cpu,
2122#if defined(CONFIG_PREEMPT_NONE) 2379#if defined(CONFIG_PREEMPT_NONE)
2123 "server", 2380 "server",
2124#elif defined(CONFIG_PREEMPT_VOLUNTARY) 2381#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2169,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
2169 if (cpumask_test_cpu(iter->cpu, iter->started)) 2426 if (cpumask_test_cpu(iter->cpu, iter->started))
2170 return; 2427 return;
2171 2428
2172 if (iter->tr->data[iter->cpu]->skipped_entries) 2429 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
2173 return; 2430 return;
2174 2431
2175 cpumask_set_cpu(iter->cpu, iter->started); 2432 cpumask_set_cpu(iter->cpu, iter->started);
@@ -2292,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)
2292 int cpu; 2549 int cpu;
2293 2550
2294 /* If we are looking at one CPU buffer, only check that one */ 2551 /* If we are looking at one CPU buffer, only check that one */
2295 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2552 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
2296 cpu = iter->cpu_file; 2553 cpu = iter->cpu_file;
2297 buf_iter = trace_buffer_iter(iter, cpu); 2554 buf_iter = trace_buffer_iter(iter, cpu);
2298 if (buf_iter) { 2555 if (buf_iter) {
2299 if (!ring_buffer_iter_empty(buf_iter)) 2556 if (!ring_buffer_iter_empty(buf_iter))
2300 return 0; 2557 return 0;
2301 } else { 2558 } else {
2302 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2559 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2303 return 0; 2560 return 0;
2304 } 2561 }
2305 return 1; 2562 return 1;
@@ -2311,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)
2311 if (!ring_buffer_iter_empty(buf_iter)) 2568 if (!ring_buffer_iter_empty(buf_iter))
2312 return 0; 2569 return 0;
2313 } else { 2570 } else {
2314 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2571 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2315 return 0; 2572 return 0;
2316 } 2573 }
2317 } 2574 }
@@ -2335,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2335 return ret; 2592 return ret;
2336 } 2593 }
2337 2594
2595 if (iter->ent->type == TRACE_BPUTS &&
2596 trace_flags & TRACE_ITER_PRINTK &&
2597 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2598 return trace_print_bputs_msg_only(iter);
2599
2338 if (iter->ent->type == TRACE_BPRINT && 2600 if (iter->ent->type == TRACE_BPRINT &&
2339 trace_flags & TRACE_ITER_PRINTK && 2601 trace_flags & TRACE_ITER_PRINTK &&
2340 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 2602 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2389,9 +2651,9 @@ void trace_default_header(struct seq_file *m)
2389 } else { 2651 } else {
2390 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2652 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2391 if (trace_flags & TRACE_ITER_IRQ_INFO) 2653 if (trace_flags & TRACE_ITER_IRQ_INFO)
2392 print_func_help_header_irq(iter->tr, m); 2654 print_func_help_header_irq(iter->trace_buffer, m);
2393 else 2655 else
2394 print_func_help_header(iter->tr, m); 2656 print_func_help_header(iter->trace_buffer, m);
2395 } 2657 }
2396 } 2658 }
2397} 2659}
@@ -2405,14 +2667,8 @@ static void test_ftrace_alive(struct seq_file *m)
2405} 2667}
2406 2668
2407#ifdef CONFIG_TRACER_MAX_TRACE 2669#ifdef CONFIG_TRACER_MAX_TRACE
2408static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) 2670static void show_snapshot_main_help(struct seq_file *m)
2409{ 2671{
2410 if (iter->trace->allocated_snapshot)
2411 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2412 else
2413 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2414
2415 seq_printf(m, "# Snapshot commands:\n");
2416 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); 2672 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2417 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); 2673 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2418 seq_printf(m, "# Takes a snapshot of the main buffer.\n"); 2674 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
@@ -2420,6 +2676,35 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2420 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n"); 2676 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2421 seq_printf(m, "# is not a '0' or '1')\n"); 2677 seq_printf(m, "# is not a '0' or '1')\n");
2422} 2678}
2679
2680static void show_snapshot_percpu_help(struct seq_file *m)
2681{
2682 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2683#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2684 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2685 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
2686#else
2687 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
2688 seq_printf(m, "# Must use main snapshot file to allocate.\n");
2689#endif
2690 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
2691 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2692 seq_printf(m, "# is not a '0' or '1')\n");
2693}
2694
2695static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2696{
2697 if (iter->tr->allocated_snapshot)
2698 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2699 else
2700 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2701
2702 seq_printf(m, "# Snapshot commands:\n");
2703 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2704 show_snapshot_main_help(m);
2705 else
2706 show_snapshot_percpu_help(m);
2707}
2423#else 2708#else
2424/* Should never be called */ 2709/* Should never be called */
2425static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } 2710static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
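
The split above keeps the main snapshot semantics that the help text states: writing 0 frees the spare buffer, 1 allocates it if needed and takes a snapshot, and any other value clears the snapshot without freeing it. A small sketch that drives the top-level snapshot file from userspace, assuming debugfs is mounted at /sys/kernel/debug, CONFIG_TRACER_SNAPSHOT is enabled, and the program runs as root:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
    int fd = open(path, O_WRONLY);

    if (fd < 0 || write(fd, val, strlen(val)) < 0) {
        if (fd >= 0)
            close(fd);
        return -1;
    }
    return close(fd);
}

int main(void)
{
    const char *snap = "/sys/kernel/debug/tracing/snapshot";

    if (write_str(snap, "1"))          /* allocate if needed, then swap buffers */
        perror("snapshot");
    /* ... read the snapshot file here to inspect the captured trace ... */
    if (write_str(snap, "2"))          /* any value but 0/1: clear, keep the buffer */
        perror("clear");
    return write_str(snap, "0") ? 1 : 0;   /* free the spare buffer again */
}
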
@@ -2479,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {
2479static struct trace_iterator * 2764static struct trace_iterator *
2480__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2765__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2481{ 2766{
2482 long cpu_file = (long) inode->i_private; 2767 struct trace_cpu *tc = inode->i_private;
2768 struct trace_array *tr = tc->tr;
2483 struct trace_iterator *iter; 2769 struct trace_iterator *iter;
2484 int cpu; 2770 int cpu;
2485 2771
@@ -2504,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2504 if (!iter->trace) 2790 if (!iter->trace)
2505 goto fail; 2791 goto fail;
2506 2792
2507 *iter->trace = *current_trace; 2793 *iter->trace = *tr->current_trace;
2508 2794
2509 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2795 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2510 goto fail; 2796 goto fail;
2511 2797
2512 if (current_trace->print_max || snapshot) 2798 iter->tr = tr;
2513 iter->tr = &max_tr; 2799
2800#ifdef CONFIG_TRACER_MAX_TRACE
2801 /* Currently only the top directory has a snapshot */
2802 if (tr->current_trace->print_max || snapshot)
2803 iter->trace_buffer = &tr->max_buffer;
2514 else 2804 else
2515 iter->tr = &global_trace; 2805#endif
2806 iter->trace_buffer = &tr->trace_buffer;
2516 iter->snapshot = snapshot; 2807 iter->snapshot = snapshot;
2517 iter->pos = -1; 2808 iter->pos = -1;
2518 mutex_init(&iter->mutex); 2809 mutex_init(&iter->mutex);
2519 iter->cpu_file = cpu_file; 2810 iter->cpu_file = tc->cpu;
2520 2811
2521 /* Notify the tracer early; before we stop tracing. */ 2812 /* Notify the tracer early; before we stop tracing. */
2522 if (iter->trace && iter->trace->open) 2813 if (iter->trace && iter->trace->open)
2523 iter->trace->open(iter); 2814 iter->trace->open(iter);
2524 2815
2525 /* Annotate start of buffers if we had overruns */ 2816 /* Annotate start of buffers if we had overruns */
2526 if (ring_buffer_overruns(iter->tr->buffer)) 2817 if (ring_buffer_overruns(iter->trace_buffer->buffer))
2527 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2818 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2528 2819
2529 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2820 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2532,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2532 2823
2533 /* stop the trace while dumping if we are not opening "snapshot" */ 2824 /* stop the trace while dumping if we are not opening "snapshot" */
2534 if (!iter->snapshot) 2825 if (!iter->snapshot)
2535 tracing_stop(); 2826 tracing_stop_tr(tr);
2536 2827
2537 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2828 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
2538 for_each_tracing_cpu(cpu) { 2829 for_each_tracing_cpu(cpu) {
2539 iter->buffer_iter[cpu] = 2830 iter->buffer_iter[cpu] =
2540 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2831 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2541 } 2832 }
2542 ring_buffer_read_prepare_sync(); 2833 ring_buffer_read_prepare_sync();
2543 for_each_tracing_cpu(cpu) { 2834 for_each_tracing_cpu(cpu) {
@@ -2547,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2547 } else { 2838 } else {
2548 cpu = iter->cpu_file; 2839 cpu = iter->cpu_file;
2549 iter->buffer_iter[cpu] = 2840 iter->buffer_iter[cpu] =
2550 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2841 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2551 ring_buffer_read_prepare_sync(); 2842 ring_buffer_read_prepare_sync();
2552 ring_buffer_read_start(iter->buffer_iter[cpu]); 2843 ring_buffer_read_start(iter->buffer_iter[cpu]);
2553 tracing_iter_reset(iter, cpu); 2844 tracing_iter_reset(iter, cpu);
2554 } 2845 }
2555 2846
2847 tr->ref++;
2848
2556 mutex_unlock(&trace_types_lock); 2849 mutex_unlock(&trace_types_lock);
2557 2850
2558 return iter; 2851 return iter;
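
__tracing_open() now pins the trace_array with tr->ref++ while trace_types_lock is held, and tracing_release() drops the count; presumably this is what lets an instance refuse removal while files are still open. A userspace model of that refcount-under-a-mutex pattern, with purely illustrative names (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

struct array_model {                      /* stands in for struct trace_array */
    int ref;
};

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;

static void model_open(struct array_model *tr)
{
    pthread_mutex_lock(&types_lock);
    tr->ref++;                            /* pin while a file is open */
    pthread_mutex_unlock(&types_lock);
}

static void model_release(struct array_model *tr)
{
    pthread_mutex_lock(&types_lock);
    if (tr->ref <= 0)
        fprintf(stderr, "WARN: unbalanced release\n");
    else
        tr->ref--;
    pthread_mutex_unlock(&types_lock);
}

static int model_can_remove(struct array_model *tr)
{
    int idle;

    pthread_mutex_lock(&types_lock);
    idle = (tr->ref == 0);                /* only removable with no users */
    pthread_mutex_unlock(&types_lock);
    return idle;
}

int main(void)
{
    struct array_model tr = { 0 };

    model_open(&tr);
    printf("removable while open: %d\n", model_can_remove(&tr));
    model_release(&tr);
    printf("removable after release: %d\n", model_can_remove(&tr));
    return 0;
}
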
@@ -2579,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)
2579{ 2872{
2580 struct seq_file *m = file->private_data; 2873 struct seq_file *m = file->private_data;
2581 struct trace_iterator *iter; 2874 struct trace_iterator *iter;
2875 struct trace_array *tr;
2582 int cpu; 2876 int cpu;
2583 2877
2584 if (!(file->f_mode & FMODE_READ)) 2878 if (!(file->f_mode & FMODE_READ))
2585 return 0; 2879 return 0;
2586 2880
2587 iter = m->private; 2881 iter = m->private;
2882 tr = iter->tr;
2588 2883
2589 mutex_lock(&trace_types_lock); 2884 mutex_lock(&trace_types_lock);
2885
2886 WARN_ON(!tr->ref);
2887 tr->ref--;
2888
2590 for_each_tracing_cpu(cpu) { 2889 for_each_tracing_cpu(cpu) {
2591 if (iter->buffer_iter[cpu]) 2890 if (iter->buffer_iter[cpu])
2592 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2891 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2597,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2597 2896
2598 if (!iter->snapshot) 2897 if (!iter->snapshot)
2599 /* reenable tracing if it was previously enabled */ 2898 /* reenable tracing if it was previously enabled */
2600 tracing_start(); 2899 tracing_start_tr(tr);
2601 mutex_unlock(&trace_types_lock); 2900 mutex_unlock(&trace_types_lock);
2602 2901
2603 mutex_destroy(&iter->mutex); 2902 mutex_destroy(&iter->mutex);
@@ -2616,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)
2616 /* If this file was open for write, then erase contents */ 2915 /* If this file was open for write, then erase contents */
2617 if ((file->f_mode & FMODE_WRITE) && 2916 if ((file->f_mode & FMODE_WRITE) &&
2618 (file->f_flags & O_TRUNC)) { 2917 (file->f_flags & O_TRUNC)) {
2619 long cpu = (long) inode->i_private; 2918 struct trace_cpu *tc = inode->i_private;
2919 struct trace_array *tr = tc->tr;
2620 2920
2621 if (cpu == TRACE_PIPE_ALL_CPU) 2921 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2622 tracing_reset_online_cpus(&global_trace); 2922 tracing_reset_online_cpus(&tr->trace_buffer);
2623 else 2923 else
2624 tracing_reset(&global_trace, cpu); 2924 tracing_reset(&tr->trace_buffer, tc->cpu);
2625 } 2925 }
2626 2926
2627 if (file->f_mode & FMODE_READ) { 2927 if (file->f_mode & FMODE_READ) {
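
The O_TRUNC branch above is what services "echo > trace": a write-only, truncating open resets either all online CPUs or the single per_cpu buffer the file belongs to. The same thing from C, assuming the usual /sys/kernel/debug/tracing mount point and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    /* opening for write with O_TRUNC runs the reset path shown above */
    int fd = open("/sys/kernel/debug/tracing/trace", O_WRONLY | O_TRUNC);

    if (fd < 0) {
        perror("trace");
        return 1;
    }
    close(fd);
    return 0;
}
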
@@ -2768,8 +3068,9 @@ static ssize_t
2768tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3068tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2769 size_t count, loff_t *ppos) 3069 size_t count, loff_t *ppos)
2770{ 3070{
2771 int err, cpu; 3071 struct trace_array *tr = filp->private_data;
2772 cpumask_var_t tracing_cpumask_new; 3072 cpumask_var_t tracing_cpumask_new;
3073 int err, cpu;
2773 3074
2774 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 3075 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2775 return -ENOMEM; 3076 return -ENOMEM;
@@ -2789,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2789 */ 3090 */
2790 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3091 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2791 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3092 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2792 atomic_inc(&global_trace.data[cpu]->disabled); 3093 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2793 ring_buffer_record_disable_cpu(global_trace.buffer, cpu); 3094 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
2794 } 3095 }
2795 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3096 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2796 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3097 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2797 atomic_dec(&global_trace.data[cpu]->disabled); 3098 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2798 ring_buffer_record_enable_cpu(global_trace.buffer, cpu); 3099 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
2799 } 3100 }
2800 } 3101 }
2801 arch_spin_unlock(&ftrace_max_lock); 3102 arch_spin_unlock(&ftrace_max_lock);
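
tracing_cpumask_write() above now works against the file's own trace_array: CPUs leaving the mask get their ring-buffer recording disabled, CPUs entering it get re-enabled. A short sketch that limits tracing to CPUs 0 and 1, assuming the usual mount point; the mask is written as a hex bitmap and the program must run as root:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/sys/kernel/debug/tracing/tracing_cpumask";
    const char *mask = "3";            /* bits 0 and 1 => trace cpu0 and cpu1 only */
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (write(fd, mask, strlen(mask)) < 0)
        perror("write mask");
    close(fd);
    return 0;
}
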
@@ -2824,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {
2824static int tracing_trace_options_show(struct seq_file *m, void *v) 3125static int tracing_trace_options_show(struct seq_file *m, void *v)
2825{ 3126{
2826 struct tracer_opt *trace_opts; 3127 struct tracer_opt *trace_opts;
3128 struct trace_array *tr = m->private;
2827 u32 tracer_flags; 3129 u32 tracer_flags;
2828 int i; 3130 int i;
2829 3131
2830 mutex_lock(&trace_types_lock); 3132 mutex_lock(&trace_types_lock);
2831 tracer_flags = current_trace->flags->val; 3133 tracer_flags = tr->current_trace->flags->val;
2832 trace_opts = current_trace->flags->opts; 3134 trace_opts = tr->current_trace->flags->opts;
2833 3135
2834 for (i = 0; trace_options[i]; i++) { 3136 for (i = 0; trace_options[i]; i++) {
2835 if (trace_flags & (1 << i)) 3137 if (trace_flags & (1 << i))
@@ -2893,15 +3195,15 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
2893 return 0; 3195 return 0;
2894} 3196}
2895 3197
2896int set_tracer_flag(unsigned int mask, int enabled) 3198int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
2897{ 3199{
2898 /* do nothing if flag is already set */ 3200 /* do nothing if flag is already set */
2899 if (!!(trace_flags & mask) == !!enabled) 3201 if (!!(trace_flags & mask) == !!enabled)
2900 return 0; 3202 return 0;
2901 3203
2902 /* Give the tracer a chance to approve the change */ 3204 /* Give the tracer a chance to approve the change */
2903 if (current_trace->flag_changed) 3205 if (tr->current_trace->flag_changed)
2904 if (current_trace->flag_changed(current_trace, mask, !!enabled)) 3206 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
2905 return -EINVAL; 3207 return -EINVAL;
2906 3208
2907 if (enabled) 3209 if (enabled)
@@ -2913,9 +3215,9 @@ int set_tracer_flag(unsigned int mask, int enabled)
2913 trace_event_enable_cmd_record(enabled); 3215 trace_event_enable_cmd_record(enabled);
2914 3216
2915 if (mask == TRACE_ITER_OVERWRITE) { 3217 if (mask == TRACE_ITER_OVERWRITE) {
2916 ring_buffer_change_overwrite(global_trace.buffer, enabled); 3218 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
2917#ifdef CONFIG_TRACER_MAX_TRACE 3219#ifdef CONFIG_TRACER_MAX_TRACE
2918 ring_buffer_change_overwrite(max_tr.buffer, enabled); 3220 ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
2919#endif 3221#endif
2920 } 3222 }
2921 3223
@@ -2925,7 +3227,7 @@ int set_tracer_flag(unsigned int mask, int enabled)
2925 return 0; 3227 return 0;
2926} 3228}
2927 3229
2928static int trace_set_options(char *option) 3230static int trace_set_options(struct trace_array *tr, char *option)
2929{ 3231{
2930 char *cmp; 3232 char *cmp;
2931 int neg = 0; 3233 int neg = 0;
@@ -2943,14 +3245,14 @@ static int trace_set_options(char *option)
2943 3245
2944 for (i = 0; trace_options[i]; i++) { 3246 for (i = 0; trace_options[i]; i++) {
2945 if (strcmp(cmp, trace_options[i]) == 0) { 3247 if (strcmp(cmp, trace_options[i]) == 0) {
2946 ret = set_tracer_flag(1 << i, !neg); 3248 ret = set_tracer_flag(tr, 1 << i, !neg);
2947 break; 3249 break;
2948 } 3250 }
2949 } 3251 }
2950 3252
2951 /* If no option could be set, test the specific tracer options */ 3253 /* If no option could be set, test the specific tracer options */
2952 if (!trace_options[i]) 3254 if (!trace_options[i])
2953 ret = set_tracer_option(current_trace, cmp, neg); 3255 ret = set_tracer_option(tr->current_trace, cmp, neg);
2954 3256
2955 mutex_unlock(&trace_types_lock); 3257 mutex_unlock(&trace_types_lock);
2956 3258
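
trace_set_options() now takes the trace_array so each instance keeps its own option state; a leading "no" clears an option, and for "overwrite" set_tracer_flag() forwards the change to ring_buffer_change_overwrite() on that instance's buffers. A sketch that toggles the option from userspace, assuming the usual mount point and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int set_option(const char *opt)
{
    int fd = open("/sys/kernel/debug/tracing/trace_options", O_WRONLY);
    int ret = 0;

    if (fd < 0)
        return -1;
    if (write(fd, opt, strlen(opt)) < 0)
        ret = -1;
    close(fd);
    return ret;
}

int main(void)
{
    if (set_option("nooverwrite"))   /* keep oldest events, stop writing when full */
        perror("nooverwrite");
    if (set_option("overwrite"))     /* default: overwrite oldest events when full */
        perror("overwrite");
    return 0;
}
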
@@ -2961,6 +3263,8 @@ static ssize_t
2961tracing_trace_options_write(struct file *filp, const char __user *ubuf, 3263tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2962 size_t cnt, loff_t *ppos) 3264 size_t cnt, loff_t *ppos)
2963{ 3265{
3266 struct seq_file *m = filp->private_data;
3267 struct trace_array *tr = m->private;
2964 char buf[64]; 3268 char buf[64];
2965 int ret; 3269 int ret;
2966 3270
@@ -2972,7 +3276,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2972 3276
2973 buf[cnt] = 0; 3277 buf[cnt] = 0;
2974 3278
2975 ret = trace_set_options(buf); 3279 ret = trace_set_options(tr, buf);
2976 if (ret < 0) 3280 if (ret < 0)
2977 return ret; 3281 return ret;
2978 3282
@@ -2985,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
2985{ 3289{
2986 if (tracing_disabled) 3290 if (tracing_disabled)
2987 return -ENODEV; 3291 return -ENODEV;
2988 return single_open(file, tracing_trace_options_show, NULL); 3292
3293 return single_open(file, tracing_trace_options_show, inode->i_private);
2989} 3294}
2990 3295
2991static const struct file_operations tracing_iter_fops = { 3296static const struct file_operations tracing_iter_fops = {
@@ -2998,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {
2998 3303
2999static const char readme_msg[] = 3304static const char readme_msg[] =
3000 "tracing mini-HOWTO:\n\n" 3305 "tracing mini-HOWTO:\n\n"
3001 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 3306 "# echo 0 > tracing_on : quick way to disable tracing\n"
3002 "# cat /sys/kernel/debug/tracing/available_tracers\n" 3307 "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
3003 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 3308 " Important files:\n"
3004 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3309 " trace\t\t\t- The static contents of the buffer\n"
3005 "nop\n" 3310 "\t\t\t To clear the buffer write into this file: echo > trace\n"
3006 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 3311 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
3007 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3312 " current_tracer\t- function and latency tracers\n"
3008 "wakeup\n" 3313 " available_tracers\t- list of configured tracers for current_tracer\n"
3009 "# cat /sys/kernel/debug/tracing/trace_options\n" 3314 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
3010 "noprint-parent nosym-offset nosym-addr noverbose\n" 3315 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
3011 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 3316 " trace_clock\t\t- change the clock used to order events\n"
3012 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 3317 " local: Per cpu clock but may not be synced across CPUs\n"
3013 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 3318 " global: Synced across CPUs but slows tracing down.\n"
3014 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 3319 " counter: Not a clock, but just an increment\n"
3320 " uptime: Jiffy counter from time of boot\n"
3321 " perf: Same clock that perf events use\n"
3322#ifdef CONFIG_X86_64
3323 " x86-tsc: TSC cycle counter\n"
3324#endif
3325 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
3326 " tracing_cpumask\t- Limit which CPUs to trace\n"
3327 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
3328 "\t\t\t Remove sub-buffer with rmdir\n"
3329 " trace_options\t\t- Set format or modify how tracing happens\n"
3330 "\t\t\t Disable an option by adding a prefix 'no' to the option name\n"
3331#ifdef CONFIG_DYNAMIC_FTRACE
3332 "\n available_filter_functions - list of functions that can be filtered on\n"
3333 " set_ftrace_filter\t- echo function name in here to only trace these functions\n"
3334 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3335 " modules: Can select a group via module\n"
3336 " Format: :mod:<module-name>\n"
3337 " example: echo :mod:ext3 > set_ftrace_filter\n"
3338 " triggers: a command to perform when function is hit\n"
3339 " Format: <function>:<trigger>[:count]\n"
3340 " trigger: traceon, traceoff\n"
3341 " enable_event:<system>:<event>\n"
3342 " disable_event:<system>:<event>\n"
3343#ifdef CONFIG_STACKTRACE
3344 " stacktrace\n"
3345#endif
3346#ifdef CONFIG_TRACER_SNAPSHOT
3347 " snapshot\n"
3348#endif
3349 " example: echo do_fault:traceoff > set_ftrace_filter\n"
3350 " echo do_trap:traceoff:3 > set_ftrace_filter\n"
3351 " The first one will disable tracing every time do_fault is hit\n"
3352 " The second will disable tracing at most 3 times when do_trap is hit\n"
3353 " The first time do trap is hit and it disables tracing, the counter\n"
3354 " will decrement to 2. If tracing is already disabled, the counter\n"
3355 " will not decrement. It only decrements when the trigger did work\n"
3356 " To remove trigger without count:\n"
3357 " echo '!<function>:<trigger>' > set_ftrace_filter\n"
3358 " To remove trigger with a count:\n"
3359 " echo '!<function>:<trigger>:0' > set_ftrace_filter\n"
3360 " set_ftrace_notrace\t- echo function name in here to never trace.\n"
3361 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3362 " modules: Can select a group via module command :mod:\n"
3363 " Does not accept triggers\n"
3364#endif /* CONFIG_DYNAMIC_FTRACE */
3365#ifdef CONFIG_FUNCTION_TRACER
3366 " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
3367#endif
3368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3369 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3370 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3371#endif
3372#ifdef CONFIG_TRACER_SNAPSHOT
3373 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3374 "\t\t\t Read the contents for more information\n"
3375#endif
3376#ifdef CONFIG_STACKTRACE
3377 " stack_trace\t\t- Shows the max stack trace when active\n"
3378 " stack_max_size\t- Shows current max stack size that was traced\n"
3379 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3380#ifdef CONFIG_DYNAMIC_FTRACE
3381 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3382#endif
3383#endif /* CONFIG_STACKTRACE */
3015; 3384;
3016 3385
3017static ssize_t 3386static ssize_t
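
The rewritten readme above replaces the old session transcript with a file-by-file summary. A small program that exercises three of the files it lists (tracing_on, trace_marker and trace), assuming debugfs is mounted at /sys/kernel/debug and the program runs as root:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define TRACING "/sys/kernel/debug/tracing/"

static void write_file(const char *name, const char *val)
{
    char path[256];
    int fd;

    snprintf(path, sizeof(path), TRACING "%s", name);
    fd = open(path, O_WRONLY);
    if (fd < 0) {
        perror(path);
        return;
    }
    if (write(fd, val, strlen(val)) < 0)
        perror(path);
    close(fd);
}

int main(void)
{
    char buf[4096];
    ssize_t n;
    int fd;

    write_file("tracing_on", "1");                 /* quick way to enable tracing */
    write_file("trace_marker", "hello from userspace\n");
    write_file("tracing_on", "0");                 /* quick way to disable tracing */

    fd = open(TRACING "trace", O_RDONLY);          /* static contents of the buffer */
    if (fd < 0) {
        perror("trace");
        return 1;
    }
    while ((n = read(fd, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, n, stdout);
    close(fd);
    return 0;
}
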
@@ -3083,11 +3452,12 @@ static ssize_t
3083tracing_set_trace_read(struct file *filp, char __user *ubuf, 3452tracing_set_trace_read(struct file *filp, char __user *ubuf,
3084 size_t cnt, loff_t *ppos) 3453 size_t cnt, loff_t *ppos)
3085{ 3454{
3455 struct trace_array *tr = filp->private_data;
3086 char buf[MAX_TRACER_SIZE+2]; 3456 char buf[MAX_TRACER_SIZE+2];
3087 int r; 3457 int r;
3088 3458
3089 mutex_lock(&trace_types_lock); 3459 mutex_lock(&trace_types_lock);
3090 r = sprintf(buf, "%s\n", current_trace->name); 3460 r = sprintf(buf, "%s\n", tr->current_trace->name);
3091 mutex_unlock(&trace_types_lock); 3461 mutex_unlock(&trace_types_lock);
3092 3462
3093 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3463 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3095,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3095 3465
3096int tracer_init(struct tracer *t, struct trace_array *tr) 3466int tracer_init(struct tracer *t, struct trace_array *tr)
3097{ 3467{
3098 tracing_reset_online_cpus(tr); 3468 tracing_reset_online_cpus(&tr->trace_buffer);
3099 return t->init(tr); 3469 return t->init(tr);
3100} 3470}
3101 3471
3102static void set_buffer_entries(struct trace_array *tr, unsigned long val) 3472static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
3103{ 3473{
3104 int cpu; 3474 int cpu;
3475
3105 for_each_tracing_cpu(cpu) 3476 for_each_tracing_cpu(cpu)
3106 tr->data[cpu]->entries = val; 3477 per_cpu_ptr(buf->data, cpu)->entries = val;
3107} 3478}
3108 3479
3480#ifdef CONFIG_TRACER_MAX_TRACE
3109/* resize @tr's buffer to the size of @size_tr's entries */ 3481/* resize @tr's buffer to the size of @size_tr's entries */
3110static int resize_buffer_duplicate_size(struct trace_array *tr, 3482static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
3111 struct trace_array *size_tr, int cpu_id) 3483 struct trace_buffer *size_buf, int cpu_id)
3112{ 3484{
3113 int cpu, ret = 0; 3485 int cpu, ret = 0;
3114 3486
3115 if (cpu_id == RING_BUFFER_ALL_CPUS) { 3487 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3116 for_each_tracing_cpu(cpu) { 3488 for_each_tracing_cpu(cpu) {
3117 ret = ring_buffer_resize(tr->buffer, 3489 ret = ring_buffer_resize(trace_buf->buffer,
3118 size_tr->data[cpu]->entries, cpu); 3490 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
3119 if (ret < 0) 3491 if (ret < 0)
3120 break; 3492 break;
3121 tr->data[cpu]->entries = size_tr->data[cpu]->entries; 3493 per_cpu_ptr(trace_buf->data, cpu)->entries =
3494 per_cpu_ptr(size_buf->data, cpu)->entries;
3122 } 3495 }
3123 } else { 3496 } else {
3124 ret = ring_buffer_resize(tr->buffer, 3497 ret = ring_buffer_resize(trace_buf->buffer,
3125 size_tr->data[cpu_id]->entries, cpu_id); 3498 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
3126 if (ret == 0) 3499 if (ret == 0)
3127 tr->data[cpu_id]->entries = 3500 per_cpu_ptr(trace_buf->data, cpu_id)->entries =
3128 size_tr->data[cpu_id]->entries; 3501 per_cpu_ptr(size_buf->data, cpu_id)->entries;
3129 } 3502 }
3130 3503
3131 return ret; 3504 return ret;
3132} 3505}
3506#endif /* CONFIG_TRACER_MAX_TRACE */
3133 3507
3134static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3508static int __tracing_resize_ring_buffer(struct trace_array *tr,
3509 unsigned long size, int cpu)
3135{ 3510{
3136 int ret; 3511 int ret;
3137 3512
@@ -3140,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3140 * we use the size that was given, and we can forget about 3515 * we use the size that was given, and we can forget about
3141 * expanding it later. 3516 * expanding it later.
3142 */ 3517 */
3143 ring_buffer_expanded = 1; 3518 ring_buffer_expanded = true;
3144 3519
3145 /* May be called before buffers are initialized */ 3520 /* May be called before buffers are initialized */
3146 if (!global_trace.buffer) 3521 if (!tr->trace_buffer.buffer)
3147 return 0; 3522 return 0;
3148 3523
3149 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3524 ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
3150 if (ret < 0) 3525 if (ret < 0)
3151 return ret; 3526 return ret;
3152 3527
3153 if (!current_trace->use_max_tr) 3528#ifdef CONFIG_TRACER_MAX_TRACE
3529 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
3530 !tr->current_trace->use_max_tr)
3154 goto out; 3531 goto out;
3155 3532
3156 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3533 ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
3157 if (ret < 0) { 3534 if (ret < 0) {
3158 int r = resize_buffer_duplicate_size(&global_trace, 3535 int r = resize_buffer_duplicate_size(&tr->trace_buffer,
3159 &global_trace, cpu); 3536 &tr->trace_buffer, cpu);
3160 if (r < 0) { 3537 if (r < 0) {
3161 /* 3538 /*
3162 * AARGH! We are left with different 3539 * AARGH! We are left with different
@@ -3179,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3179 } 3556 }
3180 3557
3181 if (cpu == RING_BUFFER_ALL_CPUS) 3558 if (cpu == RING_BUFFER_ALL_CPUS)
3182 set_buffer_entries(&max_tr, size); 3559 set_buffer_entries(&tr->max_buffer, size);
3183 else 3560 else
3184 max_tr.data[cpu]->entries = size; 3561 per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
3185 3562
3186 out: 3563 out:
3564#endif /* CONFIG_TRACER_MAX_TRACE */
3565
3187 if (cpu == RING_BUFFER_ALL_CPUS) 3566 if (cpu == RING_BUFFER_ALL_CPUS)
3188 set_buffer_entries(&global_trace, size); 3567 set_buffer_entries(&tr->trace_buffer, size);
3189 else 3568 else
3190 global_trace.data[cpu]->entries = size; 3569 per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
3191 3570
3192 return ret; 3571 return ret;
3193} 3572}
3194 3573
3195static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 3574static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
3575 unsigned long size, int cpu_id)
3196{ 3576{
3197 int ret = size; 3577 int ret = size;
3198 3578
@@ -3206,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3206 } 3586 }
3207 } 3587 }
3208 3588
3209 ret = __tracing_resize_ring_buffer(size, cpu_id); 3589 ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
3210 if (ret < 0) 3590 if (ret < 0)
3211 ret = -ENOMEM; 3591 ret = -ENOMEM;
3212 3592
@@ -3233,7 +3613,7 @@ int tracing_update_buffers(void)
3233 3613
3234 mutex_lock(&trace_types_lock); 3614 mutex_lock(&trace_types_lock);
3235 if (!ring_buffer_expanded) 3615 if (!ring_buffer_expanded)
3236 ret = __tracing_resize_ring_buffer(trace_buf_size, 3616 ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
3237 RING_BUFFER_ALL_CPUS); 3617 RING_BUFFER_ALL_CPUS);
3238 mutex_unlock(&trace_types_lock); 3618 mutex_unlock(&trace_types_lock);
3239 3619
@@ -3243,7 +3623,7 @@ int tracing_update_buffers(void)
3243struct trace_option_dentry; 3623struct trace_option_dentry;
3244 3624
3245static struct trace_option_dentry * 3625static struct trace_option_dentry *
3246create_trace_option_files(struct tracer *tracer); 3626create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3247 3627
3248static void 3628static void
3249destroy_trace_option_files(struct trace_option_dentry *topts); 3629destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3253,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)
3253 static struct trace_option_dentry *topts; 3633 static struct trace_option_dentry *topts;
3254 struct trace_array *tr = &global_trace; 3634 struct trace_array *tr = &global_trace;
3255 struct tracer *t; 3635 struct tracer *t;
3636#ifdef CONFIG_TRACER_MAX_TRACE
3256 bool had_max_tr; 3637 bool had_max_tr;
3638#endif
3257 int ret = 0; 3639 int ret = 0;
3258 3640
3259 mutex_lock(&trace_types_lock); 3641 mutex_lock(&trace_types_lock);
3260 3642
3261 if (!ring_buffer_expanded) { 3643 if (!ring_buffer_expanded) {
3262 ret = __tracing_resize_ring_buffer(trace_buf_size, 3644 ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
3263 RING_BUFFER_ALL_CPUS); 3645 RING_BUFFER_ALL_CPUS);
3264 if (ret < 0) 3646 if (ret < 0)
3265 goto out; 3647 goto out;
@@ -3274,18 +3656,21 @@ static int tracing_set_tracer(const char *buf)
3274 ret = -EINVAL; 3656 ret = -EINVAL;
3275 goto out; 3657 goto out;
3276 } 3658 }
3277 if (t == current_trace) 3659 if (t == tr->current_trace)
3278 goto out; 3660 goto out;
3279 3661
3280 trace_branch_disable(); 3662 trace_branch_disable();
3281 3663
3282 current_trace->enabled = false; 3664 tr->current_trace->enabled = false;
3283 3665
3284 if (current_trace->reset) 3666 if (tr->current_trace->reset)
3285 current_trace->reset(tr); 3667 tr->current_trace->reset(tr);
3286 3668
3287 had_max_tr = current_trace->allocated_snapshot; 3669 /* Current trace needs to be nop_trace before synchronize_sched */
3288 current_trace = &nop_trace; 3670 tr->current_trace = &nop_trace;
3671
3672#ifdef CONFIG_TRACER_MAX_TRACE
3673 had_max_tr = tr->allocated_snapshot;
3289 3674
3290 if (had_max_tr && !t->use_max_tr) { 3675 if (had_max_tr && !t->use_max_tr) {
3291 /* 3676 /*
@@ -3296,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)
3296 * so a synchronized_sched() is sufficient. 3681 * so a synchronized_sched() is sufficient.
3297 */ 3682 */
3298 synchronize_sched(); 3683 synchronize_sched();
3299 /* 3684 free_snapshot(tr);
3300 * We don't free the ring buffer. instead, resize it because
3301 * The max_tr ring buffer has some state (e.g. ring->clock) and
3302 * we want preserve it.
3303 */
3304 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3305 set_buffer_entries(&max_tr, 1);
3306 tracing_reset_online_cpus(&max_tr);
3307 current_trace->allocated_snapshot = false;
3308 } 3685 }
3686#endif
3309 destroy_trace_option_files(topts); 3687 destroy_trace_option_files(topts);
3310 3688
3311 topts = create_trace_option_files(t); 3689 topts = create_trace_option_files(tr, t);
3690
3691#ifdef CONFIG_TRACER_MAX_TRACE
3312 if (t->use_max_tr && !had_max_tr) { 3692 if (t->use_max_tr && !had_max_tr) {
3313 /* we need to make per cpu buffer sizes equivalent */ 3693 ret = alloc_snapshot(tr);
3314 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3315 RING_BUFFER_ALL_CPUS);
3316 if (ret < 0) 3694 if (ret < 0)
3317 goto out; 3695 goto out;
3318 t->allocated_snapshot = true;
3319 } 3696 }
3697#endif
3320 3698
3321 if (t->init) { 3699 if (t->init) {
3322 ret = tracer_init(t, tr); 3700 ret = tracer_init(t, tr);
@@ -3324,8 +3702,8 @@ static int tracing_set_tracer(const char *buf)
3324 goto out; 3702 goto out;
3325 } 3703 }
3326 3704
3327 current_trace = t; 3705 tr->current_trace = t;
3328 current_trace->enabled = true; 3706 tr->current_trace->enabled = true;
3329 trace_branch_enable(tr); 3707 trace_branch_enable(tr);
3330 out: 3708 out:
3331 mutex_unlock(&trace_types_lock); 3709 mutex_unlock(&trace_types_lock);
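
tracing_set_tracer() above now swaps tracers per trace_array: it resets the old tracer, parks on nop_trace, frees or allocates the snapshot buffer as the new tracer requires, rebuilds the option files and then initializes the new tracer. From userspace the whole sequence is triggered by writing a name from available_tracers into current_tracer; a sketch assuming the function tracer is built in (CONFIG_FUNCTION_TRACER), the usual mount point, and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int echo(const char *path, const char *val)
{
    int fd = open(path, O_WRONLY);

    if (fd < 0 || write(fd, val, strlen(val)) < 0) {
        perror(path);
        if (fd >= 0)
            close(fd);
        return -1;
    }
    return close(fd);
}

int main(void)
{
    const char *cur = "/sys/kernel/debug/tracing/current_tracer";

    if (echo(cur, "function"))      /* runs tracing_set_tracer("function") */
        return 1;
    sleep(1);                       /* let it collect something */
    return echo(cur, "nop");        /* back to the no-op tracer */
}
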
@@ -3399,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3399 3777
3400static int tracing_open_pipe(struct inode *inode, struct file *filp) 3778static int tracing_open_pipe(struct inode *inode, struct file *filp)
3401{ 3779{
3402 long cpu_file = (long) inode->i_private; 3780 struct trace_cpu *tc = inode->i_private;
3781 struct trace_array *tr = tc->tr;
3403 struct trace_iterator *iter; 3782 struct trace_iterator *iter;
3404 int ret = 0; 3783 int ret = 0;
3405 3784
@@ -3424,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3424 ret = -ENOMEM; 3803 ret = -ENOMEM;
3425 goto fail; 3804 goto fail;
3426 } 3805 }
3427 *iter->trace = *current_trace; 3806 *iter->trace = *tr->current_trace;
3428 3807
3429 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3808 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3430 ret = -ENOMEM; 3809 ret = -ENOMEM;
@@ -3441,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3441 if (trace_clocks[trace_clock_id].in_ns) 3820 if (trace_clocks[trace_clock_id].in_ns)
3442 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3821 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3443 3822
3444 iter->cpu_file = cpu_file; 3823 iter->cpu_file = tc->cpu;
3445 iter->tr = &global_trace; 3824 iter->tr = tc->tr;
3825 iter->trace_buffer = &tc->tr->trace_buffer;
3446 mutex_init(&iter->mutex); 3826 mutex_init(&iter->mutex);
3447 filp->private_data = iter; 3827 filp->private_data = iter;
3448 3828
@@ -3481,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3481} 3861}
3482 3862
3483static unsigned int 3863static unsigned int
3484tracing_poll_pipe(struct file *filp, poll_table *poll_table) 3864trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
3485{ 3865{
3486 struct trace_iterator *iter = filp->private_data; 3866 /* Iterators are static, they should be filled or empty */
3867 if (trace_buffer_iter(iter, iter->cpu_file))
3868 return POLLIN | POLLRDNORM;
3487 3869
3488 if (trace_flags & TRACE_ITER_BLOCK) { 3870 if (trace_flags & TRACE_ITER_BLOCK)
3489 /* 3871 /*
3490 * Always select as readable when in blocking mode 3872 * Always select as readable when in blocking mode
3491 */ 3873 */
3492 return POLLIN | POLLRDNORM; 3874 return POLLIN | POLLRDNORM;
3493 } else { 3875 else
3494 if (!trace_empty(iter)) 3876 return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
3495 return POLLIN | POLLRDNORM; 3877 filp, poll_table);
3496 poll_wait(filp, &trace_wait, poll_table); 3878}
3497 if (!trace_empty(iter))
3498 return POLLIN | POLLRDNORM;
3499 3879
3500 return 0; 3880static unsigned int
3501 } 3881tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3882{
3883 struct trace_iterator *iter = filp->private_data;
3884
3885 return trace_poll(iter, filp, poll_table);
3502} 3886}
3503 3887
3504/* 3888/*
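
trace_poll() above becomes the common poll path for trace_pipe and the raw per-CPU readers; outside blocking mode it defers to ring_buffer_poll_wait(), so userspace can sleep in poll(2) until data arrives instead of busy-reading. A minimal consumer, assuming the usual mount point and root privileges:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[4096];
    struct pollfd pfd;
    ssize_t n;

    pfd.fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY | O_NONBLOCK);
    if (pfd.fd < 0) {
        perror("trace_pipe");
        return 1;
    }
    pfd.events = POLLIN;

    if (poll(&pfd, 1, 5000) > 0 && (pfd.revents & POLLIN)) {
        n = read(pfd.fd, buf, sizeof(buf));    /* consuming read */
        if (n > 0)
            fwrite(buf, 1, n, stdout);
    } else {
        fprintf(stderr, "no trace data within 5s\n");
    }
    close(pfd.fd);
    return 0;
}
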
@@ -3564,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3564 size_t cnt, loff_t *ppos) 3948 size_t cnt, loff_t *ppos)
3565{ 3949{
3566 struct trace_iterator *iter = filp->private_data; 3950 struct trace_iterator *iter = filp->private_data;
3951 struct trace_array *tr = iter->tr;
3567 ssize_t sret; 3952 ssize_t sret;
3568 3953
3569 /* return any leftover data */ 3954 /* return any leftover data */
@@ -3575,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3575 3960
3576 /* copy the tracer to avoid using a global lock all around */ 3961 /* copy the tracer to avoid using a global lock all around */
3577 mutex_lock(&trace_types_lock); 3962 mutex_lock(&trace_types_lock);
3578 if (unlikely(iter->trace->name != current_trace->name)) 3963 if (unlikely(iter->trace->name != tr->current_trace->name))
3579 *iter->trace = *current_trace; 3964 *iter->trace = *tr->current_trace;
3580 mutex_unlock(&trace_types_lock); 3965 mutex_unlock(&trace_types_lock);
3581 3966
3582 /* 3967 /*
@@ -3732,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3732 .ops = &tracing_pipe_buf_ops, 4117 .ops = &tracing_pipe_buf_ops,
3733 .spd_release = tracing_spd_release_pipe, 4118 .spd_release = tracing_spd_release_pipe,
3734 }; 4119 };
4120 struct trace_array *tr = iter->tr;
3735 ssize_t ret; 4121 ssize_t ret;
3736 size_t rem; 4122 size_t rem;
3737 unsigned int i; 4123 unsigned int i;
@@ -3741,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3741 4127
3742 /* copy the tracer to avoid using a global lock all around */ 4128 /* copy the tracer to avoid using a global lock all around */
3743 mutex_lock(&trace_types_lock); 4129 mutex_lock(&trace_types_lock);
3744 if (unlikely(iter->trace->name != current_trace->name)) 4130 if (unlikely(iter->trace->name != tr->current_trace->name))
3745 *iter->trace = *current_trace; 4131 *iter->trace = *tr->current_trace;
3746 mutex_unlock(&trace_types_lock); 4132 mutex_unlock(&trace_types_lock);
3747 4133
3748 mutex_lock(&iter->mutex); 4134 mutex_lock(&iter->mutex);
@@ -3804,43 +4190,19 @@ out_err:
3804 goto out; 4190 goto out;
3805} 4191}
3806 4192
3807struct ftrace_entries_info {
3808 struct trace_array *tr;
3809 int cpu;
3810};
3811
3812static int tracing_entries_open(struct inode *inode, struct file *filp)
3813{
3814 struct ftrace_entries_info *info;
3815
3816 if (tracing_disabled)
3817 return -ENODEV;
3818
3819 info = kzalloc(sizeof(*info), GFP_KERNEL);
3820 if (!info)
3821 return -ENOMEM;
3822
3823 info->tr = &global_trace;
3824 info->cpu = (unsigned long)inode->i_private;
3825
3826 filp->private_data = info;
3827
3828 return 0;
3829}
3830
3831static ssize_t 4193static ssize_t
3832tracing_entries_read(struct file *filp, char __user *ubuf, 4194tracing_entries_read(struct file *filp, char __user *ubuf,
3833 size_t cnt, loff_t *ppos) 4195 size_t cnt, loff_t *ppos)
3834{ 4196{
3835 struct ftrace_entries_info *info = filp->private_data; 4197 struct trace_cpu *tc = filp->private_data;
3836 struct trace_array *tr = info->tr; 4198 struct trace_array *tr = tc->tr;
3837 char buf[64]; 4199 char buf[64];
3838 int r = 0; 4200 int r = 0;
3839 ssize_t ret; 4201 ssize_t ret;
3840 4202
3841 mutex_lock(&trace_types_lock); 4203 mutex_lock(&trace_types_lock);
3842 4204
3843 if (info->cpu == RING_BUFFER_ALL_CPUS) { 4205 if (tc->cpu == RING_BUFFER_ALL_CPUS) {
3844 int cpu, buf_size_same; 4206 int cpu, buf_size_same;
3845 unsigned long size; 4207 unsigned long size;
3846 4208
@@ -3850,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3850 for_each_tracing_cpu(cpu) { 4212 for_each_tracing_cpu(cpu) {
3851 /* fill in the size from first enabled cpu */ 4213 /* fill in the size from first enabled cpu */
3852 if (size == 0) 4214 if (size == 0)
3853 size = tr->data[cpu]->entries; 4215 size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
3854 if (size != tr->data[cpu]->entries) { 4216 if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
3855 buf_size_same = 0; 4217 buf_size_same = 0;
3856 break; 4218 break;
3857 } 4219 }
@@ -3867,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3867 } else 4229 } else
3868 r = sprintf(buf, "X\n"); 4230 r = sprintf(buf, "X\n");
3869 } else 4231 } else
3870 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); 4232 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
3871 4233
3872 mutex_unlock(&trace_types_lock); 4234 mutex_unlock(&trace_types_lock);
3873 4235
@@ -3879,7 +4241,7 @@ static ssize_t
3879tracing_entries_write(struct file *filp, const char __user *ubuf, 4241tracing_entries_write(struct file *filp, const char __user *ubuf,
3880 size_t cnt, loff_t *ppos) 4242 size_t cnt, loff_t *ppos)
3881{ 4243{
3882 struct ftrace_entries_info *info = filp->private_data; 4244 struct trace_cpu *tc = filp->private_data;
3883 unsigned long val; 4245 unsigned long val;
3884 int ret; 4246 int ret;
3885 4247
@@ -3894,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3894 /* value is in KB */ 4256 /* value is in KB */
3895 val <<= 10; 4257 val <<= 10;
3896 4258
3897 ret = tracing_resize_ring_buffer(val, info->cpu); 4259 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
3898 if (ret < 0) 4260 if (ret < 0)
3899 return ret; 4261 return ret;
3900 4262
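
tracing_entries_write() above takes the written value as kilobytes (val <<= 10) and resizes either every CPU or the single CPU named by the per_cpu file. Driving it through the top-level buffer_size_kb, assuming the usual mount point and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/sys/kernel/debug/tracing/buffer_size_kb";
    const char *kb = "4096";        /* 4 MB per CPU */
    char cur[64];
    int fd, n;

    fd = open(path, O_RDWR);
    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (write(fd, kb, strlen(kb)) < 0)
        perror("resize");
    lseek(fd, 0, SEEK_SET);         /* rewind and read the size back */
    n = read(fd, cur, sizeof(cur) - 1);
    if (n > 0) {
        cur[n] = '\0';
        printf("buffer_size_kb is now: %s", cur);
    }
    close(fd);
    return 0;
}
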
@@ -3903,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3903 return cnt; 4265 return cnt;
3904} 4266}
3905 4267
3906static int
3907tracing_entries_release(struct inode *inode, struct file *filp)
3908{
3909 struct ftrace_entries_info *info = filp->private_data;
3910
3911 kfree(info);
3912
3913 return 0;
3914}
3915
3916static ssize_t 4268static ssize_t
3917tracing_total_entries_read(struct file *filp, char __user *ubuf, 4269tracing_total_entries_read(struct file *filp, char __user *ubuf,
3918 size_t cnt, loff_t *ppos) 4270 size_t cnt, loff_t *ppos)
@@ -3924,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3924 4276
3925 mutex_lock(&trace_types_lock); 4277 mutex_lock(&trace_types_lock);
3926 for_each_tracing_cpu(cpu) { 4278 for_each_tracing_cpu(cpu) {
3927 size += tr->data[cpu]->entries >> 10; 4279 size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
3928 if (!ring_buffer_expanded) 4280 if (!ring_buffer_expanded)
3929 expanded_size += trace_buf_size >> 10; 4281 expanded_size += trace_buf_size >> 10;
3930 } 4282 }
@@ -3954,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3954static int 4306static int
3955tracing_free_buffer_release(struct inode *inode, struct file *filp) 4307tracing_free_buffer_release(struct inode *inode, struct file *filp)
3956{ 4308{
4309 struct trace_array *tr = inode->i_private;
4310
3957 /* disable tracing ? */ 4311 /* disable tracing ? */
3958 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4312 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3959 tracing_off(); 4313 tracing_off();
3960 /* resize the ring buffer to 0 */ 4314 /* resize the ring buffer to 0 */
3961 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 4315 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
3962 4316
3963 return 0; 4317 return 0;
3964} 4318}
@@ -4027,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4027 4381
4028 local_save_flags(irq_flags); 4382 local_save_flags(irq_flags);
4029 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4383 size = sizeof(*entry) + cnt + 2; /* possible \n added */
4030 buffer = global_trace.buffer; 4384 buffer = global_trace.trace_buffer.buffer;
4031 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4385 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
4032 irq_flags, preempt_count()); 4386 irq_flags, preempt_count());
4033 if (!event) { 4387 if (!event) {
@@ -4069,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4069 4423
4070static int tracing_clock_show(struct seq_file *m, void *v) 4424static int tracing_clock_show(struct seq_file *m, void *v)
4071{ 4425{
4426 struct trace_array *tr = m->private;
4072 int i; 4427 int i;
4073 4428
4074 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 4429 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
4075 seq_printf(m, 4430 seq_printf(m,
4076 "%s%s%s%s", i ? " " : "", 4431 "%s%s%s%s", i ? " " : "",
4077 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 4432 i == tr->clock_id ? "[" : "", trace_clocks[i].name,
4078 i == trace_clock_id ? "]" : ""); 4433 i == tr->clock_id ? "]" : "");
4079 seq_putc(m, '\n'); 4434 seq_putc(m, '\n');
4080 4435
4081 return 0; 4436 return 0;
@@ -4084,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4084static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4439static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4085 size_t cnt, loff_t *fpos) 4440 size_t cnt, loff_t *fpos)
4086{ 4441{
4442 struct seq_file *m = filp->private_data;
4443 struct trace_array *tr = m->private;
4087 char buf[64]; 4444 char buf[64];
4088 const char *clockstr; 4445 const char *clockstr;
4089 int i; 4446 int i;
@@ -4105,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4105 if (i == ARRAY_SIZE(trace_clocks)) 4462 if (i == ARRAY_SIZE(trace_clocks))
4106 return -EINVAL; 4463 return -EINVAL;
4107 4464
4108 trace_clock_id = i;
4109
4110 mutex_lock(&trace_types_lock); 4465 mutex_lock(&trace_types_lock);
4111 4466
4112 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); 4467 tr->clock_id = i;
4113 if (max_tr.buffer) 4468
4114 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4469 ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
4115 4470
4116 /* 4471 /*
4117 * New clock may not be consistent with the previous clock. 4472 * New clock may not be consistent with the previous clock.
4118 * Reset the buffer so that it doesn't have incomparable timestamps. 4473 * Reset the buffer so that it doesn't have incomparable timestamps.
4119 */ 4474 */
4120 tracing_reset_online_cpus(&global_trace); 4475 tracing_reset_online_cpus(&global_trace.trace_buffer);
4121 tracing_reset_online_cpus(&max_tr); 4476
4477#ifdef CONFIG_TRACER_MAX_TRACE
4478 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4479 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4480 tracing_reset_online_cpus(&global_trace.max_buffer);
4481#endif
4122 4482
4123 mutex_unlock(&trace_types_lock); 4483 mutex_unlock(&trace_types_lock);
4124 4484
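
tracing_clock_write() above matches the written name against trace_clocks[], stores the choice per trace_array (tr->clock_id) and resets the buffers so timestamps from the old and new clock are never mixed. Selecting the global clock from userspace, assuming the usual mount point and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/sys/kernel/debug/tracing/trace_clock";
    char buf[256];
    int fd, n;

    fd = open(path, O_WRONLY);
    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (write(fd, "global", 6) < 0)    /* synced across CPUs, a bit slower */
        perror("set clock");
    close(fd);

    fd = open(path, O_RDONLY);
    if (fd < 0)
        return 1;
    n = read(fd, buf, sizeof(buf) - 1);
    if (n > 0) {
        buf[n] = '\0';
        fputs(buf, stdout);            /* active clock shown in brackets */
    }
    close(fd);
    return 0;
}
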
@@ -4131,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4131{ 4491{
4132 if (tracing_disabled) 4492 if (tracing_disabled)
4133 return -ENODEV; 4493 return -ENODEV;
4134 return single_open(file, tracing_clock_show, NULL); 4494
4495 return single_open(file, tracing_clock_show, inode->i_private);
4135} 4496}
4136 4497
4498struct ftrace_buffer_info {
4499 struct trace_iterator iter;
4500 void *spare;
4501 unsigned int read;
4502};
4503
4137#ifdef CONFIG_TRACER_SNAPSHOT 4504#ifdef CONFIG_TRACER_SNAPSHOT
4138static int tracing_snapshot_open(struct inode *inode, struct file *file) 4505static int tracing_snapshot_open(struct inode *inode, struct file *file)
4139{ 4506{
4507 struct trace_cpu *tc = inode->i_private;
4140 struct trace_iterator *iter; 4508 struct trace_iterator *iter;
4509 struct seq_file *m;
4141 int ret = 0; 4510 int ret = 0;
4142 4511
4143 if (file->f_mode & FMODE_READ) { 4512 if (file->f_mode & FMODE_READ) {
4144 iter = __tracing_open(inode, file, true); 4513 iter = __tracing_open(inode, file, true);
4145 if (IS_ERR(iter)) 4514 if (IS_ERR(iter))
4146 ret = PTR_ERR(iter); 4515 ret = PTR_ERR(iter);
4516 } else {
4517 /* Writes still need the seq_file to hold the private data */
4518 m = kzalloc(sizeof(*m), GFP_KERNEL);
4519 if (!m)
4520 return -ENOMEM;
4521 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4522 if (!iter) {
4523 kfree(m);
4524 return -ENOMEM;
4525 }
4526 iter->tr = tc->tr;
4527 iter->trace_buffer = &tc->tr->max_buffer;
4528 iter->cpu_file = tc->cpu;
4529 m->private = iter;
4530 file->private_data = m;
4147 } 4531 }
4532
4148 return ret; 4533 return ret;
4149} 4534}
4150 4535
@@ -4152,6 +4537,9 @@ static ssize_t
4152tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, 4537tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4153 loff_t *ppos) 4538 loff_t *ppos)
4154{ 4539{
4540 struct seq_file *m = filp->private_data;
4541 struct trace_iterator *iter = m->private;
4542 struct trace_array *tr = iter->tr;
4155 unsigned long val; 4543 unsigned long val;
4156 int ret; 4544 int ret;
4157 4545
@@ -4165,40 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4165 4553
4166 mutex_lock(&trace_types_lock); 4554 mutex_lock(&trace_types_lock);
4167 4555
4168 if (current_trace->use_max_tr) { 4556 if (tr->current_trace->use_max_tr) {
4169 ret = -EBUSY; 4557 ret = -EBUSY;
4170 goto out; 4558 goto out;
4171 } 4559 }
4172 4560
4173 switch (val) { 4561 switch (val) {
4174 case 0: 4562 case 0:
4175 if (current_trace->allocated_snapshot) { 4563 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4176 /* free spare buffer */ 4564 ret = -EINVAL;
4177 ring_buffer_resize(max_tr.buffer, 1, 4565 break;
4178 RING_BUFFER_ALL_CPUS);
4179 set_buffer_entries(&max_tr, 1);
4180 tracing_reset_online_cpus(&max_tr);
4181 current_trace->allocated_snapshot = false;
4182 } 4566 }
4567 if (tr->allocated_snapshot)
4568 free_snapshot(tr);
4183 break; 4569 break;
4184 case 1: 4570 case 1:
4185 if (!current_trace->allocated_snapshot) { 4571/* Only allow per-cpu swap if the ring buffer supports it */
4186 /* allocate spare buffer */ 4572#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
4187 ret = resize_buffer_duplicate_size(&max_tr, 4573 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4188 &global_trace, RING_BUFFER_ALL_CPUS); 4574 ret = -EINVAL;
4575 break;
4576 }
4577#endif
4578 if (!tr->allocated_snapshot) {
4579 ret = alloc_snapshot(tr);
4189 if (ret < 0) 4580 if (ret < 0)
4190 break; 4581 break;
4191 current_trace->allocated_snapshot = true;
4192 } 4582 }
4193
4194 local_irq_disable(); 4583 local_irq_disable();
4195 /* Now, we're going to swap */ 4584 /* Now, we're going to swap */
4196 update_max_tr(&global_trace, current, smp_processor_id()); 4585 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4586 update_max_tr(tr, current, smp_processor_id());
4587 else
4588 update_max_tr_single(tr, current, iter->cpu_file);
4197 local_irq_enable(); 4589 local_irq_enable();
4198 break; 4590 break;
4199 default: 4591 default:
4200 if (current_trace->allocated_snapshot) 4592 if (tr->allocated_snapshot) {
4201 tracing_reset_online_cpus(&max_tr); 4593 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4594 tracing_reset_online_cpus(&tr->max_buffer);
4595 else
4596 tracing_reset(&tr->max_buffer, iter->cpu_file);
4597 }
4202 break; 4598 break;
4203 } 4599 }
4204 4600
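
The per-CPU branches above reject "0" for a per_cpu snapshot file, allow "1" only when the ring buffer supports per-CPU swap (CONFIG_RING_BUFFER_ALLOW_SWAP), and treat any other value as "clear this CPU's snapshot"; the handler also returns -EBUSY while the current tracer itself uses the spare buffer. A sketch against the conventional per_cpu/cpu0/snapshot location (the exact path is an assumption here), run as root:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/sys/kernel/debug/tracing/per_cpu/cpu0/snapshot";
    int fd = open(path, O_WRONLY);

    if (fd < 0) {
        perror(path);
        return 1;
    }
    if (write(fd, "1", 1) < 0)         /* snapshot cpu0 only (needs ALLOW_SWAP) */
        fprintf(stderr, "cpu0 snapshot failed: %s\n", strerror(errno));
    if (write(fd, "2", 1) < 0)         /* clear cpu0's snapshot buffer */
        fprintf(stderr, "cpu0 clear failed: %s\n", strerror(errno));
    close(fd);
    return 0;
}
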
@@ -4210,6 +4606,51 @@ out:
4210 mutex_unlock(&trace_types_lock); 4606 mutex_unlock(&trace_types_lock);
4211 return ret; 4607 return ret;
4212} 4608}
4609
4610static int tracing_snapshot_release(struct inode *inode, struct file *file)
4611{
4612 struct seq_file *m = file->private_data;
4613
4614 if (file->f_mode & FMODE_READ)
4615 return tracing_release(inode, file);
4616
4617 /* If write only, the seq_file is just a stub */
4618 if (m)
4619 kfree(m->private);
4620 kfree(m);
4621
4622 return 0;
4623}
4624
4625static int tracing_buffers_open(struct inode *inode, struct file *filp);
4626static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
4627 size_t count, loff_t *ppos);
4628static int tracing_buffers_release(struct inode *inode, struct file *file);
4629static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4630 struct pipe_inode_info *pipe, size_t len, unsigned int flags);
4631
4632static int snapshot_raw_open(struct inode *inode, struct file *filp)
4633{
4634 struct ftrace_buffer_info *info;
4635 int ret;
4636
4637 ret = tracing_buffers_open(inode, filp);
4638 if (ret < 0)
4639 return ret;
4640
4641 info = filp->private_data;
4642
4643 if (info->iter.trace->use_max_tr) {
4644 tracing_buffers_release(inode, filp);
4645 return -EBUSY;
4646 }
4647
4648 info->iter.snapshot = true;
4649 info->iter.trace_buffer = &info->iter.tr->max_buffer;
4650
4651 return ret;
4652}
4653
4213#endif /* CONFIG_TRACER_SNAPSHOT */ 4654#endif /* CONFIG_TRACER_SNAPSHOT */
4214 4655
4215 4656
@@ -4237,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {
4237}; 4678};
4238 4679
4239static const struct file_operations tracing_entries_fops = { 4680static const struct file_operations tracing_entries_fops = {
4240 .open = tracing_entries_open, 4681 .open = tracing_open_generic,
4241 .read = tracing_entries_read, 4682 .read = tracing_entries_read,
4242 .write = tracing_entries_write, 4683 .write = tracing_entries_write,
4243 .release = tracing_entries_release,
4244 .llseek = generic_file_llseek, 4684 .llseek = generic_file_llseek,
4245}; 4685};
4246 4686
@@ -4275,20 +4715,23 @@ static const struct file_operations snapshot_fops = {
4275 .read = seq_read, 4715 .read = seq_read,
4276 .write = tracing_snapshot_write, 4716 .write = tracing_snapshot_write,
4277 .llseek = tracing_seek, 4717 .llseek = tracing_seek,
4278 .release = tracing_release, 4718 .release = tracing_snapshot_release,
4279}; 4719};
4280#endif /* CONFIG_TRACER_SNAPSHOT */
4281 4720
4282struct ftrace_buffer_info { 4721static const struct file_operations snapshot_raw_fops = {
4283 struct trace_array *tr; 4722 .open = snapshot_raw_open,
4284 void *spare; 4723 .read = tracing_buffers_read,
4285 int cpu; 4724 .release = tracing_buffers_release,
4286 unsigned int read; 4725 .splice_read = tracing_buffers_splice_read,
4726 .llseek = no_llseek,
4287}; 4727};
4288 4728
4729#endif /* CONFIG_TRACER_SNAPSHOT */
4730
4289static int tracing_buffers_open(struct inode *inode, struct file *filp) 4731static int tracing_buffers_open(struct inode *inode, struct file *filp)
4290{ 4732{
4291 int cpu = (int)(long)inode->i_private; 4733 struct trace_cpu *tc = inode->i_private;
4734 struct trace_array *tr = tc->tr;
4292 struct ftrace_buffer_info *info; 4735 struct ftrace_buffer_info *info;
4293 4736
4294 if (tracing_disabled) 4737 if (tracing_disabled)
@@ -4298,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4298 if (!info) 4741 if (!info)
4299 return -ENOMEM; 4742 return -ENOMEM;
4300 4743
4301 info->tr = &global_trace; 4744 mutex_lock(&trace_types_lock);
4302 info->cpu = cpu; 4745
4303 info->spare = NULL; 4746 tr->ref++;
4747
4748 info->iter.tr = tr;
4749 info->iter.cpu_file = tc->cpu;
4750 info->iter.trace = tr->current_trace;
4751 info->iter.trace_buffer = &tr->trace_buffer;
4752 info->spare = NULL;
4304 /* Force reading ring buffer for first read */ 4753 /* Force reading ring buffer for first read */
4305 info->read = (unsigned int)-1; 4754 info->read = (unsigned int)-1;
4306 4755
4307 filp->private_data = info; 4756 filp->private_data = info;
4308 4757
4758 mutex_unlock(&trace_types_lock);
4759
4309 return nonseekable_open(inode, filp); 4760 return nonseekable_open(inode, filp);
4310} 4761}
4311 4762
4763static unsigned int
4764tracing_buffers_poll(struct file *filp, poll_table *poll_table)
4765{
4766 struct ftrace_buffer_info *info = filp->private_data;
4767 struct trace_iterator *iter = &info->iter;
4768
4769 return trace_poll(iter, filp, poll_table);
4770}
4771
4312static ssize_t 4772static ssize_t
4313tracing_buffers_read(struct file *filp, char __user *ubuf, 4773tracing_buffers_read(struct file *filp, char __user *ubuf,
4314 size_t count, loff_t *ppos) 4774 size_t count, loff_t *ppos)
4315{ 4775{
4316 struct ftrace_buffer_info *info = filp->private_data; 4776 struct ftrace_buffer_info *info = filp->private_data;
4777 struct trace_iterator *iter = &info->iter;
4317 ssize_t ret; 4778 ssize_t ret;
4318 size_t size; 4779 ssize_t size;
4319 4780
4320 if (!count) 4781 if (!count)
4321 return 0; 4782 return 0;
4322 4783
4784 mutex_lock(&trace_types_lock);
4785
4786#ifdef CONFIG_TRACER_MAX_TRACE
4787 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4788 size = -EBUSY;
4789 goto out_unlock;
4790 }
4791#endif
4792
4323 if (!info->spare) 4793 if (!info->spare)
4324 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); 4794 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
4795 iter->cpu_file);
4796 size = -ENOMEM;
4325 if (!info->spare) 4797 if (!info->spare)
4326 return -ENOMEM; 4798 goto out_unlock;
4327 4799
4328 /* Do we have previous read data to read? */ 4800 /* Do we have previous read data to read? */
4329 if (info->read < PAGE_SIZE) 4801 if (info->read < PAGE_SIZE)
4330 goto read; 4802 goto read;
4331 4803
4332 trace_access_lock(info->cpu); 4804 again:
4333 ret = ring_buffer_read_page(info->tr->buffer, 4805 trace_access_lock(iter->cpu_file);
4806 ret = ring_buffer_read_page(iter->trace_buffer->buffer,
4334 &info->spare, 4807 &info->spare,
4335 count, 4808 count,
4336 info->cpu, 0); 4809 iter->cpu_file, 0);
4337 trace_access_unlock(info->cpu); 4810 trace_access_unlock(iter->cpu_file);
4338 if (ret < 0)
4339 return 0;
4340 4811
4341 info->read = 0; 4812 if (ret < 0) {
4813 if (trace_empty(iter)) {
4814 if ((filp->f_flags & O_NONBLOCK)) {
4815 size = -EAGAIN;
4816 goto out_unlock;
4817 }
4818 mutex_unlock(&trace_types_lock);
4819 iter->trace->wait_pipe(iter);
4820 mutex_lock(&trace_types_lock);
4821 if (signal_pending(current)) {
4822 size = -EINTR;
4823 goto out_unlock;
4824 }
4825 goto again;
4826 }
4827 size = 0;
4828 goto out_unlock;
4829 }
4342 4830
4343read: 4831 info->read = 0;
4832 read:
4344 size = PAGE_SIZE - info->read; 4833 size = PAGE_SIZE - info->read;
4345 if (size > count) 4834 if (size > count)
4346 size = count; 4835 size = count;
4347 4836
4348 ret = copy_to_user(ubuf, info->spare + info->read, size); 4837 ret = copy_to_user(ubuf, info->spare + info->read, size);
4349 if (ret == size) 4838 if (ret == size) {
4350 return -EFAULT; 4839 size = -EFAULT;
4840 goto out_unlock;
4841 }
4351 size -= ret; 4842 size -= ret;
4352 4843
4353 *ppos += size; 4844 *ppos += size;
4354 info->read += size; 4845 info->read += size;
4355 4846
4847 out_unlock:
4848 mutex_unlock(&trace_types_lock);
4849
4356 return size; 4850 return size;
4357} 4851}
4358 4852
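The reworked tracing_buffers_read() above no longer returns 0 on an empty buffer: unless O_NONBLOCK is set it drops trace_types_lock, sleeps in iter->trace->wait_pipe(), re-takes the lock, bails out on a pending signal, and jumps back to the again: label. A minimal user-space sketch of that drop-lock/wait/recheck/retry shape, using pthreads and made-up names (blocking_read, producer) rather than the kernel primitives; build with -pthread:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  more_data = PTHREAD_COND_INITIALIZER;
static int available;            /* "bytes" ready to be read          */
static bool interrupted;         /* stand-in for signal_pending()     */

/* Returns bytes consumed, 0 for would-block, -1 for interrupted. */
static int blocking_read(bool nonblock)
{
        int ret;

        pthread_mutex_lock(&lock);
again:
        if (!available) {
                if (nonblock) {
                        ret = 0;                /* -EAGAIN in the kernel */
                        goto out_unlock;
                }
                /* like wait_pipe(): the lock is dropped while sleeping */
                pthread_cond_wait(&more_data, &lock);
                if (interrupted) {
                        ret = -1;               /* -EINTR in the kernel  */
                        goto out_unlock;
                }
                goto again;
        }
        ret = available;
        available = 0;
out_unlock:
        pthread_mutex_unlock(&lock);
        return ret;
}

static void *producer(void *arg)
{
        (void)arg;
        sleep(1);
        pthread_mutex_lock(&lock);
        available = 42;
        pthread_cond_signal(&more_data);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, producer, NULL);
        printf("read %d bytes\n", blocking_read(false));
        pthread_join(t, NULL);
        return 0;
}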
4359static int tracing_buffers_release(struct inode *inode, struct file *file) 4853static int tracing_buffers_release(struct inode *inode, struct file *file)
4360{ 4854{
4361 struct ftrace_buffer_info *info = file->private_data; 4855 struct ftrace_buffer_info *info = file->private_data;
4856 struct trace_iterator *iter = &info->iter;
4857
4858 mutex_lock(&trace_types_lock);
4859
4860 WARN_ON(!iter->tr->ref);
4861 iter->tr->ref--;
4362 4862
4363 if (info->spare) 4863 if (info->spare)
4364 ring_buffer_free_read_page(info->tr->buffer, info->spare); 4864 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
4365 kfree(info); 4865 kfree(info);
4366 4866
4867 mutex_unlock(&trace_types_lock);
4868
4367 return 0; 4869 return 0;
4368} 4870}
4369 4871
@@ -4428,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4428 unsigned int flags) 4930 unsigned int flags)
4429{ 4931{
4430 struct ftrace_buffer_info *info = file->private_data; 4932 struct ftrace_buffer_info *info = file->private_data;
4933 struct trace_iterator *iter = &info->iter;
4431 struct partial_page partial_def[PIPE_DEF_BUFFERS]; 4934 struct partial_page partial_def[PIPE_DEF_BUFFERS];
4432 struct page *pages_def[PIPE_DEF_BUFFERS]; 4935 struct page *pages_def[PIPE_DEF_BUFFERS];
4433 struct splice_pipe_desc spd = { 4936 struct splice_pipe_desc spd = {
@@ -4440,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4440 }; 4943 };
4441 struct buffer_ref *ref; 4944 struct buffer_ref *ref;
4442 int entries, size, i; 4945 int entries, size, i;
4443 size_t ret; 4946 ssize_t ret;
4444 4947
4445 if (splice_grow_spd(pipe, &spd)) 4948 mutex_lock(&trace_types_lock);
4446 return -ENOMEM; 4949
4950#ifdef CONFIG_TRACER_MAX_TRACE
4951 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4952 ret = -EBUSY;
4953 goto out;
4954 }
4955#endif
4956
4957 if (splice_grow_spd(pipe, &spd)) {
4958 ret = -ENOMEM;
4959 goto out;
4960 }
4447 4961
4448 if (*ppos & (PAGE_SIZE - 1)) { 4962 if (*ppos & (PAGE_SIZE - 1)) {
4449 ret = -EINVAL; 4963 ret = -EINVAL;
@@ -4458,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4458 len &= PAGE_MASK; 4972 len &= PAGE_MASK;
4459 } 4973 }
4460 4974
4461 trace_access_lock(info->cpu); 4975 again:
4462 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 4976 trace_access_lock(iter->cpu_file);
4977 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4463 4978
4464 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { 4979 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
4465 struct page *page; 4980 struct page *page;
@@ -4470,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4470 break; 4985 break;
4471 4986
4472 ref->ref = 1; 4987 ref->ref = 1;
4473 ref->buffer = info->tr->buffer; 4988 ref->buffer = iter->trace_buffer->buffer;
4474 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); 4989 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
4475 if (!ref->page) { 4990 if (!ref->page) {
4476 kfree(ref); 4991 kfree(ref);
4477 break; 4992 break;
4478 } 4993 }
4479 4994
4480 r = ring_buffer_read_page(ref->buffer, &ref->page, 4995 r = ring_buffer_read_page(ref->buffer, &ref->page,
4481 len, info->cpu, 1); 4996 len, iter->cpu_file, 1);
4482 if (r < 0) { 4997 if (r < 0) {
4483 ring_buffer_free_read_page(ref->buffer, ref->page); 4998 ring_buffer_free_read_page(ref->buffer, ref->page);
4484 kfree(ref); 4999 kfree(ref);
@@ -4502,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4502 spd.nr_pages++; 5017 spd.nr_pages++;
4503 *ppos += PAGE_SIZE; 5018 *ppos += PAGE_SIZE;
4504 5019
4505 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 5020 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4506 } 5021 }
4507 5022
4508 trace_access_unlock(info->cpu); 5023 trace_access_unlock(iter->cpu_file);
4509 spd.nr_pages = i; 5024 spd.nr_pages = i;
4510 5025
4511 /* did we read anything? */ 5026 /* did we read anything? */
4512 if (!spd.nr_pages) { 5027 if (!spd.nr_pages) {
4513 if (flags & SPLICE_F_NONBLOCK) 5028 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
4514 ret = -EAGAIN; 5029 ret = -EAGAIN;
4515 else 5030 goto out;
4516 ret = 0; 5031 }
4517 /* TODO: block */ 5032 mutex_unlock(&trace_types_lock);
4518 goto out; 5033 iter->trace->wait_pipe(iter);
5034 mutex_lock(&trace_types_lock);
5035 if (signal_pending(current)) {
5036 ret = -EINTR;
5037 goto out;
5038 }
5039 goto again;
4519 } 5040 }
4520 5041
4521 ret = splice_to_pipe(pipe, &spd); 5042 ret = splice_to_pipe(pipe, &spd);
4522 splice_shrink_spd(&spd); 5043 splice_shrink_spd(&spd);
4523out: 5044out:
5045 mutex_unlock(&trace_types_lock);
5046
4524 return ret; 5047 return ret;
4525} 5048}
4526 5049
4527static const struct file_operations tracing_buffers_fops = { 5050static const struct file_operations tracing_buffers_fops = {
4528 .open = tracing_buffers_open, 5051 .open = tracing_buffers_open,
4529 .read = tracing_buffers_read, 5052 .read = tracing_buffers_read,
5053 .poll = tracing_buffers_poll,
4530 .release = tracing_buffers_release, 5054 .release = tracing_buffers_release,
4531 .splice_read = tracing_buffers_splice_read, 5055 .splice_read = tracing_buffers_splice_read,
4532 .llseek = no_llseek, 5056 .llseek = no_llseek,
@@ -4536,12 +5060,14 @@ static ssize_t
4536tracing_stats_read(struct file *filp, char __user *ubuf, 5060tracing_stats_read(struct file *filp, char __user *ubuf,
4537 size_t count, loff_t *ppos) 5061 size_t count, loff_t *ppos)
4538{ 5062{
4539 unsigned long cpu = (unsigned long)filp->private_data; 5063 struct trace_cpu *tc = filp->private_data;
4540 struct trace_array *tr = &global_trace; 5064 struct trace_array *tr = tc->tr;
5065 struct trace_buffer *trace_buf = &tr->trace_buffer;
4541 struct trace_seq *s; 5066 struct trace_seq *s;
4542 unsigned long cnt; 5067 unsigned long cnt;
4543 unsigned long long t; 5068 unsigned long long t;
4544 unsigned long usec_rem; 5069 unsigned long usec_rem;
5070 int cpu = tc->cpu;
4545 5071
4546 s = kmalloc(sizeof(*s), GFP_KERNEL); 5072 s = kmalloc(sizeof(*s), GFP_KERNEL);
4547 if (!s) 5073 if (!s)
@@ -4549,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4549 5075
4550 trace_seq_init(s); 5076 trace_seq_init(s);
4551 5077
4552 cnt = ring_buffer_entries_cpu(tr->buffer, cpu); 5078 cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
4553 trace_seq_printf(s, "entries: %ld\n", cnt); 5079 trace_seq_printf(s, "entries: %ld\n", cnt);
4554 5080
4555 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); 5081 cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
4556 trace_seq_printf(s, "overrun: %ld\n", cnt); 5082 trace_seq_printf(s, "overrun: %ld\n", cnt);
4557 5083
4558 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 5084 cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
4559 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 5085 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4560 5086
4561 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 5087 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
4562 trace_seq_printf(s, "bytes: %ld\n", cnt); 5088 trace_seq_printf(s, "bytes: %ld\n", cnt);
4563 5089
4564 if (trace_clocks[trace_clock_id].in_ns) { 5090 if (trace_clocks[trace_clock_id].in_ns) {
4565 /* local or global for trace_clock */ 5091 /* local or global for trace_clock */
4566 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5092 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4567 usec_rem = do_div(t, USEC_PER_SEC); 5093 usec_rem = do_div(t, USEC_PER_SEC);
4568 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", 5094 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4569 t, usec_rem); 5095 t, usec_rem);
4570 5096
4571 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 5097 t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
4572 usec_rem = do_div(t, USEC_PER_SEC); 5098 usec_rem = do_div(t, USEC_PER_SEC);
4573 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 5099 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4574 } else { 5100 } else {
4575 /* counter or tsc mode for trace_clock */ 5101 /* counter or tsc mode for trace_clock */
4576 trace_seq_printf(s, "oldest event ts: %llu\n", 5102 trace_seq_printf(s, "oldest event ts: %llu\n",
4577 ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5103 ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4578 5104
4579 trace_seq_printf(s, "now ts: %llu\n", 5105 trace_seq_printf(s, "now ts: %llu\n",
4580 ring_buffer_time_stamp(tr->buffer, cpu)); 5106 ring_buffer_time_stamp(trace_buf->buffer, cpu));
4581 } 5107 }
4582 5108
4583 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 5109 cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
4584 trace_seq_printf(s, "dropped events: %ld\n", cnt); 5110 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4585 5111
4586 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); 5112 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
4587 trace_seq_printf(s, "read events: %ld\n", cnt); 5113 trace_seq_printf(s, "read events: %ld\n", cnt);
4588 5114
4589 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5115 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4635,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {
4635 .read = tracing_read_dyn_info, 5161 .read = tracing_read_dyn_info,
4636 .llseek = generic_file_llseek, 5162 .llseek = generic_file_llseek,
4637}; 5163};
4638#endif 5164#endif /* CONFIG_DYNAMIC_FTRACE */
4639 5165
4640static struct dentry *d_tracer; 5166#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
5167static void
5168ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5169{
5170 tracing_snapshot();
5171}
4641 5172
4642struct dentry *tracing_init_dentry(void) 5173static void
5174ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5175{
5176 unsigned long *count = (long *)data;
5177
5178 if (!*count)
5179 return;
5180
5181 if (*count != -1)
5182 (*count)--;
5183
5184 tracing_snapshot();
5185}
5186
5187static int
5188ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5189 struct ftrace_probe_ops *ops, void *data)
5190{
5191 long count = (long)data;
5192
5193 seq_printf(m, "%ps:", (void *)ip);
5194
5195 seq_printf(m, "snapshot");
5196
5197 if (count == -1)
5198 seq_printf(m, ":unlimited\n");
5199 else
5200 seq_printf(m, ":count=%ld\n", count);
5201
5202 return 0;
5203}
5204
5205static struct ftrace_probe_ops snapshot_probe_ops = {
5206 .func = ftrace_snapshot,
5207 .print = ftrace_snapshot_print,
5208};
5209
5210static struct ftrace_probe_ops snapshot_count_probe_ops = {
5211 .func = ftrace_count_snapshot,
5212 .print = ftrace_snapshot_print,
5213};
5214
5215static int
5216ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
5217 char *glob, char *cmd, char *param, int enable)
4643{ 5218{
4644 static int once; 5219 struct ftrace_probe_ops *ops;
5220 void *count = (void *)-1;
5221 char *number;
5222 int ret;
4645 5223
4646 if (d_tracer) 5224 /* hash funcs only work with set_ftrace_filter */
4647 return d_tracer; 5225 if (!enable)
5226 return -EINVAL;
5227
5228 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
5229
5230 if (glob[0] == '!') {
5231 unregister_ftrace_function_probe_func(glob+1, ops);
5232 return 0;
5233 }
5234
5235 if (!param)
5236 goto out_reg;
5237
5238 number = strsep(&param, ":");
5239
5240 if (!strlen(number))
5241 goto out_reg;
5242
5243 /*
5244 * We use the callback data field (which is a pointer)
5245 * as our counter.
5246 */
5247 ret = kstrtoul(number, 0, (unsigned long *)&count);
5248 if (ret)
5249 return ret;
5250
5251 out_reg:
5252 ret = register_ftrace_function_probe(glob, ops, count);
5253
5254 if (ret >= 0)
5255 alloc_snapshot(&global_trace);
5256
5257 return ret < 0 ? ret : 0;
5258}
5259
5260static struct ftrace_func_command ftrace_snapshot_cmd = {
5261 .name = "snapshot",
5262 .func = ftrace_trace_snapshot_callback,
5263};
5264
5265static int register_snapshot_cmd(void)
5266{
5267 return register_ftrace_command(&ftrace_snapshot_cmd);
5268}
5269#else
5270static inline int register_snapshot_cmd(void) { return 0; }
5271#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
5272
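ftrace_trace_snapshot_callback() above parses an optional ":count" and, as its comment notes, stores that count directly in the probe's void * data slot instead of allocating anything. A stand-alone sketch of the same count-in-the-cookie idiom, with invented names (count_probe, cookie) and no kernel APIs; it assumes the usual case of sizeof(void *) == sizeof(long):

#include <stdio.h>

/* Probe callback: the cookie slot itself is reused as a long counter. */
static void count_probe(void **data)
{
        long *count = (long *)data;

        if (!*count)
                return;                 /* 0: trigger exhausted           */
        if (*count != -1)
                (*count)--;             /* -1 would mean "unlimited"      */
        printf("snapshot! remaining=%ld\n", *count);
}

int main(void)
{
        void *cookie = (void *)3L;      /* ":count=3" packed into the pointer */
        int i;

        for (i = 0; i < 5; i++)
                count_probe(&cookie);
        return 0;
}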
5273struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
5274{
5275 if (tr->dir)
5276 return tr->dir;
4648 5277
4649 if (!debugfs_initialized()) 5278 if (!debugfs_initialized())
4650 return NULL; 5279 return NULL;
4651 5280
4652 d_tracer = debugfs_create_dir("tracing", NULL); 5281 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5282 tr->dir = debugfs_create_dir("tracing", NULL);
4653 5283
4654 if (!d_tracer && !once) { 5284 if (!tr->dir)
4655 once = 1; 5285 pr_warn_once("Could not create debugfs directory 'tracing'\n");
4656 pr_warning("Could not create debugfs directory 'tracing'\n");
4657 return NULL;
4658 }
4659 5286
4660 return d_tracer; 5287 return tr->dir;
4661} 5288}
4662 5289
4663static struct dentry *d_percpu; 5290struct dentry *tracing_init_dentry(void)
5291{
5292 return tracing_init_dentry_tr(&global_trace);
5293}
4664 5294
4665static struct dentry *tracing_dentry_percpu(void) 5295static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
4666{ 5296{
4667 static int once;
4668 struct dentry *d_tracer; 5297 struct dentry *d_tracer;
4669 5298
4670 if (d_percpu) 5299 if (tr->percpu_dir)
4671 return d_percpu; 5300 return tr->percpu_dir;
4672
4673 d_tracer = tracing_init_dentry();
4674 5301
5302 d_tracer = tracing_init_dentry_tr(tr);
4675 if (!d_tracer) 5303 if (!d_tracer)
4676 return NULL; 5304 return NULL;
4677 5305
4678 d_percpu = debugfs_create_dir("per_cpu", d_tracer); 5306 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
4679 5307
4680 if (!d_percpu && !once) { 5308 WARN_ONCE(!tr->percpu_dir,
4681 once = 1; 5309 "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
4682 pr_warning("Could not create debugfs directory 'per_cpu'\n");
4683 return NULL;
4684 }
4685 5310
4686 return d_percpu; 5311 return tr->percpu_dir;
4687} 5312}
4688 5313
4689static void tracing_init_debugfs_percpu(long cpu) 5314static void
5315tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
4690{ 5316{
4691 struct dentry *d_percpu = tracing_dentry_percpu(); 5317 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5318 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
4692 struct dentry *d_cpu; 5319 struct dentry *d_cpu;
4693 char cpu_dir[30]; /* 30 characters should be more than enough */ 5320 char cpu_dir[30]; /* 30 characters should be more than enough */
4694 5321
@@ -4704,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)
4704 5331
4705 /* per cpu trace_pipe */ 5332 /* per cpu trace_pipe */
4706 trace_create_file("trace_pipe", 0444, d_cpu, 5333 trace_create_file("trace_pipe", 0444, d_cpu,
4707 (void *) cpu, &tracing_pipe_fops); 5334 (void *)&data->trace_cpu, &tracing_pipe_fops);
4708 5335
4709 /* per cpu trace */ 5336 /* per cpu trace */
4710 trace_create_file("trace", 0644, d_cpu, 5337 trace_create_file("trace", 0644, d_cpu,
4711 (void *) cpu, &tracing_fops); 5338 (void *)&data->trace_cpu, &tracing_fops);
4712 5339
4713 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5340 trace_create_file("trace_pipe_raw", 0444, d_cpu,
4714 (void *) cpu, &tracing_buffers_fops); 5341 (void *)&data->trace_cpu, &tracing_buffers_fops);
4715 5342
4716 trace_create_file("stats", 0444, d_cpu, 5343 trace_create_file("stats", 0444, d_cpu,
4717 (void *) cpu, &tracing_stats_fops); 5344 (void *)&data->trace_cpu, &tracing_stats_fops);
4718 5345
4719 trace_create_file("buffer_size_kb", 0444, d_cpu, 5346 trace_create_file("buffer_size_kb", 0444, d_cpu,
4720 (void *) cpu, &tracing_entries_fops); 5347 (void *)&data->trace_cpu, &tracing_entries_fops);
5348
5349#ifdef CONFIG_TRACER_SNAPSHOT
5350 trace_create_file("snapshot", 0644, d_cpu,
5351 (void *)&data->trace_cpu, &snapshot_fops);
5352
5353 trace_create_file("snapshot_raw", 0444, d_cpu,
5354 (void *)&data->trace_cpu, &snapshot_raw_fops);
5355#endif
4721} 5356}
4722 5357
4723#ifdef CONFIG_FTRACE_SELFTEST 5358#ifdef CONFIG_FTRACE_SELFTEST
@@ -4728,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)
4728struct trace_option_dentry { 5363struct trace_option_dentry {
4729 struct tracer_opt *opt; 5364 struct tracer_opt *opt;
4730 struct tracer_flags *flags; 5365 struct tracer_flags *flags;
5366 struct trace_array *tr;
4731 struct dentry *entry; 5367 struct dentry *entry;
4732}; 5368};
4733 5369
@@ -4763,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4763 5399
4764 if (!!(topt->flags->val & topt->opt->bit) != val) { 5400 if (!!(topt->flags->val & topt->opt->bit) != val) {
4765 mutex_lock(&trace_types_lock); 5401 mutex_lock(&trace_types_lock);
4766 ret = __set_tracer_option(current_trace, topt->flags, 5402 ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
4767 topt->opt, !val); 5403 topt->opt, !val);
4768 mutex_unlock(&trace_types_lock); 5404 mutex_unlock(&trace_types_lock);
4769 if (ret) 5405 if (ret)
@@ -4802,6 +5438,7 @@ static ssize_t
4802trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, 5438trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4803 loff_t *ppos) 5439 loff_t *ppos)
4804{ 5440{
5441 struct trace_array *tr = &global_trace;
4805 long index = (long)filp->private_data; 5442 long index = (long)filp->private_data;
4806 unsigned long val; 5443 unsigned long val;
4807 int ret; 5444 int ret;
@@ -4814,7 +5451,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4814 return -EINVAL; 5451 return -EINVAL;
4815 5452
4816 mutex_lock(&trace_types_lock); 5453 mutex_lock(&trace_types_lock);
4817 ret = set_tracer_flag(1 << index, val); 5454 ret = set_tracer_flag(tr, 1 << index, val);
4818 mutex_unlock(&trace_types_lock); 5455 mutex_unlock(&trace_types_lock);
4819 5456
4820 if (ret < 0) 5457 if (ret < 0)
@@ -4848,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,
4848} 5485}
4849 5486
4850 5487
4851static struct dentry *trace_options_init_dentry(void) 5488static struct dentry *trace_options_init_dentry(struct trace_array *tr)
4852{ 5489{
4853 struct dentry *d_tracer; 5490 struct dentry *d_tracer;
4854 static struct dentry *t_options;
4855 5491
4856 if (t_options) 5492 if (tr->options)
4857 return t_options; 5493 return tr->options;
4858 5494
4859 d_tracer = tracing_init_dentry(); 5495 d_tracer = tracing_init_dentry_tr(tr);
4860 if (!d_tracer) 5496 if (!d_tracer)
4861 return NULL; 5497 return NULL;
4862 5498
4863 t_options = debugfs_create_dir("options", d_tracer); 5499 tr->options = debugfs_create_dir("options", d_tracer);
4864 if (!t_options) { 5500 if (!tr->options) {
4865 pr_warning("Could not create debugfs directory 'options'\n"); 5501 pr_warning("Could not create debugfs directory 'options'\n");
4866 return NULL; 5502 return NULL;
4867 } 5503 }
4868 5504
4869 return t_options; 5505 return tr->options;
4870} 5506}
4871 5507
4872static void 5508static void
4873create_trace_option_file(struct trace_option_dentry *topt, 5509create_trace_option_file(struct trace_array *tr,
5510 struct trace_option_dentry *topt,
4874 struct tracer_flags *flags, 5511 struct tracer_flags *flags,
4875 struct tracer_opt *opt) 5512 struct tracer_opt *opt)
4876{ 5513{
4877 struct dentry *t_options; 5514 struct dentry *t_options;
4878 5515
4879 t_options = trace_options_init_dentry(); 5516 t_options = trace_options_init_dentry(tr);
4880 if (!t_options) 5517 if (!t_options)
4881 return; 5518 return;
4882 5519
4883 topt->flags = flags; 5520 topt->flags = flags;
4884 topt->opt = opt; 5521 topt->opt = opt;
5522 topt->tr = tr;
4885 5523
4886 topt->entry = trace_create_file(opt->name, 0644, t_options, topt, 5524 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
4887 &trace_options_fops); 5525 &trace_options_fops);
@@ -4889,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
4889} 5527}
4890 5528
4891static struct trace_option_dentry * 5529static struct trace_option_dentry *
4892create_trace_option_files(struct tracer *tracer) 5530create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
4893{ 5531{
4894 struct trace_option_dentry *topts; 5532 struct trace_option_dentry *topts;
4895 struct tracer_flags *flags; 5533 struct tracer_flags *flags;
@@ -4914,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)
4914 return NULL; 5552 return NULL;
4915 5553
4916 for (cnt = 0; opts[cnt].name; cnt++) 5554 for (cnt = 0; opts[cnt].name; cnt++)
4917 create_trace_option_file(&topts[cnt], flags, 5555 create_trace_option_file(tr, &topts[cnt], flags,
4918 &opts[cnt]); 5556 &opts[cnt]);
4919 5557
4920 return topts; 5558 return topts;
@@ -4937,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
4937} 5575}
4938 5576
4939static struct dentry * 5577static struct dentry *
4940create_trace_option_core_file(const char *option, long index) 5578create_trace_option_core_file(struct trace_array *tr,
5579 const char *option, long index)
4941{ 5580{
4942 struct dentry *t_options; 5581 struct dentry *t_options;
4943 5582
4944 t_options = trace_options_init_dentry(); 5583 t_options = trace_options_init_dentry(tr);
4945 if (!t_options) 5584 if (!t_options)
4946 return NULL; 5585 return NULL;
4947 5586
@@ -4949,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)
4949 &trace_options_core_fops); 5588 &trace_options_core_fops);
4950} 5589}
4951 5590
4952static __init void create_trace_options_dir(void) 5591static __init void create_trace_options_dir(struct trace_array *tr)
4953{ 5592{
4954 struct dentry *t_options; 5593 struct dentry *t_options;
4955 int i; 5594 int i;
4956 5595
4957 t_options = trace_options_init_dentry(); 5596 t_options = trace_options_init_dentry(tr);
4958 if (!t_options) 5597 if (!t_options)
4959 return; 5598 return;
4960 5599
4961 for (i = 0; trace_options[i]; i++) 5600 for (i = 0; trace_options[i]; i++)
4962 create_trace_option_core_file(trace_options[i], i); 5601 create_trace_option_core_file(tr, trace_options[i], i);
4963} 5602}
4964 5603
4965static ssize_t 5604static ssize_t
@@ -4967,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
4967 size_t cnt, loff_t *ppos) 5606 size_t cnt, loff_t *ppos)
4968{ 5607{
4969 struct trace_array *tr = filp->private_data; 5608 struct trace_array *tr = filp->private_data;
4970 struct ring_buffer *buffer = tr->buffer; 5609 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4971 char buf[64]; 5610 char buf[64];
4972 int r; 5611 int r;
4973 5612
@@ -4986,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4986 size_t cnt, loff_t *ppos) 5625 size_t cnt, loff_t *ppos)
4987{ 5626{
4988 struct trace_array *tr = filp->private_data; 5627 struct trace_array *tr = filp->private_data;
4989 struct ring_buffer *buffer = tr->buffer; 5628 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4990 unsigned long val; 5629 unsigned long val;
4991 int ret; 5630 int ret;
4992 5631
@@ -4998,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4998 mutex_lock(&trace_types_lock); 5637 mutex_lock(&trace_types_lock);
4999 if (val) { 5638 if (val) {
5000 ring_buffer_record_on(buffer); 5639 ring_buffer_record_on(buffer);
5001 if (current_trace->start) 5640 if (tr->current_trace->start)
5002 current_trace->start(tr); 5641 tr->current_trace->start(tr);
5003 } else { 5642 } else {
5004 ring_buffer_record_off(buffer); 5643 ring_buffer_record_off(buffer);
5005 if (current_trace->stop) 5644 if (tr->current_trace->stop)
5006 current_trace->stop(tr); 5645 tr->current_trace->stop(tr);
5007 } 5646 }
5008 mutex_unlock(&trace_types_lock); 5647 mutex_unlock(&trace_types_lock);
5009 } 5648 }
@@ -5020,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {
5020 .llseek = default_llseek, 5659 .llseek = default_llseek,
5021}; 5660};
5022 5661
5662struct dentry *trace_instance_dir;
5663
5664static void
5665init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5666
5667static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5668{
5669 int cpu;
5670
5671 for_each_tracing_cpu(cpu) {
5672 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5673 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5674 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5675 }
5676}
5677
5678static int
5679allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5680{
5681 enum ring_buffer_flags rb_flags;
5682
5683 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5684
5685 buf->buffer = ring_buffer_alloc(size, rb_flags);
5686 if (!buf->buffer)
5687 return -ENOMEM;
5688
5689 buf->data = alloc_percpu(struct trace_array_cpu);
5690 if (!buf->data) {
5691 ring_buffer_free(buf->buffer);
5692 return -ENOMEM;
5693 }
5694
5695 init_trace_buffers(tr, buf);
5696
5697 /* Allocate the first page for all buffers */
5698 set_buffer_entries(&tr->trace_buffer,
5699 ring_buffer_size(tr->trace_buffer.buffer, 0));
5700
5701 return 0;
5702}
5703
5704static int allocate_trace_buffers(struct trace_array *tr, int size)
5705{
5706 int ret;
5707
5708 ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
5709 if (ret)
5710 return ret;
5711
5712#ifdef CONFIG_TRACER_MAX_TRACE
5713 ret = allocate_trace_buffer(tr, &tr->max_buffer,
5714 allocate_snapshot ? size : 1);
5715 if (WARN_ON(ret)) {
5716 ring_buffer_free(tr->trace_buffer.buffer);
5717 free_percpu(tr->trace_buffer.data);
5718 return -ENOMEM;
5719 }
5720 tr->allocated_snapshot = allocate_snapshot;
5721
5722 /*
5723 * Only the top level trace array gets its snapshot allocated
5724 * from the kernel command line.
5725 */
5726 allocate_snapshot = false;
5727#endif
5728 return 0;
5729}
5730
5731static int new_instance_create(const char *name)
5732{
5733 struct trace_array *tr;
5734 int ret;
5735
5736 mutex_lock(&trace_types_lock);
5737
5738 ret = -EEXIST;
5739 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5740 if (tr->name && strcmp(tr->name, name) == 0)
5741 goto out_unlock;
5742 }
5743
5744 ret = -ENOMEM;
5745 tr = kzalloc(sizeof(*tr), GFP_KERNEL);
5746 if (!tr)
5747 goto out_unlock;
5748
5749 tr->name = kstrdup(name, GFP_KERNEL);
5750 if (!tr->name)
5751 goto out_free_tr;
5752
5753 raw_spin_lock_init(&tr->start_lock);
5754
5755 tr->current_trace = &nop_trace;
5756
5757 INIT_LIST_HEAD(&tr->systems);
5758 INIT_LIST_HEAD(&tr->events);
5759
5760 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5761 goto out_free_tr;
5762
5763 /* Holder for file callbacks */
5764 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5765 tr->trace_cpu.tr = tr;
5766
5767 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5768 if (!tr->dir)
5769 goto out_free_tr;
5770
5771 ret = event_trace_add_tracer(tr->dir, tr);
5772 if (ret)
5773 goto out_free_tr;
5774
5775 init_tracer_debugfs(tr, tr->dir);
5776
5777 list_add(&tr->list, &ftrace_trace_arrays);
5778
5779 mutex_unlock(&trace_types_lock);
5780
5781 return 0;
5782
5783 out_free_tr:
5784 if (tr->trace_buffer.buffer)
5785 ring_buffer_free(tr->trace_buffer.buffer);
5786 kfree(tr->name);
5787 kfree(tr);
5788
5789 out_unlock:
5790 mutex_unlock(&trace_types_lock);
5791
5792 return ret;
5793
5794}
5795
5796static int instance_delete(const char *name)
5797{
5798 struct trace_array *tr;
5799 int found = 0;
5800 int ret;
5801
5802 mutex_lock(&trace_types_lock);
5803
5804 ret = -ENODEV;
5805 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5806 if (tr->name && strcmp(tr->name, name) == 0) {
5807 found = 1;
5808 break;
5809 }
5810 }
5811 if (!found)
5812 goto out_unlock;
5813
5814 ret = -EBUSY;
5815 if (tr->ref)
5816 goto out_unlock;
5817
5818 list_del(&tr->list);
5819
5820 event_trace_del_tracer(tr);
5821 debugfs_remove_recursive(tr->dir);
5822 free_percpu(tr->trace_buffer.data);
5823 ring_buffer_free(tr->trace_buffer.buffer);
5824
5825 kfree(tr->name);
5826 kfree(tr);
5827
5828 ret = 0;
5829
5830 out_unlock:
5831 mutex_unlock(&trace_types_lock);
5832
5833 return ret;
5834}
5835
5836static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
5837{
5838 struct dentry *parent;
5839 int ret;
5840
5841 /* Paranoid: Make sure the parent is the "instances" directory */
5842 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5843 if (WARN_ON_ONCE(parent != trace_instance_dir))
5844 return -ENOENT;
5845
5846 /*
5847 * The inode mutex is locked, but debugfs_create_dir() will also
5848 * take the mutex. As the instances directory can not be destroyed
5849 * or changed in any other way, it is safe to unlock it, and
5850 * let the dentry try. If two users try to make the same dir at
5851 * the same time, then the new_instance_create() will determine the
5852 * winner.
5853 */
5854 mutex_unlock(&inode->i_mutex);
5855
5856 ret = new_instance_create(dentry->d_iname);
5857
5858 mutex_lock(&inode->i_mutex);
5859
5860 return ret;
5861}
5862
5863static int instance_rmdir(struct inode *inode, struct dentry *dentry)
5864{
5865 struct dentry *parent;
5866 int ret;
5867
5868 /* Paranoid: Make sure the parent is the "instances" directory */
5869 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5870 if (WARN_ON_ONCE(parent != trace_instance_dir))
5871 return -ENOENT;
5872
5873 /* The caller did a dget() on dentry */
5874 mutex_unlock(&dentry->d_inode->i_mutex);
5875
5876 /*
5877 * The inode mutex is locked, but debugfs_create_dir() will also
5878 * take the mutex. As the instances directory can not be destroyed
5879 * or changed in any other way, it is safe to unlock it, and
 5880 * let the dentry try. If two users try to remove the same dir at
5881 * the same time, then the instance_delete() will determine the
5882 * winner.
5883 */
5884 mutex_unlock(&inode->i_mutex);
5885
5886 ret = instance_delete(dentry->d_iname);
5887
5888 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
5889 mutex_lock(&dentry->d_inode->i_mutex);
5890
5891 return ret;
5892}
5893
5894static const struct inode_operations instance_dir_inode_operations = {
5895 .lookup = simple_lookup,
5896 .mkdir = instance_mkdir,
5897 .rmdir = instance_rmdir,
5898};
5899
5900static __init void create_trace_instances(struct dentry *d_tracer)
5901{
5902 trace_instance_dir = debugfs_create_dir("instances", d_tracer);
5903 if (WARN_ON(!trace_instance_dir))
5904 return;
5905
5906 /* Hijack the dir inode operations, to allow mkdir */
5907 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
5908}
5909
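create_trace_instances() above wires mkdir/rmdir on the new "instances" directory straight into new_instance_create()/instance_delete(), so an independent trace buffer can be made from user space with plain mkdir(2). A minimal sketch; the /sys/kernel/debug mount point and the instance name "example" are assumptions:

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

#define INSTANCE "/sys/kernel/debug/tracing/instances/example"

int main(void)
{
        /* mkdir ends up in new_instance_create() */
        if (mkdir(INSTANCE, 0755) && errno != EEXIST) {
                perror("mkdir " INSTANCE);
                return 1;
        }
        printf("created %s (own trace, trace_pipe, events/, ...)\n", INSTANCE);

        /* rmdir ends up in instance_delete(); -EBUSY while readers hold a ref */
        if (rmdir(INSTANCE))
                perror("rmdir " INSTANCE);
        return 0;
}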
5910static void
5911init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5912{
5913 int cpu;
5914
5915 trace_create_file("trace_options", 0644, d_tracer,
5916 tr, &tracing_iter_fops);
5917
5918 trace_create_file("trace", 0644, d_tracer,
5919 (void *)&tr->trace_cpu, &tracing_fops);
5920
5921 trace_create_file("trace_pipe", 0444, d_tracer,
5922 (void *)&tr->trace_cpu, &tracing_pipe_fops);
5923
5924 trace_create_file("buffer_size_kb", 0644, d_tracer,
5925 (void *)&tr->trace_cpu, &tracing_entries_fops);
5926
5927 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5928 tr, &tracing_total_entries_fops);
5929
5930 trace_create_file("free_buffer", 0644, d_tracer,
5931 tr, &tracing_free_buffer_fops);
5932
5933 trace_create_file("trace_marker", 0220, d_tracer,
5934 tr, &tracing_mark_fops);
5935
5936 trace_create_file("trace_clock", 0644, d_tracer, tr,
5937 &trace_clock_fops);
5938
5939 trace_create_file("tracing_on", 0644, d_tracer,
5940 tr, &rb_simple_fops);
5941
5942#ifdef CONFIG_TRACER_SNAPSHOT
5943 trace_create_file("snapshot", 0644, d_tracer,
5944 (void *)&tr->trace_cpu, &snapshot_fops);
5945#endif
5946
5947 for_each_tracing_cpu(cpu)
5948 tracing_init_debugfs_percpu(tr, cpu);
5949
5950}
5951
5023static __init int tracer_init_debugfs(void) 5952static __init int tracer_init_debugfs(void)
5024{ 5953{
5025 struct dentry *d_tracer; 5954 struct dentry *d_tracer;
5026 int cpu;
5027 5955
5028 trace_access_lock_init(); 5956 trace_access_lock_init();
5029 5957
5030 d_tracer = tracing_init_dentry(); 5958 d_tracer = tracing_init_dentry();
5959 if (!d_tracer)
5960 return 0;
5031 5961
5032 trace_create_file("trace_options", 0644, d_tracer, 5962 init_tracer_debugfs(&global_trace, d_tracer);
5033 NULL, &tracing_iter_fops);
5034 5963
5035 trace_create_file("tracing_cpumask", 0644, d_tracer, 5964 trace_create_file("tracing_cpumask", 0644, d_tracer,
5036 NULL, &tracing_cpumask_fops); 5965 &global_trace, &tracing_cpumask_fops);
5037
5038 trace_create_file("trace", 0644, d_tracer,
5039 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
5040 5966
5041 trace_create_file("available_tracers", 0444, d_tracer, 5967 trace_create_file("available_tracers", 0444, d_tracer,
5042 &global_trace, &show_traces_fops); 5968 &global_trace, &show_traces_fops);
@@ -5055,44 +5981,17 @@ static __init int tracer_init_debugfs(void)
5055 trace_create_file("README", 0444, d_tracer, 5981 trace_create_file("README", 0444, d_tracer,
5056 NULL, &tracing_readme_fops); 5982 NULL, &tracing_readme_fops);
5057 5983
5058 trace_create_file("trace_pipe", 0444, d_tracer,
5059 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
5060
5061 trace_create_file("buffer_size_kb", 0644, d_tracer,
5062 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
5063
5064 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5065 &global_trace, &tracing_total_entries_fops);
5066
5067 trace_create_file("free_buffer", 0644, d_tracer,
5068 &global_trace, &tracing_free_buffer_fops);
5069
5070 trace_create_file("trace_marker", 0220, d_tracer,
5071 NULL, &tracing_mark_fops);
5072
5073 trace_create_file("saved_cmdlines", 0444, d_tracer, 5984 trace_create_file("saved_cmdlines", 0444, d_tracer,
5074 NULL, &tracing_saved_cmdlines_fops); 5985 NULL, &tracing_saved_cmdlines_fops);
5075 5986
5076 trace_create_file("trace_clock", 0644, d_tracer, NULL,
5077 &trace_clock_fops);
5078
5079 trace_create_file("tracing_on", 0644, d_tracer,
5080 &global_trace, &rb_simple_fops);
5081
5082#ifdef CONFIG_DYNAMIC_FTRACE 5987#ifdef CONFIG_DYNAMIC_FTRACE
5083 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 5988 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
5084 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5989 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
5085#endif 5990#endif
5086 5991
5087#ifdef CONFIG_TRACER_SNAPSHOT 5992 create_trace_instances(d_tracer);
5088 trace_create_file("snapshot", 0644, d_tracer,
5089 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5090#endif
5091 5993
5092 create_trace_options_dir(); 5994 create_trace_options_dir(&global_trace);
5093
5094 for_each_tracing_cpu(cpu)
5095 tracing_init_debugfs_percpu(cpu);
5096 5995
5097 return 0; 5996 return 0;
5098} 5997}
@@ -5148,8 +6047,8 @@ void
5148trace_printk_seq(struct trace_seq *s) 6047trace_printk_seq(struct trace_seq *s)
5149{ 6048{
5150 /* Probably should print a warning here. */ 6049 /* Probably should print a warning here. */
5151 if (s->len >= 1000) 6050 if (s->len >= TRACE_MAX_PRINT)
5152 s->len = 1000; 6051 s->len = TRACE_MAX_PRINT;
5153 6052
5154 /* should be zero ended, but we are paranoid. */ 6053 /* should be zero ended, but we are paranoid. */
5155 s->buffer[s->len] = 0; 6054 s->buffer[s->len] = 0;
@@ -5162,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)
5162void trace_init_global_iter(struct trace_iterator *iter) 6061void trace_init_global_iter(struct trace_iterator *iter)
5163{ 6062{
5164 iter->tr = &global_trace; 6063 iter->tr = &global_trace;
5165 iter->trace = current_trace; 6064 iter->trace = iter->tr->current_trace;
5166 iter->cpu_file = TRACE_PIPE_ALL_CPU; 6065 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6066 iter->trace_buffer = &global_trace.trace_buffer;
5167} 6067}
5168 6068
5169static void 6069void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5170__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5171{ 6070{
5172 static arch_spinlock_t ftrace_dump_lock =
5173 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
5174 /* use static because iter can be a bit big for the stack */ 6071 /* use static because iter can be a bit big for the stack */
5175 static struct trace_iterator iter; 6072 static struct trace_iterator iter;
6073 static atomic_t dump_running;
5176 unsigned int old_userobj; 6074 unsigned int old_userobj;
5177 static int dump_ran;
5178 unsigned long flags; 6075 unsigned long flags;
5179 int cnt = 0, cpu; 6076 int cnt = 0, cpu;
5180 6077
5181 /* only one dump */ 6078 /* Only allow one dump user at a time. */
5182 local_irq_save(flags); 6079 if (atomic_inc_return(&dump_running) != 1) {
5183 arch_spin_lock(&ftrace_dump_lock); 6080 atomic_dec(&dump_running);
5184 if (dump_ran) 6081 return;
5185 goto out; 6082 }
5186
5187 dump_ran = 1;
5188 6083
6084 /*
6085 * Always turn off tracing when we dump.
6086 * We don't need to show trace output of what happens
6087 * between multiple crashes.
6088 *
6089 * If the user does a sysrq-z, then they can re-enable
6090 * tracing with echo 1 > tracing_on.
6091 */
5189 tracing_off(); 6092 tracing_off();
5190 6093
5191 /* Did function tracer already get disabled? */ 6094 local_irq_save(flags);
5192 if (ftrace_is_dead()) {
5193 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5194 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5195 }
5196
5197 if (disable_tracing)
5198 ftrace_kill();
5199 6095
5200 /* Simulate the iterator */ 6096 /* Simulate the iterator */
5201 trace_init_global_iter(&iter); 6097 trace_init_global_iter(&iter);
5202 6098
5203 for_each_tracing_cpu(cpu) { 6099 for_each_tracing_cpu(cpu) {
5204 atomic_inc(&iter.tr->data[cpu]->disabled); 6100 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
5205 } 6101 }
5206 6102
5207 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6103 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -5211,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5211 6107
5212 switch (oops_dump_mode) { 6108 switch (oops_dump_mode) {
5213 case DUMP_ALL: 6109 case DUMP_ALL:
5214 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6110 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5215 break; 6111 break;
5216 case DUMP_ORIG: 6112 case DUMP_ORIG:
5217 iter.cpu_file = raw_smp_processor_id(); 6113 iter.cpu_file = raw_smp_processor_id();
@@ -5220,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5220 goto out_enable; 6116 goto out_enable;
5221 default: 6117 default:
5222 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); 6118 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
5223 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6119 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5224 } 6120 }
5225 6121
5226 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 6122 printk(KERN_TRACE "Dumping ftrace buffer:\n");
5227 6123
6124 /* Did function tracer already get disabled? */
6125 if (ftrace_is_dead()) {
6126 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
6127 printk("# MAY BE MISSING FUNCTION EVENTS\n");
6128 }
6129
5228 /* 6130 /*
5229 * We need to stop all tracing on all CPUS to read the 6131 * We need to stop all tracing on all CPUS to read the
 5230 * next buffer. This is a bit expensive, but is 6132
@@ -5264,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5264 printk(KERN_TRACE "---------------------------------\n"); 6166 printk(KERN_TRACE "---------------------------------\n");
5265 6167
5266 out_enable: 6168 out_enable:
5267 /* Re-enable tracing if requested */ 6169 trace_flags |= old_userobj;
5268 if (!disable_tracing) {
5269 trace_flags |= old_userobj;
5270 6170
5271 for_each_tracing_cpu(cpu) { 6171 for_each_tracing_cpu(cpu) {
5272 atomic_dec(&iter.tr->data[cpu]->disabled); 6172 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
5273 }
5274 tracing_on();
5275 } 6173 }
5276 6174 atomic_dec(&dump_running);
5277 out:
5278 arch_spin_unlock(&ftrace_dump_lock);
5279 local_irq_restore(flags); 6175 local_irq_restore(flags);
5280} 6176}
5281
5282/* By default: disable tracing after the dump */
5283void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5284{
5285 __ftrace_dump(true, oops_dump_mode);
5286}
5287EXPORT_SYMBOL_GPL(ftrace_dump); 6177EXPORT_SYMBOL_GPL(ftrace_dump);
5288 6178
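The ftrace_dump() rework above drops the arch spinlock plus dump_ran flag in favour of an atomic gate: atomic_inc_return(&dump_running) != 1 means another CPU is already dumping, so the late caller simply backs out. The same shape in C11 atomics, with invented names (dump_running here is an ordinary user-space variable, dump_buffers() a stand-in for the dump body):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dump_running;

static void dump_buffers(void)
{
        /* First caller moves 0 -> 1; any concurrent caller backs out. */
        if (atomic_fetch_add(&dump_running, 1) != 0) {
                atomic_fetch_sub(&dump_running, 1);
                return;
        }

        printf("Dumping ftrace buffer:\n");
        /* ... walk and print the per-cpu buffers here ... */

        atomic_fetch_sub(&dump_running, 1);     /* let a later dump run */
}

int main(void)
{
        dump_buffers();
        dump_buffers();         /* sequential calls still work; only overlap is rejected */
        return 0;
}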
5289__init static int tracer_alloc_buffers(void) 6179__init static int tracer_alloc_buffers(void)
5290{ 6180{
5291 int ring_buf_size; 6181 int ring_buf_size;
5292 enum ring_buffer_flags rb_flags;
5293 int i;
5294 int ret = -ENOMEM; 6182 int ret = -ENOMEM;
5295 6183
5296 6184
@@ -5311,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)
5311 else 6199 else
5312 ring_buf_size = 1; 6200 ring_buf_size = 1;
5313 6201
5314 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5315
5316 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6202 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
5317 cpumask_copy(tracing_cpumask, cpu_all_mask); 6203 cpumask_copy(tracing_cpumask, cpu_all_mask);
5318 6204
6205 raw_spin_lock_init(&global_trace.start_lock);
6206
5319 /* TODO: make the number of buffers hot pluggable with CPUS */ 6207 /* TODO: make the number of buffers hot pluggable with CPUS */
5320 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); 6208 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
5321 if (!global_trace.buffer) {
5322 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6209 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
5323 WARN_ON(1); 6210 WARN_ON(1);
5324 goto out_free_cpumask; 6211 goto out_free_cpumask;
5325 } 6212 }
6213
5326 if (global_trace.buffer_disabled) 6214 if (global_trace.buffer_disabled)
5327 tracing_off(); 6215 tracing_off();
5328 6216
5329
5330#ifdef CONFIG_TRACER_MAX_TRACE
5331 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
5332 if (!max_tr.buffer) {
5333 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
5334 WARN_ON(1);
5335 ring_buffer_free(global_trace.buffer);
5336 goto out_free_cpumask;
5337 }
5338#endif
5339
5340 /* Allocate the first page for all buffers */
5341 for_each_tracing_cpu(i) {
5342 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
5343 max_tr.data[i] = &per_cpu(max_tr_data, i);
5344 }
5345
5346 set_buffer_entries(&global_trace,
5347 ring_buffer_size(global_trace.buffer, 0));
5348#ifdef CONFIG_TRACER_MAX_TRACE
5349 set_buffer_entries(&max_tr, 1);
5350#endif
5351
5352 trace_init_cmdlines(); 6217 trace_init_cmdlines();
5353 init_irq_work(&trace_work_wakeup, trace_wake_up);
5354 6218
5355 register_tracer(&nop_trace); 6219 register_tracer(&nop_trace);
5356 6220
6221 global_trace.current_trace = &nop_trace;
6222
5357 /* All seems OK, enable tracing */ 6223 /* All seems OK, enable tracing */
5358 tracing_disabled = 0; 6224 tracing_disabled = 0;
5359 6225
@@ -5362,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)
5362 6228
5363 register_die_notifier(&trace_die_notifier); 6229 register_die_notifier(&trace_die_notifier);
5364 6230
6231 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6232
6233 /* Holder for file callbacks */
6234 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6235 global_trace.trace_cpu.tr = &global_trace;
6236
6237 INIT_LIST_HEAD(&global_trace.systems);
6238 INIT_LIST_HEAD(&global_trace.events);
6239 list_add(&global_trace.list, &ftrace_trace_arrays);
6240
5365 while (trace_boot_options) { 6241 while (trace_boot_options) {
5366 char *option; 6242 char *option;
5367 6243
5368 option = strsep(&trace_boot_options, ","); 6244 option = strsep(&trace_boot_options, ",");
5369 trace_set_options(option); 6245 trace_set_options(&global_trace, option);
5370 } 6246 }
5371 6247
6248 register_snapshot_cmd();
6249
5372 return 0; 6250 return 0;
5373 6251
5374out_free_cpumask: 6252out_free_cpumask:
6253 free_percpu(global_trace.trace_buffer.data);
6254#ifdef CONFIG_TRACER_MAX_TRACE
6255 free_percpu(global_trace.max_buffer.data);
6256#endif
5375 free_cpumask_var(tracing_cpumask); 6257 free_cpumask_var(tracing_cpumask);
5376out_free_buffer_mask: 6258out_free_buffer_mask:
5377 free_cpumask_var(tracing_buffer_mask); 6259 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2081971367ea..711ca7d3e7f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
13#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
14#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
15 15
16#ifdef CONFIG_FTRACE_SYSCALLS
17#include <asm/unistd.h> /* For NR_SYSCALLS */
18#include <asm/syscall.h> /* some archs define it here */
19#endif
20
16enum trace_type { 21enum trace_type {
17 __TRACE_FIRST_TYPE = 0, 22 __TRACE_FIRST_TYPE = 0,
18 23
@@ -29,6 +34,7 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 35 TRACE_USER_STACK,
31 TRACE_BLK, 36 TRACE_BLK,
37 TRACE_BPUTS,
32 38
33 __TRACE_LAST_TYPE, 39 __TRACE_LAST_TYPE,
34}; 40};
@@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 109 unsigned long ret_ip;
104}; 110};
105 111
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
111/* 112/*
112 * trace_flag_type is an enumeration that holds different 113 * trace_flag_type is an enumeration that holds different
113 * states when a trace occurs. These are: 114 * states when a trace occurs. These are:
@@ -127,12 +128,21 @@ enum trace_flag_type {
127 128
128#define TRACE_BUF_SIZE 1024 129#define TRACE_BUF_SIZE 1024
129 130
131struct trace_array;
132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
130/* 139/*
131 * The CPU trace array - it consists of thousands of trace entries 140 * The CPU trace array - it consists of thousands of trace entries
132 * plus some other descriptor data: (for example which task started 141 * plus some other descriptor data: (for example which task started
133 * the trace, etc.) 142 * the trace, etc.)
134 */ 143 */
135struct trace_array_cpu { 144struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
136 atomic_t disabled; 146 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 147 void *buffer_page; /* ring buffer spare */
138 148
@@ -151,20 +161,83 @@ struct trace_array_cpu {
151 char comm[TASK_COMM_LEN]; 161 char comm[TASK_COMM_LEN];
152}; 162};
153 163
164struct tracer;
165
166struct trace_buffer {
167 struct trace_array *tr;
168 struct ring_buffer *buffer;
169 struct trace_array_cpu __percpu *data;
170 cycle_t time_start;
171 int cpu;
172};
173
154/* 174/*
155 * The trace array - an array of per-CPU trace arrays. This is the 175 * The trace array - an array of per-CPU trace arrays. This is the
156 * highest level data structure that individual tracers deal with. 176 * highest level data structure that individual tracers deal with.
157 * They have on/off state as well: 177 * They have on/off state as well:
158 */ 178 */
159struct trace_array { 179struct trace_array {
160 struct ring_buffer *buffer; 180 struct list_head list;
161 int cpu; 181 char *name;
182 struct trace_buffer trace_buffer;
183#ifdef CONFIG_TRACER_MAX_TRACE
184 /*
185 * The max_buffer is used to snapshot the trace when a maximum
186 * latency is reached, or when the user initiates a snapshot.
187 * Some tracers will use this to store a maximum trace while
 188 * they continue examining live traces.
 189 *
 190 * The buffers for the max_buffer are set up the same as the trace_buffer.
191 * When a snapshot is taken, the buffer of the max_buffer is swapped
192 * with the buffer of the trace_buffer and the buffers are reset for
193 * the trace_buffer so the tracing can continue.
194 */
195 struct trace_buffer max_buffer;
196 bool allocated_snapshot;
197#endif
162 int buffer_disabled; 198 int buffer_disabled;
163 cycle_t time_start; 199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter;
202 int sys_refcount_exit;
203 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
204 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
205#endif
206 int stop_count;
207 int clock_id;
208 struct tracer *current_trace;
209 unsigned int flags;
210 raw_spinlock_t start_lock;
211 struct dentry *dir;
212 struct dentry *options;
213 struct dentry *percpu_dir;
214 struct dentry *event_dir;
215 struct list_head systems;
216 struct list_head events;
164 struct task_struct *waiter; 217 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 218 int ref;
219};
220
221enum {
222 TRACE_ARRAY_FL_GLOBAL = (1 << 0)
166}; 223};
167 224
225extern struct list_head ftrace_trace_arrays;
226
227/*
228 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway.
230 */
231static inline struct trace_array *top_trace_array(void)
232{
233 struct trace_array *tr;
234
235 tr = list_entry(ftrace_trace_arrays.prev,
236 typeof(*tr), list);
237 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
238 return tr;
239}
240
168#define FTRACE_CMP_TYPE(var, type) \ 241#define FTRACE_CMP_TYPE(var, type) \
169 __builtin_types_compatible_p(typeof(var), type *) 242 __builtin_types_compatible_p(typeof(var), type *)
170 243
@@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);
200 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 273 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
201 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 274 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
202 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 275 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
276 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
203 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 277 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
204 TRACE_MMIO_RW); \ 278 TRACE_MMIO_RW); \
205 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 279 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -289,9 +363,10 @@ struct tracer {
289 struct tracer *next; 363 struct tracer *next;
290 struct tracer_flags *flags; 364 struct tracer_flags *flags;
291 bool print_max; 365 bool print_max;
292 bool use_max_tr;
293 bool allocated_snapshot;
294 bool enabled; 366 bool enabled;
367#ifdef CONFIG_TRACER_MAX_TRACE
368 bool use_max_tr;
369#endif
295}; 370};
296 371
297 372
@@ -427,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)
427 current->trace_recursion = val; 502 current->trace_recursion = val;
428} 503}
429 504
430#define TRACE_PIPE_ALL_CPU -1
431
432static inline struct ring_buffer_iter * 505static inline struct ring_buffer_iter *
433trace_buffer_iter(struct trace_iterator *iter, int cpu) 506trace_buffer_iter(struct trace_iterator *iter, int cpu)
434{ 507{
@@ -439,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
439 512
440int tracer_init(struct tracer *t, struct trace_array *tr); 513int tracer_init(struct tracer *t, struct trace_array *tr);
441int tracing_is_enabled(void); 514int tracing_is_enabled(void);
442void tracing_reset(struct trace_array *tr, int cpu); 515void tracing_reset(struct trace_buffer *buf, int cpu);
443void tracing_reset_online_cpus(struct trace_array *tr); 516void tracing_reset_online_cpus(struct trace_buffer *buf);
444void tracing_reset_current(int cpu); 517void tracing_reset_current(int cpu);
445void tracing_reset_current_online_cpus(void); 518void tracing_reset_all_online_cpus(void);
446int tracing_open_generic(struct inode *inode, struct file *filp); 519int tracing_open_generic(struct inode *inode, struct file *filp);
447struct dentry *trace_create_file(const char *name, 520struct dentry *trace_create_file(const char *name,
448 umode_t mode, 521 umode_t mode,
@@ -450,6 +523,7 @@ struct dentry *trace_create_file(const char *name,
450 void *data, 523 void *data,
451 const struct file_operations *fops); 524 const struct file_operations *fops);
452 525
526struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
453struct dentry *tracing_init_dentry(void); 527struct dentry *tracing_init_dentry(void);
454 528
455struct ring_buffer_event; 529struct ring_buffer_event;
@@ -583,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
583#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 657#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
584extern int DYN_FTRACE_TEST_NAME2(void); 658extern int DYN_FTRACE_TEST_NAME2(void);
585 659
586extern int ring_buffer_expanded; 660extern bool ring_buffer_expanded;
587extern bool tracing_selftest_disabled; 661extern bool tracing_selftest_disabled;
588DECLARE_PER_CPU(int, ftrace_cpu_disabled); 662DECLARE_PER_CPU(int, ftrace_cpu_disabled);
589 663
@@ -619,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,
619 unsigned long ip, const char *fmt, va_list args); 693 unsigned long ip, const char *fmt, va_list args);
620int trace_array_printk(struct trace_array *tr, 694int trace_array_printk(struct trace_array *tr,
621 unsigned long ip, const char *fmt, ...); 695 unsigned long ip, const char *fmt, ...);
696int trace_array_printk_buf(struct ring_buffer *buffer,
697 unsigned long ip, const char *fmt, ...);
622void trace_printk_seq(struct trace_seq *s); 698void trace_printk_seq(struct trace_seq *s);
623enum print_line_t print_trace_line(struct trace_iterator *iter); 699enum print_line_t print_trace_line(struct trace_iterator *iter);
624 700
@@ -786,6 +862,7 @@ enum trace_iterator_flags {
786 TRACE_ITER_STOP_ON_FREE = 0x400000, 862 TRACE_ITER_STOP_ON_FREE = 0x400000,
787 TRACE_ITER_IRQ_INFO = 0x800000, 863 TRACE_ITER_IRQ_INFO = 0x800000,
788 TRACE_ITER_MARKERS = 0x1000000, 864 TRACE_ITER_MARKERS = 0x1000000,
865 TRACE_ITER_FUNCTION = 0x2000000,
789}; 866};
790 867
791/* 868/*
@@ -832,8 +909,8 @@ enum {
832 909
833struct ftrace_event_field { 910struct ftrace_event_field {
834 struct list_head link; 911 struct list_head link;
835 char *name; 912 const char *name;
836 char *type; 913 const char *type;
837 int filter_type; 914 int filter_type;
838 int offset; 915 int offset;
839 int size; 916 int size;
@@ -851,12 +928,19 @@ struct event_filter {
851struct event_subsystem { 928struct event_subsystem {
852 struct list_head list; 929 struct list_head list;
853 const char *name; 930 const char *name;
854 struct dentry *entry;
855 struct event_filter *filter; 931 struct event_filter *filter;
856 int nr_events;
857 int ref_count; 932 int ref_count;
858}; 933};
859 934
935struct ftrace_subsystem_dir {
936 struct list_head list;
937 struct event_subsystem *subsystem;
938 struct trace_array *tr;
939 struct dentry *entry;
940 int ref_count;
941 int nr_events;
942};
943
860#define FILTER_PRED_INVALID ((unsigned short)-1) 944#define FILTER_PRED_INVALID ((unsigned short)-1)
861#define FILTER_PRED_IS_RIGHT (1 << 15) 945#define FILTER_PRED_IS_RIGHT (1 << 15)
862#define FILTER_PRED_FOLD (1 << 15) 946#define FILTER_PRED_FOLD (1 << 15)
@@ -906,22 +990,20 @@ struct filter_pred {
906 unsigned short right; 990 unsigned short right;
907}; 991};
908 992
909extern struct list_head ftrace_common_fields;
910
911extern enum regex_type 993extern enum regex_type
912filter_parse_regex(char *buff, int len, char **search, int *not); 994filter_parse_regex(char *buff, int len, char **search, int *not);
913extern void print_event_filter(struct ftrace_event_call *call, 995extern void print_event_filter(struct ftrace_event_call *call,
914 struct trace_seq *s); 996 struct trace_seq *s);
915extern int apply_event_filter(struct ftrace_event_call *call, 997extern int apply_event_filter(struct ftrace_event_call *call,
916 char *filter_string); 998 char *filter_string);
917extern int apply_subsystem_event_filter(struct event_subsystem *system, 999extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
918 char *filter_string); 1000 char *filter_string);
919extern void print_subsystem_event_filter(struct event_subsystem *system, 1001extern void print_subsystem_event_filter(struct event_subsystem *system,
920 struct trace_seq *s); 1002 struct trace_seq *s);
921extern int filter_assign_type(const char *type); 1003extern int filter_assign_type(const char *type);
922 1004
923struct list_head * 1005struct ftrace_event_field *
924trace_get_fields(struct ftrace_event_call *event_call); 1006trace_find_event_field(struct ftrace_event_call *call, char *name);
925 1007
926static inline int 1008static inline int
927filter_check_discard(struct ftrace_event_call *call, void *rec, 1009filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -938,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
938} 1020}
939 1021
940extern void trace_event_enable_cmd_record(bool enable); 1022extern void trace_event_enable_cmd_record(bool enable);
1023extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1024extern int event_trace_del_tracer(struct trace_array *tr);
941 1025
942extern struct mutex event_mutex; 1026extern struct mutex event_mutex;
943extern struct list_head ftrace_events; 1027extern struct list_head ftrace_events;
@@ -948,7 +1032,18 @@ extern const char *__stop___trace_bprintk_fmt[];
948void trace_printk_init_buffers(void); 1032void trace_printk_init_buffers(void);
949void trace_printk_start_comm(void); 1033void trace_printk_start_comm(void);
950int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); 1034int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
951int set_tracer_flag(unsigned int mask, int enabled); 1035int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1036
1037/*
1038 * Normal trace_printk() and friends allocate special buffers
1039 * to do the manipulation, as well as save the print formats
1040 * into sections to display. But the trace infrastructure wants
1041 * to use these without the added overhead, at the price of being
1042 * a bit slower (used mainly for warnings, where we don't care
1043 * about performance). internal_trace_puts() is for such
1044 * a purpose.
1045 */
1046#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
952 1047
953#undef FTRACE_ENTRY 1048#undef FTRACE_ENTRY
954#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 1049#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
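The internal_trace_puts() comment above boils down to trading the flexibility of trace_printk() for a cheap, fixed-cost call. A minimal userspace model of that trade-off (the model_* names are hypothetical and not kernel API; this is a sketch, not the kernel's implementation):

#include <stdio.h>
#include <string.h>

/* Model only: a puts-style sink takes a string and a precomputed
 * length, so no printf-style parsing or format-buffer allocation
 * happens on the call path. */
static void model_trace_puts(unsigned long ip, const char *str, size_t len)
{
	printf("[%#lx] %.*s", ip, (int)len, str);
}

/* strlen() on a string literal typically folds to a constant at
 * compile time, which is what internal_trace_puts() relies on. */
#define model_internal_puts(str) \
	model_trace_puts(0x1234UL, str, strlen(str))

int main(void)
{
	model_internal_puts("tracer warning: buffer resized\n");
	return 0;
}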
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch; 33 struct ftrace_event_call *call = &event_branch;
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct trace_array_cpu *data;
35 struct ring_buffer_event *event; 36 struct ring_buffer_event *event;
36 struct trace_branch *entry; 37 struct trace_branch *entry;
37 struct ring_buffer *buffer; 38 struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
51 52
52 local_irq_save(flags); 53 local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 54 cpu = raw_smp_processor_id();
54 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 55 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
56 if (atomic_inc_return(&data->disabled) != 1)
55 goto out; 57 goto out;
56 58
57 pc = preempt_count(); 59 pc = preempt_count();
58 buffer = tr->buffer; 60 buffer = tr->trace_buffer.buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, 61 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
60 sizeof(*entry), flags, pc); 62 sizeof(*entry), flags, pc);
61 if (!event) 63 if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
80 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
81 83
82 out: 84 out:
83 atomic_dec(&tr->data[cpu]->disabled); 85 atomic_dec(&data->disabled);
84 local_irq_restore(flags); 86 local_irq_restore(flags);
85} 87}
86 88
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
57 return local_clock(); 57 return local_clock();
58} 58}
59 59
60/*
61 * trace_clock_jiffies(): Simply use jiffies as a clock counter.
62 */
63u64 notrace trace_clock_jiffies(void)
64{
65 u64 jiffy = jiffies - INITIAL_JIFFIES;
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69}
60 70
61/* 71/*
62 * trace_clock_global(): special globally coherent trace clock 72 * trace_clock_global(): special globally coherent trace clock
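The new trace_clock_jiffies() above turns a jiffy count into nanoseconds by way of microseconds. A small userspace model of that arithmetic (MODEL_HZ is an assumed tick rate; the kernel uses jiffies_to_usecs() rather than open-coding the division):

#include <stdio.h>
#include <stdint.h>

#define MODEL_HZ 250ULL			/* assumed tick rate */

static uint64_t model_clock_jiffies(uint64_t ticks_since_boot)
{
	uint64_t usecs = ticks_since_boot * (1000000ULL / MODEL_HZ);

	return usecs * 1000ULL;		/* return nanoseconds */
}

int main(void)
{
	/* 500 ticks at 250 Hz is two seconds. */
	printf("%llu ns\n", (unsigned long long)model_clock_jiffies(500));
	return 0;
}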
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%08lx fmt:%p", 226 F_printk("%pf: %s",
227 __entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
230); 230);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%08lx %s", 241 F_printk("%pf: %s",
242 __entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243
244 FILTER_OTHER
245);
246
247FTRACE_ENTRY(bputs, bputs_entry,
248
249 TRACE_BPUTS,
250
251 F_STRUCT(
252 __field( unsigned long, ip )
253 __field( const char *, str )
254 ),
255
256 F_printk("%pf: %s",
257 (void *)__entry->ip, __entry->str),
243 258
244 FILTER_OTHER 259 FILTER_OTHER
245); 260);
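The new bputs entry above carries only an instruction pointer and a pointer to a constant string, whereas the print entry copies the text into the ring buffer through a dynamic array. A userspace model of the resulting size difference (struct names are illustrative, not the kernel's layout):

#include <stdio.h>
#include <string.h>

struct model_bputs_entry {
	unsigned long	ip;
	const char	*str;		/* must outlive the ring buffer */
};

struct model_print_entry {
	unsigned long	ip;
	char		buf[];		/* text copied into the record */
};

int main(void)
{
	const char *msg = "hello from the tracer\n";

	printf("bputs record: %zu bytes\n", sizeof(struct model_bputs_entry));
	printf("print record: %zu bytes\n",
	       sizeof(struct model_print_entry) + strlen(msg) + 1);
	return 0;
}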
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..53582e982e51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage); 34EXPORT_SYMBOL_GPL(event_storage);
35 35
36LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
37LIST_HEAD(ftrace_common_fields); 37static LIST_HEAD(ftrace_common_fields);
38 38
39struct list_head * 39#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
40
41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep;
43
44/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
47 list_for_each_entry(file, &tr->events, list)
48
49#define do_for_each_event_file_safe(tr, file) \
50 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
51 struct ftrace_event_file *___n; \
52 list_for_each_entry_safe(file, ___n, &tr->events, list)
53
54#define while_for_each_event_file() \
55 }
56
57static struct list_head *
40trace_get_fields(struct ftrace_event_call *event_call) 58trace_get_fields(struct ftrace_event_call *event_call)
41{ 59{
42 if (!event_call->class->get_fields) 60 if (!event_call->class->get_fields)
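A usage sketch for the double-loop helpers defined above (sketch only, assuming the surrounding trace_events.c context): the body runs once per event file of every trace_array, and a break in the body leaves only the inner loop, which is the "jump to the next trace_array" behaviour relied on later in this patch.

/* Sketch: disable every event file in every trace instance. */
static void sketch_disable_all_instances(void)
{
	struct trace_array *tr;
	struct ftrace_event_file *file;

	mutex_lock(&event_mutex);
	do_for_each_event_file(tr, file) {
		if (!file->event_call)
			continue;	/* next file in this trace_array */
		ftrace_event_enable_disable(file, 0);
	} while_for_each_event_file();
	mutex_unlock(&event_mutex);
}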
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
44 return event_call->class->get_fields(event_call); 62 return event_call->class->get_fields(event_call);
45} 63}
46 64
65static struct ftrace_event_field *
66__find_event_field(struct list_head *head, char *name)
67{
68 struct ftrace_event_field *field;
69
70 list_for_each_entry(field, head, link) {
71 if (!strcmp(field->name, name))
72 return field;
73 }
74
75 return NULL;
76}
77
78struct ftrace_event_field *
79trace_find_event_field(struct ftrace_event_call *call, char *name)
80{
81 struct ftrace_event_field *field;
82 struct list_head *head;
83
84 field = __find_event_field(&ftrace_common_fields, name);
85 if (field)
86 return field;
87
88 head = trace_get_fields(call);
89 return __find_event_field(head, name);
90}
91
47static int __trace_define_field(struct list_head *head, const char *type, 92static int __trace_define_field(struct list_head *head, const char *type,
48 const char *name, int offset, int size, 93 const char *name, int offset, int size,
49 int is_signed, int filter_type) 94 int is_signed, int filter_type)
50{ 95{
51 struct ftrace_event_field *field; 96 struct ftrace_event_field *field;
52 97
53 field = kzalloc(sizeof(*field), GFP_KERNEL); 98 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
54 if (!field) 99 if (!field)
55 goto err; 100 goto err;
56 101
57 field->name = kstrdup(name, GFP_KERNEL); 102 field->name = name;
58 if (!field->name) 103 field->type = type;
59 goto err;
60
61 field->type = kstrdup(type, GFP_KERNEL);
62 if (!field->type)
63 goto err;
64 104
65 if (filter_type == FILTER_OTHER) 105 if (filter_type == FILTER_OTHER)
66 field->filter_type = filter_assign_type(type); 106 field->filter_type = filter_assign_type(type);
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
76 return 0; 116 return 0;
77 117
78err: 118err:
79 if (field) 119 kmem_cache_free(field_cachep, field);
80 kfree(field->name);
81 kfree(field);
82 120
83 return -ENOMEM; 121 return -ENOMEM;
84} 122}
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void)
120 return ret; 158 return ret;
121} 159}
122 160
123void trace_destroy_fields(struct ftrace_event_call *call) 161static void trace_destroy_fields(struct ftrace_event_call *call)
124{ 162{
125 struct ftrace_event_field *field, *next; 163 struct ftrace_event_field *field, *next;
126 struct list_head *head; 164 struct list_head *head;
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
128 head = trace_get_fields(call); 166 head = trace_get_fields(call);
129 list_for_each_entry_safe(field, next, head, link) { 167 list_for_each_entry_safe(field, next, head, link) {
130 list_del(&field->link); 168 list_del(&field->link);
131 kfree(field->type); 169 kmem_cache_free(field_cachep, field);
132 kfree(field->name);
133 kfree(field);
134 } 170 }
135} 171}
136 172
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
149int ftrace_event_reg(struct ftrace_event_call *call, 185int ftrace_event_reg(struct ftrace_event_call *call,
150 enum trace_reg type, void *data) 186 enum trace_reg type, void *data)
151{ 187{
188 struct ftrace_event_file *file = data;
189
152 switch (type) { 190 switch (type) {
153 case TRACE_REG_REGISTER: 191 case TRACE_REG_REGISTER:
154 return tracepoint_probe_register(call->name, 192 return tracepoint_probe_register(call->name,
155 call->class->probe, 193 call->class->probe,
156 call); 194 file);
157 case TRACE_REG_UNREGISTER: 195 case TRACE_REG_UNREGISTER:
158 tracepoint_probe_unregister(call->name, 196 tracepoint_probe_unregister(call->name,
159 call->class->probe, 197 call->class->probe,
160 call); 198 file);
161 return 0; 199 return 0;
162 200
163#ifdef CONFIG_PERF_EVENTS 201#ifdef CONFIG_PERF_EVENTS
@@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
183 221
184void trace_event_enable_cmd_record(bool enable) 222void trace_event_enable_cmd_record(bool enable)
185{ 223{
186 struct ftrace_event_call *call; 224 struct ftrace_event_file *file;
225 struct trace_array *tr;
187 226
188 mutex_lock(&event_mutex); 227 mutex_lock(&event_mutex);
189 list_for_each_entry(call, &ftrace_events, list) { 228 do_for_each_event_file(tr, file) {
190 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) 229
230 if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
191 continue; 231 continue;
192 232
193 if (enable) { 233 if (enable) {
194 tracing_start_cmdline_record(); 234 tracing_start_cmdline_record();
195 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 235 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
196 } else { 236 } else {
197 tracing_stop_cmdline_record(); 237 tracing_stop_cmdline_record();
198 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 238 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
199 } 239 }
200 } 240 } while_for_each_event_file();
201 mutex_unlock(&event_mutex); 241 mutex_unlock(&event_mutex);
202} 242}
203 243
204static int ftrace_event_enable_disable(struct ftrace_event_call *call, 244static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
205 int enable) 245 int enable, int soft_disable)
206{ 246{
247 struct ftrace_event_call *call = file->event_call;
207 int ret = 0; 248 int ret = 0;
249 int disable;
208 250
209 switch (enable) { 251 switch (enable) {
210 case 0: 252 case 0:
211 if (call->flags & TRACE_EVENT_FL_ENABLED) { 253 /*
212 call->flags &= ~TRACE_EVENT_FL_ENABLED; 254 * When soft_disable is set and enable is cleared, we want
213 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { 255 * to clear the SOFT_DISABLED flag but leave the event in the
256 * state that it was. That is, if the event was enabled and
257 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
 258 * is set, we do not want the event to be enabled before we
259 * clear the bit.
260 *
261 * When soft_disable is not set but the SOFT_MODE flag is,
262 * we do nothing. Do not disable the tracepoint, otherwise
263 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
264 */
265 if (soft_disable) {
266 disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
267 clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
268 } else
269 disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
270
271 if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
272 clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
273 if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
214 tracing_stop_cmdline_record(); 274 tracing_stop_cmdline_record();
215 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 275 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
216 } 276 }
217 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 277 call->class->reg(call, TRACE_REG_UNREGISTER, file);
218 } 278 }
279 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
280 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
281 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
219 break; 282 break;
220 case 1: 283 case 1:
221 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 284 /*
285 * When soft_disable is set and enable is set, we want to
286 * register the tracepoint for the event, but leave the event
287 * as is. That means, if the event was already enabled, we do
288 * nothing (but set SOFT_MODE). If the event is disabled, we
289 * set SOFT_DISABLED before enabling the event tracepoint, so
290 * it still seems to be disabled.
291 */
292 if (!soft_disable)
293 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
294 else
295 set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
296
297 if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
298
299 /* Keep the event disabled, when going to SOFT_MODE. */
300 if (soft_disable)
301 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
302
222 if (trace_flags & TRACE_ITER_RECORD_CMD) { 303 if (trace_flags & TRACE_ITER_RECORD_CMD) {
223 tracing_start_cmdline_record(); 304 tracing_start_cmdline_record();
224 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 305 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
225 } 306 }
226 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 307 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
227 if (ret) { 308 if (ret) {
228 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
229 pr_info("event trace: Could not enable event " 310 pr_info("event trace: Could not enable event "
230 "%s\n", call->name); 311 "%s\n", call->name);
231 break; 312 break;
232 } 313 }
233 call->flags |= TRACE_EVENT_FL_ENABLED; 314 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
315
316 /* WAS_ENABLED gets set but never cleared. */
317 call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
234 } 318 }
235 break; 319 break;
236 } 320 }
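The comments in __ftrace_event_enable_disable() above describe three interacting bits: ENABLED (the tracepoint is registered), SOFT_MODE (roughly, a probe rather than the user is holding the event), and SOFT_DISABLED (registered but muted). A userspace model of the "soft enable" transition (flag names shortened, not the kernel's bit layout):

#include <stdio.h>

#define FL_ENABLED	 (1 << 0)	/* tracepoint registered */
#define FL_SOFT_MODE	 (1 << 1)	/* controlled by a probe */
#define FL_SOFT_DISABLED (1 << 2)	/* registered but muted */

/* A "soft enable"/"soft disable" only flips SOFT_DISABLED; the
 * tracepoint itself stays registered the whole time. */
static void model_soft_set(unsigned int *flags, int record)
{
	if (record)
		*flags &= ~FL_SOFT_DISABLED;
	else
		*flags |= FL_SOFT_DISABLED;
}

int main(void)
{
	/* Entering SOFT_MODE on a disabled event: it gets registered
	 * but starts out soft-disabled, so nothing is recorded yet. */
	unsigned int flags = FL_ENABLED | FL_SOFT_MODE | FL_SOFT_DISABLED;

	model_soft_set(&flags, 1);	/* probe fires: soft enable */
	printf("recording: %s\n",
	       (flags & FL_ENABLED) && !(flags & FL_SOFT_DISABLED) ?
	       "yes" : "no");
	return 0;
}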
@@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
238 return ret; 322 return ret;
239} 323}
240 324
241static void ftrace_clear_events(void) 325static int ftrace_event_enable_disable(struct ftrace_event_file *file,
326 int enable)
242{ 327{
243 struct ftrace_event_call *call; 328 return __ftrace_event_enable_disable(file, enable, 0);
329}
330
331static void ftrace_clear_events(struct trace_array *tr)
332{
333 struct ftrace_event_file *file;
244 334
245 mutex_lock(&event_mutex); 335 mutex_lock(&event_mutex);
246 list_for_each_entry(call, &ftrace_events, list) { 336 list_for_each_entry(file, &tr->events, list) {
247 ftrace_event_enable_disable(call, 0); 337 ftrace_event_enable_disable(file, 0);
248 } 338 }
249 mutex_unlock(&event_mutex); 339 mutex_unlock(&event_mutex);
250} 340}
@@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)
257 if (--system->ref_count) 347 if (--system->ref_count)
258 return; 348 return;
259 349
350 list_del(&system->list);
351
260 if (filter) { 352 if (filter) {
261 kfree(filter->filter_string); 353 kfree(filter->filter_string);
262 kfree(filter); 354 kfree(filter);
263 } 355 }
264 kfree(system->name);
265 kfree(system); 356 kfree(system);
266} 357}
267 358
@@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)
271 system->ref_count++; 362 system->ref_count++;
272} 363}
273 364
274static void put_system(struct event_subsystem *system) 365static void __get_system_dir(struct ftrace_subsystem_dir *dir)
366{
367 WARN_ON_ONCE(dir->ref_count == 0);
368 dir->ref_count++;
369 __get_system(dir->subsystem);
370}
371
372static void __put_system_dir(struct ftrace_subsystem_dir *dir)
373{
374 WARN_ON_ONCE(dir->ref_count == 0);
375 /* If the subsystem is about to be freed, the dir must be too */
376 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
377
378 __put_system(dir->subsystem);
379 if (!--dir->ref_count)
380 kfree(dir);
381}
382
383static void put_system(struct ftrace_subsystem_dir *dir)
275{ 384{
276 mutex_lock(&event_mutex); 385 mutex_lock(&event_mutex);
277 __put_system(system); 386 __put_system_dir(dir);
278 mutex_unlock(&event_mutex); 387 mutex_unlock(&event_mutex);
279} 388}
280 389
281/* 390/*
282 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 391 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
283 */ 392 */
284static int __ftrace_set_clr_event(const char *match, const char *sub, 393static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
285 const char *event, int set) 394 const char *sub, const char *event, int set)
286{ 395{
396 struct ftrace_event_file *file;
287 struct ftrace_event_call *call; 397 struct ftrace_event_call *call;
288 int ret = -EINVAL; 398 int ret = -EINVAL;
289 399
290 mutex_lock(&event_mutex); 400 mutex_lock(&event_mutex);
291 list_for_each_entry(call, &ftrace_events, list) { 401 list_for_each_entry(file, &tr->events, list) {
402
403 call = file->event_call;
292 404
293 if (!call->name || !call->class || !call->class->reg) 405 if (!call->name || !call->class || !call->class->reg)
294 continue; 406 continue;
@@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
307 if (event && strcmp(event, call->name) != 0) 419 if (event && strcmp(event, call->name) != 0)
308 continue; 420 continue;
309 421
310 ftrace_event_enable_disable(call, set); 422 ftrace_event_enable_disable(file, set);
311 423
312 ret = 0; 424 ret = 0;
313 } 425 }
@@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
316 return ret; 428 return ret;
317} 429}
318 430
319static int ftrace_set_clr_event(char *buf, int set) 431static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
320{ 432{
321 char *event = NULL, *sub = NULL, *match; 433 char *event = NULL, *sub = NULL, *match;
322 434
@@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)
344 event = NULL; 456 event = NULL;
345 } 457 }
346 458
347 return __ftrace_set_clr_event(match, sub, event, set); 459 return __ftrace_set_clr_event(tr, match, sub, event, set);
348} 460}
349 461
350/** 462/**
@@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)
361 */ 473 */
362int trace_set_clr_event(const char *system, const char *event, int set) 474int trace_set_clr_event(const char *system, const char *event, int set)
363{ 475{
364 return __ftrace_set_clr_event(NULL, system, event, set); 476 struct trace_array *tr = top_trace_array();
477
478 return __ftrace_set_clr_event(tr, NULL, system, event, set);
365} 479}
366EXPORT_SYMBOL_GPL(trace_set_clr_event); 480EXPORT_SYMBOL_GPL(trace_set_clr_event);
367 481
@@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
373 size_t cnt, loff_t *ppos) 487 size_t cnt, loff_t *ppos)
374{ 488{
375 struct trace_parser parser; 489 struct trace_parser parser;
490 struct seq_file *m = file->private_data;
491 struct trace_array *tr = m->private;
376 ssize_t read, ret; 492 ssize_t read, ret;
377 493
378 if (!cnt) 494 if (!cnt)
@@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
395 511
396 parser.buffer[parser.idx] = 0; 512 parser.buffer[parser.idx] = 0;
397 513
398 ret = ftrace_set_clr_event(parser.buffer + !set, set); 514 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
399 if (ret) 515 if (ret)
400 goto out_put; 516 goto out_put;
401 } 517 }
@@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
411static void * 527static void *
412t_next(struct seq_file *m, void *v, loff_t *pos) 528t_next(struct seq_file *m, void *v, loff_t *pos)
413{ 529{
414 struct ftrace_event_call *call = v; 530 struct ftrace_event_file *file = v;
531 struct ftrace_event_call *call;
532 struct trace_array *tr = m->private;
415 533
416 (*pos)++; 534 (*pos)++;
417 535
418 list_for_each_entry_continue(call, &ftrace_events, list) { 536 list_for_each_entry_continue(file, &tr->events, list) {
537 call = file->event_call;
419 /* 538 /*
420 * The ftrace subsystem is for showing formats only. 539 * The ftrace subsystem is for showing formats only.
421 * They can not be enabled or disabled via the event files. 540 * They can not be enabled or disabled via the event files.
422 */ 541 */
423 if (call->class && call->class->reg) 542 if (call->class && call->class->reg)
424 return call; 543 return file;
425 } 544 }
426 545
427 return NULL; 546 return NULL;
@@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
429 548
430static void *t_start(struct seq_file *m, loff_t *pos) 549static void *t_start(struct seq_file *m, loff_t *pos)
431{ 550{
432 struct ftrace_event_call *call; 551 struct ftrace_event_file *file;
552 struct trace_array *tr = m->private;
433 loff_t l; 553 loff_t l;
434 554
435 mutex_lock(&event_mutex); 555 mutex_lock(&event_mutex);
436 556
437 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 557 file = list_entry(&tr->events, struct ftrace_event_file, list);
438 for (l = 0; l <= *pos; ) { 558 for (l = 0; l <= *pos; ) {
439 call = t_next(m, call, &l); 559 file = t_next(m, file, &l);
440 if (!call) 560 if (!file)
441 break; 561 break;
442 } 562 }
443 return call; 563 return file;
444} 564}
445 565
446static void * 566static void *
447s_next(struct seq_file *m, void *v, loff_t *pos) 567s_next(struct seq_file *m, void *v, loff_t *pos)
448{ 568{
449 struct ftrace_event_call *call = v; 569 struct ftrace_event_file *file = v;
570 struct trace_array *tr = m->private;
450 571
451 (*pos)++; 572 (*pos)++;
452 573
453 list_for_each_entry_continue(call, &ftrace_events, list) { 574 list_for_each_entry_continue(file, &tr->events, list) {
454 if (call->flags & TRACE_EVENT_FL_ENABLED) 575 if (file->flags & FTRACE_EVENT_FL_ENABLED)
455 return call; 576 return file;
456 } 577 }
457 578
458 return NULL; 579 return NULL;
@@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
460 581
461static void *s_start(struct seq_file *m, loff_t *pos) 582static void *s_start(struct seq_file *m, loff_t *pos)
462{ 583{
463 struct ftrace_event_call *call; 584 struct ftrace_event_file *file;
585 struct trace_array *tr = m->private;
464 loff_t l; 586 loff_t l;
465 587
466 mutex_lock(&event_mutex); 588 mutex_lock(&event_mutex);
467 589
468 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 590 file = list_entry(&tr->events, struct ftrace_event_file, list);
469 for (l = 0; l <= *pos; ) { 591 for (l = 0; l <= *pos; ) {
470 call = s_next(m, call, &l); 592 file = s_next(m, file, &l);
471 if (!call) 593 if (!file)
472 break; 594 break;
473 } 595 }
474 return call; 596 return file;
475} 597}
476 598
477static int t_show(struct seq_file *m, void *v) 599static int t_show(struct seq_file *m, void *v)
478{ 600{
479 struct ftrace_event_call *call = v; 601 struct ftrace_event_file *file = v;
602 struct ftrace_event_call *call = file->event_call;
480 603
481 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 604 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
482 seq_printf(m, "%s:", call->class->system); 605 seq_printf(m, "%s:", call->class->system);
@@ -494,25 +617,31 @@ static ssize_t
494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 617event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
495 loff_t *ppos) 618 loff_t *ppos)
496{ 619{
497 struct ftrace_event_call *call = filp->private_data; 620 struct ftrace_event_file *file = filp->private_data;
498 char *buf; 621 char *buf;
499 622
500 if (call->flags & TRACE_EVENT_FL_ENABLED) 623 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
501 buf = "1\n"; 624 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
502 else 625 buf = "0*\n";
626 else
627 buf = "1\n";
628 } else
503 buf = "0\n"; 629 buf = "0\n";
504 630
505 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); 631 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
506} 632}
507 633
508static ssize_t 634static ssize_t
509event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 635event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
510 loff_t *ppos) 636 loff_t *ppos)
511{ 637{
512 struct ftrace_event_call *call = filp->private_data; 638 struct ftrace_event_file *file = filp->private_data;
513 unsigned long val; 639 unsigned long val;
514 int ret; 640 int ret;
515 641
642 if (!file)
643 return -EINVAL;
644
516 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 645 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
517 if (ret) 646 if (ret)
518 return ret; 647 return ret;
@@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
525 case 0: 654 case 0:
526 case 1: 655 case 1:
527 mutex_lock(&event_mutex); 656 mutex_lock(&event_mutex);
528 ret = ftrace_event_enable_disable(call, val); 657 ret = ftrace_event_enable_disable(file, val);
529 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
530 break; 659 break;
531 660
@@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
543 loff_t *ppos) 672 loff_t *ppos)
544{ 673{
545 const char set_to_char[4] = { '?', '0', '1', 'X' }; 674 const char set_to_char[4] = { '?', '0', '1', 'X' };
546 struct event_subsystem *system = filp->private_data; 675 struct ftrace_subsystem_dir *dir = filp->private_data;
676 struct event_subsystem *system = dir->subsystem;
547 struct ftrace_event_call *call; 677 struct ftrace_event_call *call;
678 struct ftrace_event_file *file;
679 struct trace_array *tr = dir->tr;
548 char buf[2]; 680 char buf[2];
549 int set = 0; 681 int set = 0;
550 int ret; 682 int ret;
551 683
552 mutex_lock(&event_mutex); 684 mutex_lock(&event_mutex);
553 list_for_each_entry(call, &ftrace_events, list) { 685 list_for_each_entry(file, &tr->events, list) {
686 call = file->event_call;
554 if (!call->name || !call->class || !call->class->reg) 687 if (!call->name || !call->class || !call->class->reg)
555 continue; 688 continue;
556 689
@@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 562 * or if all events are cleared, or if we have 695
563 * a mixture. 696 * a mixture.
564 */ 697 */
565 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); 698 set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
566 699
567 /* 700 /*
568 * If we have a mixture, no need to look further. 701 * If we have a mixture, no need to look further.
@@ -584,7 +717,8 @@ static ssize_t
584system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 717system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
585 loff_t *ppos) 718 loff_t *ppos)
586{ 719{
587 struct event_subsystem *system = filp->private_data; 720 struct ftrace_subsystem_dir *dir = filp->private_data;
721 struct event_subsystem *system = dir->subsystem;
588 const char *name = NULL; 722 const char *name = NULL;
589 unsigned long val; 723 unsigned long val;
590 ssize_t ret; 724 ssize_t ret;
@@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
607 if (system) 741 if (system)
608 name = system->name; 742 name = system->name;
609 743
610 ret = __ftrace_set_clr_event(NULL, name, NULL, val); 744 ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
611 if (ret) 745 if (ret)
612 goto out; 746 goto out;
613 747
@@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);
845static int subsystem_open(struct inode *inode, struct file *filp) 979static int subsystem_open(struct inode *inode, struct file *filp)
846{ 980{
847 struct event_subsystem *system = NULL; 981 struct event_subsystem *system = NULL;
982 struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
983 struct trace_array *tr;
848 int ret; 984 int ret;
849 985
850 if (!inode->i_private)
851 goto skip_search;
852
853 /* Make sure the system still exists */ 986 /* Make sure the system still exists */
854 mutex_lock(&event_mutex); 987 mutex_lock(&event_mutex);
855 list_for_each_entry(system, &event_subsystems, list) { 988 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
856 if (system == inode->i_private) { 989 list_for_each_entry(dir, &tr->systems, list) {
857 /* Don't open systems with no events */ 990 if (dir == inode->i_private) {
858 if (!system->nr_events) { 991 /* Don't open systems with no events */
859 system = NULL; 992 if (dir->nr_events) {
860 break; 993 __get_system_dir(dir);
994 system = dir->subsystem;
995 }
996 goto exit_loop;
861 } 997 }
862 __get_system(system);
863 break;
864 } 998 }
865 } 999 }
1000 exit_loop:
866 mutex_unlock(&event_mutex); 1001 mutex_unlock(&event_mutex);
867 1002
868 if (system != inode->i_private) 1003 if (!system)
869 return -ENODEV; 1004 return -ENODEV;
870 1005
871 skip_search: 1006 /* Some versions of gcc think dir can be uninitialized here */
1007 WARN_ON(!dir);
1008
872 ret = tracing_open_generic(inode, filp); 1009 ret = tracing_open_generic(inode, filp);
873 if (ret < 0 && system) 1010 if (ret < 0)
874 put_system(system); 1011 put_system(dir);
1012
1013 return ret;
1014}
1015
1016static int system_tr_open(struct inode *inode, struct file *filp)
1017{
1018 struct ftrace_subsystem_dir *dir;
1019 struct trace_array *tr = inode->i_private;
1020 int ret;
1021
1022 /* Make a temporary dir that has no system but points to tr */
1023 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1024 if (!dir)
1025 return -ENOMEM;
1026
1027 dir->tr = tr;
1028
1029 ret = tracing_open_generic(inode, filp);
1030 if (ret < 0)
1031 kfree(dir);
1032
1033 filp->private_data = dir;
875 1034
876 return ret; 1035 return ret;
877} 1036}
878 1037
879static int subsystem_release(struct inode *inode, struct file *file) 1038static int subsystem_release(struct inode *inode, struct file *file)
880{ 1039{
881 struct event_subsystem *system = inode->i_private; 1040 struct ftrace_subsystem_dir *dir = file->private_data;
882 1041
883 if (system) 1042 /*
884 put_system(system); 1043 * If dir->subsystem is NULL, then this is a temporary
1044 * descriptor that was made for a trace_array to enable
1045 * all subsystems.
1046 */
1047 if (dir->subsystem)
1048 put_system(dir);
1049 else
1050 kfree(dir);
885 1051
886 return 0; 1052 return 0;
887} 1053}
@@ -890,7 +1056,8 @@ static ssize_t
890subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 1056subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
891 loff_t *ppos) 1057 loff_t *ppos)
892{ 1058{
893 struct event_subsystem *system = filp->private_data; 1059 struct ftrace_subsystem_dir *dir = filp->private_data;
1060 struct event_subsystem *system = dir->subsystem;
894 struct trace_seq *s; 1061 struct trace_seq *s;
895 int r; 1062 int r;
896 1063
@@ -915,7 +1082,7 @@ static ssize_t
915subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1082subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
916 loff_t *ppos) 1083 loff_t *ppos)
917{ 1084{
918 struct event_subsystem *system = filp->private_data; 1085 struct ftrace_subsystem_dir *dir = filp->private_data;
919 char *buf; 1086 char *buf;
920 int err; 1087 int err;
921 1088
@@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
932 } 1099 }
933 buf[cnt] = '\0'; 1100 buf[cnt] = '\0';
934 1101
935 err = apply_subsystem_event_filter(system, buf); 1102 err = apply_subsystem_event_filter(dir, buf);
936 free_page((unsigned long) buf); 1103 free_page((unsigned long) buf);
937 if (err < 0) 1104 if (err < 0)
938 return err; 1105 return err;
@@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {
1041 .release = subsystem_release, 1208 .release = subsystem_release,
1042}; 1209};
1043 1210
1211static const struct file_operations ftrace_tr_enable_fops = {
1212 .open = system_tr_open,
1213 .read = system_enable_read,
1214 .write = system_enable_write,
1215 .llseek = default_llseek,
1216 .release = subsystem_release,
1217};
1218
1044static const struct file_operations ftrace_show_header_fops = { 1219static const struct file_operations ftrace_show_header_fops = {
1045 .open = tracing_open_generic, 1220 .open = tracing_open_generic,
1046 .read = show_header, 1221 .read = show_header,
1047 .llseek = default_llseek, 1222 .llseek = default_llseek,
1048}; 1223};
1049 1224
1050static struct dentry *event_trace_events_dir(void) 1225static int
1226ftrace_event_open(struct inode *inode, struct file *file,
1227 const struct seq_operations *seq_ops)
1051{ 1228{
1052 static struct dentry *d_tracer; 1229 struct seq_file *m;
1053 static struct dentry *d_events; 1230 int ret;
1054
1055 if (d_events)
1056 return d_events;
1057
1058 d_tracer = tracing_init_dentry();
1059 if (!d_tracer)
1060 return NULL;
1061 1231
1062 d_events = debugfs_create_dir("events", d_tracer); 1232 ret = seq_open(file, seq_ops);
1063 if (!d_events) 1233 if (ret < 0)
1064 pr_warning("Could not create debugfs " 1234 return ret;
1065 "'events' directory\n"); 1235 m = file->private_data;
1236 /* copy tr over to seq ops */
1237 m->private = inode->i_private;
1066 1238
1067 return d_events; 1239 return ret;
1068} 1240}
1069 1241
1070static int 1242static int
@@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{ 1244{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops; 1245 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074 1246
1075 return seq_open(file, seq_ops); 1247 return ftrace_event_open(inode, file, seq_ops);
1076} 1248}
1077 1249
1078static int 1250static int
1079ftrace_event_set_open(struct inode *inode, struct file *file) 1251ftrace_event_set_open(struct inode *inode, struct file *file)
1080{ 1252{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1253 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1254 struct trace_array *tr = inode->i_private;
1082 1255
1083 if ((file->f_mode & FMODE_WRITE) && 1256 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC)) 1257 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events(); 1258 ftrace_clear_events(tr);
1086 1259
1087 return seq_open(file, seq_ops); 1260 return ftrace_event_open(inode, file, seq_ops);
1261}
1262
1263static struct event_subsystem *
1264create_new_subsystem(const char *name)
1265{
1266 struct event_subsystem *system;
1267
1268 /* need to create new entry */
1269 system = kmalloc(sizeof(*system), GFP_KERNEL);
1270 if (!system)
1271 return NULL;
1272
1273 system->ref_count = 1;
1274 system->name = name;
1275
1276 system->filter = NULL;
1277
1278 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
1279 if (!system->filter)
1280 goto out_free;
1281
1282 list_add(&system->list, &event_subsystems);
1283
1284 return system;
1285
1286 out_free:
1287 kfree(system);
1288 return NULL;
1088} 1289}
1089 1290
1090static struct dentry * 1291static struct dentry *
1091event_subsystem_dir(const char *name, struct dentry *d_events) 1292event_subsystem_dir(struct trace_array *tr, const char *name,
1293 struct ftrace_event_file *file, struct dentry *parent)
1092{ 1294{
1295 struct ftrace_subsystem_dir *dir;
1093 struct event_subsystem *system; 1296 struct event_subsystem *system;
1094 struct dentry *entry; 1297 struct dentry *entry;
1095 1298
1096 /* First see if we did not already create this dir */ 1299 /* First see if we did not already create this dir */
1097 list_for_each_entry(system, &event_subsystems, list) { 1300 list_for_each_entry(dir, &tr->systems, list) {
1301 system = dir->subsystem;
1098 if (strcmp(system->name, name) == 0) { 1302 if (strcmp(system->name, name) == 0) {
1099 system->nr_events++; 1303 dir->nr_events++;
1100 return system->entry; 1304 file->system = dir;
1305 return dir->entry;
1101 } 1306 }
1102 } 1307 }
1103 1308
1104 /* need to create new entry */ 1309 /* Now see if the system itself exists. */
1105 system = kmalloc(sizeof(*system), GFP_KERNEL); 1310 list_for_each_entry(system, &event_subsystems, list) {
1106 if (!system) { 1311 if (strcmp(system->name, name) == 0)
1107 pr_warning("No memory to create event subsystem %s\n", 1312 break;
1108 name);
1109 return d_events;
1110 } 1313 }
1314 /* Reset system variable when not found */
1315 if (&system->list == &event_subsystems)
1316 system = NULL;
1111 1317
1112 system->entry = debugfs_create_dir(name, d_events); 1318 dir = kmalloc(sizeof(*dir), GFP_KERNEL);
1113 if (!system->entry) { 1319 if (!dir)
1114 pr_warning("Could not create event subsystem %s\n", 1320 goto out_fail;
1115 name);
1116 kfree(system);
1117 return d_events;
1118 }
1119 1321
1120 system->nr_events = 1; 1322 if (!system) {
1121 system->ref_count = 1; 1323 system = create_new_subsystem(name);
1122 system->name = kstrdup(name, GFP_KERNEL); 1324 if (!system)
1123 if (!system->name) { 1325 goto out_free;
1124 debugfs_remove(system->entry); 1326 } else
1125 kfree(system); 1327 __get_system(system);
1126 return d_events; 1328
1329 dir->entry = debugfs_create_dir(name, parent);
1330 if (!dir->entry) {
1331 pr_warning("Failed to create system directory %s\n", name);
1332 __put_system(system);
1333 goto out_free;
1127 } 1334 }
1128 1335
1129 list_add(&system->list, &event_subsystems); 1336 dir->tr = tr;
1130 1337 dir->ref_count = 1;
1131 system->filter = NULL; 1338 dir->nr_events = 1;
1132 1339 dir->subsystem = system;
1133 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); 1340 file->system = dir;
1134 if (!system->filter) {
1135 pr_warning("Could not allocate filter for subsystem "
1136 "'%s'\n", name);
1137 return system->entry;
1138 }
1139 1341
1140 entry = debugfs_create_file("filter", 0644, system->entry, system, 1342 entry = debugfs_create_file("filter", 0644, dir->entry, dir,
1141 &ftrace_subsystem_filter_fops); 1343 &ftrace_subsystem_filter_fops);
1142 if (!entry) { 1344 if (!entry) {
1143 kfree(system->filter); 1345 kfree(system->filter);
1144 system->filter = NULL; 1346 system->filter = NULL;
1145 pr_warning("Could not create debugfs " 1347 pr_warning("Could not create debugfs '%s/filter' entry\n", name);
1146 "'%s/filter' entry\n", name);
1147 } 1348 }
1148 1349
1149 trace_create_file("enable", 0644, system->entry, system, 1350 trace_create_file("enable", 0644, dir->entry, dir,
1150 &ftrace_system_enable_fops); 1351 &ftrace_system_enable_fops);
1151 1352
1152 return system->entry; 1353 list_add(&dir->list, &tr->systems);
1354
1355 return dir->entry;
1356
1357 out_free:
1358 kfree(dir);
1359 out_fail:
1360 /* Only print this message if failed on memory allocation */
1361 if (!dir || !system)
1362 pr_warning("No memory to create event subsystem %s\n",
1363 name);
1364 return NULL;
1153} 1365}
1154 1366
1155static int 1367static int
1156event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, 1368event_create_dir(struct dentry *parent,
1369 struct ftrace_event_file *file,
1157 const struct file_operations *id, 1370 const struct file_operations *id,
1158 const struct file_operations *enable, 1371 const struct file_operations *enable,
1159 const struct file_operations *filter, 1372 const struct file_operations *filter,
1160 const struct file_operations *format) 1373 const struct file_operations *format)
1161{ 1374{
1375 struct ftrace_event_call *call = file->event_call;
1376 struct trace_array *tr = file->tr;
1162 struct list_head *head; 1377 struct list_head *head;
1378 struct dentry *d_events;
1163 int ret; 1379 int ret;
1164 1380
1165 /* 1381 /*
1166 * If the trace point header did not define TRACE_SYSTEM 1382 * If the trace point header did not define TRACE_SYSTEM
1167 * then the system would be called "TRACE_SYSTEM". 1383 * then the system would be called "TRACE_SYSTEM".
1168 */ 1384 */
1169 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 1385 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
1170 d_events = event_subsystem_dir(call->class->system, d_events); 1386 d_events = event_subsystem_dir(tr, call->class->system, file, parent);
1171 1387 if (!d_events)
1172 call->dir = debugfs_create_dir(call->name, d_events); 1388 return -ENOMEM;
1173 if (!call->dir) { 1389 } else
1174 pr_warning("Could not create debugfs " 1390 d_events = parent;
1175 "'%s' directory\n", call->name); 1391
1392 file->dir = debugfs_create_dir(call->name, d_events);
1393 if (!file->dir) {
1394 pr_warning("Could not create debugfs '%s' directory\n",
1395 call->name);
1176 return -1; 1396 return -1;
1177 } 1397 }
1178 1398
1179 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1399 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1180 trace_create_file("enable", 0644, call->dir, call, 1400 trace_create_file("enable", 0644, file->dir, file,
1181 enable); 1401 enable);
1182 1402
1183#ifdef CONFIG_PERF_EVENTS 1403#ifdef CONFIG_PERF_EVENTS
1184 if (call->event.type && call->class->reg) 1404 if (call->event.type && call->class->reg)
1185 trace_create_file("id", 0444, call->dir, call, 1405 trace_create_file("id", 0444, file->dir, call,
1186 id); 1406 id);
1187#endif 1407#endif
1188 1408
@@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1196 if (ret < 0) { 1416 if (ret < 0) {
1197 pr_warning("Could not initialize trace point" 1417 pr_warning("Could not initialize trace point"
1198 " events/%s\n", call->name); 1418 " events/%s\n", call->name);
1199 return ret; 1419 return -1;
1200 } 1420 }
1201 } 1421 }
1202 trace_create_file("filter", 0644, call->dir, call, 1422 trace_create_file("filter", 0644, file->dir, call,
1203 filter); 1423 filter);
1204 1424
1205 trace_create_file("format", 0444, call->dir, call, 1425 trace_create_file("format", 0444, file->dir, call,
1206 format); 1426 format);
1207 1427
1208 return 0; 1428 return 0;
1209} 1429}
1210 1430
1431static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1432{
1433 if (!dir)
1434 return;
1435
1436 if (!--dir->nr_events) {
1437 debugfs_remove_recursive(dir->entry);
1438 list_del(&dir->list);
1439 __put_system_dir(dir);
1440 }
1441}
1442
1443static void remove_event_from_tracers(struct ftrace_event_call *call)
1444{
1445 struct ftrace_event_file *file;
1446 struct trace_array *tr;
1447
1448 do_for_each_event_file_safe(tr, file) {
1449
1450 if (file->event_call != call)
1451 continue;
1452
1453 list_del(&file->list);
1454 debugfs_remove_recursive(file->dir);
1455 remove_subsystem(file->system);
1456 kmem_cache_free(file_cachep, file);
1457
1458 /*
1459 * The do_for_each_event_file_safe() is
1460 * a double loop. After finding the call for this
1461 * trace_array, we use break to jump to the next
1462 * trace_array.
1463 */
1464 break;
1465 } while_for_each_event_file();
1466}
1467
1211static void event_remove(struct ftrace_event_call *call) 1468static void event_remove(struct ftrace_event_call *call)
1212{ 1469{
1213 ftrace_event_enable_disable(call, 0); 1470 struct trace_array *tr;
1471 struct ftrace_event_file *file;
1472
1473 do_for_each_event_file(tr, file) {
1474 if (file->event_call != call)
1475 continue;
1476 ftrace_event_enable_disable(file, 0);
1477 /*
1478 * The do_for_each_event_file() is
1479 * a double loop. After finding the call for this
1480 * trace_array, we use break to jump to the next
1481 * trace_array.
1482 */
1483 break;
1484 } while_for_each_event_file();
1485
1214 if (call->event.funcs) 1486 if (call->event.funcs)
1215 __unregister_ftrace_event(&call->event); 1487 __unregister_ftrace_event(&call->event);
1488 remove_event_from_tracers(call);
1216 list_del(&call->list); 1489 list_del(&call->list);
1217} 1490}
1218 1491
@@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)
1234} 1507}
1235 1508
1236static int 1509static int
1237__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1510__register_event(struct ftrace_event_call *call, struct module *mod)
1238 const struct file_operations *id,
1239 const struct file_operations *enable,
1240 const struct file_operations *filter,
1241 const struct file_operations *format)
1242{ 1511{
1243 struct dentry *d_events;
1244 int ret; 1512 int ret;
1245 1513
1246 ret = event_init(call); 1514 ret = event_init(call);
1247 if (ret < 0) 1515 if (ret < 0)
1248 return ret; 1516 return ret;
1249 1517
1250 d_events = event_trace_events_dir(); 1518 list_add(&call->list, &ftrace_events);
1251 if (!d_events)
1252 return -ENOENT;
1253
1254 ret = event_create_dir(call, d_events, id, enable, filter, format);
1255 if (!ret)
1256 list_add(&call->list, &ftrace_events);
1257 call->mod = mod; 1519 call->mod = mod;
1258 1520
1259 return ret; 1521 return 0;
1522}
1523
1524/* Add an event to a trace directory */
1525static int
1526__trace_add_new_event(struct ftrace_event_call *call,
1527 struct trace_array *tr,
1528 const struct file_operations *id,
1529 const struct file_operations *enable,
1530 const struct file_operations *filter,
1531 const struct file_operations *format)
1532{
1533 struct ftrace_event_file *file;
1534
1535 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1536 if (!file)
1537 return -ENOMEM;
1538
1539 file->event_call = call;
1540 file->tr = tr;
1541 list_add(&file->list, &tr->events);
1542
1543 return event_create_dir(tr->event_dir, file, id, enable, filter, format);
1260} 1544}
1261 1545
1546/*
1547 * Just create a descriptor for early init. A descriptor is required
1548 * for enabling events at boot. We want to enable events before
1549 * the filesystem is initialized.
1550 */
1551static __init int
1552__trace_early_add_new_event(struct ftrace_event_call *call,
1553 struct trace_array *tr)
1554{
1555 struct ftrace_event_file *file;
1556
1557 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1558 if (!file)
1559 return -ENOMEM;
1560
1561 file->event_call = call;
1562 file->tr = tr;
1563 list_add(&file->list, &tr->events);
1564
1565 return 0;
1566}
1567
1568struct ftrace_module_file_ops;
1569static void __add_event_to_tracers(struct ftrace_event_call *call,
1570 struct ftrace_module_file_ops *file_ops);
1571
1262/* Add an additional event_call dynamically */ 1572/* Add an additional event_call dynamically */
1263int trace_add_event_call(struct ftrace_event_call *call) 1573int trace_add_event_call(struct ftrace_event_call *call)
1264{ 1574{
1265 int ret; 1575 int ret;
1266 mutex_lock(&event_mutex); 1576 mutex_lock(&event_mutex);
1267 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1268 &ftrace_enable_fops,
1269 &ftrace_event_filter_fops,
1270 &ftrace_event_format_fops);
1271 mutex_unlock(&event_mutex);
1272 return ret;
1273}
1274 1577
1275static void remove_subsystem_dir(const char *name) 1578 ret = __register_event(call, NULL);
1276{ 1579 if (ret >= 0)
1277 struct event_subsystem *system; 1580 __add_event_to_tracers(call, NULL);
1278
1279 if (strcmp(name, TRACE_SYSTEM) == 0)
1280 return;
1281 1581
1282 list_for_each_entry(system, &event_subsystems, list) { 1582 mutex_unlock(&event_mutex);
1283 if (strcmp(system->name, name) == 0) { 1583 return ret;
1284 if (!--system->nr_events) {
1285 debugfs_remove_recursive(system->entry);
1286 list_del(&system->list);
1287 __put_system(system);
1288 }
1289 break;
1290 }
1291 }
1292} 1584}
1293 1585
1294/* 1586/*
1295 * Must be called under locking both of event_mutex and trace_event_mutex. 1587 * Must be called under locking both of event_mutex and trace_event_sem.
1296 */ 1588 */
1297static void __trace_remove_event_call(struct ftrace_event_call *call) 1589static void __trace_remove_event_call(struct ftrace_event_call *call)
1298{ 1590{
1299 event_remove(call); 1591 event_remove(call);
1300 trace_destroy_fields(call); 1592 trace_destroy_fields(call);
1301 destroy_preds(call); 1593 destroy_preds(call);
1302 debugfs_remove_recursive(call->dir);
1303 remove_subsystem_dir(call->class->system);
1304} 1594}
1305 1595
1306/* Remove an event_call */ 1596/* Remove an event_call */
1307void trace_remove_event_call(struct ftrace_event_call *call) 1597void trace_remove_event_call(struct ftrace_event_call *call)
1308{ 1598{
1309 mutex_lock(&event_mutex); 1599 mutex_lock(&event_mutex);
1310 down_write(&trace_event_mutex); 1600 down_write(&trace_event_sem);
1311 __trace_remove_event_call(call); 1601 __trace_remove_event_call(call);
1312 up_write(&trace_event_mutex); 1602 up_write(&trace_event_sem);
1313 mutex_unlock(&event_mutex); 1603 mutex_unlock(&event_mutex);
1314} 1604}
1315 1605
@@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {
1336}; 1626};
1337 1627
1338static struct ftrace_module_file_ops * 1628static struct ftrace_module_file_ops *
1629find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1630{
1631 /*
1632 * As event_calls are added in groups by module,
1633 * when we find one file_ops, we don't need to search for
1634 * each call in that module, as the rest should be the
1635 * same. Only search for a new one if the last one did
1636 * not match.
1637 */
1638 if (file_ops && mod == file_ops->mod)
1639 return file_ops;
1640
1641 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1642 if (file_ops->mod == mod)
1643 return file_ops;
1644 }
1645 return NULL;
1646}
1647
1648static struct ftrace_module_file_ops *
1339trace_create_file_ops(struct module *mod) 1649trace_create_file_ops(struct module *mod)
1340{ 1650{
1341 struct ftrace_module_file_ops *file_ops; 1651 struct ftrace_module_file_ops *file_ops;
@@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)
1386 return; 1696 return;
1387 1697
1388 for_each_event(call, start, end) { 1698 for_each_event(call, start, end) {
1389 __trace_add_event_call(*call, mod, 1699 __register_event(*call, mod);
1390 &file_ops->id, &file_ops->enable, 1700 __add_event_to_tracers(*call, file_ops);
1391 &file_ops->filter, &file_ops->format);
1392 } 1701 }
1393} 1702}
1394 1703
@@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)
1396{ 1705{
1397 struct ftrace_module_file_ops *file_ops; 1706 struct ftrace_module_file_ops *file_ops;
1398 struct ftrace_event_call *call, *p; 1707 struct ftrace_event_call *call, *p;
1399 bool found = false; 1708 bool clear_trace = false;
1400 1709
1401 down_write(&trace_event_mutex); 1710 down_write(&trace_event_sem);
1402 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1711 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1403 if (call->mod == mod) { 1712 if (call->mod == mod) {
1404 found = true; 1713 if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
1714 clear_trace = true;
1405 __trace_remove_event_call(call); 1715 __trace_remove_event_call(call);
1406 } 1716 }
1407 } 1717 }
@@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)
1415 list_del(&file_ops->list); 1725 list_del(&file_ops->list);
1416 kfree(file_ops); 1726 kfree(file_ops);
1417 } 1727 }
1728 up_write(&trace_event_sem);
1418 1729
1419 /* 1730 /*
1420 * It is safest to reset the ring buffer if the module being unloaded 1731 * It is safest to reset the ring buffer if the module being unloaded
1421 * registered any events. 1732 * registered any events that were used. The only worry is if
 1733 * a new module gets loaded and takes on the same ids as the events
 1734 * of this module. When printing out the buffer, traced events left
 1735 * over from this module may be passed to the new module's events and
1736 * unexpected results may occur.
1422 */ 1737 */
1423 if (found) 1738 if (clear_trace)
1424 tracing_reset_current_online_cpus(); 1739 tracing_reset_all_online_cpus();
1425 up_write(&trace_event_mutex);
1426} 1740}
1427 1741
1428static int trace_module_notify(struct notifier_block *self, 1742static int trace_module_notify(struct notifier_block *self,
@@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,
1443 1757
1444 return 0; 1758 return 0;
1445} 1759}
1760
1761static int
1762__trace_add_new_mod_event(struct ftrace_event_call *call,
1763 struct trace_array *tr,
1764 struct ftrace_module_file_ops *file_ops)
1765{
1766 return __trace_add_new_event(call, tr,
1767 &file_ops->id, &file_ops->enable,
1768 &file_ops->filter, &file_ops->format);
1769}
1770
1446#else 1771#else
1447static int trace_module_notify(struct notifier_block *self, 1772static inline struct ftrace_module_file_ops *
1448 unsigned long val, void *data) 1773find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1774{
1775 return NULL;
1776}
1777static inline int trace_module_notify(struct notifier_block *self,
1778 unsigned long val, void *data)
1449{ 1779{
1450 return 0; 1780 return 0;
1451} 1781}
1782static inline int
1783__trace_add_new_mod_event(struct ftrace_event_call *call,
1784 struct trace_array *tr,
1785 struct ftrace_module_file_ops *file_ops)
1786{
1787 return -ENODEV;
1788}
1452#endif /* CONFIG_MODULES */ 1789#endif /* CONFIG_MODULES */
1453 1790
1791/* Create a new event directory structure for a trace directory. */
1792static void
1793__trace_add_event_dirs(struct trace_array *tr)
1794{
1795 struct ftrace_module_file_ops *file_ops = NULL;
1796 struct ftrace_event_call *call;
1797 int ret;
1798
1799 list_for_each_entry(call, &ftrace_events, list) {
1800 if (call->mod) {
1801 /*
1802 * Directories for events by modules need to
1803 * keep module ref counts when opened (as we don't
1804 * want the module to disappear when reading one
1805 * of these files). The file_ops keep account of
1806 * the module ref count.
1807 */
1808 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1809 if (!file_ops)
1810 continue; /* Warn? */
1811 ret = __trace_add_new_mod_event(call, tr, file_ops);
1812 if (ret < 0)
1813 pr_warning("Could not create directory for event %s\n",
1814 call->name);
1815 continue;
1816 }
1817 ret = __trace_add_new_event(call, tr,
1818 &ftrace_event_id_fops,
1819 &ftrace_enable_fops,
1820 &ftrace_event_filter_fops,
1821 &ftrace_event_format_fops);
1822 if (ret < 0)
1823 pr_warning("Could not create directory for event %s\n",
1824 call->name);
1825 }
1826}
1827
1828#ifdef CONFIG_DYNAMIC_FTRACE
1829
1830/* Avoid typos */
1831#define ENABLE_EVENT_STR "enable_event"
1832#define DISABLE_EVENT_STR "disable_event"
1833
1834struct event_probe_data {
1835 struct ftrace_event_file *file;
1836 unsigned long count;
1837 int ref;
1838 bool enable;
1839};
1840
1841static struct ftrace_event_file *
1842find_event_file(struct trace_array *tr, const char *system, const char *event)
1843{
1844 struct ftrace_event_file *file;
1845 struct ftrace_event_call *call;
1846
1847 list_for_each_entry(file, &tr->events, list) {
1848
1849 call = file->event_call;
1850
1851 if (!call->name || !call->class || !call->class->reg)
1852 continue;
1853
1854 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
1855 continue;
1856
1857 if (strcmp(event, call->name) == 0 &&
1858 strcmp(system, call->class->system) == 0)
1859 return file;
1860 }
1861 return NULL;
1862}
1863
1864static void
1865event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1866{
1867 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1868 struct event_probe_data *data = *pdata;
1869
1870 if (!data)
1871 return;
1872
1873 if (data->enable)
1874 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1875 else
1876 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1877}
1878
1879static void
1880event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1881{
1882 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1883 struct event_probe_data *data = *pdata;
1884
1885 if (!data)
1886 return;
1887
1888 if (!data->count)
1889 return;
1890
1891 /* Skip if the event is in a state we want to switch to */
1892 if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1893 return;
1894
1895 if (data->count != -1)
1896 (data->count)--;
1897
1898 event_enable_probe(ip, parent_ip, _data);
1899}
1900
1901static int
1902event_enable_print(struct seq_file *m, unsigned long ip,
1903 struct ftrace_probe_ops *ops, void *_data)
1904{
1905 struct event_probe_data *data = _data;
1906
1907 seq_printf(m, "%ps:", (void *)ip);
1908
1909 seq_printf(m, "%s:%s:%s",
1910 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1911 data->file->event_call->class->system,
1912 data->file->event_call->name);
1913
1914 if (data->count == -1)
1915 seq_printf(m, ":unlimited\n");
1916 else
1917 seq_printf(m, ":count=%ld\n", data->count);
1918
1919 return 0;
1920}
1921
1922static int
1923event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
1924 void **_data)
1925{
1926 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1927 struct event_probe_data *data = *pdata;
1928
1929 data->ref++;
1930 return 0;
1931}
1932
1933static void
1934event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
1935 void **_data)
1936{
1937 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1938 struct event_probe_data *data = *pdata;
1939
1940 if (WARN_ON_ONCE(data->ref <= 0))
1941 return;
1942
1943 data->ref--;
1944 if (!data->ref) {
1945 /* Remove the SOFT_MODE flag */
1946 __ftrace_event_enable_disable(data->file, 0, 1);
1947 module_put(data->file->event_call->mod);
1948 kfree(data);
1949 }
1950 *pdata = NULL;
1951}
1952
1953static struct ftrace_probe_ops event_enable_probe_ops = {
1954 .func = event_enable_probe,
1955 .print = event_enable_print,
1956 .init = event_enable_init,
1957 .free = event_enable_free,
1958};
1959
1960static struct ftrace_probe_ops event_enable_count_probe_ops = {
1961 .func = event_enable_count_probe,
1962 .print = event_enable_print,
1963 .init = event_enable_init,
1964 .free = event_enable_free,
1965};
1966
1967static struct ftrace_probe_ops event_disable_probe_ops = {
1968 .func = event_enable_probe,
1969 .print = event_enable_print,
1970 .init = event_enable_init,
1971 .free = event_enable_free,
1972};
1973
1974static struct ftrace_probe_ops event_disable_count_probe_ops = {
1975 .func = event_enable_count_probe,
1976 .print = event_enable_print,
1977 .init = event_enable_init,
1978 .free = event_enable_free,
1979};
1980
1981static int
1982event_enable_func(struct ftrace_hash *hash,
1983 char *glob, char *cmd, char *param, int enabled)
1984{
1985 struct trace_array *tr = top_trace_array();
1986 struct ftrace_event_file *file;
1987 struct ftrace_probe_ops *ops;
1988 struct event_probe_data *data;
1989 const char *system;
1990 const char *event;
1991 char *number;
1992 bool enable;
1993 int ret;
1994
1995 /* hash funcs only work with set_ftrace_filter */
1996 if (!enabled)
1997 return -EINVAL;
1998
1999 if (!param)
2000 return -EINVAL;
2001
2002 system = strsep(&param, ":");
2003 if (!param)
2004 return -EINVAL;
2005
2006 event = strsep(&param, ":");
2007
2008 mutex_lock(&event_mutex);
2009
2010 ret = -EINVAL;
2011 file = find_event_file(tr, system, event);
2012 if (!file)
2013 goto out;
2014
2015 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
2016
2017 if (enable)
2018 ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
2019 else
2020 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2021
2022 if (glob[0] == '!') {
2023 unregister_ftrace_function_probe_func(glob+1, ops);
2024 ret = 0;
2025 goto out;
2026 }
2027
2028 ret = -ENOMEM;
2029 data = kzalloc(sizeof(*data), GFP_KERNEL);
2030 if (!data)
2031 goto out;
2032
2033 data->enable = enable;
2034 data->count = -1;
2035 data->file = file;
2036
2037 if (!param)
2038 goto out_reg;
2039
2040 number = strsep(&param, ":");
2041
2042 ret = -EINVAL;
2043 if (!strlen(number))
2044 goto out_free;
2045
2046 /*
2047 * We use the callback data field (which is a pointer)
2048 * as our counter.
2049 */
2050 ret = kstrtoul(number, 0, &data->count);
2051 if (ret)
2052 goto out_free;
2053
2054 out_reg:
2055 /* Don't let event modules unload while probe registered */
2056 ret = try_module_get(file->event_call->mod);
2057 if (!ret)
2058 goto out_free;
2059
2060 ret = __ftrace_event_enable_disable(file, 1, 1);
2061 if (ret < 0)
2062 goto out_put;
2063 ret = register_ftrace_function_probe(glob, ops, data);
2064 if (!ret)
2065 goto out_disable;
2066 out:
2067 mutex_unlock(&event_mutex);
2068 return ret;
2069
2070 out_disable:
2071 __ftrace_event_enable_disable(file, 0, 1);
2072 out_put:
2073 module_put(file->event_call->mod);
2074 out_free:
2075 kfree(data);
2076 goto out;
2077}
2078
2079static struct ftrace_func_command event_enable_cmd = {
2080 .name = ENABLE_EVENT_STR,
2081 .func = event_enable_func,
2082};
2083
2084static struct ftrace_func_command event_disable_cmd = {
2085 .name = DISABLE_EVENT_STR,
2086 .func = event_enable_func,
2087};
2088
2089static __init int register_event_cmds(void)
2090{
2091 int ret;
2092
2093 ret = register_ftrace_command(&event_enable_cmd);
2094 if (WARN_ON(ret < 0))
2095 return ret;
2096 ret = register_ftrace_command(&event_disable_cmd);
2097 if (WARN_ON(ret < 0))
2098 unregister_ftrace_command(&event_enable_cmd);
2099 return ret;
2100}
2101#else
2102static inline int register_event_cmds(void) { return 0; }
2103#endif /* CONFIG_DYNAMIC_FTRACE */
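register_event_cmds() above makes "enable_event" and "disable_event" available as function-probe commands, which event_enable_func() parses as "<function>:enable_event:<system>:<event>[:count]". A userspace sketch of arming one (the debugfs path and the sched:sched_switch event are assumptions, not guaranteed by this patch):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "w");

	if (!f) {
		perror("set_ftrace_filter");
		return EXIT_FAILURE;
	}
	/* Enable sched:sched_switch on the next three hits of schedule(). */
	fputs("schedule:enable_event:sched:sched_switch:3\n", f);
	return fclose(f) ? EXIT_FAILURE : EXIT_SUCCESS;
}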
2104
2105/*
2106 * The top level array has already had its ftrace_event_file
2107 * descriptors created in order to allow for early events to
2108 * be recorded. This function is called after the debugfs has been
2109 * initialized, and we now have to create the files associated
2110 * with the events.
2111 */
2112static __init void
2113__trace_early_add_event_dirs(struct trace_array *tr)
2114{
2115 struct ftrace_event_file *file;
2116 int ret;
2117
2118
2119 list_for_each_entry(file, &tr->events, list) {
2120 ret = event_create_dir(tr->event_dir, file,
2121 &ftrace_event_id_fops,
2122 &ftrace_enable_fops,
2123 &ftrace_event_filter_fops,
2124 &ftrace_event_format_fops);
2125 if (ret < 0)
2126 pr_warning("Could not create directory for event %s\n",
2127 file->event_call->name);
2128 }
2129}
2130
2131/*
2132 * For early boot up, the top trace array needs to have
2133 * a list of events that can be enabled. This must be done before
2134 * the filesystem is set up in order to allow events to be traced
2135 * early.
2136 */
2137static __init void
2138__trace_early_add_events(struct trace_array *tr)
2139{
2140 struct ftrace_event_call *call;
2141 int ret;
2142
2143 list_for_each_entry(call, &ftrace_events, list) {
2144 /* Early boot up should not have any modules loaded */
2145 if (WARN_ON_ONCE(call->mod))
2146 continue;
2147
2148 ret = __trace_early_add_new_event(call, tr);
2149 if (ret < 0)
2150 pr_warning("Could not create early event %s\n",
2151 call->name);
2152 }
2153}
2154
2155/* Remove the event directory structure for a trace directory. */
2156static void
2157__trace_remove_event_dirs(struct trace_array *tr)
2158{
2159 struct ftrace_event_file *file, *next;
2160
2161 list_for_each_entry_safe(file, next, &tr->events, list) {
2162 list_del(&file->list);
2163 debugfs_remove_recursive(file->dir);
2164 remove_subsystem(file->system);
2165 kmem_cache_free(file_cachep, file);
2166 }
2167}
2168
2169static void
2170__add_event_to_tracers(struct ftrace_event_call *call,
2171 struct ftrace_module_file_ops *file_ops)
2172{
2173 struct trace_array *tr;
2174
2175 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
2176 if (file_ops)
2177 __trace_add_new_mod_event(call, tr, file_ops);
2178 else
2179 __trace_add_new_event(call, tr,
2180 &ftrace_event_id_fops,
2181 &ftrace_enable_fops,
2182 &ftrace_event_filter_fops,
2183 &ftrace_event_format_fops);
2184 }
2185}
2186
1454static struct notifier_block trace_module_nb = { 2187static struct notifier_block trace_module_nb = {
1455 .notifier_call = trace_module_notify, 2188 .notifier_call = trace_module_notify,
1456 .priority = 0, 2189 .priority = 0,
@@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1464static __init int setup_trace_event(char *str) 2197static __init int setup_trace_event(char *str)
1465{ 2198{
1466 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); 2199 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1467 ring_buffer_expanded = 1; 2200 ring_buffer_expanded = true;
1468 tracing_selftest_disabled = 1; 2201 tracing_selftest_disabled = true;
1469 2202
1470 return 1; 2203 return 1;
1471} 2204}
1472__setup("trace_event=", setup_trace_event); 2205__setup("trace_event=", setup_trace_event);
1473 2206
2207/* Expects to have event_mutex held when called */
2208static int
2209create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2210{
2211 struct dentry *d_events;
2212 struct dentry *entry;
2213
2214 entry = debugfs_create_file("set_event", 0644, parent,
2215 tr, &ftrace_set_event_fops);
2216 if (!entry) {
2217 pr_warning("Could not create debugfs 'set_event' entry\n");
2218 return -ENOMEM;
2219 }
2220
2221 d_events = debugfs_create_dir("events", parent);
2222 if (!d_events) {
2223 pr_warning("Could not create debugfs 'events' directory\n");
2224 return -ENOMEM;
2225 }
2226
2227 /* ring buffer internal formats */
2228 trace_create_file("header_page", 0444, d_events,
2229 ring_buffer_print_page_header,
2230 &ftrace_show_header_fops);
2231
2232 trace_create_file("header_event", 0444, d_events,
2233 ring_buffer_print_entry_header,
2234 &ftrace_show_header_fops);
2235
2236 trace_create_file("enable", 0644, d_events,
2237 tr, &ftrace_tr_enable_fops);
2238
2239 tr->event_dir = d_events;
2240
2241 return 0;
2242}
2243
2244/**
2245 * event_trace_add_tracer - add an instance of a trace_array to events
2246 * @parent: The parent dentry to place the files/directories for events in
2247 * @tr: The trace array associated with these events
2248 *
2249 * When a new instance is created, it needs to set up its events
2250 * directory, as well as other files associated with events. It also
2251 * creates the event hierarchy in the @parent/events directory.
2252 *
2253 * Returns 0 on success.
2254 */
2255int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
2256{
2257 int ret;
2258
2259 mutex_lock(&event_mutex);
2260
2261 ret = create_event_toplevel_files(parent, tr);
2262 if (ret)
2263 goto out_unlock;
2264
2265 down_write(&trace_event_sem);
2266 __trace_add_event_dirs(tr);
2267 up_write(&trace_event_sem);
2268
2269 out_unlock:
2270 mutex_unlock(&event_mutex);
2271
2272 return ret;
2273}
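Since event_trace_add_tracer() runs when a new trace instance is set up, the whole path can be exercised from userspace once the instances directory exists (an assumption: the multi-buffer instances support added elsewhere in this series, with debugfs mounted at /sys/kernel/debug):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* Creating an instance should give it its own events/ hierarchy;
	 * removing the directory should tear it down via event_trace_del_tracer(). */
	if (mkdir("/sys/kernel/debug/tracing/instances/foo", 0755) != 0) {
		perror("mkdir instance");
		return 1;
	}
	return 0;
}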
2274
2275/*
2276 * The top trace array already had its file descriptors created.
2277 * Now the files themselves need to be created.
2278 */
2279static __init int
2280early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2281{
2282 int ret;
2283
2284 mutex_lock(&event_mutex);
2285
2286 ret = create_event_toplevel_files(parent, tr);
2287 if (ret)
2288 goto out_unlock;
2289
2290 down_write(&trace_event_sem);
2291 __trace_early_add_event_dirs(tr);
2292 up_write(&trace_event_sem);
2293
2294 out_unlock:
2295 mutex_unlock(&event_mutex);
2296
2297 return ret;
2298}
2299
2300int event_trace_del_tracer(struct trace_array *tr)
2301{
2302 /* Disable any running events */
2303 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2304
2305 mutex_lock(&event_mutex);
2306
2307 down_write(&trace_event_sem);
2308 __trace_remove_event_dirs(tr);
2309 debugfs_remove_recursive(tr->event_dir);
2310 up_write(&trace_event_sem);
2311
2312 tr->event_dir = NULL;
2313
2314 mutex_unlock(&event_mutex);
2315
2316 return 0;
2317}
2318
2319static __init int event_trace_memsetup(void)
2320{
2321 field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
2322 file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
2323 return 0;
2324}
2325
1474static __init int event_trace_enable(void) 2326static __init int event_trace_enable(void)
1475{ 2327{
2328 struct trace_array *tr = top_trace_array();
1476 struct ftrace_event_call **iter, *call; 2329 struct ftrace_event_call **iter, *call;
1477 char *buf = bootup_event_buf; 2330 char *buf = bootup_event_buf;
1478 char *token; 2331 char *token;
@@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)
1486 list_add(&call->list, &ftrace_events); 2339 list_add(&call->list, &ftrace_events);
1487 } 2340 }
1488 2341
2342 /*
2343 * We need the top trace array to have a working set of trace
2344 * points at early init, before the debug files and directories
2345 * are created. Create the file entries now, and attach them
2346 * to the actual file dentries later.
2347 */
2348 __trace_early_add_events(tr);
2349
1489 while (true) { 2350 while (true) {
1490 token = strsep(&buf, ","); 2351 token = strsep(&buf, ",");
1491 2352
@@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)
1494 if (!*token) 2355 if (!*token)
1495 continue; 2356 continue;
1496 2357
1497 ret = ftrace_set_clr_event(token, 1); 2358 ret = ftrace_set_clr_event(tr, token, 1);
1498 if (ret) 2359 if (ret)
1499 pr_warn("Failed to enable trace event: %s\n", token); 2360 pr_warn("Failed to enable trace event: %s\n", token);
1500 } 2361 }
1501 2362
1502 trace_printk_start_comm(); 2363 trace_printk_start_comm();
1503 2364
2365 register_event_cmds();
2366
1504 return 0; 2367 return 0;
1505} 2368}
1506 2369
1507static __init int event_trace_init(void) 2370static __init int event_trace_init(void)
1508{ 2371{
1509 struct ftrace_event_call *call; 2372 struct trace_array *tr;
1510 struct dentry *d_tracer; 2373 struct dentry *d_tracer;
1511 struct dentry *entry; 2374 struct dentry *entry;
1512 struct dentry *d_events;
1513 int ret; 2375 int ret;
1514 2376
2377 tr = top_trace_array();
2378
1515 d_tracer = tracing_init_dentry(); 2379 d_tracer = tracing_init_dentry();
1516 if (!d_tracer) 2380 if (!d_tracer)
1517 return 0; 2381 return 0;
1518 2382
1519 entry = debugfs_create_file("available_events", 0444, d_tracer, 2383 entry = debugfs_create_file("available_events", 0444, d_tracer,
1520 NULL, &ftrace_avail_fops); 2384 tr, &ftrace_avail_fops);
1521 if (!entry) 2385 if (!entry)
1522 pr_warning("Could not create debugfs " 2386 pr_warning("Could not create debugfs "
1523 "'available_events' entry\n"); 2387 "'available_events' entry\n");
1524 2388
1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1526 NULL, &ftrace_set_event_fops);
1527 if (!entry)
1528 pr_warning("Could not create debugfs "
1529 "'set_event' entry\n");
1530
1531 d_events = event_trace_events_dir();
1532 if (!d_events)
1533 return 0;
1534
1535 /* ring buffer internal formats */
1536 trace_create_file("header_page", 0444, d_events,
1537 ring_buffer_print_page_header,
1538 &ftrace_show_header_fops);
1539
1540 trace_create_file("header_event", 0444, d_events,
1541 ring_buffer_print_entry_header,
1542 &ftrace_show_header_fops);
1543
1544 trace_create_file("enable", 0644, d_events,
1545 NULL, &ftrace_system_enable_fops);
1546
1547 if (trace_define_common_fields()) 2389 if (trace_define_common_fields())
1548 pr_warning("tracing: Failed to allocate common fields"); 2390 pr_warning("tracing: Failed to allocate common fields");
1549 2391
1550 /* 2392 ret = early_event_add_tracer(d_tracer, tr);
1551 * Early initialization already enabled ftrace event. 2393 if (ret)
1552 * Now it's only necessary to create the event directory. 2394 return ret;
1553 */
1554 list_for_each_entry(call, &ftrace_events, list) {
1555
1556 ret = event_create_dir(call, d_events,
1557 &ftrace_event_id_fops,
1558 &ftrace_enable_fops,
1559 &ftrace_event_filter_fops,
1560 &ftrace_event_format_fops);
1561 if (ret < 0)
1562 event_remove(call);
1563 }
1564 2395
1565 ret = register_module_notifier(&trace_module_nb); 2396 ret = register_module_notifier(&trace_module_nb);
1566 if (ret) 2397 if (ret)
@@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)
1568 2399
1569 return 0; 2400 return 0;
1570} 2401}
2402early_initcall(event_trace_memsetup);
1571core_initcall(event_trace_enable); 2403core_initcall(event_trace_enable);
1572fs_initcall(event_trace_init); 2404fs_initcall(event_trace_init);
1573 2405
@@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)
1627 */ 2459 */
1628static __init void event_trace_self_tests(void) 2460static __init void event_trace_self_tests(void)
1629{ 2461{
2462 struct ftrace_subsystem_dir *dir;
2463 struct ftrace_event_file *file;
1630 struct ftrace_event_call *call; 2464 struct ftrace_event_call *call;
1631 struct event_subsystem *system; 2465 struct event_subsystem *system;
2466 struct trace_array *tr;
1632 int ret; 2467 int ret;
1633 2468
2469 tr = top_trace_array();
2470
1634 pr_info("Running tests on trace events:\n"); 2471 pr_info("Running tests on trace events:\n");
1635 2472
1636 list_for_each_entry(call, &ftrace_events, list) { 2473 list_for_each_entry(file, &tr->events, list) {
2474
2475 call = file->event_call;
1637 2476
1638 /* Only test those that have a probe */ 2477 /* Only test those that have a probe */
1639 if (!call->class || !call->class->probe) 2478 if (!call->class || !call->class->probe)
@@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)
1657 * If an event is already enabled, someone is using 2496 * If an event is already enabled, someone is using
1658 * it and the self test should not be on. 2497 * it and the self test should not be on.
1659 */ 2498 */
1660 if (call->flags & TRACE_EVENT_FL_ENABLED) { 2499 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
1661 pr_warning("Enabled event during self test!\n"); 2500 pr_warning("Enabled event during self test!\n");
1662 WARN_ON_ONCE(1); 2501 WARN_ON_ONCE(1);
1663 continue; 2502 continue;
1664 } 2503 }
1665 2504
1666 ftrace_event_enable_disable(call, 1); 2505 ftrace_event_enable_disable(file, 1);
1667 event_test_stuff(); 2506 event_test_stuff();
1668 ftrace_event_enable_disable(call, 0); 2507 ftrace_event_enable_disable(file, 0);
1669 2508
1670 pr_cont("OK\n"); 2509 pr_cont("OK\n");
1671 } 2510 }
@@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)
1674 2513
1675 pr_info("Running tests on trace event systems:\n"); 2514 pr_info("Running tests on trace event systems:\n");
1676 2515
1677 list_for_each_entry(system, &event_subsystems, list) { 2516 list_for_each_entry(dir, &tr->systems, list) {
2517
2518 system = dir->subsystem;
1678 2519
1679 /* the ftrace system is special, skip it */ 2520 /* the ftrace system is special, skip it */
1680 if (strcmp(system->name, "ftrace") == 0) 2521 if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)
1682 2523
1683 pr_info("Testing event system %s: ", system->name); 2524 pr_info("Testing event system %s: ", system->name);
1684 2525
1685 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); 2526 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
1686 if (WARN_ON_ONCE(ret)) { 2527 if (WARN_ON_ONCE(ret)) {
1687 pr_warning("error enabling system %s\n", 2528 pr_warning("error enabling system %s\n",
1688 system->name); 2529 system->name);
@@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)
1691 2532
1692 event_test_stuff(); 2533 event_test_stuff();
1693 2534
1694 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 2535 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
1695 if (WARN_ON_ONCE(ret)) { 2536 if (WARN_ON_ONCE(ret)) {
1696 pr_warning("error disabling system %s\n", 2537 pr_warning("error disabling system %s\n",
1697 system->name); 2538 system->name);
@@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)
1706 pr_info("Running tests on all trace events:\n"); 2547 pr_info("Running tests on all trace events:\n");
1707 pr_info("Testing all events: "); 2548 pr_info("Testing all events: ");
1708 2549
1709 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); 2550 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
1710 if (WARN_ON_ONCE(ret)) { 2551 if (WARN_ON_ONCE(ret)) {
1711 pr_warning("error enabling all events\n"); 2552 pr_warning("error enabling all events\n");
1712 return; 2553 return;
@@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)
1715 event_test_stuff(); 2556 event_test_stuff();
1716 2557
1717 /* reset sysname */ 2558 /* reset sysname */
1718 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); 2559 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
1719 if (WARN_ON_ONCE(ret)) { 2560 if (WARN_ON_ONCE(ret)) {
1720 pr_warning("error disabling all events\n"); 2561 pr_warning("error disabling all events\n");
1721 return; 2562 return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..a6361178de5a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
658 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
659} 659}
660 660
661static struct ftrace_event_field *
662__find_event_field(struct list_head *head, char *name)
663{
664 struct ftrace_event_field *field;
665
666 list_for_each_entry(field, head, link) {
667 if (!strcmp(field->name, name))
668 return field;
669 }
670
671 return NULL;
672}
673
674static struct ftrace_event_field *
675find_event_field(struct ftrace_event_call *call, char *name)
676{
677 struct ftrace_event_field *field;
678 struct list_head *head;
679
680 field = __find_event_field(&ftrace_common_fields, name);
681 if (field)
682 return field;
683
684 head = trace_get_fields(call);
685 return __find_event_field(head, name);
686}
687
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 661static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 662{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 663 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1337 return NULL; 1310 return NULL;
1338 } 1311 }
1339 1312
1340 field = find_event_field(call, operand1); 1313 field = trace_find_event_field(call, operand1);
1341 if (!field) { 1314 if (!field) {
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1315 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1343 return NULL; 1316 return NULL;
@@ -1907,16 +1880,17 @@ out_unlock:
1907 return err; 1880 return err;
1908} 1881}
1909 1882
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1883int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1911 char *filter_string) 1884 char *filter_string)
1912{ 1885{
1886 struct event_subsystem *system = dir->subsystem;
1913 struct event_filter *filter; 1887 struct event_filter *filter;
1914 int err = 0; 1888 int err = 0;
1915 1889
1916 mutex_lock(&event_mutex); 1890 mutex_lock(&event_mutex);
1917 1891
1918 /* Make sure the system still has events */ 1892 /* Make sure the system still has events */
1919 if (!system->nr_events) { 1893 if (!dir->nr_events) {
1920 err = -ENODEV; 1894 err = -ENODEV;
1921 goto out_unlock; 1895 goto out_unlock;
1922 } 1896 }
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
129 129
130#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
132int \ 132static int __init \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 134{ \
135 struct struct_name field; \ 135 struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \ 169 regfn) \
170 \ 170 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class __refdata event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
28static int function_trace_init(struct trace_array *tr) 28static int function_trace_init(struct trace_array *tr)
29{ 29{
30 func_trace = tr; 30 func_trace = tr;
31 tr->cpu = get_cpu(); 31 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 32 put_cpu();
33 33
34 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
44 44
45static void function_trace_start(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
46{ 46{
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(&tr->trace_buffer);
48} 48}
49 49
50/* Our option */ 50/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
76 goto out; 76 goto out;
77 77
78 cpu = smp_processor_id(); 78 cpu = smp_processor_id();
79 data = tr->data[cpu]; 79 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
80 if (!atomic_read(&data->disabled)) { 80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags); 81 local_save_flags(flags);
82 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
107 */ 107 */
108 local_irq_save(flags); 108 local_irq_save(flags);
109 cpu = raw_smp_processor_id(); 109 cpu = raw_smp_processor_id();
110 data = tr->data[cpu]; 110 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
111 disabled = atomic_inc_return(&data->disabled); 111 disabled = atomic_inc_return(&data->disabled);
112 112
113 if (likely(disabled == 1)) { 113 if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
214}; 214};
215 215
216#ifdef CONFIG_DYNAMIC_FTRACE 216#ifdef CONFIG_DYNAMIC_FTRACE
217static void 217static int update_count(void **data)
218ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
219{ 218{
220 long *count = (long *)data; 219 unsigned long *count = (long *)data;
221
222 if (tracing_is_on())
223 return;
224 220
225 if (!*count) 221 if (!*count)
226 return; 222 return 0;
227 223
228 if (*count != -1) 224 if (*count != -1)
229 (*count)--; 225 (*count)--;
230 226
231 tracing_on(); 227 return 1;
232} 228}
233 229
234static void 230static void
235ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 231ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
236{ 232{
237 long *count = (long *)data; 233 if (tracing_is_on())
234 return;
235
236 if (update_count(data))
237 tracing_on();
238}
238 239
240static void
241ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
242{
239 if (!tracing_is_on()) 243 if (!tracing_is_on())
240 return; 244 return;
241 245
242 if (!*count) 246 if (update_count(data))
247 tracing_off();
248}
249
250static void
251ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
252{
253 if (tracing_is_on())
243 return; 254 return;
244 255
245 if (*count != -1) 256 tracing_on();
246 (*count)--; 257}
258
259static void
260ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
261{
262 if (!tracing_is_on())
263 return;
247 264
248 tracing_off(); 265 tracing_off();
249} 266}
250 267
251static int 268/*
252ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 269 * Skip 4:
253 struct ftrace_probe_ops *ops, void *data); 270 * ftrace_stacktrace()
271 * function_trace_probe_call()
272 * ftrace_ops_list_func()
273 * ftrace_call()
274 */
275#define STACK_SKIP 4
254 276
255static struct ftrace_probe_ops traceon_probe_ops = { 277static void
256 .func = ftrace_traceon, 278ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
257 .print = ftrace_trace_onoff_print, 279{
258}; 280 trace_dump_stack(STACK_SKIP);
281}
259 282
260static struct ftrace_probe_ops traceoff_probe_ops = { 283static void
261 .func = ftrace_traceoff, 284ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
262 .print = ftrace_trace_onoff_print, 285{
263}; 286 if (!tracing_is_on())
287 return;
288
289 if (update_count(data))
290 trace_dump_stack(STACK_SKIP);
291}
264 292
265static int 293static int
266ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 294ftrace_probe_print(const char *name, struct seq_file *m,
267 struct ftrace_probe_ops *ops, void *data) 295 unsigned long ip, void *data)
268{ 296{
269 long count = (long)data; 297 long count = (long)data;
270 298
271 seq_printf(m, "%ps:", (void *)ip); 299 seq_printf(m, "%ps:%s", (void *)ip, name);
272
273 if (ops == &traceon_probe_ops)
274 seq_printf(m, "traceon");
275 else
276 seq_printf(m, "traceoff");
277 300
278 if (count == -1) 301 if (count == -1)
279 seq_printf(m, ":unlimited\n"); 302 seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
284} 307}
285 308
286static int 309static int
287ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) 310ftrace_traceon_print(struct seq_file *m, unsigned long ip,
311 struct ftrace_probe_ops *ops, void *data)
288{ 312{
289 struct ftrace_probe_ops *ops; 313 return ftrace_probe_print("traceon", m, ip, data);
290 314}
291 /* we register both traceon and traceoff to this callback */
292 if (strcmp(cmd, "traceon") == 0)
293 ops = &traceon_probe_ops;
294 else
295 ops = &traceoff_probe_ops;
296 315
297 unregister_ftrace_function_probe_func(glob, ops); 316static int
317ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
318 struct ftrace_probe_ops *ops, void *data)
319{
320 return ftrace_probe_print("traceoff", m, ip, data);
321}
298 322
299 return 0; 323static int
324ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
325 struct ftrace_probe_ops *ops, void *data)
326{
327 return ftrace_probe_print("stacktrace", m, ip, data);
300} 328}
301 329
330static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print,
333};
334
335static struct ftrace_probe_ops traceoff_count_probe_ops = {
336 .func = ftrace_traceoff_count,
337 .print = ftrace_traceoff_print,
338};
339
340static struct ftrace_probe_ops stacktrace_count_probe_ops = {
341 .func = ftrace_stacktrace_count,
342 .print = ftrace_stacktrace_print,
343};
344
345static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon,
347 .print = ftrace_traceon_print,
348};
349
350static struct ftrace_probe_ops traceoff_probe_ops = {
351 .func = ftrace_traceoff,
352 .print = ftrace_traceoff_print,
353};
354
355static struct ftrace_probe_ops stacktrace_probe_ops = {
356 .func = ftrace_stacktrace,
357 .print = ftrace_stacktrace_print,
358};
359
302static int 360static int
303ftrace_trace_onoff_callback(struct ftrace_hash *hash, 361ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
304 char *glob, char *cmd, char *param, int enable) 362 struct ftrace_hash *hash, char *glob,
363 char *cmd, char *param, int enable)
305{ 364{
306 struct ftrace_probe_ops *ops;
307 void *count = (void *)-1; 365 void *count = (void *)-1;
308 char *number; 366 char *number;
309 int ret; 367 int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
312 if (!enable) 370 if (!enable)
313 return -EINVAL; 371 return -EINVAL;
314 372
315 if (glob[0] == '!') 373 if (glob[0] == '!') {
316 return ftrace_trace_onoff_unreg(glob+1, cmd, param); 374 unregister_ftrace_function_probe_func(glob+1, ops);
317 375 return 0;
318 /* we register both traceon and traceoff to this callback */ 376 }
319 if (strcmp(cmd, "traceon") == 0)
320 ops = &traceon_probe_ops;
321 else
322 ops = &traceoff_probe_ops;
323 377
324 if (!param) 378 if (!param)
325 goto out_reg; 379 goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
343 return ret < 0 ? ret : 0; 397 return ret < 0 ? ret : 0;
344} 398}
345 399
400static int
401ftrace_trace_onoff_callback(struct ftrace_hash *hash,
402 char *glob, char *cmd, char *param, int enable)
403{
404 struct ftrace_probe_ops *ops;
405
406 /* we register both traceon and traceoff to this callback */
407 if (strcmp(cmd, "traceon") == 0)
408 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
409 else
410 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
411
412 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
413 param, enable);
414}
415
416static int
417ftrace_stacktrace_callback(struct ftrace_hash *hash,
418 char *glob, char *cmd, char *param, int enable)
419{
420 struct ftrace_probe_ops *ops;
421
422 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
423
424 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
425 param, enable);
426}
427
346static struct ftrace_func_command ftrace_traceon_cmd = { 428static struct ftrace_func_command ftrace_traceon_cmd = {
347 .name = "traceon", 429 .name = "traceon",
348 .func = ftrace_trace_onoff_callback, 430 .func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
353 .func = ftrace_trace_onoff_callback, 435 .func = ftrace_trace_onoff_callback,
354}; 436};
355 437
438static struct ftrace_func_command ftrace_stacktrace_cmd = {
439 .name = "stacktrace",
440 .func = ftrace_stacktrace_callback,
441};
442
356static int __init init_func_cmd_traceon(void) 443static int __init init_func_cmd_traceon(void)
357{ 444{
358 int ret; 445 int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
364 ret = register_ftrace_command(&ftrace_traceon_cmd); 451 ret = register_ftrace_command(&ftrace_traceon_cmd);
365 if (ret) 452 if (ret)
366 unregister_ftrace_command(&ftrace_traceoff_cmd); 453 unregister_ftrace_command(&ftrace_traceoff_cmd);
454
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) {
457 unregister_ftrace_command(&ftrace_traceoff_cmd);
458 unregister_ftrace_command(&ftrace_traceon_cmd);
459 }
367 return ret; 460 return ret;
368} 461}
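With the "stacktrace" command registered above, set_ftrace_filter accepts "<function>:stacktrace[:count]" alongside traceon/traceoff, and a leading '!' removes a previously armed probe. A userspace sketch (debugfs path assumed, kfree used only as an example target):

#include <stdio.h>

/* Write one command string to set_ftrace_filter; returns 0 on success. */
static int ftrace_cmd(const char *cmd)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", cmd);
	return fclose(f);
}

int main(void)
{
	ftrace_cmd("kfree:stacktrace:5");	/* dump a stack on the next 5 kfree() calls */
	ftrace_cmd("!kfree:stacktrace");	/* later: remove the probe again */
	return 0;
}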
369#else 462#else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
218{ 218{
219 struct ftrace_event_call *call = &event_funcgraph_entry; 219 struct ftrace_event_call *call = &event_funcgraph_entry;
220 struct ring_buffer_event *event; 220 struct ring_buffer_event *event;
221 struct ring_buffer *buffer = tr->buffer; 221 struct ring_buffer *buffer = tr->trace_buffer.buffer;
222 struct ftrace_graph_ent_entry *entry; 222 struct ftrace_graph_ent_entry *entry;
223 223
224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
265 265
266 local_irq_save(flags); 266 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 267 cpu = raw_smp_processor_id();
268 data = tr->data[cpu]; 268 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
269 disabled = atomic_inc_return(&data->disabled); 269 disabled = atomic_inc_return(&data->disabled);
270 if (likely(disabled == 1)) { 270 if (likely(disabled == 1)) {
271 pc = preempt_count(); 271 pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
323{ 323{
324 struct ftrace_event_call *call = &event_funcgraph_exit; 324 struct ftrace_event_call *call = &event_funcgraph_exit;
325 struct ring_buffer_event *event; 325 struct ring_buffer_event *event;
326 struct ring_buffer *buffer = tr->buffer; 326 struct ring_buffer *buffer = tr->trace_buffer.buffer;
327 struct ftrace_graph_ret_entry *entry; 327 struct ftrace_graph_ret_entry *entry;
328 328
329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
350 350
351 local_irq_save(flags); 351 local_irq_save(flags);
352 cpu = raw_smp_processor_id(); 352 cpu = raw_smp_processor_id();
353 data = tr->data[cpu]; 353 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
354 disabled = atomic_inc_return(&data->disabled); 354 disabled = atomic_inc_return(&data->disabled);
355 if (likely(disabled == 1)) { 355 if (likely(disabled == 1)) {
356 pc = preempt_count(); 356 pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
560 * We need to consume the current entry to see 560 * We need to consume the current entry to see
561 * the next one. 561 * the next one.
562 */ 562 */
563 ring_buffer_consume(iter->tr->buffer, iter->cpu, 563 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
564 NULL, NULL); 564 NULL, NULL);
565 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 565 event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
566 NULL, NULL); 566 NULL, NULL);
567 } 567 }
568 568
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 443b25b43b4f..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -33,6 +33,7 @@ enum {
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_flags; 35static int save_flags;
36static bool function_enabled;
36 37
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 38static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 39static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
121 if (!irqs_disabled_flags(*flags)) 122 if (!irqs_disabled_flags(*flags))
122 return 0; 123 return 0;
123 124
124 *data = tr->data[cpu]; 125 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
125 disabled = atomic_inc_return(&(*data)->disabled); 126 disabled = atomic_inc_return(&(*data)->disabled);
126 127
127 if (likely(disabled == 1)) 128 if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
175 per_cpu(tracing_cpu, cpu) = 0; 176 per_cpu(tracing_cpu, cpu) = 0;
176 177
177 tracing_max_latency = 0; 178 tracing_max_latency = 0;
178 tracing_reset_online_cpus(irqsoff_trace); 179 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
179 180
180 return start_irqsoff_tracer(irqsoff_trace, set); 181 return start_irqsoff_tracer(irqsoff_trace, set);
181} 182}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
380 if (per_cpu(tracing_cpu, cpu)) 381 if (per_cpu(tracing_cpu, cpu))
381 return; 382 return;
382 383
383 data = tr->data[cpu]; 384 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
384 385
385 if (unlikely(!data) || atomic_read(&data->disabled)) 386 if (unlikely(!data) || atomic_read(&data->disabled))
386 return; 387 return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
418 if (!tracer_enabled) 419 if (!tracer_enabled)
419 return; 420 return;
420 421
421 data = tr->data[cpu]; 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
422 423
423 if (unlikely(!data) || 424 if (unlikely(!data) ||
424 !data->critical_start || atomic_read(&data->disabled)) 425 !data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
528} 529}
529#endif /* CONFIG_PREEMPT_TRACER */ 530#endif /* CONFIG_PREEMPT_TRACER */
530 531
531static int start_irqsoff_tracer(struct trace_array *tr, int graph) 532static int register_irqsoff_function(int graph, int set)
532{ 533{
533 int ret = 0; 534 int ret;
534 535
535 if (!graph) 536 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
536 ret = register_ftrace_function(&trace_ops); 537 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
537 else 538 return 0;
539
540 if (graph)
538 ret = register_ftrace_graph(&irqsoff_graph_return, 541 ret = register_ftrace_graph(&irqsoff_graph_return,
539 &irqsoff_graph_entry); 542 &irqsoff_graph_entry);
543 else
544 ret = register_ftrace_function(&trace_ops);
545
546 if (!ret)
547 function_enabled = true;
548
549 return ret;
550}
551
552static void unregister_irqsoff_function(int graph)
553{
554 if (!function_enabled)
555 return;
556
557 if (graph)
558 unregister_ftrace_graph();
559 else
560 unregister_ftrace_function(&trace_ops);
561
562 function_enabled = false;
563}
564
565static void irqsoff_function_set(int set)
566{
567 if (set)
568 register_irqsoff_function(is_graph(), 1);
569 else
570 unregister_irqsoff_function(is_graph());
571}
572
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
574{
575 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set);
577
578 return trace_keep_overwrite(tracer, mask, set);
579}
580
581static int start_irqsoff_tracer(struct trace_array *tr, int graph)
582{
583 int ret;
584
585 ret = register_irqsoff_function(graph, 0);
540 586
541 if (!ret && tracing_is_enabled()) 587 if (!ret && tracing_is_enabled())
542 tracer_enabled = 1; 588 tracer_enabled = 1;
@@ -550,10 +596,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
550{ 596{
551 tracer_enabled = 0; 597 tracer_enabled = 0;
552 598
553 if (!graph) 599 unregister_irqsoff_function(graph);
554 unregister_ftrace_function(&trace_ops);
555 else
556 unregister_ftrace_graph();
557} 600}
558 601
559static void __irqsoff_tracer_init(struct trace_array *tr) 602static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -561,14 +604,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
561 save_flags = trace_flags; 604 save_flags = trace_flags;
562 605
563 /* non overwrite screws up the latency tracers */ 606 /* non overwrite screws up the latency tracers */
564 set_tracer_flag(TRACE_ITER_OVERWRITE, 1); 607 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
565 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); 608 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
566 609
567 tracing_max_latency = 0; 610 tracing_max_latency = 0;
568 irqsoff_trace = tr; 611 irqsoff_trace = tr;
569 /* make sure that the tracer is visible */ 612 /* make sure that the tracer is visible */
570 smp_wmb(); 613 smp_wmb();
571 tracing_reset_online_cpus(tr); 614 tracing_reset_online_cpus(&tr->trace_buffer);
572 615
573 if (start_irqsoff_tracer(tr, is_graph())) 616 if (start_irqsoff_tracer(tr, is_graph()))
574 printk(KERN_ERR "failed to start irqsoff tracer\n"); 617 printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -581,8 +624,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
581 624
582 stop_irqsoff_tracer(tr, is_graph()); 625 stop_irqsoff_tracer(tr, is_graph());
583 626
584 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); 627 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
585 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); 628 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
586} 629}
587 630
588static void irqsoff_tracer_start(struct trace_array *tr) 631static void irqsoff_tracer_start(struct trace_array *tr)
@@ -615,7 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
615 .print_line = irqsoff_print_line, 658 .print_line = irqsoff_print_line,
616 .flags = &tracer_flags, 659 .flags = &tracer_flags,
617 .set_flag = irqsoff_set_flag, 660 .set_flag = irqsoff_set_flag,
618 .flag_changed = trace_keep_overwrite, 661 .flag_changed = irqsoff_flag_changed,
619#ifdef CONFIG_FTRACE_SELFTEST 662#ifdef CONFIG_FTRACE_SELFTEST
620 .selftest = trace_selftest_startup_irqsoff, 663 .selftest = trace_selftest_startup_irqsoff,
621#endif 664#endif
@@ -649,7 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
649 .print_line = irqsoff_print_line, 692 .print_line = irqsoff_print_line,
650 .flags = &tracer_flags, 693 .flags = &tracer_flags,
651 .set_flag = irqsoff_set_flag, 694 .set_flag = irqsoff_set_flag,
652 .flag_changed = trace_keep_overwrite, 695 .flag_changed = irqsoff_flag_changed,
653#ifdef CONFIG_FTRACE_SELFTEST 696#ifdef CONFIG_FTRACE_SELFTEST
654 .selftest = trace_selftest_startup_preemptoff, 697 .selftest = trace_selftest_startup_preemptoff,
655#endif 698#endif
@@ -685,7 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
685 .print_line = irqsoff_print_line, 728 .print_line = irqsoff_print_line,
686 .flags = &tracer_flags, 729 .flags = &tracer_flags,
687 .set_flag = irqsoff_set_flag, 730 .set_flag = irqsoff_set_flag,
688 .flag_changed = trace_keep_overwrite, 731 .flag_changed = irqsoff_flag_changed,
689#ifdef CONFIG_FTRACE_SELFTEST 732#ifdef CONFIG_FTRACE_SELFTEST
690 .selftest = trace_selftest_startup_preemptirqsoff, 733 .selftest = trace_selftest_startup_preemptirqsoff,
691#endif 734#endif
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
26 trace_init_global_iter(&iter); 26 trace_init_global_iter(&iter);
27 27
28 for_each_tracing_cpu(cpu) { 28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled); 29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
30 } 30 }
31 31
32 old_userobj = trace_flags; 32 old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
43 iter.iter_flags |= TRACE_FILE_LAT_FMT; 43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1; 44 iter.pos = -1;
45 45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) { 46 if (cpu_file == RING_BUFFER_ALL_CPUS) {
47 for_each_tracing_cpu(cpu) { 47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] = 48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu); 49 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]); 50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu); 51 tracing_iter_reset(&iter, cpu);
52 } 52 }
53 } else { 53 } else {
54 iter.cpu_file = cpu_file; 54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] = 55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file); 56 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 58 tracing_iter_reset(&iter, cpu_file);
59 } 59 }
@@ -83,7 +83,7 @@ out:
83 trace_flags = old_userobj; 83 trace_flags = old_userobj;
84 84
85 for_each_tracing_cpu(cpu) { 85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled); 86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 87 }
88 88
89 for_each_tracing_cpu(cpu) 89 for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
115 !cpu_online(cpu_file)) 115 !cpu_online(cpu_file))
116 return KDB_BADINT; 116 return KDB_BADINT;
117 } else { 117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU; 118 cpu_file = RING_BUFFER_ALL_CPUS;
119 } 119 }
120 120
121 kdb_trap_printk++; 121 kdb_trap_printk++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
31 overrun_detected = false; 31 overrun_detected = false;
32 prev_overruns = 0; 32 prev_overruns = 0;
33 33
34 tracing_reset_online_cpus(tr); 34 tracing_reset_online_cpus(&tr->trace_buffer);
35} 35}
36 36
37static int mmio_trace_init(struct trace_array *tr) 37static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
128static unsigned long count_overruns(struct trace_iterator *iter) 128static unsigned long count_overruns(struct trace_iterator *iter)
129{ 129{
130 unsigned long cnt = atomic_xchg(&dropped_count, 0); 130 unsigned long cnt = atomic_xchg(&dropped_count, 0);
131 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 131 unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
132 132
133 if (over > prev_overruns) 133 if (over > prev_overruns)
134 cnt += over - prev_overruns; 134 cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
309 struct mmiotrace_rw *rw) 309 struct mmiotrace_rw *rw)
310{ 310{
311 struct ftrace_event_call *call = &event_mmiotrace_rw; 311 struct ftrace_event_call *call = &event_mmiotrace_rw;
312 struct ring_buffer *buffer = tr->buffer; 312 struct ring_buffer *buffer = tr->trace_buffer.buffer;
313 struct ring_buffer_event *event; 313 struct ring_buffer_event *event;
314 struct trace_mmiotrace_rw *entry; 314 struct trace_mmiotrace_rw *entry;
315 int pc = preempt_count(); 315 int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
330void mmio_trace_rw(struct mmiotrace_rw *rw) 330void mmio_trace_rw(struct mmiotrace_rw *rw)
331{ 331{
332 struct trace_array *tr = mmio_trace_array; 332 struct trace_array *tr = mmio_trace_array;
333 struct trace_array_cpu *data = tr->data[smp_processor_id()]; 333 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
334 __trace_mmiotrace_rw(tr, data, rw); 334 __trace_mmiotrace_rw(tr, data, rw);
335} 335}
336 336
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
339 struct mmiotrace_map *map) 339 struct mmiotrace_map *map)
340{ 340{
341 struct ftrace_event_call *call = &event_mmiotrace_map; 341 struct ftrace_event_call *call = &event_mmiotrace_map;
342 struct ring_buffer *buffer = tr->buffer; 342 struct ring_buffer *buffer = tr->trace_buffer.buffer;
343 struct ring_buffer_event *event; 343 struct ring_buffer_event *event;
344 struct trace_mmiotrace_map *entry; 344 struct trace_mmiotrace_map *entry;
345 int pc = preempt_count(); 345 int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
363 struct trace_array_cpu *data; 363 struct trace_array_cpu *data;
364 364
365 preempt_disable(); 365 preempt_disable();
366 data = tr->data[smp_processor_id()]; 366 data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
367 __trace_mmiotrace_map(tr, data, map); 367 __trace_mmiotrace_map(tr, data, map);
368 preempt_enable(); 368 preempt_enable();
369} 369}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 697e88d13907..bb922d9ee51b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_sem);
18 18
19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
37 return ret; 37 return ret;
38} 38}
39 39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{
42 struct trace_seq *s = &iter->seq;
43 struct trace_entry *entry = iter->ent;
44 struct bputs_entry *field;
45 int ret;
46
47 trace_assign_type(field, entry);
48
49 ret = trace_seq_puts(s, field->str);
50 if (!ret)
51 return TRACE_TYPE_PARTIAL_LINE;
52
53 return TRACE_TYPE_HANDLED;
54}
55
40enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 56enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41{ 57{
42 struct trace_seq *s = &iter->seq; 58 struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
397} 413}
398EXPORT_SYMBOL(ftrace_print_hex_seq); 414EXPORT_SYMBOL(ftrace_print_hex_seq);
399 415
416int ftrace_raw_output_prep(struct trace_iterator *iter,
417 struct trace_event *trace_event)
418{
419 struct ftrace_event_call *event;
420 struct trace_seq *s = &iter->seq;
421 struct trace_seq *p = &iter->tmp_seq;
422 struct trace_entry *entry;
423 int ret;
424
425 event = container_of(trace_event, struct ftrace_event_call, event);
426 entry = iter->ent;
427
428 if (entry->type != event->event.type) {
429 WARN_ON_ONCE(1);
430 return TRACE_TYPE_UNHANDLED;
431 }
432
433 trace_seq_init(p);
434 ret = trace_seq_printf(s, "%s: ", event->name);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return 0;
439}
440EXPORT_SYMBOL(ftrace_raw_output_prep);
441
400#ifdef CONFIG_KRETPROBES 442#ifdef CONFIG_KRETPROBES
401static inline const char *kretprobed(const char *name) 443static inline const char *kretprobed(const char *name)
402{ 444{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617{ 659{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 660 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 661 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 662 unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts; 663 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq; 664 struct trace_seq *s = &iter->seq;
623 665
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
783 825
784void trace_event_read_lock(void) 826void trace_event_read_lock(void)
785{ 827{
786 down_read(&trace_event_mutex); 828 down_read(&trace_event_sem);
787} 829}
788 830
789void trace_event_read_unlock(void) 831void trace_event_read_unlock(void)
790{ 832{
791 up_read(&trace_event_mutex); 833 up_read(&trace_event_sem);
792} 834}
793 835
794/** 836/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
811 unsigned key; 853 unsigned key;
812 int ret = 0; 854 int ret = 0;
813 855
814 down_write(&trace_event_mutex); 856 down_write(&trace_event_sem);
815 857
816 if (WARN_ON(!event)) 858 if (WARN_ON(!event))
817 goto out; 859 goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
866 908
867 ret = event->type; 909 ret = event->type;
868 out: 910 out:
869 up_write(&trace_event_mutex); 911 up_write(&trace_event_sem);
870 912
871 return ret; 913 return ret;
872} 914}
873EXPORT_SYMBOL_GPL(register_ftrace_event); 915EXPORT_SYMBOL_GPL(register_ftrace_event);
874 916
875/* 917/*
876 * Used by module code with the trace_event_mutex held for write. 918 * Used by module code with the trace_event_sem held for write.
877 */ 919 */
878int __unregister_ftrace_event(struct trace_event *event) 920int __unregister_ftrace_event(struct trace_event *event)
879{ 921{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
888 */ 930 */
889int unregister_ftrace_event(struct trace_event *event) 931int unregister_ftrace_event(struct trace_event *event)
890{ 932{
891 down_write(&trace_event_mutex); 933 down_write(&trace_event_sem);
892 __unregister_ftrace_event(event); 934 __unregister_ftrace_event(event);
893 up_write(&trace_event_mutex); 935 up_write(&trace_event_sem);
894 936
895 return 0; 937 return 0;
896} 938}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
1217 .funcs = &trace_user_stack_funcs, 1259 .funcs = &trace_user_stack_funcs,
1218}; 1260};
1219 1261
1262/* TRACE_BPUTS */
1263static enum print_line_t
1264trace_bputs_print(struct trace_iterator *iter, int flags,
1265 struct trace_event *event)
1266{
1267 struct trace_entry *entry = iter->ent;
1268 struct trace_seq *s = &iter->seq;
1269 struct bputs_entry *field;
1270
1271 trace_assign_type(field, entry);
1272
1273 if (!seq_print_ip_sym(s, field->ip, flags))
1274 goto partial;
1275
1276 if (!trace_seq_puts(s, ": "))
1277 goto partial;
1278
1279 if (!trace_seq_puts(s, field->str))
1280 goto partial;
1281
1282 return TRACE_TYPE_HANDLED;
1283
1284 partial:
1285 return TRACE_TYPE_PARTIAL_LINE;
1286}
1287
1288
1289static enum print_line_t
1290trace_bputs_raw(struct trace_iterator *iter, int flags,
1291 struct trace_event *event)
1292{
1293 struct bputs_entry *field;
1294 struct trace_seq *s = &iter->seq;
1295
1296 trace_assign_type(field, iter->ent);
1297
1298 if (!trace_seq_printf(s, ": %lx : ", field->ip))
1299 goto partial;
1300
1301 if (!trace_seq_puts(s, field->str))
1302 goto partial;
1303
1304 return TRACE_TYPE_HANDLED;
1305
1306 partial:
1307 return TRACE_TYPE_PARTIAL_LINE;
1308}
1309
1310static struct trace_event_functions trace_bputs_funcs = {
1311 .trace = trace_bputs_print,
1312 .raw = trace_bputs_raw,
1313};
1314
1315static struct trace_event trace_bputs_event = {
1316 .type = TRACE_BPUTS,
1317 .funcs = &trace_bputs_funcs,
1318};
1319
1220/* TRACE_BPRINT */ 1320/* TRACE_BPRINT */
1221static enum print_line_t 1321static enum print_line_t
1222trace_bprint_print(struct trace_iterator *iter, int flags, 1322trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
1329 &trace_wake_event, 1429 &trace_wake_event,
1330 &trace_stack_event, 1430 &trace_stack_event,
1331 &trace_user_stack_event, 1431 &trace_user_stack_event,
1432 &trace_bputs_event,
1332 &trace_bprint_event, 1433 &trace_bprint_event,
1333 &trace_print_event, 1434 &trace_print_event,
1334 NULL 1435 NULL
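Editor's note: the new ftrace_raw_output_prep() helper above recovers the enclosing ftrace_event_call from the embedded struct trace_event via container_of() before checking that the entry type matches. A rough, self-contained userspace sketch of that idiom follows; the fake_* structures and field values are invented stand-ins for the kernel types, not the real definitions.

#include <stddef.h>
#include <stdio.h>

/* Invented stand-ins that only mimic the shape of the kernel structures. */
struct fake_trace_event { int type; };
struct fake_event_call  { const char *name; struct fake_trace_event event; };

/* Same arithmetic as the kernel's container_of(): back up by the member offset. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct fake_event_call call = { .name = "sched_switch", .event = { .type = 42 } };
	struct fake_trace_event *ev = &call.event;	/* what a callback would receive */
	struct fake_event_call *owner;

	owner = container_of(ev, struct fake_event_call, event);
	printf("recovered call '%s', entry type %d\n", owner->name, owner->event.type);
	return 0;
}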
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
5#include "trace.h" 5#include "trace.h"
6 6
7extern enum print_line_t 7extern enum print_line_t
8trace_print_bputs_msg_only(struct trace_iterator *iter);
9extern enum print_line_t
8trace_print_bprintk_msg_only(struct trace_iterator *iter); 10trace_print_bprintk_msg_only(struct trace_iterator *iter);
9extern enum print_line_t 11extern enum print_line_t
10trace_print_printk_msg_only(struct trace_iterator *iter); 12trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 33
32/* used by module unregistering */ 34/* used by module unregistering */
33extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
34extern struct rw_semaphore trace_event_mutex; 36extern struct rw_semaphore trace_event_sem;
35 37
36#define MAX_MEMHEX_BYTES 8 38#define MAX_MEMHEX_BYTES 8
37#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
28 unsigned long flags, int pc) 28 unsigned long flags, int pc)
29{ 29{
30 struct ftrace_event_call *call = &event_context_switch; 30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer; 31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event; 32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry; 33 struct ctx_switch_entry *entry;
34 34
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
69 pc = preempt_count(); 69 pc = preempt_count();
70 local_irq_save(flags); 70 local_irq_save(flags);
71 cpu = raw_smp_processor_id(); 71 cpu = raw_smp_processor_id();
72 data = ctx_trace->data[cpu]; 72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73 73
74 if (likely(!atomic_read(&data->disabled))) 74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); 75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
86 struct ftrace_event_call *call = &event_wakeup; 86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event; 87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry; 88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->buffer; 89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90 90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, 91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc); 92 sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
123 pc = preempt_count(); 123 pc = preempt_count();
124 local_irq_save(flags); 124 local_irq_save(flags);
125 cpu = raw_smp_processor_id(); 125 cpu = raw_smp_processor_id();
126 data = ctx_trace->data[cpu]; 126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127 127
128 if (likely(!atomic_read(&data->disabled))) 128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current, 129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
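Editor's note: the hunks above are mechanical fallout from moving per-CPU trace data under tr->trace_buffer, so each trace_array carries its own buffer and per-CPU state, addressed with per_cpu_ptr() instead of a plain array index. A loose userspace analogy of the "skip the probe while this CPU's data is disabled" guard is sketched below; the instance/cpu_data names and the fixed NR_CPUS array are assumptions standing in for real per-CPU storage.

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

/* Loosely modelled on the per-CPU trace data: just a disable counter here. */
struct cpu_data { atomic_int disabled; };
struct instance { struct cpu_data data[NR_CPUS]; };

static void probe(struct instance *tr, int cpu, const char *msg)
{
	struct cpu_data *data = &tr->data[cpu];		/* per_cpu_ptr() stand-in */

	if (atomic_load(&data->disabled) == 0)		/* mirrors the !atomic_read() test */
		printf("cpu%d: %s\n", cpu, msg);
}

int main(void)
{
	struct instance tr = { 0 };

	probe(&tr, 1, "recorded");
	atomic_store(&tr.data[1].disabled, 1);		/* e.g. buffer being swapped out */
	probe(&tr, 1, "dropped (never printed)");
	return 0;
}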
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index fde652c9a511..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -37,6 +37,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_flags; 39static int save_flags;
40static bool function_enabled;
40 41
41#define TRACE_DISPLAY_GRAPH 1 42#define TRACE_DISPLAY_GRAPH 1
42 43
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
89 if (cpu != wakeup_current_cpu) 90 if (cpu != wakeup_current_cpu)
90 goto out_enable; 91 goto out_enable;
91 92
92 *data = tr->data[cpu]; 93 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
93 disabled = atomic_inc_return(&(*data)->disabled); 94 disabled = atomic_inc_return(&(*data)->disabled);
94 if (unlikely(disabled != 1)) 95 if (unlikely(disabled != 1))
95 goto out; 96 goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
134}; 135};
135#endif /* CONFIG_FUNCTION_TRACER */ 136#endif /* CONFIG_FUNCTION_TRACER */
136 137
137static int start_func_tracer(int graph) 138static int register_wakeup_function(int graph, int set)
138{ 139{
139 int ret; 140 int ret;
140 141
141 if (!graph) 142 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
142 ret = register_ftrace_function(&trace_ops); 143 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
143 else 144 return 0;
145
146 if (graph)
144 ret = register_ftrace_graph(&wakeup_graph_return, 147 ret = register_ftrace_graph(&wakeup_graph_return,
145 &wakeup_graph_entry); 148 &wakeup_graph_entry);
149 else
150 ret = register_ftrace_function(&trace_ops);
151
152 if (!ret)
153 function_enabled = true;
154
155 return ret;
156}
157
158static void unregister_wakeup_function(int graph)
159{
160 if (!function_enabled)
161 return;
162
163 if (graph)
164 unregister_ftrace_graph();
165 else
166 unregister_ftrace_function(&trace_ops);
167
168 function_enabled = false;
169}
170
171static void wakeup_function_set(int set)
172{
173 if (set)
174 register_wakeup_function(is_graph(), 1);
175 else
176 unregister_wakeup_function(is_graph());
177}
178
179static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
180{
181 if (mask & TRACE_ITER_FUNCTION)
182 wakeup_function_set(set);
183
184 return trace_keep_overwrite(tracer, mask, set);
185}
186
187static int start_func_tracer(int graph)
188{
189 int ret;
190
191 ret = register_wakeup_function(graph, 0);
146 192
147 if (!ret && tracing_is_enabled()) 193 if (!ret && tracing_is_enabled())
148 tracer_enabled = 1; 194 tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
156{ 202{
157 tracer_enabled = 0; 203 tracer_enabled = 0;
158 204
159 if (!graph) 205 unregister_wakeup_function(graph);
160 unregister_ftrace_function(&trace_ops);
161 else
162 unregister_ftrace_graph();
163} 206}
164 207
165#ifdef CONFIG_FUNCTION_GRAPH_TRACER 208#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
353 396
354 /* disable local data, not wakeup_cpu data */ 397 /* disable local data, not wakeup_cpu data */
355 cpu = raw_smp_processor_id(); 398 cpu = raw_smp_processor_id();
356 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 399 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
357 if (likely(disabled != 1)) 400 if (likely(disabled != 1))
358 goto out; 401 goto out;
359 402
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
365 goto out_unlock; 408 goto out_unlock;
366 409
367 /* The task we are waiting for is waking up */ 410 /* The task we are waiting for is waking up */
368 data = wakeup_trace->data[wakeup_cpu]; 411 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
369 412
370 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 413 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
371 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 414 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
387 arch_spin_unlock(&wakeup_lock); 430 arch_spin_unlock(&wakeup_lock);
388 local_irq_restore(flags); 431 local_irq_restore(flags);
389out: 432out:
390 atomic_dec(&wakeup_trace->data[cpu]->disabled); 433 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
391} 434}
392 435
393static void __wakeup_reset(struct trace_array *tr) 436static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
405{ 448{
406 unsigned long flags; 449 unsigned long flags;
407 450
408 tracing_reset_online_cpus(tr); 451 tracing_reset_online_cpus(&tr->trace_buffer);
409 452
410 local_irq_save(flags); 453 local_irq_save(flags);
411 arch_spin_lock(&wakeup_lock); 454 arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
435 return; 478 return;
436 479
437 pc = preempt_count(); 480 pc = preempt_count();
438 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 481 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
439 if (unlikely(disabled != 1)) 482 if (unlikely(disabled != 1))
440 goto out; 483 goto out;
441 484
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
458 501
459 local_save_flags(flags); 502 local_save_flags(flags);
460 503
461 data = wakeup_trace->data[wakeup_cpu]; 504 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
462 data->preempt_timestamp = ftrace_now(cpu); 505 data->preempt_timestamp = ftrace_now(cpu);
463 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); 506 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
464 507
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472out_locked: 515out_locked:
473 arch_spin_unlock(&wakeup_lock); 516 arch_spin_unlock(&wakeup_lock);
474out: 517out:
475 atomic_dec(&wakeup_trace->data[cpu]->disabled); 518 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
476} 519}
477 520
478static void start_wakeup_tracer(struct trace_array *tr) 521static void start_wakeup_tracer(struct trace_array *tr)
@@ -543,8 +586,8 @@ static int __wakeup_tracer_init(struct trace_array *tr)
543 save_flags = trace_flags; 586 save_flags = trace_flags;
544 587
545 /* non overwrite screws up the latency tracers */ 588 /* non overwrite screws up the latency tracers */
546 set_tracer_flag(TRACE_ITER_OVERWRITE, 1); 589 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
547 set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); 590 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
548 591
549 tracing_max_latency = 0; 592 tracing_max_latency = 0;
550 wakeup_trace = tr; 593 wakeup_trace = tr;
@@ -573,8 +616,8 @@ static void wakeup_tracer_reset(struct trace_array *tr)
573 /* make sure we put back any tasks we are tracing */ 616 /* make sure we put back any tasks we are tracing */
574 wakeup_reset(tr); 617 wakeup_reset(tr);
575 618
576 set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); 619 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
577 set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); 620 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
578} 621}
579 622
580static void wakeup_tracer_start(struct trace_array *tr) 623static void wakeup_tracer_start(struct trace_array *tr)
@@ -600,7 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
600 .print_line = wakeup_print_line, 643 .print_line = wakeup_print_line,
601 .flags = &tracer_flags, 644 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag, 645 .set_flag = wakeup_set_flag,
603 .flag_changed = trace_keep_overwrite, 646 .flag_changed = wakeup_flag_changed,
604#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
605 .selftest = trace_selftest_startup_wakeup, 648 .selftest = trace_selftest_startup_wakeup,
606#endif 649#endif
@@ -622,7 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
622 .print_line = wakeup_print_line, 665 .print_line = wakeup_print_line,
623 .flags = &tracer_flags, 666 .flags = &tracer_flags,
624 .set_flag = wakeup_set_flag, 667 .set_flag = wakeup_set_flag,
625 .flag_changed = trace_keep_overwrite, 668 .flag_changed = wakeup_flag_changed,
626#ifdef CONFIG_FTRACE_SELFTEST 669#ifdef CONFIG_FTRACE_SELFTEST
627 .selftest = trace_selftest_startup_wakeup, 670 .selftest = trace_selftest_startup_wakeup,
628#endif 671#endif
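Editor's note: the wakeup tracer now funnels function-tracer registration through register_wakeup_function()/unregister_wakeup_function(), guarded by the new function_enabled flag, so flipping the function trace option while the tracer is active cannot double-register or double-unregister the callback. A minimal userspace sketch of that idempotent pattern is below; the names and the printf placeholders are assumptions, and no real ftrace calls are made.

#include <stdbool.h>
#include <stdio.h>

#define ITER_FUNCTION 0x1		/* stand-in for TRACE_ITER_FUNCTION */

static unsigned int trace_flags = ITER_FUNCTION;
static bool function_enabled;

static int register_function(int set)
{
	/* Already registered, or the option is off and not about to be set. */
	if (function_enabled || (!set && !(trace_flags & ITER_FUNCTION)))
		return 0;

	printf("registering function callback\n");
	function_enabled = true;
	return 0;
}

static void unregister_function(void)
{
	if (!function_enabled)
		return;
	printf("unregistering function callback\n");
	function_enabled = false;
}

int main(void)
{
	register_function(0);	/* tracer start: registers once */
	register_function(0);	/* second attempt is a no-op */
	unregister_function();	/* drops the callback */
	unregister_function();	/* already off: no-op */
	return 0;
}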
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..55e2cf66967b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
21 return 0; 21 return 0;
22} 22}
23 23
24static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) 24static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
25{ 25{
26 struct ring_buffer_event *event; 26 struct ring_buffer_event *event;
27 struct trace_entry *entry; 27 struct trace_entry *entry;
28 unsigned int loops = 0; 28 unsigned int loops = 0;
29 29
30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { 30 while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
31 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
32 32
33 /* 33 /*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
58 * Test the trace buffer to see if all the elements 58 * Test the trace buffer to see if all the elements
59 * are still sane. 59 * are still sane.
60 */ 60 */
61static int trace_test_buffer(struct trace_array *tr, unsigned long *count) 61static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
62{ 62{
63 unsigned long flags, cnt = 0; 63 unsigned long flags, cnt = 0;
64 int cpu, ret = 0; 64 int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&ftrace_max_lock);
69 69
70 cnt = ring_buffer_entries(tr->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
72 /* 72 /*
73 * The trace_test_buffer_cpu runs a while loop to consume all data. 73 * The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
78 */ 78 */
79 tracing_off(); 79 tracing_off();
80 for_each_possible_cpu(cpu) { 80 for_each_possible_cpu(cpu) {
81 ret = trace_test_buffer_cpu(tr, cpu); 81 ret = trace_test_buffer_cpu(buf, cpu);
82 if (ret) 82 if (ret)
83 break; 83 break;
84 } 84 }
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
355 msleep(100); 355 msleep(100);
356 356
357 /* we should have nothing in the buffer */ 357 /* we should have nothing in the buffer */
358 ret = trace_test_buffer(tr, &count); 358 ret = trace_test_buffer(&tr->trace_buffer, &count);
359 if (ret) 359 if (ret)
360 goto out; 360 goto out;
361 361
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
376 ftrace_enabled = 0; 376 ftrace_enabled = 0;
377 377
378 /* check the trace buffer */ 378 /* check the trace buffer */
379 ret = trace_test_buffer(tr, &count); 379 ret = trace_test_buffer(&tr->trace_buffer, &count);
380 tracing_start(); 380 tracing_start();
381 381
382 /* we should only have one item */ 382 /* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
666 ftrace_enabled = 0; 666 ftrace_enabled = 0;
667 667
668 /* check the trace buffer */ 668 /* check the trace buffer */
669 ret = trace_test_buffer(tr, &count); 669 ret = trace_test_buffer(&tr->trace_buffer, &count);
670 trace->reset(tr); 670 trace->reset(tr);
671 tracing_start(); 671 tracing_start();
672 672
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
703/* Maximum number of functions to trace before diagnosing a hang */ 703/* Maximum number of functions to trace before diagnosing a hang */
704#define GRAPH_MAX_FUNC_TEST 100000000 704#define GRAPH_MAX_FUNC_TEST 100000000
705 705
706static void
707__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
708static unsigned int graph_hang_thresh; 706static unsigned int graph_hang_thresh;
709 707
710/* Wrap the real function entry probe to avoid possible hanging */ 708/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
714 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { 712 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
715 ftrace_graph_stop(); 713 ftrace_graph_stop();
716 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 714 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
717 if (ftrace_dump_on_oops) 715 if (ftrace_dump_on_oops) {
718 __ftrace_dump(false, DUMP_ALL); 716 ftrace_dump(DUMP_ALL);
717 /* ftrace_dump() disables tracing */
718 tracing_on();
719 }
719 return 0; 720 return 0;
720 } 721 }
721 722
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
737 * Simulate the init() callback but we attach a watchdog callback 738 * Simulate the init() callback but we attach a watchdog callback
738 * to detect and recover from possible hangs 739 * to detect and recover from possible hangs
739 */ 740 */
740 tracing_reset_online_cpus(tr); 741 tracing_reset_online_cpus(&tr->trace_buffer);
741 set_graph_array(tr); 742 set_graph_array(tr);
742 ret = register_ftrace_graph(&trace_graph_return, 743 ret = register_ftrace_graph(&trace_graph_return,
743 &trace_graph_entry_watchdog); 744 &trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
760 tracing_stop(); 761 tracing_stop();
761 762
762 /* check the trace buffer */ 763 /* check the trace buffer */
763 ret = trace_test_buffer(tr, &count); 764 ret = trace_test_buffer(&tr->trace_buffer, &count);
764 765
765 trace->reset(tr); 766 trace->reset(tr);
766 tracing_start(); 767 tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
815 /* stop the tracing. */ 816 /* stop the tracing. */
816 tracing_stop(); 817 tracing_stop();
817 /* check both trace buffers */ 818 /* check both trace buffers */
818 ret = trace_test_buffer(tr, NULL); 819 ret = trace_test_buffer(&tr->trace_buffer, NULL);
819 if (!ret) 820 if (!ret)
820 ret = trace_test_buffer(&max_tr, &count); 821 ret = trace_test_buffer(&tr->max_buffer, &count);
821 trace->reset(tr); 822 trace->reset(tr);
822 tracing_start(); 823 tracing_start();
823 824
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
877 /* stop the tracing. */ 878 /* stop the tracing. */
878 tracing_stop(); 879 tracing_stop();
879 /* check both trace buffers */ 880 /* check both trace buffers */
880 ret = trace_test_buffer(tr, NULL); 881 ret = trace_test_buffer(&tr->trace_buffer, NULL);
881 if (!ret) 882 if (!ret)
882 ret = trace_test_buffer(&max_tr, &count); 883 ret = trace_test_buffer(&tr->max_buffer, &count);
883 trace->reset(tr); 884 trace->reset(tr);
884 tracing_start(); 885 tracing_start();
885 886
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
943 /* stop the tracing. */ 944 /* stop the tracing. */
944 tracing_stop(); 945 tracing_stop();
945 /* check both trace buffers */ 946 /* check both trace buffers */
946 ret = trace_test_buffer(tr, NULL); 947 ret = trace_test_buffer(&tr->trace_buffer, NULL);
947 if (ret) 948 if (ret)
948 goto out; 949 goto out;
949 950
950 ret = trace_test_buffer(&max_tr, &count); 951 ret = trace_test_buffer(&tr->max_buffer, &count);
951 if (ret) 952 if (ret)
952 goto out; 953 goto out;
953 954
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 /* stop the tracing. */ 974 /* stop the tracing. */
974 tracing_stop(); 975 tracing_stop();
975 /* check both trace buffers */ 976 /* check both trace buffers */
976 ret = trace_test_buffer(tr, NULL); 977 ret = trace_test_buffer(&tr->trace_buffer, NULL);
977 if (ret) 978 if (ret)
978 goto out; 979 goto out;
979 980
980 ret = trace_test_buffer(&max_tr, &count); 981 ret = trace_test_buffer(&tr->max_buffer, &count);
981 982
982 if (!ret && !count) { 983 if (!ret && !count) {
983 printk(KERN_CONT ".. no entries found .."); 984 printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1084 /* stop the tracing. */ 1085 /* stop the tracing. */
1085 tracing_stop(); 1086 tracing_stop();
1086 /* check both trace buffers */ 1087 /* check both trace buffers */
1087 ret = trace_test_buffer(tr, NULL); 1088 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1088 printk("ret = %d\n", ret); 1089 printk("ret = %d\n", ret);
1089 if (!ret) 1090 if (!ret)
1090 ret = trace_test_buffer(&max_tr, &count); 1091 ret = trace_test_buffer(&tr->max_buffer, &count);
1091 1092
1092 1093
1093 trace->reset(tr); 1094 trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
1126 /* stop the tracing. */ 1127 /* stop the tracing. */
1127 tracing_stop(); 1128 tracing_stop();
1128 /* check the trace buffer */ 1129 /* check the trace buffer */
1129 ret = trace_test_buffer(tr, &count); 1130 ret = trace_test_buffer(&tr->trace_buffer, &count);
1130 trace->reset(tr); 1131 trace->reset(tr);
1131 tracing_start(); 1132 tracing_start();
1132 1133
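Editor's note: with per-instance buffers, the selftests above now pass trace_test_buffer() a specific struct trace_buffer (&tr->trace_buffer or &tr->max_buffer) rather than a trace_array. The loop it wraps simply drains the buffer, sanity-checks every entry and gives up if it never empties; a toy userspace version of that shape, with a fake fixed-size buffer and an arbitrary loop bound, is sketched here.

#include <stdio.h>

struct entry { int type; };

#define TYPE_FN		1
#define TYPE_CTX	2
#define MAX_LOOPS	1000	/* guard against a buffer that never drains */

/* Pretend ring buffer: a fixed array plus a consume cursor. */
static struct entry buf[] = { { TYPE_FN }, { TYPE_CTX }, { TYPE_FN } };
static unsigned int head;

static struct entry *consume(void)
{
	if (head >= sizeof(buf) / sizeof(buf[0]))
		return NULL;
	return &buf[head++];
}

int main(void)
{
	struct entry *e;
	unsigned long count = 0, loops = 0;

	while ((e = consume())) {
		if (++loops > MAX_LOOPS) {
			fprintf(stderr, "buffer never empties, bailing\n");
			return 1;
		}
		if (e->type != TYPE_FN && e->type != TYPE_CTX) {
			fprintf(stderr, "bad entry type %d\n", e->type);
			return 1;
		}
		count++;
	}
	printf("%lu sane entries\n", count);
	return 0;
}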
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 83a8b5b7bd35..b20428c5efe2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
20 20
21#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
22 22
23#ifdef CC_USING_FENTRY
24# define fentry 1
25#else
26# define fentry 0
27#endif
28
23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 29static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
24 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 30 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
25static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 31static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
26 32
33/*
34 * Reserve one entry for the passed in ip. This will allow
35 * us to remove most or all of the stack size overhead
36 * added by the stack tracer itself.
37 */
27static struct stack_trace max_stack_trace = { 38static struct stack_trace max_stack_trace = {
28 .max_entries = STACK_TRACE_ENTRIES, 39 .max_entries = STACK_TRACE_ENTRIES - 1,
29 .entries = stack_dump_trace, 40 .entries = &stack_dump_trace[1],
30}; 41};
31 42
32static unsigned long max_stack_size; 43static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
39int stack_tracer_enabled; 50int stack_tracer_enabled;
40static int last_stack_tracer_enabled; 51static int last_stack_tracer_enabled;
41 52
42static inline void check_stack(void) 53static inline void
54check_stack(unsigned long ip, unsigned long *stack)
43{ 55{
44 unsigned long this_size, flags; 56 unsigned long this_size, flags;
45 unsigned long *p, *top, *start; 57 unsigned long *p, *top, *start;
58 static int tracer_frame;
59 int frame_size = ACCESS_ONCE(tracer_frame);
46 int i; 60 int i;
47 61
48 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); 62 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
49 this_size = THREAD_SIZE - this_size; 63 this_size = THREAD_SIZE - this_size;
64 /* Remove the frame of the tracer */
65 this_size -= frame_size;
50 66
51 if (this_size <= max_stack_size) 67 if (this_size <= max_stack_size)
52 return; 68 return;
53 69
54 /* we do not handle interrupt stacks yet */ 70 /* we do not handle interrupt stacks yet */
55 if (!object_is_on_stack(&this_size)) 71 if (!object_is_on_stack(stack))
56 return; 72 return;
57 73
58 local_irq_save(flags); 74 local_irq_save(flags);
59 arch_spin_lock(&max_stack_lock); 75 arch_spin_lock(&max_stack_lock);
60 76
77 /* In case another CPU set the tracer_frame on us */
78 if (unlikely(!frame_size))
79 this_size -= tracer_frame;
80
61 /* a race could have already updated it */ 81 /* a race could have already updated it */
62 if (this_size <= max_stack_size) 82 if (this_size <= max_stack_size)
63 goto out; 83 goto out;
@@ -70,10 +90,18 @@ static inline void check_stack(void)
70 save_stack_trace(&max_stack_trace); 90 save_stack_trace(&max_stack_trace);
71 91
72 /* 92 /*
93 * Add the passed in ip from the function tracer.
94 * Searching for this on the stack will skip over
95 * most of the overhead from the stack tracer itself.
96 */
97 stack_dump_trace[0] = ip;
98 max_stack_trace.nr_entries++;
99
100 /*
73 * Now find where in the stack these are. 101 * Now find where in the stack these are.
74 */ 102 */
75 i = 0; 103 i = 0;
76 start = &this_size; 104 start = stack;
77 top = (unsigned long *) 105 top = (unsigned long *)
78 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 106 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
79 107
@@ -97,6 +125,18 @@ static inline void check_stack(void)
97 found = 1; 125 found = 1;
98 /* Start the search from here */ 126 /* Start the search from here */
99 start = p + 1; 127 start = p + 1;
128 /*
129 * We do not want to show the overhead
130 * of the stack tracer stack in the
131 * max stack. If we haven't figured
132 * out what that is, then figure it out
133 * now.
134 */
135 if (unlikely(!tracer_frame) && i == 1) {
136 tracer_frame = (p - stack) *
137 sizeof(unsigned long);
138 max_stack_size -= tracer_frame;
139 }
100 } 140 }
101 } 141 }
102 142
@@ -113,6 +153,7 @@ static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 153stack_trace_call(unsigned long ip, unsigned long parent_ip,
114 struct ftrace_ops *op, struct pt_regs *pt_regs) 154 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 155{
156 unsigned long stack;
116 int cpu; 157 int cpu;
117 158
118 preempt_disable_notrace(); 159 preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
122 if (per_cpu(trace_active, cpu)++ != 0) 163 if (per_cpu(trace_active, cpu)++ != 0)
123 goto out; 164 goto out;
124 165
125 check_stack(); 166 /*
167 * When fentry is used, the traced function does not get
168 * its stack frame set up, and we lose the parent.
169 * The ip is pretty useless because the function tracer
170 * was called before that function set up its stack frame.
171 * In this case, we use the parent ip.
172 *
173 * By adding the return address of either the parent ip
174 * or the current ip we can disregard most of the stack usage
175 * caused by the stack tracer itself.
176 *
177 * The function tracer always reports the address of where the
178 * mcount call was, but the stack will hold the return address.
179 */
180 if (fentry)
181 ip = parent_ip;
182 else
183 ip += MCOUNT_INSN_SIZE;
184
185 check_stack(ip, &stack);
126 186
127 out: 187 out:
128 per_cpu(trace_active, cpu)--; 188 per_cpu(trace_active, cpu)--;
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
371 struct dentry *d_tracer; 431 struct dentry *d_tracer;
372 432
373 d_tracer = tracing_init_dentry(); 433 d_tracer = tracing_init_dentry();
434 if (!d_tracer)
435 return 0;
374 436
375 trace_create_file("stack_max_size", 0644, d_tracer, 437 trace_create_file("stack_max_size", 0644, d_tracer,
376 &max_stack_size, &stack_max_size_fops); 438 &max_stack_size, &stack_max_size_fops);
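Editor's note: check_stack() now measures depth from the address of a local in its caller and, once known, subtracts tracer_frame, the stack tracer's own overhead. The measurement itself is just masking that address against a THREAD_SIZE-aligned stack; the short userspace illustration below assumes an 8 KiB THREAD_SIZE and a made-up address, since there is no real kernel stack to probe here.

#include <stdio.h>

#define THREAD_SIZE (8 * 1024UL)	/* assumption: 8 KiB kernel stacks */

/* Bytes in use between 'addr' and the top of its THREAD_SIZE-aligned stack. */
static unsigned long stack_usage(unsigned long addr)
{
	unsigned long offset = addr & (THREAD_SIZE - 1);	/* distance from the stack base */

	return THREAD_SIZE - offset;	/* the stack grows down, so this is the used part */
}

int main(void)
{
	unsigned long addr = 0xffff880012345e40UL;	/* made-up on-stack address */
	unsigned long used = stack_usage(addr);
	unsigned long tracer_frame = 96;		/* pretend measured tracer overhead */

	printf("raw depth %lu, minus tracer frame %lu -> %lu bytes\n",
	       used, tracer_frame, used - tracer_frame);
	return 0;
}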
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
307 struct dentry *d_tracing; 307 struct dentry *d_tracing;
308 308
309 d_tracing = tracing_init_dentry(); 309 d_tracing = tracing_init_dentry();
310 if (!d_tracing)
311 return 0;
310 312
311 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 313 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
312 if (!stat_dir) 314 if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
12#include "trace.h" 12#include "trace.h"
13 13
14static DEFINE_MUTEX(syscall_trace_lock); 14static DEFINE_MUTEX(syscall_trace_lock);
15static int sys_refcount_enter;
16static int sys_refcount_exit;
17static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
18static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
19 15
20static int syscall_enter_register(struct ftrace_event_call *event, 16static int syscall_enter_register(struct ftrace_event_call *event,
21 enum trace_reg type, void *data); 17 enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
41 /* 37 /*
42 * Only compare after the "sys" prefix. Archs that use 38 * Only compare after the "sys" prefix. Archs that use
43 * syscall wrappers may have syscalls symbols aliases prefixed 39 * syscall wrappers may have syscalls symbols aliases prefixed
44 * with "SyS" instead of "sys", leading to an unwanted 40 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
45 * mismatch. 41 * mismatch.
46 */ 42 */
47 return !strcmp(sym + 3, name + 3); 43 return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
265 kfree(call->print_fmt); 261 kfree(call->print_fmt);
266} 262}
267 263
268static int syscall_enter_define_fields(struct ftrace_event_call *call) 264static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
269{ 265{
270 struct syscall_trace_enter trace; 266 struct syscall_trace_enter trace;
271 struct syscall_metadata *meta = call->data; 267 struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
288 return ret; 284 return ret;
289} 285}
290 286
291static int syscall_exit_define_fields(struct ftrace_event_call *call) 287static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
292{ 288{
293 struct syscall_trace_exit trace; 289 struct syscall_trace_exit trace;
294 int ret; 290 int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
303 return ret; 299 return ret;
304} 300}
305 301
306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
307{ 303{
304 struct trace_array *tr = data;
308 struct syscall_trace_enter *entry; 305 struct syscall_trace_enter *entry;
309 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
310 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
315 syscall_nr = trace_get_syscall_nr(current, regs); 312 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 313 if (syscall_nr < 0)
317 return; 314 return;
318 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 315 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
319 return; 316 return;
320 317
321 sys_data = syscall_nr_to_meta(syscall_nr); 318 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
324 321
325 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
326 323
327 event = trace_current_buffer_lock_reserve(&buffer, 324 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer,
328 sys_data->enter_event->event.type, size, 0, 0); 326 sys_data->enter_event->event.type, size, 0, 0);
329 if (!event) 327 if (!event)
330 return; 328 return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
338 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 336 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
339} 337}
340 338
341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
342{ 340{
341 struct trace_array *tr = data;
343 struct syscall_trace_exit *entry; 342 struct syscall_trace_exit *entry;
344 struct syscall_metadata *sys_data; 343 struct syscall_metadata *sys_data;
345 struct ring_buffer_event *event; 344 struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
349 syscall_nr = trace_get_syscall_nr(current, regs); 348 syscall_nr = trace_get_syscall_nr(current, regs);
350 if (syscall_nr < 0) 349 if (syscall_nr < 0)
351 return; 350 return;
352 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 351 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
353 return; 352 return;
354 353
355 sys_data = syscall_nr_to_meta(syscall_nr); 354 sys_data = syscall_nr_to_meta(syscall_nr);
356 if (!sys_data) 355 if (!sys_data)
357 return; 356 return;
358 357
359 event = trace_current_buffer_lock_reserve(&buffer, 358 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
361 if (!event) 361 if (!event)
362 return; 362 return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
371} 371}
372 372
373static int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_file *file,
374 struct ftrace_event_call *call)
374{ 375{
376 struct trace_array *tr = file->tr;
375 int ret = 0; 377 int ret = 0;
376 int num; 378 int num;
377 379
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
379 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 381 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
380 return -ENOSYS; 382 return -ENOSYS;
381 mutex_lock(&syscall_trace_lock); 383 mutex_lock(&syscall_trace_lock);
382 if (!sys_refcount_enter) 384 if (!tr->sys_refcount_enter)
383 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 385 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
384 if (!ret) { 386 if (!ret) {
385 set_bit(num, enabled_enter_syscalls); 387 set_bit(num, tr->enabled_enter_syscalls);
386 sys_refcount_enter++; 388 tr->sys_refcount_enter++;
387 } 389 }
388 mutex_unlock(&syscall_trace_lock); 390 mutex_unlock(&syscall_trace_lock);
389 return ret; 391 return ret;
390} 392}
391 393
392static void unreg_event_syscall_enter(struct ftrace_event_call *call) 394static void unreg_event_syscall_enter(struct ftrace_event_file *file,
395 struct ftrace_event_call *call)
393{ 396{
397 struct trace_array *tr = file->tr;
394 int num; 398 int num;
395 399
396 num = ((struct syscall_metadata *)call->data)->syscall_nr; 400 num = ((struct syscall_metadata *)call->data)->syscall_nr;
397 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 401 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
398 return; 402 return;
399 mutex_lock(&syscall_trace_lock); 403 mutex_lock(&syscall_trace_lock);
400 sys_refcount_enter--; 404 tr->sys_refcount_enter--;
401 clear_bit(num, enabled_enter_syscalls); 405 clear_bit(num, tr->enabled_enter_syscalls);
402 if (!sys_refcount_enter) 406 if (!tr->sys_refcount_enter)
403 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 407 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
404 mutex_unlock(&syscall_trace_lock); 408 mutex_unlock(&syscall_trace_lock);
405} 409}
406 410
407static int reg_event_syscall_exit(struct ftrace_event_call *call) 411static int reg_event_syscall_exit(struct ftrace_event_file *file,
412 struct ftrace_event_call *call)
408{ 413{
414 struct trace_array *tr = file->tr;
409 int ret = 0; 415 int ret = 0;
410 int num; 416 int num;
411 417
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
413 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 419 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
414 return -ENOSYS; 420 return -ENOSYS;
415 mutex_lock(&syscall_trace_lock); 421 mutex_lock(&syscall_trace_lock);
416 if (!sys_refcount_exit) 422 if (!tr->sys_refcount_exit)
417 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 423 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
418 if (!ret) { 424 if (!ret) {
419 set_bit(num, enabled_exit_syscalls); 425 set_bit(num, tr->enabled_exit_syscalls);
420 sys_refcount_exit++; 426 tr->sys_refcount_exit++;
421 } 427 }
422 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
423 return ret; 429 return ret;
424} 430}
425 431
426static void unreg_event_syscall_exit(struct ftrace_event_call *call) 432static void unreg_event_syscall_exit(struct ftrace_event_file *file,
433 struct ftrace_event_call *call)
427{ 434{
435 struct trace_array *tr = file->tr;
428 int num; 436 int num;
429 437
430 num = ((struct syscall_metadata *)call->data)->syscall_nr; 438 num = ((struct syscall_metadata *)call->data)->syscall_nr;
431 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 439 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
432 return; 440 return;
433 mutex_lock(&syscall_trace_lock); 441 mutex_lock(&syscall_trace_lock);
434 sys_refcount_exit--; 442 tr->sys_refcount_exit--;
435 clear_bit(num, enabled_exit_syscalls); 443 clear_bit(num, tr->enabled_exit_syscalls);
436 if (!sys_refcount_exit) 444 if (!tr->sys_refcount_exit)
437 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 445 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
438 mutex_unlock(&syscall_trace_lock); 446 mutex_unlock(&syscall_trace_lock);
439} 447}
440 448
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit, 479 .trace = print_syscall_exit,
472}; 480};
473 481
474struct ftrace_event_class event_class_syscall_enter = { 482struct ftrace_event_class __refdata event_class_syscall_enter = {
475 .system = "syscalls", 483 .system = "syscalls",
476 .reg = syscall_enter_register, 484 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields, 485 .define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
479 .raw_init = init_syscall_trace, 487 .raw_init = init_syscall_trace,
480}; 488};
481 489
482struct ftrace_event_class event_class_syscall_exit = { 490struct ftrace_event_class __refdata event_class_syscall_exit = {
483 .system = "syscalls", 491 .system = "syscalls",
484 .reg = syscall_exit_register, 492 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields, 493 .define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
685static int syscall_enter_register(struct ftrace_event_call *event, 693static int syscall_enter_register(struct ftrace_event_call *event,
686 enum trace_reg type, void *data) 694 enum trace_reg type, void *data)
687{ 695{
696 struct ftrace_event_file *file = data;
697
688 switch (type) { 698 switch (type) {
689 case TRACE_REG_REGISTER: 699 case TRACE_REG_REGISTER:
690 return reg_event_syscall_enter(event); 700 return reg_event_syscall_enter(file, event);
691 case TRACE_REG_UNREGISTER: 701 case TRACE_REG_UNREGISTER:
692 unreg_event_syscall_enter(event); 702 unreg_event_syscall_enter(file, event);
693 return 0; 703 return 0;
694 704
695#ifdef CONFIG_PERF_EVENTS 705#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
711static int syscall_exit_register(struct ftrace_event_call *event, 721static int syscall_exit_register(struct ftrace_event_call *event,
712 enum trace_reg type, void *data) 722 enum trace_reg type, void *data)
713{ 723{
724 struct ftrace_event_file *file = data;
725
714 switch (type) { 726 switch (type) {
715 case TRACE_REG_REGISTER: 727 case TRACE_REG_REGISTER:
716 return reg_event_syscall_exit(event); 728 return reg_event_syscall_exit(file, event);
717 case TRACE_REG_UNREGISTER: 729 case TRACE_REG_UNREGISTER:
718 unreg_event_syscall_exit(event); 730 unreg_event_syscall_exit(file, event);
719 return 0; 731 return 0;
720 732
721#ifdef CONFIG_PERF_EVENTS 733#ifdef CONFIG_PERF_EVENTS
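Editor's note: the enabled-syscall bitmaps and their refcounts move from file-scope globals into struct trace_array, so each tracing instance tracks its own set and only the first enable / last disable (un)registers the shared tracepoint. A compact userspace model of that refcounted-bitmap pattern follows; the bit helpers, instance layout and printf placeholders are toy assumptions rather than the kernel's bitmap API.

#include <stdio.h>
#include <string.h>

#define NR_SYSCALLS	64
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

struct instance {
	unsigned long enabled[(NR_SYSCALLS + BITS_PER_LONG - 1) / BITS_PER_LONG];
	int refcount;
};

static void set_bit_(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static void clear_bit_(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

static void enable_syscall(struct instance *tr, int nr)
{
	if (!tr->refcount)
		printf("register sys_enter tracepoint for this instance\n");
	set_bit_(nr, tr->enabled);
	tr->refcount++;
}

static void disable_syscall(struct instance *tr, int nr)
{
	tr->refcount--;
	clear_bit_(nr, tr->enabled);
	if (!tr->refcount)
		printf("unregister sys_enter tracepoint for this instance\n");
}

int main(void)
{
	struct instance tr;

	memset(&tr, 0, sizeof(tr));
	enable_syscall(&tr, 0);		/* first enable registers */
	enable_syscall(&tr, 1);		/* second just flips a bit */
	disable_syscall(&tr, 0);
	disable_syscall(&tr, 1);	/* last disable unregisters */
	return 0;
}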
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8dad2a92dee9..32494fb0ee64 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,6 +28,18 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct uprobe_trace_entry_head {
32 struct trace_entry ent;
33 unsigned long vaddr[];
34};
35
36#define SIZEOF_TRACE_ENTRY(is_return) \
37 (sizeof(struct uprobe_trace_entry_head) + \
38 sizeof(unsigned long) * (is_return ? 2 : 1))
39
40#define DATAOF_TRACE_ENTRY(entry, is_return) \
41 ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
42
31struct trace_uprobe_filter { 43struct trace_uprobe_filter {
32 rwlock_t rwlock; 44 rwlock_t rwlock;
33 int nr_systemwide; 45 int nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
64static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
65 77
66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs);
67 81
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{ 83{
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
77 return !filter->nr_systemwide && list_empty(&filter->perf_events); 91 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78} 92}
79 93
94static inline bool is_ret_probe(struct trace_uprobe *tu)
95{
96 return tu->consumer.ret_handler != NULL;
97}
98
80/* 99/*
81 * Allocate new trace_uprobe and initialize it (including uprobes). 100 * Allocate new trace_uprobe and initialize it (including uprobes).
82 */ 101 */
83static struct trace_uprobe * 102static struct trace_uprobe *
84alloc_trace_uprobe(const char *group, const char *event, int nargs) 103alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
85{ 104{
86 struct trace_uprobe *tu; 105 struct trace_uprobe *tu;
87 106
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
106 125
107 INIT_LIST_HEAD(&tu->list); 126 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher; 127 tu->consumer.handler = uprobe_dispatcher;
128 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter); 130 init_trace_uprobe_filter(&tu->filter);
110 return tu; 131 return tu;
111 132
@@ -180,7 +201,7 @@ end:
180 201
181/* 202/*
182 * Argument syntax: 203 * Argument syntax:
183 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] 204 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
184 * 205 *
185 * - Remove uprobe: -:[GRP/]EVENT 206 * - Remove uprobe: -:[GRP/]EVENT
186 */ 207 */
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
192 char buf[MAX_EVENT_NAME_LEN]; 213 char buf[MAX_EVENT_NAME_LEN];
193 struct path path; 214 struct path path;
194 unsigned long offset; 215 unsigned long offset;
195 bool is_delete; 216 bool is_delete, is_return;
196 int i, ret; 217 int i, ret;
197 218
198 inode = NULL; 219 inode = NULL;
199 ret = 0; 220 ret = 0;
200 is_delete = false; 221 is_delete = false;
222 is_return = false;
201 event = NULL; 223 event = NULL;
202 group = NULL; 224 group = NULL;
203 225
204 /* argc must be >= 1 */ 226 /* argc must be >= 1 */
205 if (argv[0][0] == '-') 227 if (argv[0][0] == '-')
206 is_delete = true; 228 is_delete = true;
229 else if (argv[0][0] == 'r')
230 is_return = true;
207 else if (argv[0][0] != 'p') { 231 else if (argv[0][0] != 'p') {
208 pr_info("Probe definition must be started with 'p' or '-'.\n"); 232 pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
209 return -EINVAL; 233 return -EINVAL;
210 } 234 }
211 235
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
303 kfree(tail); 327 kfree(tail);
304 } 328 }
305 329
306 tu = alloc_trace_uprobe(group, event, argc); 330 tu = alloc_trace_uprobe(group, event, argc, is_return);
307 if (IS_ERR(tu)) { 331 if (IS_ERR(tu)) {
308 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); 332 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
309 ret = PTR_ERR(tu); 333 ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
414static int probes_seq_show(struct seq_file *m, void *v) 438static int probes_seq_show(struct seq_file *m, void *v)
415{ 439{
416 struct trace_uprobe *tu = v; 440 struct trace_uprobe *tu = v;
441 char c = is_ret_probe(tu) ? 'r' : 'p';
417 int i; 442 int i;
418 443
419 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); 444 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
420 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 445 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
421 446
422 for (i = 0; i < tu->nr_args; i++) 447 for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
485 .release = seq_release, 510 .release = seq_release,
486}; 511};
487 512
488/* uprobe handler */ 513static void uprobe_trace_print(struct trace_uprobe *tu,
489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 514 unsigned long func, struct pt_regs *regs)
490{ 515{
491 struct uprobe_trace_entry_head *entry; 516 struct uprobe_trace_entry_head *entry;
492 struct ring_buffer_event *event; 517 struct ring_buffer_event *event;
493 struct ring_buffer *buffer; 518 struct ring_buffer *buffer;
494 u8 *data; 519 void *data;
495 int size, i, pc; 520 int size, i;
496 unsigned long irq_flags;
497 struct ftrace_event_call *call = &tu->call; 521 struct ftrace_event_call *call = &tu->call;
498 522
499 local_save_flags(irq_flags); 523 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
500 pc = preempt_count();
501
502 size = sizeof(*entry) + tu->size;
503
504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 524 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
505 size, irq_flags, pc); 525 size + tu->size, 0, 0);
506 if (!event) 526 if (!event)
507 return 0; 527 return;
508 528
509 entry = ring_buffer_event_data(event); 529 entry = ring_buffer_event_data(event);
510 entry->ip = instruction_pointer(task_pt_regs(current)); 530 if (is_ret_probe(tu)) {
511 data = (u8 *)&entry[1]; 531 entry->vaddr[0] = func;
532 entry->vaddr[1] = instruction_pointer(regs);
533 data = DATAOF_TRACE_ENTRY(entry, true);
534 } else {
535 entry->vaddr[0] = instruction_pointer(regs);
536 data = DATAOF_TRACE_ENTRY(entry, false);
537 }
538
512 for (i = 0; i < tu->nr_args; i++) 539 for (i = 0; i < tu->nr_args; i++)
513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 540 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
514 541
515 if (!filter_current_check_discard(buffer, call, entry, event)) 542 if (!filter_current_check_discard(buffer, call, entry, event))
516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 543 trace_buffer_unlock_commit(buffer, event, 0, 0);
544}
517 545
546/* uprobe handler */
547static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
548{
549 if (!is_ret_probe(tu))
550 uprobe_trace_print(tu, 0, regs);
518 return 0; 551 return 0;
519} 552}
520 553
554static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
555 struct pt_regs *regs)
556{
557 uprobe_trace_print(tu, func, regs);
558}
559
521/* Event entry printers */ 560/* Event entry printers */
522static enum print_line_t 561static enum print_line_t
523print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) 562print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
524{ 563{
525 struct uprobe_trace_entry_head *field; 564 struct uprobe_trace_entry_head *entry;
526 struct trace_seq *s = &iter->seq; 565 struct trace_seq *s = &iter->seq;
527 struct trace_uprobe *tu; 566 struct trace_uprobe *tu;
528 u8 *data; 567 u8 *data;
529 int i; 568 int i;
530 569
531 field = (struct uprobe_trace_entry_head *)iter->ent; 570 entry = (struct uprobe_trace_entry_head *)iter->ent;
532 tu = container_of(event, struct trace_uprobe, call.event); 571 tu = container_of(event, struct trace_uprobe, call.event);
533 572
534 if (!trace_seq_printf(s, "%s: (", tu->call.name)) 573 if (is_ret_probe(tu)) {
535 goto partial; 574 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
536 575 entry->vaddr[1], entry->vaddr[0]))
537 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 576 goto partial;
538 goto partial; 577 data = DATAOF_TRACE_ENTRY(entry, true);
539 578 } else {
540 if (!trace_seq_puts(s, ")")) 579 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
541 goto partial; 580 entry->vaddr[0]))
581 goto partial;
582 data = DATAOF_TRACE_ENTRY(entry, false);
583 }
542 584
543 data = (u8 *)&field[1];
544 for (i = 0; i < tu->nr_args; i++) { 585 for (i = 0; i < tu->nr_args; i++) {
545 if (!tu->args[i].type->print(s, tu->args[i].name, 586 if (!tu->args[i].type->print(s, tu->args[i].name,
546 data + tu->args[i].offset, field)) 587 data + tu->args[i].offset, entry))
547 goto partial; 588 goto partial;
548 } 589 }
549 590
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
595 636
596static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 637static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
597{ 638{
598 int ret, i; 639 int ret, i, size;
599 struct uprobe_trace_entry_head field; 640 struct uprobe_trace_entry_head field;
600 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; 641 struct trace_uprobe *tu = event_call->data;
601 642
602 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 643 if (is_ret_probe(tu)) {
644 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
645 DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
646 size = SIZEOF_TRACE_ENTRY(true);
647 } else {
648 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
649 size = SIZEOF_TRACE_ENTRY(false);
650 }
603 /* Set argument names as fields */ 651 /* Set argument names as fields */
604 for (i = 0; i < tu->nr_args; i++) { 652 for (i = 0; i < tu->nr_args; i++) {
605 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 653 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
606 tu->args[i].name, 654 tu->args[i].name,
607 sizeof(field) + tu->args[i].offset, 655 size + tu->args[i].offset,
608 tu->args[i].type->size, 656 tu->args[i].type->size,
609 tu->args[i].type->is_signed, 657 tu->args[i].type->is_signed,
610 FILTER_OTHER); 658 FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
622 int i; 670 int i;
623 int pos = 0; 671 int pos = 0;
624 672
625 fmt = "(%lx)"; 673 if (is_ret_probe(tu)) {
626 arg = "REC->" FIELD_STRING_IP; 674 fmt = "(%lx <- %lx)";
675 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
676 } else {
677 fmt = "(%lx)";
678 arg = "REC->" FIELD_STRING_IP;
679 }
627 680
628 /* When len=0, we just calculate the needed length */ 681 /* When len=0, we just calculate the needed length */
629 682
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
752 return ret; 805 return ret;
753} 806}
754 807
755/* uprobe profile handler */ 808static void uprobe_perf_print(struct trace_uprobe *tu,
756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 809 unsigned long func, struct pt_regs *regs)
757{ 810{
758 struct ftrace_event_call *call = &tu->call; 811 struct ftrace_event_call *call = &tu->call;
759 struct uprobe_trace_entry_head *entry; 812 struct uprobe_trace_entry_head *entry;
760 struct hlist_head *head; 813 struct hlist_head *head;
761 u8 *data; 814 void *data;
762 int size, __size, i; 815 int size, rctx, i;
763 int rctx;
764 816
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
766 return UPROBE_HANDLER_REMOVE; 818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
767
768 __size = sizeof(*entry) + tu->size;
769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
770 size -= sizeof(u32);
771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
772 return 0; 820 return;
773 821
774 preempt_disable(); 822 preempt_disable();
823 head = this_cpu_ptr(call->perf_events);
824 if (hlist_empty(head))
825 goto out;
775 826
776 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 827 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
777 if (!entry) 828 if (!entry)
778 goto out; 829 goto out;
779 830
780 entry->ip = instruction_pointer(task_pt_regs(current)); 831 if (is_ret_probe(tu)) {
781 data = (u8 *)&entry[1]; 832 entry->vaddr[0] = func;
833 entry->vaddr[1] = instruction_pointer(regs);
834 data = DATAOF_TRACE_ENTRY(entry, true);
835 } else {
836 entry->vaddr[0] = instruction_pointer(regs);
837 data = DATAOF_TRACE_ENTRY(entry, false);
838 }
839
782 for (i = 0; i < tu->nr_args; i++) 840 for (i = 0; i < tu->nr_args; i++)
783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 841 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
784 842
785 head = this_cpu_ptr(call->perf_events); 843 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
786 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
787
788 out: 844 out:
789 preempt_enable(); 845 preempt_enable();
846}
847
848/* uprobe profile handler */
849static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
850{
851 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
852 return UPROBE_HANDLER_REMOVE;
853
854 if (!is_ret_probe(tu))
855 uprobe_perf_print(tu, 0, regs);
790 return 0; 856 return 0;
791} 857}
858
859static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
860 struct pt_regs *regs)
861{
862 uprobe_perf_print(tu, func, regs);
863}
792#endif /* CONFIG_PERF_EVENTS */ 864#endif /* CONFIG_PERF_EVENTS */
793 865
794static 866static
795int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 867int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
796{ 868{
797 struct trace_uprobe *tu = (struct trace_uprobe *)event->data; 869 struct trace_uprobe *tu = event->data;
798 870
799 switch (type) { 871 switch (type) {
800 case TRACE_REG_REGISTER: 872 case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
843 return ret; 915 return ret;
844} 916}
845 917
918static int uretprobe_dispatcher(struct uprobe_consumer *con,
919 unsigned long func, struct pt_regs *regs)
920{
921 struct trace_uprobe *tu;
922
923 tu = container_of(con, struct trace_uprobe, consumer);
924
925 if (tu->flags & TP_FLAG_TRACE)
926 uretprobe_trace_func(tu, func, regs);
927
928#ifdef CONFIG_PERF_EVENTS
929 if (tu->flags & TP_FLAG_PROFILE)
930 uretprobe_perf_func(tu, func, regs);
931#endif
932 return 0;
933}
934
846static struct trace_event_functions uprobe_funcs = { 935static struct trace_event_functions uprobe_funcs = {
847 .trace = print_uprobe_event 936 .trace = print_uprobe_event
848}; 937};
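
[editor's note] The trace_uprobe.c hunks above switch the perf path from a fixed entry->ip field to a variable-length header: a ret-probe records two addresses (function entry and return site, printed as "(%lx <- %lx)"), a plain probe records one, and the fetched arguments start right after. The real struct uprobe_trace_entry_head and the SIZEOF_TRACE_ENTRY()/DATAOF_TRACE_ENTRY() helpers are not shown in this diff; the following is a stand-alone user-space sketch of that layout with made-up names, not the kernel's definitions.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ENTRY_SIZE(is_ret)      (sizeof(unsigned long) * ((is_ret) ? 2 : 1))
#define ENTRY_DATA(buf, is_ret) ((void *)((char *)(buf) + ENTRY_SIZE(is_ret)))

static unsigned long *build_record(int is_ret, unsigned long func,
				   unsigned long ip, unsigned long arg,
				   size_t *out_size)
{
	size_t size = ENTRY_SIZE(is_ret) + sizeof(arg);
	unsigned long *vaddr = malloc(size);

	if (!vaddr)
		return NULL;
	if (is_ret) {			/* printed as "(%lx <- %lx)" */
		vaddr[0] = func;
		vaddr[1] = ip;
	} else {			/* printed as "(%lx)" */
		vaddr[0] = ip;
	}
	memcpy(ENTRY_DATA(vaddr, is_ret), &arg, sizeof(arg));
	*out_size = size;
	return vaddr;
}

int main(void)
{
	size_t size;
	unsigned long *rec = build_record(1, 0x401000, 0x400f80, 42, &size);

	if (!rec)
		return 1;
	printf("(%lx <- %lx) arg=%lu, %zu bytes\n", rec[0], rec[1],
	       *(unsigned long *)ENTRY_DATA(rec, 1), size);
	free(rec);
	return 0;
}

The same shape explains why __set_print_fmt() and trace_define_field() now offset argument fields by a computed header size instead of sizeof(field).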
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a4592047..29f26540e9c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
112 int nr_probes = 0; 112 int nr_probes = 0;
113 struct tracepoint_func *old, *new; 113 struct tracepoint_func *old, *new;
114 114
115 WARN_ON(!probe); 115 if (WARN_ON(!probe))
116 return ERR_PTR(-EINVAL);
116 117
117 debug_print_probes(entry); 118 debug_print_probes(entry);
118 old = entry->funcs; 119 old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
152 153
153 debug_print_probes(entry); 154 debug_print_probes(entry);
154 /* (N -> M), (N > 1, M >= 0) probes */ 155 /* (N -> M), (N > 1, M >= 0) probes */
155 for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 156 if (probe) {
156 if (!probe || 157 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
157 (old[nr_probes].func == probe && 158 if (old[nr_probes].func == probe &&
158 old[nr_probes].data == data)) 159 old[nr_probes].data == data)
159 nr_del++; 160 nr_del++;
161 }
160 } 162 }
161 163
164 /*
165 * If probe is NULL, then nr_probes = nr_del = 0, and then the
166 * entire entry will be removed.
167 */
162 if (nr_probes - nr_del == 0) { 168 if (nr_probes - nr_del == 0) {
163 /* N -> 0, (N > 1) */ 169 /* N -> 0, (N > 1) */
164 entry->funcs = NULL; 170 entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
173 if (new == NULL) 179 if (new == NULL)
174 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
175 for (i = 0; old[i].func; i++) 181 for (i = 0; old[i].func; i++)
176 if (probe && 182 if (old[i].func != probe || old[i].data != data)
177 (old[i].func != probe || old[i].data != data))
178 new[j++] = old[i]; 183 new[j++] = old[i];
179 new[nr_probes - nr_del].func = NULL; 184 new[nr_probes - nr_del].func = NULL;
180 entry->refcount = nr_probes - nr_del; 185 entry->refcount = nr_probes - nr_del;
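
[editor's note] The tracepoint.c rework above makes the NULL-probe case explicit: with a specific probe/data pair the loop counts matching slots to delete, while probe == NULL leaves nr_probes = nr_del = 0 so the entire entry is removed. A small user-space model of that selection logic (struct and names are illustrative, not the kernel's tracepoint_func):

#include <stdio.h>
#include <stddef.h>

struct func_entry {			/* models struct tracepoint_func */
	void (*func)(void);		/* NULL func terminates the array */
	void *data;
};

/* How many slots survive removing (probe, data)?  probe == NULL models the
 * "remove everything" case: nr_probes = nr_del = 0, so 0 slots remain and
 * the whole entry would be freed. */
static int count_remaining(const struct func_entry *old,
			   void (*probe)(void), void *data)
{
	int nr_probes = 0, nr_del = 0;

	if (probe) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
			if (old[nr_probes].func == probe &&
			    old[nr_probes].data == data)
				nr_del++;
	}
	return nr_probes - nr_del;
}

static void probe_a(void) { }
static void probe_b(void) { }

int main(void)
{
	struct func_entry tbl[] = {
		{ probe_a, NULL }, { probe_b, NULL }, { NULL, NULL },
	};

	printf("remove probe_a: %d probe(s) left\n",
	       count_remaining(tbl, probe_a, NULL));
	printf("remove NULL:    %d probe(s) left\n",
	       count_remaining(tbl, NULL, NULL));
	return 0;
}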
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb10225..f6c83d7ef000 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,67 +18,43 @@
18 18
19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
20{ 20{
21 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 21 return sys_chown(filename, low2highuid(user), low2highgid(group));
22 /* avoid REGPARM breakage on x86: */
23 asmlinkage_protect(3, ret, filename, user, group);
24 return ret;
25} 22}
26 23
27SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 24SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
28{ 25{
29 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 26 return sys_lchown(filename, low2highuid(user), low2highgid(group));
30 /* avoid REGPARM breakage on x86: */
31 asmlinkage_protect(3, ret, filename, user, group);
32 return ret;
33} 27}
34 28
35SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) 29SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
36{ 30{
37 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 31 return sys_fchown(fd, low2highuid(user), low2highgid(group));
38 /* avoid REGPARM breakage on x86: */
39 asmlinkage_protect(3, ret, fd, user, group);
40 return ret;
41} 32}
42 33
43SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) 34SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
44{ 35{
45 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 36 return sys_setregid(low2highgid(rgid), low2highgid(egid));
46 /* avoid REGPARM breakage on x86: */
47 asmlinkage_protect(2, ret, rgid, egid);
48 return ret;
49} 37}
50 38
51SYSCALL_DEFINE1(setgid16, old_gid_t, gid) 39SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
52{ 40{
53 long ret = sys_setgid(low2highgid(gid)); 41 return sys_setgid(low2highgid(gid));
54 /* avoid REGPARM breakage on x86: */
55 asmlinkage_protect(1, ret, gid);
56 return ret;
57} 42}
58 43
59SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) 44SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
60{ 45{
61 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 46 return sys_setreuid(low2highuid(ruid), low2highuid(euid));
62 /* avoid REGPARM breakage on x86: */
63 asmlinkage_protect(2, ret, ruid, euid);
64 return ret;
65} 47}
66 48
67SYSCALL_DEFINE1(setuid16, old_uid_t, uid) 49SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
68{ 50{
69 long ret = sys_setuid(low2highuid(uid)); 51 return sys_setuid(low2highuid(uid));
70 /* avoid REGPARM breakage on x86: */
71 asmlinkage_protect(1, ret, uid);
72 return ret;
73} 52}
74 53
75SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) 54SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
76{ 55{
77 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 56 return sys_setresuid(low2highuid(ruid), low2highuid(euid),
78 low2highuid(suid)); 57 low2highuid(suid));
79 /* avoid REGPARM breakage on x86: */
80 asmlinkage_protect(3, ret, ruid, euid, suid);
81 return ret;
82} 58}
83 59
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) 60SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
@@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
100 76
101SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) 77SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
102{ 78{
103 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 79 return sys_setresgid(low2highgid(rgid), low2highgid(egid),
104 low2highgid(sgid)); 80 low2highgid(sgid));
105 /* avoid REGPARM breakage on x86: */
106 asmlinkage_protect(3, ret, rgid, egid, sgid);
107 return ret;
108} 81}
109 82
110 83
@@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
127 100
128SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) 101SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
129{ 102{
130 long ret = sys_setfsuid(low2highuid(uid)); 103 return sys_setfsuid(low2highuid(uid));
131 /* avoid REGPARM breakage on x86: */
132 asmlinkage_protect(1, ret, uid);
133 return ret;
134} 104}
135 105
136SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) 106SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
137{ 107{
138 long ret = sys_setfsgid(low2highgid(gid)); 108 return sys_setfsgid(low2highgid(gid));
139 /* avoid REGPARM breakage on x86: */
140 asmlinkage_protect(1, ret, gid);
141 return ret;
142} 109}
143 110
144static int groups16_to_user(old_gid_t __user *grouplist, 111static int groups16_to_user(old_gid_t __user *grouplist,
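
[editor's note] The only behavioural subtlety these uid16 wrappers keep is the 16-bit to 32-bit ID widening; the dropped asmlinkage_protect() calls were a register-parameter workaround with no effect on the result. The kernel's low2highuid()/low2highgid() helpers live in a header not shown here; assuming they map the legacy -1 sentinel (0xffff) to the full-width -1 and pass everything else through, a user-space sketch of that mapping looks like:

#include <stdio.h>
#include <stdint.h>

typedef uint16_t old_uid_t;

/* Widen a legacy 16-bit id, preserving the "unchanged" sentinel -1
 * (0xffff as old_uid_t becomes 0xffffffff, i.e. -1, in the wide type). */
static uint32_t low2high(old_uid_t id)
{
	return id == (old_uid_t)-1 ? (uint32_t)-1 : id;
}

int main(void)
{
	printf("1000   -> %u\n", low2high(1000));
	printf("0xffff -> %#x\n", low2high((old_uid_t)0xffff));
	return 0;
}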
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4a944676358e..05039e348f07 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,
517 return ret; 517 return ret;
518 518
519 set_sample_period(); 519 set_sample_period();
520 /*
521 * Watchdog threads shouldn't be enabled if they are
522 * disabled. The 'watchdog_disabled' variable check in
523 * watchdog_*_all_cpus() function takes care of this.
524 */
520 if (watchdog_enabled && watchdog_thresh) 525 if (watchdog_enabled && watchdog_thresh)
521 watchdog_enable_all_cpus(); 526 watchdog_enable_all_cpus();
522 else 527 else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b48cd597145d..4aa9f5bc6b2d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,7 +41,12 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/jhash.h>
44#include <linux/hashtable.h> 45#include <linux/hashtable.h>
46#include <linux/rculist.h>
47#include <linux/nodemask.h>
48#include <linux/moduleparam.h>
49#include <linux/uaccess.h>
45 50
46#include "workqueue_internal.h" 51#include "workqueue_internal.h"
47 52
@@ -58,12 +63,11 @@ enum {
58 * %WORKER_UNBOUND set and concurrency management disabled, and may 63 * %WORKER_UNBOUND set and concurrency management disabled, and may
59 * be executing on any CPU. The pool behaves as an unbound one. 64 * be executing on any CPU. The pool behaves as an unbound one.
60 * 65 *
61 * Note that DISASSOCIATED can be flipped only while holding 66 * Note that DISASSOCIATED should be flipped only while holding
62 * assoc_mutex to avoid changing binding state while 67 * manager_mutex to avoid changing binding state while
63 * create_worker() is in progress. 68 * create_worker() is in progress.
64 */ 69 */
65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 70 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */ 72 POOL_FREEZING = 1 << 3, /* freeze in progress */
69 73
@@ -74,12 +78,14 @@ enum {
74 WORKER_PREP = 1 << 3, /* preparing to run works */ 78 WORKER_PREP = 1 << 3, /* preparing to run works */
75 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 79 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
76 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 80 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
81 WORKER_REBOUND = 1 << 8, /* worker was rebound */
77 82
78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 83 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
79 WORKER_CPU_INTENSIVE, 84 WORKER_UNBOUND | WORKER_REBOUND,
80 85
81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ 86 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
82 87
88 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 89 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
84 90
85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 91 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
@@ -97,6 +103,8 @@ enum {
97 */ 103 */
98 RESCUER_NICE_LEVEL = -20, 104 RESCUER_NICE_LEVEL = -20,
99 HIGHPRI_NICE_LEVEL = -20, 105 HIGHPRI_NICE_LEVEL = -20,
106
107 WQ_NAME_LEN = 24,
100}; 108};
101 109
102/* 110/*
@@ -115,16 +123,26 @@ enum {
115 * cpu or grabbing pool->lock is enough for read access. If 123 * cpu or grabbing pool->lock is enough for read access. If
116 * POOL_DISASSOCIATED is set, it's identical to L. 124 * POOL_DISASSOCIATED is set, it's identical to L.
117 * 125 *
118 * F: wq->flush_mutex protected. 126 * MG: pool->manager_mutex and pool->lock protected. Writes require both
127 * locks. Reads can happen under either lock.
128 *
129 * PL: wq_pool_mutex protected.
130 *
131 * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
132 *
133 * WQ: wq->mutex protected.
119 * 134 *
120 * W: workqueue_lock protected. 135 * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
136 *
137 * MD: wq_mayday_lock protected.
121 */ 138 */
122 139
123/* struct worker is defined in workqueue_internal.h */ 140/* struct worker is defined in workqueue_internal.h */
124 141
125struct worker_pool { 142struct worker_pool {
126 spinlock_t lock; /* the pool lock */ 143 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */ 144 int cpu; /* I: the associated cpu */
145 int node; /* I: the associated node ID */
128 int id; /* I: pool ID */ 146 int id; /* I: pool ID */
129 unsigned int flags; /* X: flags */ 147 unsigned int flags; /* X: flags */
130 148
@@ -138,12 +156,18 @@ struct worker_pool {
138 struct timer_list idle_timer; /* L: worker idle timeout */ 156 struct timer_list idle_timer; /* L: worker idle timeout */
139 struct timer_list mayday_timer; /* L: SOS timer for workers */ 157 struct timer_list mayday_timer; /* L: SOS timer for workers */
140 158
141 /* workers are chained either in busy_hash or idle_list */ 159 /* a worker is either on busy_hash or idle_list, or the manager */
142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); 160 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
143 /* L: hash of busy workers */ 161 /* L: hash of busy workers */
144 162
145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ 163 /* see manage_workers() for details on the two manager mutexes */
146 struct ida worker_ida; /* L: for worker IDs */ 164 struct mutex manager_arb; /* manager arbitration */
165 struct mutex manager_mutex; /* manager exclusion */
166 struct idr worker_idr; /* MG: worker IDs and iteration */
167
168 struct workqueue_attrs *attrs; /* I: worker attributes */
169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
170 int refcnt; /* PL: refcnt for unbound pools */
147 171
148 /* 172 /*
149 * The current concurrency level. As it's likely to be accessed 173 * The current concurrency level. As it's likely to be accessed
@@ -151,6 +175,12 @@ struct worker_pool {
151 * cacheline. 175 * cacheline.
152 */ 176 */
153 atomic_t nr_running ____cacheline_aligned_in_smp; 177 atomic_t nr_running ____cacheline_aligned_in_smp;
178
179 /*
180 * Destruction of pool is sched-RCU protected to allow dereferences
181 * from get_work_pool().
182 */
183 struct rcu_head rcu;
154} ____cacheline_aligned_in_smp; 184} ____cacheline_aligned_in_smp;
155 185
156/* 186/*
@@ -164,75 +194,107 @@ struct pool_workqueue {
164 struct workqueue_struct *wq; /* I: the owning workqueue */ 194 struct workqueue_struct *wq; /* I: the owning workqueue */
165 int work_color; /* L: current color */ 195 int work_color; /* L: current color */
166 int flush_color; /* L: flushing color */ 196 int flush_color; /* L: flushing color */
197 int refcnt; /* L: reference count */
167 int nr_in_flight[WORK_NR_COLORS]; 198 int nr_in_flight[WORK_NR_COLORS];
168 /* L: nr of in_flight works */ 199 /* L: nr of in_flight works */
169 int nr_active; /* L: nr of active works */ 200 int nr_active; /* L: nr of active works */
170 int max_active; /* L: max active works */ 201 int max_active; /* L: max active works */
171 struct list_head delayed_works; /* L: delayed works */ 202 struct list_head delayed_works; /* L: delayed works */
172}; 203 struct list_head pwqs_node; /* WR: node on wq->pwqs */
204 struct list_head mayday_node; /* MD: node on wq->maydays */
205
206 /*
207 * Release of unbound pwq is punted to system_wq. See put_pwq()
208 * and pwq_unbound_release_workfn() for details. pool_workqueue
209 * itself is also sched-RCU protected so that the first pwq can be
210 * determined without grabbing wq->mutex.
211 */
212 struct work_struct unbound_release_work;
213 struct rcu_head rcu;
214} __aligned(1 << WORK_STRUCT_FLAG_BITS);
173 215
174/* 216/*
175 * Structure used to wait for workqueue flush. 217 * Structure used to wait for workqueue flush.
176 */ 218 */
177struct wq_flusher { 219struct wq_flusher {
178 struct list_head list; /* F: list of flushers */ 220 struct list_head list; /* WQ: list of flushers */
179 int flush_color; /* F: flush color waiting for */ 221 int flush_color; /* WQ: flush color waiting for */
180 struct completion done; /* flush completion */ 222 struct completion done; /* flush completion */
181}; 223};
182 224
183/* 225struct wq_device;
184 * All cpumasks are assumed to be always set on UP and thus can't be
185 * used to determine whether there's something to be done.
186 */
187#ifdef CONFIG_SMP
188typedef cpumask_var_t mayday_mask_t;
189#define mayday_test_and_set_cpu(cpu, mask) \
190 cpumask_test_and_set_cpu((cpu), (mask))
191#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
192#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
193#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
194#define free_mayday_mask(mask) free_cpumask_var((mask))
195#else
196typedef unsigned long mayday_mask_t;
197#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
198#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
199#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
200#define alloc_mayday_mask(maskp, gfp) true
201#define free_mayday_mask(mask) do { } while (0)
202#endif
203 226
204/* 227/*
205 * The externally visible workqueue abstraction is an array of 228 * The externally visible workqueue. It relays the issued work items to
206 * per-CPU workqueues: 229 * the appropriate worker_pool through its pool_workqueues.
207 */ 230 */
208struct workqueue_struct { 231struct workqueue_struct {
209 unsigned int flags; /* W: WQ_* flags */ 232 struct list_head pwqs; /* WR: all pwqs of this wq */
210 union { 233 struct list_head list; /* PL: list of all workqueues */
211 struct pool_workqueue __percpu *pcpu; 234
212 struct pool_workqueue *single; 235 struct mutex mutex; /* protects this wq */
213 unsigned long v; 236 int work_color; /* WQ: current work color */
214 } pool_wq; /* I: pwq's */ 237 int flush_color; /* WQ: current flush color */
215 struct list_head list; /* W: list of all workqueues */
216
217 struct mutex flush_mutex; /* protects wq flushing */
218 int work_color; /* F: current work color */
219 int flush_color; /* F: current flush color */
220 atomic_t nr_pwqs_to_flush; /* flush in progress */ 238 atomic_t nr_pwqs_to_flush; /* flush in progress */
221 struct wq_flusher *first_flusher; /* F: first flusher */ 239 struct wq_flusher *first_flusher; /* WQ: first flusher */
222 struct list_head flusher_queue; /* F: flush waiters */ 240 struct list_head flusher_queue; /* WQ: flush waiters */
223 struct list_head flusher_overflow; /* F: flush overflow list */ 241 struct list_head flusher_overflow; /* WQ: flush overflow list */
224 242
225 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 243 struct list_head maydays; /* MD: pwqs requesting rescue */
226 struct worker *rescuer; /* I: rescue worker */ 244 struct worker *rescuer; /* I: rescue worker */
227 245
228 int nr_drainers; /* W: drain in progress */ 246 int nr_drainers; /* WQ: drain in progress */
229 int saved_max_active; /* W: saved pwq max_active */ 247 int saved_max_active; /* WQ: saved pwq max_active */
248
249 struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
250 struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
251
252#ifdef CONFIG_SYSFS
253 struct wq_device *wq_dev; /* I: for sysfs interface */
254#endif
230#ifdef CONFIG_LOCKDEP 255#ifdef CONFIG_LOCKDEP
231 struct lockdep_map lockdep_map; 256 struct lockdep_map lockdep_map;
232#endif 257#endif
233 char name[]; /* I: workqueue name */ 258 char name[WQ_NAME_LEN]; /* I: workqueue name */
259
260 /* hot fields used during command issue, aligned to cacheline */
261 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
262 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
263 struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
234}; 264};
235 265
266static struct kmem_cache *pwq_cache;
267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */
271
272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
278static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
279
280static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
281static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
282
283static LIST_HEAD(workqueues); /* PL: list of all workqueues */
284static bool workqueue_freezing; /* PL: have wqs started freezing? */
285
286/* the per-cpu worker pools */
287static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
288 cpu_worker_pools);
289
290static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
291
292/* PL: hash of all unbound pools keyed by pool->attrs */
293static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
294
295/* I: attributes used when instantiating standard unbound pools on demand */
296static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
297
236struct workqueue_struct *system_wq __read_mostly; 298struct workqueue_struct *system_wq __read_mostly;
237EXPORT_SYMBOL_GPL(system_wq); 299EXPORT_SYMBOL_GPL(system_wq);
238struct workqueue_struct *system_highpri_wq __read_mostly; 300struct workqueue_struct *system_highpri_wq __read_mostly;
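
[editor's note] pool_workqueue gains __aligned(1 << WORK_STRUCT_FLAG_BITS) because a pwq pointer can be stored directly in work->data with the low flag bits reused for state (see the WORK_STRUCT_PWQ / WORK_STRUCT_WQ_DATA_MASK handling in get_work_pool() further down). A stand-alone sketch of that pack/unpack trick, with made-up bit values rather than the kernel's real constants:

#include <stdio.h>
#include <stdint.h>

#define FLAG_BITS	4		/* hypothetical; not WORK_STRUCT_FLAG_BITS */
#define FLAG_MASK	((1UL << FLAG_BITS) - 1)
#define FLAG_PWQ	(1UL << 0)	/* "data holds a pwq pointer" */

struct pwq_model {
	int dummy;
} __attribute__((aligned(1 << FLAG_BITS)));	/* low bits guaranteed zero */

static uintptr_t pack(struct pwq_model *pwq, unsigned long flags)
{
	return (uintptr_t)pwq | (flags & FLAG_MASK);
}

static struct pwq_model *unpack(uintptr_t data)
{
	return (data & FLAG_PWQ) ?
		(struct pwq_model *)(data & ~(uintptr_t)FLAG_MASK) : NULL;
}

int main(void)
{
	static struct pwq_model pwq;	/* static object honors the alignment */
	uintptr_t data = pack(&pwq, FLAG_PWQ);

	printf("packed %p -> %#lx, unpacked %p\n",
	       (void *)&pwq, (unsigned long)data, (void *)unpack(data));
	return 0;
}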
@@ -244,64 +306,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq);
244struct workqueue_struct *system_freezable_wq __read_mostly; 306struct workqueue_struct *system_freezable_wq __read_mostly;
245EXPORT_SYMBOL_GPL(system_freezable_wq); 307EXPORT_SYMBOL_GPL(system_freezable_wq);
246 308
309static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to,
311 const struct workqueue_attrs *from);
312
247#define CREATE_TRACE_POINTS 313#define CREATE_TRACE_POINTS
248#include <trace/events/workqueue.h> 314#include <trace/events/workqueue.h>
249 315
250#define for_each_std_worker_pool(pool, cpu) \ 316#define assert_rcu_or_pool_mutex() \
251 for ((pool) = &std_worker_pools(cpu)[0]; \ 317 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) 318 lockdep_is_held(&wq_pool_mutex), \
319 "sched RCU or wq_pool_mutex should be held")
253 320
254#define for_each_busy_worker(worker, i, pool) \ 321#define assert_rcu_or_wq_mutex(wq) \
255 hash_for_each(pool->busy_hash, i, worker, hentry) 322 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
323 lockdep_is_held(&wq->mutex), \
324 "sched RCU or wq->mutex should be held")
256 325
257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 326#ifdef CONFIG_LOCKDEP
258 unsigned int sw) 327#define assert_manager_or_pool_lock(pool) \
259{ 328 WARN_ONCE(debug_locks && \
260 if (cpu < nr_cpu_ids) { 329 !lockdep_is_held(&(pool)->manager_mutex) && \
261 if (sw & 1) { 330 !lockdep_is_held(&(pool)->lock), \
262 cpu = cpumask_next(cpu, mask); 331 "pool->manager_mutex or ->lock should be held")
263 if (cpu < nr_cpu_ids) 332#else
264 return cpu; 333#define assert_manager_or_pool_lock(pool) do { } while (0)
265 } 334#endif
266 if (sw & 2)
267 return WORK_CPU_UNBOUND;
268 }
269 return WORK_CPU_END;
270}
271 335
272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, 336#define for_each_cpu_worker_pool(pool, cpu) \
273 struct workqueue_struct *wq) 337 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
274{ 338 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 339 (pool)++)
276}
277 340
278/* 341/**
279 * CPU iterators 342 * for_each_pool - iterate through all worker_pools in the system
343 * @pool: iteration cursor
344 * @pi: integer used for iteration
280 * 345 *
281 * An extra cpu number is defined using an invalid cpu number 346 * This must be called either with wq_pool_mutex held or sched RCU read
282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 347 * locked. If the pool needs to be used beyond the locking in effect, the
283 * specific CPU. The following iterators are similar to for_each_*_cpu() 348 * caller is responsible for guaranteeing that the pool stays online.
284 * iterators but also considers the unbound CPU.
285 * 349 *
286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND 350 * The if/else clause exists only for the lockdep assertion and can be
287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND 351 * ignored.
288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
289 * WORK_CPU_UNBOUND for unbound workqueues
290 */ 352 */
291#define for_each_wq_cpu(cpu) \ 353#define for_each_pool(pool, pi) \
292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ 354 idr_for_each_entry(&worker_pool_idr, pool, pi) \
293 (cpu) < WORK_CPU_END; \ 355 if (({ assert_rcu_or_pool_mutex(); false; })) { } \
294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) 356 else
295 357
296#define for_each_online_wq_cpu(cpu) \ 358/**
297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ 359 * for_each_pool_worker - iterate through all workers of a worker_pool
298 (cpu) < WORK_CPU_END; \ 360 * @worker: iteration cursor
299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) 361 * @wi: integer used for iteration
362 * @pool: worker_pool to iterate workers of
363 *
364 * This must be called with either @pool->manager_mutex or ->lock held.
365 *
366 * The if/else clause exists only for the lockdep assertion and can be
367 * ignored.
368 */
369#define for_each_pool_worker(worker, wi, pool) \
370 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
371 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
372 else
300 373
301#define for_each_pwq_cpu(cpu, wq) \ 374/**
302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ 375 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
303 (cpu) < WORK_CPU_END; \ 376 * @pwq: iteration cursor
304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) 377 * @wq: the target workqueue
378 *
379 * This must be called either with wq->mutex held or sched RCU read locked.
380 * If the pwq needs to be used beyond the locking in effect, the caller is
381 * responsible for guaranteeing that the pwq stays online.
382 *
383 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored.
385 */
386#define for_each_pwq(pwq, wq) \
387 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
388 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
389 else
305 390
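
[editor's note] The odd "if (({ assert...; false; })) { } else" tail shared by for_each_pool(), for_each_pool_worker() and for_each_pwq() is a way to run a locking assertion once per iteration while keeping the macro usable as an ordinary statement prefix: the statement expression always evaluates to false, so the caller's loop body binds to the else and runs unchanged. A user-space sketch of the same construction, with a trivial assertion standing in for the lockdep checks:

#include <stdio.h>
#include <assert.h>

static int lock_held = 1;	/* stands in for "wq->mutex or sched-RCU held" */

#define assert_locked()	assert(lock_held)

/* The ({ ...; 0; }) statement expression (a GCC/clang extension, used
 * heavily in the kernel) runs the assertion and always yields false, so
 * the user's loop body binds to the trailing else and runs unchanged. */
#define for_each_item(i, n)					\
	for ((i) = 0; (i) < (n); (i)++)				\
		if (({ assert_locked(); 0; })) { }		\
		else

int main(void)
{
	int i;

	for_each_item(i, 3)
		printf("item %d\n", i);
	return 0;
}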
306#ifdef CONFIG_DEBUG_OBJECTS_WORK 391#ifdef CONFIG_DEBUG_OBJECTS_WORK
307 392
@@ -419,77 +504,35 @@ static inline void debug_work_activate(struct work_struct *work) { }
419static inline void debug_work_deactivate(struct work_struct *work) { } 504static inline void debug_work_deactivate(struct work_struct *work) { }
420#endif 505#endif
421 506
422/* Serializes the accesses to the list of workqueues. */
423static DEFINE_SPINLOCK(workqueue_lock);
424static LIST_HEAD(workqueues);
425static bool workqueue_freezing; /* W: have wqs started freezing? */
426
427/*
428 * The CPU and unbound standard worker pools. The unbound ones have
429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
430 */
431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
434
435/* idr of all pools */
436static DEFINE_MUTEX(worker_pool_idr_mutex);
437static DEFINE_IDR(worker_pool_idr);
438
439static int worker_thread(void *__worker);
440
441static struct worker_pool *std_worker_pools(int cpu)
442{
443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
447}
448
449static int std_worker_pool_pri(struct worker_pool *pool)
450{
451 return pool - std_worker_pools(pool->cpu);
452}
453
454/* allocate ID and assign it to @pool */ 507/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool) 508static int worker_pool_assign_id(struct worker_pool *pool)
456{ 509{
457 int ret; 510 int ret;
458 511
459 mutex_lock(&worker_pool_idr_mutex); 512 lockdep_assert_held(&wq_pool_mutex);
513
460 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); 514 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
461 if (ret >= 0) 515 if (ret >= 0) {
462 pool->id = ret; 516 pool->id = ret;
463 mutex_unlock(&worker_pool_idr_mutex); 517 return 0;
464 518 }
465 return ret < 0 ? ret : 0; 519 return ret;
466} 520}
467 521
468/* 522/**
469 * Lookup worker_pool by id. The idr currently is built during boot and 523 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
470 * never modified. Don't worry about locking for now. 524 * @wq: the target workqueue
525 * @node: the node ID
526 *
527 * This must be called either with pwq_lock held or sched RCU read locked.
528 * If the pwq needs to be used beyond the locking in effect, the caller is
529 * responsible for guaranteeing that the pwq stays online.
471 */ 530 */
472static struct worker_pool *worker_pool_by_id(int pool_id) 531static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
532 int node)
473{ 533{
474 return idr_find(&worker_pool_idr, pool_id); 534 assert_rcu_or_wq_mutex(wq);
475} 535 return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
476
477static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
478{
479 struct worker_pool *pools = std_worker_pools(cpu);
480
481 return &pools[highpri];
482}
483
484static struct pool_workqueue *get_pwq(unsigned int cpu,
485 struct workqueue_struct *wq)
486{
487 if (!(wq->flags & WQ_UNBOUND)) {
488 if (likely(cpu < nr_cpu_ids))
489 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
490 } else if (likely(cpu == WORK_CPU_UNBOUND))
491 return wq->pool_wq.single;
492 return NULL;
493} 536}
494 537
495static unsigned int work_color_to_flags(int color) 538static unsigned int work_color_to_flags(int color)
@@ -531,7 +574,7 @@ static int work_next_color(int color)
531static inline void set_work_data(struct work_struct *work, unsigned long data, 574static inline void set_work_data(struct work_struct *work, unsigned long data,
532 unsigned long flags) 575 unsigned long flags)
533{ 576{
534 BUG_ON(!work_pending(work)); 577 WARN_ON_ONCE(!work_pending(work));
535 atomic_long_set(&work->data, data | flags | work_static(work)); 578 atomic_long_set(&work->data, data | flags | work_static(work));
536} 579}
537 580
@@ -583,13 +626,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
583 * @work: the work item of interest 626 * @work: the work item of interest
584 * 627 *
585 * Return the worker_pool @work was last associated with. %NULL if none. 628 * Return the worker_pool @work was last associated with. %NULL if none.
629 *
630 * Pools are created and destroyed under wq_pool_mutex, and allows read
631 * access under sched-RCU read lock. As such, this function should be
632 * called under wq_pool_mutex or with preemption disabled.
633 *
634 * All fields of the returned pool are accessible as long as the above
635 * mentioned locking is in effect. If the returned pool needs to be used
636 * beyond the critical section, the caller is responsible for ensuring the
637 * returned pool is and stays online.
586 */ 638 */
587static struct worker_pool *get_work_pool(struct work_struct *work) 639static struct worker_pool *get_work_pool(struct work_struct *work)
588{ 640{
589 unsigned long data = atomic_long_read(&work->data); 641 unsigned long data = atomic_long_read(&work->data);
590 struct worker_pool *pool;
591 int pool_id; 642 int pool_id;
592 643
644 assert_rcu_or_pool_mutex();
645
593 if (data & WORK_STRUCT_PWQ) 646 if (data & WORK_STRUCT_PWQ)
594 return ((struct pool_workqueue *) 647 return ((struct pool_workqueue *)
595 (data & WORK_STRUCT_WQ_DATA_MASK))->pool; 648 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
@@ -598,9 +651,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
598 if (pool_id == WORK_OFFQ_POOL_NONE) 651 if (pool_id == WORK_OFFQ_POOL_NONE)
599 return NULL; 652 return NULL;
600 653
601 pool = worker_pool_by_id(pool_id); 654 return idr_find(&worker_pool_idr, pool_id);
602 WARN_ON_ONCE(!pool);
603 return pool;
604} 655}
605 656
606/** 657/**
@@ -689,7 +740,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
689/* Do we have too many workers and should some go away? */ 740/* Do we have too many workers and should some go away? */
690static bool too_many_workers(struct worker_pool *pool) 741static bool too_many_workers(struct worker_pool *pool)
691{ 742{
692 bool managing = pool->flags & POOL_MANAGING_WORKERS; 743 bool managing = mutex_is_locked(&pool->manager_arb);
693 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 744 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
694 int nr_busy = pool->nr_workers - nr_idle; 745 int nr_busy = pool->nr_workers - nr_idle;
695 746
@@ -744,7 +795,7 @@ static void wake_up_worker(struct worker_pool *pool)
744 * CONTEXT: 795 * CONTEXT:
745 * spin_lock_irq(rq->lock) 796 * spin_lock_irq(rq->lock)
746 */ 797 */
747void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) 798void wq_worker_waking_up(struct task_struct *task, int cpu)
748{ 799{
749 struct worker *worker = kthread_data(task); 800 struct worker *worker = kthread_data(task);
750 801
@@ -769,8 +820,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
769 * RETURNS: 820 * RETURNS:
770 * Worker task on @cpu to wake up, %NULL if none. 821 * Worker task on @cpu to wake up, %NULL if none.
771 */ 822 */
772struct task_struct *wq_worker_sleeping(struct task_struct *task, 823struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
773 unsigned int cpu)
774{ 824{
775 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 825 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
776 struct worker_pool *pool; 826 struct worker_pool *pool;
@@ -786,7 +836,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
786 pool = worker->pool; 836 pool = worker->pool;
787 837
788 /* this can only happen on the local cpu */ 838 /* this can only happen on the local cpu */
789 BUG_ON(cpu != raw_smp_processor_id()); 839 if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
840 return NULL;
790 841
791 /* 842 /*
792 * The counterpart of the following dec_and_test, implied mb, 843 * The counterpart of the following dec_and_test, implied mb,
@@ -891,13 +942,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
891 * recycled work item as currently executing and make it wait until the 942 * recycled work item as currently executing and make it wait until the
892 * current execution finishes, introducing an unwanted dependency. 943 * current execution finishes, introducing an unwanted dependency.
893 * 944 *
894 * This function checks the work item address, work function and workqueue 945 * This function checks the work item address and work function to avoid
895 * to avoid false positives. Note that this isn't complete as one may 946 * false positives. Note that this isn't complete as one may construct a
896 * construct a work function which can introduce dependency onto itself 947 * work function which can introduce dependency onto itself through a
897 * through a recycled work item. Well, if somebody wants to shoot oneself 948 * recycled work item. Well, if somebody wants to shoot oneself in the
898 * in the foot that badly, there's only so much we can do, and if such 949 * foot that badly, there's only so much we can do, and if such deadlock
899 * deadlock actually occurs, it should be easy to locate the culprit work 950 * actually occurs, it should be easy to locate the culprit work function.
900 * function.
901 * 951 *
902 * CONTEXT: 952 * CONTEXT:
903 * spin_lock_irq(pool->lock). 953 * spin_lock_irq(pool->lock).
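
[editor's note] The reworded comment above describes why a busy worker is looked up by both the work item's address and its current function: a freed and reallocated work struct at the same address but with a different callback must not be treated as "still running". A small user-space model of that check (linear scan instead of the busy_hash table; names are illustrative):

#include <stdio.h>
#include <stddef.h>

typedef void (*work_fn_t)(void *);

struct busy_worker {
	const void *current_work;	/* address of the work item being run */
	work_fn_t   current_func;	/* its callback when execution started */
};

/* A work item only matches if both the address and the function agree,
 * so a recycled allocation with a new callback is not considered running. */
static struct busy_worker *find_executing(struct busy_worker *busy, int n,
					  const void *work, work_fn_t fn)
{
	for (int i = 0; i < n; i++)
		if (busy[i].current_work == work && busy[i].current_func == fn)
			return &busy[i];
	return NULL;
}

static void fn_old(void *p) { (void)p; }
static void fn_new(void *p) { (void)p; }

int main(void)
{
	char work[16];			/* stands in for a work_struct */
	struct busy_worker busy[] = { { work, fn_old } };

	printf("same addr, same fn: %s\n",
	       find_executing(busy, 1, work, fn_old) ? "match" : "no match");
	printf("same addr, new fn:  %s\n",
	       find_executing(busy, 1, work, fn_new) ? "match" : "no match");
	return 0;
}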
@@ -961,6 +1011,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
961 *nextp = n; 1011 *nextp = n;
962} 1012}
963 1013
1014/**
1015 * get_pwq - get an extra reference on the specified pool_workqueue
1016 * @pwq: pool_workqueue to get
1017 *
1018 * Obtain an extra reference on @pwq. The caller should guarantee that
1019 * @pwq has positive refcnt and be holding the matching pool->lock.
1020 */
1021static void get_pwq(struct pool_workqueue *pwq)
1022{
1023 lockdep_assert_held(&pwq->pool->lock);
1024 WARN_ON_ONCE(pwq->refcnt <= 0);
1025 pwq->refcnt++;
1026}
1027
1028/**
1029 * put_pwq - put a pool_workqueue reference
1030 * @pwq: pool_workqueue to put
1031 *
1032 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1033 * destruction. The caller should be holding the matching pool->lock.
1034 */
1035static void put_pwq(struct pool_workqueue *pwq)
1036{
1037 lockdep_assert_held(&pwq->pool->lock);
1038 if (likely(--pwq->refcnt))
1039 return;
1040 if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
1041 return;
1042 /*
1043 * @pwq can't be released under pool->lock, bounce to
1044 * pwq_unbound_release_workfn(). This never recurses on the same
1045 * pool->lock as this path is taken only for unbound workqueues and
1046 * the release work item is scheduled on a per-cpu workqueue. To
1047 * avoid lockdep warning, unbound pool->locks are given lockdep
1048 * subclass of 1 in get_unbound_pool().
1049 */
1050 schedule_work(&pwq->unbound_release_work);
1051}
1052
1053/**
1054 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1055 * @pwq: pool_workqueue to put (can be %NULL)
1056 *
1057 * put_pwq() with locking. This function also allows %NULL @pwq.
1058 */
1059static void put_pwq_unlocked(struct pool_workqueue *pwq)
1060{
1061 if (pwq) {
1062 /*
1063 * As both pwqs and pools are sched-RCU protected, the
1064 * following lock operations are safe.
1065 */
1066 spin_lock_irq(&pwq->pool->lock);
1067 put_pwq(pwq);
1068 spin_unlock_irq(&pwq->pool->lock);
1069 }
1070}
1071
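
[editor's note] get_pwq()/put_pwq() above pin a pool_workqueue while work items and internal lists reference it, and the final put punts destruction to a work item because it cannot happen under pool->lock. A user-space model of that "last put schedules deferred release" shape (single-threaded, so a plain int stands in for the lock-protected refcount and the deferral is just a function call):

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

struct pwq_model {
	int refcnt;
};

static void deferred_release(struct pwq_model *pwq)
{
	/* in the kernel this runs later, from a scheduled work item */
	printf("releasing pwq\n");
	free(pwq);
}

static void get_ref(struct pwq_model *pwq)
{
	assert(pwq->refcnt > 0);	/* caller must already hold a ref */
	pwq->refcnt++;
}

static void put_ref(struct pwq_model *pwq)
{
	if (--pwq->refcnt)
		return;
	deferred_release(pwq);		/* only the final put tears it down */
}

int main(void)
{
	struct pwq_model *pwq = malloc(sizeof(*pwq));

	if (!pwq)
		return 1;
	pwq->refcnt = 1;		/* reference held by the owning wq */
	get_ref(pwq);			/* e.g. a queued work item pins it */
	put_ref(pwq);			/* the work item completes */
	put_ref(pwq);			/* the wq drops its ref -> release */
	return 0;
}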
964static void pwq_activate_delayed_work(struct work_struct *work) 1072static void pwq_activate_delayed_work(struct work_struct *work)
965{ 1073{
966 struct pool_workqueue *pwq = get_work_pwq(work); 1074 struct pool_workqueue *pwq = get_work_pwq(work);
@@ -992,9 +1100,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
992 */ 1100 */
993static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) 1101static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
994{ 1102{
995 /* ignore uncolored works */ 1103 /* uncolored work items don't participate in flushing or nr_active */
996 if (color == WORK_NO_COLOR) 1104 if (color == WORK_NO_COLOR)
997 return; 1105 goto out_put;
998 1106
999 pwq->nr_in_flight[color]--; 1107 pwq->nr_in_flight[color]--;
1000 1108
@@ -1007,11 +1115,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1007 1115
1008 /* is flush in progress and are we at the flushing tip? */ 1116 /* is flush in progress and are we at the flushing tip? */
1009 if (likely(pwq->flush_color != color)) 1117 if (likely(pwq->flush_color != color))
1010 return; 1118 goto out_put;
1011 1119
1012 /* are there still in-flight works? */ 1120 /* are there still in-flight works? */
1013 if (pwq->nr_in_flight[color]) 1121 if (pwq->nr_in_flight[color])
1014 return; 1122 goto out_put;
1015 1123
1016 /* this pwq is done, clear flush_color */ 1124 /* this pwq is done, clear flush_color */
1017 pwq->flush_color = -1; 1125 pwq->flush_color = -1;
@@ -1022,6 +1130,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1022 */ 1130 */
1023 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) 1131 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1024 complete(&pwq->wq->first_flusher->done); 1132 complete(&pwq->wq->first_flusher->done);
1133out_put:
1134 put_pwq(pwq);
1025} 1135}
1026 1136
1027/** 1137/**
@@ -1144,11 +1254,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1144 /* we own @work, set data and link */ 1254 /* we own @work, set data and link */
1145 set_work_pwq(work, pwq, extra_flags); 1255 set_work_pwq(work, pwq, extra_flags);
1146 list_add_tail(&work->entry, head); 1256 list_add_tail(&work->entry, head);
1257 get_pwq(pwq);
1147 1258
1148 /* 1259 /*
1149 * Ensure either worker_sched_deactivated() sees the above 1260 * Ensure either wq_worker_sleeping() sees the above
1150 * list_add_tail() or we see zero nr_running to avoid workers 1261 * list_add_tail() or we see zero nr_running to avoid workers lying
1151 * lying around lazily while there are works to be processed. 1262 * around lazily while there are works to be processed.
1152 */ 1263 */
1153 smp_mb(); 1264 smp_mb();
1154 1265
@@ -1172,10 +1283,11 @@ static bool is_chained_work(struct workqueue_struct *wq)
1172 return worker && worker->current_pwq->wq == wq; 1283 return worker && worker->current_pwq->wq == wq;
1173} 1284}
1174 1285
1175static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1286static void __queue_work(int cpu, struct workqueue_struct *wq,
1176 struct work_struct *work) 1287 struct work_struct *work)
1177{ 1288{
1178 struct pool_workqueue *pwq; 1289 struct pool_workqueue *pwq;
1290 struct worker_pool *last_pool;
1179 struct list_head *worklist; 1291 struct list_head *worklist;
1180 unsigned int work_flags; 1292 unsigned int work_flags;
1181 unsigned int req_cpu = cpu; 1293 unsigned int req_cpu = cpu;
@@ -1191,48 +1303,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1191 debug_work_activate(work); 1303 debug_work_activate(work);
1192 1304
1193 /* if dying, only works from the same workqueue are allowed */ 1305 /* if dying, only works from the same workqueue are allowed */
1194 if (unlikely(wq->flags & WQ_DRAINING) && 1306 if (unlikely(wq->flags & __WQ_DRAINING) &&
1195 WARN_ON_ONCE(!is_chained_work(wq))) 1307 WARN_ON_ONCE(!is_chained_work(wq)))
1196 return; 1308 return;
1309retry:
1310 if (req_cpu == WORK_CPU_UNBOUND)
1311 cpu = raw_smp_processor_id();
1197 1312
1198 /* determine the pwq to use */ 1313 /* pwq which will be used unless @work is executing elsewhere */
1199 if (!(wq->flags & WQ_UNBOUND)) { 1314 if (!(wq->flags & WQ_UNBOUND))
1200 struct worker_pool *last_pool; 1315 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
1201 1316 else
1202 if (cpu == WORK_CPU_UNBOUND) 1317 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
1203 cpu = raw_smp_processor_id();
1204
1205 /*
1206 * It's multi cpu. If @work was previously on a different
1207 * cpu, it might still be running there, in which case the
1208 * work needs to be queued on that cpu to guarantee
1209 * non-reentrancy.
1210 */
1211 pwq = get_pwq(cpu, wq);
1212 last_pool = get_work_pool(work);
1213 1318
1214 if (last_pool && last_pool != pwq->pool) { 1319 /*
1215 struct worker *worker; 1320 * If @work was previously on a different pool, it might still be
1321 * running there, in which case the work needs to be queued on that
1322 * pool to guarantee non-reentrancy.
1323 */
1324 last_pool = get_work_pool(work);
1325 if (last_pool && last_pool != pwq->pool) {
1326 struct worker *worker;
1216 1327
1217 spin_lock(&last_pool->lock); 1328 spin_lock(&last_pool->lock);
1218 1329
1219 worker = find_worker_executing_work(last_pool, work); 1330 worker = find_worker_executing_work(last_pool, work);
1220 1331
1221 if (worker && worker->current_pwq->wq == wq) { 1332 if (worker && worker->current_pwq->wq == wq) {
1222 pwq = get_pwq(last_pool->cpu, wq); 1333 pwq = worker->current_pwq;
1223 } else {
1224 /* meh... not running there, queue here */
1225 spin_unlock(&last_pool->lock);
1226 spin_lock(&pwq->pool->lock);
1227 }
1228 } else { 1334 } else {
1335 /* meh... not running there, queue here */
1336 spin_unlock(&last_pool->lock);
1229 spin_lock(&pwq->pool->lock); 1337 spin_lock(&pwq->pool->lock);
1230 } 1338 }
1231 } else { 1339 } else {
1232 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1233 spin_lock(&pwq->pool->lock); 1340 spin_lock(&pwq->pool->lock);
1234 } 1341 }
1235 1342
1343 /*
1344 * pwq is determined and locked. For unbound pools, we could have
1345 * raced with pwq release and it could already be dead. If its
1346 * refcnt is zero, repeat pwq selection. Note that pwqs never die
1347 * without another pwq replacing it in the numa_pwq_tbl or while
1348 * work items are executing on it, so the retrying is guaranteed to
1349 * make forward-progress.
1350 */
1351 if (unlikely(!pwq->refcnt)) {
1352 if (wq->flags & WQ_UNBOUND) {
1353 spin_unlock(&pwq->pool->lock);
1354 cpu_relax();
1355 goto retry;
1356 }
1357 /* oops */
1358 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
1359 wq->name, cpu);
1360 }
1361
1236 /* pwq determined, queue */ 1362 /* pwq determined, queue */
1237 trace_workqueue_queue_work(req_cpu, pwq, work); 1363 trace_workqueue_queue_work(req_cpu, pwq, work);
1238 1364
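
[editor's note] __queue_work() now picks a pwq in three steps: a per-cpu or NUMA-node lookup, a detour to the pool where the work item is still running (to preserve non-reentrancy), and a retry when an unbound pwq raced with release and its refcount already hit zero. A compressed user-space model of just that control flow (the lookup and the "still running elsewhere" redirect are stubs; names are made up):

#include <stdio.h>
#include <stdbool.h>

struct pwq_model { int refcnt; int id; };

static struct pwq_model dying = { .refcnt = 0, .id = 1 };  /* raced with release */
static struct pwq_model fresh = { .refcnt = 1, .id = 2 };  /* its replacement */

/* First lookup hits the dying pwq; the retry sees the replacement, which
 * models "pwqs never die without another pwq replacing it". */
static struct pwq_model *lookup_pwq(void)
{
	static int calls;
	return calls++ ? &fresh : &dying;
}

static struct pwq_model *select_pwq(bool unbound)
{
	struct pwq_model *pwq;

retry:
	pwq = lookup_pwq();
	/* (the "work still running on another pool" redirect is omitted) */
	if (pwq->refcnt == 0) {
		if (unbound)
			goto retry;	/* guaranteed to make forward progress */
		fprintf(stderr, "per-cpu pwq with 0 refcnt?!\n");
	}
	return pwq;
}

int main(void)
{
	printf("queued on pwq %d\n", select_pwq(true)->id);
	return 0;
}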
@@ -1287,22 +1413,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
1287} 1413}
1288EXPORT_SYMBOL_GPL(queue_work_on); 1414EXPORT_SYMBOL_GPL(queue_work_on);
1289 1415
1290/**
1291 * queue_work - queue work on a workqueue
1292 * @wq: workqueue to use
1293 * @work: work to queue
1294 *
1295 * Returns %false if @work was already on a queue, %true otherwise.
1296 *
1297 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1298 * it can be processed by another CPU.
1299 */
1300bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1301{
1302 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1303}
1304EXPORT_SYMBOL_GPL(queue_work);
1305
1306void delayed_work_timer_fn(unsigned long __data) 1416void delayed_work_timer_fn(unsigned long __data)
1307{ 1417{
1308 struct delayed_work *dwork = (struct delayed_work *)__data; 1418 struct delayed_work *dwork = (struct delayed_work *)__data;
@@ -1378,21 +1488,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1378EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1488EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1379 1489
1380/** 1490/**
1381 * queue_delayed_work - queue work on a workqueue after delay
1382 * @wq: workqueue to use
1383 * @dwork: delayable work to queue
1384 * @delay: number of jiffies to wait before queueing
1385 *
1386 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1387 */
1388bool queue_delayed_work(struct workqueue_struct *wq,
1389 struct delayed_work *dwork, unsigned long delay)
1390{
1391 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1392}
1393EXPORT_SYMBOL_GPL(queue_delayed_work);
1394
1395/**
1396 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1491 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1397 * @cpu: CPU number to execute work on 1492 * @cpu: CPU number to execute work on
1398 * @wq: workqueue to use 1493 * @wq: workqueue to use
@@ -1431,21 +1526,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1431EXPORT_SYMBOL_GPL(mod_delayed_work_on); 1526EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1432 1527
1433/** 1528/**
1434 * mod_delayed_work - modify delay of or queue a delayed work
1435 * @wq: workqueue to use
1436 * @dwork: work to queue
1437 * @delay: number of jiffies to wait before queueing
1438 *
1439 * mod_delayed_work_on() on local CPU.
1440 */
1441bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1442 unsigned long delay)
1443{
1444 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1445}
1446EXPORT_SYMBOL_GPL(mod_delayed_work);
1447
1448/**
1449 * worker_enter_idle - enter idle state 1529 * worker_enter_idle - enter idle state
1450 * @worker: worker which is entering idle state 1530 * @worker: worker which is entering idle state
1451 * 1531 *
@@ -1459,9 +1539,10 @@ static void worker_enter_idle(struct worker *worker)
1459{ 1539{
1460 struct worker_pool *pool = worker->pool; 1540 struct worker_pool *pool = worker->pool;
1461 1541
1462 BUG_ON(worker->flags & WORKER_IDLE); 1542 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1463 BUG_ON(!list_empty(&worker->entry) && 1543 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1464 (worker->hentry.next || worker->hentry.pprev)); 1544 (worker->hentry.next || worker->hentry.pprev)))
1545 return;
1465 1546
1466 /* can't use worker_set_flags(), also called from start_worker() */ 1547 /* can't use worker_set_flags(), also called from start_worker() */
1467 worker->flags |= WORKER_IDLE; 1548 worker->flags |= WORKER_IDLE;
@@ -1498,22 +1579,25 @@ static void worker_leave_idle(struct worker *worker)
1498{ 1579{
1499 struct worker_pool *pool = worker->pool; 1580 struct worker_pool *pool = worker->pool;
1500 1581
1501 BUG_ON(!(worker->flags & WORKER_IDLE)); 1582 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1583 return;
1502 worker_clr_flags(worker, WORKER_IDLE); 1584 worker_clr_flags(worker, WORKER_IDLE);
1503 pool->nr_idle--; 1585 pool->nr_idle--;
1504 list_del_init(&worker->entry); 1586 list_del_init(&worker->entry);
1505} 1587}
1506 1588
1507/** 1589/**
1508 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool 1590 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1509 * @worker: self 1591 * @pool: target worker_pool
1592 *
1593 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1510 * 1594 *
1511 * Works which are scheduled while the cpu is online must at least be 1595 * Works which are scheduled while the cpu is online must at least be
1512 * scheduled to a worker which is bound to the cpu so that if they are 1596 * scheduled to a worker which is bound to the cpu so that if they are
1513 * flushed from cpu callbacks while cpu is going down, they are 1597 * flushed from cpu callbacks while cpu is going down, they are
1514 * guaranteed to execute on the cpu. 1598 * guaranteed to execute on the cpu.
1515 * 1599 *
1516 * This function is to be used by rogue workers and rescuers to bind 1600 * This function is to be used by unbound workers and rescuers to bind
1517 * themselves to the target cpu and may race with cpu going down or 1601 * themselves to the target cpu and may race with cpu going down or
1518 * coming online. kthread_bind() can't be used because it may put the 1602 * coming online. kthread_bind() can't be used because it may put the
1519 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1603 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
@@ -1534,12 +1618,9 @@ static void worker_leave_idle(struct worker *worker)
1534 * %true if the associated pool is online (@worker is successfully 1618 * %true if the associated pool is online (@worker is successfully
1535 * bound), %false if offline. 1619 * bound), %false if offline.
1536 */ 1620 */
1537static bool worker_maybe_bind_and_lock(struct worker *worker) 1621static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1538__acquires(&pool->lock) 1622__acquires(&pool->lock)
1539{ 1623{
1540 struct worker_pool *pool = worker->pool;
1541 struct task_struct *task = worker->task;
1542
1543 while (true) { 1624 while (true) {
1544 /* 1625 /*
1545 * The following call may fail, succeed or succeed 1626 * The following call may fail, succeed or succeed
@@ -1548,14 +1629,13 @@ __acquires(&pool->lock)
1548 * against POOL_DISASSOCIATED. 1629 * against POOL_DISASSOCIATED.
1549 */ 1630 */
1550 if (!(pool->flags & POOL_DISASSOCIATED)) 1631 if (!(pool->flags & POOL_DISASSOCIATED))
1551 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); 1632 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1552 1633
1553 spin_lock_irq(&pool->lock); 1634 spin_lock_irq(&pool->lock);
1554 if (pool->flags & POOL_DISASSOCIATED) 1635 if (pool->flags & POOL_DISASSOCIATED)
1555 return false; 1636 return false;
1556 if (task_cpu(task) == pool->cpu && 1637 if (task_cpu(current) == pool->cpu &&
1557 cpumask_equal(&current->cpus_allowed, 1638 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1558 get_cpu_mask(pool->cpu)))
1559 return true; 1639 return true;
1560 spin_unlock_irq(&pool->lock); 1640 spin_unlock_irq(&pool->lock);
1561 1641
@@ -1570,108 +1650,6 @@ __acquires(&pool->lock)
1570 } 1650 }
1571} 1651}
1572 1652
1573/*
1574 * Rebind an idle @worker to its CPU. worker_thread() will test
1575 * list_empty(@worker->entry) before leaving idle and call this function.
1576 */
1577static void idle_worker_rebind(struct worker *worker)
1578{
1579 /* CPU may go down again inbetween, clear UNBOUND only on success */
1580 if (worker_maybe_bind_and_lock(worker))
1581 worker_clr_flags(worker, WORKER_UNBOUND);
1582
1583 /* rebind complete, become available again */
1584 list_add(&worker->entry, &worker->pool->idle_list);
1585 spin_unlock_irq(&worker->pool->lock);
1586}
1587
1588/*
1589 * Function for @worker->rebind.work used to rebind unbound busy workers to
1590 * the associated cpu which is coming back online. This is scheduled by
1591 * cpu up but can race with other cpu hotplug operations and may be
1592 * executed twice without intervening cpu down.
1593 */
1594static void busy_worker_rebind_fn(struct work_struct *work)
1595{
1596 struct worker *worker = container_of(work, struct worker, rebind_work);
1597
1598 if (worker_maybe_bind_and_lock(worker))
1599 worker_clr_flags(worker, WORKER_UNBOUND);
1600
1601 spin_unlock_irq(&worker->pool->lock);
1602}
1603
1604/**
1605 * rebind_workers - rebind all workers of a pool to the associated CPU
1606 * @pool: pool of interest
1607 *
1608 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1609 * is different for idle and busy ones.
1610 *
1611 * Idle ones will be removed from the idle_list and woken up. They will
1612 * add themselves back after completing rebind. This ensures that the
1613 * idle_list doesn't contain any unbound workers when re-bound busy workers
1614 * try to perform local wake-ups for concurrency management.
1615 *
1616 * Busy workers can rebind after they finish their current work items.
1617 * Queueing the rebind work item at the head of the scheduled list is
1618 * enough. Note that nr_running will be properly bumped as busy workers
1619 * rebind.
1620 *
1621 * On return, all non-manager workers are scheduled for rebind - see
1622 * manage_workers() for the manager special case. Any idle worker
1623 * including the manager will not appear on @idle_list until rebind is
1624 * complete, making local wake-ups safe.
1625 */
1626static void rebind_workers(struct worker_pool *pool)
1627{
1628 struct worker *worker, *n;
1629 int i;
1630
1631 lockdep_assert_held(&pool->assoc_mutex);
1632 lockdep_assert_held(&pool->lock);
1633
1634 /* dequeue and kick idle ones */
1635 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1636 /*
1637 * idle workers should be off @pool->idle_list until rebind
1638 * is complete to avoid receiving premature local wake-ups.
1639 */
1640 list_del_init(&worker->entry);
1641
1642 /*
1643 * worker_thread() will see the above dequeuing and call
1644 * idle_worker_rebind().
1645 */
1646 wake_up_process(worker->task);
1647 }
1648
1649 /* rebind busy workers */
1650 for_each_busy_worker(worker, i, pool) {
1651 struct work_struct *rebind_work = &worker->rebind_work;
1652 struct workqueue_struct *wq;
1653
1654 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1655 work_data_bits(rebind_work)))
1656 continue;
1657
1658 debug_work_activate(rebind_work);
1659
1660 /*
1661 * wq doesn't really matter but let's keep @worker->pool
1662 * and @pwq->pool consistent for sanity.
1663 */
1664 if (std_worker_pool_pri(worker->pool))
1665 wq = system_highpri_wq;
1666 else
1667 wq = system_wq;
1668
1669 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1670 worker->scheduled.next,
1671 work_color_to_flags(WORK_NO_COLOR));
1672 }
1673}
1674
1675static struct worker *alloc_worker(void) 1653static struct worker *alloc_worker(void)
1676{ 1654{
1677 struct worker *worker; 1655 struct worker *worker;
@@ -1680,7 +1658,6 @@ static struct worker *alloc_worker(void)
1680 if (worker) { 1658 if (worker) {
1681 INIT_LIST_HEAD(&worker->entry); 1659 INIT_LIST_HEAD(&worker->entry);
1682 INIT_LIST_HEAD(&worker->scheduled); 1660 INIT_LIST_HEAD(&worker->scheduled);
1683 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1684 /* on creation a worker is in !idle && prep state */ 1661 /* on creation a worker is in !idle && prep state */
1685 worker->flags = WORKER_PREP; 1662 worker->flags = WORKER_PREP;
1686 } 1663 }
@@ -1703,18 +1680,25 @@ static struct worker *alloc_worker(void)
1703 */ 1680 */
1704static struct worker *create_worker(struct worker_pool *pool) 1681static struct worker *create_worker(struct worker_pool *pool)
1705{ 1682{
1706 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1707 struct worker *worker = NULL; 1683 struct worker *worker = NULL;
1708 int id = -1; 1684 int id = -1;
1685 char id_buf[16];
1709 1686
1687 lockdep_assert_held(&pool->manager_mutex);
1688
1689 /*
1690 * ID is needed to determine kthread name. Allocate ID first
1691 * without installing the pointer.
1692 */
1693 idr_preload(GFP_KERNEL);
1710 spin_lock_irq(&pool->lock); 1694 spin_lock_irq(&pool->lock);
1711 while (ida_get_new(&pool->worker_ida, &id)) { 1695
1712 spin_unlock_irq(&pool->lock); 1696 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1713 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1697
1714 goto fail;
1715 spin_lock_irq(&pool->lock);
1716 }
1717 spin_unlock_irq(&pool->lock); 1698 spin_unlock_irq(&pool->lock);
1699 idr_preload_end();
1700 if (id < 0)
1701 goto fail;
1718 1702
1719 worker = alloc_worker(); 1703 worker = alloc_worker();
1720 if (!worker) 1704 if (!worker)
@@ -1723,40 +1707,46 @@ static struct worker *create_worker(struct worker_pool *pool)
1723 worker->pool = pool; 1707 worker->pool = pool;
1724 worker->id = id; 1708 worker->id = id;
1725 1709
1726 if (pool->cpu != WORK_CPU_UNBOUND) 1710 if (pool->cpu >= 0)
1727 worker->task = kthread_create_on_node(worker_thread, 1711 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
1728 worker, cpu_to_node(pool->cpu), 1712 pool->attrs->nice < 0 ? "H" : "");
1729 "kworker/%u:%d%s", pool->cpu, id, pri);
1730 else 1713 else
1731 worker->task = kthread_create(worker_thread, worker, 1714 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
1732 "kworker/u:%d%s", id, pri); 1715
1716 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
1717 "kworker/%s", id_buf);
1733 if (IS_ERR(worker->task)) 1718 if (IS_ERR(worker->task))
1734 goto fail; 1719 goto fail;
1735 1720
1736 if (std_worker_pool_pri(pool)) 1721 /*
1737 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1722 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1723 * online CPUs. It'll be re-applied when any of the CPUs come up.
1724 */
1725 set_user_nice(worker->task, pool->attrs->nice);
1726 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1727
1728 /* prevent userland from meddling with cpumask of workqueue workers */
1729 worker->task->flags |= PF_NO_SETAFFINITY;
1738 1730
1739 /* 1731 /*
1740 * Determine CPU binding of the new worker depending on 1732 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1741 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the 1733 * remains stable across this function. See the comments above the
1742 * flag remains stable across this function. See the comments 1734 * flag definition for details.
1743 * above the flag definition for details.
1744 *
1745 * As an unbound worker may later become a regular one if CPU comes
1746 * online, make sure every worker has %PF_THREAD_BOUND set.
1747 */ 1735 */
1748 if (!(pool->flags & POOL_DISASSOCIATED)) { 1736 if (pool->flags & POOL_DISASSOCIATED)
1749 kthread_bind(worker->task, pool->cpu);
1750 } else {
1751 worker->task->flags |= PF_THREAD_BOUND;
1752 worker->flags |= WORKER_UNBOUND; 1737 worker->flags |= WORKER_UNBOUND;
1753 } 1738
1739 /* successful, commit the pointer to idr */
1740 spin_lock_irq(&pool->lock);
1741 idr_replace(&pool->worker_idr, worker, worker->id);
1742 spin_unlock_irq(&pool->lock);
1754 1743
1755 return worker; 1744 return worker;
1745
1756fail: 1746fail:
1757 if (id >= 0) { 1747 if (id >= 0) {
1758 spin_lock_irq(&pool->lock); 1748 spin_lock_irq(&pool->lock);
1759 ida_remove(&pool->worker_ida, id); 1749 idr_remove(&pool->worker_idr, id);
1760 spin_unlock_irq(&pool->lock); 1750 spin_unlock_irq(&pool->lock);
1761 } 1751 }
1762 kfree(worker); 1752 kfree(worker);
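
The rewritten allocation path above follows a two-stage idr pattern: reserve an ID with a NULL pointer while holding pool->lock, do the expensive kthread setup outside the lock, then publish the worker with idr_replace() once setup has succeeded, or return the ID with idr_remove() on failure. Below is a minimal sketch of that pattern in isolation, using hypothetical names (my_idr, my_lock, my_obj) rather than the workqueue structures and assuming the idr_preload()/idr_alloc() API used in this hunk.

        #include <linux/idr.h>
        #include <linux/slab.h>
        #include <linux/spinlock.h>

        static DEFINE_IDR(my_idr);              /* hypothetical object registry */
        static DEFINE_SPINLOCK(my_lock);        /* protects my_idr users */

        struct my_obj {
                int id;
        };

        static struct my_obj *my_obj_create(void)
        {
                struct my_obj *obj;
                int id;

                /* stage 1: reserve an ID, publishing NULL for now */
                idr_preload(GFP_KERNEL);
                spin_lock_irq(&my_lock);
                id = idr_alloc(&my_idr, NULL, 0, 0, GFP_NOWAIT);
                spin_unlock_irq(&my_lock);
                idr_preload_end();
                if (id < 0)
                        return NULL;

                /* expensive setup runs outside the lock */
                obj = kzalloc(sizeof(*obj), GFP_KERNEL);
                if (!obj) {
                        spin_lock_irq(&my_lock);
                        idr_remove(&my_idr, id);        /* give the ID back */
                        spin_unlock_irq(&my_lock);
                        return NULL;
                }
                obj->id = id;

                /* stage 2: commit the pointer; lookups now see @obj */
                spin_lock_irq(&my_lock);
                idr_replace(&my_idr, obj, id);
                spin_unlock_irq(&my_lock);
                return obj;
        }

Publishing NULL first means a lookup that races the setup sees "no object yet" rather than a half-initialized worker, which is why create_worker() only calls idr_replace() after the kthread and flags are ready.
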
@@ -1781,6 +1771,30 @@ static void start_worker(struct worker *worker)
1781} 1771}
1782 1772
1783/** 1773/**
1774 * create_and_start_worker - create and start a worker for a pool
1775 * @pool: the target pool
1776 *
1777 * Grab the managership of @pool and create and start a new worker for it.
1778 */
1779static int create_and_start_worker(struct worker_pool *pool)
1780{
1781 struct worker *worker;
1782
1783 mutex_lock(&pool->manager_mutex);
1784
1785 worker = create_worker(pool);
1786 if (worker) {
1787 spin_lock_irq(&pool->lock);
1788 start_worker(worker);
1789 spin_unlock_irq(&pool->lock);
1790 }
1791
1792 mutex_unlock(&pool->manager_mutex);
1793
1794 return worker ? 0 : -ENOMEM;
1795}
1796
1797/**
1784 * destroy_worker - destroy a workqueue worker 1798 * destroy_worker - destroy a workqueue worker
1785 * @worker: worker to be destroyed 1799 * @worker: worker to be destroyed
1786 * 1800 *
@@ -1792,11 +1806,14 @@ static void start_worker(struct worker *worker)
1792static void destroy_worker(struct worker *worker) 1806static void destroy_worker(struct worker *worker)
1793{ 1807{
1794 struct worker_pool *pool = worker->pool; 1808 struct worker_pool *pool = worker->pool;
1795 int id = worker->id; 1809
1810 lockdep_assert_held(&pool->manager_mutex);
1811 lockdep_assert_held(&pool->lock);
1796 1812
1797 /* sanity check frenzy */ 1813 /* sanity check frenzy */
1798 BUG_ON(worker->current_work); 1814 if (WARN_ON(worker->current_work) ||
1799 BUG_ON(!list_empty(&worker->scheduled)); 1815 WARN_ON(!list_empty(&worker->scheduled)))
1816 return;
1800 1817
1801 if (worker->flags & WORKER_STARTED) 1818 if (worker->flags & WORKER_STARTED)
1802 pool->nr_workers--; 1819 pool->nr_workers--;
@@ -1806,13 +1823,14 @@ static void destroy_worker(struct worker *worker)
1806 list_del_init(&worker->entry); 1823 list_del_init(&worker->entry);
1807 worker->flags |= WORKER_DIE; 1824 worker->flags |= WORKER_DIE;
1808 1825
1826 idr_remove(&pool->worker_idr, worker->id);
1827
1809 spin_unlock_irq(&pool->lock); 1828 spin_unlock_irq(&pool->lock);
1810 1829
1811 kthread_stop(worker->task); 1830 kthread_stop(worker->task);
1812 kfree(worker); 1831 kfree(worker);
1813 1832
1814 spin_lock_irq(&pool->lock); 1833 spin_lock_irq(&pool->lock);
1815 ida_remove(&pool->worker_ida, id);
1816} 1834}
1817 1835
1818static void idle_worker_timeout(unsigned long __pool) 1836static void idle_worker_timeout(unsigned long __pool)
@@ -1841,23 +1859,21 @@ static void idle_worker_timeout(unsigned long __pool)
1841 spin_unlock_irq(&pool->lock); 1859 spin_unlock_irq(&pool->lock);
1842} 1860}
1843 1861
1844static bool send_mayday(struct work_struct *work) 1862static void send_mayday(struct work_struct *work)
1845{ 1863{
1846 struct pool_workqueue *pwq = get_work_pwq(work); 1864 struct pool_workqueue *pwq = get_work_pwq(work);
1847 struct workqueue_struct *wq = pwq->wq; 1865 struct workqueue_struct *wq = pwq->wq;
1848 unsigned int cpu;
1849 1866
1850 if (!(wq->flags & WQ_RESCUER)) 1867 lockdep_assert_held(&wq_mayday_lock);
1851 return false; 1868
1869 if (!wq->rescuer)
1870 return;
1852 1871
1853 /* mayday mayday mayday */ 1872 /* mayday mayday mayday */
1854 cpu = pwq->pool->cpu; 1873 if (list_empty(&pwq->mayday_node)) {
1855 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1874 list_add_tail(&pwq->mayday_node, &wq->maydays);
1856 if (cpu == WORK_CPU_UNBOUND)
1857 cpu = 0;
1858 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1859 wake_up_process(wq->rescuer->task); 1875 wake_up_process(wq->rescuer->task);
1860 return true; 1876 }
1861} 1877}
1862 1878
1863static void pool_mayday_timeout(unsigned long __pool) 1879static void pool_mayday_timeout(unsigned long __pool)
@@ -1865,7 +1881,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1865 struct worker_pool *pool = (void *)__pool; 1881 struct worker_pool *pool = (void *)__pool;
1866 struct work_struct *work; 1882 struct work_struct *work;
1867 1883
1868 spin_lock_irq(&pool->lock); 1884 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
1885 spin_lock(&pool->lock);
1869 1886
1870 if (need_to_create_worker(pool)) { 1887 if (need_to_create_worker(pool)) {
1871 /* 1888 /*
@@ -1878,7 +1895,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1878 send_mayday(work); 1895 send_mayday(work);
1879 } 1896 }
1880 1897
1881 spin_unlock_irq(&pool->lock); 1898 spin_unlock(&pool->lock);
1899 spin_unlock_irq(&wq_mayday_lock);
1882 1900
1883 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1901 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1884} 1902}
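
With the per-workqueue wq->maydays list replacing the old per-CPU mayday mask, deduplication now relies purely on list membership: a pwq is queued only while its mayday_node is unhooked. A small sketch of that idiom with hypothetical names (my_requests, my_req), assuming the caller provides the same kind of external serialization that wq_mayday_lock provides here.

        #include <linux/list.h>

        static LIST_HEAD(my_requests);          /* hypothetical pending list */

        struct my_req {
                struct list_head node;          /* empty while not queued */
        };

        /* caller must hold the lock protecting my_requests */
        static void my_req_queue(struct my_req *req)
        {
                /*
                 * list_empty() on the node doubles as the "already queued"
                 * test, so a request is never linked twice.  This only
                 * works if the node starts out INIT_LIST_HEAD()'d and is
                 * taken off the list with list_del_init().
                 */
                if (list_empty(&req->node))
                        list_add_tail(&req->node, &my_requests);
        }
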
@@ -1893,8 +1911,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1893 * sent to all rescuers with works scheduled on @pool to resolve 1911 * sent to all rescuers with works scheduled on @pool to resolve
1894 * possible allocation deadlock. 1912 * possible allocation deadlock.
1895 * 1913 *
1896 * On return, need_to_create_worker() is guaranteed to be false and 1914 * On return, need_to_create_worker() is guaranteed to be %false and
1897 * may_start_working() true. 1915 * may_start_working() %true.
1898 * 1916 *
1899 * LOCKING: 1917 * LOCKING:
1900 * spin_lock_irq(pool->lock) which may be released and regrabbed 1918 * spin_lock_irq(pool->lock) which may be released and regrabbed
@@ -1902,7 +1920,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1902 * manager. 1920 * manager.
1903 * 1921 *
1904 * RETURNS: 1922 * RETURNS:
1905 * false if no action was taken and pool->lock stayed locked, true 1923 * %false if no action was taken and pool->lock stayed locked, %true
1906 * otherwise. 1924 * otherwise.
1907 */ 1925 */
1908static bool maybe_create_worker(struct worker_pool *pool) 1926static bool maybe_create_worker(struct worker_pool *pool)
@@ -1925,7 +1943,8 @@ restart:
1925 del_timer_sync(&pool->mayday_timer); 1943 del_timer_sync(&pool->mayday_timer);
1926 spin_lock_irq(&pool->lock); 1944 spin_lock_irq(&pool->lock);
1927 start_worker(worker); 1945 start_worker(worker);
1928 BUG_ON(need_to_create_worker(pool)); 1946 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1947 goto restart;
1929 return true; 1948 return true;
1930 } 1949 }
1931 1950
@@ -1958,7 +1977,7 @@ restart:
1958 * multiple times. Called only from manager. 1977 * multiple times. Called only from manager.
1959 * 1978 *
1960 * RETURNS: 1979 * RETURNS:
1961 * false if no action was taken and pool->lock stayed locked, true 1980 * %false if no action was taken and pool->lock stayed locked, %true
1962 * otherwise. 1981 * otherwise.
1963 */ 1982 */
1964static bool maybe_destroy_workers(struct worker_pool *pool) 1983static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2009,42 +2028,37 @@ static bool manage_workers(struct worker *worker)
2009 struct worker_pool *pool = worker->pool; 2028 struct worker_pool *pool = worker->pool;
2010 bool ret = false; 2029 bool ret = false;
2011 2030
2012 if (pool->flags & POOL_MANAGING_WORKERS) 2031 /*
2032 * Managership is governed by two mutexes - manager_arb and
2033 * manager_mutex. manager_arb handles arbitration of manager role.
2034 * Anyone who successfully grabs manager_arb wins the arbitration
 2035 * and becomes the manager. Failure of mutex_trylock() on pool->manager_arb
 2036 * while holding pool->lock reliably indicates that someone
2037 * else is managing the pool and the worker which failed trylock
2038 * can proceed to executing work items. This means that anyone
2039 * grabbing manager_arb is responsible for actually performing
2040 * manager duties. If manager_arb is grabbed and released without
2041 * actual management, the pool may stall indefinitely.
2042 *
2043 * manager_mutex is used for exclusion of actual management
 2044 * operations. The holder of manager_mutex can be sure that none
 2045 * of the management operations, including creation and destruction of
 2046 * workers, will take place until the mutex is released. Because
 2047 * manager_mutex doesn't interfere with manager role arbitration,
 2048 * it is guaranteed that the pool's management, while it may be
2049 * delayed, won't be disturbed by someone else grabbing
2050 * manager_mutex.
2051 */
2052 if (!mutex_trylock(&pool->manager_arb))
2013 return ret; 2053 return ret;
2014 2054
2015 pool->flags |= POOL_MANAGING_WORKERS;
2016
2017 /* 2055 /*
2018 * To simplify both worker management and CPU hotplug, hold off 2056 * With manager arbitration won, manager_mutex would be free in
2019 * management while hotplug is in progress. CPU hotplug path can't 2057 * most cases. trylock first without dropping @pool->lock.
2020 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2021 * lead to idle worker depletion (all become busy thinking someone
2022 * else is managing) which in turn can result in deadlock under
2023 * extreme circumstances. Use @pool->assoc_mutex to synchronize
2024 * manager against CPU hotplug.
2025 *
2026 * assoc_mutex would always be free unless CPU hotplug is in
2027 * progress. trylock first without dropping @pool->lock.
2028 */ 2058 */
2029 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2059 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2030 spin_unlock_irq(&pool->lock); 2060 spin_unlock_irq(&pool->lock);
2031 mutex_lock(&pool->assoc_mutex); 2061 mutex_lock(&pool->manager_mutex);
2032 /*
2033 * CPU hotplug could have happened while we were waiting
2034 * for assoc_mutex. Hotplug itself can't handle us
2035 * because manager isn't either on idle or busy list, and
2036 * @pool's state and ours could have deviated.
2037 *
2038 * As hotplug is now excluded via assoc_mutex, we can
2039 * simply try to bind. It will succeed or fail depending
2040 * on @pool's current state. Try it and adjust
2041 * %WORKER_UNBOUND accordingly.
2042 */
2043 if (worker_maybe_bind_and_lock(worker))
2044 worker->flags &= ~WORKER_UNBOUND;
2045 else
2046 worker->flags |= WORKER_UNBOUND;
2047
2048 ret = true; 2062 ret = true;
2049 } 2063 }
2050 2064
@@ -2057,8 +2071,8 @@ static bool manage_workers(struct worker *worker)
2057 ret |= maybe_destroy_workers(pool); 2071 ret |= maybe_destroy_workers(pool);
2058 ret |= maybe_create_worker(pool); 2072 ret |= maybe_create_worker(pool);
2059 2073
2060 pool->flags &= ~POOL_MANAGING_WORKERS; 2074 mutex_unlock(&pool->manager_mutex);
2061 mutex_unlock(&pool->assoc_mutex); 2075 mutex_unlock(&pool->manager_arb);
2062 return ret; 2076 return ret;
2063} 2077}
2064 2078
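
The comment block above describes a two-mutex scheme: manager_arb decides who acts as the manager, manager_mutex excludes concurrent management operations, and a failed trylock on the arbitration mutex under pool->lock reliably means someone else is already managing. The following is a condensed sketch of that locking shape with hypothetical names (my_arb, my_ops_mutex, my_lock); it is the pattern, not the workqueue code itself.

        #include <linux/mutex.h>
        #include <linux/spinlock.h>

        static DEFINE_MUTEX(my_arb);            /* who gets to manage */
        static DEFINE_MUTEX(my_ops_mutex);      /* excludes management ops */
        static DEFINE_SPINLOCK(my_lock);        /* protects shared state */

        /* called with my_lock held; returns true if my_lock was dropped */
        static bool my_manage(void)
        {
                bool dropped_lock = false;

                /* losing the arbitration => someone else is managing */
                if (!mutex_trylock(&my_arb))
                        return false;

                /* usually uncontended; avoid dropping my_lock if possible */
                if (!mutex_trylock(&my_ops_mutex)) {
                        spin_unlock_irq(&my_lock);
                        mutex_lock(&my_ops_mutex);
                        spin_lock_irq(&my_lock);
                        dropped_lock = true;
                }

                /* ... perform management work under both mutexes ... */

                mutex_unlock(&my_ops_mutex);
                mutex_unlock(&my_arb);
                return dropped_lock;
        }

As in manage_workers(), the return value records whether the caller's lock was released and re-taken, so the caller knows any state it cached under that lock may be stale.
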
@@ -2184,6 +2198,7 @@ __acquires(&pool->lock)
2184 worker->current_work = NULL; 2198 worker->current_work = NULL;
2185 worker->current_func = NULL; 2199 worker->current_func = NULL;
2186 worker->current_pwq = NULL; 2200 worker->current_pwq = NULL;
2201 worker->desc_valid = false;
2187 pwq_dec_nr_in_flight(pwq, work_color); 2202 pwq_dec_nr_in_flight(pwq, work_color);
2188} 2203}
2189 2204
@@ -2212,11 +2227,11 @@ static void process_scheduled_works(struct worker *worker)
2212 * worker_thread - the worker thread function 2227 * worker_thread - the worker thread function
2213 * @__worker: self 2228 * @__worker: self
2214 * 2229 *
2215 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools 2230 * The worker thread function. All workers belong to a worker_pool -
2216 * of these per each cpu. These workers process all works regardless of 2231 * either a per-cpu one or dynamic unbound one. These workers process all
2217 * their specific target workqueue. The only exception is works which 2232 * work items regardless of their specific target workqueue. The only
2218 * belong to workqueues with a rescuer which will be explained in 2233 * exception is work items which belong to workqueues with a rescuer which
2219 * rescuer_thread(). 2234 * will be explained in rescuer_thread().
2220 */ 2235 */
2221static int worker_thread(void *__worker) 2236static int worker_thread(void *__worker)
2222{ 2237{
@@ -2228,19 +2243,12 @@ static int worker_thread(void *__worker)
2228woke_up: 2243woke_up:
2229 spin_lock_irq(&pool->lock); 2244 spin_lock_irq(&pool->lock);
2230 2245
2231 /* we are off idle list if destruction or rebind is requested */ 2246 /* am I supposed to die? */
2232 if (unlikely(list_empty(&worker->entry))) { 2247 if (unlikely(worker->flags & WORKER_DIE)) {
2233 spin_unlock_irq(&pool->lock); 2248 spin_unlock_irq(&pool->lock);
2234 2249 WARN_ON_ONCE(!list_empty(&worker->entry));
2235 /* if DIE is set, destruction is requested */ 2250 worker->task->flags &= ~PF_WQ_WORKER;
2236 if (worker->flags & WORKER_DIE) { 2251 return 0;
2237 worker->task->flags &= ~PF_WQ_WORKER;
2238 return 0;
2239 }
2240
2241 /* otherwise, rebind */
2242 idle_worker_rebind(worker);
2243 goto woke_up;
2244 } 2252 }
2245 2253
2246 worker_leave_idle(worker); 2254 worker_leave_idle(worker);
@@ -2258,14 +2266,16 @@ recheck:
2258 * preparing to process a work or actually processing it. 2266 * preparing to process a work or actually processing it.
2259 * Make sure nobody diddled with it while I was sleeping. 2267 * Make sure nobody diddled with it while I was sleeping.
2260 */ 2268 */
2261 BUG_ON(!list_empty(&worker->scheduled)); 2269 WARN_ON_ONCE(!list_empty(&worker->scheduled));
2262 2270
2263 /* 2271 /*
2264 * When control reaches this point, we're guaranteed to have 2272 * Finish PREP stage. We're guaranteed to have at least one idle
2265 * at least one idle worker or that someone else has already 2273 * worker or that someone else has already assumed the manager
2266 * assumed the manager role. 2274 * role. This is where @worker starts participating in concurrency
2275 * management if applicable and concurrency management is restored
2276 * after being rebound. See rebind_workers() for details.
2267 */ 2277 */
2268 worker_clr_flags(worker, WORKER_PREP); 2278 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
2269 2279
2270 do { 2280 do {
2271 struct work_struct *work = 2281 struct work_struct *work =
@@ -2307,7 +2317,7 @@ sleep:
2307 * @__rescuer: self 2317 * @__rescuer: self
2308 * 2318 *
2309 * Workqueue rescuer thread function. There's one rescuer for each 2319 * Workqueue rescuer thread function. There's one rescuer for each
2310 * workqueue which has WQ_RESCUER set. 2320 * workqueue which has WQ_MEM_RECLAIM set.
2311 * 2321 *
2312 * Regular work processing on a pool may block trying to create a new 2322 * Regular work processing on a pool may block trying to create a new
2313 * worker which uses GFP_KERNEL allocation which has slight chance of 2323 * worker which uses GFP_KERNEL allocation which has slight chance of
@@ -2326,8 +2336,6 @@ static int rescuer_thread(void *__rescuer)
2326 struct worker *rescuer = __rescuer; 2336 struct worker *rescuer = __rescuer;
2327 struct workqueue_struct *wq = rescuer->rescue_wq; 2337 struct workqueue_struct *wq = rescuer->rescue_wq;
2328 struct list_head *scheduled = &rescuer->scheduled; 2338 struct list_head *scheduled = &rescuer->scheduled;
2329 bool is_unbound = wq->flags & WQ_UNBOUND;
2330 unsigned int cpu;
2331 2339
2332 set_user_nice(current, RESCUER_NICE_LEVEL); 2340 set_user_nice(current, RESCUER_NICE_LEVEL);
2333 2341
@@ -2345,28 +2353,29 @@ repeat:
2345 return 0; 2353 return 0;
2346 } 2354 }
2347 2355
2348 /* 2356 /* see whether any pwq is asking for help */
2349 * See whether any cpu is asking for help. Unbounded 2357 spin_lock_irq(&wq_mayday_lock);
2350 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. 2358
2351 */ 2359 while (!list_empty(&wq->maydays)) {
2352 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2360 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2353 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2361 struct pool_workqueue, mayday_node);
2354 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2355 struct worker_pool *pool = pwq->pool; 2362 struct worker_pool *pool = pwq->pool;
2356 struct work_struct *work, *n; 2363 struct work_struct *work, *n;
2357 2364
2358 __set_current_state(TASK_RUNNING); 2365 __set_current_state(TASK_RUNNING);
2359 mayday_clear_cpu(cpu, wq->mayday_mask); 2366 list_del_init(&pwq->mayday_node);
2367
2368 spin_unlock_irq(&wq_mayday_lock);
2360 2369
2361 /* migrate to the target cpu if possible */ 2370 /* migrate to the target cpu if possible */
2371 worker_maybe_bind_and_lock(pool);
2362 rescuer->pool = pool; 2372 rescuer->pool = pool;
2363 worker_maybe_bind_and_lock(rescuer);
2364 2373
2365 /* 2374 /*
2366 * Slurp in all works issued via this workqueue and 2375 * Slurp in all works issued via this workqueue and
2367 * process'em. 2376 * process'em.
2368 */ 2377 */
2369 BUG_ON(!list_empty(&rescuer->scheduled)); 2378 WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
2370 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2379 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2371 if (get_work_pwq(work) == pwq) 2380 if (get_work_pwq(work) == pwq)
2372 move_linked_works(work, scheduled, &n); 2381 move_linked_works(work, scheduled, &n);
@@ -2381,9 +2390,13 @@ repeat:
2381 if (keep_working(pool)) 2390 if (keep_working(pool))
2382 wake_up_worker(pool); 2391 wake_up_worker(pool);
2383 2392
2384 spin_unlock_irq(&pool->lock); 2393 rescuer->pool = NULL;
2394 spin_unlock(&pool->lock);
2395 spin_lock(&wq_mayday_lock);
2385 } 2396 }
2386 2397
2398 spin_unlock_irq(&wq_mayday_lock);
2399
2387 /* rescuers should never participate in concurrency management */ 2400 /* rescuers should never participate in concurrency management */
2388 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2401 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2389 schedule(); 2402 schedule();
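
Because the rescuer must drop wq_mayday_lock before it can take a pool lock and process work items, the loop above re-acquires the list lock at the bottom of every iteration and re-tests list_empty(). A condensed sketch of that "pop one, drop the list lock, process, re-take" loop with hypothetical names (my_list_lock, my_pending, my_item).

        #include <linux/list.h>
        #include <linux/spinlock.h>

        static DEFINE_SPINLOCK(my_list_lock);
        static LIST_HEAD(my_pending);

        struct my_item {
                struct list_head node;
        };

        static void my_drain_pending(void)
        {
                spin_lock_irq(&my_list_lock);

                while (!list_empty(&my_pending)) {
                        struct my_item *item = list_first_entry(&my_pending,
                                                        struct my_item, node);

                        /* detach with _init so the item can be re-queued later */
                        list_del_init(&item->node);
                        spin_unlock_irq(&my_list_lock);

                        /* ... process @item, taking its own locks as needed ... */

                        spin_lock_irq(&my_list_lock);
                }

                spin_unlock_irq(&my_list_lock);
        }
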
@@ -2487,7 +2500,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2487 * advanced to @work_color. 2500 * advanced to @work_color.
2488 * 2501 *
2489 * CONTEXT: 2502 * CONTEXT:
2490 * mutex_lock(wq->flush_mutex). 2503 * mutex_lock(wq->mutex).
2491 * 2504 *
2492 * RETURNS: 2505 * RETURNS:
2493 * %true if @flush_color >= 0 and there's something to flush. %false 2506 * %true if @flush_color >= 0 and there's something to flush. %false
@@ -2497,21 +2510,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2497 int flush_color, int work_color) 2510 int flush_color, int work_color)
2498{ 2511{
2499 bool wait = false; 2512 bool wait = false;
2500 unsigned int cpu; 2513 struct pool_workqueue *pwq;
2501 2514
2502 if (flush_color >= 0) { 2515 if (flush_color >= 0) {
2503 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); 2516 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
2504 atomic_set(&wq->nr_pwqs_to_flush, 1); 2517 atomic_set(&wq->nr_pwqs_to_flush, 1);
2505 } 2518 }
2506 2519
2507 for_each_pwq_cpu(cpu, wq) { 2520 for_each_pwq(pwq, wq) {
2508 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2509 struct worker_pool *pool = pwq->pool; 2521 struct worker_pool *pool = pwq->pool;
2510 2522
2511 spin_lock_irq(&pool->lock); 2523 spin_lock_irq(&pool->lock);
2512 2524
2513 if (flush_color >= 0) { 2525 if (flush_color >= 0) {
2514 BUG_ON(pwq->flush_color != -1); 2526 WARN_ON_ONCE(pwq->flush_color != -1);
2515 2527
2516 if (pwq->nr_in_flight[flush_color]) { 2528 if (pwq->nr_in_flight[flush_color]) {
2517 pwq->flush_color = flush_color; 2529 pwq->flush_color = flush_color;
@@ -2521,7 +2533,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2521 } 2533 }
2522 2534
2523 if (work_color >= 0) { 2535 if (work_color >= 0) {
2524 BUG_ON(work_color != work_next_color(pwq->work_color)); 2536 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
2525 pwq->work_color = work_color; 2537 pwq->work_color = work_color;
2526 } 2538 }
2527 2539
@@ -2538,11 +2550,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2538 * flush_workqueue - ensure that any scheduled work has run to completion. 2550 * flush_workqueue - ensure that any scheduled work has run to completion.
2539 * @wq: workqueue to flush 2551 * @wq: workqueue to flush
2540 * 2552 *
2541 * Forces execution of the workqueue and blocks until its completion. 2553 * This function sleeps until all work items which were queued on entry
2542 * This is typically used in driver shutdown handlers. 2554 * have finished execution, but it is not livelocked by new incoming ones.
2543 *
2544 * We sleep until all works which were queued on entry have been handled,
2545 * but we are not livelocked by new incoming ones.
2546 */ 2555 */
2547void flush_workqueue(struct workqueue_struct *wq) 2556void flush_workqueue(struct workqueue_struct *wq)
2548{ 2557{
@@ -2556,7 +2565,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2556 lock_map_acquire(&wq->lockdep_map); 2565 lock_map_acquire(&wq->lockdep_map);
2557 lock_map_release(&wq->lockdep_map); 2566 lock_map_release(&wq->lockdep_map);
2558 2567
2559 mutex_lock(&wq->flush_mutex); 2568 mutex_lock(&wq->mutex);
2560 2569
2561 /* 2570 /*
2562 * Start-to-wait phase 2571 * Start-to-wait phase
@@ -2569,13 +2578,13 @@ void flush_workqueue(struct workqueue_struct *wq)
2569 * becomes our flush_color and work_color is advanced 2578 * becomes our flush_color and work_color is advanced
2570 * by one. 2579 * by one.
2571 */ 2580 */
2572 BUG_ON(!list_empty(&wq->flusher_overflow)); 2581 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
2573 this_flusher.flush_color = wq->work_color; 2582 this_flusher.flush_color = wq->work_color;
2574 wq->work_color = next_color; 2583 wq->work_color = next_color;
2575 2584
2576 if (!wq->first_flusher) { 2585 if (!wq->first_flusher) {
2577 /* no flush in progress, become the first flusher */ 2586 /* no flush in progress, become the first flusher */
2578 BUG_ON(wq->flush_color != this_flusher.flush_color); 2587 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2579 2588
2580 wq->first_flusher = &this_flusher; 2589 wq->first_flusher = &this_flusher;
2581 2590
@@ -2588,7 +2597,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2588 } 2597 }
2589 } else { 2598 } else {
2590 /* wait in queue */ 2599 /* wait in queue */
2591 BUG_ON(wq->flush_color == this_flusher.flush_color); 2600 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
2592 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2601 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2593 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 2602 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2594 } 2603 }
@@ -2601,7 +2610,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2601 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 2610 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2602 } 2611 }
2603 2612
2604 mutex_unlock(&wq->flush_mutex); 2613 mutex_unlock(&wq->mutex);
2605 2614
2606 wait_for_completion(&this_flusher.done); 2615 wait_for_completion(&this_flusher.done);
2607 2616
@@ -2614,7 +2623,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2614 if (wq->first_flusher != &this_flusher) 2623 if (wq->first_flusher != &this_flusher)
2615 return; 2624 return;
2616 2625
2617 mutex_lock(&wq->flush_mutex); 2626 mutex_lock(&wq->mutex);
2618 2627
2619 /* we might have raced, check again with mutex held */ 2628 /* we might have raced, check again with mutex held */
2620 if (wq->first_flusher != &this_flusher) 2629 if (wq->first_flusher != &this_flusher)
@@ -2622,8 +2631,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2622 2631
2623 wq->first_flusher = NULL; 2632 wq->first_flusher = NULL;
2624 2633
2625 BUG_ON(!list_empty(&this_flusher.list)); 2634 WARN_ON_ONCE(!list_empty(&this_flusher.list));
2626 BUG_ON(wq->flush_color != this_flusher.flush_color); 2635 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2627 2636
2628 while (true) { 2637 while (true) {
2629 struct wq_flusher *next, *tmp; 2638 struct wq_flusher *next, *tmp;
@@ -2636,8 +2645,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2636 complete(&next->done); 2645 complete(&next->done);
2637 } 2646 }
2638 2647
2639 BUG_ON(!list_empty(&wq->flusher_overflow) && 2648 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
2640 wq->flush_color != work_next_color(wq->work_color)); 2649 wq->flush_color != work_next_color(wq->work_color));
2641 2650
2642 /* this flush_color is finished, advance by one */ 2651 /* this flush_color is finished, advance by one */
2643 wq->flush_color = work_next_color(wq->flush_color); 2652 wq->flush_color = work_next_color(wq->flush_color);
@@ -2661,7 +2670,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2661 } 2670 }
2662 2671
2663 if (list_empty(&wq->flusher_queue)) { 2672 if (list_empty(&wq->flusher_queue)) {
2664 BUG_ON(wq->flush_color != wq->work_color); 2673 WARN_ON_ONCE(wq->flush_color != wq->work_color);
2665 break; 2674 break;
2666 } 2675 }
2667 2676
@@ -2669,8 +2678,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2669 * Need to flush more colors. Make the next flusher 2678 * Need to flush more colors. Make the next flusher
2670 * the new first flusher and arm pwqs. 2679 * the new first flusher and arm pwqs.
2671 */ 2680 */
2672 BUG_ON(wq->flush_color == wq->work_color); 2681 WARN_ON_ONCE(wq->flush_color == wq->work_color);
2673 BUG_ON(wq->flush_color != next->flush_color); 2682 WARN_ON_ONCE(wq->flush_color != next->flush_color);
2674 2683
2675 list_del_init(&next->list); 2684 list_del_init(&next->list);
2676 wq->first_flusher = next; 2685 wq->first_flusher = next;
@@ -2686,7 +2695,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2686 } 2695 }
2687 2696
2688out_unlock: 2697out_unlock:
2689 mutex_unlock(&wq->flush_mutex); 2698 mutex_unlock(&wq->mutex);
2690} 2699}
2691EXPORT_SYMBOL_GPL(flush_workqueue); 2700EXPORT_SYMBOL_GPL(flush_workqueue);
2692 2701
@@ -2704,22 +2713,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
2704void drain_workqueue(struct workqueue_struct *wq) 2713void drain_workqueue(struct workqueue_struct *wq)
2705{ 2714{
2706 unsigned int flush_cnt = 0; 2715 unsigned int flush_cnt = 0;
2707 unsigned int cpu; 2716 struct pool_workqueue *pwq;
2708 2717
2709 /* 2718 /*
2710 * __queue_work() needs to test whether there are drainers, is much 2719 * __queue_work() needs to test whether there are drainers, is much
2711 * hotter than drain_workqueue() and already looks at @wq->flags. 2720 * hotter than drain_workqueue() and already looks at @wq->flags.
2712 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. 2721 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
2713 */ 2722 */
2714 spin_lock(&workqueue_lock); 2723 mutex_lock(&wq->mutex);
2715 if (!wq->nr_drainers++) 2724 if (!wq->nr_drainers++)
2716 wq->flags |= WQ_DRAINING; 2725 wq->flags |= __WQ_DRAINING;
2717 spin_unlock(&workqueue_lock); 2726 mutex_unlock(&wq->mutex);
2718reflush: 2727reflush:
2719 flush_workqueue(wq); 2728 flush_workqueue(wq);
2720 2729
2721 for_each_pwq_cpu(cpu, wq) { 2730 mutex_lock(&wq->mutex);
2722 struct pool_workqueue *pwq = get_pwq(cpu, wq); 2731
2732 for_each_pwq(pwq, wq) {
2723 bool drained; 2733 bool drained;
2724 2734
2725 spin_lock_irq(&pwq->pool->lock); 2735 spin_lock_irq(&pwq->pool->lock);
@@ -2731,15 +2741,16 @@ reflush:
2731 2741
2732 if (++flush_cnt == 10 || 2742 if (++flush_cnt == 10 ||
2733 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2743 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2734 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", 2744 pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
2735 wq->name, flush_cnt); 2745 wq->name, flush_cnt);
2746
2747 mutex_unlock(&wq->mutex);
2736 goto reflush; 2748 goto reflush;
2737 } 2749 }
2738 2750
2739 spin_lock(&workqueue_lock);
2740 if (!--wq->nr_drainers) 2751 if (!--wq->nr_drainers)
2741 wq->flags &= ~WQ_DRAINING; 2752 wq->flags &= ~__WQ_DRAINING;
2742 spin_unlock(&workqueue_lock); 2753 mutex_unlock(&wq->mutex);
2743} 2754}
2744EXPORT_SYMBOL_GPL(drain_workqueue); 2755EXPORT_SYMBOL_GPL(drain_workqueue);
2745 2756
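
drain_workqueue() is normally reached through destroy_workqueue(), but a subsystem may also call it directly when it has to be certain that no self-requeueing work remains before tearing down shared state. A hedged usage sketch with a hypothetical workqueue pointer (my_wq):

        #include <linux/workqueue.h>

        static struct workqueue_struct *my_wq;  /* hypothetical, set up elsewhere */

        static void my_teardown(void)
        {
                /*
                 * Flush repeatedly until nothing is left, including work
                 * items that requeue themselves; queueing new work from
                 * outside the chain while the wq is __WQ_DRAINING would
                 * trigger a warning.
                 */
                drain_workqueue(my_wq);
                destroy_workqueue(my_wq);
        }
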
@@ -2750,11 +2761,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2750 struct pool_workqueue *pwq; 2761 struct pool_workqueue *pwq;
2751 2762
2752 might_sleep(); 2763 might_sleep();
2764
2765 local_irq_disable();
2753 pool = get_work_pool(work); 2766 pool = get_work_pool(work);
2754 if (!pool) 2767 if (!pool) {
2768 local_irq_enable();
2755 return false; 2769 return false;
2770 }
2756 2771
2757 spin_lock_irq(&pool->lock); 2772 spin_lock(&pool->lock);
2758 /* see the comment in try_to_grab_pending() with the same code */ 2773 /* see the comment in try_to_grab_pending() with the same code */
2759 pwq = get_work_pwq(work); 2774 pwq = get_work_pwq(work);
2760 if (pwq) { 2775 if (pwq) {
@@ -2776,7 +2791,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2776 * flusher is not running on the same workqueue by verifying write 2791 * flusher is not running on the same workqueue by verifying write
2777 * access. 2792 * access.
2778 */ 2793 */
2779 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) 2794 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
2780 lock_map_acquire(&pwq->wq->lockdep_map); 2795 lock_map_acquire(&pwq->wq->lockdep_map);
2781 else 2796 else
2782 lock_map_acquire_read(&pwq->wq->lockdep_map); 2797 lock_map_acquire_read(&pwq->wq->lockdep_map);
@@ -2933,66 +2948,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
2933EXPORT_SYMBOL(cancel_delayed_work_sync); 2948EXPORT_SYMBOL(cancel_delayed_work_sync);
2934 2949
2935/** 2950/**
2936 * schedule_work_on - put work task on a specific cpu
2937 * @cpu: cpu to put the work task on
2938 * @work: job to be done
2939 *
2940 * This puts a job on a specific cpu
2941 */
2942bool schedule_work_on(int cpu, struct work_struct *work)
2943{
2944 return queue_work_on(cpu, system_wq, work);
2945}
2946EXPORT_SYMBOL(schedule_work_on);
2947
2948/**
2949 * schedule_work - put work task in global workqueue
2950 * @work: job to be done
2951 *
2952 * Returns %false if @work was already on the kernel-global workqueue and
2953 * %true otherwise.
2954 *
2955 * This puts a job in the kernel-global workqueue if it was not already
2956 * queued and leaves it in the same position on the kernel-global
2957 * workqueue otherwise.
2958 */
2959bool schedule_work(struct work_struct *work)
2960{
2961 return queue_work(system_wq, work);
2962}
2963EXPORT_SYMBOL(schedule_work);
2964
2965/**
2966 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2967 * @cpu: cpu to use
2968 * @dwork: job to be done
2969 * @delay: number of jiffies to wait
2970 *
2971 * After waiting for a given time this puts a job in the kernel-global
2972 * workqueue on the specified CPU.
2973 */
2974bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2975 unsigned long delay)
2976{
2977 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2978}
2979EXPORT_SYMBOL(schedule_delayed_work_on);
2980
2981/**
2982 * schedule_delayed_work - put work task in global workqueue after delay
2983 * @dwork: job to be done
2984 * @delay: number of jiffies to wait or 0 for immediate execution
2985 *
2986 * After waiting for a given time this puts a job in the kernel-global
2987 * workqueue.
2988 */
2989bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
2990{
2991 return queue_delayed_work(system_wq, dwork, delay);
2992}
2993EXPORT_SYMBOL(schedule_delayed_work);
2994
2995/**
2996 * schedule_on_each_cpu - execute a function synchronously on each online CPU 2951 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2997 * @func: the function to call 2952 * @func: the function to call
2998 * 2953 *
@@ -3085,51 +3040,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3085} 3040}
3086EXPORT_SYMBOL_GPL(execute_in_process_context); 3041EXPORT_SYMBOL_GPL(execute_in_process_context);
3087 3042
3088int keventd_up(void) 3043#ifdef CONFIG_SYSFS
3044/*
 3045 * Workqueues with the WQ_SYSFS flag set are visible to userland via
3046 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
3047 * following attributes.
3048 *
3049 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
3050 * max_active RW int : maximum number of in-flight work items
3051 *
3052 * Unbound workqueues have the following extra attributes.
3053 *
 3054 * pool_ids RO int : the associated pool IDs, one per NUMA node
3055 * nice RW int : nice value of the workers
3056 * cpumask RW mask : bitmask of allowed CPUs for the workers
3057 */
3058struct wq_device {
3059 struct workqueue_struct *wq;
3060 struct device dev;
3061};
3062
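
Tying the comment above back to the allocation side: only workqueues created with WQ_SYSFS (or registered explicitly through workqueue_sysfs_register() below) show up under /sys/bus/workqueue/devices. A minimal, hypothetical allocation that would expose an unbound workqueue named "my_wq", making its nice value and cpumask tunable from userland:

        #include <linux/errno.h>
        #include <linux/init.h>
        #include <linux/workqueue.h>

        static struct workqueue_struct *my_wq;  /* hypothetical */

        static int __init my_wq_init(void)
        {
                /* visible as /sys/bus/workqueue/devices/my_wq */
                my_wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_SYSFS, 0);
                return my_wq ? 0 : -ENOMEM;
        }

Writes to the resulting nice and cpumask files then go through the apply_workqueue_attrs() path added later in this patch.
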
3063static struct workqueue_struct *dev_to_wq(struct device *dev)
3064{
3065 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3066
3067 return wq_dev->wq;
3068}
3069
3070static ssize_t wq_per_cpu_show(struct device *dev,
3071 struct device_attribute *attr, char *buf)
3072{
3073 struct workqueue_struct *wq = dev_to_wq(dev);
3074
3075 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3076}
3077
3078static ssize_t wq_max_active_show(struct device *dev,
3079 struct device_attribute *attr, char *buf)
3080{
3081 struct workqueue_struct *wq = dev_to_wq(dev);
3082
3083 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3084}
3085
3086static ssize_t wq_max_active_store(struct device *dev,
3087 struct device_attribute *attr,
3088 const char *buf, size_t count)
3089{
3090 struct workqueue_struct *wq = dev_to_wq(dev);
3091 int val;
3092
3093 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
3094 return -EINVAL;
3095
3096 workqueue_set_max_active(wq, val);
3097 return count;
3098}
3099
3100static struct device_attribute wq_sysfs_attrs[] = {
3101 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
3102 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
3103 __ATTR_NULL,
3104};
3105
3106static ssize_t wq_pool_ids_show(struct device *dev,
3107 struct device_attribute *attr, char *buf)
3108{
3109 struct workqueue_struct *wq = dev_to_wq(dev);
3110 const char *delim = "";
3111 int node, written = 0;
3112
3113 rcu_read_lock_sched();
3114 for_each_node(node) {
3115 written += scnprintf(buf + written, PAGE_SIZE - written,
3116 "%s%d:%d", delim, node,
3117 unbound_pwq_by_node(wq, node)->pool->id);
3118 delim = " ";
3119 }
3120 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3121 rcu_read_unlock_sched();
3122
3123 return written;
3124}
3125
3126static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
3127 char *buf)
3128{
3129 struct workqueue_struct *wq = dev_to_wq(dev);
3130 int written;
3131
3132 mutex_lock(&wq->mutex);
3133 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
3134 mutex_unlock(&wq->mutex);
3135
3136 return written;
3137}
3138
3139/* prepare workqueue_attrs for sysfs store operations */
3140static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
3141{
3142 struct workqueue_attrs *attrs;
3143
3144 attrs = alloc_workqueue_attrs(GFP_KERNEL);
3145 if (!attrs)
3146 return NULL;
3147
3148 mutex_lock(&wq->mutex);
3149 copy_workqueue_attrs(attrs, wq->unbound_attrs);
3150 mutex_unlock(&wq->mutex);
3151 return attrs;
3152}
3153
3154static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3155 const char *buf, size_t count)
3156{
3157 struct workqueue_struct *wq = dev_to_wq(dev);
3158 struct workqueue_attrs *attrs;
3159 int ret;
3160
3161 attrs = wq_sysfs_prep_attrs(wq);
3162 if (!attrs)
3163 return -ENOMEM;
3164
3165 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3166 attrs->nice >= -20 && attrs->nice <= 19)
3167 ret = apply_workqueue_attrs(wq, attrs);
3168 else
3169 ret = -EINVAL;
3170
3171 free_workqueue_attrs(attrs);
3172 return ret ?: count;
3173}
3174
3175static ssize_t wq_cpumask_show(struct device *dev,
3176 struct device_attribute *attr, char *buf)
3177{
3178 struct workqueue_struct *wq = dev_to_wq(dev);
3179 int written;
3180
3181 mutex_lock(&wq->mutex);
3182 written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
3183 mutex_unlock(&wq->mutex);
3184
3185 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3186 return written;
3187}
3188
3189static ssize_t wq_cpumask_store(struct device *dev,
3190 struct device_attribute *attr,
3191 const char *buf, size_t count)
3192{
3193 struct workqueue_struct *wq = dev_to_wq(dev);
3194 struct workqueue_attrs *attrs;
3195 int ret;
3196
3197 attrs = wq_sysfs_prep_attrs(wq);
3198 if (!attrs)
3199 return -ENOMEM;
3200
3201 ret = cpumask_parse(buf, attrs->cpumask);
3202 if (!ret)
3203 ret = apply_workqueue_attrs(wq, attrs);
3204
3205 free_workqueue_attrs(attrs);
3206 return ret ?: count;
3207}
3208
3209static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
3210 char *buf)
3211{
3212 struct workqueue_struct *wq = dev_to_wq(dev);
3213 int written;
3214
3215 mutex_lock(&wq->mutex);
3216 written = scnprintf(buf, PAGE_SIZE, "%d\n",
3217 !wq->unbound_attrs->no_numa);
3218 mutex_unlock(&wq->mutex);
3219
3220 return written;
3221}
3222
3223static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
3224 const char *buf, size_t count)
3225{
3226 struct workqueue_struct *wq = dev_to_wq(dev);
3227 struct workqueue_attrs *attrs;
3228 int v, ret;
3229
3230 attrs = wq_sysfs_prep_attrs(wq);
3231 if (!attrs)
3232 return -ENOMEM;
3233
3234 ret = -EINVAL;
3235 if (sscanf(buf, "%d", &v) == 1) {
3236 attrs->no_numa = !v;
3237 ret = apply_workqueue_attrs(wq, attrs);
3238 }
3239
3240 free_workqueue_attrs(attrs);
3241 return ret ?: count;
3242}
3243
3244static struct device_attribute wq_sysfs_unbound_attrs[] = {
3245 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3246 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3247 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
3248 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3249 __ATTR_NULL,
3250};
3251
3252static struct bus_type wq_subsys = {
3253 .name = "workqueue",
3254 .dev_attrs = wq_sysfs_attrs,
3255};
3256
3257static int __init wq_sysfs_init(void)
3258{
3259 return subsys_virtual_register(&wq_subsys, NULL);
3260}
3261core_initcall(wq_sysfs_init);
3262
3263static void wq_device_release(struct device *dev)
3089{ 3264{
3090 return system_wq != NULL; 3265 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3266
3267 kfree(wq_dev);
3091} 3268}
3092 3269
3093static int alloc_pwqs(struct workqueue_struct *wq) 3270/**
3271 * workqueue_sysfs_register - make a workqueue visible in sysfs
3272 * @wq: the workqueue to register
3273 *
3274 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
3275 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
3276 * which is the preferred method.
3277 *
3278 * Workqueue user should use this function directly iff it wants to apply
3279 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
3280 * apply_workqueue_attrs() may race against userland updating the
3281 * attributes.
3282 *
3283 * Returns 0 on success, -errno on failure.
3284 */
3285int workqueue_sysfs_register(struct workqueue_struct *wq)
3094{ 3286{
3287 struct wq_device *wq_dev;
3288 int ret;
3289
3095 /* 3290 /*
 3096 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3291 * Adjusting max_active or creating new pwqs by applying
3097 * Make sure that the alignment isn't lower than that of 3292 * attributes breaks ordering guarantee. Disallow exposing ordered
3098 * unsigned long long. 3293 * workqueues.
3099 */ 3294 */
3100 const size_t size = sizeof(struct pool_workqueue); 3295 if (WARN_ON(wq->flags & __WQ_ORDERED))
3101 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3296 return -EINVAL;
3102 __alignof__(unsigned long long));
3103 3297
3104 if (!(wq->flags & WQ_UNBOUND)) 3298 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
3105 wq->pool_wq.pcpu = __alloc_percpu(size, align); 3299 if (!wq_dev)
3106 else { 3300 return -ENOMEM;
3107 void *ptr; 3301
3302 wq_dev->wq = wq;
3303 wq_dev->dev.bus = &wq_subsys;
3304 wq_dev->dev.init_name = wq->name;
3305 wq_dev->dev.release = wq_device_release;
3306
3307 /*
3308 * unbound_attrs are created separately. Suppress uevent until
3309 * everything is ready.
3310 */
3311 dev_set_uevent_suppress(&wq_dev->dev, true);
3312
3313 ret = device_register(&wq_dev->dev);
3314 if (ret) {
3315 kfree(wq_dev);
3316 wq->wq_dev = NULL;
3317 return ret;
3318 }
3319
3320 if (wq->flags & WQ_UNBOUND) {
3321 struct device_attribute *attr;
3322
3323 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
3324 ret = device_create_file(&wq_dev->dev, attr);
3325 if (ret) {
3326 device_unregister(&wq_dev->dev);
3327 wq->wq_dev = NULL;
3328 return ret;
3329 }
3330 }
3331 }
3332
3333 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3334 return 0;
3335}
3336
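
For the direct-registration ordering the comment describes (apply attributes first, expose in sysfs second), here is a hedged sketch with hypothetical names (my_setup, my_wq); it assumes the workqueue was allocated WQ_UNBOUND but without WQ_SYSFS so userland cannot race the initial attribute setup.

        #include <linux/errno.h>
        #include <linux/slab.h>
        #include <linux/workqueue.h>

        static int my_setup(struct workqueue_struct *my_wq)
        {
                struct workqueue_attrs *attrs;
                int ret;

                attrs = alloc_workqueue_attrs(GFP_KERNEL);
                if (!attrs)
                        return -ENOMEM;

                attrs->nice = -10;              /* hypothetical policy */
                ret = apply_workqueue_attrs(my_wq, attrs);
                free_workqueue_attrs(attrs);
                if (ret)
                        return ret;

                /* only now make the knobs visible to userland */
                return workqueue_sysfs_register(my_wq);
        }
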
3337/**
3338 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
3339 * @wq: the workqueue to unregister
3340 *
3341 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
3342 */
3343static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
3344{
3345 struct wq_device *wq_dev = wq->wq_dev;
3346
3347 if (!wq->wq_dev)
3348 return;
3349
3350 wq->wq_dev = NULL;
3351 device_unregister(&wq_dev->dev);
3352}
3353#else /* CONFIG_SYSFS */
3354static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
3355#endif /* CONFIG_SYSFS */
3356
3357/**
3358 * free_workqueue_attrs - free a workqueue_attrs
3359 * @attrs: workqueue_attrs to free
3360 *
3361 * Undo alloc_workqueue_attrs().
3362 */
3363void free_workqueue_attrs(struct workqueue_attrs *attrs)
3364{
3365 if (attrs) {
3366 free_cpumask_var(attrs->cpumask);
3367 kfree(attrs);
3368 }
3369}
3370
3371/**
3372 * alloc_workqueue_attrs - allocate a workqueue_attrs
3373 * @gfp_mask: allocation mask to use
3374 *
3375 * Allocate a new workqueue_attrs, initialize with default settings and
3376 * return it. Returns NULL on failure.
3377 */
3378struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3379{
3380 struct workqueue_attrs *attrs;
3381
3382 attrs = kzalloc(sizeof(*attrs), gfp_mask);
3383 if (!attrs)
3384 goto fail;
3385 if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
3386 goto fail;
3387
3388 cpumask_copy(attrs->cpumask, cpu_possible_mask);
3389 return attrs;
3390fail:
3391 free_workqueue_attrs(attrs);
3392 return NULL;
3393}
3394
3395static void copy_workqueue_attrs(struct workqueue_attrs *to,
3396 const struct workqueue_attrs *from)
3397{
3398 to->nice = from->nice;
3399 cpumask_copy(to->cpumask, from->cpumask);
3400}
3401
3402/* hash value of the content of @attr */
3403static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
3404{
3405 u32 hash = 0;
3406
3407 hash = jhash_1word(attrs->nice, hash);
3408 hash = jhash(cpumask_bits(attrs->cpumask),
3409 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
3410 return hash;
3411}
3412
3413/* content equality test */
3414static bool wqattrs_equal(const struct workqueue_attrs *a,
3415 const struct workqueue_attrs *b)
3416{
3417 if (a->nice != b->nice)
3418 return false;
3419 if (!cpumask_equal(a->cpumask, b->cpumask))
3420 return false;
3421 return true;
3422}
3423
3424/**
3425 * init_worker_pool - initialize a newly zalloc'd worker_pool
3426 * @pool: worker_pool to initialize
3427 *
 3428 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
3429 * Returns 0 on success, -errno on failure. Even on failure, all fields
3430 * inside @pool proper are initialized and put_unbound_pool() can be called
3431 * on @pool safely to release it.
3432 */
3433static int init_worker_pool(struct worker_pool *pool)
3434{
3435 spin_lock_init(&pool->lock);
3436 pool->id = -1;
3437 pool->cpu = -1;
3438 pool->node = NUMA_NO_NODE;
3439 pool->flags |= POOL_DISASSOCIATED;
3440 INIT_LIST_HEAD(&pool->worklist);
3441 INIT_LIST_HEAD(&pool->idle_list);
3442 hash_init(pool->busy_hash);
3443
3444 init_timer_deferrable(&pool->idle_timer);
3445 pool->idle_timer.function = idle_worker_timeout;
3446 pool->idle_timer.data = (unsigned long)pool;
3447
3448 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3449 (unsigned long)pool);
3450
3451 mutex_init(&pool->manager_arb);
3452 mutex_init(&pool->manager_mutex);
3453 idr_init(&pool->worker_idr);
3454
3455 INIT_HLIST_NODE(&pool->hash_node);
3456 pool->refcnt = 1;
3457
3458 /* shouldn't fail above this point */
3459 pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
3460 if (!pool->attrs)
3461 return -ENOMEM;
3462 return 0;
3463}
3464
3465static void rcu_free_pool(struct rcu_head *rcu)
3466{
3467 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3468
3469 idr_destroy(&pool->worker_idr);
3470 free_workqueue_attrs(pool->attrs);
3471 kfree(pool);
3472}
3473
3474/**
3475 * put_unbound_pool - put a worker_pool
3476 * @pool: worker_pool to put
3477 *
3478 * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
3479 * safe manner. get_unbound_pool() calls this function on its failure path
3480 * and this function should be able to release pools which went through,
3481 * successfully or not, init_worker_pool().
3482 *
3483 * Should be called with wq_pool_mutex held.
3484 */
3485static void put_unbound_pool(struct worker_pool *pool)
3486{
3487 struct worker *worker;
3488
3489 lockdep_assert_held(&wq_pool_mutex);
3490
3491 if (--pool->refcnt)
3492 return;
3493
3494 /* sanity checks */
3495 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
3496 WARN_ON(!list_empty(&pool->worklist)))
3497 return;
3498
3499 /* release id and unhash */
3500 if (pool->id >= 0)
3501 idr_remove(&worker_pool_idr, pool->id);
3502 hash_del(&pool->hash_node);
3503
3504 /*
3505 * Become the manager and destroy all workers. Grabbing
3506 * manager_arb prevents @pool's workers from blocking on
3507 * manager_mutex.
3508 */
3509 mutex_lock(&pool->manager_arb);
3510 mutex_lock(&pool->manager_mutex);
3511 spin_lock_irq(&pool->lock);
3512
3513 while ((worker = first_worker(pool)))
3514 destroy_worker(worker);
3515 WARN_ON(pool->nr_workers || pool->nr_idle);
3516
3517 spin_unlock_irq(&pool->lock);
3518 mutex_unlock(&pool->manager_mutex);
3519 mutex_unlock(&pool->manager_arb);
3520
3521 /* shut down the timers */
3522 del_timer_sync(&pool->idle_timer);
3523 del_timer_sync(&pool->mayday_timer);
3524
3525 /* sched-RCU protected to allow dereferences from get_work_pool() */
3526 call_rcu_sched(&pool->rcu, rcu_free_pool);
3527}
3528
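
The sched-RCU free above exists so that get_work_pool(), which dereferences pool pointers with preemption disabled rather than under a lock, can never touch freed memory. A minimal sketch of that publish/lookup/deferred-free pattern with hypothetical names (my_pool, my_current); it is the idiom, not the pool code itself.

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct my_pool {
                int id;
                struct rcu_head rcu;
        };

        static struct my_pool __rcu *my_current;        /* hypothetical pointer */

        static void my_pool_free_rcu(struct rcu_head *rcu)
        {
                kfree(container_of(rcu, struct my_pool, rcu));
        }

        static void my_pool_release(struct my_pool *pool)
        {
                /* readers that started before this point may still be running */
                call_rcu_sched(&pool->rcu, my_pool_free_rcu);
        }

        static int my_lookup_id(void)
        {
                struct my_pool *pool;
                int id = -1;

                rcu_read_lock_sched();                  /* disables preemption */
                pool = rcu_dereference_sched(my_current);
                if (pool)
                        id = pool->id;
                rcu_read_unlock_sched();
                return id;
        }
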
3529/**
3530 * get_unbound_pool - get a worker_pool with the specified attributes
3531 * @attrs: the attributes of the worker_pool to get
3532 *
3533 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3534 * reference count and return it. If there already is a matching
3535 * worker_pool, it will be used; otherwise, this function attempts to
3536 * create a new one. On failure, returns NULL.
3537 *
3538 * Should be called with wq_pool_mutex held.
3539 */
3540static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3541{
3542 u32 hash = wqattrs_hash(attrs);
3543 struct worker_pool *pool;
3544 int node;
3545
3546 lockdep_assert_held(&wq_pool_mutex);
3547
3548 /* do we already have a matching pool? */
3549 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3550 if (wqattrs_equal(pool->attrs, attrs)) {
3551 pool->refcnt++;
3552 goto out_unlock;
3553 }
3554 }
3555
3556 /* nope, create a new one */
3557 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
3558 if (!pool || init_worker_pool(pool) < 0)
3559 goto fail;
3560
3561 if (workqueue_freezing)
3562 pool->flags |= POOL_FREEZING;
3563
3564 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3565 copy_workqueue_attrs(pool->attrs, attrs);
3566
3567 /* if cpumask is contained inside a NUMA node, we belong to that node */
3568 if (wq_numa_enabled) {
3569 for_each_node(node) {
3570 if (cpumask_subset(pool->attrs->cpumask,
3571 wq_numa_possible_cpumask[node])) {
3572 pool->node = node;
3573 break;
3574 }
3575 }
3576 }
3577
3578 if (worker_pool_assign_id(pool) < 0)
3579 goto fail;
3580
3581 /* create and start the initial worker */
3582 if (create_and_start_worker(pool) < 0)
3583 goto fail;
3584
3585 /* install */
3586 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3587out_unlock:
3588 return pool;
3589fail:
3590 if (pool)
3591 put_unbound_pool(pool);
3592 return NULL;
3593}
3594
3595static void rcu_free_pwq(struct rcu_head *rcu)
3596{
3597 kmem_cache_free(pwq_cache,
3598 container_of(rcu, struct pool_workqueue, rcu));
3599}
3600
3601/*
3602 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
3603 * and needs to be destroyed.
3604 */
3605static void pwq_unbound_release_workfn(struct work_struct *work)
3606{
3607 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
3608 unbound_release_work);
3609 struct workqueue_struct *wq = pwq->wq;
3610 struct worker_pool *pool = pwq->pool;
3611 bool is_last;
3612
3613 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3614 return;
3615
3616 /*
3617 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3618 * necessary on release but do it anyway. It's easier to verify
3619 * and consistent with the linking path.
3620 */
3621 mutex_lock(&wq->mutex);
3622 list_del_rcu(&pwq->pwqs_node);
3623 is_last = list_empty(&wq->pwqs);
3624 mutex_unlock(&wq->mutex);
3625
3626 mutex_lock(&wq_pool_mutex);
3627 put_unbound_pool(pool);
3628 mutex_unlock(&wq_pool_mutex);
3629
3630 call_rcu_sched(&pwq->rcu, rcu_free_pwq);
3631
3632 /*
3633 * If we're the last pwq going away, @wq is already dead and no one
 3634 * is going to access it anymore. Free it.
3635 */
3636 if (is_last) {
3637 free_workqueue_attrs(wq->unbound_attrs);
3638 kfree(wq);
3639 }
3640}
3641
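
pwq_unbound_release_workfn() exists because the final put_pwq() happens under pool->lock, while the release path needs wq->mutex and wq_pool_mutex; the last reference therefore only schedules a work item and the heavy lifting runs later in process context. A small sketch of that "last put schedules the release" idiom with hypothetical names (my_res, my_release_workfn); the release_work is assumed to have been INIT_WORK()'d when the object was created.

        #include <linux/slab.h>
        #include <linux/workqueue.h>

        struct my_res {
                int refcnt;                     /* protected by the owner's lock */
                struct work_struct release_work;
        };

        static void my_release_workfn(struct work_struct *work)
        {
                struct my_res *res = container_of(work, struct my_res,
                                                  release_work);

                /* process context: mutexes and blocking calls are fine here */
                kfree(res);
        }

        /* called with the owner's spinlock held, possibly with irqs disabled */
        static void my_res_put(struct my_res *res)
        {
                if (--res->refcnt)
                        return;

                /* defer the sleeping part of the teardown to process context */
                schedule_work(&res->release_work);
        }
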
3642/**
3643 * pwq_adjust_max_active - update a pwq's max_active to the current setting
3644 * @pwq: target pool_workqueue
3645 *
3646 * If @pwq isn't freezing, set @pwq->max_active to the associated
3647 * workqueue's saved_max_active and activate delayed work items
3648 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
3649 */
3650static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3651{
3652 struct workqueue_struct *wq = pwq->wq;
3653 bool freezable = wq->flags & WQ_FREEZABLE;
3654
3655 /* for @wq->saved_max_active */
3656 lockdep_assert_held(&wq->mutex);
3657
3658 /* fast exit for non-freezable wqs */
3659 if (!freezable && pwq->max_active == wq->saved_max_active)
3660 return;
3661
3662 spin_lock_irq(&pwq->pool->lock);
3663
3664 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
3665 pwq->max_active = wq->saved_max_active;
3666
3667 while (!list_empty(&pwq->delayed_works) &&
3668 pwq->nr_active < pwq->max_active)
3669 pwq_activate_first_delayed(pwq);
3108 3670
3109 /* 3671 /*
3110 * Allocate enough room to align pwq and put an extra 3672 * Need to kick a worker after thawed or an unbound wq's
3111 * pointer at the end pointing back to the originally 3673 * max_active is bumped. It's a slow path. Do it always.
3112 * allocated pointer which will be used for free.
3113 */ 3674 */
3114 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3675 wake_up_worker(pwq->pool);
3115 if (ptr) { 3676 } else {
3116 wq->pool_wq.single = PTR_ALIGN(ptr, align); 3677 pwq->max_active = 0;
3117 *(void **)(wq->pool_wq.single + 1) = ptr; 3678 }
3679
3680 spin_unlock_irq(&pwq->pool->lock);
3681}
3682
3683/* initialize newly alloced @pwq which is associated with @wq and @pool */
3684static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
3685 struct worker_pool *pool)
3686{
3687 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3688
3689 memset(pwq, 0, sizeof(*pwq));
3690
3691 pwq->pool = pool;
3692 pwq->wq = wq;
3693 pwq->flush_color = -1;
3694 pwq->refcnt = 1;
3695 INIT_LIST_HEAD(&pwq->delayed_works);
3696 INIT_LIST_HEAD(&pwq->pwqs_node);
3697 INIT_LIST_HEAD(&pwq->mayday_node);
3698 INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
3699}
3700
3701/* sync @pwq with the current state of its associated wq and link it */
3702static void link_pwq(struct pool_workqueue *pwq)
3703{
3704 struct workqueue_struct *wq = pwq->wq;
3705
3706 lockdep_assert_held(&wq->mutex);
3707
3708 /* may be called multiple times, ignore if already linked */
3709 if (!list_empty(&pwq->pwqs_node))
3710 return;
3711
3712 /*
3713 * Set the matching work_color. This is synchronized with
3714 * wq->mutex to avoid confusing flush_workqueue().
3715 */
3716 pwq->work_color = wq->work_color;
3717
3718 /* sync max_active to the current setting */
3719 pwq_adjust_max_active(pwq);
3720
3721 /* link in @pwq */
3722 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
3723}
3724
3725/* obtain a pool matching @attrs and create a pwq associating the pool and @wq */
3726static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
3727 const struct workqueue_attrs *attrs)
3728{
3729 struct worker_pool *pool;
3730 struct pool_workqueue *pwq;
3731
3732 lockdep_assert_held(&wq_pool_mutex);
3733
3734 pool = get_unbound_pool(attrs);
3735 if (!pool)
3736 return NULL;
3737
3738 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
3739 if (!pwq) {
3740 put_unbound_pool(pool);
3741 return NULL;
3742 }
3743
3744 init_pwq(pwq, wq, pool);
3745 return pwq;
3746}
3747
3748/* undo alloc_unbound_pwq(), used only in the error path */
3749static void free_unbound_pwq(struct pool_workqueue *pwq)
3750{
3751 lockdep_assert_held(&wq_pool_mutex);
3752
3753 if (pwq) {
3754 put_unbound_pool(pwq->pool);
3755 kmem_cache_free(pwq_cache, pwq);
3756 }
3757}
3758
3759/**
3760 * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
3761 * @attrs: the wq_attrs of interest
3762 * @node: the target NUMA node
3763 * @cpu_going_down: if >= 0, the CPU to consider as offline
3764 * @cpumask: outarg, the resulting cpumask
3765 *
3766 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3767 * @cpu_going_down is >= 0, that cpu is considered offline during
3768 * calculation. The result is stored in @cpumask. This function returns
3769 * %true if the resulting @cpumask is different from @attrs->cpumask,
3770 * %false if equal.
3771 *
3772 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3773 * enabled and @node has online CPUs requested by @attrs, the returned
3774 * cpumask is the intersection of the possible CPUs of @node and
3775 * @attrs->cpumask.
3776 *
3777 * The caller is responsible for ensuring that the cpumask of @node stays
3778 * stable.
3779 */
3780static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3781 int cpu_going_down, cpumask_t *cpumask)
3782{
3783 if (!wq_numa_enabled || attrs->no_numa)
3784 goto use_dfl;
3785
3786 /* does @node have any online CPUs @attrs wants? */
3787 cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
3788 if (cpu_going_down >= 0)
3789 cpumask_clear_cpu(cpu_going_down, cpumask);
3790
3791 if (cpumask_empty(cpumask))
3792 goto use_dfl;
3793
3794 /* yeap, return possible CPUs in @node that @attrs wants */
3795 cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
3796 return !cpumask_equal(cpumask, attrs->cpumask);
3797
3798use_dfl:
3799 cpumask_copy(cpumask, attrs->cpumask);
3800 return false;
3801}
3802
3803/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
3804static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3805 int node,
3806 struct pool_workqueue *pwq)
3807{
3808 struct pool_workqueue *old_pwq;
3809
3810 lockdep_assert_held(&wq->mutex);
3811
3812 /* link_pwq() can handle duplicate calls */
3813 link_pwq(pwq);
3814
3815 old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3816 rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
3817 return old_pwq;
3818}
3819
3820/**
3821 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
3822 * @wq: the target workqueue
3823 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
3824 *
3825 * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
3826 * machines, this function maps a separate pwq to each NUMA node with
3827 * possible CPUs in @attrs->cpumask so that work items are affine to the
3828 * NUMA node it was issued on. Older pwqs are released as in-flight work
3829 * items finish. Note that a work item which repeatedly requeues itself
3830 * back-to-back will stay on its current pwq.
3831 *
3832 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on
3833 * failure.
3834 */
3835int apply_workqueue_attrs(struct workqueue_struct *wq,
3836 const struct workqueue_attrs *attrs)
3837{
3838 struct workqueue_attrs *new_attrs, *tmp_attrs;
3839 struct pool_workqueue **pwq_tbl, *dfl_pwq;
3840 int node, ret;
3841
3842 /* only unbound workqueues can change attributes */
3843 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
3844 return -EINVAL;
3845
3846 /* creating multiple pwqs breaks ordering guarantee */
3847 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3848 return -EINVAL;
3849
3850 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
3851 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3852 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3853 if (!pwq_tbl || !new_attrs || !tmp_attrs)
3854 goto enomem;
3855
3856 /* make a copy of @attrs and sanitize it */
3857 copy_workqueue_attrs(new_attrs, attrs);
3858 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
3859
3860 /*
3861 * We may create multiple pwqs with differing cpumasks. Make a
3862 * copy of @new_attrs which will be modified and used to obtain
3863 * pools.
3864 */
3865 copy_workqueue_attrs(tmp_attrs, new_attrs);
3866
3867 /*
3868 * CPUs should stay stable across pwq creations and installations.
3869 * Pin CPUs, determine the target cpumask for each node and create
3870 * pwqs accordingly.
3871 */
3872 get_online_cpus();
3873
3874 mutex_lock(&wq_pool_mutex);
3875
3876 /*
3877 * If something goes wrong during CPU up/down, we'll fall back to
3878 * the default pwq covering whole @attrs->cpumask. Always create
3879 * it even if we don't use it immediately.
3880 */
3881 dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
3882 if (!dfl_pwq)
3883 goto enomem_pwq;
3884
3885 for_each_node(node) {
3886 if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
3887 pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
3888 if (!pwq_tbl[node])
3889 goto enomem_pwq;
3890 } else {
3891 dfl_pwq->refcnt++;
3892 pwq_tbl[node] = dfl_pwq;
3118 } 3893 }
3119 } 3894 }
3120 3895
3121 /* just in case, make sure it's actually aligned */ 3896 mutex_unlock(&wq_pool_mutex);
3122 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); 3897
3123 return wq->pool_wq.v ? 0 : -ENOMEM; 3898 /* all pwqs have been created successfully, let's install'em */
3899 mutex_lock(&wq->mutex);
3900
3901 copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
3902
3903 /* save the previous pwq and install the new one */
3904 for_each_node(node)
3905 pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
3906
3907 /* @dfl_pwq might not have been used, ensure it's linked */
3908 link_pwq(dfl_pwq);
3909 swap(wq->dfl_pwq, dfl_pwq);
3910
3911 mutex_unlock(&wq->mutex);
3912
3913 /* put the old pwqs */
3914 for_each_node(node)
3915 put_pwq_unlocked(pwq_tbl[node]);
3916 put_pwq_unlocked(dfl_pwq);
3917
3918 put_online_cpus();
3919 ret = 0;
3920 /* fall through */
3921out_free:
3922 free_workqueue_attrs(tmp_attrs);
3923 free_workqueue_attrs(new_attrs);
3924 kfree(pwq_tbl);
3925 return ret;
3926
3927enomem_pwq:
3928 free_unbound_pwq(dfl_pwq);
3929 for_each_node(node)
3930 if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
3931 free_unbound_pwq(pwq_tbl[node]);
3932 mutex_unlock(&wq_pool_mutex);
3933 put_online_cpus();
3934enomem:
3935 ret = -ENOMEM;
3936 goto out_free;
3124} 3937}
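For reference, a minimal sketch of how a caller might use the interface added above: allocate a workqueue_attrs, adjust it, apply it to an unbound workqueue and free the local copy. The "mydrv" names are hypothetical, and since apply_workqueue_attrs() is not exported by this patch the caller is assumed to be built-in code.

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/workqueue.h>

static struct workqueue_struct *mydrv_wq;	/* hypothetical unbound workqueue */

static int __init mydrv_wq_setup(void)
{
	struct workqueue_attrs *attrs;
	int ret;

	mydrv_wq = alloc_workqueue("mydrv", WQ_UNBOUND, 0);
	if (!mydrv_wq)
		return -ENOMEM;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs) {
		destroy_workqueue(mydrv_wq);
		return -ENOMEM;
	}

	attrs->nice = -5;					/* run above default priority */
	cpumask_copy(attrs->cpumask, cpumask_of_node(0));	/* restrict to node 0's CPUs */

	ret = apply_workqueue_attrs(mydrv_wq, attrs);		/* per-node pwqs are created here */

	free_workqueue_attrs(attrs);				/* the wq keeps its own copy */
	if (ret)
		destroy_workqueue(mydrv_wq);
	return ret;
}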
3125 3938
3126static void free_pwqs(struct workqueue_struct *wq) 3939/**
3940 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
3941 * @wq: the target workqueue
3942 * @cpu: the CPU coming up or going down
3943 * @online: whether @cpu is coming up or going down
3944 *
3945 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
3946 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
3947 * @wq accordingly.
3948 *
3949 * If NUMA affinity can't be adjusted due to memory allocation failure, it
3950 * falls back to @wq->dfl_pwq which may not be optimal but is always
3951 * correct.
3952 *
3953 * Note that when the last allowed CPU of a NUMA node goes offline for a
3954 * workqueue with a cpumask spanning multiple nodes, the workers which were
3955 * already executing the work items for the workqueue will lose their CPU
3956 * affinity and may execute on any CPU. This is similar to how per-cpu
3957 * workqueues behave on CPU_DOWN. If a workqueue user wants strict
3958 * affinity, it's the user's responsibility to flush the work item from
3959 * CPU_DOWN_PREPARE.
3960 */
3961static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
3962 bool online)
3127{ 3963{
3128 if (!(wq->flags & WQ_UNBOUND)) 3964 int node = cpu_to_node(cpu);
3129 free_percpu(wq->pool_wq.pcpu); 3965 int cpu_off = online ? -1 : cpu;
3130 else if (wq->pool_wq.single) { 3966 struct pool_workqueue *old_pwq = NULL, *pwq;
3131 /* the pointer to free is stored right after the pwq */ 3967 struct workqueue_attrs *target_attrs;
3132 kfree(*(void **)(wq->pool_wq.single + 1)); 3968 cpumask_t *cpumask;
3969
3970 lockdep_assert_held(&wq_pool_mutex);
3971
3972 if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
3973 return;
3974
3975 /*
3976 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
3977 * Let's use a preallocated one. The following buf is protected by
3978 * CPU hotplug exclusion.
3979 */
3980 target_attrs = wq_update_unbound_numa_attrs_buf;
3981 cpumask = target_attrs->cpumask;
3982
3983 mutex_lock(&wq->mutex);
3984 if (wq->unbound_attrs->no_numa)
3985 goto out_unlock;
3986
3987 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3988 pwq = unbound_pwq_by_node(wq, node);
3989
3990 /*
3991 * Let's determine what needs to be done. If the target cpumask is
3992 * different from wq's, we need to compare it to @pwq's and create
3993 * a new one if they don't match. If the target cpumask equals
3994 * wq's, the default pwq should be used. If @pwq is already the
3995 * default one, nothing to do; otherwise, install the default one.
3996 */
3997 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
3998 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
3999 goto out_unlock;
4000 } else {
4001 if (pwq == wq->dfl_pwq)
4002 goto out_unlock;
4003 else
4004 goto use_dfl_pwq;
4005 }
4006
4007 mutex_unlock(&wq->mutex);
4008
4009 /* create a new pwq */
4010 pwq = alloc_unbound_pwq(wq, target_attrs);
4011 if (!pwq) {
4012 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4013 wq->name);
4014 goto out_unlock;
4015 }
4016
4017 /*
4018 * Install the new pwq. As this function is called only from CPU
4019 * hotplug callbacks and applying a new attrs is wrapped with
4020 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
4021 * inbetween.
4022 */
4023 mutex_lock(&wq->mutex);
4024 old_pwq = numa_pwq_tbl_install(wq, node, pwq);
4025 goto out_unlock;
4026
4027use_dfl_pwq:
4028 spin_lock_irq(&wq->dfl_pwq->pool->lock);
4029 get_pwq(wq->dfl_pwq);
4030 spin_unlock_irq(&wq->dfl_pwq->pool->lock);
4031 old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
4032out_unlock:
4033 mutex_unlock(&wq->mutex);
4034 put_pwq_unlocked(old_pwq);
4035}
4036
4037static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4038{
4039 bool highpri = wq->flags & WQ_HIGHPRI;
4040 int cpu;
4041
4042 if (!(wq->flags & WQ_UNBOUND)) {
4043 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
4044 if (!wq->cpu_pwqs)
4045 return -ENOMEM;
4046
4047 for_each_possible_cpu(cpu) {
4048 struct pool_workqueue *pwq =
4049 per_cpu_ptr(wq->cpu_pwqs, cpu);
4050 struct worker_pool *cpu_pools =
4051 per_cpu(cpu_worker_pools, cpu);
4052
4053 init_pwq(pwq, wq, &cpu_pools[highpri]);
4054
4055 mutex_lock(&wq->mutex);
4056 link_pwq(pwq);
4057 mutex_unlock(&wq->mutex);
4058 }
4059 return 0;
4060 } else {
4061 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
3133 } 4062 }
3134} 4063}
3135 4064
@@ -3151,30 +4080,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3151 struct lock_class_key *key, 4080 struct lock_class_key *key,
3152 const char *lock_name, ...) 4081 const char *lock_name, ...)
3153{ 4082{
3154 va_list args, args1; 4083 size_t tbl_size = 0;
4084 va_list args;
3155 struct workqueue_struct *wq; 4085 struct workqueue_struct *wq;
3156 unsigned int cpu; 4086 struct pool_workqueue *pwq;
3157 size_t namelen;
3158 4087
3159 /* determine namelen, allocate wq and format name */ 4088 /* allocate wq and format name */
3160 va_start(args, lock_name); 4089 if (flags & WQ_UNBOUND)
3161 va_copy(args1, args); 4090 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
3162 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3163 4091
3164 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); 4092 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
3165 if (!wq) 4093 if (!wq)
3166 goto err; 4094 return NULL;
3167 4095
3168 vsnprintf(wq->name, namelen, fmt, args1); 4096 if (flags & WQ_UNBOUND) {
3169 va_end(args); 4097 wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3170 va_end(args1); 4098 if (!wq->unbound_attrs)
4099 goto err_free_wq;
4100 }
3171 4101
3172 /* 4102 va_start(args, lock_name);
3173 * Workqueues which may be used during memory reclaim should 4103 vsnprintf(wq->name, sizeof(wq->name), fmt, args);
3174 * have a rescuer to guarantee forward progress. 4104 va_end(args);
3175 */
3176 if (flags & WQ_MEM_RECLAIM)
3177 flags |= WQ_RESCUER;
3178 4105
3179 max_active = max_active ?: WQ_DFL_ACTIVE; 4106 max_active = max_active ?: WQ_DFL_ACTIVE;
3180 max_active = wq_clamp_max_active(max_active, flags, wq->name); 4107 max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3182,71 +4109,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3182 /* init wq */ 4109 /* init wq */
3183 wq->flags = flags; 4110 wq->flags = flags;
3184 wq->saved_max_active = max_active; 4111 wq->saved_max_active = max_active;
3185 mutex_init(&wq->flush_mutex); 4112 mutex_init(&wq->mutex);
3186 atomic_set(&wq->nr_pwqs_to_flush, 0); 4113 atomic_set(&wq->nr_pwqs_to_flush, 0);
4114 INIT_LIST_HEAD(&wq->pwqs);
3187 INIT_LIST_HEAD(&wq->flusher_queue); 4115 INIT_LIST_HEAD(&wq->flusher_queue);
3188 INIT_LIST_HEAD(&wq->flusher_overflow); 4116 INIT_LIST_HEAD(&wq->flusher_overflow);
4117 INIT_LIST_HEAD(&wq->maydays);
3189 4118
3190 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 4119 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3191 INIT_LIST_HEAD(&wq->list); 4120 INIT_LIST_HEAD(&wq->list);
3192 4121
3193 if (alloc_pwqs(wq) < 0) 4122 if (alloc_and_link_pwqs(wq) < 0)
3194 goto err; 4123 goto err_free_wq;
3195
3196 for_each_pwq_cpu(cpu, wq) {
3197 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3198 4124
3199 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); 4125 /*
3200 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); 4126 * Workqueues which may be used during memory reclaim should
3201 pwq->wq = wq; 4127 * have a rescuer to guarantee forward progress.
3202 pwq->flush_color = -1; 4128 */
3203 pwq->max_active = max_active; 4129 if (flags & WQ_MEM_RECLAIM) {
3204 INIT_LIST_HEAD(&pwq->delayed_works);
3205 }
3206
3207 if (flags & WQ_RESCUER) {
3208 struct worker *rescuer; 4130 struct worker *rescuer;
3209 4131
3210 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) 4132 rescuer = alloc_worker();
3211 goto err;
3212
3213 wq->rescuer = rescuer = alloc_worker();
3214 if (!rescuer) 4133 if (!rescuer)
3215 goto err; 4134 goto err_destroy;
3216 4135
3217 rescuer->rescue_wq = wq; 4136 rescuer->rescue_wq = wq;
3218 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", 4137 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3219 wq->name); 4138 wq->name);
3220 if (IS_ERR(rescuer->task)) 4139 if (IS_ERR(rescuer->task)) {
3221 goto err; 4140 kfree(rescuer);
4141 goto err_destroy;
4142 }
3222 4143
3223 rescuer->task->flags |= PF_THREAD_BOUND; 4144 wq->rescuer = rescuer;
4145 rescuer->task->flags |= PF_NO_SETAFFINITY;
3224 wake_up_process(rescuer->task); 4146 wake_up_process(rescuer->task);
3225 } 4147 }
3226 4148
4149 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4150 goto err_destroy;
4151
3227 /* 4152 /*
3228 * workqueue_lock protects global freeze state and workqueues 4153 * wq_pool_mutex protects global freeze state and workqueues list.
3229 * list. Grab it, set max_active accordingly and add the new 4154 * Grab it, adjust max_active and add the new @wq to workqueues
3230 * workqueue to workqueues list. 4155 * list.
3231 */ 4156 */
3232 spin_lock(&workqueue_lock); 4157 mutex_lock(&wq_pool_mutex);
3233 4158
3234 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 4159 mutex_lock(&wq->mutex);
3235 for_each_pwq_cpu(cpu, wq) 4160 for_each_pwq(pwq, wq)
3236 get_pwq(cpu, wq)->max_active = 0; 4161 pwq_adjust_max_active(pwq);
4162 mutex_unlock(&wq->mutex);
3237 4163
3238 list_add(&wq->list, &workqueues); 4164 list_add(&wq->list, &workqueues);
3239 4165
3240 spin_unlock(&workqueue_lock); 4166 mutex_unlock(&wq_pool_mutex);
3241 4167
3242 return wq; 4168 return wq;
3243err: 4169
3244 if (wq) { 4170err_free_wq:
3245 free_pwqs(wq); 4171 free_workqueue_attrs(wq->unbound_attrs);
3246 free_mayday_mask(wq->mayday_mask); 4172 kfree(wq);
3247 kfree(wq->rescuer); 4173 return NULL;
3248 kfree(wq); 4174err_destroy:
3249 } 4175 destroy_workqueue(wq);
3250 return NULL; 4176 return NULL;
3251} 4177}
3252EXPORT_SYMBOL_GPL(__alloc_workqueue_key); 4178EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
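As a usage sketch for the allocation path above (hypothetical "mydrv" names; alloc_workqueue() is the wrapper macro around __alloc_workqueue_key()): WQ_MEM_RECLAIM is what triggers the rescuer creation seen in the new error handling, and a max_active of 0 means WQ_DFL_ACTIVE.

#include <linux/errno.h>
#include <linux/workqueue.h>

static void mydrv_work_fn(struct work_struct *work)
{
	/* deferred processing runs here */
}
static DECLARE_WORK(mydrv_work, mydrv_work_fn);

static struct workqueue_struct *mydrv_wq;

static int mydrv_start(void)
{
	/* unbound, rescuer-backed workqueue with default max_active */
	mydrv_wq = alloc_workqueue("mydrv", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!mydrv_wq)
		return -ENOMEM;

	queue_work(mydrv_wq, &mydrv_work);
	return 0;
}

static void mydrv_stop(void)
{
	/* drains pending work, then releases the pwqs (and, for unbound wqs, the wq itself via RCU) */
	destroy_workqueue(mydrv_wq);
}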
@@ -3259,60 +4185,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3259 */ 4185 */
3260void destroy_workqueue(struct workqueue_struct *wq) 4186void destroy_workqueue(struct workqueue_struct *wq)
3261{ 4187{
3262 unsigned int cpu; 4188 struct pool_workqueue *pwq;
4189 int node;
3263 4190
3264 /* drain it before proceeding with destruction */ 4191 /* drain it before proceeding with destruction */
3265 drain_workqueue(wq); 4192 drain_workqueue(wq);
3266 4193
4194 /* sanity checks */
4195 mutex_lock(&wq->mutex);
4196 for_each_pwq(pwq, wq) {
4197 int i;
4198
4199 for (i = 0; i < WORK_NR_COLORS; i++) {
4200 if (WARN_ON(pwq->nr_in_flight[i])) {
4201 mutex_unlock(&wq->mutex);
4202 return;
4203 }
4204 }
4205
4206 if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
4207 WARN_ON(pwq->nr_active) ||
4208 WARN_ON(!list_empty(&pwq->delayed_works))) {
4209 mutex_unlock(&wq->mutex);
4210 return;
4211 }
4212 }
4213 mutex_unlock(&wq->mutex);
4214
3267 /* 4215 /*
3268 * wq list is used to freeze wq, remove from list after 4216 * wq list is used to freeze wq, remove from list after
3269 * flushing is complete in case freeze races us. 4217 * flushing is complete in case freeze races us.
3270 */ 4218 */
3271 spin_lock(&workqueue_lock); 4219 mutex_lock(&wq_pool_mutex);
3272 list_del(&wq->list); 4220 list_del_init(&wq->list);
3273 spin_unlock(&workqueue_lock); 4221 mutex_unlock(&wq_pool_mutex);
3274
3275 /* sanity check */
3276 for_each_pwq_cpu(cpu, wq) {
3277 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3278 int i;
3279 4222
3280 for (i = 0; i < WORK_NR_COLORS; i++) 4223 workqueue_sysfs_unregister(wq);
3281 BUG_ON(pwq->nr_in_flight[i]);
3282 BUG_ON(pwq->nr_active);
3283 BUG_ON(!list_empty(&pwq->delayed_works));
3284 }
3285 4224
3286 if (wq->flags & WQ_RESCUER) { 4225 if (wq->rescuer) {
3287 kthread_stop(wq->rescuer->task); 4226 kthread_stop(wq->rescuer->task);
3288 free_mayday_mask(wq->mayday_mask);
3289 kfree(wq->rescuer); 4227 kfree(wq->rescuer);
4228 wq->rescuer = NULL;
3290 } 4229 }
3291 4230
3292 free_pwqs(wq); 4231 if (!(wq->flags & WQ_UNBOUND)) {
3293 kfree(wq); 4232 /*
3294} 4233 * The base ref is never dropped on per-cpu pwqs. Directly
3295EXPORT_SYMBOL_GPL(destroy_workqueue); 4234 * free the pwqs and wq.
3296 4235 */
3297/** 4236 free_percpu(wq->cpu_pwqs);
3298 * pwq_set_max_active - adjust max_active of a pwq 4237 kfree(wq);
3299 * @pwq: target pool_workqueue 4238 } else {
3300 * @max_active: new max_active value. 4239 /*
3301 * 4240 * We're the sole accessor of @wq at this point. Directly
3302 * Set @pwq->max_active to @max_active and activate delayed works if 4241 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
3303 * increased. 4242 * @wq will be freed when the last pwq is released.
3304 * 4243 */
3305 * CONTEXT: 4244 for_each_node(node) {
3306 * spin_lock_irq(pool->lock). 4245 pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3307 */ 4246 RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
3308static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) 4247 put_pwq_unlocked(pwq);
3309{ 4248 }
3310 pwq->max_active = max_active;
3311 4249
3312 while (!list_empty(&pwq->delayed_works) && 4250 /*
3313 pwq->nr_active < pwq->max_active) 4251 * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
3314 pwq_activate_first_delayed(pwq); 4252 * put. Don't access it afterwards.
4253 */
4254 pwq = wq->dfl_pwq;
4255 wq->dfl_pwq = NULL;
4256 put_pwq_unlocked(pwq);
4257 }
3315} 4258}
4259EXPORT_SYMBOL_GPL(destroy_workqueue);
3316 4260
3317/** 4261/**
3318 * workqueue_set_max_active - adjust max_active of a workqueue 4262 * workqueue_set_max_active - adjust max_active of a workqueue
@@ -3326,30 +4270,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3326 */ 4270 */
3327void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 4271void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3328{ 4272{
3329 unsigned int cpu; 4273 struct pool_workqueue *pwq;
4274
4275 /* disallow meddling with max_active for ordered workqueues */
4276 if (WARN_ON(wq->flags & __WQ_ORDERED))
4277 return;
3330 4278
3331 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 4279 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3332 4280
3333 spin_lock(&workqueue_lock); 4281 mutex_lock(&wq->mutex);
3334 4282
3335 wq->saved_max_active = max_active; 4283 wq->saved_max_active = max_active;
3336 4284
3337 for_each_pwq_cpu(cpu, wq) { 4285 for_each_pwq(pwq, wq)
3338 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4286 pwq_adjust_max_active(pwq);
3339 struct worker_pool *pool = pwq->pool;
3340 4287
3341 spin_lock_irq(&pool->lock); 4288 mutex_unlock(&wq->mutex);
3342 4289}
3343 if (!(wq->flags & WQ_FREEZABLE) || 4290EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3344 !(pool->flags & POOL_FREEZING))
3345 pwq_set_max_active(pwq, max_active);
3346 4291
3347 spin_unlock_irq(&pool->lock); 4292/**
3348 } 4293 * current_is_workqueue_rescuer - is %current workqueue rescuer?
4294 *
4295 * Determine whether %current is a workqueue rescuer. Can be used from
4296 * work functions to determine whether it's being run off the rescuer task.
4297 */
4298bool current_is_workqueue_rescuer(void)
4299{
4300 struct worker *worker = current_wq_worker();
3349 4301
3350 spin_unlock(&workqueue_lock); 4302 return worker && worker->rescue_wq;
3351} 4303}
3352EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3353 4304
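A short sketch of the intended use of current_is_workqueue_rescuer() added above, from inside a work item that runs on a WQ_MEM_RECLAIM workqueue (the "mydrv" function is hypothetical):

#include <linux/gfp.h>
#include <linux/workqueue.h>

static void mydrv_reclaim_work(struct work_struct *work)
{
	gfp_t gfp = GFP_KERNEL;

	/*
	 * If the rescuer is running this item, the regular workers are
	 * starved; avoid allocations that could recurse into reclaim.
	 */
	if (current_is_workqueue_rescuer())
		gfp = GFP_NOWAIT;

	/* ... allocate with @gfp and process the item ... */
}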
3354/** 4305/**
3355 * workqueue_congested - test whether a workqueue is congested 4306 * workqueue_congested - test whether a workqueue is congested
@@ -3363,11 +4314,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3363 * RETURNS: 4314 * RETURNS:
3364 * %true if congested, %false otherwise. 4315 * %true if congested, %false otherwise.
3365 */ 4316 */
3366bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 4317bool workqueue_congested(int cpu, struct workqueue_struct *wq)
3367{ 4318{
3368 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4319 struct pool_workqueue *pwq;
4320 bool ret;
4321
4322 rcu_read_lock_sched();
3369 4323
3370 return !list_empty(&pwq->delayed_works); 4324 if (!(wq->flags & WQ_UNBOUND))
4325 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
4326 else
4327 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
4328
4329 ret = !list_empty(&pwq->delayed_works);
4330 rcu_read_unlock_sched();
4331
4332 return ret;
3371} 4333}
3372EXPORT_SYMBOL_GPL(workqueue_congested); 4334EXPORT_SYMBOL_GPL(workqueue_congested);
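A sketch of the kind of caller workqueue_congested() is meant for, given the new per-node pwq lookup above (names hypothetical): a producer that sheds optional work when the relevant pwq already has delayed items.

#include <linux/smp.h>
#include <linux/workqueue.h>

static bool mydrv_queue_optional(struct workqueue_struct *wq,
				 struct work_struct *work)
{
	/* for unbound wqs the cpu is only used to pick the NUMA node's pwq */
	if (workqueue_congested(raw_smp_processor_id(), wq))
		return false;		/* shed the optional work */

	return queue_work(wq, work);
}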
3373 4335
@@ -3384,24 +4346,104 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
3384 */ 4346 */
3385unsigned int work_busy(struct work_struct *work) 4347unsigned int work_busy(struct work_struct *work)
3386{ 4348{
3387 struct worker_pool *pool = get_work_pool(work); 4349 struct worker_pool *pool;
3388 unsigned long flags; 4350 unsigned long flags;
3389 unsigned int ret = 0; 4351 unsigned int ret = 0;
3390 4352
3391 if (work_pending(work)) 4353 if (work_pending(work))
3392 ret |= WORK_BUSY_PENDING; 4354 ret |= WORK_BUSY_PENDING;
3393 4355
4356 local_irq_save(flags);
4357 pool = get_work_pool(work);
3394 if (pool) { 4358 if (pool) {
3395 spin_lock_irqsave(&pool->lock, flags); 4359 spin_lock(&pool->lock);
3396 if (find_worker_executing_work(pool, work)) 4360 if (find_worker_executing_work(pool, work))
3397 ret |= WORK_BUSY_RUNNING; 4361 ret |= WORK_BUSY_RUNNING;
3398 spin_unlock_irqrestore(&pool->lock, flags); 4362 spin_unlock(&pool->lock);
3399 } 4363 }
4364 local_irq_restore(flags);
3400 4365
3401 return ret; 4366 return ret;
3402} 4367}
3403EXPORT_SYMBOL_GPL(work_busy); 4368EXPORT_SYMBOL_GPL(work_busy);
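For illustration, a hypothetical diagnostics helper that decodes the bitmask returned by work_busy():

#include <linux/workqueue.h>

static const char *mydrv_work_state(struct work_struct *work)
{
	unsigned int busy = work_busy(work);

	if (busy & WORK_BUSY_RUNNING)
		return "running";
	if (busy & WORK_BUSY_PENDING)
		return "pending";
	return "idle";
}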
3404 4369
4370/**
4371 * set_worker_desc - set description for the current work item
4372 * @fmt: printf-style format string
4373 * @...: arguments for the format string
4374 *
4375 * This function can be called by a running work function to describe what
4376 * the work item is about. If the worker task gets dumped, this
 4377 * information will be printed out with the task dump to help debugging. The
4378 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
4379 */
4380void set_worker_desc(const char *fmt, ...)
4381{
4382 struct worker *worker = current_wq_worker();
4383 va_list args;
4384
4385 if (worker) {
4386 va_start(args, fmt);
4387 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
4388 va_end(args);
4389 worker->desc_valid = true;
4390 }
4391}
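A minimal sketch of set_worker_desc() in a work function, assuming a hypothetical per-request structure; the description shows up in print_worker_info() output below if the worker task is ever dumped.

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct mydrv_request {				/* hypothetical request context */
	struct work_struct	work;
	int			id;
};

static void mydrv_request_fn(struct work_struct *work)
{
	struct mydrv_request *req = container_of(work, struct mydrv_request, work);

	set_worker_desc("mydrv: req %d", req->id);

	/* ... process the request ... */
}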
4392
4393/**
4394 * print_worker_info - print out worker information and description
4395 * @log_lvl: the log level to use when printing
4396 * @task: target task
4397 *
4398 * If @task is a worker and currently executing a work item, print out the
4399 * name of the workqueue being serviced and worker description set with
4400 * set_worker_desc() by the currently executing work item.
4401 *
4402 * This function can be safely called on any task as long as the
4403 * task_struct itself is accessible. While safe, this function isn't
 4404 * synchronized and may print out mixups or garbage of limited length.
4405 */
4406void print_worker_info(const char *log_lvl, struct task_struct *task)
4407{
4408 work_func_t *fn = NULL;
4409 char name[WQ_NAME_LEN] = { };
4410 char desc[WORKER_DESC_LEN] = { };
4411 struct pool_workqueue *pwq = NULL;
4412 struct workqueue_struct *wq = NULL;
4413 bool desc_valid = false;
4414 struct worker *worker;
4415
4416 if (!(task->flags & PF_WQ_WORKER))
4417 return;
4418
4419 /*
4420 * This function is called without any synchronization and @task
4421 * could be in any state. Be careful with dereferences.
4422 */
4423 worker = probe_kthread_data(task);
4424
4425 /*
4426 * Carefully copy the associated workqueue's workfn and name. Keep
4427 * the original last '\0' in case the original contains garbage.
4428 */
4429 probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
4430 probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
4431 probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
4432 probe_kernel_read(name, wq->name, sizeof(name) - 1);
4433
4434 /* copy worker description */
4435 probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
4436 if (desc_valid)
4437 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4438
4439 if (fn || name[0] || desc[0]) {
4440 printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
4441 if (desc[0])
4442 pr_cont(" (%s)", desc);
4443 pr_cont("\n");
4444 }
4445}
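And a sketch of the consumer side, e.g. a hypothetical watchdog that augments a task dump with workqueue context (print_worker_info() is a no-op for tasks without PF_WQ_WORKER):

#include <linux/sched.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static void mydrv_dump_stuck_task(struct task_struct *task)
{
	pr_warn("task %s:%d appears stuck\n", task->comm, task->pid);
	print_worker_info(KERN_WARNING, task);
	show_stack(task, NULL);
}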
4446
3405/* 4447/*
3406 * CPU hotplug. 4448 * CPU hotplug.
3407 * 4449 *
@@ -3422,31 +4464,28 @@ static void wq_unbind_fn(struct work_struct *work)
3422 int cpu = smp_processor_id(); 4464 int cpu = smp_processor_id();
3423 struct worker_pool *pool; 4465 struct worker_pool *pool;
3424 struct worker *worker; 4466 struct worker *worker;
3425 int i; 4467 int wi;
3426 4468
3427 for_each_std_worker_pool(pool, cpu) { 4469 for_each_cpu_worker_pool(pool, cpu) {
3428 BUG_ON(cpu != smp_processor_id()); 4470 WARN_ON_ONCE(cpu != smp_processor_id());
3429 4471
3430 mutex_lock(&pool->assoc_mutex); 4472 mutex_lock(&pool->manager_mutex);
3431 spin_lock_irq(&pool->lock); 4473 spin_lock_irq(&pool->lock);
3432 4474
3433 /* 4475 /*
3434 * We've claimed all manager positions. Make all workers 4476 * We've blocked all manager operations. Make all workers
3435 * unbound and set DISASSOCIATED. Before this, all workers 4477 * unbound and set DISASSOCIATED. Before this, all workers
3436 * except for the ones which are still executing works from 4478 * except for the ones which are still executing works from
3437 * before the last CPU down must be on the cpu. After 4479 * before the last CPU down must be on the cpu. After
3438 * this, they may become diasporas. 4480 * this, they may become diasporas.
3439 */ 4481 */
3440 list_for_each_entry(worker, &pool->idle_list, entry) 4482 for_each_pool_worker(worker, wi, pool)
3441 worker->flags |= WORKER_UNBOUND;
3442
3443 for_each_busy_worker(worker, i, pool)
3444 worker->flags |= WORKER_UNBOUND; 4483 worker->flags |= WORKER_UNBOUND;
3445 4484
3446 pool->flags |= POOL_DISASSOCIATED; 4485 pool->flags |= POOL_DISASSOCIATED;
3447 4486
3448 spin_unlock_irq(&pool->lock); 4487 spin_unlock_irq(&pool->lock);
3449 mutex_unlock(&pool->assoc_mutex); 4488 mutex_unlock(&pool->manager_mutex);
3450 4489
3451 /* 4490 /*
3452 * Call schedule() so that we cross rq->lock and thus can 4491 * Call schedule() so that we cross rq->lock and thus can
@@ -3477,6 +4516,103 @@ static void wq_unbind_fn(struct work_struct *work)
3477 } 4516 }
3478} 4517}
3479 4518
4519/**
4520 * rebind_workers - rebind all workers of a pool to the associated CPU
4521 * @pool: pool of interest
4522 *
4523 * @pool->cpu is coming online. Rebind all workers to the CPU.
4524 */
4525static void rebind_workers(struct worker_pool *pool)
4526{
4527 struct worker *worker;
4528 int wi;
4529
4530 lockdep_assert_held(&pool->manager_mutex);
4531
4532 /*
4533 * Restore CPU affinity of all workers. As all idle workers should
4534 * be on the run-queue of the associated CPU before any local
 4535 * wake-ups for concurrency management happen, restore CPU affinity
4536 * of all workers first and then clear UNBOUND. As we're called
4537 * from CPU_ONLINE, the following shouldn't fail.
4538 */
4539 for_each_pool_worker(worker, wi, pool)
4540 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4541 pool->attrs->cpumask) < 0);
4542
4543 spin_lock_irq(&pool->lock);
4544
4545 for_each_pool_worker(worker, wi, pool) {
4546 unsigned int worker_flags = worker->flags;
4547
4548 /*
4549 * A bound idle worker should actually be on the runqueue
4550 * of the associated CPU for local wake-ups targeting it to
4551 * work. Kick all idle workers so that they migrate to the
4552 * associated CPU. Doing this in the same loop as
4553 * replacing UNBOUND with REBOUND is safe as no worker will
4554 * be bound before @pool->lock is released.
4555 */
4556 if (worker_flags & WORKER_IDLE)
4557 wake_up_process(worker->task);
4558
4559 /*
4560 * We want to clear UNBOUND but can't directly call
4561 * worker_clr_flags() or adjust nr_running. Atomically
4562 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
4563 * @worker will clear REBOUND using worker_clr_flags() when
4564 * it initiates the next execution cycle thus restoring
4565 * concurrency management. Note that when or whether
4566 * @worker clears REBOUND doesn't affect correctness.
4567 *
4568 * ACCESS_ONCE() is necessary because @worker->flags may be
4569 * tested without holding any lock in
4570 * wq_worker_waking_up(). Without it, NOT_RUNNING test may
4571 * fail incorrectly leading to premature concurrency
4572 * management operations.
4573 */
4574 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
4575 worker_flags |= WORKER_REBOUND;
4576 worker_flags &= ~WORKER_UNBOUND;
4577 ACCESS_ONCE(worker->flags) = worker_flags;
4578 }
4579
4580 spin_unlock_irq(&pool->lock);
4581}
4582
4583/**
4584 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
4585 * @pool: unbound pool of interest
4586 * @cpu: the CPU which is coming up
4587 *
4588 * An unbound pool may end up with a cpumask which doesn't have any online
 4589 * CPUs. When a worker of such a pool gets scheduled, the scheduler resets
4590 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
4591 * online CPU before, cpus_allowed of all its workers should be restored.
4592 */
4593static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4594{
4595 static cpumask_t cpumask;
4596 struct worker *worker;
4597 int wi;
4598
4599 lockdep_assert_held(&pool->manager_mutex);
4600
4601 /* is @cpu allowed for @pool? */
4602 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4603 return;
4604
4605 /* is @cpu the only online CPU? */
4606 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4607 if (cpumask_weight(&cpumask) != 1)
4608 return;
4609
4610 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4611 for_each_pool_worker(worker, wi, pool)
4612 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4613 pool->attrs->cpumask) < 0);
4614}
4615
3480/* 4616/*
3481 * Workqueues should be brought up before normal priority CPU notifiers. 4617 * Workqueues should be brought up before normal priority CPU notifiers.
3482 * This will be registered high priority CPU notifier. 4618 * This will be registered high priority CPU notifier.
@@ -3485,39 +4621,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3485 unsigned long action, 4621 unsigned long action,
3486 void *hcpu) 4622 void *hcpu)
3487{ 4623{
3488 unsigned int cpu = (unsigned long)hcpu; 4624 int cpu = (unsigned long)hcpu;
3489 struct worker_pool *pool; 4625 struct worker_pool *pool;
4626 struct workqueue_struct *wq;
4627 int pi;
3490 4628
3491 switch (action & ~CPU_TASKS_FROZEN) { 4629 switch (action & ~CPU_TASKS_FROZEN) {
3492 case CPU_UP_PREPARE: 4630 case CPU_UP_PREPARE:
3493 for_each_std_worker_pool(pool, cpu) { 4631 for_each_cpu_worker_pool(pool, cpu) {
3494 struct worker *worker;
3495
3496 if (pool->nr_workers) 4632 if (pool->nr_workers)
3497 continue; 4633 continue;
3498 4634 if (create_and_start_worker(pool) < 0)
3499 worker = create_worker(pool);
3500 if (!worker)
3501 return NOTIFY_BAD; 4635 return NOTIFY_BAD;
3502
3503 spin_lock_irq(&pool->lock);
3504 start_worker(worker);
3505 spin_unlock_irq(&pool->lock);
3506 } 4636 }
3507 break; 4637 break;
3508 4638
3509 case CPU_DOWN_FAILED: 4639 case CPU_DOWN_FAILED:
3510 case CPU_ONLINE: 4640 case CPU_ONLINE:
3511 for_each_std_worker_pool(pool, cpu) { 4641 mutex_lock(&wq_pool_mutex);
3512 mutex_lock(&pool->assoc_mutex);
3513 spin_lock_irq(&pool->lock);
3514 4642
3515 pool->flags &= ~POOL_DISASSOCIATED; 4643 for_each_pool(pool, pi) {
3516 rebind_workers(pool); 4644 mutex_lock(&pool->manager_mutex);
4645
4646 if (pool->cpu == cpu) {
4647 spin_lock_irq(&pool->lock);
4648 pool->flags &= ~POOL_DISASSOCIATED;
4649 spin_unlock_irq(&pool->lock);
3517 4650
3518 spin_unlock_irq(&pool->lock); 4651 rebind_workers(pool);
3519 mutex_unlock(&pool->assoc_mutex); 4652 } else if (pool->cpu < 0) {
4653 restore_unbound_workers_cpumask(pool, cpu);
4654 }
4655
4656 mutex_unlock(&pool->manager_mutex);
3520 } 4657 }
4658
4659 /* update NUMA affinity of unbound workqueues */
4660 list_for_each_entry(wq, &workqueues, list)
4661 wq_update_unbound_numa(wq, cpu, true);
4662
4663 mutex_unlock(&wq_pool_mutex);
3521 break; 4664 break;
3522 } 4665 }
3523 return NOTIFY_OK; 4666 return NOTIFY_OK;
@@ -3531,14 +4674,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3531 unsigned long action, 4674 unsigned long action,
3532 void *hcpu) 4675 void *hcpu)
3533{ 4676{
3534 unsigned int cpu = (unsigned long)hcpu; 4677 int cpu = (unsigned long)hcpu;
3535 struct work_struct unbind_work; 4678 struct work_struct unbind_work;
4679 struct workqueue_struct *wq;
3536 4680
3537 switch (action & ~CPU_TASKS_FROZEN) { 4681 switch (action & ~CPU_TASKS_FROZEN) {
3538 case CPU_DOWN_PREPARE: 4682 case CPU_DOWN_PREPARE:
3539 /* unbinding should happen on the local CPU */ 4683 /* unbinding per-cpu workers should happen on the local CPU */
3540 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4684 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3541 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4685 queue_work_on(cpu, system_highpri_wq, &unbind_work);
4686
4687 /* update NUMA affinity of unbound workqueues */
4688 mutex_lock(&wq_pool_mutex);
4689 list_for_each_entry(wq, &workqueues, list)
4690 wq_update_unbound_numa(wq, cpu, false);
4691 mutex_unlock(&wq_pool_mutex);
4692
4693 /* wait for per-cpu unbinding to finish */
3542 flush_work(&unbind_work); 4694 flush_work(&unbind_work);
3543 break; 4695 break;
3544 } 4696 }
@@ -3571,7 +4723,7 @@ static void work_for_cpu_fn(struct work_struct *work)
3571 * It is up to the caller to ensure that the cpu doesn't go offline. 4723 * It is up to the caller to ensure that the cpu doesn't go offline.
3572 * The caller must not hold any locks which would prevent @fn from completing. 4724 * The caller must not hold any locks which would prevent @fn from completing.
3573 */ 4725 */
3574long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 4726long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
3575{ 4727{
3576 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 4728 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3577 4729
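Since the hunk above changes the work_on_cpu() prototype to take an int cpu, a minimal (hypothetical) caller would look like:

#include <linux/workqueue.h>

static long mydrv_local_init(void *arg)
{
	/* runs in process context on the CPU passed to work_on_cpu() */
	return 0;
}

static long mydrv_init_on(int cpu, void *arg)
{
	return work_on_cpu(cpu, mydrv_local_init, arg);
}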
@@ -3589,44 +4741,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3589 * freeze_workqueues_begin - begin freezing workqueues 4741 * freeze_workqueues_begin - begin freezing workqueues
3590 * 4742 *
3591 * Start freezing workqueues. After this function returns, all freezable 4743 * Start freezing workqueues. After this function returns, all freezable
3592 * workqueues will queue new works to their frozen_works list instead of 4744 * workqueues will queue new works to their delayed_works list instead of
3593 * pool->worklist. 4745 * pool->worklist.
3594 * 4746 *
3595 * CONTEXT: 4747 * CONTEXT:
3596 * Grabs and releases workqueue_lock and pool->lock's. 4748 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3597 */ 4749 */
3598void freeze_workqueues_begin(void) 4750void freeze_workqueues_begin(void)
3599{ 4751{
3600 unsigned int cpu; 4752 struct worker_pool *pool;
4753 struct workqueue_struct *wq;
4754 struct pool_workqueue *pwq;
4755 int pi;
3601 4756
3602 spin_lock(&workqueue_lock); 4757 mutex_lock(&wq_pool_mutex);
3603 4758
3604 BUG_ON(workqueue_freezing); 4759 WARN_ON_ONCE(workqueue_freezing);
3605 workqueue_freezing = true; 4760 workqueue_freezing = true;
3606 4761
3607 for_each_wq_cpu(cpu) { 4762 /* set FREEZING */
3608 struct worker_pool *pool; 4763 for_each_pool(pool, pi) {
3609 struct workqueue_struct *wq; 4764 spin_lock_irq(&pool->lock);
3610 4765 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3611 for_each_std_worker_pool(pool, cpu) { 4766 pool->flags |= POOL_FREEZING;
3612 spin_lock_irq(&pool->lock); 4767 spin_unlock_irq(&pool->lock);
3613 4768 }
3614 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3615 pool->flags |= POOL_FREEZING;
3616
3617 list_for_each_entry(wq, &workqueues, list) {
3618 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3619
3620 if (pwq && pwq->pool == pool &&
3621 (wq->flags & WQ_FREEZABLE))
3622 pwq->max_active = 0;
3623 }
3624 4769
3625 spin_unlock_irq(&pool->lock); 4770 list_for_each_entry(wq, &workqueues, list) {
3626 } 4771 mutex_lock(&wq->mutex);
4772 for_each_pwq(pwq, wq)
4773 pwq_adjust_max_active(pwq);
4774 mutex_unlock(&wq->mutex);
3627 } 4775 }
3628 4776
3629 spin_unlock(&workqueue_lock); 4777 mutex_unlock(&wq_pool_mutex);
3630} 4778}
3631 4779
3632/** 4780/**
@@ -3636,7 +4784,7 @@ void freeze_workqueues_begin(void)
3636 * between freeze_workqueues_begin() and thaw_workqueues(). 4784 * between freeze_workqueues_begin() and thaw_workqueues().
3637 * 4785 *
3638 * CONTEXT: 4786 * CONTEXT:
3639 * Grabs and releases workqueue_lock. 4787 * Grabs and releases wq_pool_mutex.
3640 * 4788 *
3641 * RETURNS: 4789 * RETURNS:
3642 * %true if some freezable workqueues are still busy. %false if freezing 4790 * %true if some freezable workqueues are still busy. %false if freezing
@@ -3644,34 +4792,34 @@ void freeze_workqueues_begin(void)
3644 */ 4792 */
3645bool freeze_workqueues_busy(void) 4793bool freeze_workqueues_busy(void)
3646{ 4794{
3647 unsigned int cpu;
3648 bool busy = false; 4795 bool busy = false;
4796 struct workqueue_struct *wq;
4797 struct pool_workqueue *pwq;
3649 4798
3650 spin_lock(&workqueue_lock); 4799 mutex_lock(&wq_pool_mutex);
3651 4800
3652 BUG_ON(!workqueue_freezing); 4801 WARN_ON_ONCE(!workqueue_freezing);
3653 4802
3654 for_each_wq_cpu(cpu) { 4803 list_for_each_entry(wq, &workqueues, list) {
3655 struct workqueue_struct *wq; 4804 if (!(wq->flags & WQ_FREEZABLE))
4805 continue;
3656 /* 4806 /*
3657 * nr_active is monotonically decreasing. It's safe 4807 * nr_active is monotonically decreasing. It's safe
3658 * to peek without lock. 4808 * to peek without lock.
3659 */ 4809 */
3660 list_for_each_entry(wq, &workqueues, list) { 4810 rcu_read_lock_sched();
3661 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4811 for_each_pwq(pwq, wq) {
3662 4812 WARN_ON_ONCE(pwq->nr_active < 0);
3663 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3664 continue;
3665
3666 BUG_ON(pwq->nr_active < 0);
3667 if (pwq->nr_active) { 4813 if (pwq->nr_active) {
3668 busy = true; 4814 busy = true;
4815 rcu_read_unlock_sched();
3669 goto out_unlock; 4816 goto out_unlock;
3670 } 4817 }
3671 } 4818 }
4819 rcu_read_unlock_sched();
3672 } 4820 }
3673out_unlock: 4821out_unlock:
3674 spin_unlock(&workqueue_lock); 4822 mutex_unlock(&wq_pool_mutex);
3675 return busy; 4823 return busy;
3676} 4824}
3677 4825
@@ -3682,104 +4830,141 @@ out_unlock:
3682 * frozen works are transferred to their respective pool worklists. 4830 * frozen works are transferred to their respective pool worklists.
3683 * 4831 *
3684 * CONTEXT: 4832 * CONTEXT:
3685 * Grabs and releases workqueue_lock and pool->lock's. 4833 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3686 */ 4834 */
3687void thaw_workqueues(void) 4835void thaw_workqueues(void)
3688{ 4836{
3689 unsigned int cpu; 4837 struct workqueue_struct *wq;
4838 struct pool_workqueue *pwq;
4839 struct worker_pool *pool;
4840 int pi;
3690 4841
3691 spin_lock(&workqueue_lock); 4842 mutex_lock(&wq_pool_mutex);
3692 4843
3693 if (!workqueue_freezing) 4844 if (!workqueue_freezing)
3694 goto out_unlock; 4845 goto out_unlock;
3695 4846
3696 for_each_wq_cpu(cpu) { 4847 /* clear FREEZING */
3697 struct worker_pool *pool; 4848 for_each_pool(pool, pi) {
3698 struct workqueue_struct *wq; 4849 spin_lock_irq(&pool->lock);
4850 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4851 pool->flags &= ~POOL_FREEZING;
4852 spin_unlock_irq(&pool->lock);
4853 }
3699 4854
3700 for_each_std_worker_pool(pool, cpu) { 4855 /* restore max_active and repopulate worklist */
3701 spin_lock_irq(&pool->lock); 4856 list_for_each_entry(wq, &workqueues, list) {
4857 mutex_lock(&wq->mutex);
4858 for_each_pwq(pwq, wq)
4859 pwq_adjust_max_active(pwq);
4860 mutex_unlock(&wq->mutex);
4861 }
3702 4862
3703 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); 4863 workqueue_freezing = false;
3704 pool->flags &= ~POOL_FREEZING; 4864out_unlock:
4865 mutex_unlock(&wq_pool_mutex);
4866}
4867#endif /* CONFIG_FREEZER */
3705 4868
3706 list_for_each_entry(wq, &workqueues, list) { 4869static void __init wq_numa_init(void)
3707 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4870{
4871 cpumask_var_t *tbl;
4872 int node, cpu;
3708 4873
3709 if (!pwq || pwq->pool != pool || 4874 /* determine NUMA pwq table len - highest node id + 1 */
3710 !(wq->flags & WQ_FREEZABLE)) 4875 for_each_node(node)
3711 continue; 4876 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
3712 4877
3713 /* restore max_active and repopulate worklist */ 4878 if (num_possible_nodes() <= 1)
3714 pwq_set_max_active(pwq, wq->saved_max_active); 4879 return;
3715 }
3716 4880
3717 wake_up_worker(pool); 4881 if (wq_disable_numa) {
4882 pr_info("workqueue: NUMA affinity support disabled\n");
4883 return;
4884 }
4885
4886 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
4887 BUG_ON(!wq_update_unbound_numa_attrs_buf);
3718 4888
3719 spin_unlock_irq(&pool->lock); 4889 /*
4890 * We want masks of possible CPUs of each node which isn't readily
4891 * available. Build one from cpu_to_node() which should have been
4892 * fully initialized by now.
4893 */
4894 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
4895 BUG_ON(!tbl);
4896
4897 for_each_node(node)
4898 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
4899
4900 for_each_possible_cpu(cpu) {
4901 node = cpu_to_node(cpu);
4902 if (WARN_ON(node == NUMA_NO_NODE)) {
4903 pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
4904 /* happens iff arch is bonkers, let's just proceed */
4905 return;
3720 } 4906 }
4907 cpumask_set_cpu(cpu, tbl[node]);
3721 } 4908 }
3722 4909
3723 workqueue_freezing = false; 4910 wq_numa_possible_cpumask = tbl;
3724out_unlock: 4911 wq_numa_enabled = true;
3725 spin_unlock(&workqueue_lock);
3726} 4912}
3727#endif /* CONFIG_FREEZER */
3728 4913
3729static int __init init_workqueues(void) 4914static int __init init_workqueues(void)
3730{ 4915{
3731 unsigned int cpu; 4916 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
4917 int i, cpu;
3732 4918
3733 /* make sure we have enough bits for OFFQ pool ID */ 4919 /* make sure we have enough bits for OFFQ pool ID */
3734 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < 4920 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3735 WORK_CPU_END * NR_STD_WORKER_POOLS); 4921 WORK_CPU_END * NR_STD_WORKER_POOLS);
3736 4922
4923 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
4924
4925 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
4926
3737 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 4927 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3738 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 4928 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3739 4929
4930 wq_numa_init();
4931
3740 /* initialize CPU pools */ 4932 /* initialize CPU pools */
3741 for_each_wq_cpu(cpu) { 4933 for_each_possible_cpu(cpu) {
3742 struct worker_pool *pool; 4934 struct worker_pool *pool;
3743 4935
3744 for_each_std_worker_pool(pool, cpu) { 4936 i = 0;
3745 spin_lock_init(&pool->lock); 4937 for_each_cpu_worker_pool(pool, cpu) {
4938 BUG_ON(init_worker_pool(pool));
3746 pool->cpu = cpu; 4939 pool->cpu = cpu;
3747 pool->flags |= POOL_DISASSOCIATED; 4940 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
3748 INIT_LIST_HEAD(&pool->worklist); 4941 pool->attrs->nice = std_nice[i++];
3749 INIT_LIST_HEAD(&pool->idle_list); 4942 pool->node = cpu_to_node(cpu);
3750 hash_init(pool->busy_hash);
3751
3752 init_timer_deferrable(&pool->idle_timer);
3753 pool->idle_timer.function = idle_worker_timeout;
3754 pool->idle_timer.data = (unsigned long)pool;
3755
3756 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3757 (unsigned long)pool);
3758
3759 mutex_init(&pool->assoc_mutex);
3760 ida_init(&pool->worker_ida);
3761 4943
3762 /* alloc pool ID */ 4944 /* alloc pool ID */
4945 mutex_lock(&wq_pool_mutex);
3763 BUG_ON(worker_pool_assign_id(pool)); 4946 BUG_ON(worker_pool_assign_id(pool));
4947 mutex_unlock(&wq_pool_mutex);
3764 } 4948 }
3765 } 4949 }
3766 4950
3767 /* create the initial worker */ 4951 /* create the initial worker */
3768 for_each_online_wq_cpu(cpu) { 4952 for_each_online_cpu(cpu) {
3769 struct worker_pool *pool; 4953 struct worker_pool *pool;
3770 4954
3771 for_each_std_worker_pool(pool, cpu) { 4955 for_each_cpu_worker_pool(pool, cpu) {
3772 struct worker *worker; 4956 pool->flags &= ~POOL_DISASSOCIATED;
4957 BUG_ON(create_and_start_worker(pool) < 0);
4958 }
4959 }
3773 4960
3774 if (cpu != WORK_CPU_UNBOUND) 4961 /* create default unbound wq attrs */
3775 pool->flags &= ~POOL_DISASSOCIATED; 4962 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
4963 struct workqueue_attrs *attrs;
3776 4964
3777 worker = create_worker(pool); 4965 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
3778 BUG_ON(!worker); 4966 attrs->nice = std_nice[i];
3779 spin_lock_irq(&pool->lock); 4967 unbound_std_wq_attrs[i] = attrs;
3780 start_worker(worker);
3781 spin_unlock_irq(&pool->lock);
3782 }
3783 } 4968 }
3784 4969
3785 system_wq = alloc_workqueue("events", 0, 0); 4970 system_wq = alloc_workqueue("events", 0, 0);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 07650264ec15..ad83c96b2ece 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -29,16 +29,24 @@ struct worker {
29 struct work_struct *current_work; /* L: work being processed */ 29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */ 30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */ 31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 bool desc_valid; /* ->desc is valid */
32 struct list_head scheduled; /* L: scheduled works */ 33 struct list_head scheduled; /* L: scheduled works */
34
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36
33 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */ 39 /* L: for rescuers */
40
36 unsigned long last_active; /* L: last active timestamp */ 41 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */ 42 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */ 43 int id; /* I: worker id */
39 44
40 /* for rebinding worker to CPU */ 45 /*
 41 struct work_struct rebind_work; /* L: for busy worker */ 46 * Opaque string set with set_worker_desc(). Printed out with task
47 * dump for debugging - WARN, BUG, panic or sysrq.
48 */
49 char desc[WORKER_DESC_LEN];
42 50
43 /* used only by rescuers to point to the target workqueue */ 51 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ 52 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
@@ -58,8 +66,7 @@ static inline struct worker *current_wq_worker(void)
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c. 67 * sched.c and workqueue.c.
60 */ 68 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task, 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
63 unsigned int cpu);
64 71
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ 72#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */