path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/.gitignore | 1
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/acct.c | 7
-rw-r--r--  kernel/async.c | 40
-rw-r--r--  kernel/audit.c | 22
-rw-r--r--  kernel/audit.h | 3
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/auditfilter.c | 4
-rw-r--r--  kernel/auditsc.c | 14
-rw-r--r--  kernel/capability.c | 24
-rw-r--r--  kernel/cgroup.c | 795
-rw-r--r--  kernel/compat.c | 84
-rw-r--r--  kernel/configs.c | 2
-rw-r--r--  kernel/cpu/Makefile | 1
-rw-r--r--  kernel/cpu/idle.c | 116
-rw-r--r--  kernel/cpuset.c | 158
-rw-r--r--  kernel/debug/debug_core.c | 2
-rw-r--r--  kernel/events/core.c | 87
-rw-r--r--  kernel/events/internal.h | 2
-rw-r--r--  kernel/events/ring_buffer.c | 36
-rw-r--r--  kernel/events/uprobes.c | 300
-rw-r--r--  kernel/exit.c | 9
-rw-r--r--  kernel/extable.c | 6
-rw-r--r--  kernel/fork.c | 12
-rw-r--r--  kernel/futex.c | 46
-rw-r--r--  kernel/hrtimer.c | 33
-rw-r--r--  kernel/irq/irqdomain.c | 20
-rw-r--r--  kernel/irq/proc.c | 20
-rw-r--r--  kernel/kallsyms.c | 26
-rw-r--r--  kernel/kexec.c | 161
-rw-r--r--  kernel/kmod.c | 98
-rw-r--r--  kernel/kprobes.c | 19
-rw-r--r--  kernel/kthread.c | 111
-rw-r--r--  kernel/lockdep.c | 46
-rw-r--r--  kernel/modsign_certificate.S | 13
-rw-r--r--  kernel/module.c | 18
-rw-r--r--  kernel/mutex.c | 151
-rw-r--r--  kernel/nsproxy.c | 6
-rw-r--r--  kernel/panic.c | 6
-rw-r--r--  kernel/pid.c | 12
-rw-r--r--  kernel/pid_namespace.c | 7
-rw-r--r--  kernel/posix-cpu-timers.c | 76
-rw-r--r--  kernel/posix-timers.c | 121
-rw-r--r--  kernel/power/console.c | 116
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/suspend.c | 22
-rw-r--r--  kernel/printk.c | 177
-rw-r--r--  kernel/profile.c | 6
-rw-r--r--  kernel/ptrace.c | 80
-rw-r--r--  kernel/range.c | 3
-rw-r--r--  kernel/rcutree.c | 276
-rw-r--r--  kernel/rcutree.h | 43
-rw-r--r--  kernel/rcutree_plugin.h | 622
-rw-r--r--  kernel/rcutree_trace.c | 10
-rw-r--r--  kernel/relay.c | 14
-rw-r--r--  kernel/resource.c | 198
-rw-r--r--  kernel/rtmutex-tester.c | 5
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/clock.c | 26
-rw-r--r--  kernel/sched/core.c | 414
-rw-r--r--  kernel/sched/cpuacct.c | 296
-rw-r--r--  kernel/sched/cpuacct.h | 17
-rw-r--r--  kernel/sched/cputime.c | 240
-rw-r--r--  kernel/sched/fair.c | 158
-rw-r--r--  kernel/sched/features.h | 7
-rw-r--r--  kernel/sched/idle_task.c | 17
-rw-r--r--  kernel/sched/sched.h | 244
-rw-r--r--  kernel/sched/stats.c | 7
-rw-r--r--  kernel/seccomp.c | 2
-rw-r--r--  kernel/semaphore.c | 8
-rw-r--r--  kernel/signal.c | 18
-rw-r--r--  kernel/smp.c | 91
-rw-r--r--  kernel/smpboot.c | 14
-rw-r--r--  kernel/softirq.c | 25
-rw-r--r--  kernel/sys.c | 289
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 15
-rw-r--r--  kernel/test_kprobes.c | 2
-rw-r--r--  kernel/time.c | 11
-rw-r--r--  kernel/time/Kconfig | 80
-rw-r--r--  kernel/time/ntp.c | 105
-rw-r--r--  kernel/time/ntp_internal.h | 12
-rw-r--r--  kernel/time/tick-broadcast.c | 245
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 5
-rw-r--r--  kernel/time/tick-sched.c | 300
-rw-r--r--  kernel/time/timekeeping.c | 396
-rw-r--r--  kernel/time/timer_list.c | 104
-rw-r--r--  kernel/timer.c | 159
-rw-r--r--  kernel/trace/Kconfig | 73
-rw-r--r--  kernel/trace/blktrace.c | 30
-rw-r--r--  kernel/trace/ftrace.c | 154
-rw-r--r--  kernel/trace/ring_buffer.c | 500
-rw-r--r--  kernel/trace/trace.c | 2261
-rw-r--r--  kernel/trace/trace.h | 151
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 10
-rw-r--r--  kernel/trace/trace_entries.h | 23
-rw-r--r--  kernel/trace/trace_events.c | 1397
-rw-r--r--  kernel/trace/trace_events_filter.c | 34
-rw-r--r--  kernel/trace/trace_export.c | 4
-rw-r--r--  kernel/trace/trace_functions.c | 207
-rw-r--r--  kernel/trace/trace_functions_graph.c | 12
-rw-r--r--  kernel/trace/trace_irqsoff.c | 90
-rw-r--r--  kernel/trace/trace_kdb.c | 12
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 12
-rw-r--r--  kernel/trace/trace_output.c | 119
-rw-r--r--  kernel/trace/trace_output.h | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 8
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 93
-rw-r--r--  kernel/trace/trace_selftest.c | 51
-rw-r--r--  kernel/trace/trace_stack.c | 78
-rw-r--r--  kernel/trace/trace_stat.c | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 90
-rw-r--r--  kernel/trace/trace_uprobe.c | 203
-rw-r--r--  kernel/tracepoint.c | 21
-rw-r--r--  kernel/uid16.c | 55
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 39
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/watchdog.c | 5
-rw-r--r--  kernel/workqueue.c | 2946
-rw-r--r--  kernel/workqueue_internal.h | 19
123 files changed, 11266 insertions, 5063 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
 config_data.h
 config_data.gz
 timeconst.h
+hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index bbde5f1a4486..271fd3119af9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ endif
 
 obj-y += sched/
 obj-y += power/
+obj-y += cpu/
 
 obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -175,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey
 	openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
 		-batch -x509 -config x509.genkey \
 		-outform DER -out signing_key.x509 \
-		-keyout signing_key.priv
+		-keyout signing_key.priv 2>&1
 	@echo "###"
 	@echo "### Key pair generated."
 	@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index b9bd7f098ee5..8d6e145138bb 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	ac.ac_swaps = encode_comp_t(0);
 
 	/*
+	 * Get freeze protection. If the fs is frozen, just skip the write
+	 * as we could deadlock the system otherwise.
+	 */
+	if (!file_start_write_trylock(file))
+		goto out;
+	/*
 	 * Kernel segment override to datasegment and write it
 	 * to the accounting file.
 	 */
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 			sizeof(acct_t), &file->f_pos);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	set_fs(fs);
+	file_end_write(file);
 out:
 	revert_creds(orig_cred);
 }
diff --git a/kernel/async.c b/kernel/async.c
index 8ddee2c3e5b0..61f023ce0228 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -73,7 +73,7 @@ struct async_entry {
 	struct list_head global_list;
 	struct work_struct work;
 	async_cookie_t cookie;
-	async_func_ptr *func;
+	async_func_t func;
 	void *data;
 	struct async_domain *domain;
 };
@@ -84,24 +84,20 @@ static atomic_t entry_count;
 
 static async_cookie_t lowest_in_progress(struct async_domain *domain)
 {
-	struct async_entry *first = NULL;
+	struct list_head *pending;
 	async_cookie_t ret = ASYNC_COOKIE_MAX;
 	unsigned long flags;
 
 	spin_lock_irqsave(&async_lock, flags);
 
-	if (domain) {
-		if (!list_empty(&domain->pending))
-			first = list_first_entry(&domain->pending,
-						 struct async_entry, domain_list);
-	} else {
-		if (!list_empty(&async_global_pending))
-			first = list_first_entry(&async_global_pending,
-						 struct async_entry, global_list);
-	}
+	if (domain)
+		pending = &domain->pending;
+	else
+		pending = &async_global_pending;
 
-	if (first)
-		ret = first->cookie;
+	if (!list_empty(pending))
+		ret = list_first_entry(pending, struct async_entry,
+				       domain_list)->cookie;
 
 	spin_unlock_irqrestore(&async_lock, flags);
 	return ret;
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)
 	wake_up(&async_done);
 }
 
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
 {
 	struct async_entry *entry;
 	unsigned long flags;
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 		spin_unlock_irqrestore(&async_lock, flags);
 
 		/* low on memory.. run synchronously */
-		ptr(data, newcookie);
+		func(data, newcookie);
 		return newcookie;
 	}
 	INIT_LIST_HEAD(&entry->domain_list);
 	INIT_LIST_HEAD(&entry->global_list);
 	INIT_WORK(&entry->work, async_run_entry_fn);
-	entry->func = ptr;
+	entry->func = func;
 	entry->data = data;
 	entry->domain = domain;
 
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
 
 /**
  * async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
  * Note: This function may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
 {
-	return __async_schedule(ptr, data, &async_dfl_domain);
+	return __async_schedule(func, data, &async_dfl_domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
 /**
  * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
  * @data: data pointer to pass to the function
  * @domain: the domain
  *
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);
  * synchronization domain is specified via @domain. Note: This function
  * may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
 				     struct async_domain *domain)
 {
-	return __async_schedule(ptr, data, domain);
+	return __async_schedule(func, data, domain);
 }
 EXPORT_SYMBOL_GPL(async_schedule_domain);
 
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f15..0b084fa44b1f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -58,7 +58,7 @@
 #ifdef CONFIG_SECURITY
 #include <linux/security.h>
 #endif
-#include <linux/netlink.h>
+#include <net/netlink.h>
 #include <linux/freezer.h>
 #include <linux/tty.h>
 #include <linux/pid_namespace.h>
@@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 	/* As soon as there's any sign of userspace auditd,
 	 * start kauditd to talk to it */
-	if (!kauditd_task)
+	if (!kauditd_task) {
 		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 		if (IS_ERR(kauditd_task)) {
 			err = PTR_ERR(kauditd_task);
 			kauditd_task = NULL;
 			return err;
 		}
+		}
 	}
-
 	loginuid = audit_get_loginuid(current);
 	sessionid = audit_get_sessionid(current);
 	security_task_getsecid(current, &sid);
@@ -910,7 +910,7 @@ static void audit_receive_skb(struct sk_buff *skb)
 {
 	struct nlmsghdr *nlh;
 	/*
-	 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+	 * len MUST be signed for nlmsg_next to be able to dec it below 0
 	 * if the nlmsg_len was not aligned
 	 */
 	int len;
@@ -919,13 +919,13 @@ static void audit_receive_skb(struct sk_buff *skb)
 	nlh = nlmsg_hdr(skb);
 	len = skb->len;
 
-	while (NLMSG_OK(nlh, len)) {
+	while (nlmsg_ok(nlh, len)) {
 		err = audit_receive_msg(skb, nlh);
 		/* if err or if this message says it wants a response */
 		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
 			netlink_ack(skb, nlh, err);
 
-		nlh = NLMSG_NEXT(nlh, len);
+		nlh = nlmsg_next(nlh, &len);
 	}
 }
 
@@ -1483,7 +1483,7 @@ void audit_log_end(struct audit_buffer *ab)
 		audit_log_lost("rate limit exceeded");
 	} else {
 		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-		nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+		nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
 
 		if (audit_pid) {
 			skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1b..11468d99dad0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -59,10 +59,7 @@ struct audit_entry {
 	struct audit_krule rule;
 };
 
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
 extern int audit_ever_enabled;
-#endif
 
 extern int audit_pid;
 
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
 		}
 		spin_unlock(&hash_lock);
 		trim_marked(tree);
-		put_tree(tree);
 		drop_collected_mounts(root_mnt);
 skip_it:
+		put_tree(tree);
 		mutex_lock(&audit_filter_mutex);
 	}
 	list_del(&cursor);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06f..267436826c3b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -594,6 +594,10 @@ exit_nofree:
 	return entry;
 
 exit_free:
+	if (entry->rule.watch)
+		audit_put_watch(entry->rule.watch); /* matches initial get */
+	if (entry->rule.tree)
+		audit_put_tree(entry->rule.tree); /* that's the temporary one */
 	audit_free_rule(entry);
 	return ERR_PTR(err);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a9..c68229411a7c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)
 	}
 }
 
-static inline void audit_zero_context(struct audit_context *context,
-				      enum audit_state state)
-{
-	memset(context, 0, sizeof(*context));
-	context->state = state;
-	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
 static inline struct audit_context *audit_alloc_context(enum audit_state state)
 {
 	struct audit_context *context;
 
-	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
 		return NULL;
-	audit_zero_context(context, state);
+	context->state = state;
+	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 	INIT_LIST_HEAD(&context->killed_trees);
 	INIT_LIST_HEAD(&context->names_list);
 	return context;
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
 EXPORT_SYMBOL(ns_capable);
 
 /**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+	if (WARN_ON_ONCE(!cap_valid(cap)))
+		return false;
+
+	if (security_capable(file->f_cred, ns, cap) == 0)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a32f9432666c..2a9926275f80 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -30,7 +30,6 @@
30#include <linux/cred.h> 30#include <linux/cred.h>
31#include <linux/ctype.h> 31#include <linux/ctype.h>
32#include <linux/errno.h> 32#include <linux/errno.h>
33#include <linux/fs.h>
34#include <linux/init_task.h> 33#include <linux/init_task.h>
35#include <linux/kernel.h> 34#include <linux/kernel.h>
36#include <linux/list.h> 35#include <linux/list.h>
@@ -59,7 +58,7 @@
59#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 58#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
60#include <linux/eventfd.h> 59#include <linux/eventfd.h>
61#include <linux/poll.h> 60#include <linux/poll.h>
62#include <linux/flex_array.h> /* used in cgroup_attach_proc */ 61#include <linux/flex_array.h> /* used in cgroup_attach_task */
63#include <linux/kthread.h> 62#include <linux/kthread.h>
64 63
65#include <linux/atomic.h> 64#include <linux/atomic.h>
@@ -83,7 +82,13 @@
83 * B happens only through cgroup_show_options() and using cgroup_root_mutex 82 * B happens only through cgroup_show_options() and using cgroup_root_mutex
84 * breaks it. 83 * breaks it.
85 */ 84 */
85#ifdef CONFIG_PROVE_RCU
86DEFINE_MUTEX(cgroup_mutex);
87EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */
88#else
86static DEFINE_MUTEX(cgroup_mutex); 89static DEFINE_MUTEX(cgroup_mutex);
90#endif
91
87static DEFINE_MUTEX(cgroup_root_mutex); 92static DEFINE_MUTEX(cgroup_root_mutex);
88 93
89/* 94/*
@@ -98,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
98#include <linux/cgroup_subsys.h> 103#include <linux/cgroup_subsys.h>
99}; 104};
100 105
101#define MAX_CGROUP_ROOT_NAMELEN 64
102
103/*
104 * A cgroupfs_root represents the root of a cgroup hierarchy,
105 * and may be associated with a superblock to form an active
106 * hierarchy
107 */
108struct cgroupfs_root {
109 struct super_block *sb;
110
111 /*
112 * The bitmask of subsystems intended to be attached to this
113 * hierarchy
114 */
115 unsigned long subsys_mask;
116
117 /* Unique id for this hierarchy. */
118 int hierarchy_id;
119
120 /* The bitmask of subsystems currently attached to this hierarchy */
121 unsigned long actual_subsys_mask;
122
123 /* A list running through the attached subsystems */
124 struct list_head subsys_list;
125
126 /* The root cgroup for this hierarchy */
127 struct cgroup top_cgroup;
128
129 /* Tracks how many cgroups are currently defined in hierarchy.*/
130 int number_of_cgroups;
131
132 /* A list running through the active hierarchies */
133 struct list_head root_list;
134
135 /* All cgroups on this root, cgroup_mutex protected */
136 struct list_head allcg_list;
137
138 /* Hierarchy-specific flags */
139 unsigned long flags;
140
141 /* IDs for cgroups in this hierarchy */
142 struct ida cgroup_ida;
143
144 /* The path to use for release notifications. */
145 char release_agent_path[PATH_MAX];
146
147 /* The name for this hierarchy - may be empty */
148 char name[MAX_CGROUP_ROOT_NAMELEN];
149};
150
151/* 106/*
152 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 107 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
153 * subsystems that are otherwise unattached - it never has more than a 108 * subsystems that are otherwise unattached - it never has more than a
@@ -162,6 +117,9 @@ struct cfent {
162 struct list_head node; 117 struct list_head node;
163 struct dentry *dentry; 118 struct dentry *dentry;
164 struct cftype *type; 119 struct cftype *type;
120
121 /* file xattrs */
122 struct simple_xattrs xattrs;
165}; 123};
166 124
167/* 125/*
@@ -238,6 +196,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
238/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 196/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
239#define dummytop (&rootnode.top_cgroup) 197#define dummytop (&rootnode.top_cgroup)
240 198
199static struct cgroup_name root_cgroup_name = { .name = "/" };
200
241/* This flag indicates whether tasks in the fork and exit paths should 201/* This flag indicates whether tasks in the fork and exit paths should
242 * check for fork/exit handlers to call. This avoids us having to do 202 * check for fork/exit handlers to call. This avoids us having to do
243 * extra work in the fork/exit path if none of the subsystems need to 203 * extra work in the fork/exit path if none of the subsystems need to
@@ -249,20 +209,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
249static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 209static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
250 struct cftype cfts[], bool is_add); 210 struct cftype cfts[], bool is_add);
251 211
252#ifdef CONFIG_PROVE_LOCKING
253int cgroup_lock_is_held(void)
254{
255 return lockdep_is_held(&cgroup_mutex);
256}
257#else /* #ifdef CONFIG_PROVE_LOCKING */
258int cgroup_lock_is_held(void)
259{
260 return mutex_is_locked(&cgroup_mutex);
261}
262#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
263
264EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
265
266static int css_unbias_refcnt(int refcnt) 212static int css_unbias_refcnt(int refcnt)
267{ 213{
268 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; 214 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
@@ -282,11 +228,25 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
282 return test_bit(CGRP_REMOVED, &cgrp->flags); 228 return test_bit(CGRP_REMOVED, &cgrp->flags);
283} 229}
284 230
285/* bits in struct cgroupfs_root flags field */ 231/**
286enum { 232 * cgroup_is_descendant - test ancestry
287 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 233 * @cgrp: the cgroup to be tested
288 ROOT_XATTR, /* supports extended attributes */ 234 * @ancestor: possible ancestor of @cgrp
289}; 235 *
236 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
237 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
238 * and @ancestor are accessible.
239 */
240bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
241{
242 while (cgrp) {
243 if (cgrp == ancestor)
244 return true;
245 cgrp = cgrp->parent;
246 }
247 return false;
248}
249EXPORT_SYMBOL_GPL(cgroup_is_descendant);
290 250
291static int cgroup_is_releasable(const struct cgroup *cgrp) 251static int cgroup_is_releasable(const struct cgroup *cgrp)
292{ 252{
@@ -327,6 +287,23 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
327 return __d_cfe(dentry)->type; 287 return __d_cfe(dentry)->type;
328} 288}
329 289
290/**
291 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
292 * @cgrp: the cgroup to be checked for liveness
293 *
294 * On success, returns true; the mutex should be later unlocked. On
295 * failure returns false with no lock held.
296 */
297static bool cgroup_lock_live_group(struct cgroup *cgrp)
298{
299 mutex_lock(&cgroup_mutex);
300 if (cgroup_is_removed(cgrp)) {
301 mutex_unlock(&cgroup_mutex);
302 return false;
303 }
304 return true;
305}
306
330/* the list of cgroups eligible for automatic release. Protected by 307/* the list of cgroups eligible for automatic release. Protected by
331 * release_list_lock */ 308 * release_list_lock */
332static LIST_HEAD(release_list); 309static LIST_HEAD(release_list);
@@ -800,27 +777,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
800 * update of a tasks cgroup pointer by cgroup_attach_task() 777 * update of a tasks cgroup pointer by cgroup_attach_task()
801 */ 778 */
802 779
803/**
804 * cgroup_lock - lock out any changes to cgroup structures
805 *
806 */
807void cgroup_lock(void)
808{
809 mutex_lock(&cgroup_mutex);
810}
811EXPORT_SYMBOL_GPL(cgroup_lock);
812
813/**
814 * cgroup_unlock - release lock on cgroup changes
815 *
816 * Undo the lock taken in a previous cgroup_lock() call.
817 */
818void cgroup_unlock(void)
819{
820 mutex_unlock(&cgroup_mutex);
821}
822EXPORT_SYMBOL_GPL(cgroup_unlock);
823
824/* 780/*
825 * A couple of forward declarations required, due to cyclic reference loop: 781 * A couple of forward declarations required, due to cyclic reference loop:
826 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 782 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
@@ -859,6 +815,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
859 return inode; 815 return inode;
860} 816}
861 817
818static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
819{
820 struct cgroup_name *name;
821
822 name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
823 if (!name)
824 return NULL;
825 strcpy(name->name, dentry->d_name.name);
826 return name;
827}
828
862static void cgroup_free_fn(struct work_struct *work) 829static void cgroup_free_fn(struct work_struct *work)
863{ 830{
864 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 831 struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
@@ -875,8 +842,18 @@ static void cgroup_free_fn(struct work_struct *work)
875 mutex_unlock(&cgroup_mutex); 842 mutex_unlock(&cgroup_mutex);
876 843
877 /* 844 /*
845 * We get a ref to the parent's dentry, and put the ref when
846 * this cgroup is being freed, so it's guaranteed that the
847 * parent won't be destroyed before its children.
848 */
849 dput(cgrp->parent->dentry);
850
851 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id);
852
853 /*
878 * Drop the active superblock reference that we took when we 854 * Drop the active superblock reference that we took when we
879 * created the cgroup 855 * created the cgroup. This will free cgrp->root, if we are
856 * holding the last reference to @sb.
880 */ 857 */
881 deactivate_super(cgrp->root->sb); 858 deactivate_super(cgrp->root->sb);
882 859
@@ -888,7 +865,7 @@ static void cgroup_free_fn(struct work_struct *work)
888 865
889 simple_xattrs_free(&cgrp->xattrs); 866 simple_xattrs_free(&cgrp->xattrs);
890 867
891 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 868 kfree(rcu_dereference_raw(cgrp->name));
892 kfree(cgrp); 869 kfree(cgrp);
893} 870}
894 871
@@ -910,13 +887,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
910 } else { 887 } else {
911 struct cfent *cfe = __d_cfe(dentry); 888 struct cfent *cfe = __d_cfe(dentry);
912 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 889 struct cgroup *cgrp = dentry->d_parent->d_fsdata;
913 struct cftype *cft = cfe->type;
914 890
915 WARN_ONCE(!list_empty(&cfe->node) && 891 WARN_ONCE(!list_empty(&cfe->node) &&
916 cgrp != &cgrp->root->top_cgroup, 892 cgrp != &cgrp->root->top_cgroup,
917 "cfe still linked for %s\n", cfe->type->name); 893 "cfe still linked for %s\n", cfe->type->name);
894 simple_xattrs_free(&cfe->xattrs);
918 kfree(cfe); 895 kfree(cfe);
919 simple_xattrs_free(&cft->xattrs);
920 } 896 }
921 iput(inode); 897 iput(inode);
922} 898}
@@ -1108,9 +1084,11 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
1108 mutex_lock(&cgroup_root_mutex); 1084 mutex_lock(&cgroup_root_mutex);
1109 for_each_subsys(root, ss) 1085 for_each_subsys(root, ss)
1110 seq_printf(seq, ",%s", ss->name); 1086 seq_printf(seq, ",%s", ss->name);
1111 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1087 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1088 seq_puts(seq, ",sane_behavior");
1089 if (root->flags & CGRP_ROOT_NOPREFIX)
1112 seq_puts(seq, ",noprefix"); 1090 seq_puts(seq, ",noprefix");
1113 if (test_bit(ROOT_XATTR, &root->flags)) 1091 if (root->flags & CGRP_ROOT_XATTR)
1114 seq_puts(seq, ",xattr"); 1092 seq_puts(seq, ",xattr");
1115 if (strlen(root->release_agent_path)) 1093 if (strlen(root->release_agent_path))
1116 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1094 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
@@ -1172,8 +1150,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1172 all_ss = true; 1150 all_ss = true;
1173 continue; 1151 continue;
1174 } 1152 }
1153 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1154 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1155 continue;
1156 }
1175 if (!strcmp(token, "noprefix")) { 1157 if (!strcmp(token, "noprefix")) {
1176 set_bit(ROOT_NOPREFIX, &opts->flags); 1158 opts->flags |= CGRP_ROOT_NOPREFIX;
1177 continue; 1159 continue;
1178 } 1160 }
1179 if (!strcmp(token, "clone_children")) { 1161 if (!strcmp(token, "clone_children")) {
@@ -1181,7 +1163,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1181 continue; 1163 continue;
1182 } 1164 }
1183 if (!strcmp(token, "xattr")) { 1165 if (!strcmp(token, "xattr")) {
1184 set_bit(ROOT_XATTR, &opts->flags); 1166 opts->flags |= CGRP_ROOT_XATTR;
1185 continue; 1167 continue;
1186 } 1168 }
1187 if (!strncmp(token, "release_agent=", 14)) { 1169 if (!strncmp(token, "release_agent=", 14)) {
@@ -1259,13 +1241,26 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1259 1241
1260 /* Consistency checks */ 1242 /* Consistency checks */
1261 1243
1244 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1245 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1246
1247 if (opts->flags & CGRP_ROOT_NOPREFIX) {
1248 pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
1249 return -EINVAL;
1250 }
1251
1252 if (opts->cpuset_clone_children) {
1253 pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
1254 return -EINVAL;
1255 }
1256 }
1257
1262 /* 1258 /*
1263 * Option noprefix was introduced just for backward compatibility 1259 * Option noprefix was introduced just for backward compatibility
1264 * with the old cpuset, so we allow noprefix only if mounting just 1260 * with the old cpuset, so we allow noprefix only if mounting just
1265 * the cpuset subsystem. 1261 * the cpuset subsystem.
1266 */ 1262 */
1267 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1263 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1268 (opts->subsys_mask & mask))
1269 return -EINVAL; 1264 return -EINVAL;
1270 1265
1271 1266
@@ -1336,6 +1331,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
1336 struct cgroup_sb_opts opts; 1331 struct cgroup_sb_opts opts;
1337 unsigned long added_mask, removed_mask; 1332 unsigned long added_mask, removed_mask;
1338 1333
1334 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1335 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1336 return -EINVAL;
1337 }
1338
1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1339 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
1340 mutex_lock(&cgroup_mutex); 1340 mutex_lock(&cgroup_mutex);
1341 mutex_lock(&cgroup_root_mutex); 1341 mutex_lock(&cgroup_root_mutex);
@@ -1421,7 +1421,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
1421 INIT_LIST_HEAD(&root->allcg_list); 1421 INIT_LIST_HEAD(&root->allcg_list);
1422 root->number_of_cgroups = 1; 1422 root->number_of_cgroups = 1;
1423 cgrp->root = root; 1423 cgrp->root = root;
1424 cgrp->top_cgroup = cgrp; 1424 cgrp->name = &root_cgroup_name;
1425 init_cgroup_housekeeping(cgrp); 1425 init_cgroup_housekeeping(cgrp);
1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 1426 list_add_tail(&cgrp->allcg_node, &root->allcg_list);
1427} 1427}
@@ -1685,6 +1685,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1685 * any) is not needed 1685 * any) is not needed
1686 */ 1686 */
1687 cgroup_drop_root(opts.new_root); 1687 cgroup_drop_root(opts.new_root);
1688
1689 if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
1690 root->flags != opts.flags) {
1691 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1692 ret = -EINVAL;
1693 goto drop_new_super;
1694 }
1695
1688 /* no subsys rebinding, so refcounts don't change */ 1696 /* no subsys rebinding, so refcounts don't change */
1689 drop_parsed_module_refcounts(opts.subsys_mask); 1697 drop_parsed_module_refcounts(opts.subsys_mask);
1690 } 1698 }
@@ -1769,49 +1777,48 @@ static struct kobject *cgroup_kobj;
1769 * @buf: the buffer to write the path into 1777 * @buf: the buffer to write the path into
1770 * @buflen: the length of the buffer 1778 * @buflen: the length of the buffer
1771 * 1779 *
1772 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1780 * Writes path of cgroup into buf. Returns 0 on success, -errno on error.
1773 * reference. Writes path of cgroup into buf. Returns 0 on success, 1781 *
1774 * -errno on error. 1782 * We can't generate cgroup path using dentry->d_name, as accessing
1783 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
1784 * inode's i_mutex, while on the other hand cgroup_path() can be called
1785 * with some irq-safe spinlocks held.
1775 */ 1786 */
1776int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1787int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1777{ 1788{
1778 struct dentry *dentry = cgrp->dentry; 1789 int ret = -ENAMETOOLONG;
1779 char *start; 1790 char *start;
1780 1791
1781 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1792 if (!cgrp->parent) {
1782 "cgroup_path() called without proper locking"); 1793 if (strlcpy(buf, "/", buflen) >= buflen)
1783 1794 return -ENAMETOOLONG;
1784 if (cgrp == dummytop) {
1785 /*
1786 * Inactive subsystems have no dentry for their root
1787 * cgroup
1788 */
1789 strcpy(buf, "/");
1790 return 0; 1795 return 0;
1791 } 1796 }
1792 1797
1793 start = buf + buflen - 1; 1798 start = buf + buflen - 1;
1794
1795 *start = '\0'; 1799 *start = '\0';
1796 for (;;) {
1797 int len = dentry->d_name.len;
1798 1800
1801 rcu_read_lock();
1802 do {
1803 const char *name = cgroup_name(cgrp);
1804 int len;
1805
1806 len = strlen(name);
1799 if ((start -= len) < buf) 1807 if ((start -= len) < buf)
1800 return -ENAMETOOLONG; 1808 goto out;
1801 memcpy(start, dentry->d_name.name, len); 1809 memcpy(start, name, len);
1802 cgrp = cgrp->parent;
1803 if (!cgrp)
1804 break;
1805 1810
1806 dentry = cgrp->dentry;
1807 if (!cgrp->parent)
1808 continue;
1809 if (--start < buf) 1811 if (--start < buf)
1810 return -ENAMETOOLONG; 1812 goto out;
1811 *start = '/'; 1813 *start = '/';
1812 } 1814
1815 cgrp = cgrp->parent;
1816 } while (cgrp->parent);
1817 ret = 0;
1813 memmove(buf, start, buf + buflen - start); 1818 memmove(buf, start, buf + buflen - start);
1814 return 0; 1819out:
1820 rcu_read_unlock();
1821 return ret;
1815} 1822}
1816EXPORT_SYMBOL_GPL(cgroup_path); 1823EXPORT_SYMBOL_GPL(cgroup_path);
1817 1824
@@ -1900,7 +1907,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1900 * 1907 *
1901 * Must be called with cgroup_mutex and threadgroup locked. 1908 * Must be called with cgroup_mutex and threadgroup locked.
1902 */ 1909 */
1903static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1910static void cgroup_task_migrate(struct cgroup *oldcgrp,
1904 struct task_struct *tsk, struct css_set *newcg) 1911 struct task_struct *tsk, struct css_set *newcg)
1905{ 1912{
1906 struct css_set *oldcg; 1913 struct css_set *oldcg;
@@ -1933,121 +1940,22 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1933} 1940}
1934 1941
1935/** 1942/**
1936 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1943 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
1937 * @cgrp: the cgroup the task is attaching to
1938 * @tsk: the task to be attached
1939 *
1940 * Call with cgroup_mutex and threadgroup locked. May take task_lock of
1941 * @tsk during call.
1942 */
1943int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1944{
1945 int retval = 0;
1946 struct cgroup_subsys *ss, *failed_ss = NULL;
1947 struct cgroup *oldcgrp;
1948 struct cgroupfs_root *root = cgrp->root;
1949 struct cgroup_taskset tset = { };
1950 struct css_set *newcg;
1951
1952 /* @tsk either already exited or can't exit until the end */
1953 if (tsk->flags & PF_EXITING)
1954 return -ESRCH;
1955
1956 /* Nothing to do if the task is already in that cgroup */
1957 oldcgrp = task_cgroup_from_root(tsk, root);
1958 if (cgrp == oldcgrp)
1959 return 0;
1960
1961 tset.single.task = tsk;
1962 tset.single.cgrp = oldcgrp;
1963
1964 for_each_subsys(root, ss) {
1965 if (ss->can_attach) {
1966 retval = ss->can_attach(cgrp, &tset);
1967 if (retval) {
1968 /*
1969 * Remember on which subsystem the can_attach()
1970 * failed, so that we only call cancel_attach()
1971 * against the subsystems whose can_attach()
1972 * succeeded. (See below)
1973 */
1974 failed_ss = ss;
1975 goto out;
1976 }
1977 }
1978 }
1979
1980 newcg = find_css_set(tsk->cgroups, cgrp);
1981 if (!newcg) {
1982 retval = -ENOMEM;
1983 goto out;
1984 }
1985
1986 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1987
1988 for_each_subsys(root, ss) {
1989 if (ss->attach)
1990 ss->attach(cgrp, &tset);
1991 }
1992
1993out:
1994 if (retval) {
1995 for_each_subsys(root, ss) {
1996 if (ss == failed_ss)
1997 /*
1998 * This subsystem was the one that failed the
1999 * can_attach() check earlier, so we don't need
2000 * to call cancel_attach() against it or any
2001 * remaining subsystems.
2002 */
2003 break;
2004 if (ss->cancel_attach)
2005 ss->cancel_attach(cgrp, &tset);
2006 }
2007 }
2008 return retval;
2009}
2010
2011/**
2012 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2013 * @from: attach to all cgroups of a given task
2014 * @tsk: the task to be attached
2015 */
2016int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2017{
2018 struct cgroupfs_root *root;
2019 int retval = 0;
2020
2021 cgroup_lock();
2022 for_each_active_root(root) {
2023 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2024
2025 retval = cgroup_attach_task(from_cg, tsk);
2026 if (retval)
2027 break;
2028 }
2029 cgroup_unlock();
2030
2031 return retval;
2032}
2033EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2034
2035/**
2036 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2037 * @cgrp: the cgroup to attach to 1944 * @cgrp: the cgroup to attach to
2038 * @leader: the threadgroup leader task_struct of the group to be attached 1945 * @tsk: the task or the leader of the threadgroup to be attached
1946 * @threadgroup: attach the whole threadgroup?
2039 * 1947 *
2040 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1948 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
2041 * task_lock of each thread in leader's threadgroup individually in turn. 1949 * task_lock of @tsk or each thread in the threadgroup individually in turn.
2042 */ 1950 */
2043static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1951static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
1952 bool threadgroup)
2044{ 1953{
2045 int retval, i, group_size; 1954 int retval, i, group_size;
2046 struct cgroup_subsys *ss, *failed_ss = NULL; 1955 struct cgroup_subsys *ss, *failed_ss = NULL;
2047 /* guaranteed to be initialized later, but the compiler needs this */
2048 struct cgroupfs_root *root = cgrp->root; 1956 struct cgroupfs_root *root = cgrp->root;
2049 /* threadgroup list cursor and array */ 1957 /* threadgroup list cursor and array */
2050 struct task_struct *tsk; 1958 struct task_struct *leader = tsk;
2051 struct task_and_cgroup *tc; 1959 struct task_and_cgroup *tc;
2052 struct flex_array *group; 1960 struct flex_array *group;
2053 struct cgroup_taskset tset = { }; 1961 struct cgroup_taskset tset = { };
@@ -2059,17 +1967,19 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2059 * group - group_rwsem prevents new threads from appearing, and if 1967 * group - group_rwsem prevents new threads from appearing, and if
2060 * threads exit, this will just be an over-estimate. 1968 * threads exit, this will just be an over-estimate.
2061 */ 1969 */
2062 group_size = get_nr_threads(leader); 1970 if (threadgroup)
1971 group_size = get_nr_threads(tsk);
1972 else
1973 group_size = 1;
2063 /* flex_array supports very large thread-groups better than kmalloc. */ 1974 /* flex_array supports very large thread-groups better than kmalloc. */
2064 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 1975 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
2065 if (!group) 1976 if (!group)
2066 return -ENOMEM; 1977 return -ENOMEM;
2067 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 1978 /* pre-allocate to guarantee space while iterating in rcu read-side. */
2068 retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); 1979 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
2069 if (retval) 1980 if (retval)
2070 goto out_free_group_list; 1981 goto out_free_group_list;
2071 1982
2072 tsk = leader;
2073 i = 0; 1983 i = 0;
2074 /* 1984 /*
2075 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1985 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2098,6 +2008,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2098 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2008 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2099 BUG_ON(retval != 0); 2009 BUG_ON(retval != 0);
2100 i++; 2010 i++;
2011
2012 if (!threadgroup)
2013 break;
2101 } while_each_thread(leader, tsk); 2014 } while_each_thread(leader, tsk);
2102 rcu_read_unlock(); 2015 rcu_read_unlock();
2103 /* remember the number of threads in the array for later. */ 2016 /* remember the number of threads in the array for later. */
@@ -2143,7 +2056,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2143 */ 2056 */
2144 for (i = 0; i < group_size; i++) { 2057 for (i = 0; i < group_size; i++) {
2145 tc = flex_array_get(group, i); 2058 tc = flex_array_get(group, i);
2146 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2059 cgroup_task_migrate(tc->cgrp, tc->task, tc->cg);
2147 } 2060 }
2148 /* nothing is sensitive to fork() after this point. */ 2061 /* nothing is sensitive to fork() after this point. */
2149 2062
@@ -2224,11 +2137,11 @@ retry_find_task:
2224 tsk = tsk->group_leader; 2137 tsk = tsk->group_leader;
2225 2138
2226 /* 2139 /*
2227 * Workqueue threads may acquire PF_THREAD_BOUND and become 2140 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2228 * trapped in a cpuset, or RT worker may be born in a cgroup 2141 * trapped in a cpuset, or RT worker may be born in a cgroup
2229 * with no rt_runtime allocated. Just say no. 2142 * with no rt_runtime allocated. Just say no.
2230 */ 2143 */
2231 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { 2144 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2232 ret = -EINVAL; 2145 ret = -EINVAL;
2233 rcu_read_unlock(); 2146 rcu_read_unlock();
2234 goto out_unlock_cgroup; 2147 goto out_unlock_cgroup;
@@ -2251,17 +2164,42 @@ retry_find_task:
2251 put_task_struct(tsk); 2164 put_task_struct(tsk);
2252 goto retry_find_task; 2165 goto retry_find_task;
2253 } 2166 }
2254 ret = cgroup_attach_proc(cgrp, tsk); 2167 }
2255 } else 2168
2256 ret = cgroup_attach_task(cgrp, tsk); 2169 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2170
2257 threadgroup_unlock(tsk); 2171 threadgroup_unlock(tsk);
2258 2172
2259 put_task_struct(tsk); 2173 put_task_struct(tsk);
2260out_unlock_cgroup: 2174out_unlock_cgroup:
2261 cgroup_unlock(); 2175 mutex_unlock(&cgroup_mutex);
2262 return ret; 2176 return ret;
2263} 2177}
2264 2178
2179/**
2180 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2181 * @from: attach to all cgroups of a given task
2182 * @tsk: the task to be attached
2183 */
2184int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2185{
2186 struct cgroupfs_root *root;
2187 int retval = 0;
2188
2189 mutex_lock(&cgroup_mutex);
2190 for_each_active_root(root) {
2191 struct cgroup *from_cg = task_cgroup_from_root(from, root);
2192
2193 retval = cgroup_attach_task(from_cg, tsk, false);
2194 if (retval)
2195 break;
2196 }
2197 mutex_unlock(&cgroup_mutex);
2198
2199 return retval;
2200}
2201EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2202
2265static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2203static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2266{ 2204{
2267 return attach_task_by_pid(cgrp, pid, false); 2205 return attach_task_by_pid(cgrp, pid, false);
@@ -2272,24 +2210,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2272 return attach_task_by_pid(cgrp, tgid, true); 2210 return attach_task_by_pid(cgrp, tgid, true);
2273} 2211}
2274 2212
2275/**
2276 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
2277 * @cgrp: the cgroup to be checked for liveness
2278 *
2279 * On success, returns true; the lock should be later released with
2280 * cgroup_unlock(). On failure returns false with no lock held.
2281 */
2282bool cgroup_lock_live_group(struct cgroup *cgrp)
2283{
2284 mutex_lock(&cgroup_mutex);
2285 if (cgroup_is_removed(cgrp)) {
2286 mutex_unlock(&cgroup_mutex);
2287 return false;
2288 }
2289 return true;
2290}
2291EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
2292
2293static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2213static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2294 const char *buffer) 2214 const char *buffer)
2295{ 2215{
@@ -2301,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
2301 mutex_lock(&cgroup_root_mutex); 2221 mutex_lock(&cgroup_root_mutex);
2302 strcpy(cgrp->root->release_agent_path, buffer); 2222 strcpy(cgrp->root->release_agent_path, buffer);
2303 mutex_unlock(&cgroup_root_mutex); 2223 mutex_unlock(&cgroup_root_mutex);
2304 cgroup_unlock(); 2224 mutex_unlock(&cgroup_mutex);
2305 return 0; 2225 return 0;
2306} 2226}
2307 2227
@@ -2312,7 +2232,14 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
2312 return -ENODEV; 2232 return -ENODEV;
2313 seq_puts(seq, cgrp->root->release_agent_path); 2233 seq_puts(seq, cgrp->root->release_agent_path);
2314 seq_putc(seq, '\n'); 2234 seq_putc(seq, '\n');
2315 cgroup_unlock(); 2235 mutex_unlock(&cgroup_mutex);
2236 return 0;
2237}
2238
2239static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
2240 struct seq_file *seq)
2241{
2242 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2316 return 0; 2243 return 0;
2317} 2244}
2318 2245
@@ -2537,13 +2464,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file)
2537static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2464static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
2538 struct inode *new_dir, struct dentry *new_dentry) 2465 struct inode *new_dir, struct dentry *new_dentry)
2539{ 2466{
2467 int ret;
2468 struct cgroup_name *name, *old_name;
2469 struct cgroup *cgrp;
2470
2471 /*
2472 * It's convinient to use parent dir's i_mutex to protected
2473 * cgrp->name.
2474 */
2475 lockdep_assert_held(&old_dir->i_mutex);
2476
2540 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2477 if (!S_ISDIR(old_dentry->d_inode->i_mode))
2541 return -ENOTDIR; 2478 return -ENOTDIR;
2542 if (new_dentry->d_inode) 2479 if (new_dentry->d_inode)
2543 return -EEXIST; 2480 return -EEXIST;
2544 if (old_dir != new_dir) 2481 if (old_dir != new_dir)
2545 return -EIO; 2482 return -EIO;
2546 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2483
2484 cgrp = __d_cgrp(old_dentry);
2485
2486 name = cgroup_alloc_name(new_dentry);
2487 if (!name)
2488 return -ENOMEM;
2489
2490 ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
2491 if (ret) {
2492 kfree(name);
2493 return ret;
2494 }
2495
2496 old_name = cgrp->name;
2497 rcu_assign_pointer(cgrp->name, name);
2498
2499 kfree_rcu(old_name, rcu_head);
2500 return 0;
2547} 2501}
2548 2502
2549static struct simple_xattrs *__d_xattrs(struct dentry *dentry) 2503static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
@@ -2551,13 +2505,13 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
2551 if (S_ISDIR(dentry->d_inode->i_mode)) 2505 if (S_ISDIR(dentry->d_inode->i_mode))
2552 return &__d_cgrp(dentry)->xattrs; 2506 return &__d_cgrp(dentry)->xattrs;
2553 else 2507 else
2554 return &__d_cft(dentry)->xattrs; 2508 return &__d_cfe(dentry)->xattrs;
2555} 2509}
2556 2510
2557static inline int xattr_enabled(struct dentry *dentry) 2511static inline int xattr_enabled(struct dentry *dentry)
2558{ 2512{
2559 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 2513 struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
2560 return test_bit(ROOT_XATTR, &root->flags); 2514 return root->flags & CGRP_ROOT_XATTR;
2561} 2515}
2562 2516
2563static bool is_valid_xattr(const char *name) 2517static bool is_valid_xattr(const char *name)
@@ -2727,9 +2681,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2727 umode_t mode; 2681 umode_t mode;
2728 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2682 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
2729 2683
2730 simple_xattrs_init(&cft->xattrs); 2684 if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
2731
2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
2733 strcpy(name, subsys->name); 2685 strcpy(name, subsys->name);
2734 strcat(name, "."); 2686 strcat(name, ".");
2735 } 2687 }
@@ -2753,6 +2705,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2753 cfe->type = (void *)cft; 2705 cfe->type = (void *)cft;
2754 cfe->dentry = dentry; 2706 cfe->dentry = dentry;
2755 dentry->d_fsdata = cfe; 2707 dentry->d_fsdata = cfe;
2708 simple_xattrs_init(&cfe->xattrs);
2756 list_add_tail(&cfe->node, &parent->files); 2709 list_add_tail(&cfe->node, &parent->files);
2757 cfe = NULL; 2710 cfe = NULL;
2758 } 2711 }
@@ -2770,6 +2723,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
2770 2723
2771 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2724 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2772 /* does cft->flags tell us to skip this file on @cgrp? */ 2725 /* does cft->flags tell us to skip this file on @cgrp? */
2726 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2727 continue;
2773 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2728 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2774 continue; 2729 continue;
2775 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2730 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
@@ -3300,6 +3255,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3300 return 0; 3255 return 0;
3301} 3256}
3302 3257
3258static void cgroup_transfer_one_task(struct task_struct *task,
3259 struct cgroup_scanner *scan)
3260{
3261 struct cgroup *new_cgroup = scan->data;
3262
3263 mutex_lock(&cgroup_mutex);
3264 cgroup_attach_task(new_cgroup, task, false);
3265 mutex_unlock(&cgroup_mutex);
3266}
3267
3268/**
3269 * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
3270 * @to: cgroup to which the tasks will be moved
3271 * @from: cgroup in which the tasks currently reside
3272 */
3273int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
3274{
3275 struct cgroup_scanner scan;
3276
3277 scan.cg = from;
3278 scan.test_task = NULL; /* select all tasks in cgroup */
3279 scan.process_task = cgroup_transfer_one_task;
3280 scan.heap = NULL;
3281 scan.data = to;
3282
3283 return cgroup_scan_tasks(&scan);
3284}
3285
3303/* 3286/*
3304 * Stuff for reading the 'tasks'/'procs' files. 3287 * Stuff for reading the 'tasks'/'procs' files.
3305 * 3288 *
@@ -3362,35 +3345,14 @@ static void pidlist_free(void *p)
3362 else 3345 else
3363 kfree(p); 3346 kfree(p);
3364} 3347}
3365static void *pidlist_resize(void *p, int newcount)
3366{
3367 void *newlist;
3368 /* note: if new alloc fails, old p will still be valid either way */
3369 if (is_vmalloc_addr(p)) {
3370 newlist = vmalloc(newcount * sizeof(pid_t));
3371 if (!newlist)
3372 return NULL;
3373 memcpy(newlist, p, newcount * sizeof(pid_t));
3374 vfree(p);
3375 } else {
3376 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
3377 }
3378 return newlist;
3379}
3380 3348
3381/* 3349/*
3382 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3350 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3383 * If the new stripped list is sufficiently smaller and there's enough memory 3351 * Returns the number of unique elements.
3384 * to allocate a new buffer, will let go of the unneeded memory. Returns the
3385 * number of unique elements.
3386 */ 3352 */
3387/* is the size difference enough that we should re-allocate the array? */ 3353static int pidlist_uniq(pid_t *list, int length)
3388#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
3389static int pidlist_uniq(pid_t **p, int length)
3390{ 3354{
3391 int src, dest = 1; 3355 int src, dest = 1;
3392 pid_t *list = *p;
3393 pid_t *newlist;
3394 3356
3395 /* 3357 /*
3396 * we presume the 0th element is unique, so i starts at 1. trivial 3358 * we presume the 0th element is unique, so i starts at 1. trivial
@@ -3411,16 +3373,6 @@ static int pidlist_uniq(pid_t **p, int length)
3411 dest++; 3373 dest++;
3412 } 3374 }
3413after: 3375after:
3414 /*
3415 * if the length difference is large enough, we want to allocate a
3416 * smaller buffer to save memory. if this fails due to out of memory,
3417 * we'll just stay with what we've got.
3418 */
3419 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
3420 newlist = pidlist_resize(list, dest);
3421 if (newlist)
3422 *p = newlist;
3423 }
3424 return dest; 3376 return dest;
3425} 3377}
3426 3378
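With the realloc-on-shrink logic gone, pidlist_uniq() is a plain in-place de-duplication pass over an already-sorted array that returns the new length. A self-contained sketch of the same walk, assuming an int array and an illustrative helper name dedup_sorted():

#include <stdio.h>

/* Strip duplicates from a sorted array in place; returns the new length. */
static int dedup_sorted(int *list, int length)
{
	int src, dest = 1;

	if (length == 0)
		return 0;
	/* the 0th element is unique by definition */
	for (src = 1; src < length; src++) {
		if (list[src] == list[src - 1])
			continue;
		list[dest++] = list[src];
	}
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 7, 7, 7, 12, 19, 19 };
	int n = dedup_sorted(pids, 8);

	for (int i = 0; i < n; i++)
		printf("%d\n", pids[i]);	/* prints 3 7 12 19 */
	return 0;
}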
@@ -3516,7 +3468,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3516 /* now sort & (if procs) strip out duplicates */ 3468 /* now sort & (if procs) strip out duplicates */
3517 sort(array, length, sizeof(pid_t), cmppid, NULL); 3469 sort(array, length, sizeof(pid_t), cmppid, NULL);
3518 if (type == CGROUP_FILE_PROCS) 3470 if (type == CGROUP_FILE_PROCS)
3519 length = pidlist_uniq(&array, length); 3471 length = pidlist_uniq(array, length);
3520 l = cgroup_pidlist_find(cgrp, type); 3472 l = cgroup_pidlist_find(cgrp, type);
3521 if (!l) { 3473 if (!l) {
3522 pidlist_free(array); 3474 pidlist_free(array);
@@ -3930,11 +3882,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
3930 if (ret) 3882 if (ret)
3931 goto fail; 3883 goto fail;
3932 3884
3933 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3885 efile->f_op->poll(efile, &event->pt);
3934 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
3935 ret = 0;
3936 goto fail;
3937 }
3938 3886
3939 /* 3887 /*
3940 * Events should be removed after rmdir of cgroup directory, but before 3888 * Events should be removed after rmdir of cgroup directory, but before
@@ -4016,10 +3964,16 @@ static struct cftype files[] = {
4016 }, 3964 },
4017 { 3965 {
4018 .name = "cgroup.clone_children", 3966 .name = "cgroup.clone_children",
3967 .flags = CFTYPE_INSANE,
4019 .read_u64 = cgroup_clone_children_read, 3968 .read_u64 = cgroup_clone_children_read,
4020 .write_u64 = cgroup_clone_children_write, 3969 .write_u64 = cgroup_clone_children_write,
4021 }, 3970 },
4022 { 3971 {
3972 .name = "cgroup.sane_behavior",
3973 .flags = CFTYPE_ONLY_ON_ROOT,
3974 .read_seq_string = cgroup_sane_behavior_show,
3975 },
3976 {
4023 .name = "release_agent", 3977 .name = "release_agent",
4024 .flags = CFTYPE_ONLY_ON_ROOT, 3978 .flags = CFTYPE_ONLY_ON_ROOT,
4025 .read_seq_string = cgroup_release_agent_show, 3979 .read_seq_string = cgroup_release_agent_show,
@@ -4131,17 +4085,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
4131 if (!(css->flags & CSS_ONLINE)) 4085 if (!(css->flags & CSS_ONLINE))
4132 return; 4086 return;
4133 4087
4134 /* 4088 if (ss->css_offline)
4135 * css_offline() should be called with cgroup_mutex unlocked. See
4136 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for
4137 * details. This temporary unlocking should go away once
4138 * cgroup_mutex is unexported from controllers.
4139 */
4140 if (ss->css_offline) {
4141 mutex_unlock(&cgroup_mutex);
4142 ss->css_offline(cgrp); 4089 ss->css_offline(cgrp);
4143 mutex_lock(&cgroup_mutex);
4144 }
4145 4090
4146 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4091 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
4147} 4092}
@@ -4158,6 +4103,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4158 umode_t mode) 4103 umode_t mode)
4159{ 4104{
4160 struct cgroup *cgrp; 4105 struct cgroup *cgrp;
4106 struct cgroup_name *name;
4161 struct cgroupfs_root *root = parent->root; 4107 struct cgroupfs_root *root = parent->root;
4162 int err = 0; 4108 int err = 0;
4163 struct cgroup_subsys *ss; 4109 struct cgroup_subsys *ss;
@@ -4168,9 +4114,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4168 if (!cgrp) 4114 if (!cgrp)
4169 return -ENOMEM; 4115 return -ENOMEM;
4170 4116
4117 name = cgroup_alloc_name(dentry);
4118 if (!name)
4119 goto err_free_cgrp;
4120 rcu_assign_pointer(cgrp->name, name);
4121
4171 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4122 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL);
4172 if (cgrp->id < 0) 4123 if (cgrp->id < 0)
4173 goto err_free_cgrp; 4124 goto err_free_name;
4174 4125
4175 /* 4126 /*
4176 * Only live parents can have children. Note that the liveliness 4127 * Only live parents can have children. Note that the liveliness
@@ -4198,7 +4149,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4198 4149
4199 cgrp->parent = parent; 4150 cgrp->parent = parent;
4200 cgrp->root = parent->root; 4151 cgrp->root = parent->root;
4201 cgrp->top_cgroup = parent->top_cgroup;
4202 4152
4203 if (notify_on_release(parent)) 4153 if (notify_on_release(parent))
4204 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4154 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -4241,6 +4191,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
4241 for_each_subsys(root, ss) 4191 for_each_subsys(root, ss)
4242 dget(dentry); 4192 dget(dentry);
4243 4193
4194 /* hold a ref to the parent's dentry */
4195 dget(parent->dentry);
4196
4244 /* creation succeeded, notify subsystems */ 4197 /* creation succeeded, notify subsystems */
4245 for_each_subsys(root, ss) { 4198 for_each_subsys(root, ss) {
4246 err = online_css(ss, cgrp); 4199 err = online_css(ss, cgrp);
@@ -4276,6 +4229,8 @@ err_free_all:
4276 deactivate_super(sb); 4229 deactivate_super(sb);
4277err_free_id: 4230err_free_id:
4278 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4231 ida_simple_remove(&root->cgroup_ida, cgrp->id);
4232err_free_name:
4233 kfree(rcu_dereference_raw(cgrp->name));
4279err_free_cgrp: 4234err_free_cgrp:
4280 kfree(cgrp); 4235 kfree(cgrp);
4281 return err; 4236 return err;
@@ -4295,56 +4250,13 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4295 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4250 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
4296} 4251}
4297 4252
4298/*
4299 * Check the reference count on each subsystem. Since we already
4300 * established that there are no tasks in the cgroup, if the css refcount
4301 * is also 1, then there should be no outstanding references, so the
4302 * subsystem is safe to destroy. We scan across all subsystems rather than
4303 * using the per-hierarchy linked list of mounted subsystems since we can
4304 * be called via check_for_release() with no synchronization other than
4305 * RCU, and the subsystem linked list isn't RCU-safe.
4306 */
4307static int cgroup_has_css_refs(struct cgroup *cgrp)
4308{
4309 int i;
4310
4311 /*
4312 * We won't need to lock the subsys array, because the subsystems
4313 * we're concerned about aren't going anywhere since our cgroup root
4314 * has a reference on them.
4315 */
4316 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
4317 struct cgroup_subsys *ss = subsys[i];
4318 struct cgroup_subsys_state *css;
4319
4320 /* Skip subsystems not present or not in this hierarchy */
4321 if (ss == NULL || ss->root != cgrp->root)
4322 continue;
4323
4324 css = cgrp->subsys[ss->subsys_id];
4325 /*
4326 * When called from check_for_release() it's possible
4327 * that by this point the cgroup has been removed
4328 * and the css deleted. But a false-positive doesn't
4329 * matter, since it can only happen if the cgroup
4330 * has been deleted and hence no longer needs the
4331 * release agent to be called anyway.
4332 */
4333 if (css && css_refcnt(css) > 1)
4334 return 1;
4335 }
4336 return 0;
4337}
4338
4339static int cgroup_destroy_locked(struct cgroup *cgrp) 4253static int cgroup_destroy_locked(struct cgroup *cgrp)
4340 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4254 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4341{ 4255{
4342 struct dentry *d = cgrp->dentry; 4256 struct dentry *d = cgrp->dentry;
4343 struct cgroup *parent = cgrp->parent; 4257 struct cgroup *parent = cgrp->parent;
4344 DEFINE_WAIT(wait);
4345 struct cgroup_event *event, *tmp; 4258 struct cgroup_event *event, *tmp;
4346 struct cgroup_subsys *ss; 4259 struct cgroup_subsys *ss;
4347 LIST_HEAD(tmp_list);
4348 4260
4349 lockdep_assert_held(&d->d_inode->i_mutex); 4261 lockdep_assert_held(&d->d_inode->i_mutex);
4350 lockdep_assert_held(&cgroup_mutex); 4262 lockdep_assert_held(&cgroup_mutex);
@@ -4468,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4468 * need to invoke fork callbacks here. */ 4380 * need to invoke fork callbacks here. */
4469 BUG_ON(!list_empty(&init_task.tasks)); 4381 BUG_ON(!list_empty(&init_task.tasks));
4470 4382
4471 ss->active = 1;
4472 BUG_ON(online_css(ss, dummytop)); 4383 BUG_ON(online_css(ss, dummytop));
4473 4384
4474 mutex_unlock(&cgroup_mutex); 4385 mutex_unlock(&cgroup_mutex);
@@ -4573,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4573 } 4484 }
4574 write_unlock(&css_set_lock); 4485 write_unlock(&css_set_lock);
4575 4486
4576 ss->active = 1;
4577 ret = online_css(ss, dummytop); 4487 ret = online_css(ss, dummytop);
4578 if (ret) 4488 if (ret)
4579 goto err_unload; 4489 goto err_unload;
@@ -4614,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4614 mutex_lock(&cgroup_mutex); 4524 mutex_lock(&cgroup_mutex);
4615 4525
4616 offline_css(ss, dummytop); 4526 offline_css(ss, dummytop);
4617 ss->active = 0;
4618 4527
4619 if (ss->use_id) 4528 if (ss->use_id)
4620 idr_destroy(&ss->idr); 4529 idr_destroy(&ss->idr);
@@ -4769,7 +4678,7 @@ out:
4769 */ 4678 */
4770 4679
4771/* TODO: Use a proper seq_file iterator */ 4680/* TODO: Use a proper seq_file iterator */
4772static int proc_cgroup_show(struct seq_file *m, void *v) 4681int proc_cgroup_show(struct seq_file *m, void *v)
4773{ 4682{
4774 struct pid *pid; 4683 struct pid *pid;
4775 struct task_struct *tsk; 4684 struct task_struct *tsk;
@@ -4821,19 +4730,6 @@ out:
4821 return retval; 4730 return retval;
4822} 4731}
4823 4732
4824static int cgroup_open(struct inode *inode, struct file *file)
4825{
4826 struct pid *pid = PROC_I(inode)->pid;
4827 return single_open(file, proc_cgroup_show, pid);
4828}
4829
4830const struct file_operations proc_cgroup_operations = {
4831 .open = cgroup_open,
4832 .read = seq_read,
4833 .llseek = seq_lseek,
4834 .release = single_release,
4835};
4836
4837/* Display information about each subsystem and each hierarchy */ 4733/* Display information about each subsystem and each hierarchy */
4838static int proc_cgroupstats_show(struct seq_file *m, void *v) 4734static int proc_cgroupstats_show(struct seq_file *m, void *v)
4839{ 4735{
@@ -4935,17 +4831,17 @@ void cgroup_post_fork(struct task_struct *child)
4935 * and addition to css_set. 4831 * and addition to css_set.
4936 */ 4832 */
4937 if (need_forkexit_callback) { 4833 if (need_forkexit_callback) {
4938 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4834 /*
4835 * fork/exit callbacks are supported only for builtin
4836 * subsystems, and the builtin section of the subsys
4837 * array is immutable, so we don't need to lock the
4838 * subsys array here. On the other hand, modular section
4839 * of the array can be freed at module unload, so we
4840 * can't touch that.
4841 */
4842 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4939 struct cgroup_subsys *ss = subsys[i]; 4843 struct cgroup_subsys *ss = subsys[i];
4940 4844
4941 /*
4942 * fork/exit callbacks are supported only for
4943 * builtin subsystems and we don't need further
4944 * synchronization as they never go away.
4945 */
4946 if (!ss || ss->module)
4947 continue;
4948
4949 if (ss->fork) 4845 if (ss->fork)
4950 ss->fork(child); 4846 ss->fork(child);
4951 } 4847 }
@@ -5010,13 +4906,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5010 tsk->cgroups = &init_css_set; 4906 tsk->cgroups = &init_css_set;
5011 4907
5012 if (run_callbacks && need_forkexit_callback) { 4908 if (run_callbacks && need_forkexit_callback) {
5013 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4909 /*
4910 * fork/exit callbacks are supported only for builtin
4911 * subsystems, see cgroup_post_fork() for details.
4912 */
4913 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
5014 struct cgroup_subsys *ss = subsys[i]; 4914 struct cgroup_subsys *ss = subsys[i];
5015 4915
5016 /* modular subsystems can't use callbacks */
5017 if (!ss || ss->module)
5018 continue;
5019
5020 if (ss->exit) { 4916 if (ss->exit) {
5021 struct cgroup *old_cgrp = 4917 struct cgroup *old_cgrp =
5022 rcu_dereference_raw(cg->subsys[i])->cgroup; 4918 rcu_dereference_raw(cg->subsys[i])->cgroup;
@@ -5030,44 +4926,19 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
5030 put_css_set_taskexit(cg); 4926 put_css_set_taskexit(cg);
5031} 4927}
5032 4928
5033/**
5034 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
5035 * @cgrp: the cgroup in question
5036 * @task: the task in question
5037 *
5038 * See if @cgrp is a descendant of @task's cgroup in the appropriate
5039 * hierarchy.
5040 *
5041 * If we are sending in dummytop, then presumably we are creating
5042 * the top cgroup in the subsystem.
5043 *
5044 * Called only by the ns (nsproxy) cgroup.
5045 */
5046int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
5047{
5048 int ret;
5049 struct cgroup *target;
5050
5051 if (cgrp == dummytop)
5052 return 1;
5053
5054 target = task_cgroup_from_root(task, cgrp->root);
5055 while (cgrp != target && cgrp!= cgrp->top_cgroup)
5056 cgrp = cgrp->parent;
5057 ret = (cgrp == target);
5058 return ret;
5059}
5060
5061static void check_for_release(struct cgroup *cgrp) 4929static void check_for_release(struct cgroup *cgrp)
5062{ 4930{
5063 /* All of these checks rely on RCU to keep the cgroup 4931 /* All of these checks rely on RCU to keep the cgroup
5064 * structure alive */ 4932 * structure alive */
5065 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 4933 if (cgroup_is_releasable(cgrp) &&
5066 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 4934 !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
5067 /* Control Group is currently removeable. If it's not 4935 /*
4936 * Control Group is currently removeable. If it's not
5068 * already queued for a userspace notification, queue 4937 * already queued for a userspace notification, queue
5069 * it now */ 4938 * it now
4939 */
5070 int need_schedule_work = 0; 4940 int need_schedule_work = 0;
4941
5071 raw_spin_lock(&release_list_lock); 4942 raw_spin_lock(&release_list_lock);
5072 if (!cgroup_is_removed(cgrp) && 4943 if (!cgroup_is_removed(cgrp) &&
5073 list_empty(&cgrp->release_list)) { 4944 list_empty(&cgrp->release_list)) {
@@ -5100,24 +4971,11 @@ EXPORT_SYMBOL_GPL(__css_tryget);
5100/* Caller must verify that the css is not for root cgroup */ 4971/* Caller must verify that the css is not for root cgroup */
5101void __css_put(struct cgroup_subsys_state *css) 4972void __css_put(struct cgroup_subsys_state *css)
5102{ 4973{
5103 struct cgroup *cgrp = css->cgroup;
5104 int v; 4974 int v;
5105 4975
5106 rcu_read_lock();
5107 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 4976 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
5108 4977 if (v == 0)
5109 switch (v) {
5110 case 1:
5111 if (notify_on_release(cgrp)) {
5112 set_bit(CGRP_RELEASABLE, &cgrp->flags);
5113 check_for_release(cgrp);
5114 }
5115 break;
5116 case 0:
5117 schedule_work(&css->dput_work); 4978 schedule_work(&css->dput_work);
5118 break;
5119 }
5120 rcu_read_unlock();
5121} 4979}
5122EXPORT_SYMBOL_GPL(__css_put); 4980EXPORT_SYMBOL_GPL(__css_put);
5123 4981
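After this change __css_put() acts only when the unbiased count reaches zero, deferring the actual teardown to a work item. A user-space sketch of that drop-to-zero pattern with C11 atomics (obj, obj_put and release are illustrative names; the kernel operates on its biased css refcount and defers via schedule_work()):

#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int refcnt;
};

static void release(struct obj *o)
{
	/* in the kernel this step is pushed off to a workqueue */
	printf("releasing object\n");
}

static void obj_put(struct obj *o)
{
	/* only the caller that drops the last reference performs the release */
	if (atomic_fetch_sub(&o->refcnt, 1) == 1)
		release(o);
}

int main(void)
{
	struct obj o = { .refcnt = 2 };

	obj_put(&o);	/* 2 -> 1: nothing to do       */
	obj_put(&o);	/* 1 -> 0: release() runs once */
	return 0;
}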
@@ -5416,55 +5274,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
5416} 5274}
5417EXPORT_SYMBOL_GPL(css_lookup); 5275EXPORT_SYMBOL_GPL(css_lookup);
5418 5276
5419/**
5420 * css_get_next - lookup next cgroup under specified hierarchy.
5421 * @ss: pointer to subsystem
5422 * @id: current position of iteration.
5423 * @root: pointer to css. search tree under this.
5424 * @foundid: position of found object.
5425 *
5426 * Search next css under the specified hierarchy of rootid. Calling under
5427 * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
5428 */
5429struct cgroup_subsys_state *
5430css_get_next(struct cgroup_subsys *ss, int id,
5431 struct cgroup_subsys_state *root, int *foundid)
5432{
5433 struct cgroup_subsys_state *ret = NULL;
5434 struct css_id *tmp;
5435 int tmpid;
5436 int rootid = css_id(root);
5437 int depth = css_depth(root);
5438
5439 if (!rootid)
5440 return NULL;
5441
5442 BUG_ON(!ss->use_id);
5443 WARN_ON_ONCE(!rcu_read_lock_held());
5444
5445 /* fill start point for scan */
5446 tmpid = id;
5447 while (1) {
5448 /*
5449 * scan next entry from bitmap(tree), tmpid is updated after
5450 * idr_get_next().
5451 */
5452 tmp = idr_get_next(&ss->idr, &tmpid);
5453 if (!tmp)
5454 break;
5455 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
5456 ret = rcu_dereference(tmp->css);
5457 if (ret) {
5458 *foundid = tmpid;
5459 break;
5460 }
5461 }
5462 /* continue to scan from next id */
5463 tmpid = tmpid + 1;
5464 }
5465 return ret;
5466}
5467
5468/* 5277/*
5469 * get corresponding css from file open on cgroupfs directory 5278 * get corresponding css from file open on cgroupfs directory
5470 */ 5279 */
diff --git a/kernel/compat.c b/kernel/compat.c
index 19971d8c7299..0a09e481b70b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
516 return 0; 516 return 0;
517} 517}
518 518
519asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
520{
521 struct rusage r;
522 int ret;
523 mm_segment_t old_fs = get_fs();
524
525 set_fs(KERNEL_DS);
526 ret = sys_getrusage(who, (struct rusage __user *) &r);
527 set_fs(old_fs);
528
529 if (ret)
530 return ret;
531
532 if (put_compat_rusage(&r, ru))
533 return -EFAULT;
534
535 return 0;
536}
537
538COMPAT_SYSCALL_DEFINE4(wait4, 519COMPAT_SYSCALL_DEFINE4(wait4,
539 compat_pid_t, pid, 520 compat_pid_t, pid,
540 compat_uint_t __user *, stat_addr, 521 compat_uint_t __user *, stat_addr,
@@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
1138} 1119}
1139#endif 1120#endif
1140 1121
1141struct compat_sysinfo {
1142 s32 uptime;
1143 u32 loads[3];
1144 u32 totalram;
1145 u32 freeram;
1146 u32 sharedram;
1147 u32 bufferram;
1148 u32 totalswap;
1149 u32 freeswap;
1150 u16 procs;
1151 u16 pad;
1152 u32 totalhigh;
1153 u32 freehigh;
1154 u32 mem_unit;
1155 char _f[20-2*sizeof(u32)-sizeof(int)];
1156};
1157
1158asmlinkage long
1159compat_sys_sysinfo(struct compat_sysinfo __user *info)
1160{
1161 struct sysinfo s;
1162
1163 do_sysinfo(&s);
1164
1165 /* Check to see if any memory value is too large for 32-bit and scale
1166 * down if needed
1167 */
1168 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
1169 int bitcount = 0;
1170
1171 while (s.mem_unit < PAGE_SIZE) {
1172 s.mem_unit <<= 1;
1173 bitcount++;
1174 }
1175
1176 s.totalram >>= bitcount;
1177 s.freeram >>= bitcount;
1178 s.sharedram >>= bitcount;
1179 s.bufferram >>= bitcount;
1180 s.totalswap >>= bitcount;
1181 s.freeswap >>= bitcount;
1182 s.totalhigh >>= bitcount;
1183 s.freehigh >>= bitcount;
1184 }
1185
1186 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
1187 __put_user (s.uptime, &info->uptime) ||
1188 __put_user (s.loads[0], &info->loads[0]) ||
1189 __put_user (s.loads[1], &info->loads[1]) ||
1190 __put_user (s.loads[2], &info->loads[2]) ||
1191 __put_user (s.totalram, &info->totalram) ||
1192 __put_user (s.freeram, &info->freeram) ||
1193 __put_user (s.sharedram, &info->sharedram) ||
1194 __put_user (s.bufferram, &info->bufferram) ||
1195 __put_user (s.totalswap, &info->totalswap) ||
1196 __put_user (s.freeswap, &info->freeswap) ||
1197 __put_user (s.procs, &info->procs) ||
1198 __put_user (s.totalhigh, &info->totalhigh) ||
1199 __put_user (s.freehigh, &info->freehigh) ||
1200 __put_user (s.mem_unit, &info->mem_unit))
1201 return -EFAULT;
1202
1203 return 0;
1204}
1205
1206COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, 1122COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
1207 compat_pid_t, pid, 1123 compat_pid_t, pid,
1208 struct compat_timespec __user *, interval) 1124 struct compat_timespec __user *, interval)
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075eed..c18b1f1ae515 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
79 if (!entry) 79 if (!entry)
80 return -ENOMEM; 80 return -ENOMEM;
81 81
82 entry->size = kernel_config_data_size; 82 proc_set_size(entry, kernel_config_data_size);
83 83
84 return 0; 84 return 0;
85} 85}
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
new file mode 100644
index 000000000000..59ab052ef7a0
--- /dev/null
+++ b/kernel/cpu/Makefile
@@ -0,0 +1 @@
obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
new file mode 100644
index 000000000000..8b86c0c68edf
--- /dev/null
+++ b/kernel/cpu/idle.c
@@ -0,0 +1,116 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/tick.h>
7#include <linux/mm.h>
8
9#include <asm/tlb.h>
10
11#include <trace/events/power.h>
12
13static int __read_mostly cpu_idle_force_poll;
14
15void cpu_idle_poll_ctrl(bool enable)
16{
17 if (enable) {
18 cpu_idle_force_poll++;
19 } else {
20 cpu_idle_force_poll--;
21 WARN_ON_ONCE(cpu_idle_force_poll < 0);
22 }
23}
24
25#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
26static int __init cpu_idle_poll_setup(char *__unused)
27{
28 cpu_idle_force_poll = 1;
29 return 1;
30}
31__setup("nohlt", cpu_idle_poll_setup);
32
33static int __init cpu_idle_nopoll_setup(char *__unused)
34{
35 cpu_idle_force_poll = 0;
36 return 1;
37}
38__setup("hlt", cpu_idle_nopoll_setup);
39#endif
40
41static inline int cpu_idle_poll(void)
42{
43 trace_cpu_idle_rcuidle(0, smp_processor_id());
44 local_irq_enable();
45 while (!need_resched())
46 cpu_relax();
47 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
48 return 1;
49}
50
51/* Weak implementations for optional arch specific functions */
52void __weak arch_cpu_idle_prepare(void) { }
53void __weak arch_cpu_idle_enter(void) { }
54void __weak arch_cpu_idle_exit(void) { }
55void __weak arch_cpu_idle_dead(void) { }
56void __weak arch_cpu_idle(void)
57{
58 cpu_idle_force_poll = 1;
59}
60
61/*
62 * Generic idle loop implementation
63 */
64static void cpu_idle_loop(void)
65{
66 while (1) {
67 tick_nohz_idle_enter();
68
69 while (!need_resched()) {
70 check_pgt_cache();
71 rmb();
72
73 if (cpu_is_offline(smp_processor_id()))
74 arch_cpu_idle_dead();
75
76 local_irq_disable();
77 arch_cpu_idle_enter();
78
79 /*
80 * In poll mode we reenable interrupts and spin.
81 *
82 * Also if we detected in the wakeup from idle
83 * path that the tick broadcast device expired
84 * for us, we don't want to go deep idle as we
85 * know that the IPI is going to arrive right
86 * away
87 */
88 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
89 cpu_idle_poll();
90 } else {
91 current_clr_polling();
92 if (!need_resched()) {
93 stop_critical_timings();
94 rcu_idle_enter();
95 arch_cpu_idle();
96 WARN_ON_ONCE(irqs_disabled());
97 rcu_idle_exit();
98 start_critical_timings();
99 } else {
100 local_irq_enable();
101 }
102 current_set_polling();
103 }
104 arch_cpu_idle_exit();
105 }
106 tick_nohz_idle_exit();
107 schedule_preempt_disabled();
108 }
109}
110
111void cpu_startup_entry(enum cpuhp_state state)
112{
113 current_set_polling();
114 arch_cpu_idle_prepare();
115 cpu_idle_loop();
116}
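The arch_cpu_idle*() hooks above rely on weak symbols: the generic file supplies a default body, and any architecture overrides it simply by defining a strong symbol of the same name. A minimal sketch of that linkage trick with the GCC/Clang attribute (arch_hook() is a made-up name, not a kernel interface):

#include <stdio.h>

/*
 * Default (weak) hook: used unless another object file linked into the
 * program provides a strong definition with the same name.
 */
void __attribute__((weak)) arch_hook(void)
{
	puts("default hook");
}

int main(void)
{
	arch_hook();	/* prints "default hook" unless overridden at link time */
	return 0;
}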
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4f9dfe43ecbd..64b3f791bbe5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -265,17 +265,6 @@ static DEFINE_MUTEX(cpuset_mutex);
265static DEFINE_MUTEX(callback_mutex); 265static DEFINE_MUTEX(callback_mutex);
266 266
267/* 267/*
268 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
269 * buffers. They are statically allocated to prevent using excess stack
270 * when calling cpuset_print_task_mems_allowed().
271 */
272#define CPUSET_NAME_LEN (128)
273#define CPUSET_NODELIST_LEN (256)
274static char cpuset_name[CPUSET_NAME_LEN];
275static char cpuset_nodelist[CPUSET_NODELIST_LEN];
276static DEFINE_SPINLOCK(cpuset_buffer_lock);
277
278/*
279 * CPU / memory hotplug is handled asynchronously. 268 * CPU / memory hotplug is handled asynchronously.
280 */ 269 */
281static struct workqueue_struct *cpuset_propagate_hotplug_wq; 270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
@@ -780,25 +769,26 @@ static void rebuild_sched_domains_locked(void)
780 lockdep_assert_held(&cpuset_mutex); 769 lockdep_assert_held(&cpuset_mutex);
781 get_online_cpus(); 770 get_online_cpus();
782 771
772 /*
773 * We have raced with CPU hotplug. Don't do anything to avoid
774 * passing doms with offlined cpu to partition_sched_domains().
775 * Anyways, hotplug work item will rebuild sched domains.
776 */
777 if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
778 goto out;
779
783 /* Generate domain masks and attrs */ 780 /* Generate domain masks and attrs */
784 ndoms = generate_sched_domains(&doms, &attr); 781 ndoms = generate_sched_domains(&doms, &attr);
785 782
786 /* Have scheduler rebuild the domains */ 783 /* Have scheduler rebuild the domains */
787 partition_sched_domains(ndoms, doms, attr); 784 partition_sched_domains(ndoms, doms, attr);
788 785out:
789 put_online_cpus(); 786 put_online_cpus();
790} 787}
791#else /* !CONFIG_SMP */ 788#else /* !CONFIG_SMP */
792static void rebuild_sched_domains_locked(void) 789static void rebuild_sched_domains_locked(void)
793{ 790{
794} 791}
795
796static int generate_sched_domains(cpumask_var_t **domains,
797 struct sched_domain_attr **attributes)
798{
799 *domains = NULL;
800 return 1;
801}
802#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
803 793
804void rebuild_sched_domains(void) 794void rebuild_sched_domains(void)
@@ -1388,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1388 1378
1389 cgroup_taskset_for_each(task, cgrp, tset) { 1379 cgroup_taskset_for_each(task, cgrp, tset) {
1390 /* 1380 /*
1391 * Kthreads bound to specific cpus cannot be moved to a new 1381 * Kthreads which disallow setaffinity shouldn't be moved
1392 * cpuset; we cannot change their cpu affinity and 1382 * to a new cpuset; we don't want to change their cpu
1393 * isolating such threads by their set of allowed nodes is 1383 * affinity and isolating such threads by their set of
1394 * unnecessary. Thus, cpusets are not applicable for such 1384 * allowed nodes is unnecessary. Thus, cpusets are not
1395 * threads. This prevents checking for success of 1385 * applicable for such threads. This prevents checking for
1396 * set_cpus_allowed_ptr() on all attached tasks before 1386 * success of set_cpus_allowed_ptr() on all attached tasks
1397 * cpus_allowed may be changed. 1387 * before cpus_allowed may be changed.
1398 */ 1388 */
1399 ret = -EINVAL; 1389 ret = -EINVAL;
1400 if (task->flags & PF_THREAD_BOUND) 1390 if (task->flags & PF_NO_SETAFFINITY)
1401 goto out_unlock; 1391 goto out_unlock;
1402 ret = security_task_setscheduler(task); 1392 ret = security_task_setscheduler(task);
1403 if (ret) 1393 if (ret)
@@ -2005,50 +1995,6 @@ int __init cpuset_init(void)
2005 return 0; 1995 return 0;
2006} 1996}
2007 1997
2008/**
2009 * cpuset_do_move_task - move a given task to another cpuset
2010 * @tsk: pointer to task_struct the task to move
2011 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
2012 *
2013 * Called by cgroup_scan_tasks() for each task in a cgroup.
2014 * Return nonzero to stop the walk through the tasks.
2015 */
2016static void cpuset_do_move_task(struct task_struct *tsk,
2017 struct cgroup_scanner *scan)
2018{
2019 struct cgroup *new_cgroup = scan->data;
2020
2021 cgroup_lock();
2022 cgroup_attach_task(new_cgroup, tsk);
2023 cgroup_unlock();
2024}
2025
2026/**
2027 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
2028 * @from: cpuset in which the tasks currently reside
2029 * @to: cpuset to which the tasks will be moved
2030 *
2031 * Called with cpuset_mutex held
2032 * callback_mutex must not be held, as cpuset_attach() will take it.
2033 *
2034 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
2035 * calling callback functions for each.
2036 */
2037static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2038{
2039 struct cgroup_scanner scan;
2040
2041 scan.cg = from->css.cgroup;
2042 scan.test_task = NULL; /* select all tasks in cgroup */
2043 scan.process_task = cpuset_do_move_task;
2044 scan.heap = NULL;
2045 scan.data = to->css.cgroup;
2046
2047 if (cgroup_scan_tasks(&scan))
2048 printk(KERN_ERR "move_member_tasks_to_cpuset: "
2049 "cgroup_scan_tasks failed\n");
2050}
2051
2052/* 1998/*
2053 * If CPU and/or memory hotplug handlers, below, unplug any CPUs 1999 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
2054 * or memory nodes, we need to walk over the cpuset hierarchy, 2000 * or memory nodes, we need to walk over the cpuset hierarchy,
@@ -2069,7 +2015,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2069 nodes_empty(parent->mems_allowed)) 2015 nodes_empty(parent->mems_allowed))
2070 parent = parent_cs(parent); 2016 parent = parent_cs(parent);
2071 2017
2072 move_member_tasks_to_cpuset(cs, parent); 2018 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2019 rcu_read_lock();
2020 printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n",
2021 cgroup_name(cs->css.cgroup));
2022 rcu_read_unlock();
2023 }
2073} 2024}
2074 2025
2075/** 2026/**
@@ -2222,17 +2173,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2222 flush_workqueue(cpuset_propagate_hotplug_wq); 2173 flush_workqueue(cpuset_propagate_hotplug_wq);
2223 2174
2224 /* rebuild sched domains if cpus_allowed has changed */ 2175 /* rebuild sched domains if cpus_allowed has changed */
2225 if (cpus_updated) { 2176 if (cpus_updated)
2226 struct sched_domain_attr *attr; 2177 rebuild_sched_domains();
2227 cpumask_var_t *doms;
2228 int ndoms;
2229
2230 mutex_lock(&cpuset_mutex);
2231 ndoms = generate_sched_domains(&doms, &attr);
2232 mutex_unlock(&cpuset_mutex);
2233
2234 partition_sched_domains(ndoms, doms, attr);
2235 }
2236} 2178}
2237 2179
2238void cpuset_update_active_cpus(bool cpu_online) 2180void cpuset_update_active_cpus(bool cpu_online)
@@ -2251,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)
2251 schedule_work(&cpuset_hotplug_work); 2193 schedule_work(&cpuset_hotplug_work);
2252} 2194}
2253 2195
2254#ifdef CONFIG_MEMORY_HOTPLUG
2255/* 2196/*
2256 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. 2197 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
2257 * Call this routine anytime after node_states[N_MEMORY] changes. 2198 * Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2263,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2263 schedule_work(&cpuset_hotplug_work); 2204 schedule_work(&cpuset_hotplug_work);
2264 return NOTIFY_OK; 2205 return NOTIFY_OK;
2265} 2206}
2266#endif 2207
2208static struct notifier_block cpuset_track_online_nodes_nb = {
2209 .notifier_call = cpuset_track_online_nodes,
2210 .priority = 10, /* ??! */
2211};
2267 2212
2268/** 2213/**
2269 * cpuset_init_smp - initialize cpus_allowed 2214 * cpuset_init_smp - initialize cpus_allowed
2270 * 2215 *
2271 * Description: Finish top cpuset after cpu, node maps are initialized 2216 * Description: Finish top cpuset after cpu, node maps are initialized
2272 **/ 2217 */
2273
2274void __init cpuset_init_smp(void) 2218void __init cpuset_init_smp(void)
2275{ 2219{
2276 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2277 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2221 top_cpuset.mems_allowed = node_states[N_MEMORY];
2278 2222
2279 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2280 2224
2281 cpuset_propagate_hotplug_wq = 2225 cpuset_propagate_hotplug_wq =
2282 alloc_ordered_workqueue("cpuset_hotplug", 0); 2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
@@ -2592,6 +2536,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2592 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2536 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2593} 2537}
2594 2538
2539#define CPUSET_NODELIST_LEN (256)
2540
2595/** 2541/**
2596 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed 2542 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2597 * @task: pointer to task_struct of some task. 2543 * @task: pointer to task_struct of some task.
@@ -2602,25 +2548,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2602 */ 2548 */
2603void cpuset_print_task_mems_allowed(struct task_struct *tsk) 2549void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2604{ 2550{
2605 struct dentry *dentry; 2551 /* Statically allocated to prevent using excess stack. */
2552 static char cpuset_nodelist[CPUSET_NODELIST_LEN];
2553 static DEFINE_SPINLOCK(cpuset_buffer_lock);
2606 2554
2607 dentry = task_cs(tsk)->css.cgroup->dentry; 2555 struct cgroup *cgrp = task_cs(tsk)->css.cgroup;
2608 spin_lock(&cpuset_buffer_lock);
2609 2556
2610 if (!dentry) { 2557 rcu_read_lock();
2611 strcpy(cpuset_name, "/"); 2558 spin_lock(&cpuset_buffer_lock);
2612 } else {
2613 spin_lock(&dentry->d_lock);
2614 strlcpy(cpuset_name, (const char *)dentry->d_name.name,
2615 CPUSET_NAME_LEN);
2616 spin_unlock(&dentry->d_lock);
2617 }
2618 2559
2619 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, 2560 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2620 tsk->mems_allowed); 2561 tsk->mems_allowed);
2621 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", 2562 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2622 tsk->comm, cpuset_name, cpuset_nodelist); 2563 tsk->comm, cgroup_name(cgrp), cpuset_nodelist);
2564
2623 spin_unlock(&cpuset_buffer_lock); 2565 spin_unlock(&cpuset_buffer_lock);
2566 rcu_read_unlock();
2624} 2567}
2625 2568
2626/* 2569/*
@@ -2666,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void)
2666 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2609 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2667 * anyway. 2610 * anyway.
2668 */ 2611 */
2669static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2612int proc_cpuset_show(struct seq_file *m, void *unused_v)
2670{ 2613{
2671 struct pid *pid; 2614 struct pid *pid;
2672 struct task_struct *tsk; 2615 struct task_struct *tsk;
@@ -2700,19 +2643,6 @@ out_free:
2700out: 2643out:
2701 return retval; 2644 return retval;
2702} 2645}
2703
2704static int cpuset_open(struct inode *inode, struct file *file)
2705{
2706 struct pid *pid = PROC_I(inode)->pid;
2707 return single_open(file, proc_cpuset_show, pid);
2708}
2709
2710const struct file_operations proc_cpuset_operations = {
2711 .open = cpuset_open,
2712 .read = seq_read,
2713 .llseek = seq_lseek,
2714 .release = single_release,
2715};
2716#endif /* CONFIG_PROC_PID_CPUSET */ 2646#endif /* CONFIG_PROC_PID_CPUSET */
2717 2647
2718/* Display task mems_allowed in /proc/<pid>/status file. */ 2648/* Display task mems_allowed in /proc/<pid>/status file. */
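cpuset_print_task_mems_allowed() now keeps its scratch buffer static inside the function and serializes callers with a local lock instead of exporting file-scope buffers. A user-space analog of that pattern, with a pthread mutex standing in for the spinlock and illustrative names:

#include <pthread.h>
#include <stdio.h>

#define NODELIST_LEN 256

static void print_nodelist(const char *comm, const char *nodelist_src)
{
	/* static to avoid a large on-stack buffer; guarded for concurrent callers */
	static char buf[NODELIST_LEN];
	static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&buf_lock);
	snprintf(buf, sizeof(buf), "%s", nodelist_src);
	printf("%s mems_allowed=%s\n", comm, buf);
	pthread_mutex_unlock(&buf_lock);
}

int main(void)
{
	print_nodelist("bash", "0-3");
	return 0;
}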
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index c26278fd4851..0506d447aed2 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key)
775 775
776static struct sysrq_key_op sysrq_dbg_op = { 776static struct sysrq_key_op sysrq_dbg_op = {
777 .handler = sysrq_handle_dbg, 777 .handler = sysrq_handle_dbg,
778 .help_msg = "debug(G)", 778 .help_msg = "debug(g)",
779 .action_msg = "DEBUG", 779 .action_msg = "DEBUG",
780}; 780};
781#endif 781#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b0cd86501c30..6b41c1899a8b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
18#include <linux/poll.h> 18#include <linux/poll.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/hash.h> 20#include <linux/hash.h>
21#include <linux/tick.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/dcache.h> 23#include <linux/dcache.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -37,6 +38,7 @@
37#include <linux/ftrace_event.h> 38#include <linux/ftrace_event.h>
38#include <linux/hw_breakpoint.h> 39#include <linux/hw_breakpoint.h>
39#include <linux/mm_types.h> 40#include <linux/mm_types.h>
41#include <linux/cgroup.h>
40 42
41#include "internal.h" 43#include "internal.h"
42 44
@@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
234#ifdef CONFIG_CGROUP_PERF 236#ifdef CONFIG_CGROUP_PERF
235 237
236/* 238/*
239 * perf_cgroup_info keeps track of time_enabled for a cgroup.
240 * This is a per-cpu dynamically allocated data structure.
241 */
242struct perf_cgroup_info {
243 u64 time;
244 u64 timestamp;
245};
246
247struct perf_cgroup {
248 struct cgroup_subsys_state css;
249 struct perf_cgroup_info __percpu *info;
250};
251
252/*
237 * Must ensure cgroup is pinned (css_get) before calling 253 * Must ensure cgroup is pinned (css_get) before calling
238 * this function. In other words, we cannot call this function 254 * this function. In other words, we cannot call this function
239 * if there is no cgroup event for the current CPU context. 255 * if there is no cgroup event for the current CPU context.
@@ -251,7 +267,22 @@ perf_cgroup_match(struct perf_event *event)
251 struct perf_event_context *ctx = event->ctx; 267 struct perf_event_context *ctx = event->ctx;
252 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 268 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
253 269
254 return !event->cgrp || event->cgrp == cpuctx->cgrp; 270 /* @event doesn't care about cgroup */
271 if (!event->cgrp)
272 return true;
273
274 /* wants specific cgroup scope but @cpuctx isn't associated with any */
275 if (!cpuctx->cgrp)
276 return false;
277
278 /*
279 * Cgroup scoping is recursive. An event enabled for a cgroup is
280 * also enabled for all its descendant cgroups. If @cpuctx's
281 * cgroup is a descendant of @event's (the test covers identity
282 * case), it's a match.
283 */
284 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
285 event->cgrp->css.cgroup);
255} 286}
256 287
257static inline bool perf_tryget_cgroup(struct perf_event *event) 288static inline bool perf_tryget_cgroup(struct perf_event *event)
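The rewritten perf_cgroup_match() makes cgroup scoping recursive by accepting any cpuctx cgroup that sits at or below the event's cgroup. A self-contained sketch of that ancestor walk on a hypothetical tree node with a parent pointer (not the kernel's cgroup type):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *parent;
};

/* Returns true if @n is @ancestor itself or lies below it in the tree. */
static bool is_descendant(const struct node *n, const struct node *ancestor)
{
	while (n) {
		if (n == ancestor)
			return true;
		n = n->parent;
	}
	return false;
}

int main(void)
{
	struct node root = { .parent = NULL };
	struct node child = { .parent = &root };
	struct node grandchild = { .parent = &child };

	printf("%d\n", is_descendant(&grandchild, &root));	/* 1 */
	printf("%d\n", is_descendant(&root, &grandchild));	/* 0 */
	return 0;
}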
@@ -655,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
655 686
656 WARN_ON(!irqs_disabled()); 687 WARN_ON(!irqs_disabled());
657 688
658 if (list_empty(&cpuctx->rotation_list)) 689 if (list_empty(&cpuctx->rotation_list)) {
690 int was_empty = list_empty(head);
659 list_add(&cpuctx->rotation_list, head); 691 list_add(&cpuctx->rotation_list, head);
692 if (was_empty)
693 tick_nohz_full_kick();
694 }
660} 695}
661 696
662static void get_ctx(struct perf_event_context *ctx) 697static void get_ctx(struct perf_event_context *ctx)
@@ -961,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
961 if (sample_type & PERF_SAMPLE_PERIOD) 996 if (sample_type & PERF_SAMPLE_PERIOD)
962 size += sizeof(data->period); 997 size += sizeof(data->period);
963 998
999 if (sample_type & PERF_SAMPLE_WEIGHT)
1000 size += sizeof(data->weight);
1001
964 if (sample_type & PERF_SAMPLE_READ) 1002 if (sample_type & PERF_SAMPLE_READ)
965 size += event->read_size; 1003 size += event->read_size;
966 1004
1005 if (sample_type & PERF_SAMPLE_DATA_SRC)
1006 size += sizeof(data->data_src.val);
1007
967 event->header_size = size; 1008 event->header_size = size;
968} 1009}
969 1010
@@ -2555,6 +2596,16 @@ done:
2555 list_del_init(&cpuctx->rotation_list); 2596 list_del_init(&cpuctx->rotation_list);
2556} 2597}
2557 2598
2599#ifdef CONFIG_NO_HZ_FULL
2600bool perf_event_can_stop_tick(void)
2601{
2602 if (list_empty(&__get_cpu_var(rotation_list)))
2603 return true;
2604 else
2605 return false;
2606}
2607#endif
2608
2558void perf_event_task_tick(void) 2609void perf_event_task_tick(void)
2559{ 2610{
2560 struct list_head *head = &__get_cpu_var(rotation_list); 2611 struct list_head *head = &__get_cpu_var(rotation_list);
@@ -4178,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
4178 perf_output_sample_ustack(handle, 4229 perf_output_sample_ustack(handle,
4179 data->stack_user_size, 4230 data->stack_user_size,
4180 data->regs_user.regs); 4231 data->regs_user.regs);
4232
4233 if (sample_type & PERF_SAMPLE_WEIGHT)
4234 perf_output_put(handle, data->weight);
4235
4236 if (sample_type & PERF_SAMPLE_DATA_SRC)
4237 perf_output_put(handle, data->data_src.val);
4181} 4238}
4182 4239
4183void perf_prepare_sample(struct perf_event_header *header, 4240void perf_prepare_sample(struct perf_event_header *header,
@@ -4434,12 +4491,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
4434 if (ctxn < 0) 4491 if (ctxn < 0)
4435 goto next; 4492 goto next;
4436 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); 4493 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4494 if (ctx)
4495 perf_event_task_ctx(ctx, task_event);
4437 } 4496 }
4438 if (ctx)
4439 perf_event_task_ctx(ctx, task_event);
4440next: 4497next:
4441 put_cpu_ptr(pmu->pmu_cpu_context); 4498 put_cpu_ptr(pmu->pmu_cpu_context);
4442 } 4499 }
4500 if (task_event->task_ctx)
4501 perf_event_task_ctx(task_event->task_ctx, task_event);
4502
4443 rcu_read_unlock(); 4503 rcu_read_unlock();
4444} 4504}
4445 4505
@@ -4593,6 +4653,7 @@ void perf_event_comm(struct task_struct *task)
4593 struct perf_event_context *ctx; 4653 struct perf_event_context *ctx;
4594 int ctxn; 4654 int ctxn;
4595 4655
4656 rcu_read_lock();
4596 for_each_task_context_nr(ctxn) { 4657 for_each_task_context_nr(ctxn) {
4597 ctx = task->perf_event_ctxp[ctxn]; 4658 ctx = task->perf_event_ctxp[ctxn];
4598 if (!ctx) 4659 if (!ctx)
@@ -4600,6 +4661,7 @@ void perf_event_comm(struct task_struct *task)
4600 4661
4601 perf_event_enable_on_exec(ctx); 4662 perf_event_enable_on_exec(ctx);
4602 } 4663 }
4664 rcu_read_unlock();
4603 4665
4604 if (!atomic_read(&nr_comm_events)) 4666 if (!atomic_read(&nr_comm_events))
4605 return; 4667 return;
@@ -4734,7 +4796,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4734 } else { 4796 } else {
4735 if (arch_vma_name(mmap_event->vma)) { 4797 if (arch_vma_name(mmap_event->vma)) {
4736 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 4798 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4737 sizeof(tmp)); 4799 sizeof(tmp) - 1);
4800 tmp[sizeof(tmp) - 1] = '\0';
4738 goto got_name; 4801 goto got_name;
4739 } 4802 }
4740 4803
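The sizeof(tmp) - 1 change above addresses the classic strncpy() pitfall: when the source is at least as long as the destination, no terminating NUL is written. A small stand-alone illustration of the truncate-and-terminate idiom the fix adopts:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char tmp[8];
	const char *name = "a_rather_long_vma_name";

	/* strncpy() does not NUL-terminate when the source fills the buffer */
	strncpy(tmp, name, sizeof(tmp) - 1);
	tmp[sizeof(tmp) - 1] = '\0';	/* explicit termination, as in the fix */

	printf("%s\n", tmp);		/* prints "a_rathe" */
	return 0;
}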
@@ -4761,6 +4824,9 @@ got_name:
4761 mmap_event->file_name = name; 4824 mmap_event->file_name = name;
4762 mmap_event->file_size = size; 4825 mmap_event->file_size = size;
4763 4826
4827 if (!(vma->vm_flags & VM_EXEC))
4828 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
4829
4764 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 4830 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4765 4831
4766 rcu_read_lock(); 4832 rcu_read_lock();
@@ -5327,7 +5393,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
5327 5393
5328static int perf_swevent_init(struct perf_event *event) 5394static int perf_swevent_init(struct perf_event *event)
5329{ 5395{
5330 int event_id = event->attr.config; 5396 u64 event_id = event->attr.config;
5331 5397
5332 if (event->attr.type != PERF_TYPE_SOFTWARE) 5398 if (event->attr.type != PERF_TYPE_SOFTWARE)
5333 return -ENOENT; 5399 return -ENOENT;
@@ -5647,6 +5713,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
5647 event->attr.sample_period = NSEC_PER_SEC / freq; 5713 event->attr.sample_period = NSEC_PER_SEC / freq;
5648 hwc->sample_period = event->attr.sample_period; 5714 hwc->sample_period = event->attr.sample_period;
5649 local64_set(&hwc->period_left, hwc->sample_period); 5715 local64_set(&hwc->period_left, hwc->sample_period);
5716 hwc->last_period = hwc->sample_period;
5650 event->attr.freq = 0; 5717 event->attr.freq = 0;
5651 } 5718 }
5652} 5719}
@@ -5982,6 +6049,7 @@ skip_type:
5982 if (pmu->pmu_cpu_context) 6049 if (pmu->pmu_cpu_context)
5983 goto got_cpu_context; 6050 goto got_cpu_context;
5984 6051
6052 ret = -ENOMEM;
5985 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 6053 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5986 if (!pmu->pmu_cpu_context) 6054 if (!pmu->pmu_cpu_context)
5987 goto free_dev; 6055 goto free_dev;
@@ -7509,12 +7577,5 @@ struct cgroup_subsys perf_subsys = {
7509 .css_free = perf_cgroup_css_free, 7577 .css_free = perf_cgroup_css_free,
7510 .exit = perf_cgroup_exit, 7578 .exit = perf_cgroup_exit,
7511 .attach = perf_cgroup_attach, 7579 .attach = perf_cgroup_attach,
7512
7513 /*
7514 * perf_event cgroup doesn't handle nesting correctly.
7515 * ctx->nr_cgroups adjustments should be propagated through the
7516 * cgroup hierarchy. Fix it and remove the following.
7517 */
7518 .broken_hierarchy = true,
7519}; 7580};
7520#endif /* CONFIG_CGROUP_PERF */ 7581#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
16 int page_order; /* allocation order */ 16 int page_order; /* allocation order */
17#endif 17#endif
18 int nr_pages; /* nr of data pages */ 18 int nr_pages; /* nr of data pages */
19 int writable; /* are we writable */ 19 int overwrite; /* can overwrite itself */
20 20
21 atomic_t poll; /* POLL_ for wakeups */ 21 atomic_t poll; /* POLL_ for wakeups */
22 22
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..cd55144270b5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, 18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head) 19 unsigned long offset, unsigned long head)
20{ 20{
21 unsigned long mask; 21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
22 23
23 if (!rb->writable) 24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
24 return true; 30 return true;
25 31
26 mask = perf_data_size(rb) - 1; 32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
27 39
28 offset = (offset - tail) & mask; 40 offset = (offset - tail) & mask;
29 head = (head - tail) & mask; 41 head = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
212 rb->watermark = max_size / 2; 224 rb->watermark = max_size / 2;
213 225
214 if (flags & RING_BUFFER_WRITABLE) 226 if (flags & RING_BUFFER_WRITABLE)
215 rb->writable = 1; 227 rb->overwrite = 0;
228 else
229 rb->overwrite = 1;
216 230
217 atomic_set(&rb->refcount, 1); 231 atomic_set(&rb->refcount, 1);
218 232
@@ -312,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
312} 326}
313 327
314#else 328#else
329static int data_page_nr(struct ring_buffer *rb)
330{
331 return rb->nr_pages << page_order(rb);
332}
315 333
316struct page * 334struct page *
317perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) 335perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
318{ 336{
319 if (pgoff > (1UL << page_order(rb))) 337 /* The '>' counts in the user page. */
338 if (pgoff > data_page_nr(rb))
320 return NULL; 339 return NULL;
321 340
322 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); 341 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -336,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
336 int i, nr; 355 int i, nr;
337 356
338 rb = container_of(work, struct ring_buffer, work); 357 rb = container_of(work, struct ring_buffer, work);
339 nr = 1 << page_order(rb); 358 nr = data_page_nr(rb);
340 359
341 base = rb->user_page; 360 base = rb->user_page;
342 for (i = 0; i < nr + 1; i++) 361 /* The '<=' counts in the user page. */
362 for (i = 0; i <= nr; i++)
343 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 363 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
344 364
345 vfree(base); 365 vfree(base);
@@ -373,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
373 rb->user_page = all_buf; 393 rb->user_page = all_buf;
374 rb->data_pages[0] = all_buf + PAGE_SIZE; 394 rb->data_pages[0] = all_buf + PAGE_SIZE;
375 rb->page_order = ilog2(nr_pages); 395 rb->page_order = ilog2(nr_pages);
376 rb->nr_pages = 1; 396 rb->nr_pages = !!nr_pages;
377 397
378 ring_buffer_init(rb, watermark, flags); 398 ring_buffer_init(rb, watermark, flags);
379 399
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a567c8c7ef31..f3569747d629 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -75,6 +75,15 @@ struct uprobe {
75 struct arch_uprobe arch; 75 struct arch_uprobe arch;
76}; 76};
77 77
78struct return_instance {
79 struct uprobe *uprobe;
80 unsigned long func;
81 unsigned long orig_ret_vaddr; /* original return address */
82 bool chained; /* true, if instance is nested */
83
84 struct return_instance *next; /* keep as stack */
85};
86
78/* 87/*
79 * valid_vma: Verify if the specified vma is an executable vma 88 * valid_vma: Verify if the specified vma is an executable vma
80 * Relax restrictions while unregistering: vm_flags might have 89 * Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
173 return *insn == UPROBE_SWBP_INSN; 182 return *insn == UPROBE_SWBP_INSN;
174} 183}
175 184
176static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) 185/**
186 * is_trap_insn - check if instruction is breakpoint instruction.
187 * @insn: instruction to be checked.
188 * Default implementation of is_trap_insn
189 * Returns true if @insn is a breakpoint instruction.
190 *
191 * This function is needed for the case where an architecture has multiple
192 * trap instructions (like powerpc).
193 */
194bool __weak is_trap_insn(uprobe_opcode_t *insn)
195{
196 return is_swbp_insn(insn);
197}
198
199static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
177{ 200{
178 void *kaddr = kmap_atomic(page); 201 void *kaddr = kmap_atomic(page);
179 memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); 202 memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
203 kunmap_atomic(kaddr);
204}
205
206static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
207{
208 void *kaddr = kmap_atomic(page);
209 memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
180 kunmap_atomic(kaddr); 210 kunmap_atomic(kaddr);
181} 211}
182 212
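The new copy_from_page()/copy_to_page() helpers reduce every kmap/memcpy pair to one call that masks the virtual address down to its offset within the page. A user-space sketch of that offset computation on a plain page-sized buffer (copy_to_buf() and the sample address are illustrative; kmap_atomic() has no equivalent outside the kernel):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* Copy @len bytes into a page-sized buffer at @vaddr's offset inside its page. */
static void copy_to_buf(char *page_buf, unsigned long vaddr,
			const void *src, int len)
{
	memcpy(page_buf + (vaddr & ~PAGE_MASK), src, len);
}

int main(void)
{
	static char page_buf[PAGE_SIZE];
	unsigned long vaddr = 0x7f0000003010UL;		/* arbitrary address */

	copy_to_buf(page_buf, vaddr, "\xcc", 1);	/* e.g. one opcode byte */
	printf("offset in page: %lu\n", vaddr & ~PAGE_MASK);	/* 16 */
	return 0;
}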
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
185 uprobe_opcode_t old_opcode; 215 uprobe_opcode_t old_opcode;
186 bool is_swbp; 216 bool is_swbp;
187 217
188 copy_opcode(page, vaddr, &old_opcode); 218 /*
219 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
220 * We do not check if it is any other 'trap variant' which could
221 * be conditional trap instruction such as the one powerpc supports.
222 *
223 * The logic is that we do not care if the underlying instruction
224 * is a trap variant; uprobes always wins over any other (gdb)
225 * breakpoint.
226 */
227 copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
189 is_swbp = is_swbp_insn(&old_opcode); 228 is_swbp = is_swbp_insn(&old_opcode);
190 229
191 if (is_swbp_insn(new_opcode)) { 230 if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
204 * Expect the breakpoint instruction to be the smallest size instruction for 243 * Expect the breakpoint instruction to be the smallest size instruction for
205 * the architecture. If an arch has variable length instruction and the 244 * the architecture. If an arch has variable length instruction and the
206 * breakpoint instruction is not of the smallest length instruction 245 * breakpoint instruction is not of the smallest length instruction
207 * supported by that architecture then we need to modify is_swbp_at_addr and 246 * supported by that architecture then we need to modify is_trap_at_addr and
208 * write_opcode accordingly. This would never be a problem for archs that 247 * write_opcode accordingly. This would never be a problem for archs that
209 * have fixed length instructions. 248 * have fixed length instructions.
210 */ 249 */
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
225 uprobe_opcode_t opcode) 264 uprobe_opcode_t opcode)
226{ 265{
227 struct page *old_page, *new_page; 266 struct page *old_page, *new_page;
228 void *vaddr_old, *vaddr_new;
229 struct vm_area_struct *vma; 267 struct vm_area_struct *vma;
230 int ret; 268 int ret;
231 269
@@ -246,15 +284,8 @@ retry:
246 284
247 __SetPageUptodate(new_page); 285 __SetPageUptodate(new_page);
248 286
249 /* copy the page now that we've got it stable */ 287 copy_highpage(new_page, old_page);
250 vaddr_old = kmap_atomic(old_page); 288 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
251 vaddr_new = kmap_atomic(new_page);
252
253 memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
254 memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
255
256 kunmap_atomic(vaddr_new);
257 kunmap_atomic(vaddr_old);
258 289
259 ret = anon_vma_prepare(vma); 290 ret = anon_vma_prepare(vma);
260 if (ret) 291 if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
477 unsigned long nbytes, loff_t offset) 508 unsigned long nbytes, loff_t offset)
478{ 509{
479 struct page *page; 510 struct page *page;
480 void *vaddr;
481 unsigned long off;
482 pgoff_t idx;
483
484 if (!filp)
485 return -EINVAL;
486 511
487 if (!mapping->a_ops->readpage) 512 if (!mapping->a_ops->readpage)
488 return -EIO; 513 return -EIO;
489
490 idx = offset >> PAGE_CACHE_SHIFT;
491 off = offset & ~PAGE_MASK;
492
493 /* 514 /*
494 * Ensure that the page that has the original instruction is 515 * Ensure that the page that has the original instruction is
495 * populated and in page-cache. 516 * populated and in page-cache.
496 */ 517 */
497 page = read_mapping_page(mapping, idx, filp); 518 page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
498 if (IS_ERR(page)) 519 if (IS_ERR(page))
499 return PTR_ERR(page); 520 return PTR_ERR(page);
500 521
501 vaddr = kmap_atomic(page); 522 copy_from_page(page, offset, insn, nbytes);
502 memcpy(insn, vaddr + off, nbytes);
503 kunmap_atomic(vaddr);
504 page_cache_release(page); 523 page_cache_release(page);
505 524
506 return 0; 525 return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
550 goto out; 569 goto out;
551 570
552 ret = -ENOTSUPP; 571 ret = -ENOTSUPP;
553 if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) 572 if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
554 goto out; 573 goto out;
555 574
556 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); 575 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
758 down_write(&mm->mmap_sem); 777 down_write(&mm->mmap_sem);
759 vma = find_vma(mm, info->vaddr); 778 vma = find_vma(mm, info->vaddr);
760 if (!vma || !valid_vma(vma, is_register) || 779 if (!vma || !valid_vma(vma, is_register) ||
761 vma->vm_file->f_mapping->host != uprobe->inode) 780 file_inode(vma->vm_file) != uprobe->inode)
762 goto unlock; 781 goto unlock;
763 782
764 if (vma->vm_start > info->vaddr || 783 if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
828 struct uprobe *uprobe; 847 struct uprobe *uprobe;
829 int ret; 848 int ret;
830 849
850 /* Uprobe must have at least one set consumer */
851 if (!uc->handler && !uc->ret_handler)
852 return -EINVAL;
853
831 /* Racy, just to catch the obvious mistakes */ 854 /* Racy, just to catch the obvious mistakes */
832 if (offset > i_size_read(inode)) 855 if (offset > i_size_read(inode))
833 return -EINVAL; 856 return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
917 loff_t offset; 940 loff_t offset;
918 941
919 if (!valid_vma(vma, false) || 942 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode) 943 file_inode(vma->vm_file) != uprobe->inode)
921 continue; 944 continue;
922 945
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; 946 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
1010 if (no_uprobe_events() || !valid_vma(vma, true)) 1033 if (no_uprobe_events() || !valid_vma(vma, true))
1011 return 0; 1034 return 0;
1012 1035
1013 inode = vma->vm_file->f_mapping->host; 1036 inode = file_inode(vma->vm_file);
1014 if (!inode) 1037 if (!inode)
1015 return 0; 1038 return 0;
1016 1039
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1041 struct inode *inode; 1064 struct inode *inode;
1042 struct rb_node *n; 1065 struct rb_node *n;
1043 1066
1044 inode = vma->vm_file->f_mapping->host; 1067 inode = file_inode(vma->vm_file);
1045 1068
1046 min = vaddr_to_offset(vma, start); 1069 min = vaddr_to_offset(vma, start);
1047 max = min + (end - start) - 1; 1070 max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
1114{ 1137{
1115 struct mm_struct *mm = current->mm; 1138 struct mm_struct *mm = current->mm;
1116 struct xol_area *area; 1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1117 1141
1118 area = mm->uprobes_state.xol_area; 1142 area = mm->uprobes_state.xol_area;
1119 if (area) 1143 if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
1131 if (!area->page) 1155 if (!area->page)
1132 goto free_bitmap; 1156 goto free_bitmap;
1133 1157
1158 /* allocate first slot of task's xol_area for the return probes */
1159 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1);
1134 init_waitqueue_head(&area->wq); 1162 init_waitqueue_head(&area->wq);
1163
1135 if (!xol_add_vma(area)) 1164 if (!xol_add_vma(area))
1136 return area; 1165 return area;
1137 1166
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe) 1245static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1217{ 1246{
1218 struct xol_area *area; 1247 struct xol_area *area;
1219 unsigned long offset;
1220 unsigned long xol_vaddr; 1248 unsigned long xol_vaddr;
1221 void *vaddr;
1222 1249
1223 area = get_xol_area(); 1250 area = get_xol_area();
1224 if (!area) 1251 if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1229 return 0; 1256 return 0;
1230 1257
1231 /* Initialize the slot */ 1258 /* Initialize the slot */
1232 offset = xol_vaddr & ~PAGE_MASK; 1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
1233 vaddr = kmap_atomic(area->page);
1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1235 kunmap_atomic(vaddr);
1236 /* 1260 /*
1237 * We probably need flush_icache_user_range() but it needs vma. 1261 * We probably need flush_icache_user_range() but it needs vma.
1238 * This should work on supported architectures too. 1262 * This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1298void uprobe_free_utask(struct task_struct *t) 1322void uprobe_free_utask(struct task_struct *t)
1299{ 1323{
1300 struct uprobe_task *utask = t->utask; 1324 struct uprobe_task *utask = t->utask;
1325 struct return_instance *ri, *tmp;
1301 1326
1302 if (!utask) 1327 if (!utask)
1303 return; 1328 return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
1305 if (utask->active_uprobe) 1330 if (utask->active_uprobe)
1306 put_uprobe(utask->active_uprobe); 1331 put_uprobe(utask->active_uprobe);
1307 1332
1333 ri = utask->return_instances;
1334 while (ri) {
1335 tmp = ri;
1336 ri = ri->next;
1337
1338 put_uprobe(tmp->uprobe);
1339 kfree(tmp);
1340 }
1341
1308 xol_free_insn_slot(t); 1342 xol_free_insn_slot(t);
1309 kfree(utask); 1343 kfree(utask);
1310 t->utask = NULL; 1344 t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
1333 return current->utask; 1367 return current->utask;
1334} 1368}
1335 1369
1370/*
1371 * Current area->vaddr notion assumes the trampoline address is always
1372 * equal to area->vaddr.
1373 *
1374 * Returns -1 in case the xol_area is not allocated.
1375 */
1376static unsigned long get_trampoline_vaddr(void)
1377{
1378 struct xol_area *area;
1379 unsigned long trampoline_vaddr = -1;
1380
1381 area = current->mm->uprobes_state.xol_area;
1382 smp_read_barrier_depends();
1383 if (area)
1384 trampoline_vaddr = area->vaddr;
1385
1386 return trampoline_vaddr;
1387}
1388
1389static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1390{
1391 struct return_instance *ri;
1392 struct uprobe_task *utask;
1393 unsigned long orig_ret_vaddr, trampoline_vaddr;
1394 bool chained = false;
1395
1396 if (!get_xol_area())
1397 return;
1398
1399 utask = get_utask();
1400 if (!utask)
1401 return;
1402
1403 if (utask->depth >= MAX_URETPROBE_DEPTH) {
1404 printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1405 " nestedness limit pid/tgid=%d/%d\n",
1406 current->pid, current->tgid);
1407 return;
1408 }
1409
1410 ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
1411 if (!ri)
1412 goto fail;
1413
1414 trampoline_vaddr = get_trampoline_vaddr();
1415 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1416 if (orig_ret_vaddr == -1)
1417 goto fail;
1418
1419 /*
1420 * We don't want to keep the trampoline address on the stack; rather, keep
1421 * the original return address of the first caller through all the
1422 * subsequent instances. This also makes breakpoint unwrapping easier.
1423 */
1424 if (orig_ret_vaddr == trampoline_vaddr) {
1425 if (!utask->return_instances) {
1426 /*
1427 * This situation is not possible. Likely we have an
1428 * attack from user-space.
1429 */
1430 pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
1431 current->pid, current->tgid);
1432 goto fail;
1433 }
1434
1435 chained = true;
1436 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1437 }
1438
1439 atomic_inc(&uprobe->ref);
1440 ri->uprobe = uprobe;
1441 ri->func = instruction_pointer(regs);
1442 ri->orig_ret_vaddr = orig_ret_vaddr;
1443 ri->chained = chained;
1444
1445 utask->depth++;
1446
1447 /* add instance to the stack */
1448 ri->next = utask->return_instances;
1449 utask->return_instances = ri;
1450
1451 return;
1452
1453 fail:
1454 kfree(ri);
1455}
1456
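
prepare_uretprobe() records each hijacked return address in a per-task stack of return_instance records. A sketch of the layout implied by the fields used here; the authoritative definition lives elsewhere in this file and may carry additional members:

struct return_instance {
	struct uprobe		*uprobe;	/* probe that owns this frame; a reference is held */
	unsigned long		func;		/* entry address of the probed function */
	unsigned long		orig_ret_vaddr;	/* return address displaced by the trampoline */
	bool			chained;	/* the displaced address was itself the trampoline */
	struct return_instance	*next;		/* older frames; utask->return_instances is the top */
};

The chained flag is what lets handle_trampoline() further down collapse nested hits back to the first caller's real return address.
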
1336/* Prepare to single-step probed instruction out of line. */ 1457/* Prepare to single-step probed instruction out of line. */
1337static int 1458static int
1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) 1459pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1431 clear_bit(MMF_HAS_UPROBES, &mm->flags); 1552 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1432} 1553}
1433 1554
1434static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) 1555static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1435{ 1556{
1436 struct page *page; 1557 struct page *page;
1437 uprobe_opcode_t opcode; 1558 uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
1449 if (result < 0) 1570 if (result < 0)
1450 return result; 1571 return result;
1451 1572
1452 copy_opcode(page, vaddr, &opcode); 1573 copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
1453 put_page(page); 1574 put_page(page);
1454 out: 1575 out:
1455 return is_swbp_insn(&opcode); 1576 /* This needs to return true for any variant of the trap insn */
1577 return is_trap_insn(&opcode);
1456} 1578}
1457 1579
1458static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) 1580static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1465 vma = find_vma(mm, bp_vaddr); 1587 vma = find_vma(mm, bp_vaddr);
1466 if (vma && vma->vm_start <= bp_vaddr) { 1588 if (vma && vma->vm_start <= bp_vaddr) {
1467 if (valid_vma(vma, false)) { 1589 if (valid_vma(vma, false)) {
1468 struct inode *inode = vma->vm_file->f_mapping->host; 1590 struct inode *inode = file_inode(vma->vm_file);
1469 loff_t offset = vaddr_to_offset(vma, bp_vaddr); 1591 loff_t offset = vaddr_to_offset(vma, bp_vaddr);
1470 1592
1471 uprobe = find_uprobe(inode, offset); 1593 uprobe = find_uprobe(inode, offset);
1472 } 1594 }
1473 1595
1474 if (!uprobe) 1596 if (!uprobe)
1475 *is_swbp = is_swbp_at_addr(mm, bp_vaddr); 1597 *is_swbp = is_trap_at_addr(mm, bp_vaddr);
1476 } else { 1598 } else {
1477 *is_swbp = -EFAULT; 1599 *is_swbp = -EFAULT;
1478 } 1600 }
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{ 1610{
1489 struct uprobe_consumer *uc; 1611 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE; 1612 int remove = UPROBE_HANDLER_REMOVE;
1613 bool need_prep = false; /* prepare return uprobe, when needed */
1491 1614
1492 down_read(&uprobe->register_rwsem); 1615 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) { 1616 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs); 1617 int rc = 0;
1618
1619 if (uc->handler) {
1620 rc = uc->handler(uc, regs);
1621 WARN(rc & ~UPROBE_HANDLER_MASK,
1622 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1623 }
1624
1625 if (uc->ret_handler)
1626 need_prep = true;
1495 1627
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc; 1628 remove &= rc;
1499 } 1629 }
1500 1630
1631 if (need_prep && !remove)
1632 prepare_uretprobe(uprobe, regs); /* put bp at return */
1633
1501 if (remove && uprobe->consumers) { 1634 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe)); 1635 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm); 1636 unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1505 up_read(&uprobe->register_rwsem); 1638 up_read(&uprobe->register_rwsem);
1506} 1639}
1507 1640
1641static void
1642handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1643{
1644 struct uprobe *uprobe = ri->uprobe;
1645 struct uprobe_consumer *uc;
1646
1647 down_read(&uprobe->register_rwsem);
1648 for (uc = uprobe->consumers; uc; uc = uc->next) {
1649 if (uc->ret_handler)
1650 uc->ret_handler(uc, ri->func, regs);
1651 }
1652 up_read(&uprobe->register_rwsem);
1653}
1654
1655static bool handle_trampoline(struct pt_regs *regs)
1656{
1657 struct uprobe_task *utask;
1658 struct return_instance *ri, *tmp;
1659 bool chained;
1660
1661 utask = current->utask;
1662 if (!utask)
1663 return false;
1664
1665 ri = utask->return_instances;
1666 if (!ri)
1667 return false;
1668
1669 /*
1670 * TODO: we should throw out return_instance's invalidated by
1671 * longjmp(), currently we assume that the probed function always
1672 * returns.
1673 */
1674 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1675
1676 for (;;) {
1677 handle_uretprobe_chain(ri, regs);
1678
1679 chained = ri->chained;
1680 put_uprobe(ri->uprobe);
1681
1682 tmp = ri;
1683 ri = ri->next;
1684 kfree(tmp);
1685
1686 if (!chained)
1687 break;
1688
1689 utask->depth--;
1690
1691 BUG_ON(!ri);
1692 }
1693
1694 utask->return_instances = ri;
1695
1696 return true;
1697}
1698
1508/* 1699/*
1509 * Run handler and ask thread to singlestep. 1700 * Run handler and ask thread to singlestep.
1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1701 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
1516 int uninitialized_var(is_swbp); 1707 int uninitialized_var(is_swbp);
1517 1708
1518 bp_vaddr = uprobe_get_swbp_addr(regs); 1709 bp_vaddr = uprobe_get_swbp_addr(regs);
1519 uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 1710 if (bp_vaddr == get_trampoline_vaddr()) {
1711 if (handle_trampoline(regs))
1712 return;
1713
1714 pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1715 current->pid, current->tgid);
1716 }
1520 1717
1718 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1521 if (!uprobe) { 1719 if (!uprobe) {
1522 if (is_swbp > 0) { 1720 if (is_swbp > 0) {
1523 /* No matching uprobe; signal SIGTRAP. */ 1721 /* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
1616 */ 1814 */
1617int uprobe_pre_sstep_notifier(struct pt_regs *regs) 1815int uprobe_pre_sstep_notifier(struct pt_regs *regs)
1618{ 1816{
1619 if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags)) 1817 if (!current->mm)
1818 return 0;
1819
1820 if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
1821 (!current->utask || !current->utask->return_instances))
1620 return 0; 1822 return 0;
1621 1823
1622 set_thread_flag(TIF_UPROBE); 1824 set_thread_flag(TIF_UPROBE);
diff --git a/kernel/exit.c b/kernel/exit.c
index 51e485ca9935..af2eb3cbd499 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
835 /* 835 /*
836 * Make sure we are holding no locks: 836 * Make sure we are holding no locks:
837 */ 837 */
838 debug_check_no_locks_held(); 838 debug_check_no_locks_held(tsk);
839 /* 839 /*
840 * We can do this unlocked here. The futex code uses this flag 840 * We can do this unlocked here. The futex code uses this flag
841 * just to verify whether the pi state cleanup has been done 841 * just to verify whether the pi state cleanup has been done
@@ -847,7 +847,7 @@ void do_exit(long code)
847 exit_io_context(tsk); 847 exit_io_context(tsk);
848 848
849 if (tsk->splice_pipe) 849 if (tsk->splice_pipe)
850 __free_pipe_info(tsk->splice_pipe); 850 free_pipe_info(tsk->splice_pipe);
851 851
852 if (tsk->task_frag.page) 852 if (tsk->task_frag.page)
853 put_page(tsk->task_frag.page); 853 put_page(tsk->task_frag.page);
@@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1629 } 1629 }
1630 1630
1631 put_pid(pid); 1631 put_pid(pid);
1632
1633 /* avoid REGPARM breakage on x86: */
1634 asmlinkage_protect(5, ret, which, upid, infop, options, ru);
1635 return ret; 1632 return ret;
1636} 1633}
1637 1634
@@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1669 ret = do_wait(&wo); 1666 ret = do_wait(&wo);
1670 put_pid(pid); 1667 put_pid(pid);
1671 1668
1672 /* avoid REGPARM breakage on x86: */
1673 asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
1674 return ret; 1669 return ret;
1675} 1670}
1676 1671
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf76..67460b93b1a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1;
41/* Sort the kernel's built-in exception table */ 41/* Sort the kernel's built-in exception table */
42void __init sort_main_extable(void) 42void __init sort_main_extable(void)
43{ 43{
44 if (main_extable_sort_needed) 44 if (main_extable_sort_needed) {
45 pr_notice("Sorting __ex_table...\n");
45 sort_extable(__start___ex_table, __stop___ex_table); 46 sort_extable(__start___ex_table, __stop___ex_table);
46 else 47 }
47 pr_notice("__ex_table already sorted, skipping sort\n");
48} 48}
49 49
50/* Given an address, look for it in the exception tables. */ 50/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 8d932b1c9056..7d40687b1434 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1141 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1142 return ERR_PTR(-EINVAL); 1142 return ERR_PTR(-EINVAL);
1143 1143
1144 if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
1145 return ERR_PTR(-EINVAL);
1146
1144 /* 1147 /*
1145 * Thread groups must share signals as well, and detached threads 1148 * Thread groups must share signals as well, and detached threads
1146 * can only be started up within the thread group. 1149 * can only be started up within the thread group.
@@ -1230,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1230 1233
1231 p->utime = p->stime = p->gtime = 0; 1234 p->utime = p->stime = p->gtime = 0;
1232 p->utimescaled = p->stimescaled = 0; 1235 p->utimescaled = p->stimescaled = 0;
1233#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1234 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1235#endif 1238#endif
1236#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1674,10 +1677,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
1674 int, tls_val) 1677 int, tls_val)
1675#endif 1678#endif
1676{ 1679{
1677 long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); 1680 return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
1678 asmlinkage_protect(5, ret, clone_flags, newsp,
1679 parent_tidptr, child_tidptr, tls_val);
1680 return ret;
1681} 1681}
1682#endif 1682#endif
1683 1683
@@ -1807,7 +1807,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1807 * If unsharing a user namespace must also unshare the thread. 1807 * If unsharing a user namespace must also unshare the thread.
1808 */ 1808 */
1809 if (unshare_flags & CLONE_NEWUSER) 1809 if (unshare_flags & CLONE_NEWUSER)
1810 unshare_flags |= CLONE_THREAD; 1810 unshare_flags |= CLONE_THREAD | CLONE_FS;
1811 /* 1811 /*
1812 * If unsharing a pid namespace must also unshare the thread. 1812 * If unsharing a pid namespace must also unshare the thread.
1813 */ 1813 */
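
The two fork.c hunks above mean a task can no longer enter a new user namespace while still sharing its fs_struct: clone() rejects the combination outright, and unshare(CLONE_NEWUSER) now implies CLONE_FS (and CLONE_THREAD), so the fs_struct is unshared along with the user namespace. A small, hypothetical user-space demonstration of the clone() side, not part of the patch:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);

	if (!stack)
		return 1;

	/*
	 * With this patch, copy_process() returns -EINVAL for
	 * CLONE_NEWUSER | CLONE_FS, so this clone() is expected to fail.
	 * (stack + size is the stack top, assuming it grows downwards.)
	 */
	if (clone(child_fn, stack + 64 * 1024,
		  CLONE_NEWUSER | CLONE_FS | SIGCHLD, NULL) == -1)
		printf("clone: %s (expected EINVAL)\n", strerror(errno));

	free(stack);
	return 0;
}
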
diff --git a/kernel/futex.c b/kernel/futex.c
index f0090a993dab..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)
223 * @rw: mapping needs to be read/write (values: VERIFY_READ, 223 * @rw: mapping needs to be read/write (values: VERIFY_READ,
224 * VERIFY_WRITE) 224 * VERIFY_WRITE)
225 * 225 *
226 * Returns a negative error code or 0 226 * Return: a negative error code or 0
227 *
227 * The key words are stored in *key on success. 228 * The key words are stored in *key on success.
228 * 229 *
229 * For shared mappings, it's (page->index, file_inode(vma->vm_file), 230 * For shared mappings, it's (page->index, file_inode(vma->vm_file),
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
705 * be "current" except in the case of requeue pi. 706 * be "current" except in the case of requeue pi.
706 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) 707 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
707 * 708 *
708 * Returns: 709 * Return:
709 * 0 - ready to wait 710 * 0 - ready to wait;
710 * 1 - acquired the lock 711 * 1 - acquired the lock;
711 * <0 - error 712 * <0 - error
712 * 713 *
713 * The hb->lock and futex_key refs shall be held by the caller. 714 * The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1191 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. 1192 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1192 * hb1 and hb2 must be held by the caller. 1193 * hb1 and hb2 must be held by the caller.
1193 * 1194 *
1194 * Returns: 1195 * Return:
1195 * 0 - failed to acquire the lock atomicly 1196 * 0 - failed to acquire the lock atomically;
1196 * 1 - acquired the lock 1197 * 1 - acquired the lock;
1197 * <0 - error 1198 * <0 - error
1198 */ 1199 */
1199static int futex_proxy_trylock_atomic(u32 __user *pifutex, 1200static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1254 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1255 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1255 * uaddr2 atomically on behalf of the top waiter. 1256 * uaddr2 atomically on behalf of the top waiter.
1256 * 1257 *
1257 * Returns: 1258 * Return:
1258 * >=0 - on success, the number of tasks requeued or woken 1259 * >=0 - on success, the number of tasks requeued or woken;
1259 * <0 - on error 1260 * <0 - on error
1260 */ 1261 */
1261static int futex_requeue(u32 __user *uaddr1, unsigned int flags, 1262static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1536 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must 1537 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1537 * be paired with exactly one earlier call to queue_me(). 1538 * be paired with exactly one earlier call to queue_me().
1538 * 1539 *
1539 * Returns: 1540 * Return:
1540 1 - if the futex_q was still queued (and we unqueued it) 1541 1 - if the futex_q was still queued (and we unqueued it);
1541 * 0 - if the futex_q was already removed by the waking thread 1542 * 0 - if the futex_q was already removed by the waking thread
1542 */ 1543 */
1543static int unqueue_me(struct futex_q *q) 1544static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
1707 * the pi_state owner as well as handle race conditions that may allow us to 1708 * the pi_state owner as well as handle race conditions that may allow us to
1708 * acquire the lock. Must be called with the hb lock held. 1709 * acquire the lock. Must be called with the hb lock held.
1709 * 1710 *
1710 * Returns: 1711 * Return:
1711 * 1 - success, lock taken 1712 * 1 - success, lock taken;
1712 * 0 - success, lock not taken 1713 * 0 - success, lock not taken;
1713 * <0 - on error (-EFAULT) 1714 * <0 - on error (-EFAULT)
1714 */ 1715 */
1715static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) 1716static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1824 * Return with the hb lock held and a q.key reference on success, and unlocked 1825 * Return with the hb lock held and a q.key reference on success, and unlocked
1825 * with no q.key reference on failure. 1826 * with no q.key reference on failure.
1826 * 1827 *
1827 * Returns: 1828 * Return:
1828 * 0 - uaddr contains val and hb has been locked 1829 * 0 - uaddr contains val and hb has been locked;
1829 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked 1830 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1830 */ 1831 */
1831static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1832static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
2203 * the wakeup and return the appropriate error code to the caller. Must be 2204 * the wakeup and return the appropriate error code to the caller. Must be
2204 * called with the hb lock held. 2205 * called with the hb lock held.
2205 * 2206 *
2206 * Returns 2207 * Return:
2207 * 0 - no early wakeup detected 2208 * 0 = no early wakeup detected;
2208 * <0 - -ETIMEDOUT or -ERESTARTNOINTR 2209 * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2209 */ 2210 */
2210static inline 2211static inline
2211int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, 2212int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2247 * @val: the expected value of uaddr 2248 * @val: the expected value of uaddr
2248 * @abs_time: absolute timeout 2249 * @abs_time: absolute timeout
2249 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2250 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2250 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2251 * @uaddr2: the pi futex we will take prior to returning to user-space 2251 * @uaddr2: the pi futex we will take prior to returning to user-space
2252 * 2252 *
2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to 2253 * The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2258 * there was a need to. 2258 * there was a need to.
2259 * 2259 *
2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2260 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2261 * via the following: 2261 * via the following--
2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2262 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2263 * 2) wakeup on uaddr2 after a requeue 2263 * 2) wakeup on uaddr2 after a requeue
2264 * 3) signal 2264 * 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2276 * 2276 *
2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2277 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2278 * 2278 *
2279 * Returns: 2279 * Return:
2280 * 0 - On success 2280 * 0 - On success;
2281 * <0 - On error 2281 * <0 - On error
2282 */ 2282 */
2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, 2283static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3feb..fd4b13b131f8 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = 63DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
64{ 64{
65 65
66 .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
66 .clock_base = 67 .clock_base =
67 { 68 {
68 { 69 {
@@ -83,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
83 .get_time = &ktime_get_boottime, 84 .get_time = &ktime_get_boottime,
84 .resolution = KTIME_LOW_RES, 85 .resolution = KTIME_LOW_RES,
85 }, 86 },
87 {
88 .index = HRTIMER_BASE_TAI,
89 .clockid = CLOCK_TAI,
90 .get_time = &ktime_get_clocktai,
91 .resolution = KTIME_LOW_RES,
92 },
86 } 93 }
87}; 94};
88 95
@@ -90,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
90 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, 97 [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
91 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, 98 [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
92 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, 99 [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
100 [CLOCK_TAI] = HRTIMER_BASE_TAI,
93}; 101};
94 102
95static inline int hrtimer_clockid_to_base(clockid_t clock_id) 103static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -106,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
106{ 114{
107 ktime_t xtim, mono, boot; 115 ktime_t xtim, mono, boot;
108 struct timespec xts, tom, slp; 116 struct timespec xts, tom, slp;
117 s32 tai_offset;
109 118
110 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); 119 get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
120 tai_offset = timekeeping_get_tai_offset();
111 121
112 xtim = timespec_to_ktime(xts); 122 xtim = timespec_to_ktime(xts);
113 mono = ktime_add(xtim, timespec_to_ktime(tom)); 123 mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -115,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
115 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; 125 base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
116 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; 126 base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
117 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; 127 base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
128 base->clock_base[HRTIMER_BASE_TAI].softirq_time =
129 ktime_add(xtim, ktime_set(tai_offset, 0));
118} 130}
119 131
120/* 132/*
@@ -160,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
160 */ 172 */
161static int hrtimer_get_target(int this_cpu, int pinned) 173static int hrtimer_get_target(int this_cpu, int pinned)
162{ 174{
163#ifdef CONFIG_NO_HZ 175#ifdef CONFIG_NO_HZ_COMMON
164 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) 176 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
165 return get_nohz_timer_target(); 177 return get_nohz_timer_target();
166#endif 178#endif
@@ -275,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
275 } else { 287 } else {
276 unsigned long rem = do_div(nsec, NSEC_PER_SEC); 288 unsigned long rem = do_div(nsec, NSEC_PER_SEC);
277 289
290 /* Make sure nsec fits into long */
291 if (unlikely(nsec > KTIME_SEC_MAX))
292 return (ktime_t){ .tv64 = KTIME_MAX };
293
278 tmp = ktime_set((long)nsec, rem); 294 tmp = ktime_set((long)nsec, rem);
279 } 295 }
280 296
@@ -651,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
651{ 667{
652 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; 668 ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
653 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; 669 ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
670 ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
654 671
655 return ktime_get_update_offsets(offs_real, offs_boot); 672 return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
656} 673}
657 674
658/* 675/*
@@ -1010,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1010 * @timer: the timer to be added 1027 * @timer: the timer to be added
1011 * @tim: expiry time 1028 * @tim: expiry time
1012 * @delta_ns: "slack" range for the timer 1029 * @delta_ns: "slack" range for the timer
1013 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1030 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1031 * relative (HRTIMER_MODE_REL)
1014 * 1032 *
1015 * Returns: 1033 * Returns:
1016 * 0 on success 1034 * 0 on success
@@ -1027,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1027 * hrtimer_start - (re)start an hrtimer on the current CPU 1045 * hrtimer_start - (re)start an hrtimer on the current CPU
1028 * @timer: the timer to be added 1046 * @timer: the timer to be added
1029 * @tim: expiry time 1047 * @tim: expiry time
1030 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 1048 * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
1049 * relative (HRTIMER_MODE_REL)
1031 * 1050 *
1032 * Returns: 1051 * Returns:
1033 * 0 on success 1052 * 0 on success
@@ -1106,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1106} 1125}
1107EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 1126EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1108 1127
1109#ifdef CONFIG_NO_HZ 1128#ifdef CONFIG_NO_HZ_COMMON
1110/** 1129/**
1111 * hrtimer_get_next_event - get the time until next expiry event 1130 * hrtimer_get_next_event - get the time until next expiry event
1112 * 1131 *
@@ -1309,6 +1328,8 @@ retry:
1309 1328
1310 expires = ktime_sub(hrtimer_get_expires(timer), 1329 expires = ktime_sub(hrtimer_get_expires(timer),
1311 base->offset); 1330 base->offset);
1331 if (expires.tv64 < 0)
1332 expires.tv64 = KTIME_MAX;
1312 if (expires.tv64 < expires_next.tv64) 1333 if (expires.tv64 < expires_next.tv64)
1313 expires_next = expires; 1334 expires_next = expires;
1314 break; 1335 break;
@@ -1642,8 +1663,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1642 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1663 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1643 int i; 1664 int i;
1644 1665
1645 raw_spin_lock_init(&cpu_base->lock);
1646
1647 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1666 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1648 cpu_base->clock_base[i].cpu_base = cpu_base; 1667 cpu_base->clock_base[i].cpu_base = cpu_base;
1649 timerqueue_init_head(&cpu_base->clock_base[i].active); 1668 timerqueue_init_head(&cpu_base->clock_base[i].active);
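
With the new HRTIMER_BASE_TAI base and the CLOCK_TAI entry in the clock-to-base table, kernel code can arm an hrtimer against TAI like any other clock. A minimal hypothetical sketch; the callback and the one-second expiry are made up:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer tai_timer;

static enum hrtimer_restart tai_timer_fn(struct hrtimer *t)
{
	/* fires on a TAI timeline, which does not step across leap seconds */
	return HRTIMER_NORESTART;
}

static void arm_tai_timer(void)
{
	hrtimer_init(&tai_timer, CLOCK_TAI, HRTIMER_MODE_REL);
	tai_timer.function = tai_timer_fn;
	hrtimer_start(&tai_timer, ktime_set(1, 0), HRTIMER_MODE_REL);
}

The offs_tai offset added to hrtimer_update_base() is what keeps such timers correct when the TAI offset is changed at run time.
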
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c379..5a83dde8ca0c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
462 if (domain->ops->map) { 462 if (domain->ops->map) {
463 ret = domain->ops->map(domain, virq, hwirq); 463 ret = domain->ops->map(domain, virq, hwirq);
464 if (ret != 0) { 464 if (ret != 0) {
465 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n", 465 /*
466 virq, hwirq, ret); 466 * If map() returns -EPERM, this interrupt is protected
467 WARN_ON(1); 467 * by the firmware or some other service and shall not
468 * be mapped.
469 *
470 * Since on some platforms we blindly try to map everything
471 * we end up with a log full of backtraces.
472 *
473 * So instead, we silently fail on -EPERM, it is the
474 * responsibility of the PIC driver to display a relevant
475 * message if needed.
476 */
477 if (ret != -EPERM) {
478 pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
479 virq, hwirq, ret);
480 WARN_ON(1);
481 }
468 irq_data->domain = NULL; 482 irq_data->domain = NULL;
469 irq_data->hwirq = 0; 483 irq_data->hwirq = 0;
470 goto err_unmap; 484 goto err_unmap;
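
The -EPERM special case above gives ->map() implementations a quiet way to refuse individual hwirqs. A hedged sketch of a PIC driver using it; the my_pic_* names and the "owned by firmware" test are hypothetical:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_chip my_pic_chip = {
	.name = "my-pic",	/* hypothetical chip, callbacks omitted */
};

static bool my_pic_hwirq_is_secure(irq_hw_number_t hwirq)
{
	return hwirq < 8;	/* hypothetical: first 8 lines owned by firmware */
}

static int my_pic_irq_map(struct irq_domain *d, unsigned int virq,
			  irq_hw_number_t hwirq)
{
	if (my_pic_hwirq_is_secure(hwirq))
		return -EPERM;	/* skipped silently by irq_domain_associate_many() */

	irq_set_chip_and_handler(virq, &my_pic_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops my_pic_domain_ops = {
	.map	= my_pic_irq_map,
	.xlate	= irq_domain_xlate_onecell,
};

Any other error code still takes the pr_err()/WARN_ON() path, so -EPERM is strictly a "not for Linux" signal rather than a general error return.
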
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 397db02209ed..19ed5c425c3b 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
76static ssize_t write_irq_affinity(int type, struct file *file, 76static ssize_t write_irq_affinity(int type, struct file *file,
77 const char __user *buffer, size_t count, loff_t *pos) 77 const char __user *buffer, size_t count, loff_t *pos)
78{ 78{
79 unsigned int irq = (int)(long)PDE(file_inode(file))->data; 79 unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
80 cpumask_var_t new_value; 80 cpumask_var_t new_value;
81 int err; 81 int err;
82 82
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
131 131
132static int irq_affinity_proc_open(struct inode *inode, struct file *file) 132static int irq_affinity_proc_open(struct inode *inode, struct file *file)
133{ 133{
134 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 134 return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
135} 135}
136 136
137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) 137static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
138{ 138{
139 return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); 139 return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
140} 140}
141 141
142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) 142static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
143{ 143{
144 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); 144 return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
145} 145}
146 146
147static const struct file_operations irq_affinity_proc_fops = { 147static const struct file_operations irq_affinity_proc_fops = {
@@ -212,7 +212,7 @@ out:
212 212
213static int default_affinity_open(struct inode *inode, struct file *file) 213static int default_affinity_open(struct inode *inode, struct file *file)
214{ 214{
215 return single_open(file, default_affinity_show, PDE(inode)->data); 215 return single_open(file, default_affinity_show, PDE_DATA(inode));
216} 216}
217 217
218static const struct file_operations default_affinity_proc_fops = { 218static const struct file_operations default_affinity_proc_fops = {
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
233 233
234static int irq_node_proc_open(struct inode *inode, struct file *file) 234static int irq_node_proc_open(struct inode *inode, struct file *file)
235{ 235{
236 return single_open(file, irq_node_proc_show, PDE(inode)->data); 236 return single_open(file, irq_node_proc_show, PDE_DATA(inode));
237} 237}
238 238
239static const struct file_operations irq_node_proc_fops = { 239static const struct file_operations irq_node_proc_fops = {
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
256 256
257static int irq_spurious_proc_open(struct inode *inode, struct file *file) 257static int irq_spurious_proc_open(struct inode *inode, struct file *file)
258{ 258{
259 return single_open(file, irq_spurious_proc_show, PDE(inode)->data); 259 return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
260} 260}
261 261
262static const struct file_operations irq_spurious_proc_fops = { 262static const struct file_operations irq_spurious_proc_fops = {
@@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
366 366
367void unregister_handler_proc(unsigned int irq, struct irqaction *action) 367void unregister_handler_proc(unsigned int irq, struct irqaction *action)
368{ 368{
369 if (action->dir) { 369 proc_remove(action->dir);
370 struct irq_desc *desc = irq_to_desc(irq);
371
372 remove_proc_entry(action->dir->name, desc->dir);
373 }
374} 370}
375 371
376static void register_default_affinity_proc(void) 372static void register_default_affinity_proc(void)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba529..3127ad52cdb2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr)
84 84
85/* 85/*
86 * Expand a compressed symbol data into the resulting uncompressed string, 86 * Expand a compressed symbol data into the resulting uncompressed string,
87 * if uncompressed string is too long (>= maxlen), it will be truncated,
87 * given the offset to where the symbol is in the compressed stream. 88 * given the offset to where the symbol is in the compressed stream.
88 */ 89 */
89static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 90static unsigned int kallsyms_expand_symbol(unsigned int off,
91 char *result, size_t maxlen)
90{ 92{
91 int len, skipped_first = 0; 93 int len, skipped_first = 0;
92 const u8 *tptr, *data; 94 const u8 *tptr, *data;
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
113 115
114 while (*tptr) { 116 while (*tptr) {
115 if (skipped_first) { 117 if (skipped_first) {
118 if (maxlen <= 1)
119 goto tail;
116 *result = *tptr; 120 *result = *tptr;
117 result++; 121 result++;
122 maxlen--;
118 } else 123 } else
119 skipped_first = 1; 124 skipped_first = 1;
120 tptr++; 125 tptr++;
121 } 126 }
122 } 127 }
123 128
124 *result = '\0'; 129tail:
130 if (maxlen)
131 *result = '\0';
125 132
126 /* Return to offset to the next symbol. */ 133 /* Return to offset to the next symbol. */
127 return off; 134 return off;
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name)
176 unsigned int off; 183 unsigned int off;
177 184
178 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 185 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
179 off = kallsyms_expand_symbol(off, namebuf); 186 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
180 187
181 if (strcmp(namebuf, name) == 0) 188 if (strcmp(namebuf, name) == 0)
182 return kallsyms_addresses[i]; 189 return kallsyms_addresses[i];
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
195 int ret; 202 int ret;
196 203
197 for (i = 0, off = 0; i < kallsyms_num_syms; i++) { 204 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
198 off = kallsyms_expand_symbol(off, namebuf); 205 off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
199 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); 206 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
200 if (ret != 0) 207 if (ret != 0)
201 return ret; 208 return ret;
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr,
294 301
295 pos = get_symbol_pos(addr, symbolsize, offset); 302 pos = get_symbol_pos(addr, symbolsize, offset);
296 /* Grab name */ 303 /* Grab name */
297 kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); 304 kallsyms_expand_symbol(get_symbol_offset(pos),
305 namebuf, KSYM_NAME_LEN);
298 if (modname) 306 if (modname)
299 *modname = NULL; 307 *modname = NULL;
300 return namebuf; 308 return namebuf;
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
315 323
316 pos = get_symbol_pos(addr, NULL, NULL); 324 pos = get_symbol_pos(addr, NULL, NULL);
317 /* Grab name */ 325 /* Grab name */
318 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 326 kallsyms_expand_symbol(get_symbol_offset(pos),
327 symname, KSYM_NAME_LEN);
319 return 0; 328 return 0;
320 } 329 }
321 /* See if it's in a module. */ 330 /* See if it's in a module. */
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
333 342
334 pos = get_symbol_pos(addr, size, offset); 343 pos = get_symbol_pos(addr, size, offset);
335 /* Grab name */ 344 /* Grab name */
336 kallsyms_expand_symbol(get_symbol_offset(pos), name); 345 kallsyms_expand_symbol(get_symbol_offset(pos),
346 name, KSYM_NAME_LEN);
337 modname[0] = '\0'; 347 modname[0] = '\0';
338 return 0; 348 return 0;
339 } 349 }
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
463 473
464 iter->type = kallsyms_get_symbol_type(off); 474 iter->type = kallsyms_get_symbol_type(off);
465 475
466 off = kallsyms_expand_symbol(off, iter->name); 476 off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
467 477
468 return off - iter->nameoff; 478 return off - iter->nameoff;
469} 479}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b6..59f7b55ba745 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
56}; 56};
57struct resource crashk_low_res = { 57struct resource crashk_low_res = {
58 .name = "Crash kernel low", 58 .name = "Crash kernel",
59 .start = 0, 59 .start = 0,
60 .end = 0, 60 .end = 0,
61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image,
786 struct kexec_segment *segment) 786 struct kexec_segment *segment)
787{ 787{
788 unsigned long maddr; 788 unsigned long maddr;
789 unsigned long ubytes, mbytes; 789 size_t ubytes, mbytes;
790 int result; 790 int result;
791 unsigned char __user *buf; 791 unsigned char __user *buf;
792 792
@@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image,
819 /* Start with a clear page */ 819 /* Start with a clear page */
820 clear_page(ptr); 820 clear_page(ptr);
821 ptr += maddr & ~PAGE_MASK; 821 ptr += maddr & ~PAGE_MASK;
822 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 822 mchunk = min_t(size_t, mbytes,
823 if (mchunk > mbytes) 823 PAGE_SIZE - (maddr & ~PAGE_MASK));
824 mchunk = mbytes; 824 uchunk = min(ubytes, mchunk);
825
826 uchunk = mchunk;
827 if (uchunk > ubytes)
828 uchunk = ubytes;
829 825
830 result = copy_from_user(ptr, buf, uchunk); 826 result = copy_from_user(ptr, buf, uchunk);
831 kunmap(page); 827 kunmap(page);
@@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
850 * We do things a page at a time for the sake of kmap. 846 * We do things a page at a time for the sake of kmap.
851 */ 847 */
852 unsigned long maddr; 848 unsigned long maddr;
853 unsigned long ubytes, mbytes; 849 size_t ubytes, mbytes;
854 int result; 850 int result;
855 unsigned char __user *buf; 851 unsigned char __user *buf;
856 852
@@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image,
871 } 867 }
872 ptr = kmap(page); 868 ptr = kmap(page);
873 ptr += maddr & ~PAGE_MASK; 869 ptr += maddr & ~PAGE_MASK;
874 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); 870 mchunk = min_t(size_t, mbytes,
875 if (mchunk > mbytes) 871 PAGE_SIZE - (maddr & ~PAGE_MASK));
876 mchunk = mbytes; 872 uchunk = min(ubytes, mchunk);
877 873 if (mchunk > uchunk) {
878 uchunk = mchunk;
879 if (uchunk > ubytes) {
880 uchunk = ubytes;
881 /* Zero the trailing part of the page */ 874 /* Zero the trailing part of the page */
882 memset(ptr + uchunk, 0, mchunk - uchunk); 875 memset(ptr + uchunk, 0, mchunk - uchunk);
883 } 876 }
@@ -1118,12 +1111,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
1118{ 1111{
1119 unsigned long addr; 1112 unsigned long addr;
1120 1113
1121 for (addr = begin; addr < end; addr += PAGE_SIZE) { 1114 for (addr = begin; addr < end; addr += PAGE_SIZE)
1122 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); 1115 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1123 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1124 free_page((unsigned long)__va(addr));
1125 totalram_pages++;
1126 }
1127} 1116}
1128 1117
1129int crash_shrink_memory(unsigned long new_size) 1118int crash_shrink_memory(unsigned long new_size)
@@ -1368,35 +1357,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
1368 return 0; 1357 return 0;
1369} 1358}
1370 1359
1360#define SUFFIX_HIGH 0
1361#define SUFFIX_LOW 1
1362#define SUFFIX_NULL 2
1363static __initdata char *suffix_tbl[] = {
1364 [SUFFIX_HIGH] = ",high",
1365 [SUFFIX_LOW] = ",low",
1366 [SUFFIX_NULL] = NULL,
1367};
1368
1371/* 1369/*
1372 * That function is the entry point for command line parsing and should be 1370 * That function parses "suffix" crashkernel command lines like
1373 * called from the arch-specific code. 1371 *
1372 * crashkernel=size,[high|low]
1373 *
1374 * It returns 0 on success and -EINVAL on failure.
1374 */ 1375 */
1376static int __init parse_crashkernel_suffix(char *cmdline,
1377 unsigned long long *crash_size,
1378 unsigned long long *crash_base,
1379 const char *suffix)
1380{
1381 char *cur = cmdline;
1382
1383 *crash_size = memparse(cmdline, &cur);
1384 if (cmdline == cur) {
1385 pr_warn("crashkernel: memory value expected\n");
1386 return -EINVAL;
1387 }
1388
1389 /* check with suffix */
1390 if (strncmp(cur, suffix, strlen(suffix))) {
1391 pr_warn("crashkernel: unrecognized char\n");
1392 return -EINVAL;
1393 }
1394 cur += strlen(suffix);
1395 if (*cur != ' ' && *cur != '\0') {
1396 pr_warn("crashkernel: unrecognized char\n");
1397 return -EINVAL;
1398 }
1399
1400 return 0;
1401}
1402
1403static __init char *get_last_crashkernel(char *cmdline,
1404 const char *name,
1405 const char *suffix)
1406{
1407 char *p = cmdline, *ck_cmdline = NULL;
1408
1409 /* find crashkernel and use the last one if there are more */
1410 p = strstr(p, name);
1411 while (p) {
1412 char *end_p = strchr(p, ' ');
1413 char *q;
1414
1415 if (!end_p)
1416 end_p = p + strlen(p);
1417
1418 if (!suffix) {
1419 int i;
1420
1421 /* skip the one with any known suffix */
1422 for (i = 0; suffix_tbl[i]; i++) {
1423 q = end_p - strlen(suffix_tbl[i]);
1424 if (!strncmp(q, suffix_tbl[i],
1425 strlen(suffix_tbl[i])))
1426 goto next;
1427 }
1428 ck_cmdline = p;
1429 } else {
1430 q = end_p - strlen(suffix);
1431 if (!strncmp(q, suffix, strlen(suffix)))
1432 ck_cmdline = p;
1433 }
1434next:
1435 p = strstr(p+1, name);
1436 }
1437
1438 if (!ck_cmdline)
1439 return NULL;
1440
1441 return ck_cmdline;
1442}
1443
1375static int __init __parse_crashkernel(char *cmdline, 1444static int __init __parse_crashkernel(char *cmdline,
1376 unsigned long long system_ram, 1445 unsigned long long system_ram,
1377 unsigned long long *crash_size, 1446 unsigned long long *crash_size,
1378 unsigned long long *crash_base, 1447 unsigned long long *crash_base,
1379 const char *name) 1448 const char *name,
1449 const char *suffix)
1380{ 1450{
1381 char *p = cmdline, *ck_cmdline = NULL;
1382 char *first_colon, *first_space; 1451 char *first_colon, *first_space;
1452 char *ck_cmdline;
1383 1453
1384 BUG_ON(!crash_size || !crash_base); 1454 BUG_ON(!crash_size || !crash_base);
1385 *crash_size = 0; 1455 *crash_size = 0;
1386 *crash_base = 0; 1456 *crash_base = 0;
1387 1457
1388 /* find crashkernel and use the last one if there are more */ 1458 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1389 p = strstr(p, name);
1390 while (p) {
1391 ck_cmdline = p;
1392 p = strstr(p+1, name);
1393 }
1394 1459
1395 if (!ck_cmdline) 1460 if (!ck_cmdline)
1396 return -EINVAL; 1461 return -EINVAL;
1397 1462
1398 ck_cmdline += strlen(name); 1463 ck_cmdline += strlen(name);
1399 1464
1465 if (suffix)
1466 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1467 crash_base, suffix);
1400 /* 1468 /*
1401 * if the commandline contains a ':', then that's the extended 1469 * if the commandline contains a ':', then that's the extended
1402 * syntax -- if not, it must be the classic syntax 1470 * syntax -- if not, it must be the classic syntax
@@ -1413,13 +1481,26 @@ static int __init __parse_crashkernel(char *cmdline,
1413 return 0; 1481 return 0;
1414} 1482}
1415 1483
1484/*
1485 * That function is the entry point for command line parsing and should be
1486 * called from the arch-specific code.
1487 */
1416int __init parse_crashkernel(char *cmdline, 1488int __init parse_crashkernel(char *cmdline,
1417 unsigned long long system_ram, 1489 unsigned long long system_ram,
1418 unsigned long long *crash_size, 1490 unsigned long long *crash_size,
1419 unsigned long long *crash_base) 1491 unsigned long long *crash_base)
1420{ 1492{
1421 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1493 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1422 "crashkernel="); 1494 "crashkernel=", NULL);
1495}
1496
1497int __init parse_crashkernel_high(char *cmdline,
1498 unsigned long long system_ram,
1499 unsigned long long *crash_size,
1500 unsigned long long *crash_base)
1501{
1502 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1503 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1423} 1504}
1424 1505
1425int __init parse_crashkernel_low(char *cmdline, 1506int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1509,7 @@ int __init parse_crashkernel_low(char *cmdline,
1428 unsigned long long *crash_base) 1509 unsigned long long *crash_base)
1429{ 1510{
1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1511 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1431 "crashkernel_low="); 1512 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1432} 1513}
1433 1514
1434static void update_vmcoreinfo_note(void) 1515static void update_vmcoreinfo_note(void)
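
Together with the ",high"/",low" suffix table, the reworked parser lets a command line such as crashkernel=512M,high crashkernel=72M,low describe two separate reservations, each picked out by the matching helper. A hedged sketch of an arch-side caller, modelled loosely on how arch setup code consumes these helpers; the reservation details are elided:

static void __init reserve_crashkernel_sketch(void)
{
	unsigned long long high_size, high_base;
	unsigned long long low_size, low_base;

	/* matches the last "crashkernel=...,high" entry, if any */
	if (parse_crashkernel_high(boot_command_line, memblock_phys_mem_size(),
				   &high_size, &high_base) || !high_size)
		return;

	/* matches the last "crashkernel=...,low" entry, if any */
	if (parse_crashkernel_low(boot_command_line, memblock_phys_mem_size(),
				  &low_size, &low_base))
		low_size = 0;

	/*
	 * ... memblock-reserve high_size (possibly above 4G) and low_size
	 * below 4G, then publish them via crashk_res / crashk_low_res ...
	 */
}

Note that get_last_crashkernel() skips entries carrying a known suffix when called without one, so a plain crashkernel=128M on the same command line still parses independently of the suffixed entries.
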
@@ -1452,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
1452{ 1533{
1453 va_list args; 1534 va_list args;
1454 char buf[0x50]; 1535 char buf[0x50];
1455 int r; 1536 size_t r;
1456 1537
1457 va_start(args, fmt); 1538 va_start(args, fmt);
1458 r = vsnprintf(buf, sizeof(buf), fmt, args); 1539 r = vsnprintf(buf, sizeof(buf), fmt, args);
1459 va_end(args); 1540 va_end(args);
1460 1541
1461 if (r + vmcoreinfo_size > vmcoreinfo_max_size) 1542 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1462 r = vmcoreinfo_max_size - vmcoreinfo_size;
1463 1543
1464 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); 1544 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1465 1545
@@ -1489,7 +1569,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1489 VMCOREINFO_SYMBOL(swapper_pg_dir); 1569 VMCOREINFO_SYMBOL(swapper_pg_dir);
1490#endif 1570#endif
1491 VMCOREINFO_SYMBOL(_stext); 1571 VMCOREINFO_SYMBOL(_stext);
1492 VMCOREINFO_SYMBOL(vmlist); 1572 VMCOREINFO_SYMBOL(vmap_area_list);
1493 1573
1494#ifndef CONFIG_NEED_MULTIPLE_NODES 1574#ifndef CONFIG_NEED_MULTIPLE_NODES
1495 VMCOREINFO_SYMBOL(mem_map); 1575 VMCOREINFO_SYMBOL(mem_map);
@@ -1527,7 +1607,8 @@ static int __init crash_save_vmcoreinfo_init(void)
1527 VMCOREINFO_OFFSET(free_area, free_list); 1607 VMCOREINFO_OFFSET(free_area, free_list);
1528 VMCOREINFO_OFFSET(list_head, next); 1608 VMCOREINFO_OFFSET(list_head, next);
1529 VMCOREINFO_OFFSET(list_head, prev); 1609 VMCOREINFO_OFFSET(list_head, prev);
1530 VMCOREINFO_OFFSET(vm_struct, addr); 1610 VMCOREINFO_OFFSET(vmap_area, va_start);
1611 VMCOREINFO_OFFSET(vmap_area, list);
1531 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1612 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1532 log_buf_kexec_setup(); 1613 log_buf_kexec_setup();
1533 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1614 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 56dd34976d7b..1296e72e4161 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
77 77
78static int call_modprobe(char *module_name, int wait) 78static int call_modprobe(char *module_name, int wait)
79{ 79{
80 struct subprocess_info *info;
80 static char *envp[] = { 81 static char *envp[] = {
81 "HOME=/", 82 "HOME=/",
82 "TERM=linux", 83 "TERM=linux",
@@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait)
98 argv[3] = module_name; /* check free_modprobe_argv() */ 99 argv[3] = module_name; /* check free_modprobe_argv() */
99 argv[4] = NULL; 100 argv[4] = NULL;
100 101
101 return call_usermodehelper_fns(modprobe_path, argv, envp, 102 info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
102 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); 103 NULL, free_modprobe_argv, NULL);
104 if (!info)
105 goto free_module_name;
106
107 return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
108
109free_module_name:
110 kfree(module_name);
103free_argv: 111free_argv:
104 kfree(argv); 112 kfree(argv);
105out: 113out:
@@ -502,14 +510,28 @@ static void helper_unlock(void)
502 * @argv: arg vector for process 510 * @argv: arg vector for process
503 * @envp: environment for process 511 * @envp: environment for process
504 * @gfp_mask: gfp mask for memory allocation 512 * @gfp_mask: gfp mask for memory allocation
513 * @cleanup: a cleanup function
514 * @init: an init function
515 * @data: arbitrary context sensitive data
505 * 516 *
506 * Returns either %NULL on allocation failure, or a subprocess_info 517 * Returns either %NULL on allocation failure, or a subprocess_info
507 * structure. This should be passed to call_usermodehelper_exec to 518 * structure. This should be passed to call_usermodehelper_exec to
508 * exec the process and free the structure. 519 * exec the process and free the structure.
520 *
521 * The init function is used to customize the helper process prior to
522 * exec. A non-zero return code causes the process to error out, exit,
523 * and return the failure to the calling process.
524 *
525 * The cleanup function is called just before the subprocess_info is about
526 * to be freed. This can be used for freeing the argv and envp. The
527 * function must be runnable in either a process context or the
528 * context in which call_usermodehelper_exec is called.
509 */ 529 */
510static
511struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, 530struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
512 char **envp, gfp_t gfp_mask) 531 char **envp, gfp_t gfp_mask,
532 int (*init)(struct subprocess_info *info, struct cred *new),
533 void (*cleanup)(struct subprocess_info *info),
534 void *data)
513{ 535{
514 struct subprocess_info *sub_info; 536 struct subprocess_info *sub_info;
515 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); 537 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
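
With init, cleanup and data now taken directly by call_usermodehelper_setup(), every caller follows the same two-step pattern that call_modprobe() uses above. A minimal hedged sketch; the argv handling and the my_* callback names are illustrative only:

#include <linux/kmod.h>
#include <linux/string.h>

static void my_umh_cleanup(struct subprocess_info *info)
{
	argv_free(info->argv);	/* e.g. when argv came from argv_split() */
}

static int run_my_helper(char **argv, char **envp)
{
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
					 NULL /* init */, my_umh_cleanup,
					 NULL /* data */);
	if (!info)
		return -ENOMEM;

	/* UMH_WAIT_PROC waits for the helper to exit and returns its status */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}

Callers that need neither callback can instead use the plain call_usermodehelper() wrapper shown further down in this file.
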
@@ -520,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
520 sub_info->path = path; 542 sub_info->path = path;
521 sub_info->argv = argv; 543 sub_info->argv = argv;
522 sub_info->envp = envp; 544 sub_info->envp = envp;
545
546 sub_info->cleanup = cleanup;
547 sub_info->init = init;
548 sub_info->data = data;
523 out: 549 out:
524 return sub_info; 550 return sub_info;
525} 551}
526 552EXPORT_SYMBOL(call_usermodehelper_setup);
527/**
528 * call_usermodehelper_setfns - set a cleanup/init function
529 * @info: a subprocess_info returned by call_usermodehelper_setup
530 * @cleanup: a cleanup function
531 * @init: an init function
532 * @data: arbitrary context sensitive data
533 *
534 * The init function is used to customize the helper process prior to
535 * exec. A non-zero return code causes the process to error out, exit,
536 * and return the failure to the calling process
537 *
538 * The cleanup function is just before ethe subprocess_info is about to
539 * be freed. This can be used for freeing the argv and envp. The
540 * Function must be runnable in either a process context or the
541 * context in which call_usermodehelper_exec is called.
542 */
543static
544void call_usermodehelper_setfns(struct subprocess_info *info,
545 int (*init)(struct subprocess_info *info, struct cred *new),
546 void (*cleanup)(struct subprocess_info *info),
547 void *data)
548{
549 info->cleanup = cleanup;
550 info->init = init;
551 info->data = data;
552}
553 553
554/** 554/**
555 * call_usermodehelper_exec - start a usermode application 555 * call_usermodehelper_exec - start a usermode application
 556 * @sub_info: information about the subprocess 556 * @sub_info: information about the subprocess
557 * @wait: wait for the application to finish and return status. 557 * @wait: wait for the application to finish and return status.
558 * when -1 don't wait at all, but you get no useful error back when 558 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
559 * the program couldn't be exec'ed. This makes it safe to call 559 * when the program couldn't be exec'ed. This makes it safe to call
560 * from interrupt context. 560 * from interrupt context.
561 * 561 *
562 * Runs a user-space application. The application is started 562 * Runs a user-space application. The application is started
563 * asynchronously if wait is not set, and runs as a child of keventd. 563 * asynchronously if wait is not set, and runs as a child of keventd.
564 * (ie. it runs with full root capabilities). 564 * (ie. it runs with full root capabilities).
565 */ 565 */
566static
567int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 566int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
568{ 567{
569 DECLARE_COMPLETION_ONSTACK(done); 568 DECLARE_COMPLETION_ONSTACK(done);
@@ -615,31 +614,34 @@ unlock:
615 helper_unlock(); 614 helper_unlock();
616 return retval; 615 return retval;
617} 616}
617EXPORT_SYMBOL(call_usermodehelper_exec);
618 618
619/* 619/**
620 * call_usermodehelper_fns() will not run the caller-provided cleanup function 620 * call_usermodehelper() - prepare and start a usermode application
621 * if a memory allocation failure is experienced. So the caller might need to 621 * @path: path to usermode executable
622 * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform 622 * @argv: arg vector for process
623 * the necessaary cleanup within the caller. 623 * @envp: environment for process
624 * @wait: wait for the application to finish and return status.
625 * when UMH_NO_WAIT don't wait at all, but you get no useful error back
626 * when the program couldn't be exec'ed. This makes it safe to call
627 * from interrupt context.
628 *
 629 * This function is equivalent to using call_usermodehelper_setup() and
630 * call_usermodehelper_exec().
624 */ 631 */
625int call_usermodehelper_fns( 632int call_usermodehelper(char *path, char **argv, char **envp, int wait)
626 char *path, char **argv, char **envp, int wait,
627 int (*init)(struct subprocess_info *info, struct cred *new),
628 void (*cleanup)(struct subprocess_info *), void *data)
629{ 633{
630 struct subprocess_info *info; 634 struct subprocess_info *info;
631 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; 635 gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
632 636
633 info = call_usermodehelper_setup(path, argv, envp, gfp_mask); 637 info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
634 638 NULL, NULL, NULL);
635 if (info == NULL) 639 if (info == NULL)
636 return -ENOMEM; 640 return -ENOMEM;
637 641
638 call_usermodehelper_setfns(info, init, cleanup, data);
639
640 return call_usermodehelper_exec(info, wait); 642 return call_usermodehelper_exec(info, wait);
641} 643}
642EXPORT_SYMBOL(call_usermodehelper_fns); 644EXPORT_SYMBOL(call_usermodehelper);
643 645
644static int proc_cap_handler(struct ctl_table *table, int write, 646static int proc_cap_handler(struct ctl_table *table, int write,
645 void __user *buffer, size_t *lenp, loff_t *ppos) 647 void __user *buffer, size_t *lenp, loff_t *ppos)
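With call_usermodehelper_setup() and call_usermodehelper_exec() now exported and the init/cleanup/data hooks folded into setup, a former call_usermodehelper_fns() user converts roughly as sketched below. This is only an illustration of the API shown in the hunk above; the helper path, argv handling and the my_umh_cleanup() name are invented for the example.

/*
 * Illustrative sketch only: a hypothetical caller converted to the split
 * setup/exec API. Paths and argv contents are made up.
 */
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/slab.h>

static void my_umh_cleanup(struct subprocess_info *info)
{
        kfree(info->argv);
}

static int run_hotplug_helper(char *agent_path, char **argv, char **envp)
{
        struct subprocess_info *info;

        info = call_usermodehelper_setup(agent_path, argv, envp, GFP_KERNEL,
                                         NULL, my_umh_cleanup, NULL);
        if (!info) {
                /* setup failed: the cleanup callback was never attached, free argv here */
                kfree(argv);
                return -ENOMEM;
        }

        /* exec consumes and frees info, invoking my_umh_cleanup on teardown */
        return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}

Note that, as in the call_modprobe() change above, the caller still has to free its arguments itself when setup fails, because the cleanup callback only runs once a subprocess_info has actually been allocated.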
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f6613..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
794} 794}
795 795
796#ifdef CONFIG_SYSCTL 796#ifdef CONFIG_SYSCTL
797/* This should be called with kprobe_mutex locked */
798static void __kprobes optimize_all_kprobes(void) 797static void __kprobes optimize_all_kprobes(void)
799{ 798{
800 struct hlist_head *head; 799 struct hlist_head *head;
801 struct kprobe *p; 800 struct kprobe *p;
802 unsigned int i; 801 unsigned int i;
803 802
803 mutex_lock(&kprobe_mutex);
804 /* If optimization is already allowed, just return */ 804 /* If optimization is already allowed, just return */
805 if (kprobes_allow_optimization) 805 if (kprobes_allow_optimization)
806 return; 806 goto out;
807 807
808 kprobes_allow_optimization = true; 808 kprobes_allow_optimization = true;
809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 809 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
813 optimize_kprobe(p); 813 optimize_kprobe(p);
814 } 814 }
815 printk(KERN_INFO "Kprobes globally optimized\n"); 815 printk(KERN_INFO "Kprobes globally optimized\n");
816out:
817 mutex_unlock(&kprobe_mutex);
816} 818}
817 819
818/* This should be called with kprobe_mutex locked */
819static void __kprobes unoptimize_all_kprobes(void) 820static void __kprobes unoptimize_all_kprobes(void)
820{ 821{
821 struct hlist_head *head; 822 struct hlist_head *head;
822 struct kprobe *p; 823 struct kprobe *p;
823 unsigned int i; 824 unsigned int i;
824 825
826 mutex_lock(&kprobe_mutex);
825 /* If optimization is already prohibited, just return */ 827 /* If optimization is already prohibited, just return */
826 if (!kprobes_allow_optimization) 828 if (!kprobes_allow_optimization) {
829 mutex_unlock(&kprobe_mutex);
827 return; 830 return;
831 }
828 832
829 kprobes_allow_optimization = false; 833 kprobes_allow_optimization = false;
830 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 834 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
834 unoptimize_kprobe(p, false); 838 unoptimize_kprobe(p, false);
835 } 839 }
836 } 840 }
841 mutex_unlock(&kprobe_mutex);
842
837 /* Wait for unoptimizing completion */ 843 /* Wait for unoptimizing completion */
838 wait_for_kprobe_optimizer(); 844 wait_for_kprobe_optimizer();
839 printk(KERN_INFO "Kprobes globally unoptimized\n"); 845 printk(KERN_INFO "Kprobes globally unoptimized\n");
840} 846}
841 847
848static DEFINE_MUTEX(kprobe_sysctl_mutex);
842int sysctl_kprobes_optimization; 849int sysctl_kprobes_optimization;
843int proc_kprobes_optimization_handler(struct ctl_table *table, int write, 850int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
844 void __user *buffer, size_t *length, 851 void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
846{ 853{
847 int ret; 854 int ret;
848 855
849 mutex_lock(&kprobe_mutex); 856 mutex_lock(&kprobe_sysctl_mutex);
850 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; 857 sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
851 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 858 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
852 859
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
854 optimize_all_kprobes(); 861 optimize_all_kprobes();
855 else 862 else
856 unoptimize_all_kprobes(); 863 unoptimize_all_kprobes();
857 mutex_unlock(&kprobe_mutex); 864 mutex_unlock(&kprobe_sysctl_mutex);
858 865
859 return ret; 866 return ret;
860} 867}
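The kprobes hunk above moves kprobe_mutex acquisition into optimize_all_kprobes()/unoptimize_all_kprobes() and serializes the proc handler with a separate kprobe_sysctl_mutex. A minimal sketch of that locking shape, with invented names, might look like this:

/*
 * Minimal sketch (hypothetical names, not kprobes code): the proc handler
 * serializes concurrent writers with its own mutex, while the enable/disable
 * helper takes the subsystem mutex internally so other call sites stay safe
 * without the caller holding it.
 */
#include <linux/mutex.h>
#include <linux/sysctl.h>

static DEFINE_MUTEX(feature_mutex);        /* protects feature state */
static DEFINE_MUTEX(feature_sysctl_mutex); /* serializes sysctl writers */
static bool feature_enabled;
static int sysctl_feature;                 /* assumed to be the ctl_table's ->data */

static void feature_set(bool enable)
{
        mutex_lock(&feature_mutex);
        feature_enabled = enable;
        mutex_unlock(&feature_mutex);
}

static int feature_sysctl_handler(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp,
                                  loff_t *ppos)
{
        int ret;

        mutex_lock(&feature_sysctl_mutex);
        sysctl_feature = feature_enabled ? 1 : 0;
        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (!ret && write)
                feature_set(sysctl_feature != 0);
        mutex_unlock(&feature_sysctl_mutex);

        return ret;
}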
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..760e86df8c20 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
19#include <linux/ptrace.h> 19#include <linux/ptrace.h>
20#include <linux/uaccess.h>
20#include <trace/events/sched.h> 21#include <trace/events/sched.h>
21 22
22static DEFINE_SPINLOCK(kthread_create_lock); 23static DEFINE_SPINLOCK(kthread_create_lock);
@@ -52,8 +53,21 @@ enum KTHREAD_BITS {
52 KTHREAD_IS_PARKED, 53 KTHREAD_IS_PARKED,
53}; 54};
54 55
55#define to_kthread(tsk) \ 56#define __to_kthread(vfork) \
56 container_of((tsk)->vfork_done, struct kthread, exited) 57 container_of(vfork, struct kthread, exited)
58
59static inline struct kthread *to_kthread(struct task_struct *k)
60{
61 return __to_kthread(k->vfork_done);
62}
63
64static struct kthread *to_live_kthread(struct task_struct *k)
65{
66 struct completion *vfork = ACCESS_ONCE(k->vfork_done);
67 if (likely(vfork))
68 return __to_kthread(vfork);
69 return NULL;
70}
57 71
58/** 72/**
59 * kthread_should_stop - should this kthread return now? 73 * kthread_should_stop - should this kthread return now?
@@ -122,14 +136,32 @@ void *kthread_data(struct task_struct *task)
122 return to_kthread(task)->data; 136 return to_kthread(task)->data;
123} 137}
124 138
139/**
140 * probe_kthread_data - speculative version of kthread_data()
141 * @task: possible kthread task in question
142 *
143 * @task could be a kthread task. Return the data value specified when it
144 * was created if accessible. If @task isn't a kthread task or its data is
145 * inaccessible for any reason, %NULL is returned. This function requires
146 * that @task itself is safe to dereference.
147 */
148void *probe_kthread_data(struct task_struct *task)
149{
150 struct kthread *kthread = to_kthread(task);
151 void *data = NULL;
152
153 probe_kernel_read(&data, &kthread->data, sizeof(data));
154 return data;
155}
156
125static void __kthread_parkme(struct kthread *self) 157static void __kthread_parkme(struct kthread *self)
126{ 158{
127 __set_current_state(TASK_INTERRUPTIBLE); 159 __set_current_state(TASK_PARKED);
128 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { 160 while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
129 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) 161 if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
130 complete(&self->parked); 162 complete(&self->parked);
131 schedule(); 163 schedule();
132 __set_current_state(TASK_INTERRUPTIBLE); 164 __set_current_state(TASK_PARKED);
133 } 165 }
134 clear_bit(KTHREAD_IS_PARKED, &self->flags); 166 clear_bit(KTHREAD_IS_PARKED, &self->flags);
135 __set_current_state(TASK_RUNNING); 167 __set_current_state(TASK_RUNNING);
@@ -256,11 +288,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
256} 288}
257EXPORT_SYMBOL(kthread_create_on_node); 289EXPORT_SYMBOL(kthread_create_on_node);
258 290
259static void __kthread_bind(struct task_struct *p, unsigned int cpu) 291static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
260{ 292{
293 /* Must have done schedule() in kthread() before we set_task_cpu */
294 if (!wait_task_inactive(p, state)) {
295 WARN_ON(1);
296 return;
297 }
261 /* It's safe because the task is inactive. */ 298 /* It's safe because the task is inactive. */
262 do_set_cpus_allowed(p, cpumask_of(cpu)); 299 do_set_cpus_allowed(p, cpumask_of(cpu));
263 p->flags |= PF_THREAD_BOUND; 300 p->flags |= PF_NO_SETAFFINITY;
264} 301}
265 302
266/** 303/**
@@ -274,12 +311,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
274 */ 311 */
275void kthread_bind(struct task_struct *p, unsigned int cpu) 312void kthread_bind(struct task_struct *p, unsigned int cpu)
276{ 313{
277 /* Must have done schedule() in kthread() before we set_task_cpu */ 314 __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
278 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
279 WARN_ON(1);
280 return;
281 }
282 __kthread_bind(p, cpu);
283} 315}
284EXPORT_SYMBOL(kthread_bind); 316EXPORT_SYMBOL(kthread_bind);
285 317
@@ -311,17 +343,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
311 return p; 343 return p;
312} 344}
313 345
314static struct kthread *task_get_live_kthread(struct task_struct *k) 346static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
315{ 347{
316 struct kthread *kthread; 348 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
317 349 /*
318 get_task_struct(k); 350 * We clear the IS_PARKED bit here as we don't wait
319 kthread = to_kthread(k); 351 * until the task has left the park code. So if we'd
320 /* It might have exited */ 352 * park before that happens we'd see the IS_PARKED bit
321 barrier(); 353 * which might be about to be cleared.
322 if (k->vfork_done != NULL) 354 */
323 return kthread; 355 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
324 return NULL; 356 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
357 __kthread_bind(k, kthread->cpu, TASK_PARKED);
358 wake_up_state(k, TASK_PARKED);
359 }
325} 360}
326 361
327/** 362/**
@@ -334,23 +369,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
334 */ 369 */
335void kthread_unpark(struct task_struct *k) 370void kthread_unpark(struct task_struct *k)
336{ 371{
337 struct kthread *kthread = task_get_live_kthread(k); 372 struct kthread *kthread = to_live_kthread(k);
338 373
339 if (kthread) { 374 if (kthread)
340 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 375 __kthread_unpark(k, kthread);
341 /*
342 * We clear the IS_PARKED bit here as we don't wait
343 * until the task has left the park code. So if we'd
344 * park before that happens we'd see the IS_PARKED bit
345 * which might be about to be cleared.
346 */
347 if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
348 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
349 __kthread_bind(k, kthread->cpu);
350 wake_up_process(k);
351 }
352 }
353 put_task_struct(k);
354} 376}
355 377
356/** 378/**
@@ -367,7 +389,7 @@ void kthread_unpark(struct task_struct *k)
367 */ 389 */
368int kthread_park(struct task_struct *k) 390int kthread_park(struct task_struct *k)
369{ 391{
370 struct kthread *kthread = task_get_live_kthread(k); 392 struct kthread *kthread = to_live_kthread(k);
371 int ret = -ENOSYS; 393 int ret = -ENOSYS;
372 394
373 if (kthread) { 395 if (kthread) {
@@ -380,7 +402,6 @@ int kthread_park(struct task_struct *k)
380 } 402 }
381 ret = 0; 403 ret = 0;
382 } 404 }
383 put_task_struct(k);
384 return ret; 405 return ret;
385} 406}
386 407
@@ -401,21 +422,23 @@ int kthread_park(struct task_struct *k)
401 */ 422 */
402int kthread_stop(struct task_struct *k) 423int kthread_stop(struct task_struct *k)
403{ 424{
404 struct kthread *kthread = task_get_live_kthread(k); 425 struct kthread *kthread;
405 int ret; 426 int ret;
406 427
407 trace_sched_kthread_stop(k); 428 trace_sched_kthread_stop(k);
429
430 get_task_struct(k);
431 kthread = to_live_kthread(k);
408 if (kthread) { 432 if (kthread) {
409 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); 433 set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
410 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 434 __kthread_unpark(k, kthread);
411 wake_up_process(k); 435 wake_up_process(k);
412 wait_for_completion(&kthread->exited); 436 wait_for_completion(&kthread->exited);
413 } 437 }
414 ret = k->exit_code; 438 ret = k->exit_code;
415
416 put_task_struct(k); 439 put_task_struct(k);
417 trace_sched_kthread_stop_ret(ret);
418 440
441 trace_sched_kthread_stop_ret(ret);
419 return ret; 442 return ret;
420} 443}
421EXPORT_SYMBOL(kthread_stop); 444EXPORT_SYMBOL(kthread_stop);
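With parked kthreads now sleeping in TASK_PARKED and kthread_stop() unparking a parked thread before stopping it, a thread function only needs the usual should-stop/should-park loop. A hedged sketch, where the per-iteration work is a placeholder:

/*
 * Sketch of a parkable kthread main loop (do_work() is hypothetical):
 * kthread_parkme() puts the thread into the TASK_PARKED sleep used above,
 * and the reworked kthread_unpark()/kthread_stop() wake it from that state.
 */
#include <linux/delay.h>
#include <linux/kthread.h>

static int my_worker_fn(void *data)
{
        while (!kthread_should_stop()) {
                if (kthread_should_park()) {
                        kthread_parkme();       /* sleeps in TASK_PARKED */
                        continue;
                }
                /* do_work(data); -- placeholder for the real payload */
                msleep_interruptible(100);
        }
        return 0;
}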
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 259db207b5d9..6a3bccba7e7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
380unsigned long nr_stack_trace_entries; 380unsigned long nr_stack_trace_entries;
381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; 381static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
382 382
383static void print_lockdep_off(const char *bug_msg)
384{
385 printk(KERN_DEBUG "%s\n", bug_msg);
386 printk(KERN_DEBUG "turning off the locking correctness validator.\n");
387 printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
388}
389
383static int save_trace(struct stack_trace *trace) 390static int save_trace(struct stack_trace *trace)
384{ 391{
385 trace->nr_entries = 0; 392 trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
409 if (!debug_locks_off_graph_unlock()) 416 if (!debug_locks_off_graph_unlock())
410 return 0; 417 return 0;
411 418
412 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); 419 print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
413 printk("turning off the locking correctness validator.\n");
414 dump_stack(); 420 dump_stack();
415 421
416 return 0; 422 return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
763 } 769 }
764 raw_local_irq_restore(flags); 770 raw_local_irq_restore(flags);
765 771
766 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 772 print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
767 printk("turning off the locking correctness validator.\n");
768 dump_stack(); 773 dump_stack();
769 return NULL; 774 return NULL;
770 } 775 }
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
834 if (!debug_locks_off_graph_unlock()) 839 if (!debug_locks_off_graph_unlock())
835 return NULL; 840 return NULL;
836 841
837 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 842 print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
838 printk("turning off the locking correctness validator.\n");
839 dump_stack(); 843 dump_stack();
840 return NULL; 844 return NULL;
841 } 845 }
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
2000 struct lock_class *class = hlock_class(hlock); 2004 struct lock_class *class = hlock_class(hlock);
2001 struct list_head *hash_head = chainhashentry(chain_key); 2005 struct list_head *hash_head = chainhashentry(chain_key);
2002 struct lock_chain *chain; 2006 struct lock_chain *chain;
2003 struct held_lock *hlock_curr, *hlock_next; 2007 struct held_lock *hlock_curr;
2004 int i, j; 2008 int i, j;
2005 2009
2006 /* 2010 /*
@@ -2048,8 +2052,7 @@ cache_hit:
2048 if (!debug_locks_off_graph_unlock()) 2052 if (!debug_locks_off_graph_unlock())
2049 return 0; 2053 return 0;
2050 2054
2051 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 2055 print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
2052 printk("turning off the locking correctness validator.\n");
2053 dump_stack(); 2056 dump_stack();
2054 return 0; 2057 return 0;
2055 } 2058 }
@@ -2057,12 +2060,10 @@ cache_hit:
2057 chain->chain_key = chain_key; 2060 chain->chain_key = chain_key;
2058 chain->irq_context = hlock->irq_context; 2061 chain->irq_context = hlock->irq_context;
2059 /* Find the first held_lock of current chain */ 2062 /* Find the first held_lock of current chain */
2060 hlock_next = hlock;
2061 for (i = curr->lockdep_depth - 1; i >= 0; i--) { 2063 for (i = curr->lockdep_depth - 1; i >= 0; i--) {
2062 hlock_curr = curr->held_locks + i; 2064 hlock_curr = curr->held_locks + i;
2063 if (hlock_curr->irq_context != hlock_next->irq_context) 2065 if (hlock_curr->irq_context != hlock->irq_context)
2064 break; 2066 break;
2065 hlock_next = hlock;
2066 } 2067 }
2067 i++; 2068 i++;
2068 chain->depth = curr->lockdep_depth + 1 - i; 2069 chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3190#endif 3191#endif
3191 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { 3192 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
3192 debug_locks_off(); 3193 debug_locks_off();
3193 printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", 3194 print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
3195 printk(KERN_DEBUG "depth: %i max: %lu!\n",
3194 curr->lockdep_depth, MAX_LOCK_DEPTH); 3196 curr->lockdep_depth, MAX_LOCK_DEPTH);
3195 printk("turning off the locking correctness validator.\n");
3196 3197
3197 lockdep_print_held_locks(current); 3198 lockdep_print_held_locks(current);
3198 debug_show_all_locks(); 3199 debug_show_all_locks();
@@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
4088} 4089}
4089EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); 4090EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
4090 4091
4091static void print_held_locks_bug(void) 4092static void print_held_locks_bug(struct task_struct *curr)
4092{ 4093{
4093 if (!debug_locks_off()) 4094 if (!debug_locks_off())
4094 return; 4095 return;
@@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void)
4097 4098
4098 printk("\n"); 4099 printk("\n");
4099 printk("=====================================\n"); 4100 printk("=====================================\n");
4100 printk("[ BUG: %s/%d still has locks held! ]\n", 4101 printk("[ BUG: lock held at task exit time! ]\n");
4101 current->comm, task_pid_nr(current));
4102 print_kernel_ident(); 4102 print_kernel_ident();
4103 printk("-------------------------------------\n"); 4103 printk("-------------------------------------\n");
4104 lockdep_print_held_locks(current); 4104 printk("%s/%d is exiting with locks still held!\n",
4105 curr->comm, task_pid_nr(curr));
4106 lockdep_print_held_locks(curr);
4107
4105 printk("\nstack backtrace:\n"); 4108 printk("\nstack backtrace:\n");
4106 dump_stack(); 4109 dump_stack();
4107} 4110}
4108 4111
4109void debug_check_no_locks_held(void) 4112void debug_check_no_locks_held(struct task_struct *task)
4110{ 4113{
4111 if (unlikely(current->lockdep_depth > 0)) 4114 if (unlikely(task->lockdep_depth > 0))
4112 print_held_locks_bug(); 4115 print_held_locks_bug(task);
4113} 4116}
4114EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
4115 4117
4116void debug_show_all_locks(void) 4118void debug_show_all_locks(void)
4117{ 4119{
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
index 246b4c6e6135..4a9a86d12c8b 100644
--- a/kernel/modsign_certificate.S
+++ b/kernel/modsign_certificate.S
@@ -1,15 +1,8 @@
1/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ 1#include <linux/export.h>
2#ifndef SYMBOL_PREFIX
3#define ASM_SYMBOL(sym) sym
4#else
5#define PASTE2(x,y) x##y
6#define PASTE(x,y) PASTE2(x,y)
7#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
8#endif
9 2
10#define GLOBAL(name) \ 3#define GLOBAL(name) \
11 .globl ASM_SYMBOL(name); \ 4 .globl VMLINUX_SYMBOL(name); \
12 ASM_SYMBOL(name): 5 VMLINUX_SYMBOL(name):
13 6
14 .section ".init.data","aw" 7 .section ".init.data","aw"
15 8
diff --git a/kernel/module.c b/kernel/module.c
index 0925c9a71975..b049939177f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1209 1209
1210 /* Since this should be found in kernel (which can't be removed), 1210 /* Since this should be found in kernel (which can't be removed),
1211 * no locking is necessary. */ 1211 * no locking is necessary. */
1212 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1212 if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
1213 &crc, true, false)) 1213 &crc, true, false))
1214 BUG(); 1214 BUG();
1215 return check_version(sechdrs, versindex, "module_layout", mod, crc, 1215 return check_version(sechdrs, versindex,
1216 VMLINUX_SYMBOL_STR(module_layout), mod, crc,
1216 NULL); 1217 NULL);
1217} 1218}
1218 1219
@@ -1861,12 +1862,12 @@ static void free_module(struct module *mod)
1861{ 1862{
1862 trace_module_free(mod); 1863 trace_module_free(mod);
1863 1864
1864 /* Delete from various lists */
1865 mutex_lock(&module_mutex);
1866 stop_machine(__unlink_module, mod, NULL);
1867 mutex_unlock(&module_mutex);
1868 mod_sysfs_teardown(mod); 1865 mod_sysfs_teardown(mod);
1869 1866
 1867 /* We leave it in the list to prevent duplicate loads, but make sure
 1868 * that no one uses it while it's being deconstructed. */
1869 mod->state = MODULE_STATE_UNFORMED;
1870
1870 /* Remove dynamic debug info */ 1871 /* Remove dynamic debug info */
1871 ddebug_remove_module(mod->name); 1872 ddebug_remove_module(mod->name);
1872 1873
@@ -1879,6 +1880,11 @@ static void free_module(struct module *mod)
1879 /* Free any allocated parameters. */ 1880 /* Free any allocated parameters. */
1880 destroy_params(mod->kp, mod->num_kp); 1881 destroy_params(mod->kp, mod->num_kp);
1881 1882
1883 /* Now we can delete it from the lists */
1884 mutex_lock(&module_mutex);
1885 stop_machine(__unlink_module, mod, NULL);
1886 mutex_unlock(&module_mutex);
1887
1882 /* This may be NULL, but that's OK */ 1888 /* This may be NULL, but that's OK */
1883 unset_module_init_ro_nx(mod); 1889 unset_module_init_ro_nx(mod);
1884 module_free(mod, mod->module_init); 1890 module_free(mod, mod->module_init);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
37# include <asm/mutex.h> 37# include <asm/mutex.h>
38#endif 38#endif
39 39
40/*
41 * A negative mutex count indicates that waiters are sleeping waiting for the
42 * mutex.
43 */
44#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
45
40void 46void
41__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) 47__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
42{ 48{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
44 spin_lock_init(&lock->wait_lock); 50 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list); 51 INIT_LIST_HEAD(&lock->wait_list);
46 mutex_clear_owner(lock); 52 mutex_clear_owner(lock);
53#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
54 lock->spin_mlock = NULL;
55#endif
47 56
48 debug_mutex_init(lock, name, key); 57 debug_mutex_init(lock, name, key);
49} 58}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
95EXPORT_SYMBOL(mutex_lock); 104EXPORT_SYMBOL(mutex_lock);
96#endif 105#endif
97 106
107#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
108/*
 109 * In order to avoid a stampede of mutex spinners all trying to acquire the
 110 * mutex more or less simultaneously, the spinners need to acquire an MCS
 111 * lock first before spinning on the owner field.
112 *
113 * We don't inline mspin_lock() so that perf can correctly account for the
114 * time spent in this lock function.
115 */
116struct mspin_node {
 117 struct mspin_node *next;
118 int locked; /* 1 if lock acquired */
119};
120#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
121
122static noinline
123void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
124{
125 struct mspin_node *prev;
126
127 /* Init node */
128 node->locked = 0;
129 node->next = NULL;
130
131 prev = xchg(lock, node);
132 if (likely(prev == NULL)) {
133 /* Lock acquired */
134 node->locked = 1;
135 return;
136 }
137 ACCESS_ONCE(prev->next) = node;
138 smp_wmb();
139 /* Wait until the lock holder passes the lock down */
140 while (!ACCESS_ONCE(node->locked))
141 arch_mutex_cpu_relax();
142}
143
144static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
145{
146 struct mspin_node *next = ACCESS_ONCE(node->next);
147
148 if (likely(!next)) {
149 /*
150 * Release the lock by setting it to NULL
151 */
152 if (cmpxchg(lock, node, NULL) == node)
153 return;
154 /* Wait until the next pointer is set */
155 while (!(next = ACCESS_ONCE(node->next)))
156 arch_mutex_cpu_relax();
157 }
158 ACCESS_ONCE(next->locked) = 1;
159 smp_wmb();
160}
161
162/*
163 * Mutex spinning code migrated from kernel/sched/core.c
164 */
165
166static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
167{
168 if (lock->owner != owner)
169 return false;
170
171 /*
172 * Ensure we emit the owner->on_cpu, dereference _after_ checking
173 * lock->owner still matches owner, if that fails, owner might
174 * point to free()d memory, if it still matches, the rcu_read_lock()
175 * ensures the memory stays valid.
176 */
177 barrier();
178
179 return owner->on_cpu;
180}
181
182/*
183 * Look out! "owner" is an entirely speculative pointer
184 * access and not reliable.
185 */
186static noinline
187int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
188{
189 rcu_read_lock();
190 while (owner_running(lock, owner)) {
191 if (need_resched())
192 break;
193
194 arch_mutex_cpu_relax();
195 }
196 rcu_read_unlock();
197
198 /*
 199 * We break out of the loop above on need_resched() and when the
 200 * owner changes, which is a sign of heavy contention. Return
201 * success only when lock->owner is NULL.
202 */
203 return lock->owner == NULL;
204}
205
206/*
207 * Initial check for entering the mutex spinning loop
208 */
209static inline int mutex_can_spin_on_owner(struct mutex *lock)
210{
211 int retval = 1;
212
213 rcu_read_lock();
214 if (lock->owner)
215 retval = lock->owner->on_cpu;
216 rcu_read_unlock();
217 /*
 218 * If lock->owner is not set, the owner may have just acquired the mutex
 219 * and not set the owner field yet, or the mutex may have been released.
220 */
221 return retval;
222}
223#endif
224
98static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 225static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
99 226
100/** 227/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
158 * 285 *
159 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock 286 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
160 * to serialize everything. 287 * to serialize everything.
288 *
 289 * The mutex spinners are queued up using an MCS lock so that only one
290 * spinner can compete for the mutex. However, if mutex spinning isn't
291 * going to happen, there is no point in going through the lock/unlock
292 * overhead.
161 */ 293 */
294 if (!mutex_can_spin_on_owner(lock))
295 goto slowpath;
162 296
163 for (;;) { 297 for (;;) {
164 struct task_struct *owner; 298 struct task_struct *owner;
299 struct mspin_node node;
165 300
166 /* 301 /*
167 * If there's an owner, wait for it to either 302 * If there's an owner, wait for it to either
168 * release the lock or go to sleep. 303 * release the lock or go to sleep.
169 */ 304 */
305 mspin_lock(MLOCK(lock), &node);
170 owner = ACCESS_ONCE(lock->owner); 306 owner = ACCESS_ONCE(lock->owner);
171 if (owner && !mutex_spin_on_owner(lock, owner)) 307 if (owner && !mutex_spin_on_owner(lock, owner)) {
308 mspin_unlock(MLOCK(lock), &node);
172 break; 309 break;
310 }
173 311
174 if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { 312 if ((atomic_read(&lock->count) == 1) &&
313 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
175 lock_acquired(&lock->dep_map, ip); 314 lock_acquired(&lock->dep_map, ip);
176 mutex_set_owner(lock); 315 mutex_set_owner(lock);
316 mspin_unlock(MLOCK(lock), &node);
177 preempt_enable(); 317 preempt_enable();
178 return 0; 318 return 0;
179 } 319 }
320 mspin_unlock(MLOCK(lock), &node);
180 321
181 /* 322 /*
182 * When there's no owner, we might have preempted between the 323 * When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
195 */ 336 */
196 arch_mutex_cpu_relax(); 337 arch_mutex_cpu_relax();
197 } 338 }
339slowpath:
198#endif 340#endif
199 spin_lock_mutex(&lock->wait_lock, flags); 341 spin_lock_mutex(&lock->wait_lock, flags);
200 342
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
205 list_add_tail(&waiter.list, &lock->wait_list); 347 list_add_tail(&waiter.list, &lock->wait_list);
206 waiter.task = task; 348 waiter.task = task;
207 349
208 if (atomic_xchg(&lock->count, -1) == 1) 350 if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
209 goto done; 351 goto done;
210 352
211 lock_contended(&lock->dep_map, ip); 353 lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
220 * that when we release the lock, we properly wake up the 362 * that when we release the lock, we properly wake up the
221 * other waiters: 363 * other waiters:
222 */ 364 */
223 if (atomic_xchg(&lock->count, -1) == 1) 365 if (MUTEX_SHOW_NO_WAITER(lock) &&
366 (atomic_xchg(&lock->count, -1) == 1))
224 break; 367 break;
225 368
226 /* 369 /*
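The MUTEX_SHOW_NO_WAITER() and atomic_read()-before-cmpxchg checks added above exist to avoid dirtying the lock's cacheline when the mutex clearly cannot be taken. The same idea on a toy atomic_t lock, purely for illustration:

/*
 * Illustrative only: "read before RMW" on a toy lock. The plain
 * atomic_read() is a cheap shared-cacheline probe; the cmpxchg (which
 * dirties the line) is attempted only when it can succeed, which is what
 * the count==1 and MUTEX_SHOW_NO_WAITER() checks buy in the hunk above.
 */
#include <linux/atomic.h>
#include <linux/types.h>

static inline bool toy_trylock(atomic_t *lock)
{
        /* 1 == unlocked, 0 == locked (mirrors the mutex fast path) */
        if (atomic_read(lock) != 1)
                return false;
        return atomic_cmpxchg(lock, 1, 0) == 1;
}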
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index afc0456f227a..364ceab15f0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,7 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h> 25#include <linux/proc_ns.h>
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/syscalls.h> 27#include <linux/syscalls.h>
28 28
@@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
241 const struct proc_ns_operations *ops; 241 const struct proc_ns_operations *ops;
242 struct task_struct *tsk = current; 242 struct task_struct *tsk = current;
243 struct nsproxy *new_nsproxy; 243 struct nsproxy *new_nsproxy;
244 struct proc_inode *ei; 244 struct proc_ns *ei;
245 struct file *file; 245 struct file *file;
246 int err; 246 int err;
247 247
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
250 return PTR_ERR(file); 250 return PTR_ERR(file);
251 251
252 err = -EINVAL; 252 err = -EINVAL;
253 ei = PROC_I(file_inode(file)); 253 ei = get_proc_ns(file_inode(file));
254 ops = ei->ns_ops; 254 ops = ei->ns_ops;
255 if (nstype && (ops->type != nstype)) 255 if (nstype && (ops->type != nstype))
256 goto out; 256 goto out;
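The setns() path above now fetches the namespace operations through get_proc_ns() instead of poking at the proc inode directly; the user-visible syscall is unchanged. For reference, a minimal userspace caller (the target PID and namespace type are arbitrary, and CAP_SYS_ADMIN is required):

/* Userspace view of the setns() path patched above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/proc/1/ns/uts", O_RDONLY);      /* any /proc/<pid>/ns/* file */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (setns(fd, CLONE_NEWUTS) < 0) {              /* 0 would accept any ns type */
                perror("setns");
                close(fd);
                return 1;
        }
        close(fd);
        /* now running in the target task's UTS namespace */
        return 0;
}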
diff --git a/kernel/panic.c b/kernel/panic.c
index 7c57cc9eee2c..167ec097ce8b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -22,7 +22,6 @@
22#include <linux/sysrq.h> 22#include <linux/sysrq.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/nmi.h> 24#include <linux/nmi.h>
25#include <linux/dmi.h>
26 25
27#define PANIC_TIMER_STEP 100 26#define PANIC_TIMER_STEP 100
28#define PANIC_BLINK_SPD 18 27#define PANIC_BLINK_SPD 18
@@ -400,13 +399,8 @@ struct slowpath_args {
400static void warn_slowpath_common(const char *file, int line, void *caller, 399static void warn_slowpath_common(const char *file, int line, void *caller,
401 unsigned taint, struct slowpath_args *args) 400 unsigned taint, struct slowpath_args *args)
402{ 401{
403 const char *board;
404
405 printk(KERN_WARNING "------------[ cut here ]------------\n"); 402 printk(KERN_WARNING "------------[ cut here ]------------\n");
406 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); 403 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
407 board = dmi_get_system_info(DMI_PRODUCT_NAME);
408 if (board)
409 printk(KERN_WARNING "Hardware name: %s\n", board);
410 404
411 if (args) 405 if (args)
412 vprintk(args->fmt, args->args); 406 vprintk(args->fmt, args->args);
diff --git a/kernel/pid.c b/kernel/pid.c
index 047dc6264638..0db3e791a06d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
36#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
37#include <linux/init_task.h> 37#include <linux/init_task.h>
38#include <linux/syscalls.h> 38#include <linux/syscalls.h>
39#include <linux/proc_ns.h>
39#include <linux/proc_fs.h> 40#include <linux/proc_fs.h>
40 41
41#define pid_hashfn(nr, ns) \ 42#define pid_hashfn(nr, ns) \
@@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT;
51int pid_max_min = RESERVED_PIDS + 1; 52int pid_max_min = RESERVED_PIDS + 1;
52int pid_max_max = PID_MAX_LIMIT; 53int pid_max_max = PID_MAX_LIMIT;
53 54
54#define BITS_PER_PAGE (PAGE_SIZE*8)
55#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
56
57static inline int mk_pid(struct pid_namespace *pid_ns, 55static inline int mk_pid(struct pid_namespace *pid_ns,
58 struct pidmap *map, int off) 56 struct pidmap *map, int off)
59{ 57{
@@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
183 break; 181 break;
184 } 182 }
185 if (likely(atomic_read(&map->nr_free))) { 183 if (likely(atomic_read(&map->nr_free))) {
186 do { 184 for ( ; ; ) {
187 if (!test_and_set_bit(offset, map->page)) { 185 if (!test_and_set_bit(offset, map->page)) {
188 atomic_dec(&map->nr_free); 186 atomic_dec(&map->nr_free);
189 set_last_pid(pid_ns, last, pid); 187 set_last_pid(pid_ns, last, pid);
190 return pid; 188 return pid;
191 } 189 }
192 offset = find_next_offset(map, offset); 190 offset = find_next_offset(map, offset);
191 if (offset >= BITS_PER_PAGE)
192 break;
193 pid = mk_pid(pid_ns, map, offset); 193 pid = mk_pid(pid_ns, map, offset);
194 } while (offset < BITS_PER_PAGE && pid < pid_max); 194 if (pid >= pid_max)
195 break;
196 }
195 } 197 }
196 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { 198 if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
197 ++map; 199 ++map;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c6023..6917e8edb48e 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,12 +15,10 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/acct.h> 16#include <linux/acct.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_ns.h>
19#include <linux/reboot.h> 19#include <linux/reboot.h>
20#include <linux/export.h> 20#include <linux/export.h>
21 21
22#define BITS_PER_PAGE (PAGE_SIZE*8)
23
24struct pid_cache { 22struct pid_cache {
25 int nr_ids; 23 int nr_ids;
26 char name[16]; 24 char name[16];
@@ -181,6 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
181 int nr; 179 int nr;
182 int rc; 180 int rc;
183 struct task_struct *task, *me = current; 181 struct task_struct *task, *me = current;
182 int init_pids = thread_group_leader(me) ? 1 : 2;
184 183
185 /* Don't allow any more processes into the pid namespace */ 184 /* Don't allow any more processes into the pid namespace */
186 disable_pid_allocation(pid_ns); 185 disable_pid_allocation(pid_ns);
@@ -230,7 +229,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
230 */ 229 */
231 for (;;) { 230 for (;;) {
232 set_current_state(TASK_UNINTERRUPTIBLE); 231 set_current_state(TASK_UNINTERRUPTIBLE);
233 if (pid_ns->nr_hashed == 1) 232 if (pid_ns->nr_hashed == init_pids)
234 break; 233 break;
235 schedule(); 234 schedule();
236 } 235 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8fd709c9bb58..42670e9b44e0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/tick.h>
14#include <linux/workqueue.h>
13 15
14/* 16/*
15 * Called after updating RLIMIT_CPU to run cpu timer and update 17 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,
153 } 155 }
154} 156}
155 157
158/**
159 * task_cputime_zero - Check a task_cputime struct for all zero fields.
160 *
161 * @cputime: The struct to compare.
162 *
163 * Checks @cputime to see if all fields are zero. Returns true if all fields
164 * are zero, false if any field is nonzero.
165 */
166static inline int task_cputime_zero(const struct task_cputime *cputime)
167{
168 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
169 return 1;
170 return 0;
171}
172
156static inline cputime_t prof_ticks(struct task_struct *p) 173static inline cputime_t prof_ticks(struct task_struct *p)
157{ 174{
158 cputime_t utime, stime; 175 cputime_t utime, stime;
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
636 return 0; 653 return 0;
637} 654}
638 655
656#ifdef CONFIG_NO_HZ_FULL
657static void nohz_kick_work_fn(struct work_struct *work)
658{
659 tick_nohz_full_kick_all();
660}
661
662static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
663
664/*
665 * We need the IPIs to be sent from sane process context.
666 * The posix cpu timers are always set with irqs disabled.
667 */
668static void posix_cpu_timer_kick_nohz(void)
669{
670 schedule_work(&nohz_kick_work);
671}
672
673bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
674{
675 if (!task_cputime_zero(&tsk->cputime_expires))
676 return false;
677
678 if (tsk->signal->cputimer.running)
679 return false;
680
681 return true;
682}
683#else
684static inline void posix_cpu_timer_kick_nohz(void) { }
685#endif
686
639/* 687/*
640 * Guts of sys_timer_settime for CPU timers. 688 * Guts of sys_timer_settime for CPU timers.
641 * This is called with the timer locked and interrupts disabled. 689 * This is called with the timer locked and interrupts disabled.
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
794 sample_to_timespec(timer->it_clock, 842 sample_to_timespec(timer->it_clock,
795 old_incr, &old->it_interval); 843 old_incr, &old->it_interval);
796 } 844 }
845 if (!ret)
846 posix_cpu_timer_kick_nohz();
797 return ret; 847 return ret;
798} 848}
799 849
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1008 } 1058 }
1009} 1059}
1010 1060
1011/**
1012 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1013 *
1014 * @cputime: The struct to compare.
1015 *
1016 * Checks @cputime to see if all fields are zero. Returns true if all fields
1017 * are zero, false if any field is nonzero.
1018 */
1019static inline int task_cputime_zero(const struct task_cputime *cputime)
1020{
1021 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1022 return 1;
1023 return 0;
1024}
1025
1026/* 1061/*
1027 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1028 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1336 cpu_timer_fire(timer); 1371 cpu_timer_fire(timer);
1337 spin_unlock(&timer->it_lock); 1372 spin_unlock(&timer->it_lock);
1338 } 1373 }
1374
1375 /*
1376 * In case some timers were rescheduled after the queue got emptied,
1377 * wake up full dynticks CPUs.
1378 */
1379 if (tsk->signal->cputimer.running)
1380 posix_cpu_timer_kick_nohz();
1339} 1381}
1340 1382
1341/* 1383/*
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1366 } 1408 }
1367 1409
1368 if (!*newval) 1410 if (!*newval)
1369 return; 1411 goto out;
1370 *newval += now.cpu; 1412 *newval += now.cpu;
1371 } 1413 }
1372 1414
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1384 tsk->signal->cputime_expires.virt_exp = *newval; 1426 tsk->signal->cputime_expires.virt_exp = *newval;
1385 break; 1427 break;
1386 } 1428 }
1429out:
1430 posix_cpu_timer_kick_nohz();
1387} 1431}
1388 1432
1389static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1433static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
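The nohz kick added above runs whenever a process CPU timer is armed or still pending, so that full-dynticks CPUs restart their tick and keep accounting the task's CPU time. From user space that corresponds to an ordinary CLOCK_PROCESS_CPUTIME_ID timer; a small example with arbitrary values (link with -lrt on older glibc):

/* Userspace side of the path above: arming a process CPU-time timer,
 * which is what makes posix_cpu_timer_set() kick full-dynticks CPUs. */
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        timer_t tid;
        struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
                                .sigev_signo  = SIGALRM };
        struct itimerspec its = {
                .it_value    = { .tv_sec = 2, .tv_nsec = 0 },   /* fire after 2s of CPU time */
                .it_interval = { .tv_sec = 0, .tv_nsec = 0 },
        };

        if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid) < 0) {
                perror("timer_create");
                return 1;
        }
        if (timer_settime(tid, 0, &its, NULL) < 0) {
                perror("timer_settime");
                return 1;
        }
        /* ... burn CPU; SIGALRM arrives after ~2s of process CPU time ... */
        return 0;
}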
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 6edbb2c55c22..424c2d4265c9 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
40#include <linux/list.h> 40#include <linux/list.h>
41#include <linux/init.h> 41#include <linux/init.h>
42#include <linux/compiler.h> 42#include <linux/compiler.h>
43#include <linux/idr.h> 43#include <linux/hash.h>
44#include <linux/posix-clock.h> 44#include <linux/posix-clock.h>
45#include <linux/posix-timers.h> 45#include <linux/posix-timers.h>
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/export.h> 49#include <linux/export.h>
50#include <linux/hashtable.h>
50 51
51/* 52/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 53 * Management arrays for POSIX timers. Timers are now kept in static hash table
53 * Timer ids are allocated by an external routine that keeps track of the 54 * with 512 entries.
54 * id and the timer. The external interface is: 55 * Timer ids are allocated by local routine, which selects proper hash head by
55 * 56 * key, constructed from current->signal address and per signal struct counter.
56 * void *idr_find(struct idr *idp, int id); to find timer_id <id> 57 * This keeps timer ids unique per process, but now they can intersect between
57 * int idr_get_new(struct idr *idp, void *ptr); to get a new id and 58 * processes.
58 * related it to <ptr>
59 * void idr_remove(struct idr *idp, int id); to release <id>
60 * void idr_init(struct idr *idp); to initialize <idp>
61 * which we supply.
62 * The idr_get_new *may* call slab for more memory so it must not be
63 * called under a spin lock. Likewise idr_remore may release memory
64 * (but it may be ok to do this under a lock...).
65 * idr_find is just a memory look up and is quite fast. A -1 return
66 * indicates that the requested id does not exist.
67 */ 59 */
68 60
69/* 61/*
70 * Lets keep our timers in a slab cache :-) 62 * Lets keep our timers in a slab cache :-)
71 */ 63 */
72static struct kmem_cache *posix_timers_cache; 64static struct kmem_cache *posix_timers_cache;
73static struct idr posix_timers_id; 65
74static DEFINE_SPINLOCK(idr_lock); 66static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
67static DEFINE_SPINLOCK(hash_lock);
75 68
76/* 69/*
77 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 70 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
152 __timr; \ 145 __timr; \
153}) 146})
154 147
148static int hash(struct signal_struct *sig, unsigned int nr)
149{
150 return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
151}
152
153static struct k_itimer *__posix_timers_find(struct hlist_head *head,
154 struct signal_struct *sig,
155 timer_t id)
156{
157 struct k_itimer *timer;
158
159 hlist_for_each_entry_rcu(timer, head, t_hash) {
160 if ((timer->it_signal == sig) && (timer->it_id == id))
161 return timer;
162 }
163 return NULL;
164}
165
166static struct k_itimer *posix_timer_by_id(timer_t id)
167{
168 struct signal_struct *sig = current->signal;
169 struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
170
171 return __posix_timers_find(head, sig, id);
172}
173
174static int posix_timer_add(struct k_itimer *timer)
175{
176 struct signal_struct *sig = current->signal;
177 int first_free_id = sig->posix_timer_id;
178 struct hlist_head *head;
179 int ret = -ENOENT;
180
181 do {
182 spin_lock(&hash_lock);
183 head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
184 if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
185 hlist_add_head_rcu(&timer->t_hash, head);
186 ret = sig->posix_timer_id;
187 }
188 if (++sig->posix_timer_id < 0)
189 sig->posix_timer_id = 0;
190 if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
191 /* Loop over all possible ids completed */
192 ret = -EAGAIN;
193 spin_unlock(&hash_lock);
194 } while (ret == -ENOENT);
195 return ret;
196}
197
155static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) 198static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
156{ 199{
157 spin_unlock_irqrestore(&timr->it_lock, flags); 200 spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
221 return 0; 264 return 0;
222} 265}
223 266
267static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
268{
269 timekeeping_clocktai(tp);
270 return 0;
271}
224 272
225/* 273/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 274 * Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
261 .clock_getres = posix_get_coarse_res, 309 .clock_getres = posix_get_coarse_res,
262 .clock_get = posix_get_monotonic_coarse, 310 .clock_get = posix_get_monotonic_coarse,
263 }; 311 };
312 struct k_clock clock_tai = {
313 .clock_getres = hrtimer_get_res,
314 .clock_get = posix_get_tai,
315 .nsleep = common_nsleep,
316 .nsleep_restart = hrtimer_nanosleep_restart,
317 .timer_create = common_timer_create,
318 .timer_set = common_timer_set,
319 .timer_get = common_timer_get,
320 .timer_del = common_timer_del,
321 };
264 struct k_clock clock_boottime = { 322 struct k_clock clock_boottime = {
265 .clock_getres = hrtimer_get_res, 323 .clock_getres = hrtimer_get_res,
266 .clock_get = posix_get_boottime, 324 .clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
278 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); 336 posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
279 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); 337 posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
280 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); 338 posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
339 posix_timers_register_clock(CLOCK_TAI, &clock_tai);
281 340
282 posix_timers_cache = kmem_cache_create("posix_timers_cache", 341 posix_timers_cache = kmem_cache_create("posix_timers_cache",
283 sizeof (struct k_itimer), 0, SLAB_PANIC, 342 sizeof (struct k_itimer), 0, SLAB_PANIC,
284 NULL); 343 NULL);
285 idr_init(&posix_timers_id);
286 return 0; 344 return 0;
287} 345}
288 346
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
504{ 562{
505 if (it_id_set) { 563 if (it_id_set) {
506 unsigned long flags; 564 unsigned long flags;
507 spin_lock_irqsave(&idr_lock, flags); 565 spin_lock_irqsave(&hash_lock, flags);
508 idr_remove(&posix_timers_id, tmr->it_id); 566 hlist_del_rcu(&tmr->t_hash);
509 spin_unlock_irqrestore(&idr_lock, flags); 567 spin_unlock_irqrestore(&hash_lock, flags);
510 } 568 }
511 put_pid(tmr->it_pid); 569 put_pid(tmr->it_pid);
512 sigqueue_free(tmr->sigq); 570 sigqueue_free(tmr->sigq);
@@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
552 return -EAGAIN; 610 return -EAGAIN;
553 611
554 spin_lock_init(&new_timer->it_lock); 612 spin_lock_init(&new_timer->it_lock);
555 613 new_timer_id = posix_timer_add(new_timer);
556 idr_preload(GFP_KERNEL); 614 if (new_timer_id < 0) {
557 spin_lock_irq(&idr_lock); 615 error = new_timer_id;
558 error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
559 spin_unlock_irq(&idr_lock);
560 idr_preload_end();
561 if (error < 0) {
562 /*
563 * Weird looking, but we return EAGAIN if the IDR is
564 * full (proper POSIX return value for this)
565 */
566 if (error == -ENOSPC)
567 error = -EAGAIN;
568 goto out; 616 goto out;
569 } 617 }
570 new_timer_id = error;
571 618
572 it_id_set = IT_ID_SET; 619 it_id_set = IT_ID_SET;
573 new_timer->it_id = (timer_t) new_timer_id; 620 new_timer->it_id = (timer_t) new_timer_id;
@@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
645 return NULL; 692 return NULL;
646 693
647 rcu_read_lock(); 694 rcu_read_lock();
648 timr = idr_find(&posix_timers_id, (int)timer_id); 695 timr = posix_timer_by_id(timer_id);
649 if (timr) { 696 if (timr) {
650 spin_lock_irqsave(&timr->it_lock, *flags); 697 spin_lock_irqsave(&timr->it_lock, *flags);
651 if (timr->it_signal == current->signal) { 698 if (timr->it_signal == current->signal) {
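Besides the hash-table conversion, the hunk above also registers the new CLOCK_TAI posix clock. A userspace sketch that reads it next to CLOCK_REALTIME; the fallback #define is only a precaution for older userspace headers (the kernel uapi value is 11):

/* Reads the newly registered CLOCK_TAI next to CLOCK_REALTIME; once the TAI
 * offset has been set via adjtimex(), the difference between the two is the
 * current TAI-UTC delta. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TAI
#define CLOCK_TAI 11    /* value used by the kernel's uapi headers */
#endif

int main(void)
{
        struct timespec real, tai;

        clock_gettime(CLOCK_REALTIME, &real);
        clock_gettime(CLOCK_TAI, &tai);
        printf("realtime: %ld.%09ld\n", (long)real.tv_sec, real.tv_nsec);
        printf("tai:      %ld.%09ld\n", (long)tai.tv_sec, tai.tv_nsec);
        return 0;
}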
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b5..463aa6736751 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,6 +4,7 @@
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
6 6
7#include <linux/console.h>
7#include <linux/vt_kern.h> 8#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 9#include <linux/kbd_kern.h>
9#include <linux/vt.h> 10#include <linux/vt.h>
@@ -14,8 +15,120 @@
14 15
15static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
16 17
18static DEFINE_MUTEX(vt_switch_mutex);
19
20struct pm_vt_switch {
21 struct list_head head;
22 struct device *dev;
23 bool required;
24};
25
26static LIST_HEAD(pm_vt_switch_list);
27
28
29/**
30 * pm_vt_switch_required - indicate VT switch at suspend requirements
31 * @dev: device
32 * @required: if true, caller needs VT switch at suspend/resume time
33 *
34 * The different console drivers may or may not require VT switches across
35 * suspend/resume, depending on how they handle restoring video state and
36 * what may be running.
37 *
38 * Drivers can indicate support for switchless suspend/resume, which can
39 * save time and flicker, by using this routine and passing 'false' as
40 * the argument. If any loaded driver needs VT switching, or the
41 * no_console_suspend argument has been passed on the command line, VT
42 * switches will occur.
43 */
44void pm_vt_switch_required(struct device *dev, bool required)
45{
46 struct pm_vt_switch *entry, *tmp;
47
48 mutex_lock(&vt_switch_mutex);
49 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
50 if (tmp->dev == dev) {
51 /* already registered, update requirement */
52 tmp->required = required;
53 goto out;
54 }
55 }
56
57 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
58 if (!entry)
59 goto out;
60
61 entry->required = required;
62 entry->dev = dev;
63
64 list_add(&entry->head, &pm_vt_switch_list);
65out:
66 mutex_unlock(&vt_switch_mutex);
67}
68EXPORT_SYMBOL(pm_vt_switch_required);
69
70/**
71 * pm_vt_switch_unregister - stop tracking a device's VT switching needs
72 * @dev: device
73 *
74 * Remove @dev from the vt switch list.
75 */
76void pm_vt_switch_unregister(struct device *dev)
77{
78 struct pm_vt_switch *tmp;
79
80 mutex_lock(&vt_switch_mutex);
81 list_for_each_entry(tmp, &pm_vt_switch_list, head) {
82 if (tmp->dev == dev) {
83 list_del(&tmp->head);
84 break;
85 }
86 }
87 mutex_unlock(&vt_switch_mutex);
88}
89EXPORT_SYMBOL(pm_vt_switch_unregister);
90
91/*
 91 * There are three cases when a VT switch on suspend/resume is required:
93 * 1) no driver has indicated a requirement one way or another, so preserve
94 * the old behavior
95 * 2) console suspend is disabled, we want to see debug messages across
96 * suspend/resume
97 * 3) any registered driver indicates it needs a VT switch
98 *
99 * If none of these conditions is present, meaning we have at least one driver
100 * that doesn't need the switch, and none that do, we can avoid it to make
101 * resume look a little prettier (and suspend too, but that's usually hidden,
102 * e.g. when closing the lid on a laptop).
103 */
104static bool pm_vt_switch(void)
105{
106 struct pm_vt_switch *entry;
107 bool ret = true;
108
109 mutex_lock(&vt_switch_mutex);
110 if (list_empty(&pm_vt_switch_list))
111 goto out;
112
113 if (!console_suspend_enabled)
114 goto out;
115
116 list_for_each_entry(entry, &pm_vt_switch_list, head) {
117 if (entry->required)
118 goto out;
119 }
120
121 ret = false;
122out:
123 mutex_unlock(&vt_switch_mutex);
124 return ret;
125}
126
17int pm_prepare_console(void) 127int pm_prepare_console(void)
18{ 128{
129 if (!pm_vt_switch())
130 return 0;
131
19 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); 132 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
20 if (orig_fgconsole < 0) 133 if (orig_fgconsole < 0)
21 return 1; 134 return 1;
@@ -26,6 +139,9 @@ int pm_prepare_console(void)
26 139
27void pm_restore_console(void) 140void pm_restore_console(void)
28{ 141{
142 if (!pm_vt_switch())
143 return;
144
29 if (orig_fgconsole >= 0) { 145 if (orig_fgconsole >= 0) {
30 vt_move_to_console(orig_fgconsole, 0); 146 vt_move_to_console(orig_fgconsole, 0);
31 vt_kmsg_redirect(orig_kmsg); 147 vt_kmsg_redirect(orig_kmsg);
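A typical consumer of the new pm_vt_switch_required()/pm_vt_switch_unregister() hooks is a KMS graphics driver that restores its own video state on resume. A hypothetical driver sketch follows; the names and the assumption that the declarations live in linux/pm.h are for illustration only.

/*
 * Hypothetical driver sketch (names invented): a KMS driver that repaints
 * the screen itself on resume opts out of the suspend-time VT switch, and
 * unregisters again when it goes away.
 */
#include <linux/device.h>
#include <linux/pm.h>

static int mydrm_bind(struct device *dev)
{
        /* we restore video state ourselves, no VT switch needed */
        pm_vt_switch_required(dev, false);
        return 0;
}

static void mydrm_unbind(struct device *dev)
{
        pm_vt_switch_unregister(dev);
}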
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc9..7ef6866b521d 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,7 +32,7 @@ static void handle_poweroff(int key)
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "poweroff(o)",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d4feda084a3a..bef86d121eb2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
76 76
77bool valid_state(suspend_state_t state) 77bool valid_state(suspend_state_t state)
78{ 78{
79 if (state == PM_SUSPEND_FREEZE) 79 if (state == PM_SUSPEND_FREEZE) {
80 return true; 80#ifdef CONFIG_PM_DEBUG
81 if (pm_test_level != TEST_NONE &&
82 pm_test_level != TEST_FREEZER &&
83 pm_test_level != TEST_DEVICES &&
84 pm_test_level != TEST_PLATFORM) {
85 printk(KERN_WARNING "Unsupported pm_test mode for "
86 "freeze state, please choose "
87 "none/freezer/devices/platform.\n");
88 return false;
89 }
90#endif
91 return true;
92 }
81 /* 93 /*
82 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel 94 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
83 * support and need to be valid to the lowlevel 95 * support and need to be valid to the lowlevel
@@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
184 goto Platform_wake; 196 goto Platform_wake;
185 } 197 }
186 198
199 if (suspend_test(TEST_PLATFORM))
200 goto Platform_wake;
201
187 /* 202 /*
188 * PM_SUSPEND_FREEZE equals 203 * PM_SUSPEND_FREEZE equals
189 * frozen processes + suspended devices + idle processors. 204 * frozen processes + suspended devices + idle processors.
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
195 goto Platform_wake; 210 goto Platform_wake;
196 } 211 }
197 212
198 if (suspend_test(TEST_PLATFORM))
199 goto Platform_wake;
200
201 error = disable_nonboot_cpus(); 213 error = disable_nonboot_cpus();
202 if (error || suspend_test(TEST_CPUS)) 214 if (error || suspend_test(TEST_CPUS))
203 goto Enable_cpus; 215 goto Enable_cpus;
diff --git a/kernel/printk.c b/kernel/printk.c
index 0b31715f335a..96dcfcd9a2d4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -43,19 +43,13 @@
43#include <linux/rculist.h> 43#include <linux/rculist.h>
44#include <linux/poll.h> 44#include <linux/poll.h>
45#include <linux/irq_work.h> 45#include <linux/irq_work.h>
46#include <linux/utsname.h>
46 47
47#include <asm/uaccess.h> 48#include <asm/uaccess.h>
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/printk.h> 51#include <trace/events/printk.h>
51 52
52/*
53 * Architectures can override it:
54 */
55void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
56{
57}
58
59/* printk's without a loglevel use this.. */ 53/* printk's without a loglevel use this.. */
60#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL 54#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
61 55
@@ -63,8 +57,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
63#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ 57#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
64#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ 58#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
65 59
66DECLARE_WAIT_QUEUE_HEAD(log_wait);
67
68int console_printk[4] = { 60int console_printk[4] = {
69 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ 61 DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
70 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 62 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -224,6 +216,7 @@ struct log {
224static DEFINE_RAW_SPINLOCK(logbuf_lock); 216static DEFINE_RAW_SPINLOCK(logbuf_lock);
225 217
226#ifdef CONFIG_PRINTK 218#ifdef CONFIG_PRINTK
219DECLARE_WAIT_QUEUE_HEAD(log_wait);
227/* the next printk record to read by syslog(READ) or /proc/kmsg */ 220/* the next printk record to read by syslog(READ) or /proc/kmsg */
228static u64 syslog_seq; 221static u64 syslog_seq;
229static u32 syslog_idx; 222static u32 syslog_idx;
@@ -609,7 +602,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
609 /* return error when data has vanished underneath us */ 602 /* return error when data has vanished underneath us */
610 if (user->seq < log_first_seq) 603 if (user->seq < log_first_seq)
611 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; 604 ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
612 ret = POLLIN|POLLRDNORM; 605 else
606 ret = POLLIN|POLLRDNORM;
613 } 607 }
614 raw_spin_unlock_irq(&logbuf_lock); 608 raw_spin_unlock_irq(&logbuf_lock);
615 609
@@ -1266,7 +1260,7 @@ static void call_console_drivers(int level, const char *text, size_t len)
1266{ 1260{
1267 struct console *con; 1261 struct console *con;
1268 1262
1269 trace_console(text, 0, len, len); 1263 trace_console(text, len);
1270 1264
1271 if (level >= console_loglevel && !ignore_loglevel) 1265 if (level >= console_loglevel && !ignore_loglevel)
1272 return; 1266 return;
@@ -1724,6 +1718,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; }
1724 1718
1725#endif /* CONFIG_PRINTK */ 1719#endif /* CONFIG_PRINTK */
1726 1720
1721#ifdef CONFIG_EARLY_PRINTK
1722struct console *early_console;
1723
1724void early_vprintk(const char *fmt, va_list ap)
1725{
1726 if (early_console) {
1727 char buf[512];
1728 int n = vscnprintf(buf, sizeof(buf), fmt, ap);
1729
1730 early_console->write(early_console, buf, n);
1731 }
1732}
1733
1734asmlinkage void early_printk(const char *fmt, ...)
1735{
1736 va_list ap;
1737
1738 va_start(ap, fmt);
1739 early_vprintk(fmt, ap);
1740 va_end(ap);
1741}
1742#endif
1743
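Hedged sketch (not part of this diff): early_printk()/early_vprintk() above only emit output once an architecture points the global early_console at a console with a .write callback. The UART register, uart_early_write() and setup_my_early_console() below are hypothetical; real arch code does the equivalent from its early setup path, with the usual console/io headers included.

static void __iomem *early_uart_base;	/* hypothetical, mapped by arch code */

static void uart_early_write(struct console *con, const char *s, unsigned int n)
{
	while (n--)
		writeb(*s++, early_uart_base);	/* assumes a simple TX register */
}

static struct console early_uart_console = {
	.name	= "earlyuart",
	.write	= uart_early_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,
	.index	= -1,
};

static void __init setup_my_early_console(void)
{
	early_console = &early_uart_console;
	register_console(early_console);	/* also routes regular printk here */
}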
1727static int __add_preferred_console(char *name, int idx, char *options, 1744static int __add_preferred_console(char *name, int idx, char *options,
1728 char *brl_options) 1745 char *brl_options)
1729{ 1746{
@@ -1957,45 +1974,6 @@ int is_console_locked(void)
1957 return console_locked; 1974 return console_locked;
1958} 1975}
1959 1976
1960/*
1961 * Delayed printk version, for scheduler-internal messages:
1962 */
1963#define PRINTK_BUF_SIZE 512
1964
1965#define PRINTK_PENDING_WAKEUP 0x01
1966#define PRINTK_PENDING_SCHED 0x02
1967
1968static DEFINE_PER_CPU(int, printk_pending);
1969static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1970
1971static void wake_up_klogd_work_func(struct irq_work *irq_work)
1972{
1973 int pending = __this_cpu_xchg(printk_pending, 0);
1974
1975 if (pending & PRINTK_PENDING_SCHED) {
1976 char *buf = __get_cpu_var(printk_sched_buf);
1977 printk(KERN_WARNING "[sched_delayed] %s", buf);
1978 }
1979
1980 if (pending & PRINTK_PENDING_WAKEUP)
1981 wake_up_interruptible(&log_wait);
1982}
1983
1984static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
1985 .func = wake_up_klogd_work_func,
1986 .flags = IRQ_WORK_LAZY,
1987};
1988
1989void wake_up_klogd(void)
1990{
1991 preempt_disable();
1992 if (waitqueue_active(&log_wait)) {
1993 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1994 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
1995 }
1996 preempt_enable();
1997}
1998
1999static void console_cont_flush(char *text, size_t size) 1977static void console_cont_flush(char *text, size_t size)
2000{ 1978{
2001 unsigned long flags; 1979 unsigned long flags;
@@ -2458,6 +2436,44 @@ static int __init printk_late_init(void)
2458late_initcall(printk_late_init); 2436late_initcall(printk_late_init);
2459 2437
2460#if defined CONFIG_PRINTK 2438#if defined CONFIG_PRINTK
2439/*
2440 * Delayed printk version, for scheduler-internal messages:
2441 */
2442#define PRINTK_BUF_SIZE 512
2443
2444#define PRINTK_PENDING_WAKEUP 0x01
2445#define PRINTK_PENDING_SCHED 0x02
2446
2447static DEFINE_PER_CPU(int, printk_pending);
2448static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
2449
2450static void wake_up_klogd_work_func(struct irq_work *irq_work)
2451{
2452 int pending = __this_cpu_xchg(printk_pending, 0);
2453
2454 if (pending & PRINTK_PENDING_SCHED) {
2455 char *buf = __get_cpu_var(printk_sched_buf);
2456 printk(KERN_WARNING "[sched_delayed] %s", buf);
2457 }
2458
2459 if (pending & PRINTK_PENDING_WAKEUP)
2460 wake_up_interruptible(&log_wait);
2461}
2462
2463static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
2464 .func = wake_up_klogd_work_func,
2465 .flags = IRQ_WORK_LAZY,
2466};
2467
2468void wake_up_klogd(void)
2469{
2470 preempt_disable();
2471 if (waitqueue_active(&log_wait)) {
2472 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2473 irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
2474 }
2475 preempt_enable();
2476}
2461 2477
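Hedged usage sketch (not from this diff): printk_sched(), whose definition follows in the unchanged context below, is the variant scheduler-internal code is expected to call while holding scheduler locks; the message is staged in the per-CPU printk_sched_buf above and emitted later from irq_work context with a "[sched_delayed]" prefix. The caller below is hypothetical.

/* Hypothetical scheduler-side caller; real users live in kernel/sched/. */
static void my_report_clock_skew(int cpu, u64 delta_ns)
{
	printk_sched("clock skew of %llu ns detected on CPU %d\n",
		     (unsigned long long)delta_ns, cpu);
}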
2462int printk_sched(const char *fmt, ...) 2478int printk_sched(const char *fmt, ...)
2463{ 2479{
@@ -2834,4 +2850,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
2834 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 2850 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
2835} 2851}
2836EXPORT_SYMBOL_GPL(kmsg_dump_rewind); 2852EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
2853
2854static char dump_stack_arch_desc_str[128];
2855
2856/**
2857 * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
2858 * @fmt: printf-style format string
2859 * @...: arguments for the format string
2860 *
2861 * The configured string will be printed right after utsname during task
2862 * dumps. Usually used to add arch-specific system identifiers. If an
2863 * arch wants to make use of such an ID string, it should initialize this
2864 * as soon as possible during boot.
2865 */
2866void __init dump_stack_set_arch_desc(const char *fmt, ...)
2867{
2868 va_list args;
2869
2870 va_start(args, fmt);
2871 vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
2872 fmt, args);
2873 va_end(args);
2874}
2875
2876/**
2877 * dump_stack_print_info - print generic debug info for dump_stack()
2878 * @log_lvl: log level
2879 *
2880 * Arch-specific dump_stack() implementations can use this function to
2881 * print out the same debug information as the generic dump_stack().
2882 */
2883void dump_stack_print_info(const char *log_lvl)
2884{
2885 printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
2886 log_lvl, raw_smp_processor_id(), current->pid, current->comm,
2887 print_tainted(), init_utsname()->release,
2888 (int)strcspn(init_utsname()->version, " "),
2889 init_utsname()->version);
2890
2891 if (dump_stack_arch_desc_str[0] != '\0')
2892 printk("%sHardware name: %s\n",
2893 log_lvl, dump_stack_arch_desc_str);
2894
2895 print_worker_info(log_lvl, current);
2896}
2897
2898/**
2899 * show_regs_print_info - print generic debug info for show_regs()
2900 * @log_lvl: log level
2901 *
2902 * show_regs() implementations can use this function to print out generic
2903 * debug information.
2904 */
2905void show_regs_print_info(const char *log_lvl)
2906{
2907 dump_stack_print_info(log_lvl);
2908
2909 printk("%stask: %p ti: %p task.ti: %p\n",
2910 log_lvl, current, current_thread_info(),
2911 task_thread_info(current));
2912}
2913
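A hedged usage sketch (not from this diff): per the kerneldoc above, an architecture is expected to call dump_stack_set_arch_desc() once, early in boot, with whatever identifies the platform; the string then appears as the "Hardware name:" line printed by dump_stack_print_info(). The function name and strings below are made up.

static void __init my_arch_set_dump_desc(void)
{
	const char *vendor  = "ExampleVendor";		/* hypothetical */
	const char *product = "ExampleBoard rev B";	/* hypothetical */

	dump_stack_set_arch_desc("%s %s", vendor, product);
}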
2837#endif 2914#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index dc3384ee874e..0bf400737660 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = {
462 .write = prof_cpu_mask_proc_write, 462 .write = prof_cpu_mask_proc_write,
463}; 463};
464 464
465void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 465void create_prof_cpu_mask(void)
466{ 466{
467 /* create /proc/irq/prof_cpu_mask */ 467 /* create /proc/irq/prof_cpu_mask */
468 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); 468 proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
469} 469}
470 470
471/* 471/*
@@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
600 NULL, &proc_profile_operations); 600 NULL, &proc_profile_operations);
601 if (!entry) 601 if (!entry)
602 return 0; 602 return 0;
603 entry->size = (1+prof_len) * sizeof(atomic_t); 603 proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
604 hotcpu_notifier(profile_cpu_callback, 0); 604 hotcpu_notifier(profile_cpu_callback, 0);
605 return 0; 605 return 0;
606} 606}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index acbd28424d81..17ae54da0ec2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -24,6 +24,7 @@
24#include <linux/regset.h> 24#include <linux/regset.h>
25#include <linux/hw_breakpoint.h> 25#include <linux/hw_breakpoint.h>
26#include <linux/cn_proc.h> 26#include <linux/cn_proc.h>
27#include <linux/compat.h>
27 28
28 29
29static int ptrace_trapping_sleep_fn(void *flags) 30static int ptrace_trapping_sleep_fn(void *flags)
@@ -618,6 +619,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
618 return error; 619 return error;
619} 620}
620 621
622static int ptrace_peek_siginfo(struct task_struct *child,
623 unsigned long addr,
624 unsigned long data)
625{
626 struct ptrace_peeksiginfo_args arg;
627 struct sigpending *pending;
628 struct sigqueue *q;
629 int ret, i;
630
631 ret = copy_from_user(&arg, (void __user *) addr,
632 sizeof(struct ptrace_peeksiginfo_args));
633 if (ret)
634 return -EFAULT;
635
636 if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
637 return -EINVAL; /* unknown flags */
638
639 if (arg.nr < 0)
640 return -EINVAL;
641
642 if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
643 pending = &child->signal->shared_pending;
644 else
645 pending = &child->pending;
646
647 for (i = 0; i < arg.nr; ) {
648 siginfo_t info;
649 s32 off = arg.off + i;
650
651 spin_lock_irq(&child->sighand->siglock);
652 list_for_each_entry(q, &pending->list, list) {
653 if (!off--) {
654 copy_siginfo(&info, &q->info);
655 break;
656 }
657 }
658 spin_unlock_irq(&child->sighand->siglock);
659
660 if (off >= 0) /* beyond the end of the list */
661 break;
662
663#ifdef CONFIG_COMPAT
664 if (unlikely(is_compat_task())) {
665 compat_siginfo_t __user *uinfo = compat_ptr(data);
666
667 ret = copy_siginfo_to_user32(uinfo, &info);
668 ret |= __put_user(info.si_code, &uinfo->si_code);
669 } else
670#endif
671 {
672 siginfo_t __user *uinfo = (siginfo_t __user *) data;
673
674 ret = copy_siginfo_to_user(uinfo, &info);
675 ret |= __put_user(info.si_code, &uinfo->si_code);
676 }
677
678 if (ret) {
679 ret = -EFAULT;
680 break;
681 }
682
683 data += sizeof(siginfo_t);
684 i++;
685
686 if (signal_pending(current))
687 break;
688
689 cond_resched();
690 }
691
692 if (i > 0)
693 return i;
694
695 return ret;
696}
621 697
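Hedged user-space sketch (not part of this diff): how a tracer might use the new PTRACE_PEEKSIGINFO request implemented above to inspect up to four queued siginfo entries of a stopped tracee without dequeueing them. The fallback constants and struct mirror the new UAPI definitions; error handling is minimal and the tracee is assumed to be attached and stopped.

#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/ptrace.h>

#ifndef PTRACE_PEEKSIGINFO
#define PTRACE_PEEKSIGINFO		0x4209	/* value from the new UAPI header */
#define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
struct ptrace_peeksiginfo_args {
	uint64_t off;	/* queue position to start from */
	uint32_t flags;	/* 0 for per-thread queue, or PTRACE_PEEKSIGINFO_SHARED */
	int32_t  nr;	/* maximum number of entries to copy */
};
#endif

static void peek_pending_signals(pid_t pid)
{
	struct ptrace_peeksiginfo_args args = { .off = 0, .flags = 0, .nr = 4 };
	siginfo_t info[4];
	long n, i;

	n = ptrace(PTRACE_PEEKSIGINFO, pid, &args, info);
	for (i = 0; i < n; i++)
		printf("pending signal %d (si_code %d)\n",
		       info[i].si_signo, info[i].si_code);
}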
622#ifdef PTRACE_SINGLESTEP 698#ifdef PTRACE_SINGLESTEP
623#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) 699#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -748,6 +824,10 @@ int ptrace_request(struct task_struct *child, long request,
748 ret = put_user(child->ptrace_message, datalp); 824 ret = put_user(child->ptrace_message, datalp);
749 break; 825 break;
750 826
827 case PTRACE_PEEKSIGINFO:
828 ret = ptrace_peek_siginfo(child, addr, data);
829 break;
830
751 case PTRACE_GETSIGINFO: 831 case PTRACE_GETSIGINFO:
752 ret = ptrace_getsiginfo(child, &siginfo); 832 ret = ptrace_getsiginfo(child, &siginfo);
753 if (!ret) 833 if (!ret)
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed68..071b0ab455cb 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
97 range[i].end = range[j].end; 97 range[i].end = range[j].end;
98 range[i].start = end; 98 range[i].start = end;
99 } else { 99 } else {
100 printk(KERN_ERR "run of slot in ranges\n"); 100 pr_err("%s: run out of slot in ranges\n",
101 __func__);
101 } 102 }
102 range[j].end = start; 103 range[j].end = start;
103 continue; 104 continue;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd86..16ea67925015 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 78 .name = #sname, \
79 .abbr = sabbr, \
79} 80}
80 81
81struct rcu_state rcu_sched_state = 82struct rcu_state rcu_sched_state =
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88
88static struct rcu_state *rcu_state; 89static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
223module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
224module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
225 226
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp);
226static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227static void force_quiescent_state(struct rcu_state *rsp); 230static void force_quiescent_state(struct rcu_state *rsp);
228static int rcu_pending(int cpu); 231static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
310 313
311 if (rcu_gp_in_progress(rsp)) 314 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */ 315 return 0; /* No, a grace period is already in progress. */
316 if (rcu_nocb_needs_gp(rsp))
317 return 1; /* Yes, a no-CBs CPU needs one. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL]) 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */ 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -794,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
794 rdp->offline_fqs++; 799 rdp->offline_fqs++;
795 return 1; 800 return 1;
796 } 801 }
802
803 /*
804 * There is a possibility that a CPU in adaptive-ticks state
805 * might run in the kernel with the scheduling-clock tick disabled
806 * for an extended time period. Invoke rcu_kick_nohz_cpu() to
 807 * force the CPU to restart the scheduling-clock tick if this
808 * CPU is in this state.
809 */
810 rcu_kick_nohz_cpu(rdp->cpu);
811
797 return 0; 812 return 0;
798} 813}
799 814
@@ -1035,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp)
1035{ 1050{
1036 int i; 1051 int i;
1037 1052
1053 if (init_nocb_callback_list(rdp))
1054 return;
1038 rdp->nxtlist = NULL; 1055 rdp->nxtlist = NULL;
1039 for (i = 0; i < RCU_NEXT_SIZE; i++) 1056 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 rdp->nxttail[i] = &rdp->nxtlist; 1057 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042} 1058}
1043 1059
1044/* 1060/*
@@ -1071,6 +1087,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1071} 1087}
1072 1088
1073/* 1089/*
1090 * Trace-event helper function for rcu_start_future_gp() and
1091 * rcu_nocb_wait_gp().
1092 */
1093static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1094 unsigned long c, char *s)
1095{
1096 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1097 rnp->completed, c, rnp->level,
1098 rnp->grplo, rnp->grphi, s);
1099}
1100
1101/*
1102 * Start some future grace period, as needed to handle newly arrived
1103 * callbacks. The required future grace periods are recorded in each
1104 * rcu_node structure's ->need_future_gp field.
1105 *
1106 * The caller must hold the specified rcu_node structure's ->lock.
1107 */
1108static unsigned long __maybe_unused
1109rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1110{
1111 unsigned long c;
1112 int i;
1113 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1114
1115 /*
1116 * Pick up grace-period number for new callbacks. If this
1117 * grace period is already marked as needed, return to the caller.
1118 */
1119 c = rcu_cbs_completed(rdp->rsp, rnp);
1120 trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
1121 if (rnp->need_future_gp[c & 0x1]) {
1122 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
1123 return c;
1124 }
1125
1126 /*
1127 * If either this rcu_node structure or the root rcu_node structure
1128 * believes that a grace period is in progress, then we must wait
1129 * for the one following, which is in "c". Because our request
1130 * will be noticed at the end of the current grace period, we don't
1131 * need to explicitly start one.
1132 */
1133 if (rnp->gpnum != rnp->completed ||
1134 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1135 rnp->need_future_gp[c & 0x1]++;
1136 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
1137 return c;
1138 }
1139
1140 /*
1141 * There might be no grace period in progress. If we don't already
1142 * hold it, acquire the root rcu_node structure's lock in order to
1143 * start one (if needed).
1144 */
1145 if (rnp != rnp_root)
1146 raw_spin_lock(&rnp_root->lock);
1147
1148 /*
1149 * Get a new grace-period number. If there really is no grace
1150 * period in progress, it will be smaller than the one we obtained
1151 * earlier. Adjust callbacks as needed. Note that even no-CBs
1152 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1153 */
1154 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1155 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
1156 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
1157 rdp->nxtcompleted[i] = c;
1158
1159 /*
1160 * If the need for the required grace period is already
1161 * recorded, trace and leave.
1162 */
1163 if (rnp_root->need_future_gp[c & 0x1]) {
1164 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
1165 goto unlock_out;
1166 }
1167
1168 /* Record the need for the future grace period. */
1169 rnp_root->need_future_gp[c & 0x1]++;
1170
1171 /* If a grace period is not already in progress, start one. */
1172 if (rnp_root->gpnum != rnp_root->completed) {
1173 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
1174 } else {
1175 trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
1176 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1177 }
1178unlock_out:
1179 if (rnp != rnp_root)
1180 raw_spin_unlock(&rnp_root->lock);
1181 return c;
1182}
1183
1184/*
1185 * Clean up any old requests for the just-ended grace period. Also return
1186 * whether any additional grace periods have been requested. Also invoke
1187 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1188 * waiting for this grace period to complete.
1189 */
1190static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1191{
1192 int c = rnp->completed;
1193 int needmore;
1194 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1195
1196 rcu_nocb_gp_cleanup(rsp, rnp);
1197 rnp->need_future_gp[c & 0x1] = 0;
1198 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1199 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
1200 return needmore;
1201}
1202
1203/*
1074 * If there is room, assign a ->completed number to any callbacks on 1204 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any 1205 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has 1206 * callbacks that were previously assigned a ->completed number that has
@@ -1129,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1259 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c; 1260 rdp->nxtcompleted[i] = c;
1131 } 1261 }
1262 /* Record any needed additional grace periods. */
1263 rcu_start_future_gp(rnp, rdp);
1132 1264
1133 /* Trace depending on how much we were able to accelerate. */ 1265 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1266 if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
1308 rdp = this_cpu_ptr(rsp->rda); 1440 rdp = this_cpu_ptr(rsp->rda);
1309 rcu_preempt_check_blocked_tasks(rnp); 1441 rcu_preempt_check_blocked_tasks(rnp);
1310 rnp->qsmask = rnp->qsmaskinit; 1442 rnp->qsmask = rnp->qsmaskinit;
1311 rnp->gpnum = rsp->gpnum; 1443 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 WARN_ON_ONCE(rnp->completed != rsp->completed); 1444 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313 rnp->completed = rsp->completed; 1445 ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 if (rnp == rdp->mynode) 1446 if (rnp == rdp->mynode)
1315 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1447 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 rcu_preempt_boost_start_gp(rnp); 1448 rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1319 rnp->grphi, rnp->qsmask); 1451 rnp->grphi, rnp->qsmask);
1320 raw_spin_unlock_irq(&rnp->lock); 1452 raw_spin_unlock_irq(&rnp->lock);
1321#ifdef CONFIG_PROVE_RCU_DELAY 1453#ifdef CONFIG_PROVE_RCU_DELAY
1322 if ((random32() % (rcu_num_nodes * 8)) == 0) 1454 if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
1455 system_state == SYSTEM_RUNNING)
1323 schedule_timeout_uninterruptible(2); 1456 schedule_timeout_uninterruptible(2);
1324#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1457#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 cond_resched(); 1458 cond_resched();
@@ -1361,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1361static void rcu_gp_cleanup(struct rcu_state *rsp) 1494static void rcu_gp_cleanup(struct rcu_state *rsp)
1362{ 1495{
1363 unsigned long gp_duration; 1496 unsigned long gp_duration;
1497 int nocb = 0;
1364 struct rcu_data *rdp; 1498 struct rcu_data *rdp;
1365 struct rcu_node *rnp = rcu_get_root(rsp); 1499 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1500
@@ -1390,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1390 */ 1524 */
1391 rcu_for_each_node_breadth_first(rsp, rnp) { 1525 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 raw_spin_lock_irq(&rnp->lock); 1526 raw_spin_lock_irq(&rnp->lock);
1393 rnp->completed = rsp->gpnum; 1527 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1528 rdp = this_cpu_ptr(rsp->rda);
1529 if (rnp == rdp->mynode)
1530 __rcu_process_gp_end(rsp, rnp, rdp);
1531 nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 raw_spin_unlock_irq(&rnp->lock); 1532 raw_spin_unlock_irq(&rnp->lock);
1395 cond_resched(); 1533 cond_resched();
1396 } 1534 }
1397 rnp = rcu_get_root(rsp); 1535 rnp = rcu_get_root(rsp);
1398 raw_spin_lock_irq(&rnp->lock); 1536 raw_spin_lock_irq(&rnp->lock);
1537 rcu_nocb_gp_set(rnp, nocb);
1399 1538
1400 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1539 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1540 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 rsp->fqs_state = RCU_GP_IDLE; 1541 rsp->fqs_state = RCU_GP_IDLE;
1403 rdp = this_cpu_ptr(rsp->rda); 1542 rdp = this_cpu_ptr(rsp->rda);
1543 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 if (cpu_needs_another_gp(rsp, rdp)) 1544 if (cpu_needs_another_gp(rsp, rdp))
1405 rsp->gp_flags = 1; 1545 rsp->gp_flags = 1;
1406 raw_spin_unlock_irq(&rnp->lock); 1546 raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
1476/* 1616/*
1477 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1617 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 * in preparation for detecting the next grace period. The caller must hold 1618 * in preparation for detecting the next grace period. The caller must hold
1479 * the root node's ->lock, which is released before return. Hard irqs must 1619 * the root node's ->lock and hard irqs must be disabled.
1480 * be disabled.
1481 * 1620 *
1482 * Note that it is legal for a dying CPU (which is marked as offline) to 1621 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 * invoke this function. This can happen when the dying CPU reports its 1622 * invoke this function. This can happen when the dying CPU reports its
1484 * quiescent state. 1623 * quiescent state.
1485 */ 1624 */
1486static void 1625static void
1487rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1626rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1488 __releases(rcu_get_root(rsp)->lock) 1627 struct rcu_data *rdp)
1489{ 1628{
1490 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1629 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1491 struct rcu_node *rnp = rcu_get_root(rsp);
1492
1493 if (!rsp->gp_kthread ||
1494 !cpu_needs_another_gp(rsp, rdp)) {
1495 /* 1630 /*
1496 * Either we have not yet spawned the grace-period 1631 * Either we have not yet spawned the grace-period
1497 * task, this CPU does not need another grace period, 1632 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress. 1633 * or a grace period is already in progress.
1499 * Either way, don't start a new grace period. 1634 * Either way, don't start a new grace period.
1500 */ 1635 */
1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 return; 1636 return;
1503 } 1637 }
1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1513 rsp->gp_flags = RCU_GP_FLAG_INIT; 1638 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519 1639
1520 /* Wake up rcu_gp_kthread() to start the grace period. */ 1640 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 wake_up(&rsp->gp_wq); 1641 wake_up(&rsp->gp_wq);
1522} 1642}
1523 1643
1524/* 1644/*
1645 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
1646 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
1647 * is invoked indirectly from rcu_advance_cbs(), which would result in
1648 * endless recursion -- or would do so if it wasn't for the self-deadlock
1649 * that is encountered beforehand.
1650 */
1651static void
1652rcu_start_gp(struct rcu_state *rsp)
1653{
1654 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1655 struct rcu_node *rnp = rcu_get_root(rsp);
1656
1657 /*
1658 * If there is no grace period in progress right now, any
1659 * callbacks we have up to this point will be satisfied by the
1660 * next grace period. Also, advancing the callbacks reduces the
1661 * probability of false positives from cpu_needs_another_gp()
1662 * resulting in pointless grace periods. So, advance callbacks
1663 * then start the grace period!
1664 */
1665 rcu_advance_cbs(rsp, rnp, rdp);
1666 rcu_start_gp_advanced(rsp, rnp, rdp);
1667}
1668
1669/*
1525 * Report a full set of quiescent states to the specified rcu_state 1670 * Report a full set of quiescent states to the specified rcu_state
1526 * data structure. This involves cleaning up after the prior grace 1671 * data structure. This involves cleaning up after the prior grace
1527 * period and letting rcu_start_gp() start up the next grace period 1672 * period and letting rcu_start_gp() start up the next grace period
1528 * if one is needed. Note that the caller must hold rnp->lock, as 1673 * if one is needed. Note that the caller must hold rnp->lock, which
1529 * required by rcu_start_gp(), which will release it. 1674 * is released before return.
1530 */ 1675 */
1531static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1676static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 __releases(rcu_get_root(rsp)->lock) 1677 __releases(rcu_get_root(rsp)->lock)
@@ -1685,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1685 struct rcu_node *rnp, struct rcu_data *rdp) 1830 struct rcu_node *rnp, struct rcu_data *rdp)
1686{ 1831{
1687 /* No-CBs CPUs do not have orphanable callbacks. */ 1832 /* No-CBs CPUs do not have orphanable callbacks. */
1688 if (is_nocb_cpu(rdp->cpu)) 1833 if (rcu_is_nocb_cpu(rdp->cpu))
1689 return; 1834 return;
1690 1835
1691 /* 1836 /*
@@ -2124,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2124 local_irq_save(flags); 2269 local_irq_save(flags);
2125 if (cpu_needs_another_gp(rsp, rdp)) { 2270 if (cpu_needs_another_gp(rsp, rdp)) {
2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2271 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127 rcu_start_gp(rsp, flags); /* releases above lock */ 2272 rcu_start_gp(rsp);
2273 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 } else { 2274 } else {
2129 local_irq_restore(flags); 2275 local_irq_restore(flags);
2130 } 2276 }
@@ -2169,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2169 2315
2170static void invoke_rcu_core(void) 2316static void invoke_rcu_core(void)
2171{ 2317{
2172 raise_softirq(RCU_SOFTIRQ); 2318 if (cpu_online(smp_processor_id()))
2319 raise_softirq(RCU_SOFTIRQ);
2173} 2320}
2174 2321
2175/* 2322/*
@@ -2204,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2204 2351
2205 /* Start a new grace period if one not already started. */ 2352 /* Start a new grace period if one not already started. */
2206 if (!rcu_gp_in_progress(rsp)) { 2353 if (!rcu_gp_in_progress(rsp)) {
2207 unsigned long nestflag;
2208 struct rcu_node *rnp_root = rcu_get_root(rsp); 2354 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2355
2210 raw_spin_lock_irqsave(&rnp_root->lock, nestflag); 2356 raw_spin_lock(&rnp_root->lock);
2211 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ 2357 rcu_start_gp(rsp);
2358 raw_spin_unlock(&rnp_root->lock);
2212 } else { 2359 } else {
2213 /* Give the grace period a kick. */ 2360 /* Give the grace period a kick. */
2214 rdp->blimit = LONG_MAX; 2361 rdp->blimit = LONG_MAX;
@@ -2628,19 +2775,27 @@ static int rcu_pending(int cpu)
2628} 2775}
2629 2776
2630/* 2777/*
2631 * Check to see if any future RCU-related work will need to be done 2778 * Return true if the specified CPU has any callback. If all_lazy is
2632 * by the current CPU, even if none need be done immediately, returning 2779 * non-NULL, store an indication of whether all callbacks are lazy.
2633 * 1 if so. 2780 * (If there are no callbacks, all of them are deemed to be lazy.)
2634 */ 2781 */
2635static int rcu_cpu_has_callbacks(int cpu) 2782static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636{ 2783{
2784 bool al = true;
2785 bool hc = false;
2786 struct rcu_data *rdp;
2637 struct rcu_state *rsp; 2787 struct rcu_state *rsp;
2638 2788
2639 /* RCU callbacks either ready or pending? */ 2789 for_each_rcu_flavor(rsp) {
2640 for_each_rcu_flavor(rsp) 2790 rdp = per_cpu_ptr(rsp->rda, cpu);
2641 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 2791 if (rdp->qlen != rdp->qlen_lazy)
2642 return 1; 2792 al = false;
2643 return 0; 2793 if (rdp->nxtlist)
2794 hc = true;
2795 }
2796 if (all_lazy)
2797 *all_lazy = al;
2798 return hc;
2644} 2799}
2645 2800
2646/* 2801/*
@@ -2747,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
2747 * corresponding CPU's preceding callbacks have been invoked. 2902 * corresponding CPU's preceding callbacks have been invoked.
2748 */ 2903 */
2749 for_each_possible_cpu(cpu) { 2904 for_each_possible_cpu(cpu) {
2750 if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) 2905 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
2751 continue; 2906 continue;
2752 rdp = per_cpu_ptr(rsp->rda, cpu); 2907 rdp = per_cpu_ptr(rsp->rda, cpu);
2753 if (is_nocb_cpu(cpu)) { 2908 if (rcu_is_nocb_cpu(cpu)) {
2754 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 2909 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2755 rsp->n_barrier_done); 2910 rsp->n_barrier_done);
2756 atomic_inc(&rsp->barrier_cpu_count); 2911 atomic_inc(&rsp->barrier_cpu_count);
@@ -2859,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2859 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3014 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 atomic_set(&rdp->dynticks->dynticks, 3015 atomic_set(&rdp->dynticks->dynticks,
2861 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3016 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862 rcu_prepare_for_idle_init(cpu);
2863 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3017 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3018
2865 /* Add CPU to rcu_node bitmasks. */ 3019 /* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3063 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 struct rcu_node *rnp = rdp->mynode; 3064 struct rcu_node *rnp = rdp->mynode;
2911 struct rcu_state *rsp; 3065 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2913 3066
2914 trace_rcu_utilization("Start CPU hotplug"); 3067 trace_rcu_utilization("Start CPU hotplug");
2915 switch (action) { 3068 switch (action) {
@@ -2923,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2923 rcu_boost_kthread_setaffinity(rnp, -1); 3076 rcu_boost_kthread_setaffinity(rnp, -1);
2924 break; 3077 break;
2925 case CPU_DOWN_PREPARE: 3078 case CPU_DOWN_PREPARE:
2926 if (nocb_cpu_expendable(cpu)) 3079 rcu_boost_kthread_setaffinity(rnp, cpu);
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2930 break; 3080 break;
2931 case CPU_DYING: 3081 case CPU_DYING:
2932 case CPU_DYING_FROZEN: 3082 case CPU_DYING_FROZEN:
2933 /*
2934 * The whole machine is "stopped" except this CPU, so we can
2935 * touch any data without introducing corruption. We send the
2936 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937 */
2938 for_each_rcu_flavor(rsp) 3083 for_each_rcu_flavor(rsp)
2939 rcu_cleanup_dying_cpu(rsp); 3084 rcu_cleanup_dying_cpu(rsp);
2940 rcu_cleanup_after_idle(cpu);
2941 break; 3085 break;
2942 case CPU_DEAD: 3086 case CPU_DEAD:
2943 case CPU_DEAD_FROZEN: 3087 case CPU_DEAD_FROZEN:
@@ -2950,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2950 break; 3094 break;
2951 } 3095 }
2952 trace_rcu_utilization("End CPU hotplug"); 3096 trace_rcu_utilization("End CPU hotplug");
2953 return ret; 3097 return NOTIFY_OK;
2954} 3098}
2955 3099
2956/* 3100/*
@@ -3085,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3085 } 3229 }
3086 rnp->level = i; 3230 rnp->level = i;
3087 INIT_LIST_HEAD(&rnp->blkd_tasks); 3231 INIT_LIST_HEAD(&rnp->blkd_tasks);
3232 rcu_init_one_nocb(rnp);
3088 } 3233 }
3089 } 3234 }
3090 3235
@@ -3170,8 +3315,7 @@ void __init rcu_init(void)
3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3315 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3316 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 __rcu_init_preempt(); 3317 __rcu_init_preempt();
3173 rcu_init_nocb(); 3318 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3319
3176 /* 3320 /*
3177 * We don't need protection against CPU-hotplug here because 3321 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c896b5045d9d..da77a8f57ff9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,18 +88,13 @@ struct rcu_dynticks {
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 atomic_t dynticks; /* Even value for idle, else odd. */ 89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ 90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */ 91 bool all_lazy; /* Are all CPU's CBs lazy? */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted; 92 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */ 93 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap; 94 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */ 95 /* idle-period nonlazy_posted snapshot. */
96 unsigned long last_accelerate;
97 /* Last jiffy CBs were accelerated. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 99#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105}; 100};
@@ -134,9 +129,6 @@ struct rcu_node {
134 /* elements that need to drain to allow the */ 129 /* elements that need to drain to allow the */
135 /* current expedited grace period to */ 130 /* current expedited grace period to */
136 /* complete (only for TREE_PREEMPT_RCU). */ 131 /* complete (only for TREE_PREEMPT_RCU). */
137 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138 /* Since this has meaning only for leaf */
139 /* rcu_node structures, 32 bits suffices. */
140 unsigned long qsmaskinit; 132 unsigned long qsmaskinit;
141 /* Per-GP initial value for qsmask & expmask. */ 133 /* Per-GP initial value for qsmask & expmask. */
142 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 188 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 189 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 190#endif /* #ifdef CONFIG_RCU_BOOST */
191#ifdef CONFIG_RCU_NOCB_CPU
192 wait_queue_head_t nocb_gp_wq[2];
193 /* Place for rcu_nocb_kthread() to wait GP. */
194#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
195 int need_future_gp[2];
196 /* Counts of upcoming no-CB GP requests. */
199 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200} ____cacheline_internodealigned_in_smp; 198} ____cacheline_internodealigned_in_smp;
201 199
@@ -328,6 +326,11 @@ struct rcu_data {
328 struct task_struct *nocb_kthread; 326 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 327#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328
329 /* 8) RCU CPU stall data. */
330#ifdef CONFIG_RCU_CPU_STALL_INFO
331 unsigned int softirq_snap; /* Snapshot of softirq activity. */
332#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
333
331 int cpu; 334 int cpu;
332 struct rcu_state *rsp; 335 struct rcu_state *rsp;
333}; 336};
@@ -375,12 +378,6 @@ struct rcu_state {
375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 void (*func)(struct rcu_head *head)); 380 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381
385 /* The following fields are guarded by the root rcu_node's lock. */ 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383
@@ -443,6 +440,7 @@ struct rcu_state {
443 unsigned long gp_max; /* Maximum GP duration in */ 440 unsigned long gp_max; /* Maximum GP duration in */
444 /* jiffies. */ 441 /* jiffies. */
445 char *name; /* Name of structure. */ 442 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */
446 struct list_head flavors; /* List of RCU flavors. */ 444 struct list_head flavors; /* List of RCU flavors. */
447}; 445};
448 446
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 518 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 519#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 520static void __cpuinit rcu_prepare_kthreads(int cpu);
523static void rcu_prepare_for_idle_init(int cpu);
524static void rcu_cleanup_after_idle(int cpu); 521static void rcu_cleanup_after_idle(int cpu);
525static void rcu_prepare_for_idle(int cpu); 522static void rcu_prepare_for_idle(int cpu);
526static void rcu_idle_count_callbacks_posted(void); 523static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
529static void print_cpu_stall_info_end(void); 526static void print_cpu_stall_info_end(void);
530static void zero_cpu_stall_ticks(struct rcu_data *rdp); 527static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531static void increment_cpu_stall_ticks(void); 528static void increment_cpu_stall_ticks(void);
532static bool is_nocb_cpu(int cpu); 529static int rcu_nocb_needs_gp(struct rcu_state *rsp);
530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
532static void rcu_init_one_nocb(struct rcu_node *rnp);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy); 534 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp); 536 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 537static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 538static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp); 539static void rcu_kick_nohz_cpu(int cpu);
541static void __init rcu_init_nocb(void); 540static bool init_nocb_callback_list(struct rcu_data *rdp);
542 541
543#endif /* #ifndef RCU_TREE_NONCORE */ 542#endif /* #ifndef RCU_TREE_NONCORE */
544 543
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..170814dc418f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h>
31 32
32#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
33 34
@@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void)
85 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU 88#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) {
91 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
92 have_rcu_nocb_mask = true;
93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tExperimental no-CBs CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tExperimental no-CBs for all CPUs\n");
100 cpumask_setall(rcu_nocb_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 if (have_rcu_nocb_mask) { 103 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 104 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 105 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll) 106 if (rcu_nocb_poll)
@@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void)
101#ifdef CONFIG_TREE_PREEMPT_RCU 112#ifdef CONFIG_TREE_PREEMPT_RCU
102 113
103struct rcu_state rcu_preempt_state = 114struct rcu_state rcu_preempt_state =
104 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); 115 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 116DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106static struct rcu_state *rcu_state = &rcu_preempt_state; 117static struct rcu_state *rcu_state = &rcu_preempt_state;
107 118
@@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1533int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1544int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534{ 1545{
1535 *delta_jiffies = ULONG_MAX; 1546 *delta_jiffies = ULONG_MAX;
1536 return rcu_cpu_has_callbacks(cpu); 1547 return rcu_cpu_has_callbacks(cpu, NULL);
1537}
1538
1539/*
1540 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541 */
1542static void rcu_prepare_for_idle_init(int cpu)
1543{
1544} 1548}
1545 1549
1546/* 1550/*
@@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void)
1577 * 1581 *
1578 * The following three preprocessor symbols control this state machine: 1582 *
1579 * 1583 *
1580 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581 * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582 * scheduling-clock interrupt than to loop through the state machine
1583 * at full power.
1584 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585 * optional if RCU does not need anything immediately from this
1586 * CPU, even if this CPU still has RCU callbacks queued. The first
1587 * times through the state machine are mandatory: we need to give
1588 * the state machine a chance to communicate a quiescent state
1589 * to the RCU core.
1590 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1584 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1585 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 * is sized to be roughly one RCU grace period. Those energy-efficiency 1586 * is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void)
1602 * adjustment, they can be converted into kernel config parameters, though 1596 * adjustment, they can be converted into kernel config parameters, though
1603 * making the state machine smarter might be a better option. 1597 * making the state machine smarter might be a better option.
1604 */ 1598 */
1605#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1599#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1600#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1601
1610extern int tick_nohz_enabled; 1602static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1611 1603module_param(rcu_idle_gp_delay, int, 0644);
1612/* 1604static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1613 * Does the specified flavor of RCU have non-lazy callbacks pending on 1605module_param(rcu_idle_lazy_gp_delay, int, 0644);
1614 * the specified CPU? Both RCU flavor and CPU are specified by the
1615 * rcu_data structure.
1616 */
1617static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
1618{
1619 return rdp->qlen != rdp->qlen_lazy;
1620}
1621 1606
1622#ifdef CONFIG_TREE_PREEMPT_RCU 1607extern int tick_nohz_enabled;
1623 1608
1624/* 1609/*
1625 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1610 * Try to advance callbacks for all flavors of RCU on the current CPU.
1626 * is no RCU-preempt in the kernel.) 1611 * Afterwards, if there are any callbacks ready for immediate invocation,
1612 * return true.
1627 */ 1613 */
1628static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1614static bool rcu_try_advance_all_cbs(void)
1629{ 1615{
1630 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1616 bool cbs_ready = false;
1631 1617 struct rcu_data *rdp;
1632 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1618 struct rcu_node *rnp;
1633} 1619 struct rcu_state *rsp;
1634
1635#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636 1620
1637static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1621 for_each_rcu_flavor(rsp) {
1638{ 1622 rdp = this_cpu_ptr(rsp->rda);
1639 return 0; 1623 rnp = rdp->mynode;
1640}
1641 1624
1642#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1625 /*
1626 * Don't bother checking unless a grace period has
1627 * completed since we last checked and there are
1628 * callbacks not yet ready to invoke.
1629 */
1630 if (rdp->completed != rnp->completed &&
1631 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1632 rcu_process_gp_end(rsp, rdp);
1643 1633
1644/* 1634 if (cpu_has_callbacks_ready_to_invoke(rdp))
1645 * Does any flavor of RCU have non-lazy callbacks on the specified CPU? 1635 cbs_ready = true;
1646 */ 1636 }
1647static bool rcu_cpu_has_nonlazy_callbacks(int cpu) 1637 return cbs_ready;
1648{
1649 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652} 1638}
1653 1639
1654/* 1640/*
1655 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1641 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1656 * callbacks on this CPU, (2) this CPU has not yet attempted to enter 1642 * to invoke. If the CPU has callbacks, try to advance them. Tell the
1657 * dyntick-idle mode, or (3) this CPU is in the process of attempting to 1643 * caller to set the timeout based on whether or not there are non-lazy
1658 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed 1644 * callbacks.
1659 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660 * it is better to incur scheduling-clock interrupts than to spin
1661 * continuously for the same time duration!
1662 * 1645 *
1663 * The delta_jiffies argument is used to store the time when RCU is 1646 * The caller must have disabled interrupts.
1664 * going to need the CPU again if it still has callbacks. The reason
1665 * for this is that rcu_prepare_for_idle() might need to post a timer,
1666 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667 * the wakeup time for this CPU. This means that RCU's timer can be
1668 * delayed until the wakeup time, which defeats the purpose of posting
1669 * a timer.
1670 */ 1647 */
1671int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1648int rcu_needs_cpu(int cpu, unsigned long *dj)
1672{ 1649{
1673 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1650 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1651
1675 /* Flag a new idle sojourn to the idle-entry state machine. */ 1652 /* Snapshot to detect later posting of non-lazy callback. */
1676 rdtp->idle_first_pass = 1; 1653 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1654
1677 /* If no callbacks, RCU doesn't need the CPU. */ 1655 /* If no callbacks, RCU doesn't need the CPU. */
1678 if (!rcu_cpu_has_callbacks(cpu)) { 1656 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1679 *delta_jiffies = ULONG_MAX; 1657 *dj = ULONG_MAX;
1680 return 0; 1658 return 0;
1681 } 1659 }
1682 if (rdtp->dyntick_holdoff == jiffies) { 1660
1683 /* RCU recently tried and failed, so don't try again. */ 1661 /* Attempt to advance callbacks. */
1684 *delta_jiffies = 1; 1662 if (rcu_try_advance_all_cbs()) {
1663 /* Some ready to invoke, so initiate later invocation. */
1664 invoke_rcu_core();
1685 return 1; 1665 return 1;
1686 } 1666 }
1687 /* Set up for the possibility that RCU will post a timer. */ 1667 rdtp->last_accelerate = jiffies;
1688 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1668
1689 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, 1669 /* Request timer delay depending on laziness, and round. */
1690 RCU_IDLE_GP_DELAY) - jiffies; 1670 if (rdtp->all_lazy) {
1671 *dj = round_up(rcu_idle_gp_delay + jiffies,
1672 rcu_idle_gp_delay) - jiffies;
1691 } else { 1673 } else {
1692 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; 1674 *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1693 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1694 } 1675 }
1695 return 0; 1676 return 0;
1696} 1677}
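Editor's note: the rewritten rcu_needs_cpu() returns the wakeup delay through *dj by rounding an absolute jiffies target and converting back to a relative delay; one branch rounds up to a multiple of the short grace-period delay, the other pushes the long lazy delay out to a whole-second boundary via round_jiffies(). A small userspace sketch of the two rounding styles; the constants and the round_whole_second() stand-in are illustrative assumptions, not the kernel's tunables:

#include <stdio.h>

#define HZ 1000UL   /* illustrative tick rate, 1 jiffy = 1 ms */

/* Generic round-up-to-a-multiple; the kernel's round_up() requires a
 * power-of-two multiple, this form does not. */
static unsigned long round_up_mult(unsigned long x, unsigned long mult)
{
	return ((x + mult - 1) / mult) * mult;
}

/* Stand-in for round_jiffies(): push the target out to a whole second. */
static unsigned long round_whole_second(unsigned long j)
{
	return round_up_mult(j, HZ);
}

int main(void)
{
	unsigned long jiffies = 12345;          /* pretend current time       */
	unsigned long gp_delay = 4;             /* short, GP-length delay     */
	unsigned long lazy_delay = 6 * HZ;      /* long delay for lazy CBs    */

	/* Delta until the next multiple of gp_delay at least gp_delay away. */
	unsigned long dj_short = round_up_mult(jiffies + gp_delay, gp_delay) - jiffies;
	/* Delta until a whole-second boundary at least lazy_delay away. */
	unsigned long dj_long = round_whole_second(jiffies + lazy_delay) - jiffies;

	printf("short wakeup in %lu jiffies, long wakeup in %lu jiffies\n",
	       dj_short, dj_long);
	return 0;
}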
1697 1678
1698/* 1679/*
1699 * Handler for smp_call_function_single(). The only point of this 1680 * Prepare a CPU for idle from an RCU perspective. The first major task
1700 * handler is to wake the CPU up, so the handler does only tracing. 1681 * is to sense whether nohz mode has been enabled or disabled via sysfs.
1701 */ 1682 * The second major task is to check to see if a non-lazy callback has
1702void rcu_idle_demigrate(void *unused) 1683 * arrived at a CPU that previously had only lazy callbacks. The third
1703{ 1684 * major task is to accelerate (that is, assign grace-period numbers to)
1704 trace_rcu_prep_idle("Demigrate"); 1685 * any recently arrived callbacks.
1705}
1706
1707/*
1708 * Timer handler used to force CPU to start pushing its remaining RCU
1709 * callbacks in the case where it entered dyntick-idle mode with callbacks
1710 * pending. The handler doesn't really need to do anything because the
1711 * real work is done upon re-entry to idle, or by the next scheduling-clock
1712 * interrupt should idle not be re-entered.
1713 *
1714 * One special case: the timer gets migrated without awakening the CPU
1715 * on which the timer was scheduled. In this case, we must wake up
1716 * that CPU. We do so with smp_call_function_single().
1717 */
1718static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719{
1720 int cpu = (int)cpu_in;
1721
1722 trace_rcu_prep_idle("Timer");
1723 if (cpu != smp_processor_id())
1724 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725 else
1726 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727}
1728
1729/*
1730 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731 */
1732static void rcu_prepare_for_idle_init(int cpu)
1733{
1734 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735
1736 rdtp->dyntick_holdoff = jiffies - 1;
1737 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738 rdtp->idle_gp_timer_expires = jiffies - 1;
1739 rdtp->idle_first_pass = 1;
1740}
1741
1742/*
1743 * Clean up for exit from idle. Because we are exiting from idle, there
1744 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745 * do nothing if this timer is not active, so just cancel it unconditionally.
1746 */
1747static void rcu_cleanup_after_idle(int cpu)
1748{
1749 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750
1751 del_timer(&rdtp->idle_gp_timer);
1752 trace_rcu_prep_idle("Cleanup after idle");
1753 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754}
1755
1756/*
1757 * Check to see if any RCU-related work can be done by the current CPU,
1758 * and if so, schedule a softirq to get it done. This function is part
1759 * of the RCU implementation; it is -not- an exported member of the RCU API.
1760 *
1761 * The idea is for the current CPU to clear out all work required by the
1762 * RCU core for the current grace period, so that this CPU can be permitted
1763 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764 * at the end of the grace period by whatever CPU ends the grace period.
1765 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766 * number of wakeups by a modest integer factor.
1767 *
1768 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769 * disabled, we do one pass of force_quiescent_state(), then do an
1770 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771 * later. The ->dyntick_drain field controls the sequencing.
1772 * 1686 *
1773 * The caller must have disabled interrupts. 1687 * The caller must have disabled interrupts.
1774 */ 1688 */
1775static void rcu_prepare_for_idle(int cpu) 1689static void rcu_prepare_for_idle(int cpu)
1776{ 1690{
1777 struct timer_list *tp; 1691 struct rcu_data *rdp;
1778 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1692 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1693 struct rcu_node *rnp;
1694 struct rcu_state *rsp;
1779 int tne; 1695 int tne;
1780 1696
1781 /* Handle nohz enablement switches conservatively. */ 1697 /* Handle nohz enablement switches conservatively. */
1782 tne = ACCESS_ONCE(tick_nohz_enabled); 1698 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 if (tne != rdtp->tick_nohz_enabled_snap) { 1699 if (tne != rdtp->tick_nohz_enabled_snap) {
1784 if (rcu_cpu_has_callbacks(cpu)) 1700 if (rcu_cpu_has_callbacks(cpu, NULL))
1785 invoke_rcu_core(); /* force nohz to see update. */ 1701 invoke_rcu_core(); /* force nohz to see update. */
1786 rdtp->tick_nohz_enabled_snap = tne; 1702 rdtp->tick_nohz_enabled_snap = tne;
1787 return; 1703 return;
@@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu)
1789 if (!tne) 1705 if (!tne)
1790 return; 1706 return;
1791 1707
1792 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1708 /* If this is a no-CBs CPU, no callbacks, just return. */
1793 if (!is_idle_task(current)) { 1709 if (rcu_is_nocb_cpu(cpu))
1794 rdtp->dyntick_holdoff = jiffies - 1;
1795 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796 trace_rcu_prep_idle("User dyntick with callbacks");
1797 rdtp->idle_gp_timer_expires =
1798 round_up(jiffies + RCU_IDLE_GP_DELAY,
1799 RCU_IDLE_GP_DELAY);
1800 } else if (rcu_cpu_has_callbacks(cpu)) {
1801 rdtp->idle_gp_timer_expires =
1802 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804 } else {
1805 return;
1806 }
1807 tp = &rdtp->idle_gp_timer;
1808 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1809 return; 1710 return;
1810 }
1811 1711
1812 /* 1712 /*
1813 * If this is an idle re-entry, for example, due to use of 1713 * If a non-lazy callback arrived at a CPU having only lazy
1814 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1714 * callbacks, invoke RCU core for the side-effect of recalculating
1815 * loop, then don't take any state-machine actions, unless the 1715 * idle duration on re-entry to idle.
1816 * momentary exit from idle queued additional non-lazy callbacks.
1817 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818 * pending.
1819 */ 1716 */
1820 if (!rdtp->idle_first_pass && 1717 if (rdtp->all_lazy &&
1821 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { 1718 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1822 if (rcu_cpu_has_callbacks(cpu)) { 1719 invoke_rcu_core();
1823 tp = &rdtp->idle_gp_timer;
1824 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825 }
1826 return; 1720 return;
1827 } 1721 }
1828 rdtp->idle_first_pass = 0;
1829 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1722
1831 /* 1723 /*
1832 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1724 * If we have not yet accelerated this jiffy, accelerate all
1833 * Also reset state to avoid prejudicing later attempts. 1725 * callbacks on this CPU.
1834 */ 1726 */
1835 if (!rcu_cpu_has_callbacks(cpu)) { 1727 if (rdtp->last_accelerate == jiffies)
1836 rdtp->dyntick_holdoff = jiffies - 1;
1837 rdtp->dyntick_drain = 0;
1838 trace_rcu_prep_idle("No callbacks");
1839 return; 1728 return;
1729 rdtp->last_accelerate = jiffies;
1730 for_each_rcu_flavor(rsp) {
1731 rdp = per_cpu_ptr(rsp->rda, cpu);
1732 if (!*rdp->nxttail[RCU_DONE_TAIL])
1733 continue;
1734 rnp = rdp->mynode;
1735 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1736 rcu_accelerate_cbs(rsp, rnp, rdp);
1737 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 } 1738 }
1739}
1841 1740
1842 /* 1741/*
1843 * If in holdoff mode, just return. We will presumably have 1742 * Clean up for exit from idle. Attempt to advance callbacks based on
1844 * refrained from disabling the scheduling-clock tick. 1743 * any grace periods that elapsed while the CPU was idle, and if any
1845 */ 1744 * callbacks are now ready to invoke, initiate invocation.
1846 if (rdtp->dyntick_holdoff == jiffies) { 1745 */
1847 trace_rcu_prep_idle("In holdoff"); 1746static void rcu_cleanup_after_idle(int cpu)
1848 return; 1747{
1849 } 1748 struct rcu_data *rdp;
1749 struct rcu_state *rsp;
1850 1750
1851 /* Check and update the ->dyntick_drain sequencing. */ 1751 if (rcu_is_nocb_cpu(cpu))
1852 if (rdtp->dyntick_drain <= 0) {
1853 /* First time through, initialize the counter. */
1854 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856 !rcu_pending(cpu) &&
1857 !local_softirq_pending()) {
1858 /* Can we go dyntick-idle despite still having callbacks? */
1859 rdtp->dyntick_drain = 0;
1860 rdtp->dyntick_holdoff = jiffies;
1861 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862 trace_rcu_prep_idle("Dyntick with callbacks");
1863 rdtp->idle_gp_timer_expires =
1864 round_up(jiffies + RCU_IDLE_GP_DELAY,
1865 RCU_IDLE_GP_DELAY);
1866 } else {
1867 rdtp->idle_gp_timer_expires =
1868 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870 }
1871 tp = &rdtp->idle_gp_timer;
1872 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874 return; /* Nothing more to do immediately. */
1875 } else if (--(rdtp->dyntick_drain) <= 0) {
1876 /* We have hit the limit, so time to give up. */
1877 rdtp->dyntick_holdoff = jiffies;
1878 trace_rcu_prep_idle("Begin holdoff");
1879 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1880 return; 1752 return;
1881 } 1753 rcu_try_advance_all_cbs();
1882 1754 for_each_rcu_flavor(rsp) {
1883 /* 1755 rdp = per_cpu_ptr(rsp->rda, cpu);
1884 * Do one step of pushing the remaining RCU callbacks through 1756 if (cpu_has_callbacks_ready_to_invoke(rdp))
1885 * the RCU core state machine. 1757 invoke_rcu_core();
1886 */
1887#ifdef CONFIG_TREE_PREEMPT_RCU
1888 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889 rcu_preempt_qs(cpu);
1890 force_quiescent_state(&rcu_preempt_state);
1891 }
1892#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894 rcu_sched_qs(cpu);
1895 force_quiescent_state(&rcu_sched_state);
1896 }
1897 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898 rcu_bh_qs(cpu);
1899 force_quiescent_state(&rcu_bh_state);
1900 }
1901
1902 /*
1903 * If RCU callbacks are still pending, RCU still needs this CPU.
1904 * So try forcing the callbacks through the grace period.
1905 */
1906 if (rcu_cpu_has_callbacks(cpu)) {
1907 trace_rcu_prep_idle("More callbacks");
1908 invoke_rcu_core();
1909 } else {
1910 trace_rcu_prep_idle("Callbacks drained");
1911 } 1758 }
1912} 1759}
1913 1760
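Editor's note: the rewritten rcu_prepare_for_idle() above replaces the old holdoff/drain state machine with three cheap checks: kick the RCU core if a non-lazy callback arrived on a CPU that previously had only lazy ones, skip the work if the acceleration pass already ran this jiffy, and otherwise accelerate callbacks for each flavor. A minimal userspace model of that decision logic (field names follow the patch, the struct itself is illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Toy per-CPU state mirroring the fields used by the new
 * rcu_prepare_for_idle(); this is a model, not kernel code. */
struct idle_state {
	bool all_lazy;                     /* only lazy CBs when idle began  */
	unsigned long nonlazy_posted;      /* running count of non-lazy CBs  */
	unsigned long nonlazy_posted_snap; /* count sampled at idle entry    */
	unsigned long last_accelerate;     /* jiffy of last acceleration     */
};

/* Returns true when the (more expensive) acceleration pass should run. */
static bool prepare_for_idle(struct idle_state *s, unsigned long jiffies,
			     bool *kick_core)
{
	*kick_core = false;

	/* Non-lazy work arrived on a CPU that thought it was all-lazy:
	 * ask the core to recompute the idle duration. */
	if (s->all_lazy && s->nonlazy_posted != s->nonlazy_posted_snap) {
		*kick_core = true;
		return false;
	}

	/* Throttle: accelerate callbacks at most once per jiffy. */
	if (s->last_accelerate == jiffies)
		return false;
	s->last_accelerate = jiffies;
	return true;
}

int main(void)
{
	struct idle_state s = { .all_lazy = true };
	bool kick;

	printf("accelerate=%d\n", prepare_for_idle(&s, 100, &kick)); /* 1 */
	printf("accelerate=%d\n", prepare_for_idle(&s, 100, &kick)); /* 0, same jiffy */
	s.nonlazy_posted++;                           /* a non-lazy callback arrives */
	prepare_for_idle(&s, 101, &kick);
	printf("kick core=%d\n", kick);                              /* 1 */
	return 0;
}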
@@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier);
2015static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1862static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016{ 1863{
2017 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1864 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018 struct timer_list *tltp = &rdtp->idle_gp_timer; 1865 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2019 char c;
2020 1866
2021 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; 1867 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
2022 if (timer_pending(tltp)) 1868 rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
2023 sprintf(cp, "drain=%d %c timer=%lu", 1869 ulong2long(nlpd),
2024 rdtp->dyntick_drain, c, tltp->expires - jiffies); 1870 rdtp->all_lazy ? 'L' : '.',
2025 else 1871 rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2026 sprintf(cp, "drain=%d %c timer not pending",
2027 rdtp->dyntick_drain, c);
2028} 1872}
2029 1873
2030#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1874#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2070 ticks_value = rsp->gpnum - rdp->gpnum; 1914 ticks_value = rsp->gpnum - rdp->gpnum;
2071 } 1915 }
2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1916 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", 1917 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 cpu, ticks_value, ticks_title, 1918 cpu, ticks_value, ticks_title,
2075 atomic_read(&rdtp->dynticks) & 0xfff, 1919 atomic_read(&rdtp->dynticks) & 0xfff,
2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1920 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1921 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 fast_no_hz); 1922 fast_no_hz);
2078} 1923}
2079 1924
@@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void)
2087static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1932static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088{ 1933{
2089 rdp->ticks_this_gp = 0; 1934 rdp->ticks_this_gp = 0;
1935 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090} 1936}
2091 1937
2092/* Increment ->ticks_this_gp for all flavors of RCU. */ 1938/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg)
2165} 2011}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2012early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2013
2014/*
2015 * Do any no-CBs CPUs need another grace period?
2016 *
2017 * Interrupts must be disabled. If the caller does not hold the root
2018 * rnp_node structure's ->lock, the results are advisory only.
2019 */
2020static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2021{
2022 struct rcu_node *rnp = rcu_get_root(rsp);
2023
2024 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2025}
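Editor's note: rcu_nocb_needs_gp(), rcu_nocb_gp_cleanup() and rcu_nocb_gp_set() all index two-element arrays by grace-period parity: a request for "the grace period after the one numbered completed" lives in slot (completed + 1) & 1, and finishing grace period N drains slot N & 1. A toy model of that two-slot bookkeeping (names are illustrative):

#include <stdio.h>

struct gp_node {
	unsigned long completed;   /* number of the last finished GP */
	int need_future_gp[2];     /* pending requests, by GP parity */
};

static void request_future_gp(struct gp_node *n)
{
	n->need_future_gp[(n->completed + 1) & 0x1]++;
}

static void complete_gp(struct gp_node *n)
{
	n->completed++;
	/* Everyone waiting on this parity slot would be woken here. */
	printf("GP %lu done, waking %d waiter(s)\n",
	       n->completed, n->need_future_gp[n->completed & 0x1]);
	n->need_future_gp[n->completed & 0x1] = 0;
}

int main(void)
{
	struct gp_node n = { 0, { 0, 0 } };

	request_future_gp(&n);   /* wants GP 1, lands in slot 1 */
	request_future_gp(&n);   /* wants GP 1, lands in slot 1 */
	complete_gp(&n);         /* GP 1 done, drains slot 1    */
	request_future_gp(&n);   /* wants GP 2, lands in slot 0 */
	complete_gp(&n);         /* GP 2 done, drains slot 0    */
	return 0;
}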
2026
2027/*
2028 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2029 * grace period.
2030 */
2031static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2032{
2033 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2034}
2035
2036/*
2037 * Set the root rcu_node structure's ->need_future_gp field
2038 * based on the sum of those of all rcu_node structures. This does
2039 * double-count the root rcu_node structure's requests, but this
2040 * is necessary to handle the possibility of a rcu_nocb_kthread()
2041 * having awakened during the time that the rcu_node structures
2042 * were being updated for the end of the previous grace period.
2043 */
2044static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2045{
2046 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2047}
2048
2049static void rcu_init_one_nocb(struct rcu_node *rnp)
2050{
2051 init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2052 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2053}
2054
2168/* Is the specified CPU a no-CBs CPU? */ 2055
2169static bool is_nocb_cpu(int cpu) 2056bool rcu_is_nocb_cpu(int cpu)
2170{ 2057{
2171 if (have_rcu_nocb_mask) 2058 if (have_rcu_nocb_mask)
2172 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2059 return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2224 bool lazy) 2111 bool lazy)
2225{ 2112{
2226 2113
2227 if (!is_nocb_cpu(rdp->cpu)) 2114 if (!rcu_is_nocb_cpu(rdp->cpu))
2228 return 0; 2115 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2116 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2117 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2118 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2119 (unsigned long)rhp->func,
2120 rdp->qlen_lazy, rdp->qlen);
2121 else
2122 trace_rcu_callback(rdp->rsp->name, rhp,
2123 rdp->qlen_lazy, rdp->qlen);
2230 return 1; 2124 return 1;
2231} 2125}
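Editor's note: the added tracing in __call_rcu_nocb() distinguishes kfree_rcu() callbacks by applying __is_kfree_rcu_offset() to the callback pointer. The underlying trick is that kfree_rcu() stores the byte offset of the rcu_head inside the enclosing object where a function pointer would normally go, so a "function pointer" with a small value is really an offset. A userspace model of that encoding; the 4096 bound and the struct names are assumptions for illustration, not the kernel's definitions:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define KFREE_OFFSET_MAX 4096UL   /* assumed bound: below this, it is an offset */

struct rcu_head {
	void (*func)(struct rcu_head *);
};

struct widget {
	int payload;
	struct rcu_head rh;
};

static int is_kfree_offset(unsigned long v)
{
	return v < KFREE_OFFSET_MAX;
}

static void invoke(struct rcu_head *rhp)
{
	unsigned long v = (unsigned long)rhp->func;

	if (is_kfree_offset(v)) {
		/* Recover the start of the enclosing object and free it. */
		free((char *)rhp - v);
		printf("freed container (offset %lu)\n", v);
	} else {
		rhp->func(rhp);
	}
}

int main(void)
{
	struct widget *w = malloc(sizeof(*w));

	/* What kfree_rcu(w, rh) conceptually records: */
	w->rh.func = (void (*)(struct rcu_head *))offsetof(struct widget, rh);
	invoke(&w->rh);
	return 0;
}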
2232 2126
@@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2241 long qll = rsp->qlen_lazy; 2135 long qll = rsp->qlen_lazy;
2242 2136
2243 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2137 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2244 if (!is_nocb_cpu(smp_processor_id())) 2138 if (!rcu_is_nocb_cpu(smp_processor_id()))
2245 return 0; 2139 return 0;
2246 rsp->qlen = 0; 2140 rsp->qlen = 0;
2247 rsp->qlen_lazy = 0; 2141 rsp->qlen_lazy = 0;
@@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2265} 2159}
2266 2160
2267/* 2161/*
2268 * There must be at least one non-no-CBs CPU in operation at any given 2162 * If necessary, kick off a new grace period, and either way wait
2269 * time, because no-CBs CPUs are not capable of initiating grace periods 2163 * for a subsequent grace period to complete.
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */ 2164 */
2275static bool nocb_cpu_expendable(int cpu) 2165static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276{ 2166{
2277 cpumask_var_t non_nocb_cpus; 2167 unsigned long c;
2278 int ret; 2168 bool d;
2169 unsigned long flags;
2170 struct rcu_node *rnp = rdp->mynode;
2171
2172 raw_spin_lock_irqsave(&rnp->lock, flags);
2173 c = rcu_start_future_gp(rnp, rdp);
2174 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2279 2175
2280 /* 2176 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, 2177 * Wait for the grace period. Do so interruptibly to avoid messing
2282 * then offlining this CPU is harmless. Let it happen. 2178 * up the load average.
2283 */ 2179 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) 2180 trace_rcu_future_gp(rnp, rdp, c, "StartWait");
2285 return 1; 2181 for (;;) {
2286 2182 wait_event_interruptible(
2287 /* If no memory, play it safe and keep the CPU around. */ 2183 rnp->nocb_gp_wq[c & 0x1],
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) 2184 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2289 return 0; 2185 if (likely(d))
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); 2186 break;
2291 cpumask_clear_cpu(cpu, non_nocb_cpus); 2187 flush_signals(current);
2292 ret = !cpumask_empty(non_nocb_cpus); 2188 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
2293 free_cpumask_var(non_nocb_cpus); 2189 }
2294 return ret; 2190 trace_rcu_future_gp(rnp, rdp, c, "EndWait");
2295} 2191 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and the invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357} 2192}
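Editor's note: rcu_nocb_wait_gp() sleeps on the parity-indexed wait queue and, after every wakeup, retests ULONG_CMP_GE(rnp->completed, c), calling flush_signals() when woken without the condition being true. The comparison matters because the grace-period counter is free-running and may wrap. A small demo of the wraparound-safe test; the macro below is the usual formulation of the kernel's ULONG_CMP_GE(), but verify against your tree before relying on it:

#include <limits.h>
#include <stdio.h>

/* Wraparound-safe "a >= b" for free-running unsigned counters. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long c = ULONG_MAX - 1;   /* GP number the waiter wants  */
	unsigned long completed;

	completed = ULONG_MAX - 2;         /* not there yet               */
	printf("done? %d\n", ULONG_CMP_GE(completed, c) ? 1 : 0);   /* 0 */

	completed = ULONG_MAX;             /* reached it                  */
	printf("done? %d\n", ULONG_CMP_GE(completed, c) ? 1 : 0);   /* 1 */

	completed = 5;                     /* counter wrapped past it     */
	printf("done? %d\n", ULONG_CMP_GE(completed, c) ? 1 : 0);   /* 1 */

	/* A naive completed >= c would wrongly say "not done" here: */
	printf("naive: %d\n", completed >= c ? 1 : 0);              /* 0 */
	return 0;
}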
2358 2193
2359/* 2194/*
@@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg)
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2225 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c; 2226 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; 2227 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote); 2228 rcu_nocb_wait_gp(rdp);
2394 2229
2395 /* Each pass through the following loop invokes a callback. */ 2230 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2231 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2436 return; 2271 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) { 2272 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu); 2273 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); 2274 t = kthread_run(rcu_nocb_kthread, rdp,
2275 "rcuo%c/%d", rsp->abbr, cpu);
2440 BUG_ON(IS_ERR(t)); 2276 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t; 2277 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 } 2278 }
2443} 2279}
2444 2280
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2281/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp) 2282static bool init_nocb_callback_list(struct rcu_data *rdp)
2447{ 2283{
2448 if (rcu_nocb_mask == NULL || 2284 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) 2285 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return; 2286 return false;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2287 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2288 return true;
2452} 2289}
2453 2290
2454/* Initialize the ->call_remote fields in the rcu_state structures. */ 2291#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2455static void __init rcu_init_nocb(void) 2292
2293static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2456{ 2294{
2457#ifdef CONFIG_PREEMPT_RCU 2295 return 0;
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462} 2296}
2463 2297
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2298static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2299{
2300}
2465 2301
2466static bool is_nocb_cpu(int cpu) 2302static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2303{
2304}
2305
2306static void rcu_init_one_nocb(struct rcu_node *rnp)
2467{ 2307{
2468 return false;
2469} 2308}
2470 2309
2471static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2310static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
@@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2480 return 0; 2319 return 0;
2481} 2320}
2482 2321
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2322static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{ 2323{
2490} 2324}
@@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{ 2327{
2494} 2328}
2495 2329
2496static void init_nocb_callback_list(struct rcu_data *rdp) 2330static bool init_nocb_callback_list(struct rcu_data *rdp)
2497{ 2331{
2332 return false;
2498} 2333}
2499 2334
2500static void __init rcu_init_nocb(void) 2335#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2336
2337/*
2338 * An adaptive-ticks CPU can potentially execute in kernel mode for an
2339 * arbitrarily long period of time with the scheduling-clock tick turned
2340 * off. RCU will be paying attention to this CPU because it is in the
2341 * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2342 * machine because the scheduling-clock tick has been disabled. Therefore,
2343 * if an adaptive-ticks CPU is failing to respond to the current grace
2344 * period and has not been idle from an RCU perspective, kick it.
2345 */
2346static void rcu_kick_nohz_cpu(int cpu)
2501{ 2347{
2348#ifdef CONFIG_NO_HZ_FULL
2349 if (tick_nohz_full_cpu(cpu))
2350 smp_send_reschedule(cpu);
2351#endif /* #ifdef CONFIG_NO_HZ_FULL */
2502} 2352}
2503
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa670..cf6c17412932 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op) 50 const struct seq_operations *op)
53{ 51{
@@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = {
97 .open = rcubarrier_open, 95 .open = rcubarrier_open,
98 .read = seq_read, 96 .read = seq_read,
99 .llseek = no_llseek, 97 .llseek = no_llseek,
100 .release = seq_release, 98 .release = single_release,
101}; 99};
102 100
103#ifdef CONFIG_RCU_BOOST 101#ifdef CONFIG_RCU_BOOST
@@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = {
208 .open = rcuexp_open, 206 .open = rcuexp_open,
209 .read = seq_read, 207 .read = seq_read,
210 .llseek = no_llseek, 208 .llseek = no_llseek,
211 .release = seq_release, 209 .release = single_release,
212}; 210};
213 211
214#ifdef CONFIG_RCU_BOOST 212#ifdef CONFIG_RCU_BOOST
@@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = {
308 .open = rcuhier_open, 306 .open = rcuhier_open,
309 .read = seq_read, 307 .read = seq_read,
310 .llseek = no_llseek, 308 .llseek = no_llseek,
311 .release = seq_release, 309 .release = single_release,
312}; 310};
313 311
314static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) 312static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = {
350 .open = rcugp_open, 348 .open = rcugp_open,
351 .read = seq_read, 349 .read = seq_read,
352 .llseek = no_llseek, 350 .llseek = no_llseek,
353 .release = seq_release, 351 .release = single_release,
354}; 352};
355 353
356static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 354static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
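Editor's note: the rcutree_trace.c hunks switch .release from seq_release to single_release, presumably because the corresponding open paths use single_open(); the general rule is that single_open() allocates per-open seq_file state that only single_release() frees. A minimal, hypothetical debugfs module sketch of that pairing (builds only against a kernel tree; names are made up):

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *demo_dentry;

static int demo_show(struct seq_file *m, void *unused)
{
	seq_puts(m, "hello from single_open()\n");
	return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
	/* single_open() allocates state that only single_release() frees. */
	return single_open(file, demo_show, inode->i_private);
}

static const struct file_operations demo_fops = {
	.owner   = THIS_MODULE,
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,   /* pairs with single_open() */
};

static int __init demo_init(void)
{
	demo_dentry = debugfs_create_file("single_open_demo", 0444,
					  NULL, NULL, &demo_fops);
	return demo_dentry ? 0 : -ENOMEM;   /* NULL-on-failure era API */
}

static void __exit demo_exit(void)
{
	debugfs_remove(demo_dentry);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");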
diff --git a/kernel/relay.c b/kernel/relay.c
index 01ab081ac53a..eef0d113b79e 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
588 chan->version = RELAYFS_CHANNEL_VERSION; 588 chan->version = RELAYFS_CHANNEL_VERSION;
589 chan->n_subbufs = n_subbufs; 589 chan->n_subbufs = n_subbufs;
590 chan->subbuf_size = subbuf_size; 590 chan->subbuf_size = subbuf_size;
591 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 591 chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
592 chan->parent = parent; 592 chan->parent = parent;
593 chan->private_data = private_data; 593 chan->private_data = private_data;
594 if (base_filename) { 594 if (base_filename) {
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
1099static int subbuf_read_actor(size_t read_start, 1099static int subbuf_read_actor(size_t read_start,
1100 struct rchan_buf *buf, 1100 struct rchan_buf *buf,
1101 size_t avail, 1101 size_t avail,
1102 read_descriptor_t *desc, 1102 read_descriptor_t *desc)
1103 read_actor_t actor)
1104{ 1103{
1105 void *from; 1104 void *from;
1106 int ret = 0; 1105 int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
1121typedef int (*subbuf_actor_t) (size_t read_start, 1120typedef int (*subbuf_actor_t) (size_t read_start,
1122 struct rchan_buf *buf, 1121 struct rchan_buf *buf,
1123 size_t avail, 1122 size_t avail,
1124 read_descriptor_t *desc, 1123 read_descriptor_t *desc);
1125 read_actor_t actor);
1126 1124
1127/* 1125/*
1128 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 1126 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
1129 */ 1127 */
1130static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, 1128static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1131 subbuf_actor_t subbuf_actor, 1129 subbuf_actor_t subbuf_actor,
1132 read_actor_t actor,
1133 read_descriptor_t *desc) 1130 read_descriptor_t *desc)
1134{ 1131{
1135 struct rchan_buf *buf = filp->private_data; 1132 struct rchan_buf *buf = filp->private_data;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
1150 break; 1147 break;
1151 1148
1152 avail = min(desc->count, avail); 1149 avail = min(desc->count, avail);
1153 ret = subbuf_actor(read_start, buf, avail, desc, actor); 1150 ret = subbuf_actor(read_start, buf, avail, desc);
1154 if (desc->error < 0) 1151 if (desc->error < 0)
1155 break; 1152 break;
1156 1153
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
1174 desc.count = count; 1171 desc.count = count;
1175 desc.arg.buf = buffer; 1172 desc.arg.buf = buffer;
1176 desc.error = 0; 1173 desc.error = 0;
1177 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, 1174 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
1178 NULL, &desc);
1179} 1175}
1180 1176
1181static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1177static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
21#include <linux/seq_file.h> 21#include <linux/seq_file.h>
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/pfn.h> 23#include <linux/pfn.h>
24#include <linux/mm.h>
24#include <asm/io.h> 25#include <asm/io.h>
25 26
26 27
@@ -50,6 +51,14 @@ struct resource_constraint {
50 51
51static DEFINE_RWLOCK(resource_lock); 52static DEFINE_RWLOCK(resource_lock);
52 53
54/*
55 * For memory hotplug, there is no way to free resource entries allocated
56 * by boot mem after the system is up. So for reusing the resource entry
57 * we need to remember the resource.
58 */
59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock);
61
53static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static void *r_next(struct seq_file *m, void *v, loff_t *pos)
54{ 63{
55 struct resource *p = v; 64 struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
151 160
152#endif /* CONFIG_PROC_FS */ 161#endif /* CONFIG_PROC_FS */
153 162
163static void free_resource(struct resource *res)
164{
165 if (!res)
166 return;
167
168 if (!PageSlab(virt_to_head_page(res))) {
169 spin_lock(&bootmem_resource_lock);
170 res->sibling = bootmem_resource_free;
171 bootmem_resource_free = res;
172 spin_unlock(&bootmem_resource_lock);
173 } else {
174 kfree(res);
175 }
176}
177
178static struct resource *alloc_resource(gfp_t flags)
179{
180 struct resource *res = NULL;
181
182 spin_lock(&bootmem_resource_lock);
183 if (bootmem_resource_free) {
184 res = bootmem_resource_free;
185 bootmem_resource_free = res->sibling;
186 }
187 spin_unlock(&bootmem_resource_lock);
188
189 if (res)
190 memset(res, 0, sizeof(struct resource));
191 else
192 res = kzalloc(sizeof(struct resource), flags);
193
194 return res;
195}
196
154/* Return the conflict entry if you can't request it */ 197/* Return the conflict entry if you can't request it */
155static struct resource * __request_resource(struct resource *root, struct resource *new) 198static struct resource * __request_resource(struct resource *root, struct resource *new)
156{ 199{
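Editor's note: resource entries allocated from boot memory cannot go back through kfree(), so the free_resource()/alloc_resource() pair above parks them on a private free list and recycles them, falling back to kzalloc() otherwise. A userspace model of that recycle-instead-of-free pattern; the static array stands in for boot memory and calloc() for kzalloc():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
	int value;
	struct node *next;        /* doubles as the free-list link      */
	int from_pool;            /* stand-in for the PageSlab() test   */
};

static struct node boot_pool[4]; /* "bootmem": never handed to free() */
static struct node *free_list;   /* recycled pool entries             */

static void release_node(struct node *n)
{
	if (n->from_pool) {           /* cannot free(); park it instead */
		n->next = free_list;
		free_list = n;
	} else {
		free(n);
	}
}

static struct node *get_node(void)
{
	struct node *n = free_list;

	if (n) {                      /* prefer recycled pool entries   */
		int from_pool = n->from_pool;

		free_list = n->next;
		memset(n, 0, sizeof(*n));
		n->from_pool = from_pool;
	} else {
		n = calloc(1, sizeof(*n));   /* "kzalloc" fallback      */
	}
	return n;
}

int main(void)
{
	/* Seed the free list with the "boot-time" entries. */
	for (int i = 0; i < 4; i++) {
		boot_pool[i].from_pool = 1;
		release_node(&boot_pool[i]);
	}

	struct node *a = get_node();   /* comes from the pool           */
	struct node *b = get_node();
	printf("a from pool? %d, b from pool? %d\n", a->from_pool, b->from_pool);
	release_node(a);               /* back onto the free list       */
	release_node(b);
	return 0;
}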
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
706 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
707} 750}
708 751
709/** 752static int __adjust_resource(struct resource *res, resource_size_t start,
710 * adjust_resource - modify a resource's start and size 753 resource_size_t size)
711 * @res: resource to modify
712 * @start: new start value
713 * @size: new size
714 *
715 * Given an existing resource, change its start and size to match the
716 * arguments. Returns 0 on success, -EBUSY if it can't fit.
717 * Existing children of the resource are assumed to be immutable.
718 */
719int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
720{ 754{
721 struct resource *tmp, *parent = res->parent; 755 struct resource *tmp, *parent = res->parent;
722 resource_size_t end = start + size - 1; 756 resource_size_t end = start + size - 1;
723 int result = -EBUSY; 757 int result = -EBUSY;
724 758
725 write_lock(&resource_lock);
726
727 if (!parent) 759 if (!parent)
728 goto skip; 760 goto skip;
729 761
@@ -751,6 +783,26 @@ skip:
751 result = 0; 783 result = 0;
752 784
753 out: 785 out:
786 return result;
787}
788
789/**
790 * adjust_resource - modify a resource's start and size
791 * @res: resource to modify
792 * @start: new start value
793 * @size: new size
794 *
795 * Given an existing resource, change its start and size to match the
796 * arguments. Returns 0 on success, -EBUSY if it can't fit.
797 * Existing children of the resource are assumed to be immutable.
798 */
799int adjust_resource(struct resource *res, resource_size_t start,
800 resource_size_t size)
801{
802 int result;
803
804 write_lock(&resource_lock);
805 result = __adjust_resource(res, start, size);
754 write_unlock(&resource_lock); 806 write_unlock(&resource_lock);
755 return result; 807 return result;
756} 808}
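Editor's note: adjust_resource() keeps its locking and kernel-doc, while the new __adjust_resource() core can be reused by callers that already hold resource_lock (the hot-remove path added below does exactly that). A small sketch of this wrapper-plus-__core pattern using a pthread rwlock; names are made up, link with -pthread:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static long table_value;

/* Lock-free core: caller must hold table_lock for writing. */
static int __set_value(long v)
{
	if (v < 0)
		return -1;        /* stand-in for an -EBUSY-style failure */
	table_value = v;
	return 0;
}

/* Public wrapper: takes the lock, then calls the core. */
static int set_value(long v)
{
	int ret;

	pthread_rwlock_wrlock(&table_lock);
	ret = __set_value(v);
	pthread_rwlock_unlock(&table_lock);
	return ret;
}

/* A caller needing several updates in one critical section takes the
 * lock once and uses the __ helper directly. */
static int set_two_values(long a, long b)
{
	int ret;

	pthread_rwlock_wrlock(&table_lock);
	ret = __set_value(a);
	if (!ret)
		ret = __set_value(b);
	pthread_rwlock_unlock(&table_lock);
	return ret;
}

int main(void)
{
	set_value(1);
	set_two_values(2, 3);
	printf("final value %ld\n", table_value);
	return 0;
}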
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
762{ 814{
763 struct resource *parent = root; 815 struct resource *parent = root;
764 struct resource *conflict; 816 struct resource *conflict;
765 struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); 817 struct resource *res = alloc_resource(GFP_ATOMIC);
766 struct resource *next_res = NULL; 818 struct resource *next_res = NULL;
767 819
768 if (!res) 820 if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
787 /* conflict covered whole area */ 839 /* conflict covered whole area */
788 if (conflict->start <= res->start && 840 if (conflict->start <= res->start &&
789 conflict->end >= res->end) { 841 conflict->end >= res->end) {
790 kfree(res); 842 free_resource(res);
791 WARN_ON(next_res); 843 WARN_ON(next_res);
792 break; 844 break;
793 } 845 }
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
797 end = res->end; 849 end = res->end;
798 res->end = conflict->start - 1; 850 res->end = conflict->start - 1;
799 if (conflict->end < end) { 851 if (conflict->end < end) {
800 next_res = kzalloc(sizeof(*next_res), 852 next_res = alloc_resource(GFP_ATOMIC);
801 GFP_ATOMIC);
802 if (!next_res) { 853 if (!next_res) {
803 kfree(res); 854 free_resource(res);
804 break; 855 break;
805 } 856 }
806 next_res->name = name; 857 next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
890 const char *name, int flags) 941 const char *name, int flags)
891{ 942{
892 DECLARE_WAITQUEUE(wait, current); 943 DECLARE_WAITQUEUE(wait, current);
893 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 944 struct resource *res = alloc_resource(GFP_KERNEL);
894 945
895 if (!res) 946 if (!res)
896 return NULL; 947 return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
924 continue; 975 continue;
925 } 976 }
926 /* Uhhuh, that didn't work out.. */ 977 /* Uhhuh, that didn't work out.. */
927 kfree(res); 978 free_resource(res);
928 res = NULL; 979 res = NULL;
929 break; 980 break;
930 } 981 }
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
958 return -EBUSY; 1009 return -EBUSY;
959 1010
960 release_resource(res); 1011 release_resource(res);
961 kfree(res); 1012 free_resource(res);
962 return 0; 1013 return 0;
963} 1014}
964EXPORT_SYMBOL(__check_region); 1015EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
998 write_unlock(&resource_lock); 1049 write_unlock(&resource_lock);
999 if (res->flags & IORESOURCE_MUXED) 1050 if (res->flags & IORESOURCE_MUXED)
1000 wake_up(&muxed_resource_wait); 1051 wake_up(&muxed_resource_wait);
1001 kfree(res); 1052 free_resource(res);
1002 return; 1053 return;
1003 } 1054 }
1004 p = &res->sibling; 1055 p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
1012} 1063}
1013EXPORT_SYMBOL(__release_region); 1064EXPORT_SYMBOL(__release_region);
1014 1065
1066#ifdef CONFIG_MEMORY_HOTREMOVE
1067/**
1068 * release_mem_region_adjustable - release a previously reserved memory region
1069 * @parent: parent resource descriptor
1070 * @start: resource start address
1071 * @size: resource region size
1072 *
1073 * This interface is intended for memory hot-delete. The requested region
1074 * is released from a currently busy memory resource. The requested region
1075 * must either match exactly or fit into a single busy resource entry. In
1076 * the latter case, the remaining resource is adjusted accordingly.
1077 * Existing children of the busy memory resource must be immutable in the
1078 * request.
1079 *
1080 * Note:
1081 * - Additional release conditions, such as overlapping region, can be
1082 * supported after they are confirmed as valid cases.
1083 * - When a busy memory resource gets split into two entries, the code
1084 * assumes that all children remain in the lower address entry for
1085 * simplicity. Enhance this logic when necessary.
1086 */
1087int release_mem_region_adjustable(struct resource *parent,
1088 resource_size_t start, resource_size_t size)
1089{
1090 struct resource **p;
1091 struct resource *res;
1092 struct resource *new_res;
1093 resource_size_t end;
1094 int ret = -EINVAL;
1095
1096 end = start + size - 1;
1097 if ((start < parent->start) || (end > parent->end))
1098 return ret;
1099
1100 /* The alloc_resource() result gets checked later */
1101 new_res = alloc_resource(GFP_KERNEL);
1102
1103 p = &parent->child;
1104 write_lock(&resource_lock);
1105
1106 while ((res = *p)) {
1107 if (res->start >= end)
1108 break;
1109
1110 /* look for the next resource if it does not fit into */
1111 if (res->start > start || res->end < end) {
1112 p = &res->sibling;
1113 continue;
1114 }
1115
1116 if (!(res->flags & IORESOURCE_MEM))
1117 break;
1118
1119 if (!(res->flags & IORESOURCE_BUSY)) {
1120 p = &res->child;
1121 continue;
1122 }
1123
1124 /* found the target resource; let's adjust accordingly */
1125 if (res->start == start && res->end == end) {
1126 /* free the whole entry */
1127 *p = res->sibling;
1128 free_resource(res);
1129 ret = 0;
1130 } else if (res->start == start && res->end != end) {
1131 /* adjust the start */
1132 ret = __adjust_resource(res, end + 1,
1133 res->end - end);
1134 } else if (res->start != start && res->end == end) {
1135 /* adjust the end */
1136 ret = __adjust_resource(res, res->start,
1137 start - res->start);
1138 } else {
1139 /* split into two entries */
1140 if (!new_res) {
1141 ret = -ENOMEM;
1142 break;
1143 }
1144 new_res->name = res->name;
1145 new_res->start = end + 1;
1146 new_res->end = res->end;
1147 new_res->flags = res->flags;
1148 new_res->parent = res->parent;
1149 new_res->sibling = res->sibling;
1150 new_res->child = NULL;
1151
1152 ret = __adjust_resource(res, res->start,
1153 start - res->start);
1154 if (ret)
1155 break;
1156 res->sibling = new_res;
1157 new_res = NULL;
1158 }
1159
1160 break;
1161 }
1162
1163 write_unlock(&resource_lock);
1164 free_resource(new_res);
1165 return ret;
1166}
1167#endif /* CONFIG_MEMORY_HOTREMOVE */
1168
1015/* 1169/*
1016 * Managed region resource 1170 * Managed region resource
1017 */ 1171 */
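Editor's note: release_mem_region_adjustable() above handles four shapes of request against a single busy entry: an exact match frees the whole entry, a request flush with the start or the end trims the entry via __adjust_resource(), and an interior request splits it into two entries (which is why a spare resource is allocated up front). A toy classification of those cases on inclusive ranges:

#include <stdio.h>

struct range {
	unsigned long start, end;   /* inclusive */
};

enum action { FREE_WHOLE, TRIM_FRONT, TRIM_BACK, SPLIT, NO_FIT };

static enum action classify(const struct range *res,
			     unsigned long start, unsigned long end,
			     struct range *lo, struct range *hi)
{
	if (start < res->start || end > res->end)
		return NO_FIT;
	if (start == res->start && end == res->end)
		return FREE_WHOLE;
	if (start == res->start) {            /* keep the tail */
		lo->start = end + 1; lo->end = res->end;
		return TRIM_FRONT;
	}
	if (end == res->end) {                /* keep the head */
		lo->start = res->start; lo->end = start - 1;
		return TRIM_BACK;
	}
	/* Interior hole: keep both remainders (needs a second entry). */
	lo->start = res->start; lo->end = start - 1;
	hi->start = end + 1;    hi->end = res->end;
	return SPLIT;
}

int main(void)
{
	struct range res = { 0x1000, 0x1fff }, lo, hi;

	printf("%d\n", classify(&res, 0x1000, 0x1fff, &lo, &hi)); /* FREE_WHOLE */
	printf("%d\n", classify(&res, 0x1000, 0x17ff, &lo, &hi)); /* TRIM_FRONT */
	printf("%d\n", classify(&res, 0x1800, 0x1fff, &lo, &hi)); /* TRIM_BACK  */
	printf("%d\n", classify(&res, 0x1400, 0x17ff, &lo, &hi)); /* SPLIT      */
	return 0;
}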
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16#include <linux/freezer.h> 16#include <linux/freezer.h>
17#include <linux/stat.h>
17 18
18#include "rtmutex.h" 19#include "rtmutex.h"
19 20
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
366 return curr - buf; 367 return curr - buf;
367} 368}
368 369
369static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); 370static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
370static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); 371static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
371 372
372static struct bus_type rttest_subsys = { 373static struct bus_type rttest_subsys = {
373 .name = "rttest", 374 .name = "rttest",
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
176 u64 this_clock, remote_clock; 176 u64 this_clock, remote_clock;
177 u64 *ptr, old_val, val; 177 u64 *ptr, old_val, val;
178 178
179#if BITS_PER_LONG != 64
180again:
181 /*
182 * Careful here: The local and the remote clock values need to
183 * be read out atomic as we need to compare the values and
184 * then update either the local or the remote side. So the
185 * cmpxchg64 below only protects one readout.
186 *
187 * We must reread via sched_clock_local() in the retry case on
188 * 32bit as an NMI could use sched_clock_local() via the
189 * tracer and hit between the readout of
190 * the low32bit and the high 32bit portion.
191 */
192 this_clock = sched_clock_local(my_scd);
193 /*
194 * We must enforce atomic readout on 32bit, otherwise the
195 * update on the remote cpu can hit inbetween the readout of
196 * the low32bit and the high 32bit portion.
197 */
198 remote_clock = cmpxchg64(&scd->clock, 0, 0);
199#else
200 /*
201 * On 64bit the read of [my]scd->clock is atomic versus the
202 * update, so we can avoid the above 32bit dance.
203 */
179 sched_clock_local(my_scd); 204 sched_clock_local(my_scd);
180again: 205again:
181 this_clock = my_scd->clock; 206 this_clock = my_scd->clock;
182 remote_clock = scd->clock; 207 remote_clock = scd->clock;
208#endif
183 209
184 /* 210 /*
185 * Use the opportunity that we have both locks 211 * Use the opportunity that we have both locks
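Editor's note: the 32-bit path added to sched_clock_remote() reads the remote clock with cmpxchg64(&scd->clock, 0, 0); a compare-and-swap returns the old value atomically, and storing 0 only happens when the clock already is 0, so the net effect is an atomic 64-bit load on machines where a plain load could tear between the two 32-bit halves. A userspace sketch of the same trick, assuming a compiler that provides __sync_val_compare_and_swap on 8-byte values:

#include <stdint.h>
#include <stdio.h>

static uint64_t clock_ns = 0x123456789abcdef0ULL;

/* Atomic 64-bit read: CAS with old == new == 0 returns the current
 * value in one atomic operation; if the value really is 0, swapping
 * in 0 is a no-op. */
static uint64_t atomic_read64(uint64_t *p)
{
	return __sync_val_compare_and_swap(p, 0, 0);
}

int main(void)
{
	printf("clock = %#llx\n",
	       (unsigned long long)atomic_read64(&clock_ns));
	return 0;
}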
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393c..58453b8272fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
512 * the target CPU. 512 * the target CPU.
513 */ 513 */
514#ifdef CONFIG_SMP 514#ifdef CONFIG_SMP
515
516#ifndef tsk_is_polling
517#define tsk_is_polling(t) 0
518#endif
519
520void resched_task(struct task_struct *p) 515void resched_task(struct task_struct *p)
521{ 516{
522 int cpu; 517 int cpu;
@@ -549,7 +544,7 @@ void resched_cpu(int cpu)
549 raw_spin_unlock_irqrestore(&rq->lock, flags); 544 raw_spin_unlock_irqrestore(&rq->lock, flags);
550} 545}
551 546
552#ifdef CONFIG_NO_HZ 547#ifdef CONFIG_NO_HZ_COMMON
553/* 548/*
554 * In the semi idle case, use the nearest busy cpu for migrating timers 549 * In the semi idle case, use the nearest busy cpu for migrating timers
555 * from an idle cpu. This is good for power-savings. 550 * from an idle cpu. This is good for power-savings.
@@ -587,7 +582,7 @@ unlock:
587 * account when the CPU goes back to idle and evaluates the timer 582 * account when the CPU goes back to idle and evaluates the timer
588 * wheel for the next timer event. 583 * wheel for the next timer event.
589 */ 584 */
590void wake_up_idle_cpu(int cpu) 585static void wake_up_idle_cpu(int cpu)
591{ 586{
592 struct rq *rq = cpu_rq(cpu); 587 struct rq *rq = cpu_rq(cpu);
593 588
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
617 smp_send_reschedule(cpu); 612 smp_send_reschedule(cpu);
618} 613}
619 614
615static bool wake_up_full_nohz_cpu(int cpu)
616{
617 if (tick_nohz_full_cpu(cpu)) {
618 if (cpu != smp_processor_id() ||
619 tick_nohz_tick_stopped())
620 smp_send_reschedule(cpu);
621 return true;
622 }
623
624 return false;
625}
626
627void wake_up_nohz_cpu(int cpu)
628{
629 if (!wake_up_full_nohz_cpu(cpu))
630 wake_up_idle_cpu(cpu);
631}
632
620static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
621{ 634{
622 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
623 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
624} 637}
625 638
626#else /* CONFIG_NO_HZ */ 639#else /* CONFIG_NO_HZ_COMMON */
627 640
628static inline bool got_nohz_idle_kick(void) 641static inline bool got_nohz_idle_kick(void)
629{ 642{
630 return false; 643 return false;
631} 644}
632 645
633#endif /* CONFIG_NO_HZ */ 646#endif /* CONFIG_NO_HZ_COMMON */
647
648#ifdef CONFIG_NO_HZ_FULL
649bool sched_can_stop_tick(void)
650{
651 struct rq *rq;
652
653 rq = this_rq();
654
655 /* Make sure rq->nr_running update is visible after the IPI */
656 smp_rmb();
657
658 /* More than one running task need preemption */
659 if (rq->nr_running > 1)
660 return false;
661
662 return true;
663}
664#endif /* CONFIG_NO_HZ_FULL */
634 665
635void sched_avg_update(struct rq *rq) 666void sched_avg_update(struct rq *rq)
636{ 667{
@@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1288static void 1319static void
1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1320ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1290{ 1321{
1291 trace_sched_wakeup(p, true);
1292 check_preempt_curr(rq, p, wake_flags); 1322 check_preempt_curr(rq, p, wake_flags);
1323 trace_sched_wakeup(p, true);
1293 1324
1294 p->state = TASK_RUNNING; 1325 p->state = TASK_RUNNING;
1295#ifdef CONFIG_SMP 1326#ifdef CONFIG_SMP
@@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void)
1362 1393
1363void scheduler_ipi(void) 1394void scheduler_ipi(void)
1364{ 1395{
1365 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
1397 && !tick_nohz_full_cpu(smp_processor_id()))
1366 return; 1398 return;
1367 1399
1368 /* 1400 /*
@@ -1379,6 +1411,7 @@ void scheduler_ipi(void)
1379 * somewhat pessimize the simple resched case. 1411 * somewhat pessimize the simple resched case.
1380 */ 1412 */
1381 irq_enter(); 1413 irq_enter();
1414 tick_nohz_full_check();
1382 sched_ttwu_pending(); 1415 sched_ttwu_pending();
1383 1416
1384 /* 1417 /*
@@ -1498,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
1498{ 1531{
1499 struct rq *rq = task_rq(p); 1532 struct rq *rq = task_rq(p);
1500 1533
1501 BUG_ON(rq != this_rq()); 1534 if (WARN_ON_ONCE(rq != this_rq()) ||
1502 BUG_ON(p == current); 1535 WARN_ON_ONCE(p == current))
1536 return;
1537
1503 lockdep_assert_held(&rq->lock); 1538 lockdep_assert_held(&rq->lock);
1504 1539
1505 if (!raw_spin_trylock(&p->pi_lock)) { 1540 if (!raw_spin_trylock(&p->pi_lock)) {
@@ -1858,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1858 kprobe_flush_task(prev); 1893 kprobe_flush_task(prev);
1859 put_task_struct(prev); 1894 put_task_struct(prev);
1860 } 1895 }
1896
1897 tick_nohz_task_switch(current);
1861} 1898}
1862 1899
1863#ifdef CONFIG_SMP 1900#ifdef CONFIG_SMP
@@ -2121,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2121 return load >> FSHIFT; 2158 return load >> FSHIFT;
2122} 2159}
2123 2160
2124#ifdef CONFIG_NO_HZ 2161#ifdef CONFIG_NO_HZ_COMMON
2125/* 2162/*
2126 * Handle NO_HZ for the global load-average. 2163 * Handle NO_HZ for the global load-average.
2127 * 2164 *
@@ -2347,12 +2384,12 @@ static void calc_global_nohz(void)
2347 smp_wmb(); 2384 smp_wmb();
2348 calc_load_idx++; 2385 calc_load_idx++;
2349} 2386}
2350#else /* !CONFIG_NO_HZ */ 2387#else /* !CONFIG_NO_HZ_COMMON */
2351 2388
2352static inline long calc_load_fold_idle(void) { return 0; } 2389static inline long calc_load_fold_idle(void) { return 0; }
2353static inline void calc_global_nohz(void) { } 2390static inline void calc_global_nohz(void) { }
2354 2391
2355#endif /* CONFIG_NO_HZ */ 2392#endif /* CONFIG_NO_HZ_COMMON */
2356 2393
2357/* 2394/*
2358 * calc_load - update the avenrun load estimates 10 ticks after the 2395 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2512,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2512 sched_avg_update(this_rq); 2549 sched_avg_update(this_rq);
2513} 2550}
2514 2551
2515#ifdef CONFIG_NO_HZ 2552#ifdef CONFIG_NO_HZ_COMMON
2516/* 2553/*
2517 * There is no sane way to deal with nohz on smp when using jiffies because the 2554 * There is no sane way to deal with nohz on smp when using jiffies because the
2518 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 2555 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2572,7 +2609,7 @@ void update_cpu_load_nohz(void)
2572 } 2609 }
2573 raw_spin_unlock(&this_rq->lock); 2610 raw_spin_unlock(&this_rq->lock);
2574} 2611}
2575#endif /* CONFIG_NO_HZ */ 2612#endif /* CONFIG_NO_HZ_COMMON */
2576 2613
2577/* 2614/*
2578 * Called from scheduler_tick() 2615 * Called from scheduler_tick()
@@ -2699,8 +2736,35 @@ void scheduler_tick(void)
2699 rq->idle_balance = idle_cpu(cpu); 2736 rq->idle_balance = idle_cpu(cpu);
2700 trigger_load_balance(rq, cpu); 2737 trigger_load_balance(rq, cpu);
2701#endif 2738#endif
2739 rq_last_tick_reset(rq);
2702} 2740}
2703 2741
2742#ifdef CONFIG_NO_HZ_FULL
2743/**
2744 * scheduler_tick_max_deferment
2745 *
2746 * Keep at least one tick per second when a single
2747 * active task is running because the scheduler doesn't
2748 * yet completely support full dynticks environment.
2749 *
2750 * This makes sure that uptime, CFS vruntime, load
2751 * balancing, etc... continue to move forward, even
2752 * with a very low granularity.
2753 */
2754u64 scheduler_tick_max_deferment(void)
2755{
2756 struct rq *rq = this_rq();
2757 unsigned long next, now = ACCESS_ONCE(jiffies);
2758
2759 next = rq->last_sched_tick + HZ;
2760
2761 if (time_before_eq(next, now))
2762 return 0;
2763
2764 return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
2765}
2766#endif
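Editor's note: scheduler_tick_max_deferment() bounds how long the tick may stay off: one second (HZ jiffies) past the last scheduler tick, converted to nanoseconds, or zero if that point has already passed. A small arithmetic sketch of the same calculation; HZ here is an illustrative value:

#include <stdio.h>

#define HZ            1000UL      /* illustrative tick rate */
#define NSEC_PER_USEC 1000ULL

/* Wraparound-safe "a is before or equal to b" for jiffies counters. */
static int time_before_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) <= 0;
}

/* Mirrors the deferment calculation: the tick may be deferred until
 * one second (HZ jiffies) past the last scheduler tick. */
static unsigned long long max_deferment_ns(unsigned long last_sched_tick,
					   unsigned long now)
{
	unsigned long next = last_sched_tick + HZ;

	if (time_before_eq(next, now))
		return 0;
	/* jiffies -> usecs -> nsecs, as the patch does */
	return (unsigned long long)(next - now) * (1000000UL / HZ) * NSEC_PER_USEC;
}

int main(void)
{
	printf("%llu ns\n", max_deferment_ns(1000, 1500)); /* half a second left */
	printf("%llu ns\n", max_deferment_ns(1000, 2500)); /* already overdue: 0 */
	return 0;
}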
2767
2704notrace unsigned long get_parent_ip(unsigned long addr) 2768notrace unsigned long get_parent_ip(unsigned long addr)
2705{ 2769{
2706 if (in_lock_functions(addr)) { 2770 if (in_lock_functions(addr)) {
@@ -2997,51 +3061,6 @@ void __sched schedule_preempt_disabled(void)
2997 preempt_disable(); 3061 preempt_disable();
2998} 3062}
2999 3063
3000#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3001
3002static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3003{
3004 if (lock->owner != owner)
3005 return false;
3006
3007 /*
3008 * Ensure we emit the owner->on_cpu, dereference _after_ checking
3009 * lock->owner still matches owner, if that fails, owner might
3010 * point to free()d memory, if it still matches, the rcu_read_lock()
3011 * ensures the memory stays valid.
3012 */
3013 barrier();
3014
3015 return owner->on_cpu;
3016}
3017
3018/*
3019 * Look out! "owner" is an entirely speculative pointer
3020 * access and not reliable.
3021 */
3022int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3023{
3024 if (!sched_feat(OWNER_SPIN))
3025 return 0;
3026
3027 rcu_read_lock();
3028 while (owner_running(lock, owner)) {
3029 if (need_resched())
3030 break;
3031
3032 arch_mutex_cpu_relax();
3033 }
3034 rcu_read_unlock();
3035
3036 /*
3037 * We break out the loop above on need_resched() and when the
3038 * owner changed, which is a sign for heavy contention. Return
3039 * success only when lock->owner is NULL.
3040 */
3041 return lock->owner == NULL;
3042}
3043#endif
3044
3045#ifdef CONFIG_PREEMPT 3064#ifdef CONFIG_PREEMPT
3046/* 3065/*
3047 * this is the entry point to schedule() from in-kernel preemption 3066 * this is the entry point to schedule() from in-kernel preemption
@@ -3082,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule);
3082asmlinkage void __sched preempt_schedule_irq(void) 3101asmlinkage void __sched preempt_schedule_irq(void)
3083{ 3102{
3084 struct thread_info *ti = current_thread_info(); 3103 struct thread_info *ti = current_thread_info();
3104 enum ctx_state prev_state;
3085 3105
3086 /* Catch callers which need to be fixed */ 3106 /* Catch callers which need to be fixed */
3087 BUG_ON(ti->preempt_count || !irqs_disabled()); 3107 BUG_ON(ti->preempt_count || !irqs_disabled());
3088 3108
3089 user_exit(); 3109 prev_state = exception_enter();
3110
3090 do { 3111 do {
3091 add_preempt_count(PREEMPT_ACTIVE); 3112 add_preempt_count(PREEMPT_ACTIVE);
3092 local_irq_enable(); 3113 local_irq_enable();
@@ -3100,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
3100 */ 3121 */
3101 barrier(); 3122 barrier();
3102 } while (need_resched()); 3123 } while (need_resched());
3124
3125 exception_exit(prev_state);
3103} 3126}
3104 3127
3105#endif /* CONFIG_PREEMPT */ 3128#endif /* CONFIG_PREEMPT */
@@ -4126,6 +4149,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4126 get_task_struct(p); 4149 get_task_struct(p);
4127 rcu_read_unlock(); 4150 rcu_read_unlock();
4128 4151
4152 if (p->flags & PF_NO_SETAFFINITY) {
4153 retval = -EINVAL;
4154 goto out_put_task;
4155 }
4129 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4156 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4130 retval = -ENOMEM; 4157 retval = -ENOMEM;
4131 goto out_put_task; 4158 goto out_put_task;
@@ -4626,6 +4653,7 @@ void sched_show_task(struct task_struct *p)
4626 task_pid_nr(p), ppid, 4653 task_pid_nr(p), ppid,
4627 (unsigned long)task_thread_info(p)->flags); 4654 (unsigned long)task_thread_info(p)->flags);
4628 4655
4656 print_worker_info(KERN_INFO, p);
4629 show_stack(p, NULL); 4657 show_stack(p, NULL);
4630} 4658}
4631 4659
@@ -4773,11 +4801,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4773 goto out; 4801 goto out;
4774 } 4802 }
4775 4803
4776 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4777 ret = -EINVAL;
4778 goto out;
4779 }
4780
4781 do_set_cpus_allowed(p, new_mask); 4804 do_set_cpus_allowed(p, new_mask);
4782 4805
4783 /* Can the task run on the task's current CPU? If so, we're done */ 4806 /* Can the task run on the task's current CPU? If so, we're done */
@@ -4999,7 +5022,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
4999} 5022}
5000 5023
5001static int min_load_idx = 0; 5024static int min_load_idx = 0;
5002static int max_load_idx = CPU_LOAD_IDX_MAX; 5025static int max_load_idx = CPU_LOAD_IDX_MAX-1;
5003 5026
5004static void 5027static void
5005set_table_entry(struct ctl_table *entry, 5028set_table_entry(struct ctl_table *entry,
@@ -6248,7 +6271,7 @@ static void sched_init_numa(void)
6248 * 'level' contains the number of unique distances, excluding the 6271 * 'level' contains the number of unique distances, excluding the
6249 * identity distance node_distance(i,i). 6272 * identity distance node_distance(i,i).
6250 * 6273 *
6251 * The sched_domains_nume_distance[] array includes the actual distance 6274 * The sched_domains_numa_distance[] array includes the actual distance
6252 * numbers. 6275 * numbers.
6253 */ 6276 */
6254 6277
@@ -6861,11 +6884,15 @@ int in_sched_functions(unsigned long addr)
6861} 6884}
6862 6885
6863#ifdef CONFIG_CGROUP_SCHED 6886#ifdef CONFIG_CGROUP_SCHED
6887/*
6888 * Default task group.
6889 * Every task in system belongs to this group at bootup.
6890 */
6864struct task_group root_task_group; 6891struct task_group root_task_group;
6865LIST_HEAD(task_groups); 6892LIST_HEAD(task_groups);
6866#endif 6893#endif
6867 6894
6868DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6895DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6869 6896
6870void __init sched_init(void) 6897void __init sched_init(void)
6871{ 6898{
@@ -6902,7 +6929,7 @@ void __init sched_init(void)
6902#endif /* CONFIG_RT_GROUP_SCHED */ 6929#endif /* CONFIG_RT_GROUP_SCHED */
6903#ifdef CONFIG_CPUMASK_OFFSTACK 6930#ifdef CONFIG_CPUMASK_OFFSTACK
6904 for_each_possible_cpu(i) { 6931 for_each_possible_cpu(i) {
6905 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6932 per_cpu(load_balance_mask, i) = (void *)ptr;
6906 ptr += cpumask_size(); 6933 ptr += cpumask_size();
6907 } 6934 }
6908#endif /* CONFIG_CPUMASK_OFFSTACK */ 6935#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6928,12 +6955,6 @@ void __init sched_init(void)
6928 6955
6929#endif /* CONFIG_CGROUP_SCHED */ 6956#endif /* CONFIG_CGROUP_SCHED */
6930 6957
6931#ifdef CONFIG_CGROUP_CPUACCT
6932 root_cpuacct.cpustat = &kernel_cpustat;
6933 root_cpuacct.cpuusage = alloc_percpu(u64);
6934 /* Too early, not expected to fail */
6935 BUG_ON(!root_cpuacct.cpuusage);
6936#endif
6937 for_each_possible_cpu(i) { 6958 for_each_possible_cpu(i) {
6938 struct rq *rq; 6959 struct rq *rq;
6939 6960
@@ -6997,9 +7018,12 @@ void __init sched_init(void)
6997 INIT_LIST_HEAD(&rq->cfs_tasks); 7018 INIT_LIST_HEAD(&rq->cfs_tasks);
6998 7019
6999 rq_attach_root(rq, &def_root_domain); 7020 rq_attach_root(rq, &def_root_domain);
7000#ifdef CONFIG_NO_HZ 7021#ifdef CONFIG_NO_HZ_COMMON
7001 rq->nohz_flags = 0; 7022 rq->nohz_flags = 0;
7002#endif 7023#endif
7024#ifdef CONFIG_NO_HZ_FULL
7025 rq->last_sched_tick = 0;
7026#endif
7003#endif 7027#endif
7004 init_rq_hrtick(rq); 7028 init_rq_hrtick(rq);
7005 atomic_set(&rq->nr_iowait, 0); 7029 atomic_set(&rq->nr_iowait, 0);
@@ -7455,7 +7479,7 @@ unlock:
7455 return err; 7479 return err;
7456} 7480}
7457 7481
7458int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7482static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7459{ 7483{
7460 u64 rt_runtime, rt_period; 7484 u64 rt_runtime, rt_period;
7461 7485
@@ -7467,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7467 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7491 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7468} 7492}
7469 7493
7470long sched_group_rt_runtime(struct task_group *tg) 7494static long sched_group_rt_runtime(struct task_group *tg)
7471{ 7495{
7472 u64 rt_runtime_us; 7496 u64 rt_runtime_us;
7473 7497
@@ -7479,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg)
7479 return rt_runtime_us; 7503 return rt_runtime_us;
7480} 7504}
7481 7505
7482int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7506static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7483{ 7507{
7484 u64 rt_runtime, rt_period; 7508 u64 rt_runtime, rt_period;
7485 7509
@@ -7492,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7492 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7516 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7493} 7517}
7494 7518
7495long sched_group_rt_period(struct task_group *tg) 7519static long sched_group_rt_period(struct task_group *tg)
7496{ 7520{
7497 u64 rt_period_us; 7521 u64 rt_period_us;
7498 7522
@@ -7527,7 +7551,7 @@ static int sched_rt_global_constraints(void)
7527 return ret; 7551 return ret;
7528} 7552}
7529 7553
7530int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7554static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7531{ 7555{
7532 /* Don't accept realtime tasks when there is no way for them to run */ 7556 /* Don't accept realtime tasks when there is no way for them to run */
7533 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7557 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -8035,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8035 8059
8036#endif /* CONFIG_CGROUP_SCHED */ 8060#endif /* CONFIG_CGROUP_SCHED */
8037 8061
8038#ifdef CONFIG_CGROUP_CPUACCT
8039
8040/*
8041 * CPU accounting code for task groups.
8042 *
8043 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8044 * (balbir@in.ibm.com).
8045 */
8046
8047struct cpuacct root_cpuacct;
8048
8049/* create a new cpu accounting group */
8050static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8051{
8052 struct cpuacct *ca;
8053
8054 if (!cgrp->parent)
8055 return &root_cpuacct.css;
8056
8057 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8058 if (!ca)
8059 goto out;
8060
8061 ca->cpuusage = alloc_percpu(u64);
8062 if (!ca->cpuusage)
8063 goto out_free_ca;
8064
8065 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8066 if (!ca->cpustat)
8067 goto out_free_cpuusage;
8068
8069 return &ca->css;
8070
8071out_free_cpuusage:
8072 free_percpu(ca->cpuusage);
8073out_free_ca:
8074 kfree(ca);
8075out:
8076 return ERR_PTR(-ENOMEM);
8077}
8078
8079/* destroy an existing cpu accounting group */
8080static void cpuacct_css_free(struct cgroup *cgrp)
8081{
8082 struct cpuacct *ca = cgroup_ca(cgrp);
8083
8084 free_percpu(ca->cpustat);
8085 free_percpu(ca->cpuusage);
8086 kfree(ca);
8087}
8088
8089static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8090{
8091 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8092 u64 data;
8093
8094#ifndef CONFIG_64BIT
8095 /*
8096 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8097 */
8098 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8099 data = *cpuusage;
8100 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8101#else
8102 data = *cpuusage;
8103#endif
8104
8105 return data;
8106}
8107
8108static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8109{
8110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8111
8112#ifndef CONFIG_64BIT
8113 /*
8114 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8115 */
8116 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8117 *cpuusage = val;
8118 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8119#else
8120 *cpuusage = val;
8121#endif
8122}
8123
8124/* return total cpu usage (in nanoseconds) of a group */
8125static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8126{
8127 struct cpuacct *ca = cgroup_ca(cgrp);
8128 u64 totalcpuusage = 0;
8129 int i;
8130
8131 for_each_present_cpu(i)
8132 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8133
8134 return totalcpuusage;
8135}
8136
8137static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8138 u64 reset)
8139{
8140 struct cpuacct *ca = cgroup_ca(cgrp);
8141 int err = 0;
8142 int i;
8143
8144 if (reset) {
8145 err = -EINVAL;
8146 goto out;
8147 }
8148
8149 for_each_present_cpu(i)
8150 cpuacct_cpuusage_write(ca, i, 0);
8151
8152out:
8153 return err;
8154}
8155
8156static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8157 struct seq_file *m)
8158{
8159 struct cpuacct *ca = cgroup_ca(cgroup);
8160 u64 percpu;
8161 int i;
8162
8163 for_each_present_cpu(i) {
8164 percpu = cpuacct_cpuusage_read(ca, i);
8165 seq_printf(m, "%llu ", (unsigned long long) percpu);
8166 }
8167 seq_printf(m, "\n");
8168 return 0;
8169}
8170
8171static const char *cpuacct_stat_desc[] = {
8172 [CPUACCT_STAT_USER] = "user",
8173 [CPUACCT_STAT_SYSTEM] = "system",
8174};
8175
8176static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8177 struct cgroup_map_cb *cb)
8178{
8179 struct cpuacct *ca = cgroup_ca(cgrp);
8180 int cpu;
8181 s64 val = 0;
8182
8183 for_each_online_cpu(cpu) {
8184 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8185 val += kcpustat->cpustat[CPUTIME_USER];
8186 val += kcpustat->cpustat[CPUTIME_NICE];
8187 }
8188 val = cputime64_to_clock_t(val);
8189 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8190
8191 val = 0;
8192 for_each_online_cpu(cpu) {
8193 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8194 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8195 val += kcpustat->cpustat[CPUTIME_IRQ];
8196 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8197 }
8198
8199 val = cputime64_to_clock_t(val);
8200 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8201
8202 return 0;
8203}
8204
8205static struct cftype files[] = {
8206 {
8207 .name = "usage",
8208 .read_u64 = cpuusage_read,
8209 .write_u64 = cpuusage_write,
8210 },
8211 {
8212 .name = "usage_percpu",
8213 .read_seq_string = cpuacct_percpu_seq_read,
8214 },
8215 {
8216 .name = "stat",
8217 .read_map = cpuacct_stats_show,
8218 },
8219 { } /* terminate */
8220};
8221
8222/*
8223 * charge this task's execution time to its accounting group.
8224 *
8225 * called with rq->lock held.
8226 */
8227void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8228{
8229 struct cpuacct *ca;
8230 int cpu;
8231
8232 if (unlikely(!cpuacct_subsys.active))
8233 return;
8234
8235 cpu = task_cpu(tsk);
8236
8237 rcu_read_lock();
8238
8239 ca = task_ca(tsk);
8240
8241 for (; ca; ca = parent_ca(ca)) {
8242 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8243 *cpuusage += cputime;
8244 }
8245
8246 rcu_read_unlock();
8247}
8248
8249struct cgroup_subsys cpuacct_subsys = {
8250 .name = "cpuacct",
8251 .css_alloc = cpuacct_css_alloc,
8252 .css_free = cpuacct_css_free,
8253 .subsys_id = cpuacct_subsys_id,
8254 .base_cftypes = files,
8255};
8256#endif /* CONFIG_CGROUP_CPUACCT */
8257
8258void dump_cpu_task(int cpu) 8062void dump_cpu_task(int cpu)
8259{ 8063{
8260 pr_info("Task dump for CPU %d:\n", cpu); 8064 pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
1#include <linux/cgroup.h>
2#include <linux/slab.h>
3#include <linux/percpu.h>
4#include <linux/spinlock.h>
5#include <linux/cpumask.h>
6#include <linux/seq_file.h>
7#include <linux/rcupdate.h>
8#include <linux/kernel_stat.h>
9#include <linux/err.h>
10
11#include "sched.h"
12
13/*
14 * CPU accounting code for task groups.
15 *
16 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
17 * (balbir@in.ibm.com).
18 */
19
20/* Time spent by the tasks of the cpu accounting group executing in ... */
21enum cpuacct_stat_index {
22 CPUACCT_STAT_USER, /* ... user mode */
23 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
24
25 CPUACCT_STAT_NSTATS,
26};
27
28/* track cpu usage of a group of tasks and its child groups */
29struct cpuacct {
30 struct cgroup_subsys_state css;
31 /* cpuusage holds pointer to a u64-type object on every cpu */
32 u64 __percpu *cpuusage;
33 struct kernel_cpustat __percpu *cpustat;
34};
35
36/* return cpu accounting group corresponding to this container */
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
40 struct cpuacct, css);
41}
42
43/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53}
54
55static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{
57 if (!ca->css.cgroup->parent)
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60}
61
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
63static struct cpuacct root_cpuacct = {
64 .cpustat = &kernel_cpustat,
65 .cpuusage = &root_cpuacct_cpuusage,
66};
67
68/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
70{
71 struct cpuacct *ca;
72
73 if (!cgrp->parent)
74 return &root_cpuacct.css;
75
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
77 if (!ca)
78 goto out;
79
80 ca->cpuusage = alloc_percpu(u64);
81 if (!ca->cpuusage)
82 goto out_free_ca;
83
84 ca->cpustat = alloc_percpu(struct kernel_cpustat);
85 if (!ca->cpustat)
86 goto out_free_cpuusage;
87
88 return &ca->css;
89
90out_free_cpuusage:
91 free_percpu(ca->cpuusage);
92out_free_ca:
93 kfree(ca);
94out:
95 return ERR_PTR(-ENOMEM);
96}
97
98/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp)
100{
101 struct cpuacct *ca = cgroup_ca(cgrp);
102
103 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage);
105 kfree(ca);
106}
107
108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
109{
110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
111 u64 data;
112
113#ifndef CONFIG_64BIT
114 /*
115 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
116 */
117 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
118 data = *cpuusage;
119 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
120#else
121 data = *cpuusage;
122#endif
123
124 return data;
125}
126
127static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
128{
129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
130
131#ifndef CONFIG_64BIT
132 /*
133 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
134 */
135 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
136 *cpuusage = val;
137 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
138#else
139 *cpuusage = val;
140#endif
141}
142
143/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
145{
146 struct cpuacct *ca = cgroup_ca(cgrp);
147 u64 totalcpuusage = 0;
148 int i;
149
150 for_each_present_cpu(i)
151 totalcpuusage += cpuacct_cpuusage_read(ca, i);
152
153 return totalcpuusage;
154}
155
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
157 u64 reset)
158{
159 struct cpuacct *ca = cgroup_ca(cgrp);
160 int err = 0;
161 int i;
162
163 if (reset) {
164 err = -EINVAL;
165 goto out;
166 }
167
168 for_each_present_cpu(i)
169 cpuacct_cpuusage_write(ca, i, 0);
170
171out:
172 return err;
173}
174
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
176 struct seq_file *m)
177{
178 struct cpuacct *ca = cgroup_ca(cgroup);
179 u64 percpu;
180 int i;
181
182 for_each_present_cpu(i) {
183 percpu = cpuacct_cpuusage_read(ca, i);
184 seq_printf(m, "%llu ", (unsigned long long) percpu);
185 }
186 seq_printf(m, "\n");
187 return 0;
188}
189
190static const char * const cpuacct_stat_desc[] = {
191 [CPUACCT_STAT_USER] = "user",
192 [CPUACCT_STAT_SYSTEM] = "system",
193};
194
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
196 struct cgroup_map_cb *cb)
197{
198 struct cpuacct *ca = cgroup_ca(cgrp);
199 int cpu;
200 s64 val = 0;
201
202 for_each_online_cpu(cpu) {
203 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
204 val += kcpustat->cpustat[CPUTIME_USER];
205 val += kcpustat->cpustat[CPUTIME_NICE];
206 }
207 val = cputime64_to_clock_t(val);
208 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
209
210 val = 0;
211 for_each_online_cpu(cpu) {
212 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
213 val += kcpustat->cpustat[CPUTIME_SYSTEM];
214 val += kcpustat->cpustat[CPUTIME_IRQ];
215 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
216 }
217
218 val = cputime64_to_clock_t(val);
219 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
220
221 return 0;
222}
223
224static struct cftype files[] = {
225 {
226 .name = "usage",
227 .read_u64 = cpuusage_read,
228 .write_u64 = cpuusage_write,
229 },
230 {
231 .name = "usage_percpu",
232 .read_seq_string = cpuacct_percpu_seq_read,
233 },
234 {
235 .name = "stat",
236 .read_map = cpuacct_stats_show,
237 },
238 { } /* terminate */
239};
240
241/*
242 * charge this task's execution time to its accounting group.
243 *
244 * called with rq->lock held.
245 */
246void cpuacct_charge(struct task_struct *tsk, u64 cputime)
247{
248 struct cpuacct *ca;
249 int cpu;
250
251 cpu = task_cpu(tsk);
252
253 rcu_read_lock();
254
255 ca = task_ca(tsk);
256
257 while (true) {
258 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
259 *cpuusage += cputime;
260
261 ca = parent_ca(ca);
262 if (!ca)
263 break;
264 }
265
266 rcu_read_unlock();
267}
268
269/*
270 * Add user/system time to cpuacct.
271 *
272 * Note: it's the caller that updates the account of the root cgroup.
273 */
274void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275{
276 struct kernel_cpustat *kcpustat;
277 struct cpuacct *ca;
278
279 rcu_read_lock();
280 ca = task_ca(p);
281 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca);
285 }
286 rcu_read_unlock();
287}
288
289struct cgroup_subsys cpuacct_subsys = {
290 .name = "cpuacct",
291 .css_alloc = cpuacct_css_alloc,
292 .css_free = cpuacct_css_free,
293 .subsys_id = cpuacct_subsys_id,
294 .base_cftypes = files,
295 .early_init = 1,
296};
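cpuacct_charge() above walks from the task's accounting group all the way up to the root, adding the run time at every level, while cpuacct_account_field() stops just below root_cpuacct because its caller has already updated the global kernel_cpustat. A self-contained model of those two walks, with a single counter per group in place of the per-cpu data and no RCU:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct group {
    const char *name;
    struct group *parent;    /* NULL for the root group */
    uint64_t usage;          /* stands in for the per-cpu cpuusage counter */
};

/* Charge every level, root included -- like cpuacct_charge(). */
static void charge(struct group *g, uint64_t delta)
{
    for (; g; g = g->parent)
        g->usage += delta;
}

/* Charge every level except the root -- like cpuacct_account_field(). */
static void charge_below_root(struct group *g, uint64_t delta)
{
    for (; g->parent; g = g->parent)
        g->usage += delta;
}

int main(void)
{
    struct group root = { "root", NULL,  0 };
    struct group a    = { "a",    &root, 0 };
    struct group ab   = { "a/b",  &a,    0 };

    charge(&ab, 1000);              /* a/b, a and root all gain 1000 */
    charge_below_root(&ab, 500);    /* only a/b and a gain the extra 500 */

    printf("root=%llu a=%llu a/b=%llu\n",
           (unsigned long long)root.usage,
           (unsigned long long)a.usage,
           (unsigned long long)ab.usage);
    return 0;
}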
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
1#ifdef CONFIG_CGROUP_CPUACCT
2
3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
4extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
5
6#else
7
8static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9{
10}
11
12static inline void
13cpuacct_account_field(struct task_struct *p, int index, u64 val)
14{
15}
16
17#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f4..cc2dc3eea8a3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
115static inline void task_group_account_field(struct task_struct *p, int index, 115static inline void task_group_account_field(struct task_struct *p, int index,
116 u64 tmp) 116 u64 tmp)
117{ 117{
118#ifdef CONFIG_CGROUP_CPUACCT
119 struct kernel_cpustat *kcpustat;
120 struct cpuacct *ca;
121#endif
122 /* 118 /*
123 * Since all updates are sure to touch the root cgroup, we 119 * Since all updates are sure to touch the root cgroup, we
124 * get ourselves ahead and touch it first. If the root cgroup 120 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
127 */ 123 */
128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
129 125
130#ifdef CONFIG_CGROUP_CPUACCT 126 cpuacct_account_field(p, index, tmp);
131 if (unlikely(!cpuacct_subsys.active))
132 return;
133
134 rcu_read_lock();
135 ca = task_ca(p);
136 while (ca && (ca != &root_cpuacct)) {
137 kcpustat = this_cpu_ptr(ca->cpustat);
138 kcpustat->cpustat[index] += tmp;
139 ca = parent_ca(ca);
140 }
141 rcu_read_unlock();
142#endif
143} 127}
144 128
145/* 129/*
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
310 294
311 t = tsk; 295 t = tsk;
312 do { 296 do {
313 task_cputime(tsk, &utime, &stime); 297 task_cputime(t, &utime, &stime);
314 times->utime += utime; 298 times->utime += utime;
315 times->stime += stime; 299 times->stime += stime;
316 times->sum_exec_runtime += task_sched_runtime(t); 300 times->sum_exec_runtime += task_sched_runtime(t);
@@ -388,7 +372,84 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
388 struct rq *rq) {} 372 struct rq *rq) {}
389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 373#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
390 374
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 375/*
376 * Use precise platform statistics if available:
377 */
378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
379
380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
381void vtime_task_switch(struct task_struct *prev)
382{
383 if (!vtime_accounting_enabled())
384 return;
385
386 if (is_idle_task(prev))
387 vtime_account_idle(prev);
388 else
389 vtime_account_system(prev);
390
391#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
392 vtime_account_user(prev);
393#endif
394 arch_vtime_task_switch(prev);
395}
396#endif
397
398/*
399 * Archs that account the whole time spent in the idle task
400 * (outside irq) as idle time can rely on this and just implement
401 * vtime_account_system() and vtime_account_idle(). Archs that
402 * have other meaning of the idle time (s390 only includes the
403 * time spent by the CPU when it's in low power mode) must override
404 * vtime_account().
405 */
406#ifndef __ARCH_HAS_VTIME_ACCOUNT
407void vtime_account_irq_enter(struct task_struct *tsk)
408{
409 if (!vtime_accounting_enabled())
410 return;
411
412 if (!in_interrupt()) {
413 /*
414 * If we interrupted user, context_tracking_in_user()
415 * is 1 because the context tracking don't hook
416 * on irq entry/exit. This way we know if
417 * we need to flush user time on kernel entry.
418 */
419 if (context_tracking_in_user()) {
420 vtime_account_user(tsk);
421 return;
422 }
423
424 if (is_idle_task(tsk)) {
425 vtime_account_idle(tsk);
426 return;
427 }
428 }
429 vtime_account_system(tsk);
430}
431EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
432#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434
435
436#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
437void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
438{
439 *ut = p->utime;
440 *st = p->stime;
441}
442
443void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444{
445 struct task_cputime cputime;
446
447 thread_group_cputime(p, &cputime);
448
449 *ut = cputime.utime;
450 *st = cputime.stime;
451}
452#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
392/* 453/*
393 * Account a single tick of cpu time. 454 * Account a single tick of cpu time.
394 * @p: the process that the cpu time gets accounted to 455 * @p: the process that the cpu time gets accounted to
@@ -443,96 +504,50 @@ void account_idle_ticks(unsigned long ticks)
443 504
444 account_idle_time(jiffies_to_cputime(ticks)); 505 account_idle_time(jiffies_to_cputime(ticks));
445} 506}
446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
447
448/*
449 * Use precise platform statistics if available:
450 */
451#ifdef CONFIG_VIRT_CPU_ACCOUNTING
452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
453{
454 *ut = p->utime;
455 *st = p->stime;
456}
457
458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
459{
460 struct task_cputime cputime;
461
462 thread_group_cputime(p, &cputime);
463
464 *ut = cputime.utime;
465 *st = cputime.stime;
466}
467
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev)
470{
471 if (!vtime_accounting_enabled())
472 return;
473
474 if (is_idle_task(prev))
475 vtime_account_idle(prev);
476 else
477 vtime_account_system(prev);
478
479#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
480 vtime_account_user(prev);
481#endif
482 arch_vtime_task_switch(prev);
483}
484#endif
485 507
486/* 508/*
487 * Archs that account the whole time spent in the idle task 509 * Perform (stime * rtime) / total, but avoid multiplication overflow by
488 * (outside irq) as idle time can rely on this and just implement 510 * losing precision when the numbers are big.
489 * vtime_account_system() and vtime_account_idle(). Archs that
490 * have other meaning of the idle time (s390 only includes the
491 * time spent by the CPU when it's in low power mode) must override
492 * vtime_account().
493 */ 511 */
494#ifndef __ARCH_HAS_VTIME_ACCOUNT 512static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
495void vtime_account_irq_enter(struct task_struct *tsk)
496{ 513{
497 if (!vtime_accounting_enabled()) 514 u64 scaled;
498 return;
499 515
500 if (!in_interrupt()) { 516 for (;;) {
501 /* 517 /* Make sure "rtime" is the bigger of stime/rtime */
502 * If we interrupted user, context_tracking_in_user() 518 if (stime > rtime) {
503 * is 1 because the context tracking don't hook 519 u64 tmp = rtime; rtime = stime; stime = tmp;
504 * on irq entry/exit. This way we know if
505 * we need to flush user time on kernel entry.
506 */
507 if (context_tracking_in_user()) {
508 vtime_account_user(tsk);
509 return;
510 } 520 }
511 521
512 if (is_idle_task(tsk)) { 522 /* Make sure 'total' fits in 32 bits */
513 vtime_account_idle(tsk); 523 if (total >> 32)
514 return; 524 goto drop_precision;
515 }
516 }
517 vtime_account_system(tsk);
518}
519EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
520#endif /* __ARCH_HAS_VTIME_ACCOUNT */
521 525
522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 526 /* Does rtime (and thus stime) fit in 32 bits? */
527 if (!(rtime >> 32))
528 break;
523 529
524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 530 /* Can we just balance rtime/stime rather than dropping bits? */
525{ 531 if (stime >> 31)
526 u64 temp = (__force u64) rtime; 532 goto drop_precision;
527 533
528 temp *= (__force u64) stime; 534 /* We can grow stime and shrink rtime and try to make them both fit */
535 stime <<= 1;
536 rtime >>= 1;
537 continue;
529 538
530 if (sizeof(cputime_t) == 4) 539drop_precision:
531 temp = div_u64(temp, (__force u32) total); 540 /* We drop from rtime, it has more bits than stime */
532 else 541 rtime >>= 1;
533 temp = div64_u64(temp, (__force u64) total); 542 total >>= 1;
543 }
534 544
535 return (__force cputime_t) temp; 545 /*
546 * Make sure gcc understands that this is a 32x32->64 multiply,
547 * followed by a 64/32->64 divide.
548 */
549 scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
550 return (__force cputime_t) scaled;
536} 551}
537 552
538/* 553/*
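The rewritten scale_stime() above computes stime * rtime / total without 128-bit math: it keeps shifting bits out of the larger operands until the multiply fits in a 32x32->64 and the divisor fits in 32 bits, dropping precision from rtime/total only when it has to. The same loop as a standalone program, with uint64_t in place of cputime_t and plain division in place of div_u64(); the __int128 reference check is a GCC/Clang extension:

#include <stdint.h>
#include <stdio.h>

static uint64_t scale_stime(uint64_t stime, uint64_t rtime, uint64_t total)
{
    for (;;) {
        /* Make sure "rtime" is the bigger of stime/rtime */
        if (stime > rtime) {
            uint64_t tmp = rtime; rtime = stime; stime = tmp;
        }

        /* Make sure 'total' fits in 32 bits */
        if (total >> 32)
            goto drop_precision;

        /* Does rtime (and thus stime) fit in 32 bits? */
        if (!(rtime >> 32))
            break;

        /* Can we just balance rtime/stime rather than dropping bits? */
        if (stime >> 31)
            goto drop_precision;

        /* Grow stime and shrink rtime, trying to make both fit */
        stime <<= 1;
        rtime >>= 1;
        continue;

drop_precision:
        /* Drop from rtime; it has more bits to spare than stime */
        rtime >>= 1;
        total >>= 1;
    }

    /* 32x32->64 multiply followed by a 64/32->64 divide */
    return (uint64_t)(uint32_t)stime * (uint32_t)rtime / (uint32_t)total;
}

int main(void)
{
    uint64_t s = 123456789ULL, r = 987654321012ULL, t = 1234567890123ULL;
    unsigned __int128 exact = (unsigned __int128)s * r / t;

    printf("approx=%llu exact=%llu\n",
           (unsigned long long)scale_stime(s, r, t),
           (unsigned long long)exact);
    return 0;
}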
@@ -543,7 +558,13 @@ static void cputime_adjust(struct task_cputime *curr,
543 struct cputime *prev, 558 struct cputime *prev,
544 cputime_t *ut, cputime_t *st) 559 cputime_t *ut, cputime_t *st)
545{ 560{
546 cputime_t rtime, stime, total; 561 cputime_t rtime, stime, utime, total;
562
563 if (vtime_accounting_enabled()) {
564 *ut = curr->utime;
565 *st = curr->stime;
566 return;
567 }
547 568
548 stime = curr->stime; 569 stime = curr->stime;
549 total = stime + curr->utime; 570 total = stime + curr->utime;
@@ -560,10 +581,22 @@ static void cputime_adjust(struct task_cputime *curr,
560 */ 581 */
561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 582 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
562 583
563 if (total) 584 /*
564 stime = scale_stime(stime, rtime, total); 585 * Update userspace visible utime/stime values only if actual execution
565 else 586 * time is bigger than already exported. Note that can happen, that we
587 * provided bigger values due to scaling inaccuracy on big numbers.
588 */
589 if (prev->stime + prev->utime >= rtime)
590 goto out;
591
592 if (total) {
593 stime = scale_stime((__force u64)stime,
594 (__force u64)rtime, (__force u64)total);
595 utime = rtime - stime;
596 } else {
566 stime = rtime; 597 stime = rtime;
598 utime = 0;
599 }
567 600
568 /* 601 /*
569 * If the tick based count grows faster than the scheduler one, 602 * If the tick based count grows faster than the scheduler one,
@@ -571,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr,
571 * Let's enforce monotonicity. 604 * Let's enforce monotonicity.
572 */ 605 */
573 prev->stime = max(prev->stime, stime); 606 prev->stime = max(prev->stime, stime);
574 prev->utime = max(prev->utime, rtime - prev->stime); 607 prev->utime = max(prev->utime, utime);
575 608
609out:
576 *ut = prev->utime; 610 *ut = prev->utime;
577 *st = prev->stime; 611 *st = prev->stime;
578} 612}
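cputime_adjust() now derives utime from the scaled stime (utime = rtime - stime), bails out early when the previously exported pair already covers rtime, and clamps each field with max() so neither can move backwards between samples. A compact model of that flow, using plain 64-bit math and a simplified in-line scaling step:

#include <stdint.h>
#include <stdio.h>

struct prev_cputime { uint64_t utime, stime; };

static void cputime_adjust_sketch(uint64_t tick_utime, uint64_t tick_stime,
                                  uint64_t rtime, struct prev_cputime *prev,
                                  uint64_t *ut, uint64_t *st)
{
    uint64_t total = tick_utime + tick_stime;
    uint64_t stime, utime;

    /* Already exported at least rtime worth of time: keep the old values. */
    if (prev->stime + prev->utime >= rtime)
        goto out;

    if (total) {
        stime = tick_stime * rtime / total;    /* scale_stime(), simplified */
        utime = rtime - stime;
    } else {
        stime = rtime;
        utime = 0;
    }

    /* Enforce monotonicity per field. */
    if (stime > prev->stime)
        prev->stime = stime;
    if (utime > prev->utime)
        prev->utime = utime;
out:
    *ut = prev->utime;
    *st = prev->stime;
}

int main(void)
{
    struct prev_cputime prev = { 0, 0 };
    uint64_t ut, st;

    cputime_adjust_sketch(70, 30, 120, &prev, &ut, &st);
    printf("sample 1: utime=%llu stime=%llu\n",
           (unsigned long long)ut, (unsigned long long)st);

    /* A later reading with smaller apparent runtime must not regress. */
    cputime_adjust_sketch(70, 30, 110, &prev, &ut, &st);
    printf("sample 2: utime=%llu stime=%llu\n",
           (unsigned long long)ut, (unsigned long long)st);
    return 0;
}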
@@ -597,7 +631,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
597 thread_group_cputime(p, &cputime); 631 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 632 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599} 633}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 634#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
601 635
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 636#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk) 637static unsigned long long vtime_delta(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
431 * Scheduling class tree data structure manipulation methods: 431 * Scheduling class tree data structure manipulation methods:
432 */ 432 */
433 433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435{ 435{
436 s64 delta = (s64)(vruntime - min_vruntime); 436 s64 delta = (s64)(vruntime - max_vruntime);
437 if (delta > 0) 437 if (delta > 0)
438 min_vruntime = vruntime; 438 max_vruntime = vruntime;
439 439
440 return min_vruntime; 440 return max_vruntime;
441} 441}
442 442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
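The max_vruntime() change above only renames the confusingly reused parameter; what makes both helpers work is the signed-difference comparison, which stays correct even after the unsigned vruntime counters wrap around. A standalone check of that property:

#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

static uint64_t max_vruntime(uint64_t max_vruntime, uint64_t vruntime)
{
    int64_t delta = (int64_t)(vruntime - max_vruntime);    /* wrap-safe compare */

    if (delta > 0)
        max_vruntime = vruntime;
    return max_vruntime;
}

int main(void)
{
    /* "b" is logically ahead of "a" even though it wrapped past zero. */
    uint64_t a = UINT64_MAX - 5, b = 10;

    printf("max_vruntime = %" PRIu64 " (a plain 'a > b' would pick %" PRIu64 ")\n",
           max_vruntime(a, b), a);
    return 0;
}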
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
473 vruntime = min_vruntime(vruntime, se->vruntime); 473 vruntime = min_vruntime(vruntime, se->vruntime);
474 } 474 }
475 475
476 /* ensure we never gain time by being placed backwards. */
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT 478#ifndef CONFIG_64BIT
478 smp_wmb(); 479 smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652} 653}
653 654
654/* 655/*
655 * We calculate the vruntime slice of a to be inserted task 656 * We calculate the vruntime slice of a to-be-inserted task.
656 * 657 *
657 * vs = s/w 658 * vs = s/w
658 */ 659 */
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1563 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1564 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564} 1565}
1566
1567/*
1568 * Update the rq's load with the elapsed running time before entering
1569 * idle. if the last scheduled task is not a CFS task, idle_enter will
1570 * be the only way to update the runnable statistic.
1571 */
1572void idle_enter_fair(struct rq *this_rq)
1573{
1574 update_rq_runnable_avg(this_rq, 1);
1575}
1576
1577/*
1578 * Update the rq's load with the elapsed idle time before a task is
1579 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1580 * be the only way to update the runnable statistic.
1581 */
1582void idle_exit_fair(struct rq *this_rq)
1583{
1584 update_rq_runnable_avg(this_rq, 0);
1585}
1586
1565#else 1587#else
1566static inline void update_entity_load_avg(struct sched_entity *se, 1588static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {} 1589 int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3874 int tsk_cache_hot = 0; 3896 int tsk_cache_hot = 0;
3875 /* 3897 /*
3876 * We do not migrate tasks that are: 3898 * We do not migrate tasks that are:
3877 * 1) running (obviously), or 3899 * 1) throttled_lb_pair, or
3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3900 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 * 3) are cache-hot on their current CPU. 3901 * 3) running (obviously), or
3902 * 4) are cache-hot on their current CPU.
3880 */ 3903 */
3904 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 return 0;
3906
3881 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3907 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 int new_dst_cpu; 3908 int cpu;
3883 3909
3884 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3910 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 3911
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3920 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 return 0; 3921 return 0;
3896 3922
3897 new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3923 /* Prevent to re-select dst_cpu via env's cpus */
3898 tsk_cpus_allowed(p)); 3924 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3899 if (new_dst_cpu < nr_cpu_ids) { 3925 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3900 env->flags |= LBF_SOME_PINNED; 3926 env->flags |= LBF_SOME_PINNED;
3901 env->new_dst_cpu = new_dst_cpu; 3927 env->new_dst_cpu = cpu;
3928 break;
3929 }
3902 } 3930 }
3931
3903 return 0; 3932 return 0;
3904 } 3933 }
3905 3934
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 if (!tsk_cache_hot || 3950 if (!tsk_cache_hot ||
3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923#ifdef CONFIG_SCHEDSTATS 3952
3924 if (tsk_cache_hot) { 3953 if (tsk_cache_hot) {
3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3954 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3955 schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 } 3956 }
3928#endif 3957
3929 return 1; 3958 return 1;
3930 } 3959 }
3931 3960
3932 if (tsk_cache_hot) { 3961 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3933 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3962 return 0;
3934 return 0;
3935 }
3936 return 1;
3937} 3963}
3938 3964
3939/* 3965/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
3948 struct task_struct *p, *n; 3974 struct task_struct *p, *n;
3949 3975
3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3976 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 continue;
3953
3954 if (!can_migrate_task(p, env)) 3977 if (!can_migrate_task(p, env))
3955 continue; 3978 continue;
3956 3979
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
4002 break; 4025 break;
4003 } 4026 }
4004 4027
4005 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4028 if (!can_migrate_task(p, env))
4006 goto next; 4029 goto next;
4007 4030
4008 load = task_h_load(p); 4031 load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
4013 if ((load / 2) > env->imbalance) 4036 if ((load / 2) > env->imbalance)
4014 goto next; 4037 goto next;
4015 4038
4016 if (!can_migrate_task(p, env))
4017 goto next;
4018
4019 move_task(p, env); 4039 move_task(p, env);
4020 pulled++; 4040 pulled++;
4021 env->imbalance -= load; 4041 env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
4245 return load_idx; 4265 return load_idx;
4246} 4266}
4247 4267
4248unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4268static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249{ 4269{
4250 return SCHED_POWER_SCALE; 4270 return SCHED_POWER_SCALE;
4251} 4271}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4255 return default_scale_freq_power(sd, cpu); 4275 return default_scale_freq_power(sd, cpu);
4256} 4276}
4257 4277
4258unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4278static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259{ 4279{
4260 unsigned long weight = sd->span_weight; 4280 unsigned long weight = sd->span_weight;
4261 unsigned long smt_gain = sd->smt_gain; 4281 unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4270 return default_scale_smt_power(sd, cpu); 4290 return default_scale_smt_power(sd, cpu);
4271} 4291}
4272 4292
4273unsigned long scale_rt_power(int cpu) 4293static unsigned long scale_rt_power(int cpu)
4274{ 4294{
4275 struct rq *rq = cpu_rq(cpu); 4295 struct rq *rq = cpu_rq(cpu);
4276 u64 total, available, age_stamp, avg; 4296 u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4960#define MAX_PINNED_INTERVAL 512 4980#define MAX_PINNED_INTERVAL 512
4961 4981
4962/* Working cpumask for load_balance and load_balance_newidle. */ 4982/* Working cpumask for load_balance and load_balance_newidle. */
4963DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4983DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4964 4984
4965static int need_active_balance(struct lb_env *env) 4985static int need_active_balance(struct lb_env *env)
4966{ 4986{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4991 int *balance) 5011 int *balance)
4992{ 5012{
4993 int ld_moved, cur_ld_moved, active_balance = 0; 5013 int ld_moved, cur_ld_moved, active_balance = 0;
4994 int lb_iterations, max_lb_iterations;
4995 struct sched_group *group; 5014 struct sched_group *group;
4996 struct rq *busiest; 5015 struct rq *busiest;
4997 unsigned long flags; 5016 unsigned long flags;
4998 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 5017 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
4999 5018
5000 struct lb_env env = { 5019 struct lb_env env = {
5001 .sd = sd, 5020 .sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5007 .cpus = cpus, 5026 .cpus = cpus,
5008 }; 5027 };
5009 5028
5029 /*
5030 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 * other cpus in our group
5032 */
5033 if (idle == CPU_NEWLY_IDLE)
5034 env.dst_grpmask = NULL;
5035
5010 cpumask_copy(cpus, cpu_active_mask); 5036 cpumask_copy(cpus, cpu_active_mask);
5011 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 5037
5013 schedstat_inc(sd, lb_count[idle]); 5038 schedstat_inc(sd, lb_count[idle]);
5014 5039
@@ -5034,7 +5059,6 @@ redo:
5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5059 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 5060
5036 ld_moved = 0; 5061 ld_moved = 0;
5037 lb_iterations = 1;
5038 if (busiest->nr_running > 1) { 5062 if (busiest->nr_running > 1) {
5039 /* 5063 /*
5040 * Attempt to move tasks. If find_busiest_group has found 5064 * Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
5061 double_rq_unlock(env.dst_rq, busiest); 5085 double_rq_unlock(env.dst_rq, busiest);
5062 local_irq_restore(flags); 5086 local_irq_restore(flags);
5063 5087
5064 if (env.flags & LBF_NEED_BREAK) {
5065 env.flags &= ~LBF_NEED_BREAK;
5066 goto more_balance;
5067 }
5068
5069 /* 5088 /*
5070 * some other cpu did the load balance for us. 5089 * some other cpu did the load balance for us.
5071 */ 5090 */
5072 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5091 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 resched_cpu(env.dst_cpu); 5092 resched_cpu(env.dst_cpu);
5074 5093
5094 if (env.flags & LBF_NEED_BREAK) {
5095 env.flags &= ~LBF_NEED_BREAK;
5096 goto more_balance;
5097 }
5098
5075 /* 5099 /*
5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to 5100 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 * us and move them to an alternate dst_cpu in our sched_group 5101 * us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
5091 * moreover subsequent load balance cycles should correct the 5115 * moreover subsequent load balance cycles should correct the
5092 * excess load moved. 5116 * excess load moved.
5093 */ 5117 */
5094 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5118 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5095 lb_iterations++ < max_lb_iterations) {
5096 5119
5097 env.dst_rq = cpu_rq(env.new_dst_cpu); 5120 env.dst_rq = cpu_rq(env.new_dst_cpu);
5098 env.dst_cpu = env.new_dst_cpu; 5121 env.dst_cpu = env.new_dst_cpu;
5099 env.flags &= ~LBF_SOME_PINNED; 5122 env.flags &= ~LBF_SOME_PINNED;
5100 env.loop = 0; 5123 env.loop = 0;
5101 env.loop_break = sched_nr_migrate_break; 5124 env.loop_break = sched_nr_migrate_break;
5125
5126 /* Prevent to re-select dst_cpu via env's cpus */
5127 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128
5102 /* 5129 /*
5103 * Go back to "more_balance" rather than "redo" since we 5130 * Go back to "more_balance" rather than "redo" since we
5104 * need to continue with same src_cpu. 5131 * need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5246 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 return; 5247 return;
5221 5248
5222 update_rq_runnable_avg(this_rq, 1);
5223
5224 /* 5249 /*
5225 * Drop the rq->lock, but keep IRQ/preempt disabled. 5250 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 */ 5251 */
@@ -5330,7 +5355,7 @@ out_unlock:
5330 return 0; 5355 return 0;
5331} 5356}
5332 5357
5333#ifdef CONFIG_NO_HZ 5358#ifdef CONFIG_NO_HZ_COMMON
5334/* 5359/*
5335 * idle load balancing details 5360 * idle load balancing details
5336 * - When one of the busy CPUs notice that there may be an idle rebalancing 5361 * - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
5395 struct sched_domain *sd; 5420 struct sched_domain *sd;
5396 int cpu = smp_processor_id(); 5421 int cpu = smp_processor_id();
5397 5422
5398 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 return;
5400 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401
5402 rcu_read_lock(); 5423 rcu_read_lock();
5403 for_each_domain(cpu, sd) 5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425
5426 if (!sd || !sd->nohz_idle)
5427 goto unlock;
5428 sd->nohz_idle = 0;
5429
5430 for (; sd; sd = sd->parent)
5404 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5431 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432unlock:
5405 rcu_read_unlock(); 5433 rcu_read_unlock();
5406} 5434}
5407 5435
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
5410 struct sched_domain *sd; 5438 struct sched_domain *sd;
5411 int cpu = smp_processor_id(); 5439 int cpu = smp_processor_id();
5412 5440
5413 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 return;
5415 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416
5417 rcu_read_lock(); 5441 rcu_read_lock();
5418 for_each_domain(cpu, sd) 5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443
5444 if (!sd || sd->nohz_idle)
5445 goto unlock;
5446 sd->nohz_idle = 1;
5447
5448 for (; sd; sd = sd->parent)
5419 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5449 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450unlock:
5420 rcu_read_unlock(); 5451 rcu_read_unlock();
5421} 5452}
5422 5453
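set_cpu_sd_state_busy()/set_cpu_sd_state_idle() now keep the idle flag on the bottom sched_domain (sd->nohz_idle) instead of a per-cpu NOHZ_IDLE bit, and only walk the parent domains to adjust nr_busy_cpus when that flag actually flips, presumably so the flag lives and dies with the domain data it guards. The toggle pattern, as a simplified single-threaded sketch:

#include <stdbool.h>
#include <stddef.h>

struct domain {
    struct domain *parent;
    bool nohz_idle;
    int nr_busy_cpus;    /* stands in for sd->groups->sgp->nr_busy_cpus */
};

static void cpu_goes_busy(struct domain *sd)
{
    if (!sd || !sd->nohz_idle)
        return;                 /* already counted as busy */
    sd->nohz_idle = false;

    for (; sd; sd = sd->parent)
        sd->nr_busy_cpus++;
}

static void cpu_goes_idle(struct domain *sd)
{
    if (!sd || sd->nohz_idle)
        return;                 /* already counted as idle */
    sd->nohz_idle = true;

    for (; sd; sd = sd->parent)
        sd->nr_busy_cpus--;
}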
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
5468 * It checks each scheduling domain to see if it is due to be balanced, 5499 * It checks each scheduling domain to see if it is due to be balanced,
5469 * and initiates a balancing operation if so. 5500 * and initiates a balancing operation if so.
5470 * 5501 *
5471 * Balancing parameters are set up in arch_init_sched_domains. 5502 * Balancing parameters are set up in init_sched_domains.
5472 */ 5503 */
5473static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5504static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474{ 5505{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5537 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5538 if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 /* 5539 /*
5509 * We've pulled tasks over so either we're no 5540 * The LBF_SOME_PINNED logic could have changed
5510 * longer idle. 5541 * env->dst_cpu, so we can't know our idle
5542 * state even if we migrated tasks. Update it.
5511 */ 5543 */
5512 idle = CPU_NOT_IDLE; 5544 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5513 } 5545 }
5514 sd->last_balance = jiffies; 5546 sd->last_balance = jiffies;
5515 } 5547 }
@@ -5540,9 +5572,9 @@ out:
5540 rq->next_balance = next_balance; 5572 rq->next_balance = next_balance;
5541} 5573}
5542 5574
5543#ifdef CONFIG_NO_HZ 5575#ifdef CONFIG_NO_HZ_COMMON
5544/* 5576/*
5545 * In CONFIG_NO_HZ case, the idle balance kickee will do the 5577 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
5546 * rebalancing for all the cpus for whom scheduler ticks are stopped. 5578 * rebalancing for all the cpus for whom scheduler ticks are stopped.
5547 */ 5579 */
5548static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 5580static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5685,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
5685 if (time_after_eq(jiffies, rq->next_balance) && 5717 if (time_after_eq(jiffies, rq->next_balance) &&
5686 likely(!on_null_domain(cpu))) 5718 likely(!on_null_domain(cpu)))
5687 raise_softirq(SCHED_SOFTIRQ); 5719 raise_softirq(SCHED_SOFTIRQ);
5688#ifdef CONFIG_NO_HZ 5720#ifdef CONFIG_NO_HZ_COMMON
5689 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5721 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5690 nohz_balancer_kick(cpu); 5722 nohz_balancer_kick(cpu);
5691#endif 5723#endif
@@ -6155,7 +6187,7 @@ __init void init_sched_fair_class(void)
6155#ifdef CONFIG_SMP 6187#ifdef CONFIG_SMP
6156 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 6188 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6157 6189
6158#ifdef CONFIG_NO_HZ 6190#ifdef CONFIG_NO_HZ_COMMON
6159 nohz.next_balance = jiffies; 6191 nohz.next_balance = jiffies;
6160 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 6192 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6161 cpu_notifier(sched_ilb_notifier, 0); 6193 cpu_notifier(sched_ilb_notifier, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
46SCHED_FEAT(LB_BIAS, true) 46SCHED_FEAT(LB_BIAS, true)
47 47
48/* 48/*
49 * Spin-wait on mutex acquisition when the mutex owner is running on
50 * another cpu -- assumes that when the owner is running, it will soon
51 * release the lock. Decreases scheduling overhead.
52 */
53SCHED_FEAT(OWNER_SPIN, true)
54
55/*
56 * Decrement CPU power based on time not spent running tasks 49 * Decrement CPU power based on time not spent running tasks
57 */ 50 */
58SCHED_FEAT(NONTASK_POWER, true) 51SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..d8da01008d39 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
16#endif /* CONFIG_SMP */ 27#endif /* CONFIG_SMP */
17/* 28/*
18 * Idle tasks are unconditionally rescheduled: 29 * Idle tasks are unconditionally rescheduled:
@@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 36static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 37{
27 schedstat_inc(rq, sched_goidle); 38 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */
41 rq->post_schedule = 1;
42#endif
28 return rq->idle; 43 return rq->idle;
29} 44}
30 45
@@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = {
86 101
87#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
89#endif 106#endif
90 107
91 .set_curr_task = set_curr_task_idle, 108 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469f..ce39224d6155 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,8 +5,10 @@
5#include <linux/mutex.h> 5#include <linux/mutex.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h>
8 9
9#include "cpupri.h" 10#include "cpupri.h"
11#include "cpuacct.h"
10 12
11extern __read_mostly int scheduler_running; 13extern __read_mostly int scheduler_running;
12 14
@@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running;
33 */ 35 */
34#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 36#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
35 37
38/*
39 * Increase resolution of nice-level calculations for 64-bit architectures.
40 * The extra resolution improves shares distribution and load balancing of
41 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
42 * hierarchies, especially on larger systems. This is not a user-visible change
43 * and does not change the user-interface for setting shares/weights.
44 *
45 * We increase resolution only if we have enough bits to allow this increased
46 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
47 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
48 * increased costs.
49 */
50#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
51# define SCHED_LOAD_RESOLUTION 10
52# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
53# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
54#else
55# define SCHED_LOAD_RESOLUTION 0
56# define scale_load(w) (w)
57# define scale_load_down(w) (w)
58#endif
59
60#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
61#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
62
36#define NICE_0_LOAD SCHED_LOAD_SCALE 63#define NICE_0_LOAD SCHED_LOAD_SCALE
37#define NICE_0_SHIFT SCHED_LOAD_SHIFT 64#define NICE_0_SHIFT SCHED_LOAD_SHIFT
38 65
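The comment above explains the disabled 10-bit load-resolution bump; with SCHED_LOAD_RESOLUTION forced to 0, scale_load()/scale_load_down() are identities and NICE_0_LOAD stays at 1 << 10. A tiny illustration of what the bump would change if the #if 0 were lifted, with the values computed from the macros in this hunk:

#include <stdio.h>

int main(void)
{
    long nice0 = 1024;    /* user-visible NICE_0 weight */

    /* resolution 0 (current state): internal weight == user weight */
    printf("resolution 0:  internal=%ld scale=%ld\n",
           nice0 << 0, 1L << (10 + 0));      /* 1024, 1024 */

    /* resolution 10 (the disabled branch): 10 extra fractional bits */
    printf("resolution 10: internal=%ld scale=%ld\n",
           nice0 << 10, 1L << (10 + 10));    /* 1048576, 1048576 */
    return 0;
}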
@@ -154,11 +181,6 @@ struct task_group {
154#define MAX_SHARES (1UL << 18) 181#define MAX_SHARES (1UL << 18)
155#endif 182#endif
156 183
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *); 184typedef int (*tg_visitor)(struct task_group *, void *);
163 185
164extern int walk_tg_tree_from(struct task_group *from, 186extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu, 218 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent); 219 struct sched_rt_entity *parent);
198 220
221extern struct task_group *sched_create_group(struct task_group *parent);
222extern void sched_online_group(struct task_group *tg,
223 struct task_group *parent);
224extern void sched_destroy_group(struct task_group *tg);
225extern void sched_offline_group(struct task_group *tg);
226
227extern void sched_move_task(struct task_struct *tsk);
228
229#ifdef CONFIG_FAIR_GROUP_SCHED
230extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
231#endif
232
199#else /* CONFIG_CGROUP_SCHED */ 233#else /* CONFIG_CGROUP_SCHED */
200 234
201struct cfs_bandwidth { }; 235struct cfs_bandwidth { };
@@ -372,10 +406,13 @@ struct rq {
372 #define CPU_LOAD_IDX_MAX 5 406 #define CPU_LOAD_IDX_MAX 5
373 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 407 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
374 unsigned long last_load_update_tick; 408 unsigned long last_load_update_tick;
375#ifdef CONFIG_NO_HZ 409#ifdef CONFIG_NO_HZ_COMMON
376 u64 nohz_stamp; 410 u64 nohz_stamp;
377 unsigned long nohz_flags; 411 unsigned long nohz_flags;
378#endif 412#endif
413#ifdef CONFIG_NO_HZ_FULL
414 unsigned long last_sched_tick;
415#endif
379 int skip_clock_update; 416 int skip_clock_update;
380 417
381 /* capture load from *all* tasks on this cpu: */ 418 /* capture load from *all* tasks on this cpu: */
@@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
547DECLARE_PER_CPU(struct sched_domain *, sd_llc); 584DECLARE_PER_CPU(struct sched_domain *, sd_llc);
548DECLARE_PER_CPU(int, sd_llc_id); 585DECLARE_PER_CPU(int, sd_llc_id);
549 586
587struct sched_group_power {
588 atomic_t ref;
589 /*
590 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
591 * single CPU.
592 */
593 unsigned int power, power_orig;
594 unsigned long next_update;
595 /*
596 * Number of busy cpus in this group.
597 */
598 atomic_t nr_busy_cpus;
599
600 unsigned long cpumask[0]; /* iteration mask */
601};
602
603struct sched_group {
604 struct sched_group *next; /* Must be a circular list */
605 atomic_t ref;
606
607 unsigned int group_weight;
608 struct sched_group_power *sgp;
609
610 /*
611 * The CPUs this group covers.
612 *
613 * NOTE: this field is variable length. (Allocated dynamically
614 * by attaching extra space to the end of the structure,
615 * depending on how many CPUs the kernel has booted up with)
616 */
617 unsigned long cpumask[0];
618};
619
620static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
621{
622 return to_cpumask(sg->cpumask);
623}
624
625/*
626 * cpumask masking which cpus in the group are allowed to iterate up the domain
627 * tree.
628 */
629static inline struct cpumask *sched_group_mask(struct sched_group *sg)
630{
631 return to_cpumask(sg->sgp->cpumask);
632}
633
634/**
635 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
636 * @group: The group whose first cpu is to be returned.
637 */
638static inline unsigned int group_first_cpu(struct sched_group *group)
639{
640 return cpumask_first(sched_group_cpus(group));
641}
642
550extern int group_balance_cpu(struct sched_group *sg); 643extern int group_balance_cpu(struct sched_group *sg);
551 644
552#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
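struct sched_group above ends in a zero-length cpumask[] so the CPU mask can be allocated inline, right behind the fixed fields, sized for however many CPUs the kernel booted with. A userspace sketch of the same trick using a C99 flexible array member and a plain unsigned long bitmap (the kernel uses the GNU cpumask[0] form and struct cpumask; __builtin_ctzl below is a GCC/Clang builtin playing the role of cpumask_first()):

#include <stdio.h>
#include <stdlib.h>

struct group {
    int weight;
    unsigned long cpumask[];    /* flexible array member, sized at alloc time */
};

static struct group *alloc_group(size_t mask_words)
{
    /* one allocation covers the struct and its trailing bitmap */
    return calloc(1, sizeof(struct group) + mask_words * sizeof(unsigned long));
}

int main(void)
{
    struct group *g = alloc_group(2);    /* room for 128 cpus on 64-bit */

    if (!g)
        return 1;
    g->cpumask[0] = 0x5;                 /* cpus 0 and 2 */
    printf("first cpu = %d\n", __builtin_ctzl(g->cpumask[0]));
    free(g);
    return 0;
}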
@@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
784} 877}
785#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 878#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
786 879
880/*
881 * wake flags
882 */
883#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
884#define WF_FORK 0x02 /* child wakeup after fork */
885#define WF_MIGRATED 0x4 /* internal use, task got migrated */
787 886
788static inline void update_load_add(struct load_weight *lw, unsigned long inc) 887static inline void update_load_add(struct load_weight *lw, unsigned long inc)
789{ 888{
@@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = {
856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 955 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
857}; 956};
858 957
859/* Time spent by the tasks of the cpu accounting group executing in ... */ 958#define ENQUEUE_WAKEUP 1
860enum cpuacct_stat_index { 959#define ENQUEUE_HEAD 2
861 CPUACCT_STAT_USER, /* ... user mode */ 960#ifdef CONFIG_SMP
862 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 961#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
962#else
963#define ENQUEUE_WAKING 0
964#endif
863 965
864 CPUACCT_STAT_NSTATS, 966#define DEQUEUE_SLEEP 1
865};
866 967
968struct sched_class {
969 const struct sched_class *next;
970
971 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
972 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
973 void (*yield_task) (struct rq *rq);
974 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
975
976 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
977
978 struct task_struct * (*pick_next_task) (struct rq *rq);
979 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
980
981#ifdef CONFIG_SMP
982 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
983 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
984
985 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
986 void (*post_schedule) (struct rq *this_rq);
987 void (*task_waking) (struct task_struct *task);
988 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
989
990 void (*set_cpus_allowed)(struct task_struct *p,
991 const struct cpumask *newmask);
992
993 void (*rq_online)(struct rq *rq);
994 void (*rq_offline)(struct rq *rq);
995#endif
996
997 void (*set_curr_task) (struct rq *rq);
998 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
999 void (*task_fork) (struct task_struct *p);
1000
1001 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1002 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1003 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1004 int oldprio);
1005
1006 unsigned int (*get_rr_interval) (struct rq *rq,
1007 struct task_struct *task);
1008
1009#ifdef CONFIG_FAIR_GROUP_SCHED
1010 void (*task_move_group) (struct task_struct *p, int on_rq);
1011#endif
1012};
867 1013
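struct sched_class above is the scheduler's ops table: a chain of structures made of function pointers, consulted in priority order until one class handles the request (the kernel walks the real chain with for_each_class(), starting from sched_class_highest defined just below). A rough user-space sketch of that dispatch shape; every type and name here is invented for illustration:

#include <stdio.h>

struct task { int id; };

struct sched_class_demo {
    const struct sched_class_demo *next;
    const char *name;
    struct task *(*pick_next)(void);
};

static struct task fair_task = { 42 };

static struct task *rt_pick(void)   { return NULL; }        /* nothing runnable */
static struct task *fair_pick(void) { return &fair_task; }  /* always has work */

static const struct sched_class_demo fair_class = { NULL, "fair", fair_pick };
static const struct sched_class_demo rt_class   = { &fair_class, "rt", rt_pick };

/* Walk the chain highest-priority-first, the way pick_next_task() does. */
static struct task *pick_next_demo(void)
{
    const struct sched_class_demo *class;

    for (class = &rt_class; class; class = class->next) {
        struct task *p = class->pick_next();

        if (p) {
            printf("class '%s' supplied task %d\n", class->name, p->id);
            return p;
        }
    }
    return NULL;
}

int main(void)
{
    pick_next_demo();
    return 0;
}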
868#define sched_class_highest (&stop_sched_class) 1014#define sched_class_highest (&stop_sched_class)
869#define for_each_class(class) \ 1015#define for_each_class(class) \
@@ -877,9 +1023,23 @@ extern const struct sched_class idle_sched_class;
877 1023
878#ifdef CONFIG_SMP 1024#ifdef CONFIG_SMP
879 1025
1026extern void update_group_power(struct sched_domain *sd, int cpu);
1027
880extern void trigger_load_balance(struct rq *rq, int cpu); 1028extern void trigger_load_balance(struct rq *rq, int cpu);
881extern void idle_balance(int this_cpu, struct rq *this_rq); 1029extern void idle_balance(int this_cpu, struct rq *this_rq);
882 1030
1031/*
1032 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1033 * becomes useful in lb
1034 */
1035#if defined(CONFIG_FAIR_GROUP_SCHED)
1036extern void idle_enter_fair(struct rq *this_rq);
1037extern void idle_exit_fair(struct rq *this_rq);
1038#else
1039static inline void idle_enter_fair(struct rq *this_rq) {}
1040static inline void idle_exit_fair(struct rq *this_rq) {}
1041#endif
1042
883#else /* CONFIG_SMP */ 1043#else /* CONFIG_SMP */
884 1044
885static inline void idle_balance(int cpu, struct rq *rq) 1045static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
891extern void sysrq_sched_debug_show(void); 1051extern void sysrq_sched_debug_show(void);
892extern void sched_init_granularity(void); 1052extern void sched_init_granularity(void);
893extern void update_max_interval(void); 1053extern void update_max_interval(void);
894extern void update_group_power(struct sched_domain *sd, int cpu);
895extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1054extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
896extern void init_sched_rt_class(void); 1055extern void init_sched_rt_class(void);
897extern void init_sched_fair_class(void); 1056extern void init_sched_fair_class(void);
@@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
904 1063
905extern void update_idle_cpu_load(struct rq *this_rq); 1064extern void update_idle_cpu_load(struct rq *this_rq);
906 1065
907#ifdef CONFIG_CGROUP_CPUACCT
908#include <linux/cgroup.h>
909/* track cpu usage of a group of tasks and its child groups */
910struct cpuacct {
911 struct cgroup_subsys_state css;
912 /* cpuusage holds pointer to a u64-type object on every cpu */
913 u64 __percpu *cpuusage;
914 struct kernel_cpustat __percpu *cpustat;
915};
916
917extern struct cgroup_subsys cpuacct_subsys;
918extern struct cpuacct root_cpuacct;
919
920/* return cpu accounting group corresponding to this container */
921static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
922{
923 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
924 struct cpuacct, css);
925}
926
927/* return cpu accounting group to which this task belongs */
928static inline struct cpuacct *task_ca(struct task_struct *tsk)
929{
930 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
931 struct cpuacct, css);
932}
933
934static inline struct cpuacct *parent_ca(struct cpuacct *ca)
935{
936 if (!ca || !ca->css.cgroup->parent)
937 return NULL;
938 return cgroup_ca(ca->css.cgroup->parent);
939}
940
941extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
942#else
943static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
944#endif
945
946#ifdef CONFIG_PARAVIRT 1066#ifdef CONFIG_PARAVIRT
947static inline u64 steal_ticks(u64 steal) 1067static inline u64 steal_ticks(u64 steal)
948{ 1068{
@@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)
956static inline void inc_nr_running(struct rq *rq) 1076static inline void inc_nr_running(struct rq *rq)
957{ 1077{
958 rq->nr_running++; 1078 rq->nr_running++;
1079
1080#ifdef CONFIG_NO_HZ_FULL
1081 if (rq->nr_running == 2) {
1082 if (tick_nohz_full_cpu(rq->cpu)) {
1083 /* Order rq->nr_running write against the IPI */
1084 smp_wmb();
1085 smp_send_reschedule(rq->cpu);
1086 }
1087 }
1088#endif
959} 1089}
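The NO_HZ_FULL branch in inc_nr_running() kicks a full-dynticks CPU once it gains a second runnable task, and the smp_wmb() orders the rq->nr_running store ahead of the reschedule IPI so the remote CPU sees the new count when the interrupt lands. A hedged user-space analogue of that store-then-notify ordering, using C11 release/acquire in place of smp_wmb() plus the IPI (all names invented; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;         /* plain data being published */
static atomic_int flag;     /* stands in for the IPI/kick */

static void *consumer(void *arg)
{
    /* Wait for the "kick"; acquire pairs with the release store below. */
    while (atomic_load_explicit(&flag, memory_order_acquire) == 0)
        ;
    printf("consumer sees payload = %d\n", payload);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, consumer, NULL);

    payload = 2;    /* like rq->nr_running reaching 2 */
    /* Release store orders the payload write before the notification,
     * playing the role of smp_wmb() + smp_send_reschedule(). */
    atomic_store_explicit(&flag, 1, memory_order_release);

    pthread_join(t, NULL);
    return 0;
}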
960 1090
961static inline void dec_nr_running(struct rq *rq) 1091static inline void dec_nr_running(struct rq *rq)
@@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
963 rq->nr_running--; 1093 rq->nr_running--;
964} 1094}
965 1095
1096static inline void rq_last_tick_reset(struct rq *rq)
1097{
1098#ifdef CONFIG_NO_HZ_FULL
1099 rq->last_sched_tick = jiffies;
1100#endif
1101}
1102
966extern void update_rq_clock(struct rq *rq); 1103extern void update_rq_clock(struct rq *rq);
967 1104
968extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1105extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1183 1320
1184extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1321extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1185 1322
1186#ifdef CONFIG_NO_HZ 1323#ifdef CONFIG_NO_HZ_COMMON
1187enum rq_nohz_flag_bits { 1324enum rq_nohz_flag_bits {
1188 NOHZ_TICK_STOPPED, 1325 NOHZ_TICK_STOPPED,
1189 NOHZ_BALANCE_KICK, 1326 NOHZ_BALANCE_KICK,
1190 NOHZ_IDLE,
1191}; 1327};
1192 1328
1193#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1329#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index e036eda1a9c9..da98af347e8b 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file)
130 return seq_open(file, &schedstat_sops); 130 return seq_open(file, &schedstat_sops);
131} 131}
132 132
133static int schedstat_release(struct inode *inode, struct file *file)
134{
135 return 0;
136};
137
138static const struct file_operations proc_schedstat_operations = { 133static const struct file_operations proc_schedstat_operations = {
139 .open = schedstat_open, 134 .open = schedstat_open,
140 .read = seq_read, 135 .read = seq_read,
141 .llseek = seq_lseek, 136 .llseek = seq_lseek,
142 .release = schedstat_release, 137 .release = seq_release,
143}; 138};
144 139
145static int __init proc_schedstat_init(void) 140static int __init proc_schedstat_init(void)
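The stats.c fix replaces a hand-rolled release handler that returned 0 without freeing anything with seq_release(), so the iterator state allocated by schedstat_open()'s seq_open() call is actually torn down on close. For single-record files the usual pairing is single_open()/single_release(); the module below is a hedged sketch of that idiom, not code from this series (file name and output are made up):

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
    seq_printf(m, "hello from a seq_file\n");
    return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
    return single_open(file, demo_show, NULL);
}

static const struct file_operations demo_fops = {
    .owner   = THIS_MODULE,
    .open    = demo_open,
    .read    = seq_read,
    .llseek  = seq_lseek,
    .release = single_release,    /* frees what single_open() set up */
};

static int __init demo_init(void)
{
    proc_create("seqfile_demo", 0444, NULL, &demo_fops);
    return 0;
}

static void __exit demo_exit(void)
{
    remove_proc_entry("seqfile_demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");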
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b593770..b7a10048a32c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
160 case BPF_S_ALU_AND_X: 160 case BPF_S_ALU_AND_X:
161 case BPF_S_ALU_OR_K: 161 case BPF_S_ALU_OR_K:
162 case BPF_S_ALU_OR_X: 162 case BPF_S_ALU_OR_X:
163 case BPF_S_ALU_XOR_K:
164 case BPF_S_ALU_XOR_X:
163 case BPF_S_ALU_LSH_K: 165 case BPF_S_ALU_LSH_K:
164 case BPF_S_ALU_LSH_X: 166 case BPF_S_ALU_LSH_X:
165 case BPF_S_ALU_RSH_K: 167 case BPF_S_ALU_RSH_K:
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe3..6815171a4fff 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up);
193struct semaphore_waiter { 193struct semaphore_waiter {
194 struct list_head list; 194 struct list_head list;
195 struct task_struct *task; 195 struct task_struct *task;
196 int up; 196 bool up;
197}; 197};
198 198
199/* 199/*
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
209 209
210 list_add_tail(&waiter.list, &sem->wait_list); 210 list_add_tail(&waiter.list, &sem->wait_list);
211 waiter.task = task; 211 waiter.task = task;
212 waiter.up = 0; 212 waiter.up = false;
213 213
214 for (;;) { 214 for (;;) {
215 if (signal_pending_state(state, task)) 215 if (signal_pending_state(state, task))
216 goto interrupted; 216 goto interrupted;
217 if (timeout <= 0) 217 if (unlikely(timeout <= 0))
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 raw_spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem)
258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, 258 struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
259 struct semaphore_waiter, list); 259 struct semaphore_waiter, list);
260 list_del(&waiter->list); 260 list_del(&waiter->list);
261 waiter->up = 1; 261 waiter->up = true;
262 wake_up_process(waiter->task); 262 wake_up_process(waiter->task);
263} 263}
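Besides flipping the wakeup flag to bool, the hunk wraps the timeout check in unlikely(), a hint that the timed-out path is cold. In the kernel these hints boil down to __builtin_expect(); a small user-space sketch with the same macros (wait_step() is invented for the demo):

#include <stdio.h>

#define likely(x)    __builtin_expect(!!(x), 1)
#define unlikely(x)  __builtin_expect(!!(x), 0)

static long wait_step(long timeout)
{
    if (unlikely(timeout <= 0))    /* rare path: compiler lays it out cold */
        return -1;
    return timeout - 1;
}

int main(void)
{
    long t = 3;

    while ((t = wait_step(t)) >= 0)
        printf("tick, %ld left\n", t);
    return 0;
}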
diff --git a/kernel/signal.c b/kernel/signal.c
index 2ec870a4c3c4..113411bfe8b1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,6 +32,7 @@
32#include <linux/user_namespace.h> 32#include <linux/user_namespace.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/compat.h> 34#include <linux/compat.h>
35#include <linux/cn_proc.h>
35#define CREATE_TRACE_POINTS 36#define CREATE_TRACE_POINTS
36#include <trace/events/signal.h> 37#include <trace/events/signal.h>
37 38
@@ -485,6 +486,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
485 if (force_default || ka->sa.sa_handler != SIG_IGN) 486 if (force_default || ka->sa.sa_handler != SIG_IGN)
486 ka->sa.sa_handler = SIG_DFL; 487 ka->sa.sa_handler = SIG_DFL;
487 ka->sa.sa_flags = 0; 488 ka->sa.sa_flags = 0;
489#ifdef __ARCH_HAS_SA_RESTORER
490 ka->sa.sa_restorer = NULL;
491#endif
488 sigemptyset(&ka->sa.sa_mask); 492 sigemptyset(&ka->sa.sa_mask);
489 ka++; 493 ka++;
490 } 494 }
@@ -851,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t)
851 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
852 * it should be dropped. 856 * it should be dropped.
853 */ 857 */
854static int prepare_signal(int sig, struct task_struct *p, bool force) 858static bool prepare_signal(int sig, struct task_struct *p, bool force)
855{ 859{
856 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
857 struct task_struct *t; 861 struct task_struct *t;
858 862
859 if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { 863 if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
864 if (signal->flags & SIGNAL_GROUP_COREDUMP)
865 return sig == SIGKILL;
860 /* 866 /*
861 * The process is in the middle of dying, nothing to do. 867 * The process is in the middle of dying, nothing to do.
862 */ 868 */
@@ -1157,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
1157static void print_fatal_signal(int signr) 1163static void print_fatal_signal(int signr)
1158{ 1164{
1159 struct pt_regs *regs = signal_pt_regs(); 1165 struct pt_regs *regs = signal_pt_regs();
1160 printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n", 1166 printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
1161 current->comm, task_pid_nr(current), signr);
1162 1167
1163#if defined(__i386__) && !defined(__arch_um__) 1168#if defined(__i386__) && !defined(__arch_um__)
1164 printk(KERN_INFO "code at %08lx: ", regs->ip); 1169 printk(KERN_INFO "code at %08lx: ", regs->ip);
@@ -2347,6 +2352,7 @@ relock:
2347 if (sig_kernel_coredump(signr)) { 2352 if (sig_kernel_coredump(signr)) {
2348 if (print_fatal_signals) 2353 if (print_fatal_signals)
2349 print_fatal_signal(info->si_signo); 2354 print_fatal_signal(info->si_signo);
2355 proc_coredump_connector(current);
2350 /* 2356 /*
2351 * If it was able to dump core, this kills all 2357 * If it was able to dump core, this kills all
2352 * other threads in the group and synchronizes with 2358 * other threads in the group and synchronizes with
@@ -2682,7 +2688,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
2682/** 2688/**
2683 * sys_rt_sigpending - examine a pending signal that has been raised 2689 * sys_rt_sigpending - examine a pending signal that has been raised
2684 * while blocked 2690 * while blocked
2685 * @set: stores pending signals 2691 * @uset: stores pending signals
2686 * @sigsetsize: size of sigset_t type or larger 2692 * @sigsetsize: size of sigset_t type or larger
2687 */ 2693 */
2688SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) 2694SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
@@ -2945,7 +2951,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2945 2951
2946static int do_tkill(pid_t tgid, pid_t pid, int sig) 2952static int do_tkill(pid_t tgid, pid_t pid, int sig)
2947{ 2953{
2948 struct siginfo info; 2954 struct siginfo info = {};
2949 2955
2950 info.si_signo = sig; 2956 info.si_signo = sig;
2951 info.si_errno = 0; 2957 info.si_errno = 0;
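The do_tkill() change initializes the on-stack siginfo with `= {}`, so any field the function never assigns is zero rather than leftover stack contents by the time the structure is copied further. The snippet below illustrates the difference with an invented structure; note that the empty-brace form is a GNU extension, while portable C writes `= { 0 }`:

#include <stdio.h>

struct demo_info {
    int signo;
    int errno_;     /* underscore avoids the errno macro */
    int code;
    int pad[4];     /* fields the caller never touches */
};

static void fill_partial(struct demo_info *info)
{
    info->signo = 9;
    info->errno_ = 0;
    info->code = -6;    /* illustrative value only */
}

int main(void)
{
    struct demo_info a;             /* uninitialized: pad[] is indeterminate */
    struct demo_info b = { 0 };     /* every member starts at zero */

    fill_partial(&a);
    fill_partial(&b);
    printf("b.pad[0] = %d (a.pad[0] would be indeterminate)\n", b.pad[0]);
    return 0;
}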
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e451f3ff51b..4dba0f7b72ad 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -100,16 +100,16 @@ void __init call_function_init(void)
100 * previous function call. For multi-cpu calls its even more interesting 100 * previous function call. For multi-cpu calls its even more interesting
101 * as we'll have to ensure no other cpu is observing our csd. 101 * as we'll have to ensure no other cpu is observing our csd.
102 */ 102 */
103static void csd_lock_wait(struct call_single_data *data) 103static void csd_lock_wait(struct call_single_data *csd)
104{ 104{
105 while (data->flags & CSD_FLAG_LOCK) 105 while (csd->flags & CSD_FLAG_LOCK)
106 cpu_relax(); 106 cpu_relax();
107} 107}
108 108
109static void csd_lock(struct call_single_data *data) 109static void csd_lock(struct call_single_data *csd)
110{ 110{
111 csd_lock_wait(data); 111 csd_lock_wait(csd);
112 data->flags = CSD_FLAG_LOCK; 112 csd->flags |= CSD_FLAG_LOCK;
113 113
114 /* 114 /*
115 * prevent CPU from reordering the above assignment 115 * prevent CPU from reordering the above assignment
@@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data)
119 smp_mb(); 119 smp_mb();
120} 120}
121 121
122static void csd_unlock(struct call_single_data *data) 122static void csd_unlock(struct call_single_data *csd)
123{ 123{
124 WARN_ON(!(data->flags & CSD_FLAG_LOCK)); 124 WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
125 125
126 /* 126 /*
127 * ensure we're all done before releasing data: 127 * ensure we're all done before releasing data:
128 */ 128 */
129 smp_mb(); 129 smp_mb();
130 130
131 data->flags &= ~CSD_FLAG_LOCK; 131 csd->flags &= ~CSD_FLAG_LOCK;
132} 132}
133 133
134/* 134/*
@@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data)
137 * ->func, ->info, and ->flags set. 137 * ->func, ->info, and ->flags set.
138 */ 138 */
139static 139static
140void generic_exec_single(int cpu, struct call_single_data *data, int wait) 140void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
141{ 141{
142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); 142 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
143 unsigned long flags; 143 unsigned long flags;
@@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
145 145
146 raw_spin_lock_irqsave(&dst->lock, flags); 146 raw_spin_lock_irqsave(&dst->lock, flags);
147 ipi = list_empty(&dst->list); 147 ipi = list_empty(&dst->list);
148 list_add_tail(&data->list, &dst->list); 148 list_add_tail(&csd->list, &dst->list);
149 raw_spin_unlock_irqrestore(&dst->lock, flags); 149 raw_spin_unlock_irqrestore(&dst->lock, flags);
150 150
151 /* 151 /*
@@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
163 arch_send_call_function_single_ipi(cpu); 163 arch_send_call_function_single_ipi(cpu);
164 164
165 if (wait) 165 if (wait)
166 csd_lock_wait(data); 166 csd_lock_wait(csd);
167} 167}
168 168
169/* 169/*
@@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
173void generic_smp_call_function_single_interrupt(void) 173void generic_smp_call_function_single_interrupt(void)
174{ 174{
175 struct call_single_queue *q = &__get_cpu_var(call_single_queue); 175 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
176 unsigned int data_flags;
177 LIST_HEAD(list); 176 LIST_HEAD(list);
178 177
179 /* 178 /*
@@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
186 raw_spin_unlock(&q->lock); 185 raw_spin_unlock(&q->lock);
187 186
188 while (!list_empty(&list)) { 187 while (!list_empty(&list)) {
189 struct call_single_data *data; 188 struct call_single_data *csd;
189 unsigned int csd_flags;
190 190
191 data = list_entry(list.next, struct call_single_data, list); 191 csd = list_entry(list.next, struct call_single_data, list);
192 list_del(&data->list); 192 list_del(&csd->list);
193 193
194 /* 194 /*
195 * 'data' can be invalid after this call if flags == 0 195 * 'csd' can be invalid after this call if flags == 0
196 * (when called through generic_exec_single()), 196 * (when called through generic_exec_single()),
197 * so save them away before making the call: 197 * so save them away before making the call:
198 */ 198 */
199 data_flags = data->flags; 199 csd_flags = csd->flags;
200 200
201 data->func(data->info); 201 csd->func(csd->info);
202 202
203 /* 203 /*
204 * Unlocked CSDs are valid through generic_exec_single(): 204 * Unlocked CSDs are valid through generic_exec_single():
205 */ 205 */
206 if (data_flags & CSD_FLAG_LOCK) 206 if (csd_flags & CSD_FLAG_LOCK)
207 csd_unlock(data); 207 csd_unlock(csd);
208 } 208 }
209} 209}
210 210
@@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
249 local_irq_restore(flags); 249 local_irq_restore(flags);
250 } else { 250 } else {
251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { 251 if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
252 struct call_single_data *data = &d; 252 struct call_single_data *csd = &d;
253 253
254 if (!wait) 254 if (!wait)
255 data = &__get_cpu_var(csd_data); 255 csd = &__get_cpu_var(csd_data);
256 256
257 csd_lock(data); 257 csd_lock(csd);
258 258
259 data->func = func; 259 csd->func = func;
260 data->info = info; 260 csd->info = info;
261 generic_exec_single(cpu, data, wait); 261 generic_exec_single(cpu, csd, wait);
262 } else { 262 } else {
263 err = -ENXIO; /* CPU not online */ 263 err = -ENXIO; /* CPU not online */
264 } 264 }
@@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
325 * pre-allocated data structure. Useful for embedding @data inside 325 * pre-allocated data structure. Useful for embedding @data inside
326 * other structures, for instance. 326 * other structures, for instance.
327 */ 327 */
328void __smp_call_function_single(int cpu, struct call_single_data *data, 328void __smp_call_function_single(int cpu, struct call_single_data *csd,
329 int wait) 329 int wait)
330{ 330{
331 unsigned int this_cpu; 331 unsigned int this_cpu;
@@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
343 343
344 if (cpu == this_cpu) { 344 if (cpu == this_cpu) {
345 local_irq_save(flags); 345 local_irq_save(flags);
346 data->func(data->info); 346 csd->func(csd->info);
347 local_irq_restore(flags); 347 local_irq_restore(flags);
348 } else { 348 } else {
349 csd_lock(data); 349 csd_lock(csd);
350 generic_exec_single(cpu, data, wait); 350 generic_exec_single(cpu, csd, wait);
351 } 351 }
352 put_cpu(); 352 put_cpu();
353} 353}
@@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
369void smp_call_function_many(const struct cpumask *mask, 369void smp_call_function_many(const struct cpumask *mask,
370 smp_call_func_t func, void *info, bool wait) 370 smp_call_func_t func, void *info, bool wait)
371{ 371{
372 struct call_function_data *data; 372 struct call_function_data *cfd;
373 int cpu, next_cpu, this_cpu = smp_processor_id(); 373 int cpu, next_cpu, this_cpu = smp_processor_id();
374 374
375 /* 375 /*
@@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask,
401 return; 401 return;
402 } 402 }
403 403
404 data = &__get_cpu_var(cfd_data); 404 cfd = &__get_cpu_var(cfd_data);
405 405
406 cpumask_and(data->cpumask, mask, cpu_online_mask); 406 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
407 cpumask_clear_cpu(this_cpu, data->cpumask); 407 cpumask_clear_cpu(this_cpu, cfd->cpumask);
408 408
409 /* Some callers race with other cpus changing the passed mask */ 409 /* Some callers race with other cpus changing the passed mask */
410 if (unlikely(!cpumask_weight(data->cpumask))) 410 if (unlikely(!cpumask_weight(cfd->cpumask)))
411 return; 411 return;
412 412
413 /* 413 /*
414 * After we put an entry into the list, data->cpumask 414 * After we put an entry into the list, cfd->cpumask may be cleared
415 * may be cleared again when another CPU sends another IPI for 415 * again when another CPU sends another IPI for a SMP function call, so
416 * a SMP function call, so data->cpumask will be zero. 416 * cfd->cpumask will be zero.
417 */ 417 */
418 cpumask_copy(data->cpumask_ipi, data->cpumask); 418 cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
419 419
420 for_each_cpu(cpu, data->cpumask) { 420 for_each_cpu(cpu, cfd->cpumask) {
421 struct call_single_data *csd = per_cpu_ptr(data->csd, cpu); 421 struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
422 struct call_single_queue *dst = 422 struct call_single_queue *dst =
423 &per_cpu(call_single_queue, cpu); 423 &per_cpu(call_single_queue, cpu);
424 unsigned long flags; 424 unsigned long flags;
@@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask,
433 } 433 }
434 434
435 /* Send a message to all CPUs in the map */ 435 /* Send a message to all CPUs in the map */
436 arch_send_call_function_ipi_mask(data->cpumask_ipi); 436 arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
437 437
438 if (wait) { 438 if (wait) {
439 for_each_cpu(cpu, data->cpumask) { 439 for_each_cpu(cpu, cfd->cpumask) {
440 struct call_single_data *csd = 440 struct call_single_data *csd;
441 per_cpu_ptr(data->csd, cpu); 441
442 csd = per_cpu_ptr(cfd->csd, cpu);
442 csd_lock_wait(csd); 443 csd_lock_wait(csd);
443 } 444 }
444 } 445 }
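The smp.c hunks are mostly a rename from the ambiguous "data" to csd/cfd around the cross-CPU call plumbing; the exported interfaces keep their signatures. For context, a hedged module sketch of the consumer side, running a function on another CPU and waiting for it (CPU number, message and module names are arbitrary):

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void remote_hello(void *info)
{
    /* Runs in IPI context on the target CPU: keep it short, no sleeping. */
    pr_info("hello from CPU %d, cookie=%lu\n",
            smp_processor_id(), *(unsigned long *)info);
}

static int __init csd_demo_init(void)
{
    unsigned long cookie = 42;
    int target = 1;    /* assumes a second online CPU */
    int ret;

    /* wait=1: return only after remote_hello() has finished remotely. */
    ret = smp_call_function_single(target, remote_hello, &cookie, 1);
    if (ret)
        pr_info("cpu %d not available: %d\n", target, ret);
    return 0;
}

static void __exit csd_demo_exit(void)
{
}

module_init(csd_demo_init);
module_exit(csd_demo_exit);
MODULE_LICENSE("GPL");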
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 8eaed9aa9cf0..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
185 } 185 }
186 get_task_struct(tsk); 186 get_task_struct(tsk);
187 *per_cpu_ptr(ht->store, cpu) = tsk; 187 *per_cpu_ptr(ht->store, cpu) = tsk;
188 if (ht->create) 188 if (ht->create) {
189 ht->create(cpu); 189 /*
190 * Make sure that the task has actually scheduled out
191 * into park position, before calling the create
192 * callback. At least the migration thread callback
193 * requires that the task is off the runqueue.
194 */
195 if (!wait_task_inactive(tsk, TASK_PARKED))
196 WARN_ON(1);
197 else
198 ht->create(cpu);
199 }
190 return 0; 200 return 0;
191} 201}
192 202
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14d7758074aa..b5197dcb0dad 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void)
329 wakeup_softirqd(); 329 wakeup_softirqd();
330} 330}
331 331
332static inline void tick_irq_exit(void)
333{
334#ifdef CONFIG_NO_HZ_COMMON
335 int cpu = smp_processor_id();
336
337 /* Make sure that timer wheel updates are propagated */
338 if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
339 if (!in_interrupt())
340 tick_nohz_irq_exit();
341 }
342#endif
343}
344
332/* 345/*
333 * Exit an interrupt context. Process softirqs if needed and possible: 346 * Exit an interrupt context. Process softirqs if needed and possible:
334 */ 347 */
@@ -346,11 +359,7 @@ void irq_exit(void)
346 if (!in_interrupt() && local_softirq_pending()) 359 if (!in_interrupt() && local_softirq_pending())
347 invoke_softirq(); 360 invoke_softirq();
348 361
349#ifdef CONFIG_NO_HZ 362 tick_irq_exit();
350 /* Make sure that timer wheel updates are propagated */
351 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
352 tick_nohz_irq_exit();
353#endif
354 rcu_irq_exit(); 363 rcu_irq_exit();
355} 364}
356 365
@@ -620,8 +629,7 @@ static void remote_softirq_receive(void *data)
620 unsigned long flags; 629 unsigned long flags;
621 int softirq; 630 int softirq;
622 631
623 softirq = cp->priv; 632 softirq = *(int *)cp->info;
624
625 local_irq_save(flags); 633 local_irq_save(flags);
626 __local_trigger(cp, softirq); 634 __local_trigger(cp, softirq);
627 local_irq_restore(flags); 635 local_irq_restore(flags);
@@ -631,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
631{ 639{
632 if (cpu_online(cpu)) { 640 if (cpu_online(cpu)) {
633 cp->func = remote_softirq_receive; 641 cp->func = remote_softirq_receive;
634 cp->info = cp; 642 cp->info = &softirq;
635 cp->flags = 0; 643 cp->flags = 0;
636 cp->priv = softirq;
637 644
638 __smp_call_function_single(cpu, cp, 0); 645 __smp_call_function_single(cpu, cp, 0);
639 return 0; 646 return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f56445fba9..b95d3c72ba21 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,11 @@
49#include <linux/user_namespace.h> 49#include <linux/user_namespace.h>
50#include <linux/binfmts.h> 50#include <linux/binfmts.h>
51 51
52#include <linux/sched.h>
53#include <linux/rcupdate.h>
54#include <linux/uidgid.h>
55#include <linux/cred.h>
56
52#include <linux/kmsg_dump.h> 57#include <linux/kmsg_dump.h>
53/* Move somewhere else to avoid recompiling? */ 58/* Move somewhere else to avoid recompiling? */
54#include <generated/utsrelease.h> 59#include <generated/utsrelease.h>
@@ -324,7 +329,6 @@ void kernel_restart_prepare(char *cmd)
324 system_state = SYSTEM_RESTART; 329 system_state = SYSTEM_RESTART;
325 usermodehelper_disable(); 330 usermodehelper_disable();
326 device_shutdown(); 331 device_shutdown();
327 syscore_shutdown();
328} 332}
329 333
330/** 334/**
@@ -370,6 +374,7 @@ void kernel_restart(char *cmd)
370{ 374{
371 kernel_restart_prepare(cmd); 375 kernel_restart_prepare(cmd);
372 disable_nonboot_cpus(); 376 disable_nonboot_cpus();
377 syscore_shutdown();
373 if (!cmd) 378 if (!cmd)
374 printk(KERN_EMERG "Restarting system.\n"); 379 printk(KERN_EMERG "Restarting system.\n");
375 else 380 else
@@ -395,6 +400,7 @@ static void kernel_shutdown_prepare(enum system_states state)
395void kernel_halt(void) 400void kernel_halt(void)
396{ 401{
397 kernel_shutdown_prepare(SYSTEM_HALT); 402 kernel_shutdown_prepare(SYSTEM_HALT);
403 disable_nonboot_cpus();
398 syscore_shutdown(); 404 syscore_shutdown();
399 printk(KERN_EMERG "System halted.\n"); 405 printk(KERN_EMERG "System halted.\n");
400 kmsg_dump(KMSG_DUMP_HALT); 406 kmsg_dump(KMSG_DUMP_HALT);
@@ -1043,6 +1049,67 @@ change_okay:
1043 return old_fsgid; 1049 return old_fsgid;
1044} 1050}
1045 1051
1052/**
1053 * sys_getpid - return the thread group id of the current process
1054 *
1055 * Note, despite the name, this returns the tgid not the pid. The tgid and
1056 * the pid are identical unless CLONE_THREAD was specified on clone() in
1057 * which case the tgid is the same in all threads of the same group.
1058 *
1059 * This is SMP safe as current->tgid does not change.
1060 */
1061SYSCALL_DEFINE0(getpid)
1062{
1063 return task_tgid_vnr(current);
1064}
1065
1066/* Thread ID - the internal kernel "pid" */
1067SYSCALL_DEFINE0(gettid)
1068{
1069 return task_pid_vnr(current);
1070}
1071
1072/*
1073 * Accessing ->real_parent is not SMP-safe, it could
1074 * change from under us. However, we can use a stale
1075 * value of ->real_parent under rcu_read_lock(), see
1076 * release_task()->call_rcu(delayed_put_task_struct).
1077 */
1078SYSCALL_DEFINE0(getppid)
1079{
1080 int pid;
1081
1082 rcu_read_lock();
1083 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1084 rcu_read_unlock();
1085
1086 return pid;
1087}
1088
1089SYSCALL_DEFINE0(getuid)
1090{
1091 /* Only we change this so SMP safe */
1092 return from_kuid_munged(current_user_ns(), current_uid());
1093}
1094
1095SYSCALL_DEFINE0(geteuid)
1096{
1097 /* Only we change this so SMP safe */
1098 return from_kuid_munged(current_user_ns(), current_euid());
1099}
1100
1101SYSCALL_DEFINE0(getgid)
1102{
1103 /* Only we change this so SMP safe */
1104 return from_kgid_munged(current_user_ns(), current_gid());
1105}
1106
1107SYSCALL_DEFINE0(getegid)
1108{
1109 /* Only we change this so SMP safe */
1110 return from_kgid_munged(current_user_ns(), current_egid());
1111}
1112
1046void do_sys_times(struct tms *tms) 1113void do_sys_times(struct tms *tms)
1047{ 1114{
1048 cputime_t tgutime, tgstime, cutime, cstime; 1115 cputime_t tgutime, tgstime, cutime, cstime;
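Of the new syscall definitions, sys_getppid() is the one with a real pattern in it: current->real_parent can change concurrently, so it is sampled inside rcu_read_lock()/rcu_dereference(), trading possible staleness for a pointer that cannot be freed underneath the reader. The fragment below restates that read-side shape with placeholder types (struct foo and gp are invented; the matching updater would publish with rcu_assign_pointer() and defer freeing past a grace period):

#include <linux/rcupdate.h>

struct foo {
    int value;
};

static struct foo __rcu *gp;    /* published elsewhere via rcu_assign_pointer() */

static int read_value(void)
{
    struct foo *p;
    int val = -1;

    rcu_read_lock();                /* reader section: no blocking inside */
    p = rcu_dereference(gp);        /* load the protected pointer */
    if (p)
        val = p->value;             /* p stays valid until rcu_read_unlock() */
    rcu_read_unlock();

    return val;                     /* possibly stale, never dangling */
}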
@@ -1784,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1784 return getrusage(current, who, ru); 1851 return getrusage(current, who, ru);
1785} 1852}
1786 1853
1854#ifdef CONFIG_COMPAT
1855COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
1856{
1857 struct rusage r;
1858
1859 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1860 who != RUSAGE_THREAD)
1861 return -EINVAL;
1862
1863 k_getrusage(current, who, &r);
1864 return put_compat_rusage(&r, ru);
1865}
1866#endif
1867
1787SYSCALL_DEFINE1(umask, int, mask) 1868SYSCALL_DEFINE1(umask, int, mask)
1788{ 1869{
1789 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1870 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1790 return mask; 1871 return mask;
1791} 1872}
1792 1873
1793#ifdef CONFIG_CHECKPOINT_RESTORE
1794static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1874static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1795{ 1875{
1796 struct fd exe; 1876 struct fd exe;
@@ -1984,17 +2064,12 @@ out:
1984 return error; 2064 return error;
1985} 2065}
1986 2066
2067#ifdef CONFIG_CHECKPOINT_RESTORE
1987static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2068static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1988{ 2069{
1989 return put_user(me->clear_child_tid, tid_addr); 2070 return put_user(me->clear_child_tid, tid_addr);
1990} 2071}
1991 2072#else
1992#else /* CONFIG_CHECKPOINT_RESTORE */
1993static int prctl_set_mm(int opt, unsigned long addr,
1994 unsigned long arg4, unsigned long arg5)
1995{
1996 return -EINVAL;
1997}
1998static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) 2073static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
1999{ 2074{
2000 return -EINVAL; 2075 return -EINVAL;
@@ -2185,9 +2260,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2185 2260
2186char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 2261char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
2187 2262
2188static int __orderly_poweroff(void) 2263static int __orderly_poweroff(bool force)
2189{ 2264{
2190 int argc;
2191 char **argv; 2265 char **argv;
2192 static char *envp[] = { 2266 static char *envp[] = {
2193 "HOME=/", 2267 "HOME=/",
@@ -2196,20 +2270,40 @@ static int __orderly_poweroff(void)
2196 }; 2270 };
2197 int ret; 2271 int ret;
2198 2272
2199 argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); 2273 argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
2200 if (argv == NULL) { 2274 if (argv) {
2275 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
2276 argv_free(argv);
2277 } else {
2201 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", 2278 printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
2202 __func__, poweroff_cmd); 2279 __func__, poweroff_cmd);
2203 return -ENOMEM; 2280 ret = -ENOMEM;
2204 } 2281 }
2205 2282
2206 ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, 2283 if (ret && force) {
2207 NULL, NULL, NULL); 2284 printk(KERN_WARNING "Failed to start orderly shutdown: "
2208 argv_free(argv); 2285 "forcing the issue\n");
2286 /*
2287 * I guess this should try to kick off some daemon to sync and
2288 * poweroff asap. Or not even bother syncing if we're doing an
2289 * emergency shutdown?
2290 */
2291 emergency_sync();
2292 kernel_power_off();
2293 }
2209 2294
2210 return ret; 2295 return ret;
2211} 2296}
2212 2297
2298static bool poweroff_force;
2299
2300static void poweroff_work_func(struct work_struct *work)
2301{
2302 __orderly_poweroff(poweroff_force);
2303}
2304
2305static DECLARE_WORK(poweroff_work, poweroff_work_func);
2306
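The block just above is the heart of the rework: orderly_poweroff() now only records the force flag and queues poweroff_work, and the usermode helper runs later from the workqueue in process context, where sleeping and GFP_KERNEL allocations are fine (note the matching GFP_ATOMIC to GFP_KERNEL switch in __orderly_poweroff()). A hedged standalone sketch of the same defer-to-a-workqueue shape, with all names invented:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

static void demo_work_func(struct work_struct *work)
{
    /* Process context: may sleep, allocate with GFP_KERNEL, etc. */
    pr_info("deferred work running\n");
}

static DECLARE_WORK(demo_work, demo_work_func);

/* Callable from almost any context: just queue and return. */
static void demo_trigger(void)
{
    schedule_work(&demo_work);
}

static int __init wq_demo_init(void)
{
    demo_trigger();
    return 0;
}

static void __exit wq_demo_exit(void)
{
    /* Cancel or wait out any pending run before the module goes away. */
    cancel_work_sync(&demo_work);
}

module_init(wq_demo_init);
module_exit(wq_demo_exit);
MODULE_LICENSE("GPL");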
2213/** 2307/**
2214 * orderly_poweroff - Trigger an orderly system poweroff 2308 * orderly_poweroff - Trigger an orderly system poweroff
2215 * @force: force poweroff if command execution fails 2309 * @force: force poweroff if command execution fails
@@ -2219,21 +2313,154 @@ static int __orderly_poweroff(void)
2219 */ 2313 */
2220int orderly_poweroff(bool force) 2314int orderly_poweroff(bool force)
2221{ 2315{
2222 int ret = __orderly_poweroff(); 2316 if (force) /* do not override the pending "true" */
2317 poweroff_force = true;
2318 schedule_work(&poweroff_work);
2319 return 0;
2320}
2321EXPORT_SYMBOL_GPL(orderly_poweroff);
2223 2322
2224 if (ret && force) { 2323/**
2225 printk(KERN_WARNING "Failed to start orderly shutdown: " 2324 * do_sysinfo - fill in sysinfo struct
2226 "forcing the issue\n"); 2325 * @info: pointer to buffer to fill
2326 */
2327static int do_sysinfo(struct sysinfo *info)
2328{
2329 unsigned long mem_total, sav_total;
2330 unsigned int mem_unit, bitcount;
2331 struct timespec tp;
2227 2332
2228 /* 2333 memset(info, 0, sizeof(struct sysinfo));
2229 * I guess this should try to kick off some daemon to sync and 2334
2230 * poweroff asap. Or not even bother syncing if we're doing an 2335 ktime_get_ts(&tp);
2231 * emergency shutdown? 2336 monotonic_to_bootbased(&tp);
2232 */ 2337 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
2233 emergency_sync(); 2338
2234 kernel_power_off(); 2339 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
2340
2341 info->procs = nr_threads;
2342
2343 si_meminfo(info);
2344 si_swapinfo(info);
2345
2346 /*
2347 * If the sum of all the available memory (i.e. ram + swap)
2348 * is less than can be stored in a 32 bit unsigned long then
2349 * we can be binary compatible with 2.2.x kernels. If not,
2350 * well, in that case 2.2.x was broken anyways...
2351 *
2352 * -Erik Andersen <andersee@debian.org>
2353 */
2354
2355 mem_total = info->totalram + info->totalswap;
2356 if (mem_total < info->totalram || mem_total < info->totalswap)
2357 goto out;
2358 bitcount = 0;
2359 mem_unit = info->mem_unit;
2360 while (mem_unit > 1) {
2361 bitcount++;
2362 mem_unit >>= 1;
2363 sav_total = mem_total;
2364 mem_total <<= 1;
2365 if (mem_total < sav_total)
2366 goto out;
2235 } 2367 }
2236 2368
2237 return ret; 2369 /*
2370 * If mem_total did not overflow, multiply all memory values by
2371 * info->mem_unit and set it to 1. This leaves things compatible
2372 * with 2.2.x, and also retains compatibility with earlier 2.4.x
2373 * kernels...
2374 */
2375
2376 info->mem_unit = 1;
2377 info->totalram <<= bitcount;
2378 info->freeram <<= bitcount;
2379 info->sharedram <<= bitcount;
2380 info->bufferram <<= bitcount;
2381 info->totalswap <<= bitcount;
2382 info->freeswap <<= bitcount;
2383 info->totalhigh <<= bitcount;
2384 info->freehigh <<= bitcount;
2385
2386out:
2387 return 0;
2238} 2388}
2239EXPORT_SYMBOL_GPL(orderly_poweroff); 2389
2390SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
2391{
2392 struct sysinfo val;
2393
2394 do_sysinfo(&val);
2395
2396 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
2397 return -EFAULT;
2398
2399 return 0;
2400}
2401
2402#ifdef CONFIG_COMPAT
2403struct compat_sysinfo {
2404 s32 uptime;
2405 u32 loads[3];
2406 u32 totalram;
2407 u32 freeram;
2408 u32 sharedram;
2409 u32 bufferram;
2410 u32 totalswap;
2411 u32 freeswap;
2412 u16 procs;
2413 u16 pad;
2414 u32 totalhigh;
2415 u32 freehigh;
2416 u32 mem_unit;
2417 char _f[20-2*sizeof(u32)-sizeof(int)];
2418};
2419
2420COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2421{
2422 struct sysinfo s;
2423
2424 do_sysinfo(&s);
2425
2426 /* Check to see if any memory value is too large for 32-bit and scale
2427 * down if needed
2428 */
2429 if ((s.totalram >> 32) || (s.totalswap >> 32)) {
2430 int bitcount = 0;
2431
2432 while (s.mem_unit < PAGE_SIZE) {
2433 s.mem_unit <<= 1;
2434 bitcount++;
2435 }
2436
2437 s.totalram >>= bitcount;
2438 s.freeram >>= bitcount;
2439 s.sharedram >>= bitcount;
2440 s.bufferram >>= bitcount;
2441 s.totalswap >>= bitcount;
2442 s.freeswap >>= bitcount;
2443 s.totalhigh >>= bitcount;
2444 s.freehigh >>= bitcount;
2445 }
2446
2447 if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
2448 __put_user(s.uptime, &info->uptime) ||
2449 __put_user(s.loads[0], &info->loads[0]) ||
2450 __put_user(s.loads[1], &info->loads[1]) ||
2451 __put_user(s.loads[2], &info->loads[2]) ||
2452 __put_user(s.totalram, &info->totalram) ||
2453 __put_user(s.freeram, &info->freeram) ||
2454 __put_user(s.sharedram, &info->sharedram) ||
2455 __put_user(s.bufferram, &info->bufferram) ||
2456 __put_user(s.totalswap, &info->totalswap) ||
2457 __put_user(s.freeswap, &info->freeswap) ||
2458 __put_user(s.procs, &info->procs) ||
2459 __put_user(s.totalhigh, &info->totalhigh) ||
2460 __put_user(s.freehigh, &info->freehigh) ||
2461 __put_user(s.mem_unit, &info->mem_unit))
2462 return -EFAULT;
2463
2464 return 0;
2465}
2466#endif /* CONFIG_COMPAT */
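compat_sysinfo() has to fit 64-bit memory counters into 32-bit fields, so it grows mem_unit up to the page size and right-shifts every counter by the same number of bits; userspace can still recover byte counts as value * mem_unit. The same arithmetic as a runnable user-space demo (the totals are made up):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Pretend totals from a machine with ~8 TiB of RAM, in 1-byte units. */
    uint64_t totalram = 8ULL << 40;
    uint64_t totalswap = 1ULL << 40;
    uint32_t mem_unit = 1;
    const uint32_t page_size = 4096;
    int bitcount = 0;

    /* Mirror the kernel's loop: scale until the unit reaches PAGE_SIZE. */
    if ((totalram >> 32) || (totalswap >> 32)) {
        while (mem_unit < page_size) {
            mem_unit <<= 1;
            bitcount++;
        }
        totalram >>= bitcount;
        totalswap >>= bitcount;
    }

    printf("mem_unit=%u, totalram=%llu units (= %llu bytes)\n",
           mem_unit,
           (unsigned long long)totalram,
           (unsigned long long)(totalram * mem_unit));
    return 0;
}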
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce16..bfd6787b355a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -20,6 +20,7 @@ cond_syscall(sys_quotactl);
20cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
21cond_syscall(sys_acct); 21cond_syscall(sys_acct);
22cond_syscall(sys_lookup_dcookie); 22cond_syscall(sys_lookup_dcookie);
23cond_syscall(compat_sys_lookup_dcookie);
23cond_syscall(sys_swapon); 24cond_syscall(sys_swapon);
24cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
25cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
@@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev);
155cond_syscall(sys_pciconfig_read); 156cond_syscall(sys_pciconfig_read);
156cond_syscall(sys_pciconfig_write); 157cond_syscall(sys_pciconfig_write);
157cond_syscall(sys_pciconfig_iobase); 158cond_syscall(sys_pciconfig_iobase);
158cond_syscall(sys32_ipc); 159cond_syscall(compat_sys_s390_ipc);
159cond_syscall(ppc_rtas); 160cond_syscall(ppc_rtas);
160cond_syscall(sys_spu_run); 161cond_syscall(sys_spu_run);
161cond_syscall(sys_spu_create); 162cond_syscall(sys_spu_create);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc60f3f8..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
106#endif 106#endif
107extern int pid_max; 107extern int pid_max;
108extern int pid_max_min, pid_max_max; 108extern int pid_max_min, pid_max_max;
109extern int sysctl_drop_caches;
110extern int percpu_pagelist_fraction; 109extern int percpu_pagelist_fraction;
111extern int compat_log; 110extern int compat_log;
112extern int latencytop_enabled; 111extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
1430 .extra2 = &one, 1429 .extra2 = &one,
1431 }, 1430 },
1432#endif 1431#endif
1432 {
1433 .procname = "user_reserve_kbytes",
1434 .data = &sysctl_user_reserve_kbytes,
1435 .maxlen = sizeof(sysctl_user_reserve_kbytes),
1436 .mode = 0644,
1437 .proc_handler = proc_doulongvec_minmax,
1438 },
1439 {
1440 .procname = "admin_reserve_kbytes",
1441 .data = &sysctl_admin_reserve_kbytes,
1442 .maxlen = sizeof(sysctl_admin_reserve_kbytes),
1443 .mode = 0644,
1444 .proc_handler = proc_doulongvec_minmax,
1445 },
1433 { } 1446 { }
1434}; 1447};
1435 1448
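The two new vm_table entries expose sysctl_user_reserve_kbytes and sysctl_admin_reserve_kbytes through proc_doulongvec_minmax, which should surface them as 0644 files under /proc/sys/vm/ (the path is inferred from the table name; the backing variables live in mm/ and are not part of this hunk). A trivial reader under that assumption:

#include <stdio.h>

int main(void)
{
    const char *paths[] = {
        "/proc/sys/vm/user_reserve_kbytes",     /* assumed location */
        "/proc/sys/vm/admin_reserve_kbytes",
    };
    char buf[64];

    for (int i = 0; i < 2; i++) {
        FILE *f = fopen(paths[i], "r");

        if (!f) {
            perror(paths[i]);
            continue;
        }
        if (fgets(buf, sizeof(buf), f))
            printf("%s = %s", paths[i], buf);
        fclose(f);
    }
    return 0;
}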
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index f8b11a283171..12d6ebbfdd83 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -365,7 +365,7 @@ int init_test_probes(void)
365 target2 = kprobe_target2; 365 target2 = kprobe_target2;
366 366
367 do { 367 do {
368 rand1 = random32(); 368 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 369 } while (rand1 <= div_factor);
370 370
371 printk(KERN_INFO "Kprobe smoke test started\n"); 371 printk(KERN_INFO "Kprobe smoke test started\n");
diff --git a/kernel/time.c b/kernel/time.c
index f8342a41efa6..d3617dbd3dca 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -138,13 +138,14 @@ int persistent_clock_is_local;
138 */ 138 */
139static inline void warp_clock(void) 139static inline void warp_clock(void)
140{ 140{
141 struct timespec adjust; 141 if (sys_tz.tz_minuteswest != 0) {
142 struct timespec adjust;
142 143
143 adjust = current_kernel_time();
144 if (sys_tz.tz_minuteswest != 0)
145 persistent_clock_is_local = 1; 144 persistent_clock_is_local = 1;
146 adjust.tv_sec += sys_tz.tz_minuteswest * 60; 145 adjust.tv_sec = sys_tz.tz_minuteswest * 60;
147 do_settimeofday(&adjust); 146 adjust.tv_nsec = 0;
147 timekeeping_inject_offset(&adjust);
148 }
148} 149}
149 150
150/* 151/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd7..e4c07b0692bb 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE
64if GENERIC_CLOCKEVENTS 64if GENERIC_CLOCKEVENTS
65menu "Timers subsystem" 65menu "Timers subsystem"
66 66
67# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is 67# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
68# only related to the tick functionality. Oneshot clockevent devices 68# only related to the tick functionality. Oneshot clockevent devices
69# are supported independ of this. 69# are supported independ of this.
70config TICK_ONESHOT 70config TICK_ONESHOT
71 bool 71 bool
72 72
73config NO_HZ 73config NO_HZ_COMMON
74 bool "Tickless System (Dynamic Ticks)" 74 bool
75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
76 select TICK_ONESHOT 76 select TICK_ONESHOT
77
78choice
79 prompt "Timer tick handling"
80 default NO_HZ_IDLE if NO_HZ
81
82config HZ_PERIODIC
83 bool "Periodic timer ticks (constant rate, no dynticks)"
84 help
85 This option keeps the tick running periodically at a constant
86 rate, even when the CPU doesn't need it.
87
88config NO_HZ_IDLE
89 bool "Idle dynticks system (tickless idle)"
90 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
91 select NO_HZ_COMMON
92 help
93 This option enables a tickless idle system: timer interrupts
94 will only trigger on an as-needed basis when the system is idle.
95 This is usually interesting for energy saving.
96
97 Most of the time you want to say Y here.
98
99config NO_HZ_FULL
100 bool "Full dynticks system (tickless)"
101 # NO_HZ_COMMON dependency
102 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
103 # We need at least one periodic CPU for timekeeping
104 depends on SMP
105 # RCU_USER_QS dependency
106 depends on HAVE_CONTEXT_TRACKING
107 # VIRT_CPU_ACCOUNTING_GEN dependency
108 depends on 64BIT
109 select NO_HZ_COMMON
110 select RCU_USER_QS
111 select RCU_NOCB_CPU
112 select VIRT_CPU_ACCOUNTING_GEN
113 select CONTEXT_TRACKING_FORCE
114 select IRQ_WORK
115 help
116 Adaptively try to shutdown the tick whenever possible, even when
117 the CPU is running tasks. Typically this requires running a single
118 task on the CPU. Chances for running tickless are maximized when
119 the task mostly runs in userspace and has few kernel activity.
120
121 You need to fill up the nohz_full boot parameter with the
122 desired range of dynticks CPUs.
123
124 This is implemented at the expense of some overhead in user <-> kernel
125 transitions: syscalls, exceptions and interrupts. Even when it's
126 dynamically off.
127
128 Say N.
129
130endchoice
131
132config NO_HZ_FULL_ALL
133 bool "Full dynticks system on all CPUs by default"
134 depends on NO_HZ_FULL
135 help
136 If the user doesn't pass the nohz_full boot option to
137 define the range of full dynticks CPUs, consider that all
138 CPUs in the system are full dynticks by default.
139 Note the boot CPU will still be kept outside the range to
140 handle the timekeeping duty.
141
142config NO_HZ
143 bool "Old Idle dynticks config"
144 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
77 help 145 help
78 This option enables a tickless system: timer interrupts will 146 This is the old config entry that enables dynticks idle.
79 only trigger on an as-needed basis both when the system is 147 We keep it around for a little while to enforce backward
80 busy and when the system is idle. 148 compatibility with older config files.
81 149
82config HIGH_RES_TIMERS 150config HIGH_RES_TIMERS
83 bool "High Resolution Timer Support" 151 bool "High Resolution Timer Support"
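Taken together, the new choice block means a configuration selects exactly one of HZ_PERIODIC, NO_HZ_IDLE or NO_HZ_FULL, while the surviving NO_HZ entry only maps old config files onto the idle-dynticks behaviour. For a full-dynticks kernel the CPUs still have to be named at boot: for example, passing nohz_full=1-7 keeps CPU 0 as the timekeeping/housekeeping CPU, or CONFIG_NO_HZ_FULL_ALL=y covers every CPU except the boot CPU by default.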
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 072bb066bb7d..12ff13a838c6 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,13 +18,14 @@
18#include <linux/rtc.h> 18#include <linux/rtc.h>
19 19
20#include "tick-internal.h" 20#include "tick-internal.h"
21#include "ntp_internal.h"
21 22
22/* 23/*
23 * NTP timekeeping variables: 24 * NTP timekeeping variables:
25 *
26 * Note: All of the NTP state is protected by the timekeeping locks.
24 */ 27 */
25 28
26DEFINE_RAW_SPINLOCK(ntp_lock);
27
28 29
29/* USER_HZ period (usecs): */ 30/* USER_HZ period (usecs): */
30unsigned long tick_usec = TICK_USEC; 31unsigned long tick_usec = TICK_USEC;
@@ -53,9 +54,6 @@ static int time_state = TIME_OK;
53/* clock status bits: */ 54/* clock status bits: */
54static int time_status = STA_UNSYNC; 55static int time_status = STA_UNSYNC;
55 56
56/* TAI offset (secs): */
57static long time_tai;
58
59/* time adjustment (nsecs): */ 57/* time adjustment (nsecs): */
60static s64 time_offset; 58static s64 time_offset;
61 59
@@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void)
134 132
135/** 133/**
136 * pps_clear - Clears the PPS state variables 134 * pps_clear - Clears the PPS state variables
137 *
138 * Must be called while holding a write on the ntp_lock
139 */ 135 */
140static inline void pps_clear(void) 136static inline void pps_clear(void)
141{ 137{
@@ -150,8 +146,6 @@ static inline void pps_clear(void)
150/* Decrease pps_valid to indicate that another second has passed since 146/* Decrease pps_valid to indicate that another second has passed since
151 * the last PPS signal. When it reaches 0, indicate that PPS signal is 147 * the last PPS signal. When it reaches 0, indicate that PPS signal is
152 * missing. 148 * missing.
153 *
154 * Must be called while holding a write on the ntp_lock
155 */ 149 */
156static inline void pps_dec_valid(void) 150static inline void pps_dec_valid(void)
157{ 151{
@@ -346,10 +340,6 @@ static void ntp_update_offset(long offset)
346 */ 340 */
347void ntp_clear(void) 341void ntp_clear(void)
348{ 342{
349 unsigned long flags;
350
351 raw_spin_lock_irqsave(&ntp_lock, flags);
352
353 time_adjust = 0; /* stop active adjtime() */ 343 time_adjust = 0; /* stop active adjtime() */
354 time_status |= STA_UNSYNC; 344 time_status |= STA_UNSYNC;
355 time_maxerror = NTP_PHASE_LIMIT; 345 time_maxerror = NTP_PHASE_LIMIT;
@@ -362,20 +352,12 @@ void ntp_clear(void)
362 352
363 /* Clear PPS state variables */ 353 /* Clear PPS state variables */
364 pps_clear(); 354 pps_clear();
365 raw_spin_unlock_irqrestore(&ntp_lock, flags);
366
367} 355}
368 356
369 357
370u64 ntp_tick_length(void) 358u64 ntp_tick_length(void)
371{ 359{
372 unsigned long flags; 360 return tick_length;
373 s64 ret;
374
375 raw_spin_lock_irqsave(&ntp_lock, flags);
376 ret = tick_length;
377 raw_spin_unlock_irqrestore(&ntp_lock, flags);
378 return ret;
379} 361}
380 362
381 363
@@ -393,9 +375,6 @@ int second_overflow(unsigned long secs)
393{ 375{
394 s64 delta; 376 s64 delta;
395 int leap = 0; 377 int leap = 0;
396 unsigned long flags;
397
398 raw_spin_lock_irqsave(&ntp_lock, flags);
399 378
400 /* 379 /*
401 * Leap second processing. If in leap-insert state at the end of the 380 * Leap second processing. If in leap-insert state at the end of the
@@ -415,7 +394,6 @@ int second_overflow(unsigned long secs)
415 else if (secs % 86400 == 0) { 394 else if (secs % 86400 == 0) {
416 leap = -1; 395 leap = -1;
417 time_state = TIME_OOP; 396 time_state = TIME_OOP;
418 time_tai++;
419 printk(KERN_NOTICE 397 printk(KERN_NOTICE
420 "Clock: inserting leap second 23:59:60 UTC\n"); 398 "Clock: inserting leap second 23:59:60 UTC\n");
421 } 399 }
@@ -425,7 +403,6 @@ int second_overflow(unsigned long secs)
425 time_state = TIME_OK; 403 time_state = TIME_OK;
426 else if ((secs + 1) % 86400 == 0) { 404 else if ((secs + 1) % 86400 == 0) {
427 leap = 1; 405 leap = 1;
428 time_tai--;
429 time_state = TIME_WAIT; 406 time_state = TIME_WAIT;
430 printk(KERN_NOTICE 407 printk(KERN_NOTICE
431 "Clock: deleting leap second 23:59:59 UTC\n"); 408 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -479,8 +456,6 @@ int second_overflow(unsigned long secs)
479 time_adjust = 0; 456 time_adjust = 0;
480 457
481out: 458out:
482 raw_spin_unlock_irqrestore(&ntp_lock, flags);
483
484 return leap; 459 return leap;
485} 460}
486 461
@@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
575 time_status |= txc->status & ~STA_RONLY; 550 time_status |= txc->status & ~STA_RONLY;
576} 551}
577 552
578/* 553
579 * Called with ntp_lock held, so we can access and modify 554static inline void process_adjtimex_modes(struct timex *txc,
580 * all the global NTP state: 555 struct timespec *ts,
581 */ 556 s32 *time_tai)
582static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
583{ 557{
584 if (txc->modes & ADJ_STATUS) 558 if (txc->modes & ADJ_STATUS)
585 process_adj_status(txc, ts); 559 process_adj_status(txc, ts);
@@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
613 } 587 }
614 588
615 if (txc->modes & ADJ_TAI && txc->constant > 0) 589 if (txc->modes & ADJ_TAI && txc->constant > 0)
616 time_tai = txc->constant; 590 *time_tai = txc->constant;
617 591
618 if (txc->modes & ADJ_OFFSET) 592 if (txc->modes & ADJ_OFFSET)
619 ntp_update_offset(txc->offset); 593 ntp_update_offset(txc->offset);
@@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
625 ntp_update_frequency(); 599 ntp_update_frequency();
626} 600}
627 601
628/* 602
629 * adjtimex mainly allows reading (and writing, if superuser) of 603
630 * kernel time-keeping variables. used by xntpd. 604/**
605 * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
631 */ 606 */
632int do_adjtimex(struct timex *txc) 607int ntp_validate_timex(struct timex *txc)
633{ 608{
634 struct timespec ts;
635 int result;
636
637 /* Validate the data before disabling interrupts */
638 if (txc->modes & ADJ_ADJTIME) { 609 if (txc->modes & ADJ_ADJTIME) {
639 /* singleshot must not be used with any other mode bits */ 610 /* singleshot must not be used with any other mode bits */
640 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) 611 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
@@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc)
646 /* In order to modify anything, you gotta be super-user! */ 617 /* In order to modify anything, you gotta be super-user! */
647 if (txc->modes && !capable(CAP_SYS_TIME)) 618 if (txc->modes && !capable(CAP_SYS_TIME))
648 return -EPERM; 619 return -EPERM;
649
650 /* 620 /*
651 * if the quartz is off by more than 10% then 621 * if the quartz is off by more than 10% then
652 * something is VERY wrong! 622 * something is VERY wrong!
@@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc)
657 return -EINVAL; 627 return -EINVAL;
658 } 628 }
659 629
660 if (txc->modes & ADJ_SETOFFSET) { 630 if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
661 struct timespec delta; 631 return -EPERM;
662 delta.tv_sec = txc->time.tv_sec;
663 delta.tv_nsec = txc->time.tv_usec;
664 if (!capable(CAP_SYS_TIME))
665 return -EPERM;
666 if (!(txc->modes & ADJ_NANO))
667 delta.tv_nsec *= 1000;
668 result = timekeeping_inject_offset(&delta);
669 if (result)
670 return result;
671 }
672 632
673 getnstimeofday(&ts); 633 return 0;
634}
674 635
675 raw_spin_lock_irq(&ntp_lock); 636
637/*
638 * adjtimex mainly allows reading (and writing, if superuser) of
639 * kernel time-keeping variables. used by xntpd.
640 */
641int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
642{
643 int result;
676 644
677 if (txc->modes & ADJ_ADJTIME) { 645 if (txc->modes & ADJ_ADJTIME) {
678 long save_adjust = time_adjust; 646 long save_adjust = time_adjust;
@@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc)
687 655
688 /* If there are input parameters, then process them: */ 656 /* If there are input parameters, then process them: */
689 if (txc->modes) 657 if (txc->modes)
690 process_adjtimex_modes(txc, &ts); 658 process_adjtimex_modes(txc, ts, time_tai);
691 659
692 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, 660 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
693 NTP_SCALE_SHIFT); 661 NTP_SCALE_SHIFT);
@@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc)
709 txc->precision = 1; 677 txc->precision = 1;
710 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; 678 txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
711 txc->tick = tick_usec; 679 txc->tick = tick_usec;
712 txc->tai = time_tai; 680 txc->tai = *time_tai;
713 681
714 /* fill PPS status fields */ 682 /* fill PPS status fields */
715 pps_fill_timex(txc); 683 pps_fill_timex(txc);
716 684
717 raw_spin_unlock_irq(&ntp_lock); 685 txc->time.tv_sec = ts->tv_sec;
718 686 txc->time.tv_usec = ts->tv_nsec;
719 txc->time.tv_sec = ts.tv_sec;
720 txc->time.tv_usec = ts.tv_nsec;
721 if (!(time_status & STA_NANO)) 687 if (!(time_status & STA_NANO))
722 txc->time.tv_usec /= NSEC_PER_USEC; 688 txc->time.tv_usec /= NSEC_PER_USEC;
723 689
@@ -894,7 +860,7 @@ static void hardpps_update_phase(long error)
894} 860}
895 861
896/* 862/*
897 * hardpps() - discipline CPU clock oscillator to external PPS signal 863 * __hardpps() - discipline CPU clock oscillator to external PPS signal
898 * 864 *
899 * This routine is called at each PPS signal arrival in order to 865 * This routine is called at each PPS signal arrival in order to
900 * discipline the CPU clock oscillator to the PPS signal. It takes two 866 * discipline the CPU clock oscillator to the PPS signal. It takes two
@@ -905,15 +871,13 @@ static void hardpps_update_phase(long error)
905 * This code is based on David Mills's reference nanokernel 871 * This code is based on David Mills's reference nanokernel
906 * implementation. It was mostly rewritten but keeps the same idea. 872 * implementation. It was mostly rewritten but keeps the same idea.
907 */ 873 */
908void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) 874void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
909{ 875{
910 struct pps_normtime pts_norm, freq_norm; 876 struct pps_normtime pts_norm, freq_norm;
911 unsigned long flags; 877 unsigned long flags;
912 878
913 pts_norm = pps_normalize_ts(*phase_ts); 879 pts_norm = pps_normalize_ts(*phase_ts);
914 880
915 raw_spin_lock_irqsave(&ntp_lock, flags);
916
917 /* clear the error bits, they will be set again if needed */ 881 /* clear the error bits, they will be set again if needed */
918 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 882 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
919 883
@@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
925 * just start the frequency interval */ 889 * just start the frequency interval */
926 if (unlikely(pps_fbase.tv_sec == 0)) { 890 if (unlikely(pps_fbase.tv_sec == 0)) {
927 pps_fbase = *raw_ts; 891 pps_fbase = *raw_ts;
928 raw_spin_unlock_irqrestore(&ntp_lock, flags);
929 return; 892 return;
930 } 893 }
931 894
@@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
940 time_status |= STA_PPSJITTER; 903 time_status |= STA_PPSJITTER;
941 /* restart the frequency calibration interval */ 904 /* restart the frequency calibration interval */
942 pps_fbase = *raw_ts; 905 pps_fbase = *raw_ts;
943 raw_spin_unlock_irqrestore(&ntp_lock, flags);
944 pr_err("hardpps: PPSJITTER: bad pulse\n"); 906 pr_err("hardpps: PPSJITTER: bad pulse\n");
945 return; 907 return;
946 } 908 }
@@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
957 919
958 hardpps_update_phase(pts_norm.nsec); 920 hardpps_update_phase(pts_norm.nsec);
959 921
960 raw_spin_unlock_irqrestore(&ntp_lock, flags);
961} 922}
962EXPORT_SYMBOL(hardpps);
963
964#endif /* CONFIG_NTP_PPS */ 923#endif /* CONFIG_NTP_PPS */
965 924
966static int __init ntp_tick_adj_setup(char *str) 925static int __init ntp_tick_adj_setup(char *str)
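In the do_adjtimex() hunks above, the ADJ_SETOFFSET handling (converting txc->time into a timespec delta and injecting it) is reduced to a plain capability check; the conversion and the timekeeping_inject_offset() call are expected to move to the timekeeping-side caller, which is not shown in this section. The user-space interface is unchanged. A minimal, runnable sketch of that path as seen through adjtimex(2); the delta is only illustrative, the fallback defines cover older libc headers, and CAP_SYS_TIME is required:

/* step_clock.c - step the system clock by a small delta via adjtimex().
 * Requires CAP_SYS_TIME; the offset used here is purely illustrative. */
#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

#ifndef ADJ_SETOFFSET
# define ADJ_SETOFFSET 0x0100
#endif
#ifndef ADJ_NANO
# define ADJ_NANO 0x2000
#endif

int main(void)
{
	struct timex txc;

	memset(&txc, 0, sizeof(txc));
	txc.modes = ADJ_SETOFFSET | ADJ_NANO;	/* time.tv_usec carries nanoseconds */
	txc.time.tv_sec = 0;
	txc.time.tv_usec = 500000;		/* step forward by 500 microseconds */

	if (adjtimex(&txc) == -1) {
		perror("adjtimex(ADJ_SETOFFSET)");
		return 1;
	}
	printf("clock stepped, kernel reports offset=%ld\n", (long)txc.offset);
	return 0;
}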
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 000000000000..1950cb4ca2a4
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
1#ifndef _LINUX_NTP_INTERNAL_H
2#define _LINUX_NTP_INTERNAL_H
3
4extern void ntp_init(void);
5extern void ntp_clear(void);
6/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
7extern u64 ntp_tick_length(void);
8extern int second_overflow(unsigned long secs);
9extern int ntp_validate_timex(struct timex *);
10extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
11extern void __hardpps(const struct timespec *, const struct timespec *);
12#endif /* _LINUX_NTP_INTERNAL_H */
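ntp_internal.h is the glue for this refactor: with ntp_lock gone, the lock-free workers __do_adjtimex() and __hardpps() plus ntp_validate_timex() are exported so the timekeeping core can serialize NTP state with its own locking. The wrappers themselves live in timekeeping.c and are outside this section; a hedged sketch of the shape such a wrapper takes, using the timekeeper_lock/timekeeper_seq pair introduced in the timekeeping.c hunks further down (the exact body is an assumption):

/* Sketch only: timekeeping-side wrapper around the relocked PPS helper.
 * The real wrapper may also refresh derived timekeeper state. */
void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&timekeeper_lock, flags);
	write_seqcount_begin(&timekeeper_seq);

	__hardpps(phase_ts, raw_ts);	/* NTP state now serialized here */

	write_seqcount_end(&timekeeper_seq);
	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}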
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb88df8d..206bbfb34e09 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,9 +28,8 @@
28 */ 28 */
29 29
30static struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31static cpumask_var_t tick_broadcast_mask;
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static cpumask_var_t tmpmask;
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); 33static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 34static int tick_broadcast_force;
36 35
@@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void)
50 49
51struct cpumask *tick_get_broadcast_mask(void) 50struct cpumask *tick_get_broadcast_mask(void)
52{ 51{
53 return to_cpumask(tick_broadcast_mask); 52 return tick_broadcast_mask;
54} 53}
55 54
56/* 55/*
@@ -67,15 +66,30 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
67 */ 66 */
68int tick_check_broadcast_device(struct clock_event_device *dev) 67int tick_check_broadcast_device(struct clock_event_device *dev)
69{ 68{
70 if ((tick_broadcast_device.evtdev && 69 struct clock_event_device *cur = tick_broadcast_device.evtdev;
70
71 if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
72 (tick_broadcast_device.evtdev &&
71 tick_broadcast_device.evtdev->rating >= dev->rating) || 73 tick_broadcast_device.evtdev->rating >= dev->rating) ||
72 (dev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (dev->features & CLOCK_EVT_FEAT_C3STOP))
73 return 0; 75 return 0;
74 76
75 clockevents_exchange_device(tick_broadcast_device.evtdev, dev); 77 clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
78 if (cur)
79 cur->event_handler = clockevents_handle_noop;
76 tick_broadcast_device.evtdev = dev; 80 tick_broadcast_device.evtdev = dev;
77 if (!cpumask_empty(tick_get_broadcast_mask())) 81 if (!cpumask_empty(tick_broadcast_mask))
78 tick_broadcast_start_periodic(dev); 82 tick_broadcast_start_periodic(dev);
83 /*
84 * Inform all cpus about this. We might be in a situation
85 * where we did not switch to oneshot mode because the per cpu
86 * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
87 * of a oneshot capable broadcast device. Without that
 88 * notification the system stays stuck in periodic mode
89 * forever.
90 */
91 if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
92 tick_clock_notify();
79 return 1; 93 return 1;
80} 94}
81 95
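Pointing the displaced device's event_handler at clockevents_handle_noop() (here, and again in the tick-common.c shutdown hunk below) makes a late interrupt from the old broadcast device harmless instead of re-entering a stale handler. The helper is defined in the clockevents core rather than in this hunk; it is essentially an empty callback along these lines (a sketch, not the verbatim definition):

/* A handler that deliberately does nothing: late events from a replaced
 * or shut-down clock_event_device are simply ignored. */
void clockevents_handle_noop(struct clock_event_device *dev)
{
}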
@@ -123,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
123 if (!tick_device_is_functional(dev)) { 137 if (!tick_device_is_functional(dev)) {
124 dev->event_handler = tick_handle_periodic; 138 dev->event_handler = tick_handle_periodic;
125 tick_device_setup_broadcast_func(dev); 139 tick_device_setup_broadcast_func(dev);
126 cpumask_set_cpu(cpu, tick_get_broadcast_mask()); 140 cpumask_set_cpu(cpu, tick_broadcast_mask);
127 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 141 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
128 ret = 1; 142 ret = 1;
129 } else { 143 } else {
@@ -134,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
134 */ 148 */
135 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 149 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
136 int cpu = smp_processor_id(); 150 int cpu = smp_processor_id();
137 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 151 cpumask_clear_cpu(cpu, tick_broadcast_mask);
138 tick_broadcast_clear_oneshot(cpu); 152 tick_broadcast_clear_oneshot(cpu);
139 } else { 153 } else {
140 tick_device_setup_broadcast_func(dev); 154 tick_device_setup_broadcast_func(dev);
@@ -198,9 +212,8 @@ static void tick_do_periodic_broadcast(void)
198{ 212{
199 raw_spin_lock(&tick_broadcast_lock); 213 raw_spin_lock(&tick_broadcast_lock);
200 214
201 cpumask_and(to_cpumask(tmpmask), 215 cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
202 cpu_online_mask, tick_get_broadcast_mask()); 216 tick_do_broadcast(tmpmask);
203 tick_do_broadcast(to_cpumask(tmpmask));
204 217
205 raw_spin_unlock(&tick_broadcast_lock); 218 raw_spin_unlock(&tick_broadcast_lock);
206} 219}
@@ -263,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 if (!tick_device_is_functional(dev)) 276 if (!tick_device_is_functional(dev))
264 goto out; 277 goto out;
265 278
266 bc_stopped = cpumask_empty(tick_get_broadcast_mask()); 279 bc_stopped = cpumask_empty(tick_broadcast_mask);
267 280
268 switch (*reason) { 281 switch (*reason) {
269 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 282 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
270 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 283 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
271 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 284 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
272 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
273 if (tick_broadcast_device.mode == 285 if (tick_broadcast_device.mode ==
274 TICKDEV_MODE_PERIODIC) 286 TICKDEV_MODE_PERIODIC)
275 clockevents_shutdown(dev); 287 clockevents_shutdown(dev);
@@ -279,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
279 break; 291 break;
280 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 292 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
281 if (!tick_broadcast_force && 293 if (!tick_broadcast_force &&
282 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) { 294 cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
283 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
284 if (tick_broadcast_device.mode == 295 if (tick_broadcast_device.mode ==
285 TICKDEV_MODE_PERIODIC) 296 TICKDEV_MODE_PERIODIC)
286 tick_setup_periodic(dev, 0); 297 tick_setup_periodic(dev, 0);
@@ -288,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
288 break; 299 break;
289 } 300 }
290 301
291 if (cpumask_empty(tick_get_broadcast_mask())) { 302 if (cpumask_empty(tick_broadcast_mask)) {
292 if (!bc_stopped) 303 if (!bc_stopped)
293 clockevents_shutdown(bc); 304 clockevents_shutdown(bc);
294 } else if (bc_stopped) { 305 } else if (bc_stopped) {
@@ -337,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
337 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 348 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
338 349
339 bc = tick_broadcast_device.evtdev; 350 bc = tick_broadcast_device.evtdev;
340 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 351 cpumask_clear_cpu(cpu, tick_broadcast_mask);
341 352
342 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 353 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
343 if (bc && cpumask_empty(tick_get_broadcast_mask())) 354 if (bc && cpumask_empty(tick_broadcast_mask))
344 clockevents_shutdown(bc); 355 clockevents_shutdown(bc);
345 } 356 }
346 357
@@ -376,13 +387,13 @@ int tick_resume_broadcast(void)
376 387
377 switch (tick_broadcast_device.mode) { 388 switch (tick_broadcast_device.mode) {
378 case TICKDEV_MODE_PERIODIC: 389 case TICKDEV_MODE_PERIODIC:
379 if (!cpumask_empty(tick_get_broadcast_mask())) 390 if (!cpumask_empty(tick_broadcast_mask))
380 tick_broadcast_start_periodic(bc); 391 tick_broadcast_start_periodic(bc);
381 broadcast = cpumask_test_cpu(smp_processor_id(), 392 broadcast = cpumask_test_cpu(smp_processor_id(),
382 tick_get_broadcast_mask()); 393 tick_broadcast_mask);
383 break; 394 break;
384 case TICKDEV_MODE_ONESHOT: 395 case TICKDEV_MODE_ONESHOT:
385 if (!cpumask_empty(tick_get_broadcast_mask())) 396 if (!cpumask_empty(tick_broadcast_mask))
386 broadcast = tick_resume_broadcast_oneshot(bc); 397 broadcast = tick_resume_broadcast_oneshot(bc);
387 break; 398 break;
388 } 399 }
@@ -395,25 +406,58 @@ int tick_resume_broadcast(void)
395 406
396#ifdef CONFIG_TICK_ONESHOT 407#ifdef CONFIG_TICK_ONESHOT
397 408
398/* FIXME: use cpumask_var_t. */ 409static cpumask_var_t tick_broadcast_oneshot_mask;
399static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS); 410static cpumask_var_t tick_broadcast_pending_mask;
411static cpumask_var_t tick_broadcast_force_mask;
400 412
401/* 413/*
402 * Exposed for debugging: see timer_list.c 414 * Exposed for debugging: see timer_list.c
403 */ 415 */
404struct cpumask *tick_get_broadcast_oneshot_mask(void) 416struct cpumask *tick_get_broadcast_oneshot_mask(void)
405{ 417{
406 return to_cpumask(tick_broadcast_oneshot_mask); 418 return tick_broadcast_oneshot_mask;
407} 419}
408 420
409static int tick_broadcast_set_event(ktime_t expires, int force) 421/*
422 * Called before going idle with interrupts disabled. Checks whether a
423 * broadcast event from the other core is about to happen. We detected
424 * that in tick_broadcast_oneshot_control(). The callsite can use this
425 * to avoid a deep idle transition as we are about to get the
426 * broadcast IPI right away.
427 */
428int tick_check_broadcast_expired(void)
410{ 429{
411 struct clock_event_device *bc = tick_broadcast_device.evtdev; 430 return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
431}
432
433/*
434 * Set broadcast interrupt affinity
435 */
436static void tick_broadcast_set_affinity(struct clock_event_device *bc,
437 const struct cpumask *cpumask)
438{
439 if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
440 return;
441
442 if (cpumask_equal(bc->cpumask, cpumask))
443 return;
444
445 bc->cpumask = cpumask;
446 irq_set_affinity(bc->irq, bc->cpumask);
447}
448
449static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
450 ktime_t expires, int force)
451{
452 int ret;
412 453
413 if (bc->mode != CLOCK_EVT_MODE_ONESHOT) 454 if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
414 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 455 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
415 456
416 return clockevents_program_event(bc, expires, force); 457 ret = clockevents_program_event(bc, expires, force);
458 if (!ret)
459 tick_broadcast_set_affinity(bc, cpumask_of(cpu));
460 return ret;
417} 461}
418 462
419int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 463int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
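tick_broadcast_set_affinity() only retargets the broadcast interrupt when the device advertises CLOCK_EVT_FEAT_DYNIRQ and exposes its Linux IRQ number; after a successful program the interrupt then follows the CPU with the earliest pending event. A hedged sketch of the driver-side contract with a hypothetical device (the name, IRQ number and rating are invented; the callbacks are omitted):

/* Hypothetical clockevent device opting in to dynamic broadcast IRQ
 * affinity. Only .features and .irq matter to tick_broadcast_set_affinity(). */
static struct clock_event_device example_broadcast_clkevt = {
	.name		= "example-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_DYNIRQ,
	.irq		= 42,		/* hypothetical interrupt number */
	.rating		= 300,
	/* .set_mode and .set_next_event are omitted in this sketch */
};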
@@ -428,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
428 */ 472 */
429void tick_check_oneshot_broadcast(int cpu) 473void tick_check_oneshot_broadcast(int cpu)
430{ 474{
431 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { 475 if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
432 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 476 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
433 477
434 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 478 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -442,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
442{ 486{
443 struct tick_device *td; 487 struct tick_device *td;
444 ktime_t now, next_event; 488 ktime_t now, next_event;
445 int cpu; 489 int cpu, next_cpu = 0;
446 490
447 raw_spin_lock(&tick_broadcast_lock); 491 raw_spin_lock(&tick_broadcast_lock);
448again: 492again:
449 dev->next_event.tv64 = KTIME_MAX; 493 dev->next_event.tv64 = KTIME_MAX;
450 next_event.tv64 = KTIME_MAX; 494 next_event.tv64 = KTIME_MAX;
451 cpumask_clear(to_cpumask(tmpmask)); 495 cpumask_clear(tmpmask);
452 now = ktime_get(); 496 now = ktime_get();
453 /* Find all expired events */ 497 /* Find all expired events */
454 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { 498 for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
455 td = &per_cpu(tick_cpu_device, cpu); 499 td = &per_cpu(tick_cpu_device, cpu);
456 if (td->evtdev->next_event.tv64 <= now.tv64) 500 if (td->evtdev->next_event.tv64 <= now.tv64) {
457 cpumask_set_cpu(cpu, to_cpumask(tmpmask)); 501 cpumask_set_cpu(cpu, tmpmask);
458 else if (td->evtdev->next_event.tv64 < next_event.tv64) 502 /*
503 * Mark the remote cpu in the pending mask, so
504 * it can avoid reprogramming the cpu local
505 * timer in tick_broadcast_oneshot_control().
506 */
507 cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
508 } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
459 next_event.tv64 = td->evtdev->next_event.tv64; 509 next_event.tv64 = td->evtdev->next_event.tv64;
510 next_cpu = cpu;
511 }
460 } 512 }
461 513
514 /* Take care of enforced broadcast requests */
515 cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
516 cpumask_clear(tick_broadcast_force_mask);
517
462 /* 518 /*
463 * Wakeup the cpus which have an expired event. 519 * Wakeup the cpus which have an expired event.
464 */ 520 */
465 tick_do_broadcast(to_cpumask(tmpmask)); 521 tick_do_broadcast(tmpmask);
466 522
467 /* 523 /*
468 * Two reasons for reprogram: 524 * Two reasons for reprogram:
@@ -479,7 +535,7 @@ again:
479 * Rearm the broadcast device. If event expired, 535 * Rearm the broadcast device. If event expired,
480 * repeat the above 536 * repeat the above
481 */ 537 */
482 if (tick_broadcast_set_event(next_event, 0)) 538 if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
483 goto again; 539 goto again;
484 } 540 }
485 raw_spin_unlock(&tick_broadcast_lock); 541 raw_spin_unlock(&tick_broadcast_lock);
@@ -494,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
494 struct clock_event_device *bc, *dev; 550 struct clock_event_device *bc, *dev;
495 struct tick_device *td; 551 struct tick_device *td;
496 unsigned long flags; 552 unsigned long flags;
553 ktime_t now;
497 int cpu; 554 int cpu;
498 555
499 /* 556 /*
@@ -518,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason)
518 575
519 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 576 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
520 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 577 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
521 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 578 WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
522 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); 579 if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
523 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 580 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
524 if (dev->next_event.tv64 < bc->next_event.tv64) 581 /*
525 tick_broadcast_set_event(dev->next_event, 1); 582 * We only reprogram the broadcast timer if we
583 * did not mark ourself in the force mask and
584 * if the cpu local event is earlier than the
585 * broadcast event. If the current CPU is in
586 * the force mask, then we are going to be
587 * woken by the IPI right away.
588 */
589 if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
590 dev->next_event.tv64 < bc->next_event.tv64)
591 tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
526 } 592 }
527 } else { 593 } else {
528 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { 594 if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
529 cpumask_clear_cpu(cpu,
530 tick_get_broadcast_oneshot_mask());
531 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 595 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
532 if (dev->next_event.tv64 != KTIME_MAX) 596 if (dev->next_event.tv64 == KTIME_MAX)
533 tick_program_event(dev->next_event, 1); 597 goto out;
598 /*
599 * The cpu which was handling the broadcast
600 * timer marked this cpu in the broadcast
601 * pending mask and fired the broadcast
602 * IPI. So we are going to handle the expired
603 * event anyway via the broadcast IPI
604 * handler. No need to reprogram the timer
605 * with an already expired event.
606 */
607 if (cpumask_test_and_clear_cpu(cpu,
608 tick_broadcast_pending_mask))
609 goto out;
610
611 /*
612 * If the pending bit is not set, then we are
613 * either the CPU handling the broadcast
614 * interrupt or we got woken by something else.
615 *
 616 * We are no longer in the broadcast mask, so
617 * if the cpu local expiry time is already
618 * reached, we would reprogram the cpu local
619 * timer with an already expired event.
620 *
621 * This can lead to a ping-pong when we return
 622 * to idle and therefore rearm the broadcast
623 * timer before the cpu local timer was able
624 * to fire. This happens because the forced
625 * reprogramming makes sure that the event
626 * will happen in the future and depending on
627 * the min_delta setting this might be far
628 * enough out that the ping-pong starts.
629 *
630 * If the cpu local next_event has expired
631 * then we know that the broadcast timer
632 * next_event has expired as well and
633 * broadcast is about to be handled. So we
634 * avoid reprogramming and enforce that the
635 * broadcast handler, which did not run yet,
636 * will invoke the cpu local handler.
637 *
638 * We cannot call the handler directly from
639 * here, because we might be in a NOHZ phase
640 * and we did not go through the irq_enter()
641 * nohz fixups.
642 */
643 now = ktime_get();
644 if (dev->next_event.tv64 <= now.tv64) {
645 cpumask_set_cpu(cpu, tick_broadcast_force_mask);
646 goto out;
647 }
648 /*
649 * We got woken by something else. Reprogram
650 * the cpu local timer device.
651 */
652 tick_program_event(dev->next_event, 1);
534 } 653 }
535 } 654 }
655out:
536 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 656 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
537} 657}
538 658
@@ -543,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
543 */ 663 */
544static void tick_broadcast_clear_oneshot(int cpu) 664static void tick_broadcast_clear_oneshot(int cpu)
545{ 665{
546 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 666 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
547} 667}
548 668
549static void tick_broadcast_init_next_event(struct cpumask *mask, 669static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -573,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
573 bc->event_handler = tick_handle_oneshot_broadcast; 693 bc->event_handler = tick_handle_oneshot_broadcast;
574 694
575 /* Take the do_timer update */ 695 /* Take the do_timer update */
576 tick_do_timer_cpu = cpu; 696 if (!tick_nohz_full_cpu(cpu))
697 tick_do_timer_cpu = cpu;
577 698
578 /* 699 /*
579 * We must be careful here. There might be other CPUs 700 * We must be careful here. There might be other CPUs
@@ -581,17 +702,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
581 * oneshot_mask bits for those and program the 702 * oneshot_mask bits for those and program the
582 * broadcast device to fire. 703 * broadcast device to fire.
583 */ 704 */
584 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); 705 cpumask_copy(tmpmask, tick_broadcast_mask);
585 cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); 706 cpumask_clear_cpu(cpu, tmpmask);
586 cpumask_or(tick_get_broadcast_oneshot_mask(), 707 cpumask_or(tick_broadcast_oneshot_mask,
587 tick_get_broadcast_oneshot_mask(), 708 tick_broadcast_oneshot_mask, tmpmask);
588 to_cpumask(tmpmask));
589 709
590 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { 710 if (was_periodic && !cpumask_empty(tmpmask)) {
591 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 711 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
592 tick_broadcast_init_next_event(to_cpumask(tmpmask), 712 tick_broadcast_init_next_event(tmpmask,
593 tick_next_period); 713 tick_next_period);
594 tick_broadcast_set_event(tick_next_period, 1); 714 tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
595 } else 715 } else
596 bc->next_event.tv64 = KTIME_MAX; 716 bc->next_event.tv64 = KTIME_MAX;
597 } else { 717 } else {
@@ -639,7 +759,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
639 * Clear the broadcast mask flag for the dead cpu, but do not 759 * Clear the broadcast mask flag for the dead cpu, but do not
640 * stop the broadcast device! 760 * stop the broadcast device!
641 */ 761 */
642 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 762 cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
643 763
644 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 764 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
645} 765}
@@ -663,3 +783,14 @@ bool tick_broadcast_oneshot_available(void)
663} 783}
664 784
665#endif 785#endif
786
787void __init tick_broadcast_init(void)
788{
789 alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
790 alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
791#ifdef CONFIG_TICK_ONESHOT
792 alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
793 alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
794 alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
795#endif
796}
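tick_check_broadcast_expired() gives the idle path a cheap way to notice that this CPU is already in tick_broadcast_force_mask, meaning the broadcast IPI is about to arrive and a deep C-state would only add exit latency. The real caller sits in the generic idle loop outside this section; a hedged sketch of the intended use (example_idle_once() is hypothetical):

/* Sketch: consult the force mask before committing to deep idle. */
static void example_idle_once(void)
{
	local_irq_disable();

	if (tick_check_broadcast_expired()) {
		/* wakeup IPI is imminent: do not pay the deep-idle entry cost */
		local_irq_enable();
		cpu_relax();
		return;
	}

	arch_cpu_idle();	/* arch low-power idle, returns with irqs enabled */
}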
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f4..5d3fb100bc06 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,
163 * this cpu: 163 * this cpu:
164 */ 164 */
165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { 165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
166 tick_do_timer_cpu = cpu; 166 if (!tick_nohz_full_cpu(cpu))
167 tick_do_timer_cpu = cpu;
168 else
169 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
167 tick_next_period = ktime_get(); 170 tick_next_period = ktime_get();
168 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 171 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
169 } 172 }
@@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup)
323 */ 326 */
324 dev->mode = CLOCK_EVT_MODE_UNUSED; 327 dev->mode = CLOCK_EVT_MODE_UNUSED;
325 clockevents_exchange_device(dev, NULL); 328 clockevents_exchange_device(dev, NULL);
329 dev->event_handler = clockevents_handle_noop;
326 td->evtdev = NULL; 330 td->evtdev = NULL;
327 } 331 }
328 raw_spin_unlock_irqrestore(&tick_device_lock, flags); 332 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
@@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = {
416void __init tick_init(void) 420void __init tick_init(void)
417{ 421{
418 clockevents_register_notifier(&tick_notifier); 422 clockevents_register_notifier(&tick_notifier);
423 tick_broadcast_init();
419} 424}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc0..f0299eae4602 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
4#include <linux/hrtimer.h> 4#include <linux/hrtimer.h>
5#include <linux/tick.h> 5#include <linux/tick.h>
6 6
7extern seqlock_t jiffies_lock;
8
7#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD 9#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
8 10
9#define TICK_DO_TIMER_NONE -1 11#define TICK_DO_TIMER_NONE -1
@@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
94extern void tick_shutdown_broadcast(unsigned int *cpup); 96extern void tick_shutdown_broadcast(unsigned int *cpup);
95extern void tick_suspend_broadcast(void); 97extern void tick_suspend_broadcast(void);
96extern int tick_resume_broadcast(void); 98extern int tick_resume_broadcast(void);
97 99extern void tick_broadcast_init(void);
98extern void 100extern void
99tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); 101tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
100 102
@@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
119static inline void tick_shutdown_broadcast(unsigned int *cpup) { } 121static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
120static inline void tick_suspend_broadcast(void) { } 122static inline void tick_suspend_broadcast(void) { }
121static inline int tick_resume_broadcast(void) { return 0; } 123static inline int tick_resume_broadcast(void) { return 0; }
124static inline void tick_broadcast_init(void) { }
122 125
123/* 126/*
124 * Set the periodic handler in non broadcast mode 127 * Set the periodic handler in non broadcast mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a19a39952c1b..bc67d4245e1d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,11 +21,15 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h>
25#include <linux/perf_event.h>
24 26
25#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
26 28
27#include "tick-internal.h" 29#include "tick-internal.h"
28 30
31#include <trace/events/timer.h>
32
29/* 33/*
30 * Per cpu nohz control structure 34 * Per cpu nohz control structure
31 */ 35 */
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)
104{ 108{
105 int cpu = smp_processor_id(); 109 int cpu = smp_processor_id();
106 110
107#ifdef CONFIG_NO_HZ 111#ifdef CONFIG_NO_HZ_COMMON
108 /* 112 /*
109 * Check if the do_timer duty was dropped. We don't care about 113 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went 114 * concurrency: This happens only when the cpu in charge went
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
112 * this duty, then the jiffies update is still serialized by 116 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock. 117 * jiffies_lock.
114 */ 118 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) 119 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
120 && !tick_nohz_full_cpu(cpu))
116 tick_do_timer_cpu = cpu; 121 tick_do_timer_cpu = cpu;
117#endif 122#endif
118 123
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)
123 128
124static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 129static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125{ 130{
126#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ_COMMON
127 /* 132 /*
128 * When we are idle and the tick is stopped, we have to touch 133 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long 134 * the watchdog as we might not schedule for a really long
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
142 profile_tick(CPU_PROFILING); 147 profile_tick(CPU_PROFILING);
143} 148}
144 149
150#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask;
152bool have_nohz_full_mask;
153
154static bool can_stop_full_tick(void)
155{
156 WARN_ON_ONCE(!irqs_disabled());
157
158 if (!sched_can_stop_tick()) {
159 trace_tick_stop(0, "more than 1 task in runqueue\n");
160 return false;
161 }
162
163 if (!posix_cpu_timers_can_stop_tick(current)) {
164 trace_tick_stop(0, "posix timers running\n");
165 return false;
166 }
167
168 if (!perf_event_can_stop_tick()) {
169 trace_tick_stop(0, "perf events running\n");
170 return false;
171 }
172
173 /* sched_clock_tick() needs us? */
174#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
175 /*
176 * TODO: kick full dynticks CPUs when
177 * sched_clock_stable is set.
178 */
179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n");
181 return false;
182 }
183#endif
184
185 return true;
186}
187
188static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
189
190/*
191 * Re-evaluate the need for the tick on the current CPU
192 * and restart it if necessary.
193 */
194void tick_nohz_full_check(void)
195{
196 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
197
198 if (tick_nohz_full_cpu(smp_processor_id())) {
199 if (ts->tick_stopped && !is_idle_task(current)) {
200 if (!can_stop_full_tick())
201 tick_nohz_restart_sched_tick(ts, ktime_get());
202 }
203 }
204}
205
206static void nohz_full_kick_work_func(struct irq_work *work)
207{
208 tick_nohz_full_check();
209}
210
211static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
212 .func = nohz_full_kick_work_func,
213};
214
215/*
216 * Kick the current CPU if it's full dynticks in order to force it to
217 * re-evaluate its dependency on the tick and restart it if necessary.
218 */
219void tick_nohz_full_kick(void)
220{
221 if (tick_nohz_full_cpu(smp_processor_id()))
222 irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
223}
224
225static void nohz_full_kick_ipi(void *info)
226{
227 tick_nohz_full_check();
228}
229
230/*
231 * Kick all full dynticks CPUs in order to force these to re-evaluate
232 * their dependency on the tick and restart it if necessary.
233 */
234void tick_nohz_full_kick_all(void)
235{
236 if (!have_nohz_full_mask)
237 return;
238
239 preempt_disable();
240 smp_call_function_many(nohz_full_mask,
241 nohz_full_kick_ipi, NULL, false);
242 preempt_enable();
243}
244
245/*
246 * Re-evaluate the need for the tick as we switch the current task.
247 * It might need the tick due to per task/process properties:
248 * perf events, posix cpu timers, ...
249 */
250void tick_nohz_task_switch(struct task_struct *tsk)
251{
252 unsigned long flags;
253
254 local_irq_save(flags);
255
256 if (!tick_nohz_full_cpu(smp_processor_id()))
257 goto out;
258
259 if (tick_nohz_tick_stopped() && !can_stop_full_tick())
260 tick_nohz_full_kick();
261
262out:
263 local_irq_restore(flags);
264}
265
266int tick_nohz_full_cpu(int cpu)
267{
268 if (!have_nohz_full_mask)
269 return 0;
270
271 return cpumask_test_cpu(cpu, nohz_full_mask);
272}
273
274/* Parse the boot-time nohz CPU list from the kernel parameters. */
275static int __init tick_nohz_full_setup(char *str)
276{
277 int cpu;
278
279 alloc_bootmem_cpumask_var(&nohz_full_mask);
280 if (cpulist_parse(str, nohz_full_mask) < 0) {
281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
282 return 1;
283 }
284
285 cpu = smp_processor_id();
286 if (cpumask_test_cpu(cpu, nohz_full_mask)) {
287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
288 cpumask_clear_cpu(cpu, nohz_full_mask);
289 }
290 have_nohz_full_mask = true;
291
292 return 1;
293}
294__setup("nohz_full=", tick_nohz_full_setup);
295
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action,
298 void *hcpu)
299{
300 unsigned int cpu = (unsigned long)hcpu;
301
302 switch (action & ~CPU_TASKS_FROZEN) {
303 case CPU_DOWN_PREPARE:
304 /*
305 * If we handle the timekeeping duty for full dynticks CPUs,
306 * we can't safely shutdown that CPU.
307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL;
310 break;
311 }
312 return NOTIFY_OK;
313}
314
315/*
316 * Worst case string length in chunks of CPU range seems 2 steps
317 * separations: 0,2,4,6,...
318 * This is NR_CPUS + sizeof('\0')
319 */
320static char __initdata nohz_full_buf[NR_CPUS + 1];
321
322static int tick_nohz_init_all(void)
323{
324 int err = -1;
325
326#ifdef CONFIG_NO_HZ_FULL_ALL
327 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
329 return err;
330 }
331 err = 0;
332 cpumask_setall(nohz_full_mask);
333 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
334 have_nohz_full_mask = true;
335#endif
336 return err;
337}
338
339void __init tick_nohz_init(void)
340{
341 int cpu;
342
343 if (!have_nohz_full_mask) {
344 if (tick_nohz_init_all() < 0)
345 return;
346 }
347
348 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPU are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361}
362#else
363#define have_nohz_full_mask (0)
364#endif
365
145/* 366/*
146 * NOHZ - aka dynamic tick functionality 367 * NOHZ - aka dynamic tick functionality
147 */ 368 */
148#ifdef CONFIG_NO_HZ 369#ifdef CONFIG_NO_HZ_COMMON
149/* 370/*
150 * NO HZ enabled ? 371 * NO HZ enabled ?
151 */ 372 */
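can_stop_full_tick() is a chain of per-subsystem veto checks (scheduler, posix CPU timers, perf, sched_clock stability), and tick_nohz_full_kick()/tick_nohz_full_kick_all() are how a subsystem forces those checks to be re-evaluated once its state changes. A hedged sketch of that pattern for a hypothetical subsystem; the example_* names are invented and the matching predicate call inside can_stop_full_tick() is not part of this patch:

/* Hypothetical subsystem cooperating with full dynticks. A real user would
 * also have its predicate consulted from can_stop_full_tick(). */
static atomic_t example_tick_users = ATOMIC_INIT(0);

bool example_can_stop_tick(void)
{
	return atomic_read(&example_tick_users) == 0;
}

void example_start(void)
{
	atomic_inc(&example_tick_users);
	/* some CPUs may already run tickless: force them to re-evaluate */
	tick_nohz_full_kick_all();
}

void example_stop(void)
{
	atomic_dec(&example_tick_users);
}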
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
345 delta_jiffies = rcu_delta_jiffies; 566 delta_jiffies = rcu_delta_jiffies;
346 } 567 }
347 } 568 }
569
348 /* 570 /*
349 * Do not stop the tick, if we are only one off 571 * Do not stop the tick, if we are only one off (or less)
350 * or if the cpu is required for rcu 572 * or if the cpu is required for RCU:
351 */ 573 */
352 if (!ts->tick_stopped && delta_jiffies == 1) 574 if (!ts->tick_stopped && delta_jiffies <= 1)
353 goto out; 575 goto out;
354 576
355 /* Schedule the tick, if we are at least one jiffie off */ 577 /* Schedule the tick, if we are at least one jiffie off */
@@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
378 time_delta = KTIME_MAX; 600 time_delta = KTIME_MAX;
379 } 601 }
380 602
603#ifdef CONFIG_NO_HZ_FULL
604 if (!ts->inidle) {
605 time_delta = min(time_delta,
606 scheduler_tick_max_deferment());
607 }
608#endif
609
381 /* 610 /*
382 * calculate the expiry time for the next timer wheel 611 * calculate the expiry time for the next timer wheel
383 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals 612 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
421 650
422 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 651 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
423 ts->tick_stopped = 1; 652 ts->tick_stopped = 1;
653 trace_tick_stop(1, " ");
424 } 654 }
425 655
426 /* 656 /*
@@ -457,6 +687,24 @@ out:
457 return ret; 687 return ret;
458} 688}
459 689
690static void tick_nohz_full_stop_tick(struct tick_sched *ts)
691{
692#ifdef CONFIG_NO_HZ_FULL
693 int cpu = smp_processor_id();
694
695 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
696 return;
697
698 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
699 return;
700
701 if (!can_stop_full_tick())
702 return;
703
704 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
705#endif
706}
707
460static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 708static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
461{ 709{
462 /* 710 /*
@@ -482,13 +730,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
482 730
483 if (ratelimit < 10 && 731 if (ratelimit < 10 &&
484 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { 732 (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
485 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 733 pr_warn("NOHZ: local_softirq_pending %02x\n",
486 (unsigned int) local_softirq_pending()); 734 (unsigned int) local_softirq_pending());
487 ratelimit++; 735 ratelimit++;
488 } 736 }
489 return false; 737 return false;
490 } 738 }
491 739
740 if (have_nohz_full_mask) {
741 /*
742 * Keep the tick alive to guarantee timekeeping progression
743 * if there are full dynticks CPUs around
744 */
745 if (tick_do_timer_cpu == cpu)
746 return false;
747 /*
748 * Boot safety: make sure the timekeeping duty has been
749 * assigned before entering dyntick-idle mode,
750 */
751 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
752 return false;
753 }
754
492 return true; 755 return true;
493} 756}
494 757
@@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void)
568{ 831{
569 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 832 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
570 833
571 if (!ts->inidle) 834 if (ts->inidle) {
572 return; 835 /* Cancel the timer because CPU already waken up from the C-states*/
573 836 menu_hrtimer_cancel();
574 /* Cancel the timer because CPU already waken up from the C-states*/ 837 __tick_nohz_idle_enter(ts);
575 menu_hrtimer_cancel(); 838 } else {
576 __tick_nohz_idle_enter(ts); 839 tick_nohz_full_stop_tick(ts);
840 }
577} 841}
578 842
579/** 843/**
@@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu)
802static inline void tick_nohz_switch_to_nohz(void) { } 1066static inline void tick_nohz_switch_to_nohz(void) { }
803static inline void tick_check_nohz(int cpu) { } 1067static inline void tick_check_nohz(int cpu) { }
804 1068
805#endif /* NO_HZ */ 1069#endif /* CONFIG_NO_HZ_COMMON */
806 1070
807/* 1071/*
808 * Called from irq_enter to notify about the possible interruption of idle() 1072 * Called from irq_enter to notify about the possible interruption of idle()
@@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void)
887 now = ktime_get(); 1151 now = ktime_get();
888 } 1152 }
889 1153
890#ifdef CONFIG_NO_HZ 1154#ifdef CONFIG_NO_HZ_COMMON
891 if (tick_nohz_enabled) 1155 if (tick_nohz_enabled)
892 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1156 ts->nohz_mode = NOHZ_MODE_HIGHRES;
893#endif 1157#endif
894} 1158}
895#endif /* HIGH_RES_TIMERS */ 1159#endif /* HIGH_RES_TIMERS */
896 1160
897#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS 1161#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
898void tick_cancel_sched_timer(int cpu) 1162void tick_cancel_sched_timer(int cpu)
899{ 1163{
900 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1164 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9a0bc98fbe1d..98cd470bbe49 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -23,8 +23,13 @@
23#include <linux/stop_machine.h> 23#include <linux/stop_machine.h>
24#include <linux/pvclock_gtod.h> 24#include <linux/pvclock_gtod.h>
25 25
26#include "tick-internal.h"
27#include "ntp_internal.h"
26 28
27static struct timekeeper timekeeper; 29static struct timekeeper timekeeper;
30static DEFINE_RAW_SPINLOCK(timekeeper_lock);
31static seqcount_t timekeeper_seq;
32static struct timekeeper shadow_timekeeper;
28 33
29/* flag for if timekeeping is suspended */ 34/* flag for if timekeeping is suspended */
30int __read_mostly timekeeping_suspended; 35int __read_mostly timekeeping_suspended;
@@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
67 tk->wall_to_monotonic = wtm; 72 tk->wall_to_monotonic = wtm;
68 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); 73 set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
69 tk->offs_real = timespec_to_ktime(tmp); 74 tk->offs_real = timespec_to_ktime(tmp);
75 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
70} 76}
71 77
72static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) 78static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
96 102
97 old_clock = tk->clock; 103 old_clock = tk->clock;
98 tk->clock = clock; 104 tk->clock = clock;
99 clock->cycle_last = clock->read(clock); 105 tk->cycle_last = clock->cycle_last = clock->read(clock);
100 106
101 /* Do the ns -> cycle conversion first, using original mult */ 107 /* Do the ns -> cycle conversion first, using original mult */
102 tmp = NTP_INTERVAL_LENGTH; 108 tmp = NTP_INTERVAL_LENGTH;
@@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk)
201 207
202/** 208/**
203 * pvclock_gtod_register_notifier - register a pvclock timedata update listener 209 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
204 *
205 * Must hold write on timekeeper.lock
206 */ 210 */
207int pvclock_gtod_register_notifier(struct notifier_block *nb) 211int pvclock_gtod_register_notifier(struct notifier_block *nb)
208{ 212{
@@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
210 unsigned long flags; 214 unsigned long flags;
211 int ret; 215 int ret;
212 216
213 write_seqlock_irqsave(&tk->lock, flags); 217 raw_spin_lock_irqsave(&timekeeper_lock, flags);
214 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 218 ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
215 /* update timekeeping data */
216 update_pvclock_gtod(tk); 219 update_pvclock_gtod(tk);
217 write_sequnlock_irqrestore(&tk->lock, flags); 220 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
218 221
219 return ret; 222 return ret;
220} 223}
@@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
223/** 226/**
224 * pvclock_gtod_unregister_notifier - unregister a pvclock 227 * pvclock_gtod_unregister_notifier - unregister a pvclock
225 * timedata update listener 228 * timedata update listener
226 *
227 * Must hold write on timekeeper.lock
228 */ 229 */
229int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 230int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
230{ 231{
231 struct timekeeper *tk = &timekeeper;
232 unsigned long flags; 232 unsigned long flags;
233 int ret; 233 int ret;
234 234
235 write_seqlock_irqsave(&tk->lock, flags); 235 raw_spin_lock_irqsave(&timekeeper_lock, flags);
236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 236 ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
237 write_sequnlock_irqrestore(&tk->lock, flags); 237 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
238 238
239 return ret; 239 return ret;
240} 240}
241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 241EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
242 242
243/* must hold write on timekeeper.lock */ 243/* must hold timekeeper_lock */
244static void timekeeping_update(struct timekeeper *tk, bool clearntp) 244static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)
245{ 245{
246 if (clearntp) { 246 if (clearntp) {
247 tk->ntp_error = 0; 247 tk->ntp_error = 0;
@@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
249 } 249 }
250 update_vsyscall(tk); 250 update_vsyscall(tk);
251 update_pvclock_gtod(tk); 251 update_pvclock_gtod(tk);
252
253 if (mirror)
254 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
252} 255}
253 256
254/** 257/**
@@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
267 clock = tk->clock; 270 clock = tk->clock;
268 cycle_now = clock->read(clock); 271 cycle_now = clock->read(clock);
269 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 272 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
270 clock->cycle_last = cycle_now; 273 tk->cycle_last = clock->cycle_last = cycle_now;
271 274
272 tk->xtime_nsec += cycle_delta * tk->mult; 275 tk->xtime_nsec += cycle_delta * tk->mult;
273 276
@@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts)
294 s64 nsecs = 0; 297 s64 nsecs = 0;
295 298
296 do { 299 do {
297 seq = read_seqbegin(&tk->lock); 300 seq = read_seqcount_begin(&timekeeper_seq);
298 301
299 ts->tv_sec = tk->xtime_sec; 302 ts->tv_sec = tk->xtime_sec;
300 nsecs = timekeeping_get_ns(tk); 303 nsecs = timekeeping_get_ns(tk);
301 304
302 } while (read_seqretry(&tk->lock, seq)); 305 } while (read_seqcount_retry(&timekeeper_seq, seq));
303 306
304 ts->tv_nsec = 0; 307 ts->tv_nsec = 0;
305 timespec_add_ns(ts, nsecs); 308 timespec_add_ns(ts, nsecs);
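Readers now sample timekeeper_seq with read_seqcount_begin()/read_seqcount_retry() and simply redo their reads if a writer (holding timekeeper_lock) raced with them; nothing ever blocks on the read side. A self-contained user-space illustration of the same retry discipline, written with plain C11 atomics rather than the kernel API; the sequentially consistent defaults used here are stronger than what the kernel relies on:

/* seqcount_demo.c - the read/retry discipline behind timekeeper_seq,
 * shown stand-alone. Not the kernel API, just the same idea. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;			/* odd value = write in progress */
static atomic_long tv_sec, tv_nsec;	/* the protected snapshot */

static void writer_update(long sec, long nsec)
{
	atomic_fetch_add(&seq, 1);	/* make the count odd */
	atomic_store(&tv_sec, sec);
	atomic_store(&tv_nsec, nsec);
	atomic_fetch_add(&seq, 1);	/* even again: publish */
}

static void reader_snapshot(long *sec, long *nsec)
{
	unsigned int start;

	do {
		start = atomic_load(&seq);
		*sec = atomic_load(&tv_sec);
		*nsec = atomic_load(&tv_nsec);
		/* retry if a writer was active or finished meanwhile */
	} while ((start & 1) || start != atomic_load(&seq));
}

int main(void)
{
	long s, ns;

	writer_update(1234, 500000000);
	reader_snapshot(&s, &ns);
	printf("snapshot: %ld.%09ld\n", s, ns);
	return 0;
}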
@@ -335,11 +338,11 @@ ktime_t ktime_get(void)
335 WARN_ON(timekeeping_suspended); 338 WARN_ON(timekeeping_suspended);
336 339
337 do { 340 do {
338 seq = read_seqbegin(&tk->lock); 341 seq = read_seqcount_begin(&timekeeper_seq);
339 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; 342 secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
340 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; 343 nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
341 344
342 } while (read_seqretry(&tk->lock, seq)); 345 } while (read_seqcount_retry(&timekeeper_seq, seq));
343 /* 346 /*
344 * Use ktime_set/ktime_add_ns to create a proper ktime on 347 * Use ktime_set/ktime_add_ns to create a proper ktime on
345 * 32-bit architectures without CONFIG_KTIME_SCALAR. 348 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts)
366 WARN_ON(timekeeping_suspended); 369 WARN_ON(timekeeping_suspended);
367 370
368 do { 371 do {
369 seq = read_seqbegin(&tk->lock); 372 seq = read_seqcount_begin(&timekeeper_seq);
370 ts->tv_sec = tk->xtime_sec; 373 ts->tv_sec = tk->xtime_sec;
371 nsec = timekeeping_get_ns(tk); 374 nsec = timekeeping_get_ns(tk);
372 tomono = tk->wall_to_monotonic; 375 tomono = tk->wall_to_monotonic;
373 376
374 } while (read_seqretry(&tk->lock, seq)); 377 } while (read_seqcount_retry(&timekeeper_seq, seq));
375 378
376 ts->tv_sec += tomono.tv_sec; 379 ts->tv_sec += tomono.tv_sec;
377 ts->tv_nsec = 0; 380 ts->tv_nsec = 0;
@@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts)
379} 382}
380EXPORT_SYMBOL_GPL(ktime_get_ts); 383EXPORT_SYMBOL_GPL(ktime_get_ts);
381 384
385
386/**
387 * timekeeping_clocktai - Returns the TAI time of day in a timespec
388 * @ts: pointer to the timespec to be set
389 *
390 * Returns the time of day in a timespec.
391 */
392void timekeeping_clocktai(struct timespec *ts)
393{
394 struct timekeeper *tk = &timekeeper;
395 unsigned long seq;
396 u64 nsecs;
397
398 WARN_ON(timekeeping_suspended);
399
400 do {
401 seq = read_seqcount_begin(&timekeeper_seq);
402
403 ts->tv_sec = tk->xtime_sec + tk->tai_offset;
404 nsecs = timekeeping_get_ns(tk);
405
406 } while (read_seqcount_retry(&timekeeper_seq, seq));
407
408 ts->tv_nsec = 0;
409 timespec_add_ns(ts, nsecs);
410
411}
412EXPORT_SYMBOL(timekeeping_clocktai);
413
414
415/**
416 * ktime_get_clocktai - Returns the TAI time of day in a ktime
417 *
418 * Returns the time of day in a ktime.
419 */
420ktime_t ktime_get_clocktai(void)
421{
422 struct timespec ts;
423
424 timekeeping_clocktai(&ts);
425 return timespec_to_ktime(ts);
426}
427EXPORT_SYMBOL(ktime_get_clocktai);
428
382#ifdef CONFIG_NTP_PPS 429#ifdef CONFIG_NTP_PPS
383 430
384/** 431/**
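timekeeping_clocktai() and ktime_get_clocktai() derive TAI by adding tai_offset to the UTC seconds under the usual seqcount retry loop; a posix clock added alongside this work exposes the same value to user space as CLOCK_TAI. A runnable check from user space (the fallback clockid covers libc headers that predate CLOCK_TAI; until an NTP daemon has set the TAI offset both clocks print the same seconds):

/* tai_vs_realtime.c - compare CLOCK_REALTIME (UTC) with CLOCK_TAI.
 * Link with -lrt on older glibc. */
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TAI
# define CLOCK_TAI 11
#endif

int main(void)
{
	struct timespec utc, tai;

	if (clock_gettime(CLOCK_REALTIME, &utc) || clock_gettime(CLOCK_TAI, &tai)) {
		perror("clock_gettime");
		return 1;
	}
	printf("UTC: %lld.%09ld\n", (long long)utc.tv_sec, utc.tv_nsec);
	printf("TAI: %lld.%09ld\n", (long long)tai.tv_sec, tai.tv_nsec);
	printf("TAI-UTC: %lld s\n", (long long)(tai.tv_sec - utc.tv_sec));
	return 0;
}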
@@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
399 WARN_ON_ONCE(timekeeping_suspended); 446 WARN_ON_ONCE(timekeeping_suspended);
400 447
401 do { 448 do {
402 seq = read_seqbegin(&tk->lock); 449 seq = read_seqcount_begin(&timekeeper_seq);
403 450
404 *ts_raw = tk->raw_time; 451 *ts_raw = tk->raw_time;
405 ts_real->tv_sec = tk->xtime_sec; 452 ts_real->tv_sec = tk->xtime_sec;
@@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
408 nsecs_raw = timekeeping_get_ns_raw(tk); 455 nsecs_raw = timekeeping_get_ns_raw(tk);
409 nsecs_real = timekeeping_get_ns(tk); 456 nsecs_real = timekeeping_get_ns(tk);
410 457
411 } while (read_seqretry(&tk->lock, seq)); 458 } while (read_seqcount_retry(&timekeeper_seq, seq));
412 459
413 timespec_add_ns(ts_raw, nsecs_raw); 460 timespec_add_ns(ts_raw, nsecs_raw);
414 timespec_add_ns(ts_real, nsecs_real); 461 timespec_add_ns(ts_real, nsecs_real);
@@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv)
448 if (!timespec_valid_strict(tv)) 495 if (!timespec_valid_strict(tv))
449 return -EINVAL; 496 return -EINVAL;
450 497
451 write_seqlock_irqsave(&tk->lock, flags); 498 raw_spin_lock_irqsave(&timekeeper_lock, flags);
499 write_seqcount_begin(&timekeeper_seq);
452 500
453 timekeeping_forward_now(tk); 501 timekeeping_forward_now(tk);
454 502
@@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv)
460 508
461 tk_set_xtime(tk, tv); 509 tk_set_xtime(tk, tv);
462 510
463 timekeeping_update(tk, true); 511 timekeeping_update(tk, true, true);
464 512
465 write_sequnlock_irqrestore(&tk->lock, flags); 513 write_seqcount_end(&timekeeper_seq);
514 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
466 515
467 /* signal hrtimers about time change */ 516 /* signal hrtimers about time change */
468 clock_was_set(); 517 clock_was_set();
@@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts)
487 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 536 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
488 return -EINVAL; 537 return -EINVAL;
489 538
490 write_seqlock_irqsave(&tk->lock, flags); 539 raw_spin_lock_irqsave(&timekeeper_lock, flags);
540 write_seqcount_begin(&timekeeper_seq);
491 541
492 timekeeping_forward_now(tk); 542 timekeeping_forward_now(tk);
493 543
@@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts)
502 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); 552 tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
503 553
504error: /* even if we error out, we forwarded the time, so call update */ 554error: /* even if we error out, we forwarded the time, so call update */
505 timekeeping_update(tk, true); 555 timekeeping_update(tk, true, true);
506 556
507 write_sequnlock_irqrestore(&tk->lock, flags); 557 write_seqcount_end(&timekeeper_seq);
558 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
508 559
509 /* signal hrtimers about time change */ 560 /* signal hrtimers about time change */
510 clock_was_set(); 561 clock_was_set();
@@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */
513} 564}
514EXPORT_SYMBOL(timekeeping_inject_offset); 565EXPORT_SYMBOL(timekeeping_inject_offset);
515 566
567
568/**
569 * timekeeping_get_tai_offset - Returns current TAI offset from UTC
570 *
571 */
572s32 timekeeping_get_tai_offset(void)
573{
574 struct timekeeper *tk = &timekeeper;
575 unsigned int seq;
576 s32 ret;
577
578 do {
579 seq = read_seqcount_begin(&timekeeper_seq);
580 ret = tk->tai_offset;
581 } while (read_seqcount_retry(&timekeeper_seq, seq));
582
583 return ret;
584}
585
586/**
587 * __timekeeping_set_tai_offset - Lock free worker function
588 *
589 */
590static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
591{
592 tk->tai_offset = tai_offset;
593 tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
594}
595
596/**
597 * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
598 *
599 */
600void timekeeping_set_tai_offset(s32 tai_offset)
601{
602 struct timekeeper *tk = &timekeeper;
603 unsigned long flags;
604
605 raw_spin_lock_irqsave(&timekeeper_lock, flags);
606 write_seqcount_begin(&timekeeper_seq);
607 __timekeeping_set_tai_offset(tk, tai_offset);
608 write_seqcount_end(&timekeeper_seq);
609 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
610 clock_was_set();
611}
612
516/** 613/**
517 * change_clocksource - Swaps clocksources if a new one is available 614 * change_clocksource - Swaps clocksources if a new one is available
518 * 615 *
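timekeeping_set_tai_offset() updates tai_offset and offs_tai together under timekeeper_lock/timekeeper_seq and then signals clock_was_set(); the offset it manages is the same TAI-UTC value that __do_adjtimex() reports back in txc->tai in the ntp.c hunk above. Reading it from user space needs nothing more than a read-only adjtimex() call, as in this runnable example:

/* read_tai.c - query the kernel's current TAI-UTC offset in seconds. */
#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex txc;

	memset(&txc, 0, sizeof(txc));
	txc.modes = 0;			/* read-only query, no privilege needed */

	if (adjtimex(&txc) == -1) {
		perror("adjtimex");
		return 1;
	}
	printf("TAI-UTC offset: %d s\n", txc.tai);
	return 0;
}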
@@ -526,7 +623,8 @@ static int change_clocksource(void *data)
526 623
527 new = (struct clocksource *) data; 624 new = (struct clocksource *) data;
528 625
529 write_seqlock_irqsave(&tk->lock, flags); 626 raw_spin_lock_irqsave(&timekeeper_lock, flags);
627 write_seqcount_begin(&timekeeper_seq);
530 628
531 timekeeping_forward_now(tk); 629 timekeeping_forward_now(tk);
532 if (!new->enable || new->enable(new) == 0) { 630 if (!new->enable || new->enable(new) == 0) {
@@ -535,9 +633,10 @@ static int change_clocksource(void *data)
535 if (old->disable) 633 if (old->disable)
536 old->disable(old); 634 old->disable(old);
537 } 635 }
538 timekeeping_update(tk, true); 636 timekeeping_update(tk, true, true);
539 637
540 write_sequnlock_irqrestore(&tk->lock, flags); 638 write_seqcount_end(&timekeeper_seq);
639 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
541 640
542 return 0; 641 return 0;
543} 642}
@@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts)
587 s64 nsecs; 686 s64 nsecs;
588 687
589 do { 688 do {
590 seq = read_seqbegin(&tk->lock); 689 seq = read_seqcount_begin(&timekeeper_seq);
591 nsecs = timekeeping_get_ns_raw(tk); 690 nsecs = timekeeping_get_ns_raw(tk);
592 *ts = tk->raw_time; 691 *ts = tk->raw_time;
593 692
594 } while (read_seqretry(&tk->lock, seq)); 693 } while (read_seqcount_retry(&timekeeper_seq, seq));
595 694
596 timespec_add_ns(ts, nsecs); 695 timespec_add_ns(ts, nsecs);
597} 696}
@@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void)
607 int ret; 706 int ret;
608 707
609 do { 708 do {
610 seq = read_seqbegin(&tk->lock); 709 seq = read_seqcount_begin(&timekeeper_seq);
611 710
612 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 711 ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
613 712
614 } while (read_seqretry(&tk->lock, seq)); 713 } while (read_seqcount_retry(&timekeeper_seq, seq));
615 714
616 return ret; 715 return ret;
617} 716}
@@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void)
626 u64 ret; 725 u64 ret;
627 726
628 do { 727 do {
629 seq = read_seqbegin(&tk->lock); 728 seq = read_seqcount_begin(&timekeeper_seq);
630 729
631 ret = tk->clock->max_idle_ns; 730 ret = tk->clock->max_idle_ns;
632 731
633 } while (read_seqretry(&tk->lock, seq)); 732 } while (read_seqcount_retry(&timekeeper_seq, seq));
634 733
635 return ret; 734 return ret;
636} 735}
@@ -693,11 +792,10 @@ void __init timekeeping_init(void)
693 boot.tv_nsec = 0; 792 boot.tv_nsec = 0;
694 } 793 }
695 794
696 seqlock_init(&tk->lock); 795 raw_spin_lock_irqsave(&timekeeper_lock, flags);
697 796 write_seqcount_begin(&timekeeper_seq);
698 ntp_init(); 797 ntp_init();
699 798
700 write_seqlock_irqsave(&tk->lock, flags);
701 clock = clocksource_default_clock(); 799 clock = clocksource_default_clock();
702 if (clock->enable) 800 if (clock->enable)
703 clock->enable(clock); 801 clock->enable(clock);
@@ -716,7 +814,10 @@ void __init timekeeping_init(void)
716 tmp.tv_nsec = 0; 814 tmp.tv_nsec = 0;
717 tk_set_sleep_time(tk, tmp); 815 tk_set_sleep_time(tk, tmp);
718 816
719 write_sequnlock_irqrestore(&tk->lock, flags); 817 memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
818
819 write_seqcount_end(&timekeeper_seq);
820 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
720} 821}
721 822
722/* time in seconds when suspend began */ 823/* time in seconds when suspend began */
@@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
764 if (has_persistent_clock()) 865 if (has_persistent_clock())
765 return; 866 return;
766 867
767 write_seqlock_irqsave(&tk->lock, flags); 868 raw_spin_lock_irqsave(&timekeeper_lock, flags);
869 write_seqcount_begin(&timekeeper_seq);
768 870
769 timekeeping_forward_now(tk); 871 timekeeping_forward_now(tk);
770 872
771 __timekeeping_inject_sleeptime(tk, delta); 873 __timekeeping_inject_sleeptime(tk, delta);
772 874
773 timekeeping_update(tk, true); 875 timekeeping_update(tk, true, true);
774 876
775 write_sequnlock_irqrestore(&tk->lock, flags); 877 write_seqcount_end(&timekeeper_seq);
878 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
776 879
777 /* signal hrtimers about time change */ 880 /* signal hrtimers about time change */
778 clock_was_set(); 881 clock_was_set();
@@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
788static void timekeeping_resume(void) 891static void timekeeping_resume(void)
789{ 892{
790 struct timekeeper *tk = &timekeeper; 893 struct timekeeper *tk = &timekeeper;
894 struct clocksource *clock = tk->clock;
791 unsigned long flags; 895 unsigned long flags;
792 struct timespec ts; 896 struct timespec ts_new, ts_delta;
897 cycle_t cycle_now, cycle_delta;
898 bool suspendtime_found = false;
793 899
794 read_persistent_clock(&ts); 900 read_persistent_clock(&ts_new);
795 901
796 clockevents_resume(); 902 clockevents_resume();
797 clocksource_resume(); 903 clocksource_resume();
798 904
799 write_seqlock_irqsave(&tk->lock, flags); 905 raw_spin_lock_irqsave(&timekeeper_lock, flags);
906 write_seqcount_begin(&timekeeper_seq);
907
908 /*
 909 * After the system resumes, we need to calculate the suspended time
 910 * and compensate the OS time for it. There are three sources that
 911 * could be used: a nonstop clocksource during suspend, the persistent
 912 * clock and the rtc device.
 913 *
 914 * A given platform may have one, two or all of them, and the
 915 * preference is:
 916 * suspend-nonstop clocksource -> persistent clock -> rtc
 917 * A less preferred source is only tried if there is no better
 918 * usable source. The rtc part is handled separately in rtc core code.
919 */
920 cycle_now = clock->read(clock);
921 if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
922 cycle_now > clock->cycle_last) {
923 u64 num, max = ULLONG_MAX;
924 u32 mult = clock->mult;
925 u32 shift = clock->shift;
926 s64 nsec = 0;
927
928 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
800 929
801 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 930 /*
802 ts = timespec_sub(ts, timekeeping_suspend_time); 931 * "cycle_delta * mutl" may cause 64 bits overflow, if the
803 __timekeeping_inject_sleeptime(tk, &ts); 932 * suspended time is too long. In that case we need do the
933 * 64 bits math carefully
934 */
935 do_div(max, mult);
936 if (cycle_delta > max) {
937 num = div64_u64(cycle_delta, max);
938 nsec = (((u64) max * mult) >> shift) * num;
939 cycle_delta -= num * max;
940 }
941 nsec += ((u64) cycle_delta * mult) >> shift;
942
943 ts_delta = ns_to_timespec(nsec);
944 suspendtime_found = true;
945 } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
946 ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
947 suspendtime_found = true;
804 } 948 }
805 /* re-base the last cycle value */ 949
806 tk->clock->cycle_last = tk->clock->read(tk->clock); 950 if (suspendtime_found)
951 __timekeeping_inject_sleeptime(tk, &ts_delta);
952
953 /* Re-base the last cycle value */
954 tk->cycle_last = clock->cycle_last = cycle_now;
807 tk->ntp_error = 0; 955 tk->ntp_error = 0;
808 timekeeping_suspended = 0; 956 timekeeping_suspended = 0;
809 timekeeping_update(tk, false); 957 timekeeping_update(tk, false, true);
810 write_sequnlock_irqrestore(&tk->lock, flags); 958 write_seqcount_end(&timekeeper_seq);
959 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
811 960
812 touch_softlockup_watchdog(); 961 touch_softlockup_watchdog();
813 962
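The chunked multiply above can be exercised on its own: max is the largest cycle count whose product with mult still fits in 64 bits, so the suspended delta is converted max cycles at a time. A standalone user-space sketch of the same arithmetic (hypothetical cycles_to_ns() helper; the mult/shift values are made-up clocksource parameters):

#include <stdint.h>
#include <stdio.h>

/* (cycles * mult) >> shift, computed without overflowing 64 bits. */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	uint64_t max = UINT64_MAX / mult;	/* largest safe multiplicand */
	uint64_t nsec = 0;

	if (cycles > max) {
		uint64_t num = cycles / max;

		nsec = ((max * mult) >> shift) * num;
		cycles -= num * max;
	}
	return nsec + ((cycles * mult) >> shift);
}

int main(void)
{
	/* ~2 hours of a 2.5 GHz counter: a naive 64-bit multiply overflows. */
	uint64_t ns = cycles_to_ns(18000000000000ULL, 6710886, 24);

	printf("suspended for ~%llu seconds\n",
	       (unsigned long long)(ns / 1000000000ULL));
	return 0;
}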
@@ -826,7 +975,8 @@ static int timekeeping_suspend(void)
826 975
827 read_persistent_clock(&timekeeping_suspend_time); 976 read_persistent_clock(&timekeeping_suspend_time);
828 977
829 write_seqlock_irqsave(&tk->lock, flags); 978 raw_spin_lock_irqsave(&timekeeper_lock, flags);
979 write_seqcount_begin(&timekeeper_seq);
830 timekeeping_forward_now(tk); 980 timekeeping_forward_now(tk);
831 timekeeping_suspended = 1; 981 timekeeping_suspended = 1;
832 982
@@ -849,7 +999,8 @@ static int timekeeping_suspend(void)
849 timekeeping_suspend_time = 999 timekeeping_suspend_time =
850 timespec_add(timekeeping_suspend_time, delta_delta); 1000 timespec_add(timekeeping_suspend_time, delta_delta);
851 } 1001 }
852 write_sequnlock_irqrestore(&tk->lock, flags); 1002 write_seqcount_end(&timekeeper_seq);
1003 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
853 1004
854 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 1005 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
855 clocksource_suspend(); 1006 clocksource_suspend();
@@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1099 tk_set_wall_to_mono(tk, 1250 tk_set_wall_to_mono(tk,
1100 timespec_sub(tk->wall_to_monotonic, ts)); 1251 timespec_sub(tk->wall_to_monotonic, ts));
1101 1252
1253 __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
1254
1102 clock_was_set_delayed(); 1255 clock_was_set_delayed();
1103 } 1256 }
1104 } 1257 }
@@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
1116static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, 1269static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
1117 u32 shift) 1270 u32 shift)
1118{ 1271{
1272 cycle_t interval = tk->cycle_interval << shift;
1119 u64 raw_nsecs; 1273 u64 raw_nsecs;
1120 1274
 1121 /* If the offset is smaller than a shifted interval, do nothing */ 1275 /* If the offset is smaller than a shifted interval, do nothing */
1122 if (offset < tk->cycle_interval<<shift) 1276 if (offset < interval)
1123 return offset; 1277 return offset;
1124 1278
1125 /* Accumulate one shifted interval */ 1279 /* Accumulate one shifted interval */
1126 offset -= tk->cycle_interval << shift; 1280 offset -= interval;
1127 tk->clock->cycle_last += tk->cycle_interval << shift; 1281 tk->cycle_last += interval;
1128 1282
1129 tk->xtime_nsec += tk->xtime_interval << shift; 1283 tk->xtime_nsec += tk->xtime_interval << shift;
1130 accumulate_nsecs_to_secs(tk); 1284 accumulate_nsecs_to_secs(tk);
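logarithmic_accumulation() consumes a single interval << shift chunk per call; its caller (update_wall_time, partly outside this hunk) picks the largest shift that still fits the remaining offset and lowers it as the remainder shrinks, so a long backlog is absorbed in O(log n) steps rather than one tick at a time. A standalone sketch of that chunking idea only, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t interval = 1000000;		/* cycles per tick (assumed) */
	uint64_t offset = 37 * interval + 123;	/* backlog to accumulate */
	unsigned int ticks = 0, shift = 0;

	/* Start with the largest power-of-two chunk that fits. */
	while (interval << (shift + 1) <= offset)
		shift++;

	while (offset >= interval) {
		while (interval << shift > offset)
			shift--;		/* shrink the chunk size */
		offset -= interval << shift;
		ticks += 1U << shift;		/* intervals accumulated */
	}
	printf("accumulated %u intervals, %llu cycles left\n",
	       ticks, (unsigned long long)offset);
	return 0;
}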
@@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
1181static void update_wall_time(void) 1335static void update_wall_time(void)
1182{ 1336{
1183 struct clocksource *clock; 1337 struct clocksource *clock;
1184 struct timekeeper *tk = &timekeeper; 1338 struct timekeeper *real_tk = &timekeeper;
1339 struct timekeeper *tk = &shadow_timekeeper;
1185 cycle_t offset; 1340 cycle_t offset;
1186 int shift = 0, maxshift; 1341 int shift = 0, maxshift;
1187 unsigned long flags; 1342 unsigned long flags;
1188 1343
1189 write_seqlock_irqsave(&tk->lock, flags); 1344 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1190 1345
1191 /* Make sure we're fully resumed: */ 1346 /* Make sure we're fully resumed: */
1192 if (unlikely(timekeeping_suspended)) 1347 if (unlikely(timekeeping_suspended))
1193 goto out; 1348 goto out;
1194 1349
1195 clock = tk->clock; 1350 clock = real_tk->clock;
1196 1351
1197#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET 1352#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
1198 offset = tk->cycle_interval; 1353 offset = real_tk->cycle_interval;
1199#else 1354#else
1200 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1355 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
1201#endif 1356#endif
1202 1357
1203 /* Check if there's really nothing to do */ 1358 /* Check if there's really nothing to do */
1204 if (offset < tk->cycle_interval) 1359 if (offset < real_tk->cycle_interval)
1205 goto out; 1360 goto out;
1206 1361
1207 /* 1362 /*
@@ -1238,11 +1393,24 @@ static void update_wall_time(void)
1238 */ 1393 */
1239 accumulate_nsecs_to_secs(tk); 1394 accumulate_nsecs_to_secs(tk);
1240 1395
1241 timekeeping_update(tk, false); 1396 write_seqcount_begin(&timekeeper_seq);
1242 1397 /* Update clock->cycle_last with the new value */
1398 clock->cycle_last = tk->cycle_last;
1399 /*
1400 * Update the real timekeeper.
1401 *
1402 * We could avoid this memcpy by switching pointers, but that
1403 * requires changes to all other timekeeper usage sites as
1404 * well, i.e. move the timekeeper pointer getter into the
1405 * spinlocked/seqcount protected sections. And we trade this
1406 * memcpy under the timekeeper_seq against one before we start
1407 * updating.
1408 */
1409 memcpy(real_tk, tk, sizeof(*tk));
1410 timekeeping_update(real_tk, false, false);
1411 write_seqcount_end(&timekeeper_seq);
1243out: 1412out:
1244 write_sequnlock_irqrestore(&tk->lock, flags); 1413 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1245
1246} 1414}
1247 1415
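update_wall_time() now shows the write side of the new locking scheme: timekeeper_lock serializes writers, the heavy lifting happens on the shadow copy, and only the final memcpy into the live structure sits inside the seqcount, so readers retry across a short window. A condensed kernel-style sketch of that pattern (hypothetical foo names), complementing the reader loop sketched earlier:

#include <linux/seqlock.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/types.h>

struct foo_state { u64 a, b; };

static struct foo_state foo_live, foo_shadow;
static DEFINE_RAW_SPINLOCK(foo_lock);	/* serializes writers */
static seqcount_t foo_seq;		/* guards readers of foo_live */

static void foo_update(u64 a, u64 b)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&foo_lock, flags);

	/* Expensive work on the shadow copy, outside the reader window. */
	foo_shadow.a = a;
	foo_shadow.b = b;

	write_seqcount_begin(&foo_seq);
	memcpy(&foo_live, &foo_shadow, sizeof(foo_live));
	write_seqcount_end(&foo_seq);

	raw_spin_unlock_irqrestore(&foo_lock, flags);
}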
1248/** 1416/**
@@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts)
1289 WARN_ON(timekeeping_suspended); 1457 WARN_ON(timekeeping_suspended);
1290 1458
1291 do { 1459 do {
1292 seq = read_seqbegin(&tk->lock); 1460 seq = read_seqcount_begin(&timekeeper_seq);
1293 ts->tv_sec = tk->xtime_sec; 1461 ts->tv_sec = tk->xtime_sec;
1294 nsec = timekeeping_get_ns(tk); 1462 nsec = timekeeping_get_ns(tk);
1295 tomono = tk->wall_to_monotonic; 1463 tomono = tk->wall_to_monotonic;
1296 sleep = tk->total_sleep_time; 1464 sleep = tk->total_sleep_time;
1297 1465
1298 } while (read_seqretry(&tk->lock, seq)); 1466 } while (read_seqcount_retry(&timekeeper_seq, seq));
1299 1467
1300 ts->tv_sec += tomono.tv_sec + sleep.tv_sec; 1468 ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
1301 ts->tv_nsec = 0; 1469 ts->tv_nsec = 0;
@@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void)
1354 unsigned long seq; 1522 unsigned long seq;
1355 1523
1356 do { 1524 do {
1357 seq = read_seqbegin(&tk->lock); 1525 seq = read_seqcount_begin(&timekeeper_seq);
1358 1526
1359 now = tk_xtime(tk); 1527 now = tk_xtime(tk);
1360 } while (read_seqretry(&tk->lock, seq)); 1528 } while (read_seqcount_retry(&timekeeper_seq, seq));
1361 1529
1362 return now; 1530 return now;
1363} 1531}
@@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void)
1370 unsigned long seq; 1538 unsigned long seq;
1371 1539
1372 do { 1540 do {
1373 seq = read_seqbegin(&tk->lock); 1541 seq = read_seqcount_begin(&timekeeper_seq);
1374 1542
1375 now = tk_xtime(tk); 1543 now = tk_xtime(tk);
1376 mono = tk->wall_to_monotonic; 1544 mono = tk->wall_to_monotonic;
1377 } while (read_seqretry(&tk->lock, seq)); 1545 } while (read_seqcount_retry(&timekeeper_seq, seq));
1378 1546
1379 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1547 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1380 now.tv_nsec + mono.tv_nsec); 1548 now.tv_nsec + mono.tv_nsec);
@@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1405 unsigned long seq; 1573 unsigned long seq;
1406 1574
1407 do { 1575 do {
1408 seq = read_seqbegin(&tk->lock); 1576 seq = read_seqcount_begin(&timekeeper_seq);
1409 *xtim = tk_xtime(tk); 1577 *xtim = tk_xtime(tk);
1410 *wtom = tk->wall_to_monotonic; 1578 *wtom = tk->wall_to_monotonic;
1411 *sleep = tk->total_sleep_time; 1579 *sleep = tk->total_sleep_time;
1412 } while (read_seqretry(&tk->lock, seq)); 1580 } while (read_seqcount_retry(&timekeeper_seq, seq));
1413} 1581}
1414 1582
1415#ifdef CONFIG_HIGH_RES_TIMERS 1583#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1421 * Returns current monotonic time and updates the offsets 1589 * Returns current monotonic time and updates the offsets
 1422 * Called from hrtimer_interrupt() or retrigger_next_event() 1590 * Called from hrtimer_interrupt() or retrigger_next_event()
1423 */ 1591 */
1424ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) 1592ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1593 ktime_t *offs_tai)
1425{ 1594{
1426 struct timekeeper *tk = &timekeeper; 1595 struct timekeeper *tk = &timekeeper;
1427 ktime_t now; 1596 ktime_t now;
@@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
1429 u64 secs, nsecs; 1598 u64 secs, nsecs;
1430 1599
1431 do { 1600 do {
1432 seq = read_seqbegin(&tk->lock); 1601 seq = read_seqcount_begin(&timekeeper_seq);
1433 1602
1434 secs = tk->xtime_sec; 1603 secs = tk->xtime_sec;
1435 nsecs = timekeeping_get_ns(tk); 1604 nsecs = timekeeping_get_ns(tk);
1436 1605
1437 *offs_real = tk->offs_real; 1606 *offs_real = tk->offs_real;
1438 *offs_boot = tk->offs_boot; 1607 *offs_boot = tk->offs_boot;
1439 } while (read_seqretry(&tk->lock, seq)); 1608 *offs_tai = tk->offs_tai;
1609 } while (read_seqcount_retry(&timekeeper_seq, seq));
1440 1610
1441 now = ktime_add_ns(ktime_set(secs, 0), nsecs); 1611 now = ktime_add_ns(ktime_set(secs, 0), nsecs);
1442 now = ktime_sub(now, *offs_real); 1612 now = ktime_sub(now, *offs_real);
@@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void)
1454 struct timespec wtom; 1624 struct timespec wtom;
1455 1625
1456 do { 1626 do {
1457 seq = read_seqbegin(&tk->lock); 1627 seq = read_seqcount_begin(&timekeeper_seq);
1458 wtom = tk->wall_to_monotonic; 1628 wtom = tk->wall_to_monotonic;
1459 } while (read_seqretry(&tk->lock, seq)); 1629 } while (read_seqcount_retry(&timekeeper_seq, seq));
1460 1630
1461 return timespec_to_ktime(wtom); 1631 return timespec_to_ktime(wtom);
1462} 1632}
1463EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); 1633EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1464 1634
1465/** 1635/**
1636 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
1637 */
1638int do_adjtimex(struct timex *txc)
1639{
1640 struct timekeeper *tk = &timekeeper;
1641 unsigned long flags;
1642 struct timespec ts;
1643 s32 orig_tai, tai;
1644 int ret;
1645
1646 /* Validate the data before disabling interrupts */
1647 ret = ntp_validate_timex(txc);
1648 if (ret)
1649 return ret;
1650
1651 if (txc->modes & ADJ_SETOFFSET) {
1652 struct timespec delta;
1653 delta.tv_sec = txc->time.tv_sec;
1654 delta.tv_nsec = txc->time.tv_usec;
1655 if (!(txc->modes & ADJ_NANO))
1656 delta.tv_nsec *= 1000;
1657 ret = timekeeping_inject_offset(&delta);
1658 if (ret)
1659 return ret;
1660 }
1661
1662 getnstimeofday(&ts);
1663
1664 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1665 write_seqcount_begin(&timekeeper_seq);
1666
1667 orig_tai = tai = tk->tai_offset;
1668 ret = __do_adjtimex(txc, &ts, &tai);
1669
1670 if (tai != orig_tai) {
1671 __timekeeping_set_tai_offset(tk, tai);
1672 clock_was_set_delayed();
1673 }
1674 write_seqcount_end(&timekeeper_seq);
1675 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1676
1677 return ret;
1678}
1679
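The ADJ_SETOFFSET branch above is what user space reaches through adjtimex(2). A hedged user-space example that steps the clock forward by 1.5 ms (needs CAP_SYS_TIME; with ADJ_NANO the time.tv_usec field carries nanoseconds, per adjtimex(2)):

#include <stdio.h>
#include <string.h>
#include <sys/timex.h>

int main(void)
{
	struct timex tx;

	memset(&tx, 0, sizeof(tx));
	tx.modes = ADJ_SETOFFSET | ADJ_NANO;	/* tv_usec is nanoseconds */
	tx.time.tv_sec = 0;
	tx.time.tv_usec = 1500000;		/* +1.5 ms */

	if (adjtimex(&tx) < 0) {
		perror("adjtimex");
		return 1;
	}
	return 0;
}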
1680#ifdef CONFIG_NTP_PPS
1681/**
1682 * hardpps() - Accessor function to NTP __hardpps function
1683 */
1684void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
1685{
1686 unsigned long flags;
1687
1688 raw_spin_lock_irqsave(&timekeeper_lock, flags);
1689 write_seqcount_begin(&timekeeper_seq);
1690
1691 __hardpps(phase_ts, raw_ts);
1692
1693 write_seqcount_end(&timekeeper_seq);
1694 raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
1695}
1696EXPORT_SYMBOL(hardpps);
1697#endif
1698
1699/**
1466 * xtime_update() - advances the timekeeping infrastructure 1700 * xtime_update() - advances the timekeeping infrastructure
1467 * @ticks: number of ticks, that have elapsed since the last call. 1701 * @ticks: number of ticks, that have elapsed since the last call.
1468 * 1702 *
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164b..3bdf28323012 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@
20 20
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22 22
23
24struct timer_list_iter {
25 int cpu;
26 bool second_pass;
27 u64 now;
28};
29
23typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); 30typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
24 31
25DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); 32DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
133 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 140 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
134 int i; 141 int i;
135 142
136 SEQ_printf(m, "\n");
137 SEQ_printf(m, "cpu: %d\n", cpu); 143 SEQ_printf(m, "cpu: %d\n", cpu);
138 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 144 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
139 SEQ_printf(m, " clock %d:\n", i); 145 SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
187 193
188#undef P 194#undef P
189#undef P_ns 195#undef P_ns
196 SEQ_printf(m, "\n");
190} 197}
191 198
192#ifdef CONFIG_GENERIC_CLOCKEVENTS 199#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
195{ 202{
196 struct clock_event_device *dev = td->evtdev; 203 struct clock_event_device *dev = td->evtdev;
197 204
198 SEQ_printf(m, "\n");
199 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 205 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
200 if (cpu < 0) 206 if (cpu < 0)
201 SEQ_printf(m, "Broadcast device\n"); 207 SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
230 print_name_offset(m, dev->event_handler); 236 print_name_offset(m, dev->event_handler);
231 SEQ_printf(m, "\n"); 237 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries); 238 SEQ_printf(m, " retries: %lu\n", dev->retries);
239 SEQ_printf(m, "\n");
233} 240}
234 241
235static void timer_list_show_tickdevices(struct seq_file *m) 242static void timer_list_show_tickdevices_header(struct seq_file *m)
236{ 243{
237 int cpu;
238
239#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 244#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
240 print_tickdevice(m, tick_get_broadcast_device(), -1); 245 print_tickdevice(m, tick_get_broadcast_device(), -1);
241 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 246 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m)
246#endif 251#endif
247 SEQ_printf(m, "\n"); 252 SEQ_printf(m, "\n");
248#endif 253#endif
249 for_each_online_cpu(cpu)
250 print_tickdevice(m, tick_get_device(cpu), cpu);
251 SEQ_printf(m, "\n");
252} 254}
253#else
254static void timer_list_show_tickdevices(struct seq_file *m) { }
255#endif 255#endif
256 256
257static inline void timer_list_header(struct seq_file *m, u64 now)
258{
259 SEQ_printf(m, "Timer List Version: v0.7\n");
260 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
261 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
262 SEQ_printf(m, "\n");
263}
264
257static int timer_list_show(struct seq_file *m, void *v) 265static int timer_list_show(struct seq_file *m, void *v)
258{ 266{
267 struct timer_list_iter *iter = v;
268 u64 now = ktime_to_ns(ktime_get());
269
270 if (iter->cpu == -1 && !iter->second_pass)
271 timer_list_header(m, now);
272 else if (!iter->second_pass)
273 print_cpu(m, iter->cpu, iter->now);
274#ifdef CONFIG_GENERIC_CLOCKEVENTS
275 else if (iter->cpu == -1 && iter->second_pass)
276 timer_list_show_tickdevices_header(m);
277 else
278 print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
279#endif
280 return 0;
281}
282
283void sysrq_timer_list_show(void)
284{
259 u64 now = ktime_to_ns(ktime_get()); 285 u64 now = ktime_to_ns(ktime_get());
260 int cpu; 286 int cpu;
261 287
262 SEQ_printf(m, "Timer List Version: v0.7\n"); 288 timer_list_header(NULL, now);
263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
265 289
266 for_each_online_cpu(cpu) 290 for_each_online_cpu(cpu)
267 print_cpu(m, cpu, now); 291 print_cpu(NULL, cpu, now);
268 292
269 SEQ_printf(m, "\n"); 293#ifdef CONFIG_GENERIC_CLOCKEVENTS
270 timer_list_show_tickdevices(m); 294 timer_list_show_tickdevices_header(NULL);
295 for_each_online_cpu(cpu)
296 print_tickdevice(NULL, tick_get_device(cpu), cpu);
297#endif
298 return;
299}
271 300
272 return 0; 301static void *timer_list_start(struct seq_file *file, loff_t *offset)
302{
303 struct timer_list_iter *iter = file->private;
304
305 if (!*offset) {
306 iter->cpu = -1;
307 iter->now = ktime_to_ns(ktime_get());
308 } else if (iter->cpu >= nr_cpu_ids) {
309#ifdef CONFIG_GENERIC_CLOCKEVENTS
310 if (!iter->second_pass) {
311 iter->cpu = -1;
312 iter->second_pass = true;
313 } else
314 return NULL;
315#else
316 return NULL;
317#endif
318 }
319 return iter;
273} 320}
274 321
275void sysrq_timer_list_show(void) 322static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
323{
324 struct timer_list_iter *iter = file->private;
325 iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
326 ++*offset;
327 return timer_list_start(file, offset);
328}
329
330static void timer_list_stop(struct seq_file *seq, void *v)
276{ 331{
277 timer_list_show(NULL, NULL);
278} 332}
279 333
334static const struct seq_operations timer_list_sops = {
335 .start = timer_list_start,
336 .next = timer_list_next,
337 .stop = timer_list_stop,
338 .show = timer_list_show,
339};
340
280static int timer_list_open(struct inode *inode, struct file *filp) 341static int timer_list_open(struct inode *inode, struct file *filp)
281{ 342{
282 return single_open(filp, timer_list_show, NULL); 343 return seq_open_private(filp, &timer_list_sops,
344 sizeof(struct timer_list_iter));
283} 345}
284 346
285static const struct file_operations timer_list_fops = { 347static const struct file_operations timer_list_fops = {
286 .open = timer_list_open, 348 .open = timer_list_open,
287 .read = seq_read, 349 .read = seq_read,
288 .llseek = seq_lseek, 350 .llseek = seq_lseek,
289 .release = single_release, 351 .release = seq_release_private,
290}; 352};
291 353
292static int __init init_timer_list_procfs(void) 354static int __init init_timer_list_procfs(void)
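The timer_list conversion above trades single_open() for a stateful seq_file iterator so output is produced record by record instead of in one big buffer. Reduced to its essentials, the pattern looks like this (hypothetical foo names; a sketch, not the timer_list code itself):

#include <linux/fs.h>
#include <linux/seq_file.h>

struct foo_iter { int idx; };

static void *foo_start(struct seq_file *m, loff_t *pos)
{
	struct foo_iter *iter = m->private;	/* allocated by seq_open_private() */

	iter->idx = *pos;
	return iter->idx < 4 ? iter : NULL;	/* four records, then stop */
}

static void *foo_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return foo_start(m, pos);
}

static void foo_stop(struct seq_file *m, void *v) { }

static int foo_show(struct seq_file *m, void *v)
{
	struct foo_iter *iter = v;

	seq_printf(m, "record %d\n", iter->idx);
	return 0;
}

static const struct seq_operations foo_sops = {
	.start = foo_start,
	.next  = foo_next,
	.stop  = foo_stop,
	.show  = foo_show,
};

static int foo_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &foo_sops, sizeof(struct foo_iter));
}

static const struct file_operations foo_fops = {
	.open    = foo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};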
diff --git a/kernel/timer.c b/kernel/timer.c
index dbf7a78a1ef1..a860bba34412 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * linux/kernel/timer.c 2 * linux/kernel/timer.c
3 * 3 *
4 * Kernel internal timers, basic process system calls 4 * Kernel internal timers
5 * 5 *
6 * Copyright (C) 1991, 1992 Linus Torvalds 6 * Copyright (C) 1991, 1992 Linus Torvalds
7 * 7 *
@@ -41,6 +41,7 @@
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/sched/sysctl.h> 42#include <linux/sched/sysctl.h>
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/compat.h>
44 45
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
46#include <asm/unistd.h> 47#include <asm/unistd.h>
@@ -738,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
738 739
739 cpu = smp_processor_id(); 740 cpu = smp_processor_id();
740 741
741#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 742#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
742 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) 743 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
743 cpu = get_nohz_timer_target(); 744 cpu = get_nohz_timer_target();
744#endif 745#endif
@@ -930,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu)
930 debug_activate(timer, timer->expires); 931 debug_activate(timer, timer->expires);
931 internal_add_timer(base, timer); 932 internal_add_timer(base, timer);
932 /* 933 /*
933 * Check whether the other CPU is idle and needs to be 934 * Check whether the other CPU is in dynticks mode and needs
934 * triggered to reevaluate the timer wheel when nohz is 935 * to be triggered to reevaluate the timer wheel.
935 * active. We are protected against the other CPU fiddling 936 * We are protected against the other CPU fiddling
936 * with the timer by holding the timer base lock. This also 937 * with the timer by holding the timer base lock. This also
937 * makes sure that a CPU on the way to idle can not evaluate 938 * makes sure that a CPU on the way to stop its tick can not
938 * the timer wheel. 939 * evaluate the timer wheel.
939 */ 940 */
940 wake_up_idle_cpu(cpu); 941 wake_up_nohz_cpu(cpu);
941 spin_unlock_irqrestore(&base->lock, flags); 942 spin_unlock_irqrestore(&base->lock, flags);
942} 943}
943EXPORT_SYMBOL_GPL(add_timer_on); 944EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1188,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base)
1188 spin_unlock_irq(&base->lock); 1189 spin_unlock_irq(&base->lock);
1189} 1190}
1190 1191
1191#ifdef CONFIG_NO_HZ 1192#ifdef CONFIG_NO_HZ_COMMON
1192/* 1193/*
1193 * Find out when the next timer event is due to happen. This 1194 * Find out when the next timer event is due to happen. This
1194 * is used on S/390 to stop all activity when a CPU is idle. 1195 * is used on S/390 to stop all activity when a CPU is idle.
@@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1395 1396
1396#endif 1397#endif
1397 1398
1398/**
1399 * sys_getpid - return the thread group id of the current process
1400 *
1401 * Note, despite the name, this returns the tgid not the pid. The tgid and
1402 * the pid are identical unless CLONE_THREAD was specified on clone() in
1403 * which case the tgid is the same in all threads of the same group.
1404 *
1405 * This is SMP safe as current->tgid does not change.
1406 */
1407SYSCALL_DEFINE0(getpid)
1408{
1409 return task_tgid_vnr(current);
1410}
1411
1412/*
1413 * Accessing ->real_parent is not SMP-safe, it could
1414 * change from under us. However, we can use a stale
1415 * value of ->real_parent under rcu_read_lock(), see
1416 * release_task()->call_rcu(delayed_put_task_struct).
1417 */
1418SYSCALL_DEFINE0(getppid)
1419{
1420 int pid;
1421
1422 rcu_read_lock();
1423 pid = task_tgid_vnr(rcu_dereference(current->real_parent));
1424 rcu_read_unlock();
1425
1426 return pid;
1427}
1428
1429SYSCALL_DEFINE0(getuid)
1430{
1431 /* Only we change this so SMP safe */
1432 return from_kuid_munged(current_user_ns(), current_uid());
1433}
1434
1435SYSCALL_DEFINE0(geteuid)
1436{
1437 /* Only we change this so SMP safe */
1438 return from_kuid_munged(current_user_ns(), current_euid());
1439}
1440
1441SYSCALL_DEFINE0(getgid)
1442{
1443 /* Only we change this so SMP safe */
1444 return from_kgid_munged(current_user_ns(), current_gid());
1445}
1446
1447SYSCALL_DEFINE0(getegid)
1448{
1449 /* Only we change this so SMP safe */
1450 return from_kgid_munged(current_user_ns(), current_egid());
1451}
1452
1453static void process_timeout(unsigned long __data) 1399static void process_timeout(unsigned long __data)
1454{ 1400{
1455 wake_up_process((struct task_struct *)__data); 1401 wake_up_process((struct task_struct *)__data);
@@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1557} 1503}
1558EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1504EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1559 1505
1560/* Thread ID - the internal kernel "pid" */
1561SYSCALL_DEFINE0(gettid)
1562{
1563 return task_pid_vnr(current);
1564}
1565
1566/**
1567 * do_sysinfo - fill in sysinfo struct
1568 * @info: pointer to buffer to fill
1569 */
1570int do_sysinfo(struct sysinfo *info)
1571{
1572 unsigned long mem_total, sav_total;
1573 unsigned int mem_unit, bitcount;
1574 struct timespec tp;
1575
1576 memset(info, 0, sizeof(struct sysinfo));
1577
1578 ktime_get_ts(&tp);
1579 monotonic_to_bootbased(&tp);
1580 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1581
1582 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1583
1584 info->procs = nr_threads;
1585
1586 si_meminfo(info);
1587 si_swapinfo(info);
1588
1589 /*
1590 * If the sum of all the available memory (i.e. ram + swap)
1591 * is less than can be stored in a 32 bit unsigned long then
1592 * we can be binary compatible with 2.2.x kernels. If not,
1593 * well, in that case 2.2.x was broken anyways...
1594 *
1595 * -Erik Andersen <andersee@debian.org>
1596 */
1597
1598 mem_total = info->totalram + info->totalswap;
1599 if (mem_total < info->totalram || mem_total < info->totalswap)
1600 goto out;
1601 bitcount = 0;
1602 mem_unit = info->mem_unit;
1603 while (mem_unit > 1) {
1604 bitcount++;
1605 mem_unit >>= 1;
1606 sav_total = mem_total;
1607 mem_total <<= 1;
1608 if (mem_total < sav_total)
1609 goto out;
1610 }
1611
1612 /*
1613 * If mem_total did not overflow, multiply all memory values by
1614 * info->mem_unit and set it to 1. This leaves things compatible
1615 * with 2.2.x, and also retains compatibility with earlier 2.4.x
1616 * kernels...
1617 */
1618
1619 info->mem_unit = 1;
1620 info->totalram <<= bitcount;
1621 info->freeram <<= bitcount;
1622 info->sharedram <<= bitcount;
1623 info->bufferram <<= bitcount;
1624 info->totalswap <<= bitcount;
1625 info->freeswap <<= bitcount;
1626 info->totalhigh <<= bitcount;
1627 info->freehigh <<= bitcount;
1628
1629out:
1630 return 0;
1631}
1632
1633SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1634{
1635 struct sysinfo val;
1636
1637 do_sysinfo(&val);
1638
1639 if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1640 return -EFAULT;
1641
1642 return 0;
1643}
1644
1645static int __cpuinit init_timers_cpu(int cpu) 1506static int __cpuinit init_timers_cpu(int cpu)
1646{ 1507{
1647 int j; 1508 int j;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 192473b22799..5e9efd4b83a4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
176 select GENERIC_TRACER 176 select GENERIC_TRACER
177 select TRACER_MAX_TRACE 177 select TRACER_MAX_TRACE
178 select RING_BUFFER_ALLOW_SWAP 178 select RING_BUFFER_ALLOW_SWAP
179 select TRACER_SNAPSHOT
180 select TRACER_SNAPSHOT_PER_CPU_SWAP
179 help 181 help
180 This option measures the time spent in irqs-off critical 182 This option measures the time spent in irqs-off critical
181 sections, with microsecond accuracy. 183 sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
198 select GENERIC_TRACER 200 select GENERIC_TRACER
199 select TRACER_MAX_TRACE 201 select TRACER_MAX_TRACE
200 select RING_BUFFER_ALLOW_SWAP 202 select RING_BUFFER_ALLOW_SWAP
203 select TRACER_SNAPSHOT
204 select TRACER_SNAPSHOT_PER_CPU_SWAP
201 help 205 help
202 This option measures the time spent in preemption-off critical 206 This option measures the time spent in preemption-off critical
203 sections, with microsecond accuracy. 207 sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
217 select GENERIC_TRACER 221 select GENERIC_TRACER
218 select CONTEXT_SWITCH_TRACER 222 select CONTEXT_SWITCH_TRACER
219 select TRACER_MAX_TRACE 223 select TRACER_MAX_TRACE
224 select TRACER_SNAPSHOT
220 help 225 help
221 This tracer tracks the latency of the highest priority task 226 This tracer tracks the latency of the highest priority task
222 to be scheduled in, starting from the point it has woken up. 227 to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
248 echo 1 > /sys/kernel/debug/tracing/snapshot 253 echo 1 > /sys/kernel/debug/tracing/snapshot
249 cat snapshot 254 cat snapshot
250 255
256config TRACER_SNAPSHOT_PER_CPU_SWAP
257 bool "Allow snapshot to swap per CPU"
258 depends on TRACER_SNAPSHOT
259 select RING_BUFFER_ALLOW_SWAP
260 help
261 Allow doing a snapshot of a single CPU buffer instead of a
262 full swap (all buffers). If this is set, then the following is
263 allowed:
264
265 echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
266
 267 After that, only the tracing buffer for CPU 2 is swapped with
 268 the main tracing buffer, and the other CPU buffers remain unchanged.
 269
 270 When this is enabled, it adds a little more overhead to trace
 271 recording, as extra checks are needed to synchronize recording
 272 with swaps. This does not affect the performance of the overall
 273 system. It is enabled by default when the preempt or irq latency
 274 tracers are enabled, as those need to swap as well and already
 275 add that overhead (plus a lot more).
276
251config TRACE_BRANCH_PROFILING 277config TRACE_BRANCH_PROFILING
252 bool 278 bool
253 select GENERIC_TRACER 279 select GENERIC_TRACER
@@ -414,24 +440,28 @@ config PROBE_EVENTS
414 def_bool n 440 def_bool n
415 441
416config DYNAMIC_FTRACE 442config DYNAMIC_FTRACE
417 bool "enable/disable ftrace tracepoints dynamically" 443 bool "enable/disable function tracing dynamically"
418 depends on FUNCTION_TRACER 444 depends on FUNCTION_TRACER
419 depends on HAVE_DYNAMIC_FTRACE 445 depends on HAVE_DYNAMIC_FTRACE
420 default y 446 default y
421 help 447 help
422 This option will modify all the calls to ftrace dynamically 448 This option will modify all the calls to function tracing
423 (will patch them out of the binary image and replace them 449 dynamically (will patch them out of the binary image and
424 with a No-Op instruction) as they are called. A table is 450 replace them with a No-Op instruction) on boot up. During
425 created to dynamically enable them again. 451 compile time, a table is made of all the locations that ftrace
452 can function trace, and this table is linked into the kernel
453 image. When this is enabled, functions can be individually
454 enabled, and the functions not enabled will not affect
455 performance of the system.
456
457 See the files in /sys/kernel/debug/tracing:
458 available_filter_functions
459 set_ftrace_filter
460 set_ftrace_notrace
426 461
427 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but 462 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
428 otherwise has native performance as long as no tracing is active. 463 otherwise has native performance as long as no tracing is active.
429 464
430 The changes to the code are done by a kernel thread that
431 wakes up once a second and checks to see if any ftrace calls
432 were made. If so, it runs stop_machine (stops all CPUS)
433 and modifies the code to jump over the call to ftrace.
434
435config DYNAMIC_FTRACE_WITH_REGS 465config DYNAMIC_FTRACE_WITH_REGS
436 def_bool y 466 def_bool y
437 depends on DYNAMIC_FTRACE 467 depends on DYNAMIC_FTRACE
@@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK
520 550
521 If unsure, say N. 551 If unsure, say N.
522 552
553config RING_BUFFER_STARTUP_TEST
554 bool "Ring buffer startup self test"
555 depends on RING_BUFFER
556 help
557 Run a simple self test on the ring buffer on boot up. Late in the
558 kernel boot sequence, the test will start that kicks off
559 a thread per cpu. Each thread will write various size events
560 into the ring buffer. Another thread is created to send IPIs
561 to each of the threads, where the IPI handler will also write
562 to the ring buffer, to test/stress the nesting ability.
563 If any anomalies are discovered, a warning will be displayed
564 and all ring buffers will be disabled.
565
566 The test runs for 10 seconds. This will slow your boot time
567 by at least 10 more seconds.
568
 569 At the end of the test, statistics and further checks are done.
 570 It will output the stats of each per-CPU buffer: what
571 was written, the sizes, what was read, what was lost, and
572 other similar details.
573
 574 If unsure, say N.
575
523endif # FTRACE 576endif # FTRACE
524 577
525endif # TRACING_SUPPORT 578endif # TRACING_SUPPORT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..ed58a3216a6d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
72 bool blk_tracer = blk_tracer_enabled; 72 bool blk_tracer = blk_tracer_enabled;
73 73
74 if (blk_tracer) { 74 if (blk_tracer) {
75 buffer = blk_tr->buffer; 75 buffer = blk_tr->trace_buffer.buffer;
76 pc = preempt_count(); 76 pc = preempt_count();
77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 77 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
78 sizeof(*t) + len, 78 sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
218 if (blk_tracer) { 218 if (blk_tracer) {
219 tracing_record_cmdline(current); 219 tracing_record_cmdline(current);
220 220
221 buffer = blk_tr->buffer; 221 buffer = blk_tr->trace_buffer.buffer;
222 pc = preempt_count(); 222 pc = preempt_count();
223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 223 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
224 sizeof(*t) + pdu_len, 224 sizeof(*t) + pdu_len,
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
739 struct request_queue *q, 739 struct request_queue *q,
740 struct request *rq) 740 struct request *rq)
741{ 741{
742 struct blk_trace *bt = q->blk_trace;
743
744 /* if control ever passes through here, it's a request based driver */
745 if (unlikely(bt && !bt->rq_based))
746 bt->rq_based = true;
747
748 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 742 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
749} 743}
750 744
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
780 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 774 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
781} 775}
782 776
783static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) 777static void blk_add_trace_bio_complete(void *ignore,
778 struct request_queue *q, struct bio *bio,
779 int error)
784{ 780{
785 struct request_queue *q;
786 struct blk_trace *bt;
787
788 if (!bio->bi_bdev)
789 return;
790
791 q = bdev_get_queue(bio->bi_bdev);
792 bt = q->blk_trace;
793
794 /*
795 * Request based drivers will generate both rq and bio completions.
796 * Ignore bio ones.
797 */
798 if (likely(!bt) || bt->rq_based)
799 return;
800
801 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 781 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
802} 782}
803 783
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ab25b88aae56..8a5c017bb50c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
66 66
67static struct ftrace_ops ftrace_list_end __read_mostly = { 67static struct ftrace_ops ftrace_list_end __read_mostly = {
68 .func = ftrace_stub, 68 .func = ftrace_stub,
69 .flags = FTRACE_OPS_FL_RECURSION_SAFE, 69 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
70}; 70};
71 71
72/* ftrace_enabled is a method to turn ftrace on or off */ 72/* ftrace_enabled is a method to turn ftrace on or off */
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
486#define PROFILES_PER_PAGE \ 486#define PROFILES_PER_PAGE \
487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) 487 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
488 488
489static int ftrace_profile_bits __read_mostly;
490static int ftrace_profile_enabled __read_mostly; 489static int ftrace_profile_enabled __read_mostly;
491 490
492/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ 491/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
494 493
495static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); 494static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
496 495
497#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ 496#define FTRACE_PROFILE_HASH_BITS 10
497#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
498 498
499static void * 499static void *
500function_stat_next(void *v, int idx) 500function_stat_next(void *v, int idx)
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
676 676
677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); 677 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
678 678
679 for (i = 0; i < pages; i++) { 679 for (i = 1; i < pages; i++) {
680 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 680 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
681 if (!pg->next) 681 if (!pg->next)
682 goto out_free; 682 goto out_free;
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
694 free_page(tmp); 694 free_page(tmp);
695 } 695 }
696 696
697 free_page((unsigned long)stat->pages);
698 stat->pages = NULL; 697 stat->pages = NULL;
699 stat->start = NULL; 698 stat->start = NULL;
700 699
@@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)
725 if (!stat->hash) 724 if (!stat->hash)
726 return -ENOMEM; 725 return -ENOMEM;
727 726
728 if (!ftrace_profile_bits) {
729 size--;
730
731 for (; size; size >>= 1)
732 ftrace_profile_bits++;
733 }
734
735 /* Preallocate the function profiling pages */ 727 /* Preallocate the function profiling pages */
736 if (ftrace_profile_pages_init(stat) < 0) { 728 if (ftrace_profile_pages_init(stat) < 0) {
737 kfree(stat->hash); 729 kfree(stat->hash);
@@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
764 struct hlist_head *hhd; 756 struct hlist_head *hhd;
765 unsigned long key; 757 unsigned long key;
766 758
767 key = hash_long(ip, ftrace_profile_bits); 759 key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
768 hhd = &stat->hash[key]; 760 hhd = &stat->hash[key];
769 761
770 if (hlist_empty(hhd)) 762 if (hlist_empty(hhd))
@@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
783{ 775{
784 unsigned long key; 776 unsigned long key;
785 777
786 key = hash_long(rec->ip, ftrace_profile_bits); 778 key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
787 hlist_add_head_rcu(&rec->node, &stat->hash[key]); 779 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
788} 780}
789 781
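Both hunks above keep the same fixed-bits hashing idiom: size the table as 1 << bits and pick a bucket with hash_long(key, bits). In isolation (hypothetical foo names):

#include <linux/hash.h>
#include <linux/list.h>

#define FOO_HASH_BITS	10
#define FOO_HASH_SIZE	(1 << FOO_HASH_BITS)

static struct hlist_head foo_hash[FOO_HASH_SIZE];

struct foo_rec {
	struct hlist_node node;
	unsigned long ip;
};

static void foo_add(struct foo_rec *rec)
{
	unsigned long key = hash_long(rec->ip, FOO_HASH_BITS);

	hlist_add_head(&rec->node, &foo_hash[key]);
}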
@@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
1053 1045
1054static struct pid * const ftrace_swapper_pid = &init_struct_pid; 1046static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1055 1047
1048loff_t
1049ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
1050{
1051 loff_t ret;
1052
1053 if (file->f_mode & FMODE_READ)
1054 ret = seq_lseek(file, offset, whence);
1055 else
1056 file->f_pos = ret = 1;
1057
1058 return ret;
1059}
1060
1056#ifdef CONFIG_DYNAMIC_FTRACE 1061#ifdef CONFIG_DYNAMIC_FTRACE
1057 1062
1058#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1063#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1067,7 +1072,7 @@ struct ftrace_func_probe {
1067 unsigned long flags; 1072 unsigned long flags;
1068 unsigned long ip; 1073 unsigned long ip;
1069 void *data; 1074 void *data;
1070 struct rcu_head rcu; 1075 struct list_head free_list;
1071}; 1076};
1072 1077
1073struct ftrace_func_entry { 1078struct ftrace_func_entry {
@@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1317 struct hlist_head *hhd; 1322 struct hlist_head *hhd;
1318 struct ftrace_hash *old_hash; 1323 struct ftrace_hash *old_hash;
1319 struct ftrace_hash *new_hash; 1324 struct ftrace_hash *new_hash;
1320 unsigned long key;
1321 int size = src->count; 1325 int size = src->count;
1322 int bits = 0; 1326 int bits = 0;
1323 int ret; 1327 int ret;
@@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1360 for (i = 0; i < size; i++) { 1364 for (i = 0; i < size; i++) {
1361 hhd = &src->buckets[i]; 1365 hhd = &src->buckets[i];
1362 hlist_for_each_entry_safe(entry, tn, hhd, hlist) { 1366 hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
1363 if (bits > 0)
1364 key = hash_long(entry->ip, bits);
1365 else
1366 key = 0;
1367 remove_hash_entry(src, entry); 1367 remove_hash_entry(src, entry);
1368 __add_hash_entry(new_hash, entry); 1368 __add_hash_entry(new_hash, entry);
1369 } 1369 }
@@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
2613 * routine, you can use ftrace_filter_write() for the write 2613 * routine, you can use ftrace_filter_write() for the write
2614 * routine if @flag has FTRACE_ITER_FILTER set, or 2614 * routine if @flag has FTRACE_ITER_FILTER set, or
2615 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 2615 * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
2616 * ftrace_regex_lseek() should be used as the lseek routine, and 2616 * ftrace_filter_lseek() should be used as the lseek routine, and
2617 * release must call ftrace_regex_release(). 2617 * release must call ftrace_regex_release().
2618 */ 2618 */
2619int 2619int
@@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
2697 inode, file); 2697 inode, file);
2698} 2698}
2699 2699
2700loff_t
2701ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
2702{
2703 loff_t ret;
2704
2705 if (file->f_mode & FMODE_READ)
2706 ret = seq_lseek(file, offset, whence);
2707 else
2708 file->f_pos = ret = 1;
2709
2710 return ret;
2711}
2712
2713static int ftrace_match(char *str, char *regex, int len, int type) 2700static int ftrace_match(char *str, char *regex, int len, int type)
2714{ 2701{
2715 int matched = 0; 2702 int matched = 0;
@@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)
2974} 2961}
2975 2962
2976 2963
2977static void ftrace_free_entry_rcu(struct rcu_head *rhp) 2964static void ftrace_free_entry(struct ftrace_func_probe *entry)
2978{ 2965{
2979 struct ftrace_func_probe *entry =
2980 container_of(rhp, struct ftrace_func_probe, rcu);
2981
2982 if (entry->ops->free) 2966 if (entry->ops->free)
2983 entry->ops->free(&entry->data); 2967 entry->ops->free(entry->ops, entry->ip, &entry->data);
2984 kfree(entry); 2968 kfree(entry);
2985} 2969}
2986 2970
2987
2988int 2971int
2989register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 2972register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2990 void *data) 2973 void *data)
2991{ 2974{
2992 struct ftrace_func_probe *entry; 2975 struct ftrace_func_probe *entry;
2976 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
2977 struct ftrace_hash *hash;
2993 struct ftrace_page *pg; 2978 struct ftrace_page *pg;
2994 struct dyn_ftrace *rec; 2979 struct dyn_ftrace *rec;
2995 int type, len, not; 2980 int type, len, not;
2996 unsigned long key; 2981 unsigned long key;
2997 int count = 0; 2982 int count = 0;
2998 char *search; 2983 char *search;
2984 int ret;
2999 2985
3000 type = filter_parse_regex(glob, strlen(glob), &search, &not); 2986 type = filter_parse_regex(glob, strlen(glob), &search, &not);
3001 len = strlen(search); 2987 len = strlen(search);
@@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3006 2992
3007 mutex_lock(&ftrace_lock); 2993 mutex_lock(&ftrace_lock);
3008 2994
3009 if (unlikely(ftrace_disabled)) 2995 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
2996 if (!hash) {
2997 count = -ENOMEM;
2998 goto out_unlock;
2999 }
3000
3001 if (unlikely(ftrace_disabled)) {
3002 count = -ENODEV;
3010 goto out_unlock; 3003 goto out_unlock;
3004 }
3011 3005
3012 do_for_each_ftrace_rec(pg, rec) { 3006 do_for_each_ftrace_rec(pg, rec) {
3013 3007
@@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3031 * for each function we find. We call the callback 3025 * for each function we find. We call the callback
3032 * to give the caller an opportunity to do so. 3026 * to give the caller an opportunity to do so.
3033 */ 3027 */
3034 if (ops->callback) { 3028 if (ops->init) {
3035 if (ops->callback(rec->ip, &entry->data) < 0) { 3029 if (ops->init(ops, rec->ip, &entry->data) < 0) {
3036 /* caller does not like this func */ 3030 /* caller does not like this func */
3037 kfree(entry); 3031 kfree(entry);
3038 continue; 3032 continue;
3039 } 3033 }
3040 } 3034 }
3041 3035
3036 ret = enter_record(hash, rec, 0);
3037 if (ret < 0) {
3038 kfree(entry);
3039 count = ret;
3040 goto out_unlock;
3041 }
3042
3042 entry->ops = ops; 3043 entry->ops = ops;
3043 entry->ip = rec->ip; 3044 entry->ip = rec->ip;
3044 3045
@@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3046 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); 3047 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
3047 3048
3048 } while_for_each_ftrace_rec(); 3049 } while_for_each_ftrace_rec();
3050
3051 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3052 if (ret < 0)
3053 count = ret;
3054
3049 __enable_ftrace_function_probe(); 3055 __enable_ftrace_function_probe();
3050 3056
3051 out_unlock: 3057 out_unlock:
3052 mutex_unlock(&ftrace_lock); 3058 mutex_unlock(&ftrace_lock);
3059 free_ftrace_hash(hash);
3053 3060
3054 return count; 3061 return count;
3055} 3062}
@@ -3063,7 +3070,12 @@ static void
3063__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, 3070__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3064 void *data, int flags) 3071 void *data, int flags)
3065{ 3072{
3073 struct ftrace_func_entry *rec_entry;
3066 struct ftrace_func_probe *entry; 3074 struct ftrace_func_probe *entry;
3075 struct ftrace_func_probe *p;
3076 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
3077 struct list_head free_list;
3078 struct ftrace_hash *hash;
3067 struct hlist_node *tmp; 3079 struct hlist_node *tmp;
3068 char str[KSYM_SYMBOL_LEN]; 3080 char str[KSYM_SYMBOL_LEN];
3069 int type = MATCH_FULL; 3081 int type = MATCH_FULL;
@@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3084 } 3096 }
3085 3097
3086 mutex_lock(&ftrace_lock); 3098 mutex_lock(&ftrace_lock);
3099
3100 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3101 if (!hash)
3102 /* Hmm, should report this somehow */
3103 goto out_unlock;
3104
3105 INIT_LIST_HEAD(&free_list);
3106
3087 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { 3107 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
3088 struct hlist_head *hhd = &ftrace_func_hash[i]; 3108 struct hlist_head *hhd = &ftrace_func_hash[i];
3089 3109
@@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3104 continue; 3124 continue;
3105 } 3125 }
3106 3126
3107 hlist_del(&entry->node); 3127 rec_entry = ftrace_lookup_ip(hash, entry->ip);
3108 call_rcu(&entry->rcu, ftrace_free_entry_rcu); 3128 /* It is possible more than one entry had this ip */
3129 if (rec_entry)
3130 free_hash_entry(hash, rec_entry);
3131
3132 hlist_del_rcu(&entry->node);
3133 list_add(&entry->free_list, &free_list);
3109 } 3134 }
3110 } 3135 }
3111 __disable_ftrace_function_probe(); 3136 __disable_ftrace_function_probe();
3137 /*
3138 * Remove after the disable is called. Otherwise, if the last
3139 * probe is removed, a null hash means *all enabled*.
3140 */
3141 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3142 synchronize_sched();
3143 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3144 list_del(&entry->free_list);
3145 ftrace_free_entry(entry);
3146 }
3147
3148 out_unlock:
3112 mutex_unlock(&ftrace_lock); 3149 mutex_unlock(&ftrace_lock);
3150 free_ftrace_hash(hash);
3113} 3151}
3114 3152
3115void 3153void
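The unregister path above uses the standard RCU removal sequence: unlink with hlist_del_rcu(), park the entries on a local list, wait out a grace period, then free. In outline (hypothetical foo names; synchronize_sched() as this code base still uses it):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo_probe {
	struct hlist_node node;		/* hashed, read under RCU */
	struct list_head free_list;	/* local collection for freeing */
};

static void foo_remove_all(struct hlist_head *hhd)
{
	struct foo_probe *p, *n;
	struct hlist_node *tmp;
	LIST_HEAD(free_list);

	hlist_for_each_entry_safe(p, tmp, hhd, node) {
		hlist_del_rcu(&p->node);
		list_add(&p->free_list, &free_list);
	}

	synchronize_sched();		/* readers of the old list are done */

	list_for_each_entry_safe(p, n, &free_list, free_list) {
		list_del(&p->free_list);
		kfree(p);
	}
}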
@@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
3441 3479
3442static int __init set_ftrace_notrace(char *str) 3480static int __init set_ftrace_notrace(char *str)
3443{ 3481{
3444 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); 3482 strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
3445 return 1; 3483 return 1;
3446} 3484}
3447__setup("ftrace_notrace=", set_ftrace_notrace); 3485__setup("ftrace_notrace=", set_ftrace_notrace);
3448 3486
3449static int __init set_ftrace_filter(char *str) 3487static int __init set_ftrace_filter(char *str)
3450{ 3488{
3451 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); 3489 strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
3452 return 1; 3490 return 1;
3453} 3491}
3454__setup("ftrace_filter=", set_ftrace_filter); 3492__setup("ftrace_filter=", set_ftrace_filter);
@@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {
3571 .open = ftrace_filter_open, 3609 .open = ftrace_filter_open,
3572 .read = seq_read, 3610 .read = seq_read,
3573 .write = ftrace_filter_write, 3611 .write = ftrace_filter_write,
3574 .llseek = ftrace_regex_lseek, 3612 .llseek = ftrace_filter_lseek,
3575 .release = ftrace_regex_release, 3613 .release = ftrace_regex_release,
3576}; 3614};
3577 3615
@@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {
3579 .open = ftrace_notrace_open, 3617 .open = ftrace_notrace_open,
3580 .read = seq_read, 3618 .read = seq_read,
3581 .write = ftrace_notrace_write, 3619 .write = ftrace_notrace_write,
3582 .llseek = ftrace_regex_lseek, 3620 .llseek = ftrace_filter_lseek,
3583 .release = ftrace_regex_release, 3621 .release = ftrace_regex_release,
3584}; 3622};
3585 3623
@@ -3737,7 +3775,8 @@ out:
3737 if (fail) 3775 if (fail)
3738 return -EINVAL; 3776 return -EINVAL;
3739 3777
3740 ftrace_graph_filter_enabled = 1; 3778 ftrace_graph_filter_enabled = !!(*idx);
3779
3741 return 0; 3780 return 0;
3742} 3781}
3743 3782
@@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {
3784 .open = ftrace_graph_open, 3823 .open = ftrace_graph_open,
3785 .read = seq_read, 3824 .read = seq_read,
3786 .write = ftrace_graph_write, 3825 .write = ftrace_graph_write,
3826 .llseek = ftrace_filter_lseek,
3787 .release = ftrace_graph_release, 3827 .release = ftrace_graph_release,
3788 .llseek = seq_lseek,
3789}; 3828};
3790#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 3829#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
3791 3830
@@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4131 preempt_disable_notrace(); 4170 preempt_disable_notrace();
4132 trace_recursion_set(TRACE_CONTROL_BIT); 4171 trace_recursion_set(TRACE_CONTROL_BIT);
4133 do_for_each_ftrace_op(op, ftrace_control_list) { 4172 do_for_each_ftrace_op(op, ftrace_control_list) {
4134 if (!ftrace_function_local_disabled(op) && 4173 if (!(op->flags & FTRACE_OPS_FL_STUB) &&
4174 !ftrace_function_local_disabled(op) &&
4135 ftrace_ops_test(op, ip)) 4175 ftrace_ops_test(op, ip))
4136 op->func(ip, parent_ip, op, regs); 4176 op->func(ip, parent_ip, op, regs);
4137 } while_for_each_ftrace_op(op); 4177 } while_for_each_ftrace_op(op);
@@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {
4439 .open = ftrace_pid_open, 4479 .open = ftrace_pid_open,
4440 .write = ftrace_pid_write, 4480 .write = ftrace_pid_write,
4441 .read = seq_read, 4481 .read = seq_read,
4442 .llseek = seq_lseek, 4482 .llseek = ftrace_filter_lseek,
4443 .release = ftrace_pid_release, 4483 .release = ftrace_pid_release,
4444}; 4484};
4445 4485
@@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
4555 ftrace_startup_sysctl(); 4595 ftrace_startup_sysctl();
4556 4596
4557 /* we are starting ftrace again */ 4597 /* we are starting ftrace again */
4558 if (ftrace_ops_list != &ftrace_list_end) { 4598 if (ftrace_ops_list != &ftrace_list_end)
4559 if (ftrace_ops_list->next == &ftrace_list_end) 4599 update_ftrace_function();
4560 ftrace_trace_function = ftrace_ops_list->func;
4561 else
4562 ftrace_trace_function = ftrace_ops_list_func;
4563 }
4564 4600
4565 } else { 4601 } else {
4566 /* stopping ftrace calls (just send to ftrace_stub) */ 4602 /* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6989df2ba194..b59aea2c48c2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,13 +8,16 @@
8#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h> 9#include <linux/trace_seq.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/irq_work.h>
11#include <linux/debugfs.h> 12#include <linux/debugfs.h>
12#include <linux/uaccess.h> 13#include <linux/uaccess.h>
13#include <linux/hardirq.h> 14#include <linux/hardirq.h>
15#include <linux/kthread.h> /* for self test */
14#include <linux/kmemcheck.h> 16#include <linux/kmemcheck.h>
15#include <linux/module.h> 17#include <linux/module.h>
16#include <linux/percpu.h> 18#include <linux/percpu.h>
17#include <linux/mutex.h> 19#include <linux/mutex.h>
20#include <linux/delay.h>
18#include <linux/slab.h> 21#include <linux/slab.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/hash.h> 23#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
444 return ret; 447 return ret;
445} 448}
446 449
450struct rb_irq_work {
451 struct irq_work work;
452 wait_queue_head_t waiters;
453 bool waiters_pending;
454};
455
447/* 456/*
448 * head_page == tail_page && head == tail then buffer is empty. 457 * head_page == tail_page && head == tail then buffer is empty.
449 */ 458 */
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
478 struct list_head new_pages; /* new pages to add */ 487 struct list_head new_pages; /* new pages to add */
479 struct work_struct update_pages_work; 488 struct work_struct update_pages_work;
480 struct completion update_done; 489 struct completion update_done;
490
491 struct rb_irq_work irq_work;
481}; 492};
482 493
483struct ring_buffer { 494struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
497 struct notifier_block cpu_notify; 508 struct notifier_block cpu_notify;
498#endif 509#endif
499 u64 (*clock)(void); 510 u64 (*clock)(void);
511
512 struct rb_irq_work irq_work;
500}; 513};
501 514
502struct ring_buffer_iter { 515struct ring_buffer_iter {
@@ -508,6 +521,118 @@ struct ring_buffer_iter {
508 u64 read_stamp; 521 u64 read_stamp;
509}; 522};
510 523
524/*
525 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
526 *
527 * This is the irq_work callback queued by a writer; it wakes up any
528 * task that is blocked on the ring buffer waiters queue.
529 */
530static void rb_wake_up_waiters(struct irq_work *work)
531{
532 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
533
534 wake_up_all(&rbwork->waiters);
535}
536
537/**
538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on
541 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer.
545 */
546void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
547{
548 struct ring_buffer_per_cpu *cpu_buffer;
549 DEFINE_WAIT(wait);
550 struct rb_irq_work *work;
551
552 /*
553 * Depending on what the caller is waiting for (data in any
554 * cpu buffer, or in one specific cpu buffer), put the
555 * caller on the appropriate wait queue.
556 */
557 if (cpu == RING_BUFFER_ALL_CPUS)
558 work = &buffer->irq_work;
559 else {
560 cpu_buffer = buffer->buffers[cpu];
561 work = &cpu_buffer->irq_work;
562 }
563
564
565 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
566
567 /*
568 * The events can happen in critical sections where
569 * checking a work queue can cause deadlocks.
570 * After adding a task to the queue, this flag is set
571 * only to notify events to try to wake up the queue
572 * using irq_work.
573 *
574 * We don't clear it even if the buffer is no longer
575 * empty. The flag only causes the next event to run
576 * irq_work to do the work queue wake up. The worst
577 * that can happen if we race with !trace_empty() is that
578 * an event will cause an irq_work to try to wake up
579 * an empty queue.
580 *
581 * There's no reason to protect this flag either, as
582 * the work queue and irq_work logic will do the necessary
583 * synchronization for the wake ups. The only thing
584 * that is necessary is that the wake up happens after
585 * a task has been queued. It's OK for spurious wake ups.
586 */
587 work->waiters_pending = true;
588
589 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
590 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
591 schedule();
592
593 finish_wait(&work->waiters, &wait);
594}
595
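For illustration only (not part of the patch): a minimal sketch of how a kernel-side reader could pair with the wait API added above. The consumer function and its pr_info() reporting are hypothetical; only ring_buffer_wait(), ring_buffer_consume() and ring_buffer_event_length() are existing APIs.

#include <linux/ring_buffer.h>
#include <linux/printk.h>

/* Hypothetical consumer: block until a writer commits, then drain @cpu. */
static void example_drain_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	unsigned long lost;
	u64 ts;

	/* Sleeps until rb_wakeups() queues the irq_work from a commit. */
	ring_buffer_wait(buffer, cpu);

	while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost)))
		pr_info("got %u byte event, %lu lost\n",
			ring_buffer_event_length(event), lost);
}
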
596/**
597 * ring_buffer_poll_wait - poll on buffer input
598 * @buffer: buffer to wait on
599 * @cpu: the cpu buffer to wait on
600 * @filp: the file descriptor
601 * @poll_table: The poll descriptor
602 *
603 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
604 * as data is added to any of the @buffer's cpu buffers. Otherwise
605 * it will wait for data to be added to a specific cpu buffer.
606 *
607 * Returns POLLIN | POLLRDNORM if data exists in the buffers,
608 * zero otherwise.
609 */
610int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
611 struct file *filp, poll_table *poll_table)
612{
613 struct ring_buffer_per_cpu *cpu_buffer;
614 struct rb_irq_work *work;
615
616 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
617 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
618 return POLLIN | POLLRDNORM;
619
620 if (cpu == RING_BUFFER_ALL_CPUS)
621 work = &buffer->irq_work;
622 else {
623 cpu_buffer = buffer->buffers[cpu];
624 work = &cpu_buffer->irq_work;
625 }
626
627 work->waiters_pending = true;
628 poll_wait(filp, &work->waiters, poll_table);
629
630 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
631 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
632 return POLLIN | POLLRDNORM;
633 return 0;
634}
635
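Another sketch, also not part of the patch: how a debugfs file's .poll method might forward to ring_buffer_poll_wait(). The example_priv layout and the function name are hypothetical.

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ring_buffer.h>

struct example_priv {			/* hypothetical per-file state */
	struct ring_buffer	*buffer;
	int			cpu;	/* or RING_BUFFER_ALL_CPUS */
};

static unsigned int example_poll(struct file *filp, poll_table *pt)
{
	struct example_priv *p = filp->private_data;

	/* Registers @filp on the waiters queue and reports readability. */
	return ring_buffer_poll_wait(p->buffer, p->cpu, filp, pt);
}
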
511/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 636/* buffer may be either ring_buffer or ring_buffer_per_cpu */
512#define RB_WARN_ON(b, cond) \ 637#define RB_WARN_ON(b, cond) \
513 ({ \ 638 ({ \
@@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
1063 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1188 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1064 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); 1189 INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
1065 init_completion(&cpu_buffer->update_done); 1190 init_completion(&cpu_buffer->update_done);
1191 init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
1192 init_waitqueue_head(&cpu_buffer->irq_work.waiters);
1066 1193
1067 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1194 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1068 GFP_KERNEL, cpu_to_node(cpu)); 1195 GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1158 buffer->clock = trace_clock_local; 1285 buffer->clock = trace_clock_local;
1159 buffer->reader_lock_key = key; 1286 buffer->reader_lock_key = key;
1160 1287
1288 init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
1289 init_waitqueue_head(&buffer->irq_work.waiters);
1290
1161 /* need at least two pages */ 1291 /* need at least two pages */
1162 if (nr_pages < 2) 1292 if (nr_pages < 2)
1163 nr_pages = 2; 1293 nr_pages = 2;
@@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1553 if (!cpu_buffer->nr_pages_to_update) 1683 if (!cpu_buffer->nr_pages_to_update)
1554 continue; 1684 continue;
1555 1685
1556 if (cpu_online(cpu)) 1686 /* The update must run on the CPU that is being updated. */
1687 preempt_disable();
1688 if (cpu == smp_processor_id() || !cpu_online(cpu)) {
1689 rb_update_pages(cpu_buffer);
1690 cpu_buffer->nr_pages_to_update = 0;
1691 } else {
1692 /*
1693 * Can not disable preemption for schedule_work_on()
1694 * on PREEMPT_RT.
1695 */
1696 preempt_enable();
1557 schedule_work_on(cpu, 1697 schedule_work_on(cpu,
1558 &cpu_buffer->update_pages_work); 1698 &cpu_buffer->update_pages_work);
1559 else 1699 preempt_disable();
1560 rb_update_pages(cpu_buffer); 1700 }
1701 preempt_enable();
1561 } 1702 }
1562 1703
1563 /* wait for all the updates to complete */ 1704 /* wait for all the updates to complete */
@@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
1595 1736
1596 get_online_cpus(); 1737 get_online_cpus();
1597 1738
1598 if (cpu_online(cpu_id)) { 1739 preempt_disable();
1740 /* The update must run on the CPU that is being updated. */
1741 if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
1742 rb_update_pages(cpu_buffer);
1743 else {
1744 /*
1745 * Can not disable preemption for schedule_work_on()
1746 * on PREEMPT_RT.
1747 */
1748 preempt_enable();
1599 schedule_work_on(cpu_id, 1749 schedule_work_on(cpu_id,
1600 &cpu_buffer->update_pages_work); 1750 &cpu_buffer->update_pages_work);
1601 wait_for_completion(&cpu_buffer->update_done); 1751 wait_for_completion(&cpu_buffer->update_done);
1602 } else 1752 preempt_disable();
1603 rb_update_pages(cpu_buffer); 1753 }
1754 preempt_enable();
1604 1755
1605 cpu_buffer->nr_pages_to_update = 0; 1756 cpu_buffer->nr_pages_to_update = 0;
1606 put_online_cpus(); 1757 put_online_cpus();
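For context only (not part of the patch): the general shape of the pattern used in both resize paths above, with hypothetical names. The update runs inline when the current CPU is the target (or the target is offline); otherwise it is handed to a work item pinned to that CPU, with preemption re-enabled first because schedule_work_on() cannot be called with preemption disabled on PREEMPT_RT.

#include <linux/preempt.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/workqueue.h>

/* Hypothetical helper showing the pattern; @work is expected to perform
 * the same update as @local_update when it runs on @cpu. */
static void example_update_on_cpu(int cpu, struct work_struct *work,
				  void (*local_update)(void *), void *arg)
{
	preempt_disable();
	if (cpu == smp_processor_id() || !cpu_online(cpu)) {
		local_update(arg);
		preempt_enable();
		return;
	}
	/* Drop preemption before schedule_work_on() (PREEMPT_RT). */
	preempt_enable();
	schedule_work_on(cpu, work);
}
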
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2612 rb_end_commit(cpu_buffer); 2763 rb_end_commit(cpu_buffer);
2613} 2764}
2614 2765
2766static __always_inline void
2767rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2768{
2769 if (buffer->irq_work.waiters_pending) {
2770 buffer->irq_work.waiters_pending = false;
2771		/* irq_work_queue() supplies its own memory barriers */
2772 irq_work_queue(&buffer->irq_work.work);
2773 }
2774
2775 if (cpu_buffer->irq_work.waiters_pending) {
2776 cpu_buffer->irq_work.waiters_pending = false;
2777		/* irq_work_queue() supplies its own memory barriers */
2778 irq_work_queue(&cpu_buffer->irq_work.work);
2779 }
2780}
2781
2615/** 2782/**
2616 * ring_buffer_unlock_commit - commit a reserved 2783 * ring_buffer_unlock_commit - commit a reserved
2617 * @buffer: The buffer to commit to 2784 * @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2631 2798
2632 rb_commit(cpu_buffer, event); 2799 rb_commit(cpu_buffer, event);
2633 2800
2801 rb_wakeups(buffer, cpu_buffer);
2802
2634 trace_recursive_unlock(); 2803 trace_recursive_unlock();
2635 2804
2636 preempt_enable_notrace(); 2805 preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
2803 2972
2804 rb_commit(cpu_buffer, event); 2973 rb_commit(cpu_buffer, event);
2805 2974
2975 rb_wakeups(buffer, cpu_buffer);
2976
2806 ret = 0; 2977 ret = 0;
2807 out: 2978 out:
2808 preempt_enable_notrace(); 2979 preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
4467 return NOTIFY_OK; 4638 return NOTIFY_OK;
4468} 4639}
4469#endif 4640#endif
4641
4642#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
4643/*
4644 * This is a basic integrity check of the ring buffer.
4645 * Late in the boot cycle this test will run when configured in.
4646 * It will kick off a thread per CPU that will go into a loop
4647 * writing to the per cpu ring buffer various sizes of data.
4648 * Some of the data will be large items, some small.
4649 *
4650 * Another thread is created that goes into a spin, sending out
4651 * IPIs to the other CPUs to also write into the ring buffer.
4652 * This is to test the nesting ability of the buffer.
4653 *
4654 * Basic stats are recorded and reported. If something unexpected
4655 * happens in the ring buffer, a big warning
4656 * is displayed and all ring buffers are disabled.
4657 */
4658static struct task_struct *rb_threads[NR_CPUS] __initdata;
4659
4660struct rb_test_data {
4661 struct ring_buffer *buffer;
4662 unsigned long events;
4663 unsigned long bytes_written;
4664 unsigned long bytes_alloc;
4665 unsigned long bytes_dropped;
4666 unsigned long events_nested;
4667 unsigned long bytes_written_nested;
4668 unsigned long bytes_alloc_nested;
4669 unsigned long bytes_dropped_nested;
4670 int min_size_nested;
4671 int max_size_nested;
4672 int max_size;
4673 int min_size;
4674 int cpu;
4675 int cnt;
4676};
4677
4678static struct rb_test_data rb_data[NR_CPUS] __initdata;
4679
4680/* 1 meg per cpu */
4681#define RB_TEST_BUFFER_SIZE 1048576
4682
4683static char rb_string[] __initdata =
4684 "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
4685 "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
4686 "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
4687
4688static bool rb_test_started __initdata;
4689
4690struct rb_item {
4691 int size;
4692 char str[];
4693};
4694
4695static __init int rb_write_something(struct rb_test_data *data, bool nested)
4696{
4697 struct ring_buffer_event *event;
4698 struct rb_item *item;
4699 bool started;
4700 int event_len;
4701 int size;
4702 int len;
4703 int cnt;
4704
4705	/* Have nested writes different than what is written */
4706 cnt = data->cnt + (nested ? 27 : 0);
4707
4708 /* Multiply cnt by ~e, to make some unique increment */
4709	size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
4710
4711 len = size + sizeof(struct rb_item);
4712
4713 started = rb_test_started;
4714 /* read rb_test_started before checking buffer enabled */
4715 smp_rmb();
4716
4717 event = ring_buffer_lock_reserve(data->buffer, len);
4718 if (!event) {
4719 /* Ignore dropped events before test starts. */
4720 if (started) {
4721 if (nested)
4722				data->bytes_dropped_nested += len;
4723			else
4724				data->bytes_dropped += len;
4725 }
4726 return len;
4727 }
4728
4729 event_len = ring_buffer_event_length(event);
4730
4731 if (RB_WARN_ON(data->buffer, event_len < len))
4732 goto out;
4733
4734 item = ring_buffer_event_data(event);
4735 item->size = size;
4736 memcpy(item->str, rb_string, size);
4737
4738 if (nested) {
4739 data->bytes_alloc_nested += event_len;
4740 data->bytes_written_nested += len;
4741 data->events_nested++;
4742 if (!data->min_size_nested || len < data->min_size_nested)
4743 data->min_size_nested = len;
4744 if (len > data->max_size_nested)
4745 data->max_size_nested = len;
4746 } else {
4747 data->bytes_alloc += event_len;
4748 data->bytes_written += len;
4749 data->events++;
4750 if (!data->min_size || len < data->min_size)
4751			data->min_size = len;
4752 if (len > data->max_size)
4753 data->max_size = len;
4754 }
4755
4756 out:
4757 ring_buffer_unlock_commit(data->buffer, event);
4758
4759 return 0;
4760}
4761
4762static __init int rb_test(void *arg)
4763{
4764 struct rb_test_data *data = arg;
4765
4766 while (!kthread_should_stop()) {
4767 rb_write_something(data, false);
4768 data->cnt++;
4769
4770 set_current_state(TASK_INTERRUPTIBLE);
4771 /* Now sleep between a min of 100-300us and a max of 1ms */
4772 usleep_range(((data->cnt % 3) + 1) * 100, 1000);
4773 }
4774
4775 return 0;
4776}
4777
4778static __init void rb_ipi(void *ignore)
4779{
4780 struct rb_test_data *data;
4781 int cpu = smp_processor_id();
4782
4783 data = &rb_data[cpu];
4784 rb_write_something(data, true);
4785}
4786
4787static __init int rb_hammer_test(void *arg)
4788{
4789 while (!kthread_should_stop()) {
4790
4791 /* Send an IPI to all cpus to write data! */
4792 smp_call_function(rb_ipi, NULL, 1);
4793 /* No sleep, but for non preempt, let others run */
4794 schedule();
4795 }
4796
4797 return 0;
4798}
4799
4800static __init int test_ringbuffer(void)
4801{
4802 struct task_struct *rb_hammer;
4803 struct ring_buffer *buffer;
4804 int cpu;
4805 int ret = 0;
4806
4807 pr_info("Running ring buffer tests...\n");
4808
4809 buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
4810 if (WARN_ON(!buffer))
4811 return 0;
4812
4813 /* Disable buffer so that threads can't write to it yet */
4814 ring_buffer_record_off(buffer);
4815
4816 for_each_online_cpu(cpu) {
4817 rb_data[cpu].buffer = buffer;
4818 rb_data[cpu].cpu = cpu;
4819 rb_data[cpu].cnt = cpu;
4820 rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
4821 "rbtester/%d", cpu);
4822 if (WARN_ON(!rb_threads[cpu])) {
4823 pr_cont("FAILED\n");
4824 ret = -1;
4825 goto out_free;
4826 }
4827
4828 kthread_bind(rb_threads[cpu], cpu);
4829 wake_up_process(rb_threads[cpu]);
4830 }
4831
4832 /* Now create the rb hammer! */
4833 rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
4834 if (WARN_ON(!rb_hammer)) {
4835 pr_cont("FAILED\n");
4836 ret = -1;
4837 goto out_free;
4838 }
4839
4840 ring_buffer_record_on(buffer);
4841 /*
4842 * Show buffer is enabled before setting rb_test_started.
4843 * Yes there's a small race window where events could be
4844	 * dropped and the thread won't catch it. But when a ring
4845 * buffer gets enabled, there will always be some kind of
4846 * delay before other CPUs see it. Thus, we don't care about
4847 * those dropped events. We care about events dropped after
4848 * the threads see that the buffer is active.
4849 */
4850 smp_wmb();
4851 rb_test_started = true;
4852
4853 set_current_state(TASK_INTERRUPTIBLE);
4854	/* Just run for 10 seconds */
4855 schedule_timeout(10 * HZ);
4856
4857 kthread_stop(rb_hammer);
4858
4859 out_free:
4860 for_each_online_cpu(cpu) {
4861 if (!rb_threads[cpu])
4862 break;
4863 kthread_stop(rb_threads[cpu]);
4864 }
4865 if (ret) {
4866 ring_buffer_free(buffer);
4867 return ret;
4868 }
4869
4870 /* Report! */
4871 pr_info("finished\n");
4872 for_each_online_cpu(cpu) {
4873 struct ring_buffer_event *event;
4874 struct rb_test_data *data = &rb_data[cpu];
4875 struct rb_item *item;
4876 unsigned long total_events;
4877 unsigned long total_dropped;
4878 unsigned long total_written;
4879 unsigned long total_alloc;
4880 unsigned long total_read = 0;
4881 unsigned long total_size = 0;
4882 unsigned long total_len = 0;
4883 unsigned long total_lost = 0;
4884 unsigned long lost;
4885 int big_event_size;
4886 int small_event_size;
4887
4888 ret = -1;
4889
4890 total_events = data->events + data->events_nested;
4891 total_written = data->bytes_written + data->bytes_written_nested;
4892 total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
4893 total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
4894
4895 big_event_size = data->max_size + data->max_size_nested;
4896 small_event_size = data->min_size + data->min_size_nested;
4897
4898 pr_info("CPU %d:\n", cpu);
4899 pr_info(" events: %ld\n", total_events);
4900 pr_info(" dropped bytes: %ld\n", total_dropped);
4901 pr_info(" alloced bytes: %ld\n", total_alloc);
4902 pr_info(" written bytes: %ld\n", total_written);
4903 pr_info(" biggest event: %d\n", big_event_size);
4904 pr_info(" smallest event: %d\n", small_event_size);
4905
4906 if (RB_WARN_ON(buffer, total_dropped))
4907 break;
4908
4909 ret = 0;
4910
4911 while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
4912 total_lost += lost;
4913 item = ring_buffer_event_data(event);
4914 total_len += ring_buffer_event_length(event);
4915 total_size += item->size + sizeof(struct rb_item);
4916 if (memcmp(&item->str[0], rb_string, item->size) != 0) {
4917 pr_info("FAILED!\n");
4918 pr_info("buffer had: %.*s\n", item->size, item->str);
4919 pr_info("expected: %.*s\n", item->size, rb_string);
4920 RB_WARN_ON(buffer, 1);
4921 ret = -1;
4922 break;
4923 }
4924 total_read++;
4925 }
4926 if (ret)
4927 break;
4928
4929 ret = -1;
4930
4931 pr_info(" read events: %ld\n", total_read);
4932 pr_info(" lost events: %ld\n", total_lost);
4933 pr_info(" total events: %ld\n", total_lost + total_read);
4934 pr_info(" recorded len bytes: %ld\n", total_len);
4935 pr_info(" recorded size bytes: %ld\n", total_size);
4936 if (total_lost)
4937 pr_info(" With dropped events, record len and size may not match\n"
4938 " alloced and written from above\n");
4939 if (!total_lost) {
4940 if (RB_WARN_ON(buffer, total_len != total_alloc ||
4941 total_size != total_written))
4942 break;
4943 }
4944 if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
4945 break;
4946
4947 ret = 0;
4948 }
4949 if (!ret)
4950 pr_info("Ring buffer PASSED!\n");
4951
4952 ring_buffer_free(buffer);
4953 return 0;
4954}
4955
4956late_initcall(test_ringbuffer);
4957#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c2e2c2310374..ae6fa2d1cdf7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * ring buffer based function tracer 2 * ring buffer based function tracer
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 * 6 *
7 * Originally taken from the RT patch by: 7 * Originally taken from the RT patch by:
@@ -19,7 +19,6 @@
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h> 21#include <linux/irqflags.h>
22#include <linux/irq_work.h>
23#include <linux/debugfs.h> 22#include <linux/debugfs.h>
24#include <linux/pagemap.h> 23#include <linux/pagemap.h>
25#include <linux/hardirq.h> 24#include <linux/hardirq.h>
@@ -48,7 +47,7 @@
48 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
49 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
50 */ 49 */
51int ring_buffer_expanded; 50bool ring_buffer_expanded;
52 51
53/* 52/*
54 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
87static DEFINE_PER_CPU(bool, trace_cmdline_save); 86static DEFINE_PER_CPU(bool, trace_cmdline_save);
88 87
89/* 88/*
90 * When a reader is waiting for data, then this variable is
91 * set to true.
92 */
93static bool trace_wakeup_needed;
94
95static struct irq_work trace_work_wakeup;
96
97/*
98 * Kill all tracing for good (never come back). 89 * Kill all tracing for good (never come back).
99 * It is initialized to 1 but will turn to zero if the initialization 90 * It is initialized to 1 but will turn to zero if the initialization
100 * of the tracer is successful. But that is the only place that sets 91 * of the tracer is successful. But that is the only place that sets
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
130static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; 121static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
131static char *default_bootup_tracer; 122static char *default_bootup_tracer;
132 123
124static bool allocate_snapshot;
125
133static int __init set_cmdline_ftrace(char *str) 126static int __init set_cmdline_ftrace(char *str)
134{ 127{
135 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); 128 strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
136 default_bootup_tracer = bootup_tracer_buf; 129 default_bootup_tracer = bootup_tracer_buf;
137 /* We are using ftrace early, expand it */ 130 /* We are using ftrace early, expand it */
138 ring_buffer_expanded = 1; 131 ring_buffer_expanded = true;
139 return 1; 132 return 1;
140} 133}
141__setup("ftrace=", set_cmdline_ftrace); 134__setup("ftrace=", set_cmdline_ftrace);
@@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)
156} 149}
157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 150__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
158 151
152static int __init boot_alloc_snapshot(char *str)
153{
154 allocate_snapshot = true;
155 /* We also need the main ring buffer expanded */
156 ring_buffer_expanded = true;
157 return 1;
158}
159__setup("alloc_snapshot", boot_alloc_snapshot);
160
159 161
160static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; 162static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
161static char *trace_boot_options __initdata; 163static char *trace_boot_options __initdata;
162 164
163static int __init set_trace_boot_options(char *str) 165static int __init set_trace_boot_options(char *str)
164{ 166{
165 strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); 167 strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
166 trace_boot_options = trace_boot_options_buf; 168 trace_boot_options = trace_boot_options_buf;
167 return 0; 169 return 0;
168} 170}
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)
189 */ 191 */
190static struct trace_array global_trace; 192static struct trace_array global_trace;
191 193
192static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 194LIST_HEAD(ftrace_trace_arrays);
193 195
194int filter_current_check_discard(struct ring_buffer *buffer, 196int filter_current_check_discard(struct ring_buffer *buffer,
195 struct ftrace_event_call *call, void *rec, 197 struct ftrace_event_call *call, void *rec,
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)
204 u64 ts; 206 u64 ts;
205 207
206 /* Early boot up does not have a buffer yet */ 208 /* Early boot up does not have a buffer yet */
207 if (!global_trace.buffer) 209 if (!global_trace.trace_buffer.buffer)
208 return trace_clock_local(); 210 return trace_clock_local();
209 211
210 ts = ring_buffer_time_stamp(global_trace.buffer, cpu); 212 ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
211 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); 213 ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
212 214
213 return ts; 215 return ts;
214} 216}
215 217
216/*
217 * The max_tr is used to snapshot the global_trace when a maximum
218 * latency is reached. Some tracers will use this to store a maximum
219 * trace while it continues examining live traces.
220 *
221 * The buffers for the max_tr are set up the same as the global_trace.
222 * When a snapshot is taken, the link list of the max_tr is swapped
223 * with the link list of the global_trace and the buffers are reset for
224 * the global_trace so the tracing can continue.
225 */
226static struct trace_array max_tr;
227
228static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
229
230int tracing_is_enabled(void) 218int tracing_is_enabled(void)
231{ 219{
232 return tracing_is_on(); 220 return tracing_is_on();
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249/* trace_types holds a link list of available tracers. */ 237/* trace_types holds a link list of available tracers. */
250static struct tracer *trace_types __read_mostly; 238static struct tracer *trace_types __read_mostly;
251 239
252/* current_trace points to the tracer that is currently active */
253static struct tracer *current_trace __read_mostly = &nop_trace;
254
255/* 240/*
256 * trace_types_lock is used to protect the trace_types list. 241 * trace_types_lock is used to protect the trace_types list.
257 */ 242 */
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
285 270
286static inline void trace_access_lock(int cpu) 271static inline void trace_access_lock(int cpu)
287{ 272{
288 if (cpu == TRACE_PIPE_ALL_CPU) { 273 if (cpu == RING_BUFFER_ALL_CPUS) {
289 /* gain it for accessing the whole ring buffer. */ 274 /* gain it for accessing the whole ring buffer. */
290 down_write(&all_cpu_access_lock); 275 down_write(&all_cpu_access_lock);
291 } else { 276 } else {
292 /* gain it for accessing a cpu ring buffer. */ 277 /* gain it for accessing a cpu ring buffer. */
293 278
294 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ 279 /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
295 down_read(&all_cpu_access_lock); 280 down_read(&all_cpu_access_lock);
296 281
297 /* Secondly block other access to this @cpu ring buffer. */ 282 /* Secondly block other access to this @cpu ring buffer. */
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)
301 286
302static inline void trace_access_unlock(int cpu) 287static inline void trace_access_unlock(int cpu)
303{ 288{
304 if (cpu == TRACE_PIPE_ALL_CPU) { 289 if (cpu == RING_BUFFER_ALL_CPUS) {
305 up_write(&all_cpu_access_lock); 290 up_write(&all_cpu_access_lock);
306 } else { 291 } else {
307 mutex_unlock(&per_cpu(cpu_access_lock, cpu)); 292 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)
339 324
340#endif 325#endif
341 326
342/* trace_wait is a waitqueue for tasks blocked on trace_poll */
343static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
344
345/* trace_flags holds trace_options default values */ 327/* trace_flags holds trace_options default values */
346unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 328unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
347 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 329 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
348 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | 330 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
349 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; 331 TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
350
351static int trace_stop_count;
352static DEFINE_RAW_SPINLOCK(tracing_start_lock);
353
354/**
355 * trace_wake_up - wake up tasks waiting for trace input
356 *
357 * Schedules a delayed work to wake up any task that is blocked on the
358 * trace_wait queue. These is used with trace_poll for tasks polling the
359 * trace.
360 */
361static void trace_wake_up(struct irq_work *work)
362{
363 wake_up_all(&trace_wait);
364
365}
366 332
367/** 333/**
368 * tracing_on - enable tracing buffers 334 * tracing_on - enable tracing buffers
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)
372 */ 338 */
373void tracing_on(void) 339void tracing_on(void)
374{ 340{
375 if (global_trace.buffer) 341 if (global_trace.trace_buffer.buffer)
376 ring_buffer_record_on(global_trace.buffer); 342 ring_buffer_record_on(global_trace.trace_buffer.buffer);
377 /* 343 /*
378 * This flag is only looked at when buffers haven't been 344 * This flag is only looked at when buffers haven't been
379 * allocated yet. We don't really care about the race 345 * allocated yet. We don't really care about the race
@@ -385,6 +351,196 @@ void tracing_on(void)
385EXPORT_SYMBOL_GPL(tracing_on); 351EXPORT_SYMBOL_GPL(tracing_on);
386 352
387/** 353/**
354 * __trace_puts - write a constant string into the trace buffer.
355 * @ip: The address of the caller
356 * @str: The constant string to write
357 * @size: The size of the string.
358 */
359int __trace_puts(unsigned long ip, const char *str, int size)
360{
361 struct ring_buffer_event *event;
362 struct ring_buffer *buffer;
363 struct print_entry *entry;
364 unsigned long irq_flags;
365 int alloc;
366
367 alloc = sizeof(*entry) + size + 2; /* possible \n added */
368
369 local_save_flags(irq_flags);
370 buffer = global_trace.trace_buffer.buffer;
371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
372 irq_flags, preempt_count());
373 if (!event)
374 return 0;
375
376 entry = ring_buffer_event_data(event);
377 entry->ip = ip;
378
379 memcpy(&entry->buf, str, size);
380
381 /* Add a newline if necessary */
382 if (entry->buf[size - 1] != '\n') {
383 entry->buf[size] = '\n';
384 entry->buf[size + 1] = '\0';
385 } else
386 entry->buf[size] = '\0';
387
388 __buffer_unlock_commit(buffer, event);
389
390 return size;
391}
392EXPORT_SYMBOL_GPL(__trace_puts);
393
394/**
395 * __trace_bputs - write the pointer to a constant string into trace buffer
396 * @ip: The address of the caller
397 * @str: The constant string to write to the buffer to
398 */
399int __trace_bputs(unsigned long ip, const char *str)
400{
401 struct ring_buffer_event *event;
402 struct ring_buffer *buffer;
403 struct bputs_entry *entry;
404 unsigned long irq_flags;
405 int size = sizeof(struct bputs_entry);
406
407 local_save_flags(irq_flags);
408 buffer = global_trace.trace_buffer.buffer;
409 event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
410 irq_flags, preempt_count());
411 if (!event)
412 return 0;
413
414 entry = ring_buffer_event_data(event);
415 entry->ip = ip;
416 entry->str = str;
417
418 __buffer_unlock_commit(buffer, event);
419
420 return 1;
421}
422EXPORT_SYMBOL_GPL(__trace_bputs);
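
As a usage note (not part of the patch): a sketch of calling the exported helper directly. The function name below is hypothetical, and the prototype is assumed to come from the matching header change elsewhere in this series; in-tree callers would normally go through a wrapper macro that passes _THIS_IP_ automatically.

#include <linux/kernel.h>

static void example_mark_slow_path(void)
{
	static const char msg[] = "example: hit the slow path\n";

	/* Writes the literal string straight into the trace buffer. */
	__trace_puts(_THIS_IP_, msg, sizeof(msg) - 1);
}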
423
424#ifdef CONFIG_TRACER_SNAPSHOT
425/**
426 * trace_snapshot - take a snapshot of the current buffer.
427 *
428 * This causes a swap between the snapshot buffer and the current live
429 * tracing buffer. You can use this to take snapshots of the live
430 * trace when some condition is triggered, but continue to trace.
431 *
432 * Note, make sure to allocate the snapshot either with
433 * tracing_snapshot_alloc(), or by doing it manually
434 * with: echo 1 > /sys/kernel/debug/tracing/snapshot
435 *
436 * If the snapshot buffer is not allocated, it will stop tracing.
437 * Basically making a permanent snapshot.
438 */
439void tracing_snapshot(void)
440{
441 struct trace_array *tr = &global_trace;
442 struct tracer *tracer = tr->current_trace;
443 unsigned long flags;
444
445 if (in_nmi()) {
446 internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
447 internal_trace_puts("*** snapshot is being ignored ***\n");
448 return;
449 }
450
451 if (!tr->allocated_snapshot) {
452 internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
453 internal_trace_puts("*** stopping trace here! ***\n");
454 tracing_off();
455 return;
456 }
457
458 /* Note, snapshot can not be used when the tracer uses it */
459 if (tracer->use_max_tr) {
460 internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
461 internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
462 return;
463 }
464
465 local_irq_save(flags);
466 update_max_tr(tr, current, smp_processor_id());
467 local_irq_restore(flags);
468}
469EXPORT_SYMBOL_GPL(tracing_snapshot);
470
471static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
472 struct trace_buffer *size_buf, int cpu_id);
473static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
474
475static int alloc_snapshot(struct trace_array *tr)
476{
477 int ret;
478
479 if (!tr->allocated_snapshot) {
480
481 /* allocate spare buffer */
482 ret = resize_buffer_duplicate_size(&tr->max_buffer,
483 &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
484 if (ret < 0)
485 return ret;
486
487 tr->allocated_snapshot = true;
488 }
489
490 return 0;
491}
492
493void free_snapshot(struct trace_array *tr)
494{
495 /*
496	 * We don't free the ring buffer; instead, we resize it because
497	 * the max_tr ring buffer has some state (e.g. ring->clock) and
498	 * we want to preserve it.
499 */
500 ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
501 set_buffer_entries(&tr->max_buffer, 1);
502 tracing_reset_online_cpus(&tr->max_buffer);
503 tr->allocated_snapshot = false;
504}
505
506/**
507 * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
508 *
509 * This is similar to trace_snapshot(), but it will allocate the
510 * snapshot buffer if it isn't already allocated. Use this only
511 * where it is safe to sleep, as the allocation may sleep.
512 *
513 * This causes a swap between the snapshot buffer and the current live
514 * tracing buffer. You can use this to take snapshots of the live
515 * trace when some condition is triggered, but continue to trace.
516 */
517void tracing_snapshot_alloc(void)
518{
519 struct trace_array *tr = &global_trace;
520 int ret;
521
522 ret = alloc_snapshot(tr);
523 if (WARN_ON(ret < 0))
524 return;
525
526 tracing_snapshot();
527}
528EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
529#else
530void tracing_snapshot(void)
531{
532 WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
533}
534EXPORT_SYMBOL_GPL(tracing_snapshot);
535void tracing_snapshot_alloc(void)
536{
537 /* Give warning */
538 tracing_snapshot();
539}
540EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
541#endif /* CONFIG_TRACER_SNAPSHOT */
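
For illustration only (not part of the patch): how a driver might use the snapshot API documented above, with hypothetical function names and assuming the declarations from the matching header change in this series. Allocation happens once in a context that may sleep; the trigger can then be used from hot paths (but not from NMI, as the check above shows).

#include <linux/kernel.h>
#include <linux/init.h>

static int __init example_init(void)
{
	/* May sleep: allocates the spare buffer and takes a first snapshot. */
	tracing_snapshot_alloc();
	return 0;
}

static void example_on_error(void)
{
	/* Swaps the live buffer with the snapshot buffer; tracing continues. */
	tracing_snapshot();
}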
542
543/**
388 * tracing_off - turn off tracing buffers 544 * tracing_off - turn off tracing buffers
389 * 545 *
390 * This function stops the tracing buffers from recording data. 546 * This function stops the tracing buffers from recording data.
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
394 */ 550 */
395void tracing_off(void) 551void tracing_off(void)
396{ 552{
397 if (global_trace.buffer) 553 if (global_trace.trace_buffer.buffer)
398 ring_buffer_record_off(global_trace.buffer); 554 ring_buffer_record_off(global_trace.trace_buffer.buffer);
399 /* 555 /*
400 * This flag is only looked at when buffers haven't been 556 * This flag is only looked at when buffers haven't been
401 * allocated yet. We don't really care about the race 557 * allocated yet. We don't really care about the race
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
411 */ 567 */
412int tracing_is_on(void) 568int tracing_is_on(void)
413{ 569{
414 if (global_trace.buffer) 570 if (global_trace.trace_buffer.buffer)
415 return ring_buffer_record_is_on(global_trace.buffer); 571 return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
416 return !global_trace.buffer_disabled; 572 return !global_trace.buffer_disabled;
417} 573}
418EXPORT_SYMBOL_GPL(tracing_is_on); 574EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -479,6 +635,7 @@ static const char *trace_options[] = {
479 "disable_on_free", 635 "disable_on_free",
480 "irq-info", 636 "irq-info",
481 "markers", 637 "markers",
638 "function-trace",
482 NULL 639 NULL
483}; 640};
484 641
@@ -490,6 +647,8 @@ static struct {
490 { trace_clock_local, "local", 1 }, 647 { trace_clock_local, "local", 1 },
491 { trace_clock_global, "global", 1 }, 648 { trace_clock_global, "global", 1 },
492 { trace_clock_counter, "counter", 0 }, 649 { trace_clock_counter, "counter", 0 },
650 { trace_clock_jiffies, "uptime", 1 },
651 { trace_clock, "perf", 1 },
493 ARCH_TRACE_CLOCKS 652 ARCH_TRACE_CLOCKS
494}; 653};
495 654
@@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency;
670static void 829static void
671__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 830__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
672{ 831{
673 struct trace_array_cpu *data = tr->data[cpu]; 832 struct trace_buffer *trace_buf = &tr->trace_buffer;
674 struct trace_array_cpu *max_data; 833 struct trace_buffer *max_buf = &tr->max_buffer;
834 struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
835 struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
675 836
676 max_tr.cpu = cpu; 837 max_buf->cpu = cpu;
677 max_tr.time_start = data->preempt_timestamp; 838 max_buf->time_start = data->preempt_timestamp;
678 839
679 max_data = max_tr.data[cpu];
680 max_data->saved_latency = tracing_max_latency; 840 max_data->saved_latency = tracing_max_latency;
681 max_data->critical_start = data->critical_start; 841 max_data->critical_start = data->critical_start;
682 max_data->critical_end = data->critical_end; 842 max_data->critical_end = data->critical_end;
@@ -704,23 +864,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
704void 864void
705update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 865update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
706{ 866{
707 struct ring_buffer *buf = tr->buffer; 867 struct ring_buffer *buf;
708 868
709 if (trace_stop_count) 869 if (tr->stop_count)
710 return; 870 return;
711 871
712 WARN_ON_ONCE(!irqs_disabled()); 872 WARN_ON_ONCE(!irqs_disabled());
713 873
714 if (!current_trace->allocated_snapshot) { 874 if (!tr->allocated_snapshot) {
715 /* Only the nop tracer should hit this when disabling */ 875 /* Only the nop tracer should hit this when disabling */
716 WARN_ON_ONCE(current_trace != &nop_trace); 876 WARN_ON_ONCE(tr->current_trace != &nop_trace);
717 return; 877 return;
718 } 878 }
719 879
720 arch_spin_lock(&ftrace_max_lock); 880 arch_spin_lock(&ftrace_max_lock);
721 881
722 tr->buffer = max_tr.buffer; 882 buf = tr->trace_buffer.buffer;
723 max_tr.buffer = buf; 883 tr->trace_buffer.buffer = tr->max_buffer.buffer;
884 tr->max_buffer.buffer = buf;
724 885
725 __update_max_tr(tr, tsk, cpu); 886 __update_max_tr(tr, tsk, cpu);
726 arch_spin_unlock(&ftrace_max_lock); 887 arch_spin_unlock(&ftrace_max_lock);
@@ -739,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739{ 900{
740 int ret; 901 int ret;
741 902
742 if (trace_stop_count) 903 if (tr->stop_count)
743 return; 904 return;
744 905
745 WARN_ON_ONCE(!irqs_disabled()); 906 WARN_ON_ONCE(!irqs_disabled());
746 if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) 907 if (!tr->allocated_snapshot) {
908 /* Only the nop tracer should hit this when disabling */
909 WARN_ON_ONCE(tr->current_trace != &nop_trace);
747 return; 910 return;
911 }
748 912
749 arch_spin_lock(&ftrace_max_lock); 913 arch_spin_lock(&ftrace_max_lock);
750 914
751 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 915 ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
752 916
753 if (ret == -EBUSY) { 917 if (ret == -EBUSY) {
754 /* 918 /*
@@ -757,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
757 * the max trace buffer (no one writes directly to it) 921 * the max trace buffer (no one writes directly to it)
758 * and flag that it failed. 922 * and flag that it failed.
759 */ 923 */
760 trace_array_printk(&max_tr, _THIS_IP_, 924 trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
761 "Failed to swap buffers due to commit in progress\n"); 925 "Failed to swap buffers due to commit in progress\n");
762 } 926 }
763 927
@@ -770,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
770 934
771static void default_wait_pipe(struct trace_iterator *iter) 935static void default_wait_pipe(struct trace_iterator *iter)
772{ 936{
773 DEFINE_WAIT(wait); 937 /* Iterators are static, they should be filled or empty */
938 if (trace_buffer_iter(iter, iter->cpu_file))
939 return;
774 940
775 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); 941 ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
942}
943
944#ifdef CONFIG_FTRACE_STARTUP_TEST
945static int run_tracer_selftest(struct tracer *type)
946{
947 struct trace_array *tr = &global_trace;
948 struct tracer *saved_tracer = tr->current_trace;
949 int ret;
950
951 if (!type->selftest || tracing_selftest_disabled)
952 return 0;
776 953
777 /* 954 /*
778 * The events can happen in critical sections where 955 * Run a selftest on this tracer.
779 * checking a work queue can cause deadlocks. 956 * Here we reset the trace buffer, and set the current
780 * After adding a task to the queue, this flag is set 957 * tracer to be this tracer. The tracer can then run some
781 * only to notify events to try to wake up the queue 958 * internal tracing to verify that everything is in order.
782 * using irq_work. 959 * If we fail, we do not register this tracer.
783 *
784 * We don't clear it even if the buffer is no longer
785 * empty. The flag only causes the next event to run
786 * irq_work to do the work queue wake up. The worse
787 * that can happen if we race with !trace_empty() is that
788 * an event will cause an irq_work to try to wake up
789 * an empty queue.
790 *
791 * There's no reason to protect this flag either, as
792 * the work queue and irq_work logic will do the necessary
793 * synchronization for the wake ups. The only thing
794 * that is necessary is that the wake up happens after
795 * a task has been queued. It's OK for spurious wake ups.
796 */ 960 */
797 trace_wakeup_needed = true; 961 tracing_reset_online_cpus(&tr->trace_buffer);
798 962
799 if (trace_empty(iter)) 963 tr->current_trace = type;
800 schedule(); 964
965#ifdef CONFIG_TRACER_MAX_TRACE
966 if (type->use_max_tr) {
967 /* If we expanded the buffers, make sure the max is expanded too */
968 if (ring_buffer_expanded)
969 ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
970 RING_BUFFER_ALL_CPUS);
971 tr->allocated_snapshot = true;
972 }
973#endif
801 974
802 finish_wait(&trace_wait, &wait); 975 /* the test is responsible for initializing and enabling */
976 pr_info("Testing tracer %s: ", type->name);
977 ret = type->selftest(type, tr);
978 /* the test is responsible for resetting too */
979 tr->current_trace = saved_tracer;
980 if (ret) {
981 printk(KERN_CONT "FAILED!\n");
982 /* Add the warning after printing 'FAILED' */
983 WARN_ON(1);
984 return -1;
985 }
986 /* Only reset on passing, to avoid touching corrupted buffers */
987 tracing_reset_online_cpus(&tr->trace_buffer);
988
989#ifdef CONFIG_TRACER_MAX_TRACE
990 if (type->use_max_tr) {
991 tr->allocated_snapshot = false;
992
993 /* Shrink the max buffer again */
994 if (ring_buffer_expanded)
995 ring_buffer_resize(tr->max_buffer.buffer, 1,
996 RING_BUFFER_ALL_CPUS);
997 }
998#endif
999
1000 printk(KERN_CONT "PASSED\n");
1001 return 0;
803} 1002}
1003#else
1004static inline int run_tracer_selftest(struct tracer *type)
1005{
1006 return 0;
1007}
1008#endif /* CONFIG_FTRACE_STARTUP_TEST */
804 1009
805/** 1010/**
806 * register_tracer - register a tracer with the ftrace system. 1011 * register_tracer - register a tracer with the ftrace system.
@@ -847,57 +1052,9 @@ int register_tracer(struct tracer *type)
847 if (!type->wait_pipe) 1052 if (!type->wait_pipe)
848 type->wait_pipe = default_wait_pipe; 1053 type->wait_pipe = default_wait_pipe;
849 1054
850 1055 ret = run_tracer_selftest(type);
851#ifdef CONFIG_FTRACE_STARTUP_TEST 1056 if (ret < 0)
852 if (type->selftest && !tracing_selftest_disabled) { 1057 goto out;
853 struct tracer *saved_tracer = current_trace;
854 struct trace_array *tr = &global_trace;
855
856 /*
857 * Run a selftest on this tracer.
858 * Here we reset the trace buffer, and set the current
859 * tracer to be this tracer. The tracer can then run some
860 * internal tracing to verify that everything is in order.
861 * If we fail, we do not register this tracer.
862 */
863 tracing_reset_online_cpus(tr);
864
865 current_trace = type;
866
867 if (type->use_max_tr) {
868 /* If we expanded the buffers, make sure the max is expanded too */
869 if (ring_buffer_expanded)
870 ring_buffer_resize(max_tr.buffer, trace_buf_size,
871 RING_BUFFER_ALL_CPUS);
872 type->allocated_snapshot = true;
873 }
874
875 /* the test is responsible for initializing and enabling */
876 pr_info("Testing tracer %s: ", type->name);
877 ret = type->selftest(type, tr);
878 /* the test is responsible for resetting too */
879 current_trace = saved_tracer;
880 if (ret) {
881 printk(KERN_CONT "FAILED!\n");
882 /* Add the warning after printing 'FAILED' */
883 WARN_ON(1);
884 goto out;
885 }
886 /* Only reset on passing, to avoid touching corrupted buffers */
887 tracing_reset_online_cpus(tr);
888
889 if (type->use_max_tr) {
890 type->allocated_snapshot = false;
891
892 /* Shrink the max buffer again */
893 if (ring_buffer_expanded)
894 ring_buffer_resize(max_tr.buffer, 1,
895 RING_BUFFER_ALL_CPUS);
896 }
897
898 printk(KERN_CONT "PASSED\n");
899 }
900#endif
901 1058
902 type->next = trace_types; 1059 type->next = trace_types;
903 trace_types = type; 1060 trace_types = type;
@@ -917,7 +1074,7 @@ int register_tracer(struct tracer *type)
917 tracing_set_tracer(type->name); 1074 tracing_set_tracer(type->name);
918 default_bootup_tracer = NULL; 1075 default_bootup_tracer = NULL;
919 /* disable other selftests, since this will break it. */ 1076 /* disable other selftests, since this will break it. */
920 tracing_selftest_disabled = 1; 1077 tracing_selftest_disabled = true;
921#ifdef CONFIG_FTRACE_STARTUP_TEST 1078#ifdef CONFIG_FTRACE_STARTUP_TEST
922 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", 1079 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
923 type->name); 1080 type->name);
@@ -927,9 +1084,9 @@ int register_tracer(struct tracer *type)
927 return ret; 1084 return ret;
928} 1085}
929 1086
930void tracing_reset(struct trace_array *tr, int cpu) 1087void tracing_reset(struct trace_buffer *buf, int cpu)
931{ 1088{
932 struct ring_buffer *buffer = tr->buffer; 1089 struct ring_buffer *buffer = buf->buffer;
933 1090
934 if (!buffer) 1091 if (!buffer)
935 return; 1092 return;
@@ -943,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
943 ring_buffer_record_enable(buffer); 1100 ring_buffer_record_enable(buffer);
944} 1101}
945 1102
946void tracing_reset_online_cpus(struct trace_array *tr) 1103void tracing_reset_online_cpus(struct trace_buffer *buf)
947{ 1104{
948 struct ring_buffer *buffer = tr->buffer; 1105 struct ring_buffer *buffer = buf->buffer;
949 int cpu; 1106 int cpu;
950 1107
951 if (!buffer) 1108 if (!buffer)
@@ -956,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
956 /* Make sure all commits have finished */ 1113 /* Make sure all commits have finished */
957 synchronize_sched(); 1114 synchronize_sched();
958 1115
959 tr->time_start = ftrace_now(tr->cpu); 1116 buf->time_start = ftrace_now(buf->cpu);
960 1117
961 for_each_online_cpu(cpu) 1118 for_each_online_cpu(cpu)
962 ring_buffer_reset_cpu(buffer, cpu); 1119 ring_buffer_reset_cpu(buffer, cpu);
@@ -966,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)
966 1123
967void tracing_reset_current(int cpu) 1124void tracing_reset_current(int cpu)
968{ 1125{
969 tracing_reset(&global_trace, cpu); 1126 tracing_reset(&global_trace.trace_buffer, cpu);
970} 1127}
971 1128
972void tracing_reset_current_online_cpus(void) 1129void tracing_reset_all_online_cpus(void)
973{ 1130{
974 tracing_reset_online_cpus(&global_trace); 1131 struct trace_array *tr;
1132
1133 mutex_lock(&trace_types_lock);
1134 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
1135 tracing_reset_online_cpus(&tr->trace_buffer);
1136#ifdef CONFIG_TRACER_MAX_TRACE
1137 tracing_reset_online_cpus(&tr->max_buffer);
1138#endif
1139 }
1140 mutex_unlock(&trace_types_lock);
975} 1141}
976 1142
977#define SAVED_CMDLINES 128 1143#define SAVED_CMDLINES 128
@@ -994,7 +1160,7 @@ static void trace_init_cmdlines(void)
994 1160
995int is_tracing_stopped(void) 1161int is_tracing_stopped(void)
996{ 1162{
997 return trace_stop_count; 1163 return global_trace.stop_count;
998} 1164}
999 1165
1000/** 1166/**
@@ -1026,12 +1192,12 @@ void tracing_start(void)
1026 if (tracing_disabled) 1192 if (tracing_disabled)
1027 return; 1193 return;
1028 1194
1029 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1195 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1030 if (--trace_stop_count) { 1196 if (--global_trace.stop_count) {
1031 if (trace_stop_count < 0) { 1197 if (global_trace.stop_count < 0) {
1032 /* Someone screwed up their debugging */ 1198 /* Someone screwed up their debugging */
1033 WARN_ON_ONCE(1); 1199 WARN_ON_ONCE(1);
1034 trace_stop_count = 0; 1200 global_trace.stop_count = 0;
1035 } 1201 }
1036 goto out; 1202 goto out;
1037 } 1203 }
@@ -1039,19 +1205,52 @@ void tracing_start(void)
1039 /* Prevent the buffers from switching */ 1205 /* Prevent the buffers from switching */
1040 arch_spin_lock(&ftrace_max_lock); 1206 arch_spin_lock(&ftrace_max_lock);
1041 1207
1042 buffer = global_trace.buffer; 1208 buffer = global_trace.trace_buffer.buffer;
1043 if (buffer) 1209 if (buffer)
1044 ring_buffer_record_enable(buffer); 1210 ring_buffer_record_enable(buffer);
1045 1211
1046 buffer = max_tr.buffer; 1212#ifdef CONFIG_TRACER_MAX_TRACE
1213 buffer = global_trace.max_buffer.buffer;
1047 if (buffer) 1214 if (buffer)
1048 ring_buffer_record_enable(buffer); 1215 ring_buffer_record_enable(buffer);
1216#endif
1049 1217
1050 arch_spin_unlock(&ftrace_max_lock); 1218 arch_spin_unlock(&ftrace_max_lock);
1051 1219
1052 ftrace_start(); 1220 ftrace_start();
1053 out: 1221 out:
1054 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1222 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1223}
1224
1225static void tracing_start_tr(struct trace_array *tr)
1226{
1227 struct ring_buffer *buffer;
1228 unsigned long flags;
1229
1230 if (tracing_disabled)
1231 return;
1232
1233 /* If global, we need to also start the max tracer */
1234 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1235 return tracing_start();
1236
1237 raw_spin_lock_irqsave(&tr->start_lock, flags);
1238
1239 if (--tr->stop_count) {
1240 if (tr->stop_count < 0) {
1241 /* Someone screwed up their debugging */
1242 WARN_ON_ONCE(1);
1243 tr->stop_count = 0;
1244 }
1245 goto out;
1246 }
1247
1248 buffer = tr->trace_buffer.buffer;
1249 if (buffer)
1250 ring_buffer_record_enable(buffer);
1251
1252 out:
1253 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1055} 1254}
1056 1255
1057/** 1256/**
@@ -1066,25 +1265,48 @@ void tracing_stop(void)
1066 unsigned long flags; 1265 unsigned long flags;
1067 1266
1068 ftrace_stop(); 1267 ftrace_stop();
1069 raw_spin_lock_irqsave(&tracing_start_lock, flags); 1268 raw_spin_lock_irqsave(&global_trace.start_lock, flags);
1070 if (trace_stop_count++) 1269 if (global_trace.stop_count++)
1071 goto out; 1270 goto out;
1072 1271
1073 /* Prevent the buffers from switching */ 1272 /* Prevent the buffers from switching */
1074 arch_spin_lock(&ftrace_max_lock); 1273 arch_spin_lock(&ftrace_max_lock);
1075 1274
1076 buffer = global_trace.buffer; 1275 buffer = global_trace.trace_buffer.buffer;
1077 if (buffer) 1276 if (buffer)
1078 ring_buffer_record_disable(buffer); 1277 ring_buffer_record_disable(buffer);
1079 1278
1080 buffer = max_tr.buffer; 1279#ifdef CONFIG_TRACER_MAX_TRACE
1280 buffer = global_trace.max_buffer.buffer;
1081 if (buffer) 1281 if (buffer)
1082 ring_buffer_record_disable(buffer); 1282 ring_buffer_record_disable(buffer);
1283#endif
1083 1284
1084 arch_spin_unlock(&ftrace_max_lock); 1285 arch_spin_unlock(&ftrace_max_lock);
1085 1286
1086 out: 1287 out:
1087 raw_spin_unlock_irqrestore(&tracing_start_lock, flags); 1288 raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
1289}
1290
1291static void tracing_stop_tr(struct trace_array *tr)
1292{
1293 struct ring_buffer *buffer;
1294 unsigned long flags;
1295
1296 /* If global, we need to also stop the max tracer */
1297 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
1298 return tracing_stop();
1299
1300 raw_spin_lock_irqsave(&tr->start_lock, flags);
1301 if (tr->stop_count++)
1302 goto out;
1303
1304 buffer = tr->trace_buffer.buffer;
1305 if (buffer)
1306 ring_buffer_record_disable(buffer);
1307
1308 out:
1309 raw_spin_unlock_irqrestore(&tr->start_lock, flags);
1088} 1310}
1089 1311
1090void trace_stop_cmdline_recording(void); 1312void trace_stop_cmdline_recording(void);
@@ -1217,11 +1439,6 @@ void
1217__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) 1439__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
1218{ 1440{
1219 __this_cpu_write(trace_cmdline_save, true); 1441 __this_cpu_write(trace_cmdline_save, true);
1220 if (trace_wakeup_needed) {
1221 trace_wakeup_needed = false;
1222 /* irq_work_queue() supplies it's own memory barriers */
1223 irq_work_queue(&trace_work_wakeup);
1224 }
1225 ring_buffer_unlock_commit(buffer, event); 1442 ring_buffer_unlock_commit(buffer, event);
1226} 1443}
1227 1444
@@ -1245,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
1245EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); 1462EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
1246 1463
1247struct ring_buffer_event * 1464struct ring_buffer_event *
1465trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
1466 struct ftrace_event_file *ftrace_file,
1467 int type, unsigned long len,
1468 unsigned long flags, int pc)
1469{
1470 *current_rb = ftrace_file->tr->trace_buffer.buffer;
1471 return trace_buffer_lock_reserve(*current_rb,
1472 type, len, flags, pc);
1473}
1474EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
1475
1476struct ring_buffer_event *
1248trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, 1477trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1249 int type, unsigned long len, 1478 int type, unsigned long len,
1250 unsigned long flags, int pc) 1479 unsigned long flags, int pc)
1251{ 1480{
1252 *current_rb = global_trace.buffer; 1481 *current_rb = global_trace.trace_buffer.buffer;
1253 return trace_buffer_lock_reserve(*current_rb, 1482 return trace_buffer_lock_reserve(*current_rb,
1254 type, len, flags, pc); 1483 type, len, flags, pc);
1255} 1484}
@@ -1288,7 +1517,7 @@ trace_function(struct trace_array *tr,
1288 int pc) 1517 int pc)
1289{ 1518{
1290 struct ftrace_event_call *call = &event_function; 1519 struct ftrace_event_call *call = &event_function;
1291 struct ring_buffer *buffer = tr->buffer; 1520 struct ring_buffer *buffer = tr->trace_buffer.buffer;
1292 struct ring_buffer_event *event; 1521 struct ring_buffer_event *event;
1293 struct ftrace_entry *entry; 1522 struct ftrace_entry *entry;
1294 1523
@@ -1429,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1429void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 1658void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1430 int pc) 1659 int pc)
1431{ 1660{
1432 __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); 1661 __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
1433} 1662}
1434 1663
1435/** 1664/**
1436 * trace_dump_stack - record a stack back trace in the trace buffer 1665 * trace_dump_stack - record a stack back trace in the trace buffer
1666 * @skip: Number of functions to skip (helper handlers)
1437 */ 1667 */
1438void trace_dump_stack(void) 1668void trace_dump_stack(int skip)
1439{ 1669{
1440 unsigned long flags; 1670 unsigned long flags;
1441 1671
@@ -1444,8 +1674,13 @@ void trace_dump_stack(void)
1444 1674
1445 local_save_flags(flags); 1675 local_save_flags(flags);
1446 1676
1447 /* skipping 3 traces, seems to get us at the caller of this function */ 1677 /*
1448 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); 1678 * Skip 3 more, seems to get us at the caller of
1679 * this function.
1680 */
1681 skip += 3;
1682 __ftrace_trace_stack(global_trace.trace_buffer.buffer,
1683 flags, skip, preempt_count(), NULL);
1449} 1684}
1450 1685
1451static DEFINE_PER_CPU(int, user_stack_count); 1686static DEFINE_PER_CPU(int, user_stack_count);
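
A quick sketch (not from the patch) of the new trace_dump_stack() calling convention: the caller passes how many of its own wrapper frames to skip, and the function adds three more internally as shown above. The caller name is hypothetical.

void my_debug_dump(void)			/* hypothetical caller */
{
	/*
	 * 0 = no extra frames beyond the three the helper already skips,
	 * so the recorded trace starts near my_debug_dump()'s caller.
	 */
	trace_dump_stack(0);
}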
@@ -1615,7 +1850,7 @@ void trace_printk_init_buffers(void)
1615 * directly here. If the global_trace.buffer is already 1850 * directly here. If the global_trace.buffer is already
1616 * allocated here, then this was called by module code. 1851 * allocated here, then this was called by module code.
1617 */ 1852 */
1618 if (global_trace.buffer) 1853 if (global_trace.trace_buffer.buffer)
1619 tracing_start_cmdline_record(); 1854 tracing_start_cmdline_record();
1620} 1855}
1621 1856
@@ -1675,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1675 1910
1676 local_save_flags(flags); 1911 local_save_flags(flags);
1677 size = sizeof(*entry) + sizeof(u32) * len; 1912 size = sizeof(*entry) + sizeof(u32) * len;
1678 buffer = tr->buffer; 1913 buffer = tr->trace_buffer.buffer;
1679 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, 1914 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1680 flags, pc); 1915 flags, pc);
1681 if (!event) 1916 if (!event)
@@ -1698,27 +1933,12 @@ out:
1698} 1933}
1699EXPORT_SYMBOL_GPL(trace_vbprintk); 1934EXPORT_SYMBOL_GPL(trace_vbprintk);
1700 1935
1701int trace_array_printk(struct trace_array *tr, 1936static int
1702 unsigned long ip, const char *fmt, ...) 1937__trace_array_vprintk(struct ring_buffer *buffer,
1703{ 1938 unsigned long ip, const char *fmt, va_list args)
1704 int ret;
1705 va_list ap;
1706
1707 if (!(trace_flags & TRACE_ITER_PRINTK))
1708 return 0;
1709
1710 va_start(ap, fmt);
1711 ret = trace_array_vprintk(tr, ip, fmt, ap);
1712 va_end(ap);
1713 return ret;
1714}
1715
1716int trace_array_vprintk(struct trace_array *tr,
1717 unsigned long ip, const char *fmt, va_list args)
1718{ 1939{
1719 struct ftrace_event_call *call = &event_print; 1940 struct ftrace_event_call *call = &event_print;
1720 struct ring_buffer_event *event; 1941 struct ring_buffer_event *event;
1721 struct ring_buffer *buffer;
1722 int len = 0, size, pc; 1942 int len = 0, size, pc;
1723 struct print_entry *entry; 1943 struct print_entry *entry;
1724 unsigned long flags; 1944 unsigned long flags;
@@ -1746,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,
1746 1966
1747 local_save_flags(flags); 1967 local_save_flags(flags);
1748 size = sizeof(*entry) + len + 1; 1968 size = sizeof(*entry) + len + 1;
1749 buffer = tr->buffer;
1750 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 1969 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1751 flags, pc); 1970 flags, pc);
1752 if (!event) 1971 if (!event)
@@ -1767,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,
1767 return len; 1986 return len;
1768} 1987}
1769 1988
1989int trace_array_vprintk(struct trace_array *tr,
1990 unsigned long ip, const char *fmt, va_list args)
1991{
1992 return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
1993}
1994
1995int trace_array_printk(struct trace_array *tr,
1996 unsigned long ip, const char *fmt, ...)
1997{
1998 int ret;
1999 va_list ap;
2000
2001 if (!(trace_flags & TRACE_ITER_PRINTK))
2002 return 0;
2003
2004 va_start(ap, fmt);
2005 ret = trace_array_vprintk(tr, ip, fmt, ap);
2006 va_end(ap);
2007 return ret;
2008}
2009
2010int trace_array_printk_buf(struct ring_buffer *buffer,
2011 unsigned long ip, const char *fmt, ...)
2012{
2013 int ret;
2014 va_list ap;
2015
2016 if (!(trace_flags & TRACE_ITER_PRINTK))
2017 return 0;
2018
2019 va_start(ap, fmt);
2020 ret = __trace_array_vprintk(buffer, ip, fmt, ap);
2021 va_end(ap);
2022 return ret;
2023}
2024
1770int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 2025int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1771{ 2026{
1772 return trace_array_vprintk(&global_trace, ip, fmt, args); 2027 return trace_array_vprintk(&global_trace, ip, fmt, args);
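
A minimal usage sketch (not part of the patch) for the reworked printk helpers: trace_array_printk() writes into a specific trace_array's buffer, while the new trace_array_printk_buf() targets a raw ring buffer directly. The function and variable names below are illustrative.

static void my_report(struct trace_array *tr, int val)	/* hypothetical */
{
	/* No-op unless the "printk" trace option is set, as in the code above */
	trace_array_printk(tr, _THIS_IP_, "my value: %d\n", val);
}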
@@ -1792,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1792 if (buf_iter) 2047 if (buf_iter)
1793 event = ring_buffer_iter_peek(buf_iter, ts); 2048 event = ring_buffer_iter_peek(buf_iter, ts);
1794 else 2049 else
1795 event = ring_buffer_peek(iter->tr->buffer, cpu, ts, 2050 event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
1796 lost_events); 2051 lost_events);
1797 2052
1798 if (event) { 2053 if (event) {
@@ -1807,7 +2062,7 @@ static struct trace_entry *
1807__find_next_entry(struct trace_iterator *iter, int *ent_cpu, 2062__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1808 unsigned long *missing_events, u64 *ent_ts) 2063 unsigned long *missing_events, u64 *ent_ts)
1809{ 2064{
1810 struct ring_buffer *buffer = iter->tr->buffer; 2065 struct ring_buffer *buffer = iter->trace_buffer->buffer;
1811 struct trace_entry *ent, *next = NULL; 2066 struct trace_entry *ent, *next = NULL;
1812 unsigned long lost_events = 0, next_lost = 0; 2067 unsigned long lost_events = 0, next_lost = 0;
1813 int cpu_file = iter->cpu_file; 2068 int cpu_file = iter->cpu_file;
@@ -1820,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1820 * If we are in a per_cpu trace file, don't bother by iterating over 2075 * If we are in a per_cpu trace file, don't bother by iterating over
1821 * all cpu and peek directly. 2076 * all cpu and peek directly.
1822 */ 2077 */
1823 if (cpu_file > TRACE_PIPE_ALL_CPU) { 2078 if (cpu_file > RING_BUFFER_ALL_CPUS) {
1824 if (ring_buffer_empty_cpu(buffer, cpu_file)) 2079 if (ring_buffer_empty_cpu(buffer, cpu_file))
1825 return NULL; 2080 return NULL;
1826 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); 2081 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1884,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
1884 2139
1885static void trace_consume(struct trace_iterator *iter) 2140static void trace_consume(struct trace_iterator *iter)
1886{ 2141{
1887 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, 2142 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
1888 &iter->lost_events); 2143 &iter->lost_events);
1889} 2144}
1890 2145
@@ -1917,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1917 2172
1918void tracing_iter_reset(struct trace_iterator *iter, int cpu) 2173void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1919{ 2174{
1920 struct trace_array *tr = iter->tr;
1921 struct ring_buffer_event *event; 2175 struct ring_buffer_event *event;
1922 struct ring_buffer_iter *buf_iter; 2176 struct ring_buffer_iter *buf_iter;
1923 unsigned long entries = 0; 2177 unsigned long entries = 0;
1924 u64 ts; 2178 u64 ts;
1925 2179
1926 tr->data[cpu]->skipped_entries = 0; 2180 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
1927 2181
1928 buf_iter = trace_buffer_iter(iter, cpu); 2182 buf_iter = trace_buffer_iter(iter, cpu);
1929 if (!buf_iter) 2183 if (!buf_iter)
@@ -1937,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1937 * by the timestamp being before the start of the buffer. 2191 * by the timestamp being before the start of the buffer.
1938 */ 2192 */
1939 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { 2193 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1940 if (ts >= iter->tr->time_start) 2194 if (ts >= iter->trace_buffer->time_start)
1941 break; 2195 break;
1942 entries++; 2196 entries++;
1943 ring_buffer_read(buf_iter, NULL); 2197 ring_buffer_read(buf_iter, NULL);
1944 } 2198 }
1945 2199
1946 tr->data[cpu]->skipped_entries = entries; 2200 per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
1947} 2201}
1948 2202
1949/* 2203/*
@@ -1953,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1953static void *s_start(struct seq_file *m, loff_t *pos) 2207static void *s_start(struct seq_file *m, loff_t *pos)
1954{ 2208{
1955 struct trace_iterator *iter = m->private; 2209 struct trace_iterator *iter = m->private;
2210 struct trace_array *tr = iter->tr;
1956 int cpu_file = iter->cpu_file; 2211 int cpu_file = iter->cpu_file;
1957 void *p = NULL; 2212 void *p = NULL;
1958 loff_t l = 0; 2213 loff_t l = 0;
@@ -1965,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1965 * will point to the same string as current_trace->name. 2220 * will point to the same string as current_trace->name.
1966 */ 2221 */
1967 mutex_lock(&trace_types_lock); 2222 mutex_lock(&trace_types_lock);
1968 if (unlikely(current_trace && iter->trace->name != current_trace->name)) 2223 if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
1969 *iter->trace = *current_trace; 2224 *iter->trace = *tr->current_trace;
1970 mutex_unlock(&trace_types_lock); 2225 mutex_unlock(&trace_types_lock);
1971 2226
2227#ifdef CONFIG_TRACER_MAX_TRACE
1972 if (iter->snapshot && iter->trace->use_max_tr) 2228 if (iter->snapshot && iter->trace->use_max_tr)
1973 return ERR_PTR(-EBUSY); 2229 return ERR_PTR(-EBUSY);
2230#endif
1974 2231
1975 if (!iter->snapshot) 2232 if (!iter->snapshot)
1976 atomic_inc(&trace_record_cmdline_disabled); 2233 atomic_inc(&trace_record_cmdline_disabled);
@@ -1980,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1980 iter->cpu = 0; 2237 iter->cpu = 0;
1981 iter->idx = -1; 2238 iter->idx = -1;
1982 2239
1983 if (cpu_file == TRACE_PIPE_ALL_CPU) { 2240 if (cpu_file == RING_BUFFER_ALL_CPUS) {
1984 for_each_tracing_cpu(cpu) 2241 for_each_tracing_cpu(cpu)
1985 tracing_iter_reset(iter, cpu); 2242 tracing_iter_reset(iter, cpu);
1986 } else 2243 } else
@@ -2012,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)
2012{ 2269{
2013 struct trace_iterator *iter = m->private; 2270 struct trace_iterator *iter = m->private;
2014 2271
2272#ifdef CONFIG_TRACER_MAX_TRACE
2015 if (iter->snapshot && iter->trace->use_max_tr) 2273 if (iter->snapshot && iter->trace->use_max_tr)
2016 return; 2274 return;
2275#endif
2017 2276
2018 if (!iter->snapshot) 2277 if (!iter->snapshot)
2019 atomic_dec(&trace_record_cmdline_disabled); 2278 atomic_dec(&trace_record_cmdline_disabled);
2279
2020 trace_access_unlock(iter->cpu_file); 2280 trace_access_unlock(iter->cpu_file);
2021 trace_event_read_unlock(); 2281 trace_event_read_unlock();
2022} 2282}
2023 2283
2024static void 2284static void
2025get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) 2285get_total_entries(struct trace_buffer *buf,
2286 unsigned long *total, unsigned long *entries)
2026{ 2287{
2027 unsigned long count; 2288 unsigned long count;
2028 int cpu; 2289 int cpu;
@@ -2031,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
2031 *entries = 0; 2292 *entries = 0;
2032 2293
2033 for_each_tracing_cpu(cpu) { 2294 for_each_tracing_cpu(cpu) {
2034 count = ring_buffer_entries_cpu(tr->buffer, cpu); 2295 count = ring_buffer_entries_cpu(buf->buffer, cpu);
2035 /* 2296 /*
2036 * If this buffer has skipped entries, then we hold all 2297 * If this buffer has skipped entries, then we hold all
2037 * entries for the trace and we need to ignore the 2298 * entries for the trace and we need to ignore the
2038 * ones before the time stamp. 2299 * ones before the time stamp.
2039 */ 2300 */
2040 if (tr->data[cpu]->skipped_entries) { 2301 if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
2041 count -= tr->data[cpu]->skipped_entries; 2302 count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
2042 /* total is the same as the entries */ 2303 /* total is the same as the entries */
2043 *total += count; 2304 *total += count;
2044 } else 2305 } else
2045 *total += count + 2306 *total += count +
2046 ring_buffer_overrun_cpu(tr->buffer, cpu); 2307 ring_buffer_overrun_cpu(buf->buffer, cpu);
2047 *entries += count; 2308 *entries += count;
2048 } 2309 }
2049} 2310}
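
To summarize the access-pattern change this hunk (and the rest of the patch) applies: per-CPU trace data now lives in a real per-CPU allocation hung off struct trace_buffer, so lookups go through per_cpu_ptr() instead of indexing tr->data[cpu]. A small sketch with a hypothetical helper name:

static unsigned long my_entries_on_cpu(struct trace_buffer *buf, int cpu)
{
	/* buf->data is a per-CPU allocation, not an array of pointers */
	struct trace_array_cpu *data = per_cpu_ptr(buf->data, cpu);

	return data->entries;
}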
@@ -2060,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)
2060 seq_puts(m, "# \\ / ||||| \\ | / \n"); 2321 seq_puts(m, "# \\ / ||||| \\ | / \n");
2061} 2322}
2062 2323
2063static void print_event_info(struct trace_array *tr, struct seq_file *m) 2324static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
2064{ 2325{
2065 unsigned long total; 2326 unsigned long total;
2066 unsigned long entries; 2327 unsigned long entries;
2067 2328
2068 get_total_entries(tr, &total, &entries); 2329 get_total_entries(buf, &total, &entries);
2069 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", 2330 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
2070 entries, total, num_online_cpus()); 2331 entries, total, num_online_cpus());
2071 seq_puts(m, "#\n"); 2332 seq_puts(m, "#\n");
2072} 2333}
2073 2334
2074static void print_func_help_header(struct trace_array *tr, struct seq_file *m) 2335static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
2075{ 2336{
2076 print_event_info(tr, m); 2337 print_event_info(buf, m);
2077 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 2338 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
2078 seq_puts(m, "# | | | | |\n"); 2339 seq_puts(m, "# | | | | |\n");
2079} 2340}
2080 2341
2081static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) 2342static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
2082{ 2343{
2083 print_event_info(tr, m); 2344 print_event_info(buf, m);
2084 seq_puts(m, "# _-----=> irqs-off\n"); 2345 seq_puts(m, "# _-----=> irqs-off\n");
2085 seq_puts(m, "# / _----=> need-resched\n"); 2346 seq_puts(m, "# / _----=> need-resched\n");
2086 seq_puts(m, "# | / _---=> hardirq/softirq\n"); 2347 seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -2094,16 +2355,16 @@ void
2094print_trace_header(struct seq_file *m, struct trace_iterator *iter) 2355print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2095{ 2356{
2096 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 2357 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
2097 struct trace_array *tr = iter->tr; 2358 struct trace_buffer *buf = iter->trace_buffer;
2098 struct trace_array_cpu *data = tr->data[tr->cpu]; 2359 struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
2099 struct tracer *type = current_trace; 2360 struct tracer *type = iter->trace;
2100 unsigned long entries; 2361 unsigned long entries;
2101 unsigned long total; 2362 unsigned long total;
2102 const char *name = "preemption"; 2363 const char *name = "preemption";
2103 2364
2104 name = type->name; 2365 name = type->name;
2105 2366
2106 get_total_entries(tr, &total, &entries); 2367 get_total_entries(buf, &total, &entries);
2107 2368
2108 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 2369 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
2109 name, UTS_RELEASE); 2370 name, UTS_RELEASE);
@@ -2114,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2114 nsecs_to_usecs(data->saved_latency), 2375 nsecs_to_usecs(data->saved_latency),
2115 entries, 2376 entries,
2116 total, 2377 total,
2117 tr->cpu, 2378 buf->cpu,
2118#if defined(CONFIG_PREEMPT_NONE) 2379#if defined(CONFIG_PREEMPT_NONE)
2119 "server", 2380 "server",
2120#elif defined(CONFIG_PREEMPT_VOLUNTARY) 2381#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2165,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
2165 if (cpumask_test_cpu(iter->cpu, iter->started)) 2426 if (cpumask_test_cpu(iter->cpu, iter->started))
2166 return; 2427 return;
2167 2428
2168 if (iter->tr->data[iter->cpu]->skipped_entries) 2429 if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
2169 return; 2430 return;
2170 2431
2171 cpumask_set_cpu(iter->cpu, iter->started); 2432 cpumask_set_cpu(iter->cpu, iter->started);
@@ -2288,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)
2288 int cpu; 2549 int cpu;
2289 2550
2290 /* If we are looking at one CPU buffer, only check that one */ 2551 /* If we are looking at one CPU buffer, only check that one */
2291 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { 2552 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
2292 cpu = iter->cpu_file; 2553 cpu = iter->cpu_file;
2293 buf_iter = trace_buffer_iter(iter, cpu); 2554 buf_iter = trace_buffer_iter(iter, cpu);
2294 if (buf_iter) { 2555 if (buf_iter) {
2295 if (!ring_buffer_iter_empty(buf_iter)) 2556 if (!ring_buffer_iter_empty(buf_iter))
2296 return 0; 2557 return 0;
2297 } else { 2558 } else {
2298 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2559 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2299 return 0; 2560 return 0;
2300 } 2561 }
2301 return 1; 2562 return 1;
@@ -2307,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)
2307 if (!ring_buffer_iter_empty(buf_iter)) 2568 if (!ring_buffer_iter_empty(buf_iter))
2308 return 0; 2569 return 0;
2309 } else { 2570 } else {
2310 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) 2571 if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
2311 return 0; 2572 return 0;
2312 } 2573 }
2313 } 2574 }
@@ -2331,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2331 return ret; 2592 return ret;
2332 } 2593 }
2333 2594
2595 if (iter->ent->type == TRACE_BPUTS &&
2596 trace_flags & TRACE_ITER_PRINTK &&
2597 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2598 return trace_print_bputs_msg_only(iter);
2599
2334 if (iter->ent->type == TRACE_BPRINT && 2600 if (iter->ent->type == TRACE_BPRINT &&
2335 trace_flags & TRACE_ITER_PRINTK && 2601 trace_flags & TRACE_ITER_PRINTK &&
2336 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 2602 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2385,9 +2651,9 @@ void trace_default_header(struct seq_file *m)
2385 } else { 2651 } else {
2386 if (!(trace_flags & TRACE_ITER_VERBOSE)) { 2652 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2387 if (trace_flags & TRACE_ITER_IRQ_INFO) 2653 if (trace_flags & TRACE_ITER_IRQ_INFO)
2388 print_func_help_header_irq(iter->tr, m); 2654 print_func_help_header_irq(iter->trace_buffer, m);
2389 else 2655 else
2390 print_func_help_header(iter->tr, m); 2656 print_func_help_header(iter->trace_buffer, m);
2391 } 2657 }
2392 } 2658 }
2393} 2659}
@@ -2400,6 +2666,50 @@ static void test_ftrace_alive(struct seq_file *m)
2400 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); 2666 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2401} 2667}
2402 2668
2669#ifdef CONFIG_TRACER_MAX_TRACE
2670static void show_snapshot_main_help(struct seq_file *m)
2671{
2672 seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
2673 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2674 seq_printf(m, "# Takes a snapshot of the main buffer.\n");
2675 seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
2676 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2677 seq_printf(m, "# is not a '0' or '1')\n");

2678}
2679
2680static void show_snapshot_percpu_help(struct seq_file *m)
2681{
2682 seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
2683#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2684 seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
2685 seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
2686#else
2687 seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
2688 seq_printf(m, "# Must use main snapshot file to allocate.\n");
2689#endif
2690 seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
2691 seq_printf(m, "# (Doesn't have to be '2', works with any number that\n");
2692 seq_printf(m, "# is not a '0' or '1')\n");
2693}
2694
2695static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
2696{
2697 if (iter->tr->allocated_snapshot)
2698 seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
2699 else
2700 seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
2701
2702 seq_printf(m, "# Snapshot commands:\n");
2703 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
2704 show_snapshot_main_help(m);
2705 else
2706 show_snapshot_percpu_help(m);
2707}
2708#else
2709/* Should never be called */
2710static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
2711#endif
2712
2403static int s_show(struct seq_file *m, void *v) 2713static int s_show(struct seq_file *m, void *v)
2404{ 2714{
2405 struct trace_iterator *iter = v; 2715 struct trace_iterator *iter = v;
@@ -2411,7 +2721,9 @@ static int s_show(struct seq_file *m, void *v)
2411 seq_puts(m, "#\n"); 2721 seq_puts(m, "#\n");
2412 test_ftrace_alive(m); 2722 test_ftrace_alive(m);
2413 } 2723 }
2414 if (iter->trace && iter->trace->print_header) 2724 if (iter->snapshot && trace_empty(iter))
2725 print_snapshot_help(m, iter);
2726 else if (iter->trace && iter->trace->print_header)
2415 iter->trace->print_header(m); 2727 iter->trace->print_header(m);
2416 else 2728 else
2417 trace_default_header(m); 2729 trace_default_header(m);
@@ -2452,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {
2452static struct trace_iterator * 2764static struct trace_iterator *
2453__tracing_open(struct inode *inode, struct file *file, bool snapshot) 2765__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2454{ 2766{
2455 long cpu_file = (long) inode->i_private; 2767 struct trace_cpu *tc = inode->i_private;
2768 struct trace_array *tr = tc->tr;
2456 struct trace_iterator *iter; 2769 struct trace_iterator *iter;
2457 int cpu; 2770 int cpu;
2458 2771
@@ -2477,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2477 if (!iter->trace) 2790 if (!iter->trace)
2478 goto fail; 2791 goto fail;
2479 2792
2480 *iter->trace = *current_trace; 2793 *iter->trace = *tr->current_trace;
2481 2794
2482 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2795 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2483 goto fail; 2796 goto fail;
2484 2797
2485 if (current_trace->print_max || snapshot) 2798 iter->tr = tr;
2486 iter->tr = &max_tr; 2799
2800#ifdef CONFIG_TRACER_MAX_TRACE
2801 /* Currently only the top directory has a snapshot */
2802 if (tr->current_trace->print_max || snapshot)
2803 iter->trace_buffer = &tr->max_buffer;
2487 else 2804 else
2488 iter->tr = &global_trace; 2805#endif
2806 iter->trace_buffer = &tr->trace_buffer;
2489 iter->snapshot = snapshot; 2807 iter->snapshot = snapshot;
2490 iter->pos = -1; 2808 iter->pos = -1;
2491 mutex_init(&iter->mutex); 2809 mutex_init(&iter->mutex);
2492 iter->cpu_file = cpu_file; 2810 iter->cpu_file = tc->cpu;
2493 2811
2494 /* Notify the tracer early; before we stop tracing. */ 2812 /* Notify the tracer early; before we stop tracing. */
2495 if (iter->trace && iter->trace->open) 2813 if (iter->trace && iter->trace->open)
2496 iter->trace->open(iter); 2814 iter->trace->open(iter);
2497 2815
2498 /* Annotate start of buffers if we had overruns */ 2816 /* Annotate start of buffers if we had overruns */
2499 if (ring_buffer_overruns(iter->tr->buffer)) 2817 if (ring_buffer_overruns(iter->trace_buffer->buffer))
2500 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2818 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2501 2819
2502 /* Output in nanoseconds only if we are using a clock in nanoseconds. */ 2820 /* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2505,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2505 2823
2506 /* stop the trace while dumping if we are not opening "snapshot" */ 2824 /* stop the trace while dumping if we are not opening "snapshot" */
2507 if (!iter->snapshot) 2825 if (!iter->snapshot)
2508 tracing_stop(); 2826 tracing_stop_tr(tr);
2509 2827
2510 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2828 if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
2511 for_each_tracing_cpu(cpu) { 2829 for_each_tracing_cpu(cpu) {
2512 iter->buffer_iter[cpu] = 2830 iter->buffer_iter[cpu] =
2513 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2831 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2514 } 2832 }
2515 ring_buffer_read_prepare_sync(); 2833 ring_buffer_read_prepare_sync();
2516 for_each_tracing_cpu(cpu) { 2834 for_each_tracing_cpu(cpu) {
@@ -2520,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
2520 } else { 2838 } else {
2521 cpu = iter->cpu_file; 2839 cpu = iter->cpu_file;
2522 iter->buffer_iter[cpu] = 2840 iter->buffer_iter[cpu] =
2523 ring_buffer_read_prepare(iter->tr->buffer, cpu); 2841 ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
2524 ring_buffer_read_prepare_sync(); 2842 ring_buffer_read_prepare_sync();
2525 ring_buffer_read_start(iter->buffer_iter[cpu]); 2843 ring_buffer_read_start(iter->buffer_iter[cpu]);
2526 tracing_iter_reset(iter, cpu); 2844 tracing_iter_reset(iter, cpu);
2527 } 2845 }
2528 2846
2847 tr->ref++;
2848
2529 mutex_unlock(&trace_types_lock); 2849 mutex_unlock(&trace_types_lock);
2530 2850
2531 return iter; 2851 return iter;
@@ -2552,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)
2552{ 2872{
2553 struct seq_file *m = file->private_data; 2873 struct seq_file *m = file->private_data;
2554 struct trace_iterator *iter; 2874 struct trace_iterator *iter;
2875 struct trace_array *tr;
2555 int cpu; 2876 int cpu;
2556 2877
2557 if (!(file->f_mode & FMODE_READ)) 2878 if (!(file->f_mode & FMODE_READ))
2558 return 0; 2879 return 0;
2559 2880
2560 iter = m->private; 2881 iter = m->private;
2882 tr = iter->tr;
2561 2883
2562 mutex_lock(&trace_types_lock); 2884 mutex_lock(&trace_types_lock);
2885
2886 WARN_ON(!tr->ref);
2887 tr->ref--;
2888
2563 for_each_tracing_cpu(cpu) { 2889 for_each_tracing_cpu(cpu) {
2564 if (iter->buffer_iter[cpu]) 2890 if (iter->buffer_iter[cpu])
2565 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2891 ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2570,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)
2570 2896
2571 if (!iter->snapshot) 2897 if (!iter->snapshot)
2572 /* reenable tracing if it was previously enabled */ 2898 /* reenable tracing if it was previously enabled */
2573 tracing_start(); 2899 tracing_start_tr(tr);
2574 mutex_unlock(&trace_types_lock); 2900 mutex_unlock(&trace_types_lock);
2575 2901
2576 mutex_destroy(&iter->mutex); 2902 mutex_destroy(&iter->mutex);
@@ -2589,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)
2589 /* If this file was open for write, then erase contents */ 2915 /* If this file was open for write, then erase contents */
2590 if ((file->f_mode & FMODE_WRITE) && 2916 if ((file->f_mode & FMODE_WRITE) &&
2591 (file->f_flags & O_TRUNC)) { 2917 (file->f_flags & O_TRUNC)) {
2592 long cpu = (long) inode->i_private; 2918 struct trace_cpu *tc = inode->i_private;
2919 struct trace_array *tr = tc->tr;
2593 2920
2594 if (cpu == TRACE_PIPE_ALL_CPU) 2921 if (tc->cpu == RING_BUFFER_ALL_CPUS)
2595 tracing_reset_online_cpus(&global_trace); 2922 tracing_reset_online_cpus(&tr->trace_buffer);
2596 else 2923 else
2597 tracing_reset(&global_trace, cpu); 2924 tracing_reset(&tr->trace_buffer, tc->cpu);
2598 } 2925 }
2599 2926
2600 if (file->f_mode & FMODE_READ) { 2927 if (file->f_mode & FMODE_READ) {
@@ -2741,8 +3068,9 @@ static ssize_t
2741tracing_cpumask_write(struct file *filp, const char __user *ubuf, 3068tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2742 size_t count, loff_t *ppos) 3069 size_t count, loff_t *ppos)
2743{ 3070{
2744 int err, cpu; 3071 struct trace_array *tr = filp->private_data;
2745 cpumask_var_t tracing_cpumask_new; 3072 cpumask_var_t tracing_cpumask_new;
3073 int err, cpu;
2746 3074
2747 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 3075 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2748 return -ENOMEM; 3076 return -ENOMEM;
@@ -2762,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2762 */ 3090 */
2763 if (cpumask_test_cpu(cpu, tracing_cpumask) && 3091 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2764 !cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3092 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2765 atomic_inc(&global_trace.data[cpu]->disabled); 3093 atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2766 ring_buffer_record_disable_cpu(global_trace.buffer, cpu); 3094 ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
2767 } 3095 }
2768 if (!cpumask_test_cpu(cpu, tracing_cpumask) && 3096 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2769 cpumask_test_cpu(cpu, tracing_cpumask_new)) { 3097 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2770 atomic_dec(&global_trace.data[cpu]->disabled); 3098 atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
2771 ring_buffer_record_enable_cpu(global_trace.buffer, cpu); 3099 ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
2772 } 3100 }
2773 } 3101 }
2774 arch_spin_unlock(&ftrace_max_lock); 3102 arch_spin_unlock(&ftrace_max_lock);
@@ -2797,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {
2797static int tracing_trace_options_show(struct seq_file *m, void *v) 3125static int tracing_trace_options_show(struct seq_file *m, void *v)
2798{ 3126{
2799 struct tracer_opt *trace_opts; 3127 struct tracer_opt *trace_opts;
3128 struct trace_array *tr = m->private;
2800 u32 tracer_flags; 3129 u32 tracer_flags;
2801 int i; 3130 int i;
2802 3131
2803 mutex_lock(&trace_types_lock); 3132 mutex_lock(&trace_types_lock);
2804 tracer_flags = current_trace->flags->val; 3133 tracer_flags = tr->current_trace->flags->val;
2805 trace_opts = current_trace->flags->opts; 3134 trace_opts = tr->current_trace->flags->opts;
2806 3135
2807 for (i = 0; trace_options[i]; i++) { 3136 for (i = 0; trace_options[i]; i++) {
2808 if (trace_flags & (1 << i)) 3137 if (trace_flags & (1 << i))
@@ -2857,11 +3186,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2857 return -EINVAL; 3186 return -EINVAL;
2858} 3187}
2859 3188
2860static void set_tracer_flags(unsigned int mask, int enabled) 3189/* Some tracers require overwrite to stay enabled */
3190int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
3191{
3192 if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
3193 return -1;
3194
3195 return 0;
3196}
3197
3198int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
2861{ 3199{
2862 /* do nothing if flag is already set */ 3200 /* do nothing if flag is already set */
2863 if (!!(trace_flags & mask) == !!enabled) 3201 if (!!(trace_flags & mask) == !!enabled)
2864 return; 3202 return 0;
3203
3204 /* Give the tracer a chance to approve the change */
3205 if (tr->current_trace->flag_changed)
3206 if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
3207 return -EINVAL;
2865 3208
2866 if (enabled) 3209 if (enabled)
2867 trace_flags |= mask; 3210 trace_flags |= mask;
@@ -2871,18 +3214,24 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2871 if (mask == TRACE_ITER_RECORD_CMD) 3214 if (mask == TRACE_ITER_RECORD_CMD)
2872 trace_event_enable_cmd_record(enabled); 3215 trace_event_enable_cmd_record(enabled);
2873 3216
2874 if (mask == TRACE_ITER_OVERWRITE) 3217 if (mask == TRACE_ITER_OVERWRITE) {
2875 ring_buffer_change_overwrite(global_trace.buffer, enabled); 3218 ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
3219#ifdef CONFIG_TRACER_MAX_TRACE
3220 ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
3221#endif
3222 }
2876 3223
2877 if (mask == TRACE_ITER_PRINTK) 3224 if (mask == TRACE_ITER_PRINTK)
2878 trace_printk_start_stop_comm(enabled); 3225 trace_printk_start_stop_comm(enabled);
3226
3227 return 0;
2879} 3228}
2880 3229
2881static int trace_set_options(char *option) 3230static int trace_set_options(struct trace_array *tr, char *option)
2882{ 3231{
2883 char *cmp; 3232 char *cmp;
2884 int neg = 0; 3233 int neg = 0;
2885 int ret = 0; 3234 int ret = -ENODEV;
2886 int i; 3235 int i;
2887 3236
2888 cmp = strstrip(option); 3237 cmp = strstrip(option);
@@ -2892,19 +3241,20 @@ static int trace_set_options(char *option)
2892 cmp += 2; 3241 cmp += 2;
2893 } 3242 }
2894 3243
3244 mutex_lock(&trace_types_lock);
3245
2895 for (i = 0; trace_options[i]; i++) { 3246 for (i = 0; trace_options[i]; i++) {
2896 if (strcmp(cmp, trace_options[i]) == 0) { 3247 if (strcmp(cmp, trace_options[i]) == 0) {
2897 set_tracer_flags(1 << i, !neg); 3248 ret = set_tracer_flag(tr, 1 << i, !neg);
2898 break; 3249 break;
2899 } 3250 }
2900 } 3251 }
2901 3252
2902 /* If no option could be set, test the specific tracer options */ 3253 /* If no option could be set, test the specific tracer options */
2903 if (!trace_options[i]) { 3254 if (!trace_options[i])
2904 mutex_lock(&trace_types_lock); 3255 ret = set_tracer_option(tr->current_trace, cmp, neg);
2905 ret = set_tracer_option(current_trace, cmp, neg); 3256
2906 mutex_unlock(&trace_types_lock); 3257 mutex_unlock(&trace_types_lock);
2907 }
2908 3258
2909 return ret; 3259 return ret;
2910} 3260}
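
For context (illustrative, not from this patch): a tracer that needs ring-buffer overwrite to stay enabled can point the new flag_changed callback at trace_keep_overwrite(), so set_tracer_flag() above rejects clearing the option while that tracer is active. All fields other than .flag_changed are hypothetical.

static struct tracer my_latency_tracer __read_mostly = {
	.name		= "my_latency",		/* hypothetical */
	.init		= my_latency_init,	/* hypothetical */
	.reset		= my_latency_reset,	/* hypothetical */
	.flag_changed	= trace_keep_overwrite,
	.use_max_tr	= true,
};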
@@ -2913,7 +3263,10 @@ static ssize_t
2913tracing_trace_options_write(struct file *filp, const char __user *ubuf, 3263tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2914 size_t cnt, loff_t *ppos) 3264 size_t cnt, loff_t *ppos)
2915{ 3265{
3266 struct seq_file *m = filp->private_data;
3267 struct trace_array *tr = m->private;
2916 char buf[64]; 3268 char buf[64];
3269 int ret;
2917 3270
2918 if (cnt >= sizeof(buf)) 3271 if (cnt >= sizeof(buf))
2919 return -EINVAL; 3272 return -EINVAL;
@@ -2923,7 +3276,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2923 3276
2924 buf[cnt] = 0; 3277 buf[cnt] = 0;
2925 3278
2926 trace_set_options(buf); 3279 ret = trace_set_options(tr, buf);
3280 if (ret < 0)
3281 return ret;
2927 3282
2928 *ppos += cnt; 3283 *ppos += cnt;
2929 3284
@@ -2934,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
2934{ 3289{
2935 if (tracing_disabled) 3290 if (tracing_disabled)
2936 return -ENODEV; 3291 return -ENODEV;
2937 return single_open(file, tracing_trace_options_show, NULL); 3292
3293 return single_open(file, tracing_trace_options_show, inode->i_private);
2938} 3294}
2939 3295
2940static const struct file_operations tracing_iter_fops = { 3296static const struct file_operations tracing_iter_fops = {
@@ -2947,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {
2947 3303
2948static const char readme_msg[] = 3304static const char readme_msg[] =
2949 "tracing mini-HOWTO:\n\n" 3305 "tracing mini-HOWTO:\n\n"
2950 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 3306 "# echo 0 > tracing_on : quick way to disable tracing\n"
2951 "# cat /sys/kernel/debug/tracing/available_tracers\n" 3307 "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
2952 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" 3308 " Important files:\n"
2953 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3309 " trace\t\t\t- The static contents of the buffer\n"
2954 "nop\n" 3310 "\t\t\t To clear the buffer write into this file: echo > trace\n"
2955 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" 3311 " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
2956 "# cat /sys/kernel/debug/tracing/current_tracer\n" 3312 " current_tracer\t- function and latency tracers\n"
2957 "wakeup\n" 3313 " available_tracers\t- list of configured tracers for current_tracer\n"
2958 "# cat /sys/kernel/debug/tracing/trace_options\n" 3314 " buffer_size_kb\t- view and modify size of per cpu buffer\n"
2959 "noprint-parent nosym-offset nosym-addr noverbose\n" 3315 " buffer_total_size_kb - view total size of all cpu buffers\n\n"
2960 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 3316 " trace_clock\t\t-change the clock used to order events\n"
2961 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" 3317 " local: Per cpu clock but may not be synced across CPUs\n"
2962 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 3318 " global: Synced across CPUs but slows tracing down.\n"
2963 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" 3319 " counter: Not a clock, but just an increment\n"
3320 " uptime: Jiffy counter from time of boot\n"
3321 " perf: Same clock that perf events use\n"
3322#ifdef CONFIG_X86_64
3323 " x86-tsc: TSC cycle counter\n"
3324#endif
3325 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
3326 " tracing_cpumask\t- Limit which CPUs to trace\n"
3327 " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
3328 "\t\t\t Remove sub-buffer with rmdir\n"
3329 " trace_options\t\t- Set format or modify how tracing happens\n"
3330 "\t\t\t Disable an option by adding a suffix 'no' to the option name\n"
3331#ifdef CONFIG_DYNAMIC_FTRACE
3332 "\n available_filter_functions - list of functions that can be filtered on\n"
3333 " set_ftrace_filter\t- echo function name in here to only trace these functions\n"
3334 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3335 " modules: Can select a group via module\n"
3336 " Format: :mod:<module-name>\n"
3337 " example: echo :mod:ext3 > set_ftrace_filter\n"
3338 " triggers: a command to perform when function is hit\n"
3339 " Format: <function>:<trigger>[:count]\n"
3340 " trigger: traceon, traceoff\n"
3341 " enable_event:<system>:<event>\n"
3342 " disable_event:<system>:<event>\n"
3343#ifdef CONFIG_STACKTRACE
3344 " stacktrace\n"
3345#endif
3346#ifdef CONFIG_TRACER_SNAPSHOT
3347 " snapshot\n"
3348#endif
3349 " example: echo do_fault:traceoff > set_ftrace_filter\n"
3350 " echo do_trap:traceoff:3 > set_ftrace_filter\n"
3351 " The first one will disable tracing every time do_fault is hit\n"
3352 " The second will disable tracing at most 3 times when do_trap is hit\n"
3353 " The first time do trap is hit and it disables tracing, the counter\n"
3354 " will decrement to 2. If tracing is already disabled, the counter\n"
3355 " will not decrement. It only decrements when the trigger did work\n"
3356 " To remove trigger without count:\n"
3357 " echo '!<function>:<trigger> > set_ftrace_filter\n"
3358 " To remove trigger with a count:\n"
3359 " echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
3360 " set_ftrace_notrace\t- echo function name in here to never trace.\n"
3361 " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
3362 " modules: Can select a group via module command :mod:\n"
3363 " Does not accept triggers\n"
3364#endif /* CONFIG_DYNAMIC_FTRACE */
3365#ifdef CONFIG_FUNCTION_TRACER
3366 " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
3367#endif
3368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
3369 " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
3370 " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
3371#endif
3372#ifdef CONFIG_TRACER_SNAPSHOT
3373 "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
3374 "\t\t\t Read the contents for more information\n"
3375#endif
3376#ifdef CONFIG_STACKTRACE
3377 " stack_trace\t\t- Shows the max stack trace when active\n"
3378 " stack_max_size\t- Shows current max stack size that was traced\n"
3379 "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
3380#ifdef CONFIG_DYNAMIC_FTRACE
3381 " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
3382#endif
3383#endif /* CONFIG_STACKTRACE */
2964; 3384;
2965 3385
2966static ssize_t 3386static ssize_t
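
Since the new mini-HOWTO above no longer walks through a full command sequence, here is a small self-contained userspace sketch (not part of the patch) that drives the documented files; it assumes debugfs is mounted at /sys/kernel/debug and that the caller has permission to write there.

#include <stdio.h>

#define TRACING "/sys/kernel/debug/tracing/"

static int write_tracing_file(const char *name, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), TRACING "%s", name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	write_tracing_file("trace", "");		/* clear the buffer */
	write_tracing_file("current_tracer", "function");
	write_tracing_file("tracing_on", "1");		/* quick way to re-enable tracing */
	/* ... run the workload of interest here ... */
	write_tracing_file("tracing_on", "0");		/* quick way to disable tracing */
	return 0;
}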
@@ -3032,11 +3452,12 @@ static ssize_t
3032tracing_set_trace_read(struct file *filp, char __user *ubuf, 3452tracing_set_trace_read(struct file *filp, char __user *ubuf,
3033 size_t cnt, loff_t *ppos) 3453 size_t cnt, loff_t *ppos)
3034{ 3454{
3455 struct trace_array *tr = filp->private_data;
3035 char buf[MAX_TRACER_SIZE+2]; 3456 char buf[MAX_TRACER_SIZE+2];
3036 int r; 3457 int r;
3037 3458
3038 mutex_lock(&trace_types_lock); 3459 mutex_lock(&trace_types_lock);
3039 r = sprintf(buf, "%s\n", current_trace->name); 3460 r = sprintf(buf, "%s\n", tr->current_trace->name);
3040 mutex_unlock(&trace_types_lock); 3461 mutex_unlock(&trace_types_lock);
3041 3462
3042 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3463 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3044,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3044 3465
3045int tracer_init(struct tracer *t, struct trace_array *tr) 3466int tracer_init(struct tracer *t, struct trace_array *tr)
3046{ 3467{
3047 tracing_reset_online_cpus(tr); 3468 tracing_reset_online_cpus(&tr->trace_buffer);
3048 return t->init(tr); 3469 return t->init(tr);
3049} 3470}
3050 3471
3051static void set_buffer_entries(struct trace_array *tr, unsigned long val) 3472static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
3052{ 3473{
3053 int cpu; 3474 int cpu;
3475
3054 for_each_tracing_cpu(cpu) 3476 for_each_tracing_cpu(cpu)
3055 tr->data[cpu]->entries = val; 3477 per_cpu_ptr(buf->data, cpu)->entries = val;
3056} 3478}
3057 3479
3480#ifdef CONFIG_TRACER_MAX_TRACE
3058/* resize @tr's buffer to the size of @size_tr's entries */ 3481/* resize @tr's buffer to the size of @size_tr's entries */
3059static int resize_buffer_duplicate_size(struct trace_array *tr, 3482static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
3060 struct trace_array *size_tr, int cpu_id) 3483 struct trace_buffer *size_buf, int cpu_id)
3061{ 3484{
3062 int cpu, ret = 0; 3485 int cpu, ret = 0;
3063 3486
3064 if (cpu_id == RING_BUFFER_ALL_CPUS) { 3487 if (cpu_id == RING_BUFFER_ALL_CPUS) {
3065 for_each_tracing_cpu(cpu) { 3488 for_each_tracing_cpu(cpu) {
3066 ret = ring_buffer_resize(tr->buffer, 3489 ret = ring_buffer_resize(trace_buf->buffer,
3067 size_tr->data[cpu]->entries, cpu); 3490 per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
3068 if (ret < 0) 3491 if (ret < 0)
3069 break; 3492 break;
3070 tr->data[cpu]->entries = size_tr->data[cpu]->entries; 3493 per_cpu_ptr(trace_buf->data, cpu)->entries =
3494 per_cpu_ptr(size_buf->data, cpu)->entries;
3071 } 3495 }
3072 } else { 3496 } else {
3073 ret = ring_buffer_resize(tr->buffer, 3497 ret = ring_buffer_resize(trace_buf->buffer,
3074 size_tr->data[cpu_id]->entries, cpu_id); 3498 per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
3075 if (ret == 0) 3499 if (ret == 0)
3076 tr->data[cpu_id]->entries = 3500 per_cpu_ptr(trace_buf->data, cpu_id)->entries =
3077 size_tr->data[cpu_id]->entries; 3501 per_cpu_ptr(size_buf->data, cpu_id)->entries;
3078 } 3502 }
3079 3503
3080 return ret; 3504 return ret;
3081} 3505}
3506#endif /* CONFIG_TRACER_MAX_TRACE */
3082 3507
3083static int __tracing_resize_ring_buffer(unsigned long size, int cpu) 3508static int __tracing_resize_ring_buffer(struct trace_array *tr,
3509 unsigned long size, int cpu)
3084{ 3510{
3085 int ret; 3511 int ret;
3086 3512
@@ -3089,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3089 * we use the size that was given, and we can forget about 3515 * we use the size that was given, and we can forget about
3090 * expanding it later. 3516 * expanding it later.
3091 */ 3517 */
3092 ring_buffer_expanded = 1; 3518 ring_buffer_expanded = true;
3093 3519
3094 /* May be called before buffers are initialized */ 3520 /* May be called before buffers are initialized */
3095 if (!global_trace.buffer) 3521 if (!tr->trace_buffer.buffer)
3096 return 0; 3522 return 0;
3097 3523
3098 ret = ring_buffer_resize(global_trace.buffer, size, cpu); 3524 ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
3099 if (ret < 0) 3525 if (ret < 0)
3100 return ret; 3526 return ret;
3101 3527
3102 if (!current_trace->use_max_tr) 3528#ifdef CONFIG_TRACER_MAX_TRACE
3529 if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
3530 !tr->current_trace->use_max_tr)
3103 goto out; 3531 goto out;
3104 3532
3105 ret = ring_buffer_resize(max_tr.buffer, size, cpu); 3533 ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
3106 if (ret < 0) { 3534 if (ret < 0) {
3107 int r = resize_buffer_duplicate_size(&global_trace, 3535 int r = resize_buffer_duplicate_size(&tr->trace_buffer,
3108 &global_trace, cpu); 3536 &tr->trace_buffer, cpu);
3109 if (r < 0) { 3537 if (r < 0) {
3110 /* 3538 /*
3111 * AARGH! We are left with different 3539 * AARGH! We are left with different
@@ -3128,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
3128 } 3556 }
3129 3557
3130 if (cpu == RING_BUFFER_ALL_CPUS) 3558 if (cpu == RING_BUFFER_ALL_CPUS)
3131 set_buffer_entries(&max_tr, size); 3559 set_buffer_entries(&tr->max_buffer, size);
3132 else 3560 else
3133 max_tr.data[cpu]->entries = size; 3561 per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
3134 3562
3135 out: 3563 out:
3564#endif /* CONFIG_TRACER_MAX_TRACE */
3565
3136 if (cpu == RING_BUFFER_ALL_CPUS) 3566 if (cpu == RING_BUFFER_ALL_CPUS)
3137 set_buffer_entries(&global_trace, size); 3567 set_buffer_entries(&tr->trace_buffer, size);
3138 else 3568 else
3139 global_trace.data[cpu]->entries = size; 3569 per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
3140 3570
3141 return ret; 3571 return ret;
3142} 3572}
3143 3573
3144static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) 3574static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
3575 unsigned long size, int cpu_id)
3145{ 3576{
3146 int ret = size; 3577 int ret = size;
3147 3578
@@ -3155,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
3155 } 3586 }
3156 } 3587 }
3157 3588
3158 ret = __tracing_resize_ring_buffer(size, cpu_id); 3589 ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
3159 if (ret < 0) 3590 if (ret < 0)
3160 ret = -ENOMEM; 3591 ret = -ENOMEM;
3161 3592
@@ -3182,7 +3613,7 @@ int tracing_update_buffers(void)
3182 3613
3183 mutex_lock(&trace_types_lock); 3614 mutex_lock(&trace_types_lock);
3184 if (!ring_buffer_expanded) 3615 if (!ring_buffer_expanded)
3185 ret = __tracing_resize_ring_buffer(trace_buf_size, 3616 ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
3186 RING_BUFFER_ALL_CPUS); 3617 RING_BUFFER_ALL_CPUS);
3187 mutex_unlock(&trace_types_lock); 3618 mutex_unlock(&trace_types_lock);
3188 3619
@@ -3192,7 +3623,7 @@ int tracing_update_buffers(void)
3192struct trace_option_dentry; 3623struct trace_option_dentry;
3193 3624
3194static struct trace_option_dentry * 3625static struct trace_option_dentry *
3195create_trace_option_files(struct tracer *tracer); 3626create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
3196 3627
3197static void 3628static void
3198destroy_trace_option_files(struct trace_option_dentry *topts); 3629destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3202,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)
3202 static struct trace_option_dentry *topts; 3633 static struct trace_option_dentry *topts;
3203 struct trace_array *tr = &global_trace; 3634 struct trace_array *tr = &global_trace;
3204 struct tracer *t; 3635 struct tracer *t;
3636#ifdef CONFIG_TRACER_MAX_TRACE
3205 bool had_max_tr; 3637 bool had_max_tr;
3638#endif
3206 int ret = 0; 3639 int ret = 0;
3207 3640
3208 mutex_lock(&trace_types_lock); 3641 mutex_lock(&trace_types_lock);
3209 3642
3210 if (!ring_buffer_expanded) { 3643 if (!ring_buffer_expanded) {
3211 ret = __tracing_resize_ring_buffer(trace_buf_size, 3644 ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
3212 RING_BUFFER_ALL_CPUS); 3645 RING_BUFFER_ALL_CPUS);
3213 if (ret < 0) 3646 if (ret < 0)
3214 goto out; 3647 goto out;
@@ -3223,15 +3656,21 @@ static int tracing_set_tracer(const char *buf)
3223 ret = -EINVAL; 3656 ret = -EINVAL;
3224 goto out; 3657 goto out;
3225 } 3658 }
3226 if (t == current_trace) 3659 if (t == tr->current_trace)
3227 goto out; 3660 goto out;
3228 3661
3229 trace_branch_disable(); 3662 trace_branch_disable();
3230 if (current_trace->reset)
3231 current_trace->reset(tr);
3232 3663
3233 had_max_tr = current_trace->allocated_snapshot; 3664 tr->current_trace->enabled = false;
3234 current_trace = &nop_trace; 3665
3666 if (tr->current_trace->reset)
3667 tr->current_trace->reset(tr);
3668
3669 /* Current trace needs to be nop_trace before synchronize_sched */
3670 tr->current_trace = &nop_trace;
3671
3672#ifdef CONFIG_TRACER_MAX_TRACE
3673 had_max_tr = tr->allocated_snapshot;
3235 3674
3236 if (had_max_tr && !t->use_max_tr) { 3675 if (had_max_tr && !t->use_max_tr) {
3237 /* 3676 /*
@@ -3242,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)
3242 * so a synchronized_sched() is sufficient. 3681 * so a synchronized_sched() is sufficient.
3243 */ 3682 */
3244 synchronize_sched(); 3683 synchronize_sched();
3245 /* 3684 free_snapshot(tr);
3246 * We don't free the ring buffer. instead, resize it because
3247 * The max_tr ring buffer has some state (e.g. ring->clock) and
3248 * we want preserve it.
3249 */
3250 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3251 set_buffer_entries(&max_tr, 1);
3252 tracing_reset_online_cpus(&max_tr);
3253 current_trace->allocated_snapshot = false;
3254 } 3685 }
3686#endif
3255 destroy_trace_option_files(topts); 3687 destroy_trace_option_files(topts);
3256 3688
3257 topts = create_trace_option_files(t); 3689 topts = create_trace_option_files(tr, t);
3690
3691#ifdef CONFIG_TRACER_MAX_TRACE
3258 if (t->use_max_tr && !had_max_tr) { 3692 if (t->use_max_tr && !had_max_tr) {
3259 /* we need to make per cpu buffer sizes equivalent */ 3693 ret = alloc_snapshot(tr);
3260 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3261 RING_BUFFER_ALL_CPUS);
3262 if (ret < 0) 3694 if (ret < 0)
3263 goto out; 3695 goto out;
3264 t->allocated_snapshot = true;
3265 } 3696 }
3697#endif
3266 3698
3267 if (t->init) { 3699 if (t->init) {
3268 ret = tracer_init(t, tr); 3700 ret = tracer_init(t, tr);
@@ -3270,7 +3702,8 @@ static int tracing_set_tracer(const char *buf)
3270 goto out; 3702 goto out;
3271 } 3703 }
3272 3704
3273 current_trace = t; 3705 tr->current_trace = t;
3706 tr->current_trace->enabled = true;
3274 trace_branch_enable(tr); 3707 trace_branch_enable(tr);
3275 out: 3708 out:
3276 mutex_unlock(&trace_types_lock); 3709 mutex_unlock(&trace_types_lock);
@@ -3344,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3344 3777
3345static int tracing_open_pipe(struct inode *inode, struct file *filp) 3778static int tracing_open_pipe(struct inode *inode, struct file *filp)
3346{ 3779{
3347 long cpu_file = (long) inode->i_private; 3780 struct trace_cpu *tc = inode->i_private;
3781 struct trace_array *tr = tc->tr;
3348 struct trace_iterator *iter; 3782 struct trace_iterator *iter;
3349 int ret = 0; 3783 int ret = 0;
3350 3784
@@ -3369,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3369 ret = -ENOMEM; 3803 ret = -ENOMEM;
3370 goto fail; 3804 goto fail;
3371 } 3805 }
3372 *iter->trace = *current_trace; 3806 *iter->trace = *tr->current_trace;
3373 3807
3374 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3808 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3375 ret = -ENOMEM; 3809 ret = -ENOMEM;
@@ -3386,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3386 if (trace_clocks[trace_clock_id].in_ns) 3820 if (trace_clocks[trace_clock_id].in_ns)
3387 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 3821 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
3388 3822
3389 iter->cpu_file = cpu_file; 3823 iter->cpu_file = tc->cpu;
3390 iter->tr = &global_trace; 3824 iter->tr = tc->tr;
3825 iter->trace_buffer = &tc->tr->trace_buffer;
3391 mutex_init(&iter->mutex); 3826 mutex_init(&iter->mutex);
3392 filp->private_data = iter; 3827 filp->private_data = iter;
3393 3828
@@ -3426,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3426} 3861}
3427 3862
3428static unsigned int 3863static unsigned int
3429tracing_poll_pipe(struct file *filp, poll_table *poll_table) 3864trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
3430{ 3865{
3431 struct trace_iterator *iter = filp->private_data; 3866 /* Iterators are static, they should be filled or empty */
3867 if (trace_buffer_iter(iter, iter->cpu_file))
3868 return POLLIN | POLLRDNORM;
3432 3869
3433 if (trace_flags & TRACE_ITER_BLOCK) { 3870 if (trace_flags & TRACE_ITER_BLOCK)
3434 /* 3871 /*
3435 * Always select as readable when in blocking mode 3872 * Always select as readable when in blocking mode
3436 */ 3873 */
3437 return POLLIN | POLLRDNORM; 3874 return POLLIN | POLLRDNORM;
3438 } else { 3875 else
3439 if (!trace_empty(iter)) 3876 return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
3440 return POLLIN | POLLRDNORM; 3877 filp, poll_table);
3441 poll_wait(filp, &trace_wait, poll_table); 3878}
3442 if (!trace_empty(iter))
3443 return POLLIN | POLLRDNORM;
3444 3879
3445 return 0; 3880static unsigned int
3446 } 3881tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3882{
3883 struct trace_iterator *iter = filp->private_data;
3884
3885 return trace_poll(iter, filp, poll_table);
3447} 3886}
3448 3887
3449/* 3888/*
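
Illustrative only: because the poll logic now lives in the shared trace_poll() helper, any other file in trace.c whose private_data is a trace_iterator can delegate to it the same way tracing_poll_pipe() does above. The name below is hypothetical.

static unsigned int my_file_poll(struct file *filp, poll_table *poll_table)
{
	struct trace_iterator *iter = filp->private_data;

	return trace_poll(iter, filp, poll_table);
}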
@@ -3509,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3509 size_t cnt, loff_t *ppos) 3948 size_t cnt, loff_t *ppos)
3510{ 3949{
3511 struct trace_iterator *iter = filp->private_data; 3950 struct trace_iterator *iter = filp->private_data;
3951 struct trace_array *tr = iter->tr;
3512 ssize_t sret; 3952 ssize_t sret;
3513 3953
3514 /* return any leftover data */ 3954 /* return any leftover data */
@@ -3520,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3520 3960
3521 /* copy the tracer to avoid using a global lock all around */ 3961 /* copy the tracer to avoid using a global lock all around */
3522 mutex_lock(&trace_types_lock); 3962 mutex_lock(&trace_types_lock);
3523 if (unlikely(iter->trace->name != current_trace->name)) 3963 if (unlikely(iter->trace->name != tr->current_trace->name))
3524 *iter->trace = *current_trace; 3964 *iter->trace = *tr->current_trace;
3525 mutex_unlock(&trace_types_lock); 3965 mutex_unlock(&trace_types_lock);
3526 3966
3527 /* 3967 /*
@@ -3677,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3677 .ops = &tracing_pipe_buf_ops, 4117 .ops = &tracing_pipe_buf_ops,
3678 .spd_release = tracing_spd_release_pipe, 4118 .spd_release = tracing_spd_release_pipe,
3679 }; 4119 };
4120 struct trace_array *tr = iter->tr;
3680 ssize_t ret; 4121 ssize_t ret;
3681 size_t rem; 4122 size_t rem;
3682 unsigned int i; 4123 unsigned int i;
@@ -3686,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3686 4127
3687 /* copy the tracer to avoid using a global lock all around */ 4128 /* copy the tracer to avoid using a global lock all around */
3688 mutex_lock(&trace_types_lock); 4129 mutex_lock(&trace_types_lock);
3689 if (unlikely(iter->trace->name != current_trace->name)) 4130 if (unlikely(iter->trace->name != tr->current_trace->name))
3690 *iter->trace = *current_trace; 4131 *iter->trace = *tr->current_trace;
3691 mutex_unlock(&trace_types_lock); 4132 mutex_unlock(&trace_types_lock);
3692 4133
3693 mutex_lock(&iter->mutex); 4134 mutex_lock(&iter->mutex);
@@ -3749,43 +4190,19 @@ out_err:
3749 goto out; 4190 goto out;
3750} 4191}
3751 4192
3752struct ftrace_entries_info {
3753 struct trace_array *tr;
3754 int cpu;
3755};
3756
3757static int tracing_entries_open(struct inode *inode, struct file *filp)
3758{
3759 struct ftrace_entries_info *info;
3760
3761 if (tracing_disabled)
3762 return -ENODEV;
3763
3764 info = kzalloc(sizeof(*info), GFP_KERNEL);
3765 if (!info)
3766 return -ENOMEM;
3767
3768 info->tr = &global_trace;
3769 info->cpu = (unsigned long)inode->i_private;
3770
3771 filp->private_data = info;
3772
3773 return 0;
3774}
3775
3776static ssize_t 4193static ssize_t
3777tracing_entries_read(struct file *filp, char __user *ubuf, 4194tracing_entries_read(struct file *filp, char __user *ubuf,
3778 size_t cnt, loff_t *ppos) 4195 size_t cnt, loff_t *ppos)
3779{ 4196{
3780 struct ftrace_entries_info *info = filp->private_data; 4197 struct trace_cpu *tc = filp->private_data;
3781 struct trace_array *tr = info->tr; 4198 struct trace_array *tr = tc->tr;
3782 char buf[64]; 4199 char buf[64];
3783 int r = 0; 4200 int r = 0;
3784 ssize_t ret; 4201 ssize_t ret;
3785 4202
3786 mutex_lock(&trace_types_lock); 4203 mutex_lock(&trace_types_lock);
3787 4204
3788 if (info->cpu == RING_BUFFER_ALL_CPUS) { 4205 if (tc->cpu == RING_BUFFER_ALL_CPUS) {
3789 int cpu, buf_size_same; 4206 int cpu, buf_size_same;
3790 unsigned long size; 4207 unsigned long size;
3791 4208
@@ -3795,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3795 for_each_tracing_cpu(cpu) { 4212 for_each_tracing_cpu(cpu) {
3796 /* fill in the size from first enabled cpu */ 4213 /* fill in the size from first enabled cpu */
3797 if (size == 0) 4214 if (size == 0)
3798 size = tr->data[cpu]->entries; 4215 size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
3799 if (size != tr->data[cpu]->entries) { 4216 if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
3800 buf_size_same = 0; 4217 buf_size_same = 0;
3801 break; 4218 break;
3802 } 4219 }
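
The RING_BUFFER_ALL_CPUS branch above reports a single size only if every per-CPU buffer agrees, otherwise it falls back to printing "X". A small standalone model of that consistency check, with hypothetical per-CPU byte counts in place of per_cpu_ptr(tr->trace_buffer.data, cpu)->entries:

/* Report one size if every CPU ring buffer matches, else flag a mismatch. */
#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	unsigned long entries[NR_CPUS] = { 1441792, 1441792, 1441792, 1441792 };
	unsigned long size = 0;
	int cpu, buf_size_same = 1;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (size == 0)                 /* fill in the size from the first cpu */
			size = entries[cpu];
		if (size != entries[cpu]) {    /* any disagreement means no single answer */
			buf_size_same = 0;
			break;
		}
	}

	if (buf_size_same)
		printf("%lu\n", size >> 10);   /* KB, as buffer_size_kb reports */
	else
		printf("X\n");                 /* mirrors the 'X' the real file prints */
	return 0;
}
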
@@ -3812,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
3812 } else 4229 } else
3813 r = sprintf(buf, "X\n"); 4230 r = sprintf(buf, "X\n");
3814 } else 4231 } else
3815 r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); 4232 r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
3816 4233
3817 mutex_unlock(&trace_types_lock); 4234 mutex_unlock(&trace_types_lock);
3818 4235
@@ -3824,7 +4241,7 @@ static ssize_t
3824tracing_entries_write(struct file *filp, const char __user *ubuf, 4241tracing_entries_write(struct file *filp, const char __user *ubuf,
3825 size_t cnt, loff_t *ppos) 4242 size_t cnt, loff_t *ppos)
3826{ 4243{
3827 struct ftrace_entries_info *info = filp->private_data; 4244 struct trace_cpu *tc = filp->private_data;
3828 unsigned long val; 4245 unsigned long val;
3829 int ret; 4246 int ret;
3830 4247
@@ -3839,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3839 /* value is in KB */ 4256 /* value is in KB */
3840 val <<= 10; 4257 val <<= 10;
3841 4258
3842 ret = tracing_resize_ring_buffer(val, info->cpu); 4259 ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
3843 if (ret < 0) 4260 if (ret < 0)
3844 return ret; 4261 return ret;
3845 4262
@@ -3848,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3848 return cnt; 4265 return cnt;
3849} 4266}
3850 4267
3851static int
3852tracing_entries_release(struct inode *inode, struct file *filp)
3853{
3854 struct ftrace_entries_info *info = filp->private_data;
3855
3856 kfree(info);
3857
3858 return 0;
3859}
3860
3861static ssize_t 4268static ssize_t
3862tracing_total_entries_read(struct file *filp, char __user *ubuf, 4269tracing_total_entries_read(struct file *filp, char __user *ubuf,
3863 size_t cnt, loff_t *ppos) 4270 size_t cnt, loff_t *ppos)
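
The deleted open/release pair above was only there to kzalloc and free a tiny tracker; with the per-CPU struct trace_cpu now embedded in the trace_array's per-cpu data, the file's private_data can simply borrow a pointer to state that already outlives the open. A rough userspace sketch of that idea (all names invented, offsets recovered container_of-style):

#include <stddef.h>

struct trace_cpu_model { void *tr; int cpu; };   /* lives inside the array for its whole lifetime */
struct cpu_data_model  { struct trace_cpu_model trace_cpu; unsigned long entries; };

static struct cpu_data_model per_cpu_data[4];    /* stands in for the per-cpu allocation */

/* "open": private_data is a borrowed pointer, so no release hook is needed. */
static void *entries_open(int cpu)
{
	return &per_cpu_data[cpu].trace_cpu;
}

static unsigned long entries_read(void *private_data)
{
	struct trace_cpu_model *tc = private_data;
	struct cpu_data_model *data =
		(struct cpu_data_model *)((char *)tc - offsetof(struct cpu_data_model, trace_cpu));
	return data->entries >> 10;              /* KB, as the real file reports */
}
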
@@ -3869,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
3869 4276
3870 mutex_lock(&trace_types_lock); 4277 mutex_lock(&trace_types_lock);
3871 for_each_tracing_cpu(cpu) { 4278 for_each_tracing_cpu(cpu) {
3872 size += tr->data[cpu]->entries >> 10; 4279 size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
3873 if (!ring_buffer_expanded) 4280 if (!ring_buffer_expanded)
3874 expanded_size += trace_buf_size >> 10; 4281 expanded_size += trace_buf_size >> 10;
3875 } 4282 }
@@ -3899,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3899static int 4306static int
3900tracing_free_buffer_release(struct inode *inode, struct file *filp) 4307tracing_free_buffer_release(struct inode *inode, struct file *filp)
3901{ 4308{
4309 struct trace_array *tr = inode->i_private;
4310
3902 /* disable tracing ? */ 4311 /* disable tracing ? */
3903 if (trace_flags & TRACE_ITER_STOP_ON_FREE) 4312 if (trace_flags & TRACE_ITER_STOP_ON_FREE)
3904 tracing_off(); 4313 tracing_off();
3905 /* resize the ring buffer to 0 */ 4314 /* resize the ring buffer to 0 */
3906 tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); 4315 tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
3907 4316
3908 return 0; 4317 return 0;
3909} 4318}
@@ -3972,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3972 4381
3973 local_save_flags(irq_flags); 4382 local_save_flags(irq_flags);
3974 size = sizeof(*entry) + cnt + 2; /* possible \n added */ 4383 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3975 buffer = global_trace.buffer; 4384 buffer = global_trace.trace_buffer.buffer;
3976 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, 4385 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3977 irq_flags, preempt_count()); 4386 irq_flags, preempt_count());
3978 if (!event) { 4387 if (!event) {
@@ -4014,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
4014 4423
4015static int tracing_clock_show(struct seq_file *m, void *v) 4424static int tracing_clock_show(struct seq_file *m, void *v)
4016{ 4425{
4426 struct trace_array *tr = m->private;
4017 int i; 4427 int i;
4018 4428
4019 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 4429 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
4020 seq_printf(m, 4430 seq_printf(m,
4021 "%s%s%s%s", i ? " " : "", 4431 "%s%s%s%s", i ? " " : "",
4022 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 4432 i == tr->clock_id ? "[" : "", trace_clocks[i].name,
4023 i == trace_clock_id ? "]" : ""); 4433 i == tr->clock_id ? "]" : "");
4024 seq_putc(m, '\n'); 4434 seq_putc(m, '\n');
4025 4435
4026 return 0; 4436 return 0;
@@ -4029,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
4029static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 4439static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4030 size_t cnt, loff_t *fpos) 4440 size_t cnt, loff_t *fpos)
4031{ 4441{
4442 struct seq_file *m = filp->private_data;
4443 struct trace_array *tr = m->private;
4032 char buf[64]; 4444 char buf[64];
4033 const char *clockstr; 4445 const char *clockstr;
4034 int i; 4446 int i;
@@ -4050,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4050 if (i == ARRAY_SIZE(trace_clocks)) 4462 if (i == ARRAY_SIZE(trace_clocks))
4051 return -EINVAL; 4463 return -EINVAL;
4052 4464
4053 trace_clock_id = i;
4054
4055 mutex_lock(&trace_types_lock); 4465 mutex_lock(&trace_types_lock);
4056 4466
4057 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); 4467 tr->clock_id = i;
4058 if (max_tr.buffer) 4468
4059 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); 4469 ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
4060 4470
4061 /* 4471 /*
4062 * New clock may not be consistent with the previous clock. 4472 * New clock may not be consistent with the previous clock.
4063 * Reset the buffer so that it doesn't have incomparable timestamps. 4473 * Reset the buffer so that it doesn't have incomparable timestamps.
4064 */ 4474 */
4065 tracing_reset_online_cpus(&global_trace); 4475 tracing_reset_online_cpus(&global_trace.trace_buffer);
4066 tracing_reset_online_cpus(&max_tr); 4476
4477#ifdef CONFIG_TRACER_MAX_TRACE
4478 if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
4479 ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
4480 tracing_reset_online_cpus(&global_trace.max_buffer);
4481#endif
4067 4482
4068 mutex_unlock(&trace_types_lock); 4483 mutex_unlock(&trace_types_lock);
4069 4484
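
tracing_clock_write above resolves the user string against trace_clocks[], records the index on the trace_array, switches the ring buffer clock, and then resets the buffers because timestamps taken with different clocks are not comparable. A compilable sketch of just the lookup step, against an abbreviated, hypothetical clock table:

/* Resolve a user-supplied clock name against a fixed table. */
#include <stdio.h>
#include <string.h>

struct clock_entry { const char *name; int in_ns; };

static const struct clock_entry clocks[] = {
	{ "local",   1 },
	{ "global",  1 },
	{ "counter", 0 },
};

static int clock_index(const char *str)
{
	size_t i;

	for (i = 0; i < sizeof(clocks) / sizeof(clocks[0]); i++)
		if (strcmp(clocks[i].name, str) == 0)
			return (int)i;
	return -1;                       /* the caller maps this to -EINVAL */
}

int main(void)
{
	printf("%d %d\n", clock_index("global"), clock_index("bogus")); /* prints: 1 -1 */
	return 0;
}
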
@@ -4076,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4076{ 4491{
4077 if (tracing_disabled) 4492 if (tracing_disabled)
4078 return -ENODEV; 4493 return -ENODEV;
4079 return single_open(file, tracing_clock_show, NULL); 4494
4495 return single_open(file, tracing_clock_show, inode->i_private);
4080} 4496}
4081 4497
4498struct ftrace_buffer_info {
4499 struct trace_iterator iter;
4500 void *spare;
4501 unsigned int read;
4502};
4503
4082#ifdef CONFIG_TRACER_SNAPSHOT 4504#ifdef CONFIG_TRACER_SNAPSHOT
4083static int tracing_snapshot_open(struct inode *inode, struct file *file) 4505static int tracing_snapshot_open(struct inode *inode, struct file *file)
4084{ 4506{
4507 struct trace_cpu *tc = inode->i_private;
4085 struct trace_iterator *iter; 4508 struct trace_iterator *iter;
4509 struct seq_file *m;
4086 int ret = 0; 4510 int ret = 0;
4087 4511
4088 if (file->f_mode & FMODE_READ) { 4512 if (file->f_mode & FMODE_READ) {
4089 iter = __tracing_open(inode, file, true); 4513 iter = __tracing_open(inode, file, true);
4090 if (IS_ERR(iter)) 4514 if (IS_ERR(iter))
4091 ret = PTR_ERR(iter); 4515 ret = PTR_ERR(iter);
4516 } else {
4517 /* Writes still need the seq_file to hold the private data */
4518 m = kzalloc(sizeof(*m), GFP_KERNEL);
4519 if (!m)
4520 return -ENOMEM;
4521 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
4522 if (!iter) {
4523 kfree(m);
4524 return -ENOMEM;
4525 }
4526 iter->tr = tc->tr;
4527 iter->trace_buffer = &tc->tr->max_buffer;
4528 iter->cpu_file = tc->cpu;
4529 m->private = iter;
4530 file->private_data = m;
4092 } 4531 }
4532
4093 return ret; 4533 return ret;
4094} 4534}
4095 4535
@@ -4097,6 +4537,9 @@ static ssize_t
4097tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, 4537tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4098 loff_t *ppos) 4538 loff_t *ppos)
4099{ 4539{
4540 struct seq_file *m = filp->private_data;
4541 struct trace_iterator *iter = m->private;
4542 struct trace_array *tr = iter->tr;
4100 unsigned long val; 4543 unsigned long val;
4101 int ret; 4544 int ret;
4102 4545
@@ -4110,42 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4110 4553
4111 mutex_lock(&trace_types_lock); 4554 mutex_lock(&trace_types_lock);
4112 4555
4113 if (current_trace->use_max_tr) { 4556 if (tr->current_trace->use_max_tr) {
4114 ret = -EBUSY; 4557 ret = -EBUSY;
4115 goto out; 4558 goto out;
4116 } 4559 }
4117 4560
4118 switch (val) { 4561 switch (val) {
4119 case 0: 4562 case 0:
4120 if (current_trace->allocated_snapshot) { 4563 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4121 /* free spare buffer */ 4564 ret = -EINVAL;
4122 ring_buffer_resize(max_tr.buffer, 1, 4565 break;
4123 RING_BUFFER_ALL_CPUS);
4124 set_buffer_entries(&max_tr, 1);
4125 tracing_reset_online_cpus(&max_tr);
4126 current_trace->allocated_snapshot = false;
4127 } 4566 }
4567 if (tr->allocated_snapshot)
4568 free_snapshot(tr);
4128 break; 4569 break;
4129 case 1: 4570 case 1:
4130 if (!current_trace->allocated_snapshot) { 4571/* Only allow per-cpu swap if the ring buffer supports it */
4131 /* allocate spare buffer */ 4572#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
4132 ret = resize_buffer_duplicate_size(&max_tr, 4573 if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
4133 &global_trace, RING_BUFFER_ALL_CPUS); 4574 ret = -EINVAL;
4575 break;
4576 }
4577#endif
4578 if (!tr->allocated_snapshot) {
4579 ret = alloc_snapshot(tr);
4134 if (ret < 0) 4580 if (ret < 0)
4135 break; 4581 break;
4136 current_trace->allocated_snapshot = true;
4137 } 4582 }
4138
4139 local_irq_disable(); 4583 local_irq_disable();
4140 /* Now, we're going to swap */ 4584 /* Now, we're going to swap */
4141 update_max_tr(&global_trace, current, smp_processor_id()); 4585 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4586 update_max_tr(tr, current, smp_processor_id());
4587 else
4588 update_max_tr_single(tr, current, iter->cpu_file);
4142 local_irq_enable(); 4589 local_irq_enable();
4143 break; 4590 break;
4144 default: 4591 default:
4145 if (current_trace->allocated_snapshot) 4592 if (tr->allocated_snapshot) {
4146 tracing_reset_online_cpus(&max_tr); 4593 if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
4147 else 4594 tracing_reset_online_cpus(&tr->max_buffer);
4148 ret = -EINVAL; 4595 else
4596 tracing_reset(&tr->max_buffer, iter->cpu_file);
4597 }
4149 break; 4598 break;
4150 } 4599 }
4151 4600
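
The rewritten switch above turns the snapshot file into a small state machine: writing 0 frees the spare buffer, 1 allocates it if needed and swaps it with the live buffer (per-CPU targets only where the ring buffer supports single-CPU swap), and any other value just clears the snapshot contents. A userspace model of that control flow, with booleans and counters standing in for the kernel helpers named in the comments:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool allocated;         /* tr->allocated_snapshot in the real code */
static bool per_cpu_target;    /* a single-CPU file rather than the whole buffer */
static bool per_cpu_swap_ok;   /* whether the ring buffer can swap one CPU */
static int  resets, swaps;

static int snapshot_write_model(unsigned long val)
{
	switch (val) {
	case 0:
		if (per_cpu_target)
			return -EINVAL;         /* cannot free just one CPU's spare */
		allocated = false;              /* free_snapshot() */
		break;
	case 1:
		if (per_cpu_target && !per_cpu_swap_ok)
			return -EINVAL;
		if (!allocated)
			allocated = true;       /* alloc_snapshot() */
		swaps++;                        /* update_max_tr() / update_max_tr_single() */
		break;
	default:
		if (allocated)
			resets++;               /* clear stale snapshot contents */
		break;
	}
	return 0;
}

int main(void)
{
	printf("%d %d %d swaps=%d\n", snapshot_write_model(1),
	       snapshot_write_model(5), snapshot_write_model(0), swaps);
	return 0;                               /* prints: 0 0 0 swaps=1 */
}
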
@@ -4157,6 +4606,51 @@ out:
4157 mutex_unlock(&trace_types_lock); 4606 mutex_unlock(&trace_types_lock);
4158 return ret; 4607 return ret;
4159} 4608}
4609
4610static int tracing_snapshot_release(struct inode *inode, struct file *file)
4611{
4612 struct seq_file *m = file->private_data;
4613
4614 if (file->f_mode & FMODE_READ)
4615 return tracing_release(inode, file);
4616
4617 /* If write only, the seq_file is just a stub */
4618 if (m)
4619 kfree(m->private);
4620 kfree(m);
4621
4622 return 0;
4623}
4624
4625static int tracing_buffers_open(struct inode *inode, struct file *filp);
4626static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
4627 size_t count, loff_t *ppos);
4628static int tracing_buffers_release(struct inode *inode, struct file *file);
4629static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4630 struct pipe_inode_info *pipe, size_t len, unsigned int flags);
4631
4632static int snapshot_raw_open(struct inode *inode, struct file *filp)
4633{
4634 struct ftrace_buffer_info *info;
4635 int ret;
4636
4637 ret = tracing_buffers_open(inode, filp);
4638 if (ret < 0)
4639 return ret;
4640
4641 info = filp->private_data;
4642
4643 if (info->iter.trace->use_max_tr) {
4644 tracing_buffers_release(inode, filp);
4645 return -EBUSY;
4646 }
4647
4648 info->iter.snapshot = true;
4649 info->iter.trace_buffer = &info->iter.tr->max_buffer;
4650
4651 return ret;
4652}
4653
4160#endif /* CONFIG_TRACER_SNAPSHOT */ 4654#endif /* CONFIG_TRACER_SNAPSHOT */
4161 4655
4162 4656
@@ -4184,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {
4184}; 4678};
4185 4679
4186static const struct file_operations tracing_entries_fops = { 4680static const struct file_operations tracing_entries_fops = {
4187 .open = tracing_entries_open, 4681 .open = tracing_open_generic,
4188 .read = tracing_entries_read, 4682 .read = tracing_entries_read,
4189 .write = tracing_entries_write, 4683 .write = tracing_entries_write,
4190 .release = tracing_entries_release,
4191 .llseek = generic_file_llseek, 4684 .llseek = generic_file_llseek,
4192}; 4685};
4193 4686
@@ -4222,20 +4715,23 @@ static const struct file_operations snapshot_fops = {
4222 .read = seq_read, 4715 .read = seq_read,
4223 .write = tracing_snapshot_write, 4716 .write = tracing_snapshot_write,
4224 .llseek = tracing_seek, 4717 .llseek = tracing_seek,
4225 .release = tracing_release, 4718 .release = tracing_snapshot_release,
4226}; 4719};
4227#endif /* CONFIG_TRACER_SNAPSHOT */
4228 4720
4229struct ftrace_buffer_info { 4721static const struct file_operations snapshot_raw_fops = {
4230 struct trace_array *tr; 4722 .open = snapshot_raw_open,
4231 void *spare; 4723 .read = tracing_buffers_read,
4232 int cpu; 4724 .release = tracing_buffers_release,
4233 unsigned int read; 4725 .splice_read = tracing_buffers_splice_read,
4726 .llseek = no_llseek,
4234}; 4727};
4235 4728
4729#endif /* CONFIG_TRACER_SNAPSHOT */
4730
4236static int tracing_buffers_open(struct inode *inode, struct file *filp) 4731static int tracing_buffers_open(struct inode *inode, struct file *filp)
4237{ 4732{
4238 int cpu = (int)(long)inode->i_private; 4733 struct trace_cpu *tc = inode->i_private;
4734 struct trace_array *tr = tc->tr;
4239 struct ftrace_buffer_info *info; 4735 struct ftrace_buffer_info *info;
4240 4736
4241 if (tracing_disabled) 4737 if (tracing_disabled)
@@ -4245,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
4245 if (!info) 4741 if (!info)
4246 return -ENOMEM; 4742 return -ENOMEM;
4247 4743
4248 info->tr = &global_trace; 4744 mutex_lock(&trace_types_lock);
4249 info->cpu = cpu; 4745
4250 info->spare = NULL; 4746 tr->ref++;
4747
4748 info->iter.tr = tr;
4749 info->iter.cpu_file = tc->cpu;
4750 info->iter.trace = tr->current_trace;
4751 info->iter.trace_buffer = &tr->trace_buffer;
4752 info->spare = NULL;
4251 /* Force reading ring buffer for first read */ 4753 /* Force reading ring buffer for first read */
4252 info->read = (unsigned int)-1; 4754 info->read = (unsigned int)-1;
4253 4755
4254 filp->private_data = info; 4756 filp->private_data = info;
4255 4757
4758 mutex_unlock(&trace_types_lock);
4759
4256 return nonseekable_open(inode, filp); 4760 return nonseekable_open(inode, filp);
4257} 4761}
4258 4762
4763static unsigned int
4764tracing_buffers_poll(struct file *filp, poll_table *poll_table)
4765{
4766 struct ftrace_buffer_info *info = filp->private_data;
4767 struct trace_iterator *iter = &info->iter;
4768
4769 return trace_poll(iter, filp, poll_table);
4770}
4771
4259static ssize_t 4772static ssize_t
4260tracing_buffers_read(struct file *filp, char __user *ubuf, 4773tracing_buffers_read(struct file *filp, char __user *ubuf,
4261 size_t count, loff_t *ppos) 4774 size_t count, loff_t *ppos)
4262{ 4775{
4263 struct ftrace_buffer_info *info = filp->private_data; 4776 struct ftrace_buffer_info *info = filp->private_data;
4777 struct trace_iterator *iter = &info->iter;
4264 ssize_t ret; 4778 ssize_t ret;
4265 size_t size; 4779 ssize_t size;
4266 4780
4267 if (!count) 4781 if (!count)
4268 return 0; 4782 return 0;
4269 4783
4784 mutex_lock(&trace_types_lock);
4785
4786#ifdef CONFIG_TRACER_MAX_TRACE
4787 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4788 size = -EBUSY;
4789 goto out_unlock;
4790 }
4791#endif
4792
4270 if (!info->spare) 4793 if (!info->spare)
4271 info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); 4794 info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
4795 iter->cpu_file);
4796 size = -ENOMEM;
4272 if (!info->spare) 4797 if (!info->spare)
4273 return -ENOMEM; 4798 goto out_unlock;
4274 4799
4275 /* Do we have previous read data to read? */ 4800 /* Do we have previous read data to read? */
4276 if (info->read < PAGE_SIZE) 4801 if (info->read < PAGE_SIZE)
4277 goto read; 4802 goto read;
4278 4803
4279 trace_access_lock(info->cpu); 4804 again:
4280 ret = ring_buffer_read_page(info->tr->buffer, 4805 trace_access_lock(iter->cpu_file);
4806 ret = ring_buffer_read_page(iter->trace_buffer->buffer,
4281 &info->spare, 4807 &info->spare,
4282 count, 4808 count,
4283 info->cpu, 0); 4809 iter->cpu_file, 0);
4284 trace_access_unlock(info->cpu); 4810 trace_access_unlock(iter->cpu_file);
4285 if (ret < 0)
4286 return 0;
4287 4811
4288 info->read = 0; 4812 if (ret < 0) {
4813 if (trace_empty(iter)) {
4814 if ((filp->f_flags & O_NONBLOCK)) {
4815 size = -EAGAIN;
4816 goto out_unlock;
4817 }
4818 mutex_unlock(&trace_types_lock);
4819 iter->trace->wait_pipe(iter);
4820 mutex_lock(&trace_types_lock);
4821 if (signal_pending(current)) {
4822 size = -EINTR;
4823 goto out_unlock;
4824 }
4825 goto again;
4826 }
4827 size = 0;
4828 goto out_unlock;
4829 }
4289 4830
4290read: 4831 info->read = 0;
4832 read:
4291 size = PAGE_SIZE - info->read; 4833 size = PAGE_SIZE - info->read;
4292 if (size > count) 4834 if (size > count)
4293 size = count; 4835 size = count;
4294 4836
4295 ret = copy_to_user(ubuf, info->spare + info->read, size); 4837 ret = copy_to_user(ubuf, info->spare + info->read, size);
4296 if (ret == size) 4838 if (ret == size) {
4297 return -EFAULT; 4839 size = -EFAULT;
4840 goto out_unlock;
4841 }
4298 size -= ret; 4842 size -= ret;
4299 4843
4300 *ppos += size; 4844 *ppos += size;
4301 info->read += size; 4845 info->read += size;
4302 4846
4847 out_unlock:
4848 mutex_unlock(&trace_types_lock);
4849
4303 return size; 4850 return size;
4304} 4851}
4305 4852
4306static int tracing_buffers_release(struct inode *inode, struct file *file) 4853static int tracing_buffers_release(struct inode *inode, struct file *file)
4307{ 4854{
4308 struct ftrace_buffer_info *info = file->private_data; 4855 struct ftrace_buffer_info *info = file->private_data;
4856 struct trace_iterator *iter = &info->iter;
4857
4858 mutex_lock(&trace_types_lock);
4859
4860 WARN_ON(!iter->tr->ref);
4861 iter->tr->ref--;
4309 4862
4310 if (info->spare) 4863 if (info->spare)
4311 ring_buffer_free_read_page(info->tr->buffer, info->spare); 4864 ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
4312 kfree(info); 4865 kfree(info);
4313 4866
4867 mutex_unlock(&trace_types_lock);
4868
4314 return 0; 4869 return 0;
4315} 4870}
4316 4871
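
tracing_buffers_read above gains a blocking retry loop: if the page read comes back empty, it drops trace_types_lock, waits via iter->trace->wait_pipe(), re-takes the lock, bails out on a pending signal, and otherwise goes around again; O_NONBLOCK callers get -EAGAIN instead. A pthread sketch of the same shape, where a condition variable plays the role of wait_pipe() and a flag stands in for signal_pending():

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t types_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  data_ready = PTHREAD_COND_INITIALIZER;
static int buffered;                  /* pages available to read */
static bool interrupted;              /* stands in for signal_pending() */

static int read_one(bool nonblock)
{
	int ret;

	pthread_mutex_lock(&types_lock);
again:
	if (buffered > 0) {
		buffered--;               /* consume one page worth of data */
		ret = 1;
	} else if (nonblock) {
		ret = -EAGAIN;
	} else {
		/* lock is released while sleeping, exactly like dropping trace_types_lock */
		pthread_cond_wait(&data_ready, &types_lock);
		if (interrupted)
			ret = -EINTR;
		else
			goto again;
	}
	pthread_mutex_unlock(&types_lock);
	return ret;
}

A writer in this model would bump buffered and pthread_cond_signal(&data_ready), mirroring the ring buffer waking trace_pipe_raw readers.
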
@@ -4375,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4375 unsigned int flags) 4930 unsigned int flags)
4376{ 4931{
4377 struct ftrace_buffer_info *info = file->private_data; 4932 struct ftrace_buffer_info *info = file->private_data;
4933 struct trace_iterator *iter = &info->iter;
4378 struct partial_page partial_def[PIPE_DEF_BUFFERS]; 4934 struct partial_page partial_def[PIPE_DEF_BUFFERS];
4379 struct page *pages_def[PIPE_DEF_BUFFERS]; 4935 struct page *pages_def[PIPE_DEF_BUFFERS];
4380 struct splice_pipe_desc spd = { 4936 struct splice_pipe_desc spd = {
@@ -4387,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4387 }; 4943 };
4388 struct buffer_ref *ref; 4944 struct buffer_ref *ref;
4389 int entries, size, i; 4945 int entries, size, i;
4390 size_t ret; 4946 ssize_t ret;
4391 4947
4392 if (splice_grow_spd(pipe, &spd)) 4948 mutex_lock(&trace_types_lock);
4393 return -ENOMEM; 4949
4950#ifdef CONFIG_TRACER_MAX_TRACE
4951 if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
4952 ret = -EBUSY;
4953 goto out;
4954 }
4955#endif
4956
4957 if (splice_grow_spd(pipe, &spd)) {
4958 ret = -ENOMEM;
4959 goto out;
4960 }
4394 4961
4395 if (*ppos & (PAGE_SIZE - 1)) { 4962 if (*ppos & (PAGE_SIZE - 1)) {
4396 ret = -EINVAL; 4963 ret = -EINVAL;
@@ -4405,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4405 len &= PAGE_MASK; 4972 len &= PAGE_MASK;
4406 } 4973 }
4407 4974
4408 trace_access_lock(info->cpu); 4975 again:
4409 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 4976 trace_access_lock(iter->cpu_file);
4977 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4410 4978
4411 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { 4979 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
4412 struct page *page; 4980 struct page *page;
@@ -4417,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4417 break; 4985 break;
4418 4986
4419 ref->ref = 1; 4987 ref->ref = 1;
4420 ref->buffer = info->tr->buffer; 4988 ref->buffer = iter->trace_buffer->buffer;
4421 ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); 4989 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
4422 if (!ref->page) { 4990 if (!ref->page) {
4423 kfree(ref); 4991 kfree(ref);
4424 break; 4992 break;
4425 } 4993 }
4426 4994
4427 r = ring_buffer_read_page(ref->buffer, &ref->page, 4995 r = ring_buffer_read_page(ref->buffer, &ref->page,
4428 len, info->cpu, 1); 4996 len, iter->cpu_file, 1);
4429 if (r < 0) { 4997 if (r < 0) {
4430 ring_buffer_free_read_page(ref->buffer, ref->page); 4998 ring_buffer_free_read_page(ref->buffer, ref->page);
4431 kfree(ref); 4999 kfree(ref);
@@ -4449,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
4449 spd.nr_pages++; 5017 spd.nr_pages++;
4450 *ppos += PAGE_SIZE; 5018 *ppos += PAGE_SIZE;
4451 5019
4452 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 5020 entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
4453 } 5021 }
4454 5022
4455 trace_access_unlock(info->cpu); 5023 trace_access_unlock(iter->cpu_file);
4456 spd.nr_pages = i; 5024 spd.nr_pages = i;
4457 5025
4458 /* did we read anything? */ 5026 /* did we read anything? */
4459 if (!spd.nr_pages) { 5027 if (!spd.nr_pages) {
4460 if (flags & SPLICE_F_NONBLOCK) 5028 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
4461 ret = -EAGAIN; 5029 ret = -EAGAIN;
4462 else 5030 goto out;
4463 ret = 0; 5031 }
4464 /* TODO: block */ 5032 mutex_unlock(&trace_types_lock);
4465 goto out; 5033 iter->trace->wait_pipe(iter);
5034 mutex_lock(&trace_types_lock);
5035 if (signal_pending(current)) {
5036 ret = -EINTR;
5037 goto out;
5038 }
5039 goto again;
4466 } 5040 }
4467 5041
4468 ret = splice_to_pipe(pipe, &spd); 5042 ret = splice_to_pipe(pipe, &spd);
4469 splice_shrink_spd(&spd); 5043 splice_shrink_spd(&spd);
4470out: 5044out:
5045 mutex_unlock(&trace_types_lock);
5046
4471 return ret; 5047 return ret;
4472} 5048}
4473 5049
4474static const struct file_operations tracing_buffers_fops = { 5050static const struct file_operations tracing_buffers_fops = {
4475 .open = tracing_buffers_open, 5051 .open = tracing_buffers_open,
4476 .read = tracing_buffers_read, 5052 .read = tracing_buffers_read,
5053 .poll = tracing_buffers_poll,
4477 .release = tracing_buffers_release, 5054 .release = tracing_buffers_release,
4478 .splice_read = tracing_buffers_splice_read, 5055 .splice_read = tracing_buffers_splice_read,
4479 .llseek = no_llseek, 5056 .llseek = no_llseek,
@@ -4483,12 +5060,14 @@ static ssize_t
4483tracing_stats_read(struct file *filp, char __user *ubuf, 5060tracing_stats_read(struct file *filp, char __user *ubuf,
4484 size_t count, loff_t *ppos) 5061 size_t count, loff_t *ppos)
4485{ 5062{
4486 unsigned long cpu = (unsigned long)filp->private_data; 5063 struct trace_cpu *tc = filp->private_data;
4487 struct trace_array *tr = &global_trace; 5064 struct trace_array *tr = tc->tr;
5065 struct trace_buffer *trace_buf = &tr->trace_buffer;
4488 struct trace_seq *s; 5066 struct trace_seq *s;
4489 unsigned long cnt; 5067 unsigned long cnt;
4490 unsigned long long t; 5068 unsigned long long t;
4491 unsigned long usec_rem; 5069 unsigned long usec_rem;
5070 int cpu = tc->cpu;
4492 5071
4493 s = kmalloc(sizeof(*s), GFP_KERNEL); 5072 s = kmalloc(sizeof(*s), GFP_KERNEL);
4494 if (!s) 5073 if (!s)
@@ -4496,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4496 5075
4497 trace_seq_init(s); 5076 trace_seq_init(s);
4498 5077
4499 cnt = ring_buffer_entries_cpu(tr->buffer, cpu); 5078 cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
4500 trace_seq_printf(s, "entries: %ld\n", cnt); 5079 trace_seq_printf(s, "entries: %ld\n", cnt);
4501 5080
4502 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); 5081 cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
4503 trace_seq_printf(s, "overrun: %ld\n", cnt); 5082 trace_seq_printf(s, "overrun: %ld\n", cnt);
4504 5083
4505 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 5084 cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
4506 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 5085 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4507 5086
4508 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); 5087 cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
4509 trace_seq_printf(s, "bytes: %ld\n", cnt); 5088 trace_seq_printf(s, "bytes: %ld\n", cnt);
4510 5089
4511 if (trace_clocks[trace_clock_id].in_ns) { 5090 if (trace_clocks[trace_clock_id].in_ns) {
4512 /* local or global for trace_clock */ 5091 /* local or global for trace_clock */
4513 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5092 t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4514 usec_rem = do_div(t, USEC_PER_SEC); 5093 usec_rem = do_div(t, USEC_PER_SEC);
4515 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", 5094 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
4516 t, usec_rem); 5095 t, usec_rem);
4517 5096
4518 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); 5097 t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
4519 usec_rem = do_div(t, USEC_PER_SEC); 5098 usec_rem = do_div(t, USEC_PER_SEC);
4520 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); 5099 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4521 } else { 5100 } else {
4522 /* counter or tsc mode for trace_clock */ 5101 /* counter or tsc mode for trace_clock */
4523 trace_seq_printf(s, "oldest event ts: %llu\n", 5102 trace_seq_printf(s, "oldest event ts: %llu\n",
4524 ring_buffer_oldest_event_ts(tr->buffer, cpu)); 5103 ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
4525 5104
4526 trace_seq_printf(s, "now ts: %llu\n", 5105 trace_seq_printf(s, "now ts: %llu\n",
4527 ring_buffer_time_stamp(tr->buffer, cpu)); 5106 ring_buffer_time_stamp(trace_buf->buffer, cpu));
4528 } 5107 }
4529 5108
4530 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 5109 cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
4531 trace_seq_printf(s, "dropped events: %ld\n", cnt); 5110 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4532 5111
4533 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); 5112 cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
4534 trace_seq_printf(s, "read events: %ld\n", cnt); 5113 trace_seq_printf(s, "read events: %ld\n", cnt);
4535 5114
4536 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 5115 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4582,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {
4582 .read = tracing_read_dyn_info, 5161 .read = tracing_read_dyn_info,
4583 .llseek = generic_file_llseek, 5162 .llseek = generic_file_llseek,
4584}; 5163};
4585#endif 5164#endif /* CONFIG_DYNAMIC_FTRACE */
4586 5165
4587static struct dentry *d_tracer; 5166#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
5167static void
5168ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5169{
5170 tracing_snapshot();
5171}
4588 5172
4589struct dentry *tracing_init_dentry(void) 5173static void
5174ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
5175{
5176 unsigned long *count = (long *)data;
5177
5178 if (!*count)
5179 return;
5180
5181 if (*count != -1)
5182 (*count)--;
5183
5184 tracing_snapshot();
5185}
5186
5187static int
5188ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
5189 struct ftrace_probe_ops *ops, void *data)
5190{
5191 long count = (long)data;
5192
5193 seq_printf(m, "%ps:", (void *)ip);
5194
5195 seq_printf(m, "snapshot");
5196
5197 if (count == -1)
5198 seq_printf(m, ":unlimited\n");
5199 else
5200 seq_printf(m, ":count=%ld\n", count);
5201
5202 return 0;
5203}
5204
5205static struct ftrace_probe_ops snapshot_probe_ops = {
5206 .func = ftrace_snapshot,
5207 .print = ftrace_snapshot_print,
5208};
5209
5210static struct ftrace_probe_ops snapshot_count_probe_ops = {
5211 .func = ftrace_count_snapshot,
5212 .print = ftrace_snapshot_print,
5213};
5214
5215static int
5216ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
5217 char *glob, char *cmd, char *param, int enable)
5218{
5219 struct ftrace_probe_ops *ops;
5220 void *count = (void *)-1;
5221 char *number;
5222 int ret;
5223
5224 /* hash funcs only work with set_ftrace_filter */
5225 if (!enable)
5226 return -EINVAL;
5227
5228 ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
5229
5230 if (glob[0] == '!') {
5231 unregister_ftrace_function_probe_func(glob+1, ops);
5232 return 0;
5233 }
5234
5235 if (!param)
5236 goto out_reg;
5237
5238 number = strsep(&param, ":");
5239
5240 if (!strlen(number))
5241 goto out_reg;
5242
5243 /*
5244 * We use the callback data field (which is a pointer)
5245 * as our counter.
5246 */
5247 ret = kstrtoul(number, 0, (unsigned long *)&count);
5248 if (ret)
5249 return ret;
5250
5251 out_reg:
5252 ret = register_ftrace_function_probe(glob, ops, count);
5253
5254 if (ret >= 0)
5255 alloc_snapshot(&global_trace);
5256
5257 return ret < 0 ? ret : 0;
5258}
5259
5260static struct ftrace_func_command ftrace_snapshot_cmd = {
5261 .name = "snapshot",
5262 .func = ftrace_trace_snapshot_callback,
5263};
5264
5265static int register_snapshot_cmd(void)
4590{ 5266{
4591 static int once; 5267 return register_ftrace_command(&ftrace_snapshot_cmd);
5268}
5269#else
5270static inline int register_snapshot_cmd(void) { return 0; }
5271#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
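
The block above registers a "snapshot" command for set_ftrace_filter, so that "func:snapshot" or "func:snapshot:N" arms a probe whose data pointer doubles as a countdown (missing count means unlimited, encoded as -1). A small standalone model of the parse-and-count-down behaviour, with invented names and printf in place of tracing_snapshot():

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long parse_count(char *param)        /* param may be NULL or "" */
{
	if (!param || !*param)
		return -1;                      /* unlimited */
	return strtol(param, NULL, 0);
}

static void probe_hit(long *count)
{
	if (*count == 0)
		return;                         /* budget exhausted: no more snapshots */
	if (*count != -1)
		(*count)--;
	printf("snapshot taken, remaining %ld\n", *count);
}

int main(void)
{
	char cmd[] = "snapshot:2";
	char *p = cmd;
	char *name = strsep(&p, ":");           /* "snapshot", p now points at "2" */
	long count = parse_count(p);
	int i;

	for (i = 0; i < 4; i++)
		probe_hit(&count);              /* only the first two hits fire */
	printf("%s done, count=%ld\n", name, count);
	return 0;
}

Storing the remaining count directly in the probe's data pointer, as the real callback does, avoids a separate allocation per registered probe.
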
4592 5272
4593 if (d_tracer) 5273struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
4594 return d_tracer; 5274{
5275 if (tr->dir)
5276 return tr->dir;
4595 5277
4596 if (!debugfs_initialized()) 5278 if (!debugfs_initialized())
4597 return NULL; 5279 return NULL;
4598 5280
4599 d_tracer = debugfs_create_dir("tracing", NULL); 5281 if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
5282 tr->dir = debugfs_create_dir("tracing", NULL);
4600 5283
4601 if (!d_tracer && !once) { 5284 if (!tr->dir)
4602 once = 1; 5285 pr_warn_once("Could not create debugfs directory 'tracing'\n");
4603 pr_warning("Could not create debugfs directory 'tracing'\n");
4604 return NULL;
4605 }
4606 5286
4607 return d_tracer; 5287 return tr->dir;
4608} 5288}
4609 5289
4610static struct dentry *d_percpu; 5290struct dentry *tracing_init_dentry(void)
5291{
5292 return tracing_init_dentry_tr(&global_trace);
5293}
4611 5294
4612static struct dentry *tracing_dentry_percpu(void) 5295static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
4613{ 5296{
4614 static int once;
4615 struct dentry *d_tracer; 5297 struct dentry *d_tracer;
4616 5298
4617 if (d_percpu) 5299 if (tr->percpu_dir)
4618 return d_percpu; 5300 return tr->percpu_dir;
4619
4620 d_tracer = tracing_init_dentry();
4621 5301
5302 d_tracer = tracing_init_dentry_tr(tr);
4622 if (!d_tracer) 5303 if (!d_tracer)
4623 return NULL; 5304 return NULL;
4624 5305
4625 d_percpu = debugfs_create_dir("per_cpu", d_tracer); 5306 tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
4626 5307
4627 if (!d_percpu && !once) { 5308 WARN_ONCE(!tr->percpu_dir,
4628 once = 1; 5309 "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
4629 pr_warning("Could not create debugfs directory 'per_cpu'\n");
4630 return NULL;
4631 }
4632 5310
4633 return d_percpu; 5311 return tr->percpu_dir;
4634} 5312}
4635 5313
4636static void tracing_init_debugfs_percpu(long cpu) 5314static void
5315tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
4637{ 5316{
4638 struct dentry *d_percpu = tracing_dentry_percpu(); 5317 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
5318 struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
4639 struct dentry *d_cpu; 5319 struct dentry *d_cpu;
4640 char cpu_dir[30]; /* 30 characters should be more than enough */ 5320 char cpu_dir[30]; /* 30 characters should be more than enough */
4641 5321
@@ -4651,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)
4651 5331
4652 /* per cpu trace_pipe */ 5332 /* per cpu trace_pipe */
4653 trace_create_file("trace_pipe", 0444, d_cpu, 5333 trace_create_file("trace_pipe", 0444, d_cpu,
4654 (void *) cpu, &tracing_pipe_fops); 5334 (void *)&data->trace_cpu, &tracing_pipe_fops);
4655 5335
4656 /* per cpu trace */ 5336 /* per cpu trace */
4657 trace_create_file("trace", 0644, d_cpu, 5337 trace_create_file("trace", 0644, d_cpu,
4658 (void *) cpu, &tracing_fops); 5338 (void *)&data->trace_cpu, &tracing_fops);
4659 5339
4660 trace_create_file("trace_pipe_raw", 0444, d_cpu, 5340 trace_create_file("trace_pipe_raw", 0444, d_cpu,
4661 (void *) cpu, &tracing_buffers_fops); 5341 (void *)&data->trace_cpu, &tracing_buffers_fops);
4662 5342
4663 trace_create_file("stats", 0444, d_cpu, 5343 trace_create_file("stats", 0444, d_cpu,
4664 (void *) cpu, &tracing_stats_fops); 5344 (void *)&data->trace_cpu, &tracing_stats_fops);
4665 5345
4666 trace_create_file("buffer_size_kb", 0444, d_cpu, 5346 trace_create_file("buffer_size_kb", 0444, d_cpu,
4667 (void *) cpu, &tracing_entries_fops); 5347 (void *)&data->trace_cpu, &tracing_entries_fops);
5348
5349#ifdef CONFIG_TRACER_SNAPSHOT
5350 trace_create_file("snapshot", 0644, d_cpu,
5351 (void *)&data->trace_cpu, &snapshot_fops);
5352
5353 trace_create_file("snapshot_raw", 0444, d_cpu,
5354 (void *)&data->trace_cpu, &snapshot_raw_fops);
5355#endif
4668} 5356}
4669 5357
4670#ifdef CONFIG_FTRACE_SELFTEST 5358#ifdef CONFIG_FTRACE_SELFTEST
@@ -4675,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)
4675struct trace_option_dentry { 5363struct trace_option_dentry {
4676 struct tracer_opt *opt; 5364 struct tracer_opt *opt;
4677 struct tracer_flags *flags; 5365 struct tracer_flags *flags;
5366 struct trace_array *tr;
4678 struct dentry *entry; 5367 struct dentry *entry;
4679}; 5368};
4680 5369
@@ -4710,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
4710 5399
4711 if (!!(topt->flags->val & topt->opt->bit) != val) { 5400 if (!!(topt->flags->val & topt->opt->bit) != val) {
4712 mutex_lock(&trace_types_lock); 5401 mutex_lock(&trace_types_lock);
4713 ret = __set_tracer_option(current_trace, topt->flags, 5402 ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
4714 topt->opt, !val); 5403 topt->opt, !val);
4715 mutex_unlock(&trace_types_lock); 5404 mutex_unlock(&trace_types_lock);
4716 if (ret) 5405 if (ret)
@@ -4749,6 +5438,7 @@ static ssize_t
4749trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, 5438trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4750 loff_t *ppos) 5439 loff_t *ppos)
4751{ 5440{
5441 struct trace_array *tr = &global_trace;
4752 long index = (long)filp->private_data; 5442 long index = (long)filp->private_data;
4753 unsigned long val; 5443 unsigned long val;
4754 int ret; 5444 int ret;
@@ -4759,7 +5449,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
4759 5449
4760 if (val != 0 && val != 1) 5450 if (val != 0 && val != 1)
4761 return -EINVAL; 5451 return -EINVAL;
4762 set_tracer_flags(1 << index, val); 5452
5453 mutex_lock(&trace_types_lock);
5454 ret = set_tracer_flag(tr, 1 << index, val);
5455 mutex_unlock(&trace_types_lock);
5456
5457 if (ret < 0)
5458 return ret;
4763 5459
4764 *ppos += cnt; 5460 *ppos += cnt;
4765 5461
@@ -4789,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,
4789} 5485}
4790 5486
4791 5487
4792static struct dentry *trace_options_init_dentry(void) 5488static struct dentry *trace_options_init_dentry(struct trace_array *tr)
4793{ 5489{
4794 struct dentry *d_tracer; 5490 struct dentry *d_tracer;
4795 static struct dentry *t_options;
4796 5491
4797 if (t_options) 5492 if (tr->options)
4798 return t_options; 5493 return tr->options;
4799 5494
4800 d_tracer = tracing_init_dentry(); 5495 d_tracer = tracing_init_dentry_tr(tr);
4801 if (!d_tracer) 5496 if (!d_tracer)
4802 return NULL; 5497 return NULL;
4803 5498
4804 t_options = debugfs_create_dir("options", d_tracer); 5499 tr->options = debugfs_create_dir("options", d_tracer);
4805 if (!t_options) { 5500 if (!tr->options) {
4806 pr_warning("Could not create debugfs directory 'options'\n"); 5501 pr_warning("Could not create debugfs directory 'options'\n");
4807 return NULL; 5502 return NULL;
4808 } 5503 }
4809 5504
4810 return t_options; 5505 return tr->options;
4811} 5506}
4812 5507
4813static void 5508static void
4814create_trace_option_file(struct trace_option_dentry *topt, 5509create_trace_option_file(struct trace_array *tr,
5510 struct trace_option_dentry *topt,
4815 struct tracer_flags *flags, 5511 struct tracer_flags *flags,
4816 struct tracer_opt *opt) 5512 struct tracer_opt *opt)
4817{ 5513{
4818 struct dentry *t_options; 5514 struct dentry *t_options;
4819 5515
4820 t_options = trace_options_init_dentry(); 5516 t_options = trace_options_init_dentry(tr);
4821 if (!t_options) 5517 if (!t_options)
4822 return; 5518 return;
4823 5519
4824 topt->flags = flags; 5520 topt->flags = flags;
4825 topt->opt = opt; 5521 topt->opt = opt;
5522 topt->tr = tr;
4826 5523
4827 topt->entry = trace_create_file(opt->name, 0644, t_options, topt, 5524 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
4828 &trace_options_fops); 5525 &trace_options_fops);
@@ -4830,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
4830} 5527}
4831 5528
4832static struct trace_option_dentry * 5529static struct trace_option_dentry *
4833create_trace_option_files(struct tracer *tracer) 5530create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
4834{ 5531{
4835 struct trace_option_dentry *topts; 5532 struct trace_option_dentry *topts;
4836 struct tracer_flags *flags; 5533 struct tracer_flags *flags;
@@ -4855,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)
4855 return NULL; 5552 return NULL;
4856 5553
4857 for (cnt = 0; opts[cnt].name; cnt++) 5554 for (cnt = 0; opts[cnt].name; cnt++)
4858 create_trace_option_file(&topts[cnt], flags, 5555 create_trace_option_file(tr, &topts[cnt], flags,
4859 &opts[cnt]); 5556 &opts[cnt]);
4860 5557
4861 return topts; 5558 return topts;
@@ -4878,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
4878} 5575}
4879 5576
4880static struct dentry * 5577static struct dentry *
4881create_trace_option_core_file(const char *option, long index) 5578create_trace_option_core_file(struct trace_array *tr,
5579 const char *option, long index)
4882{ 5580{
4883 struct dentry *t_options; 5581 struct dentry *t_options;
4884 5582
4885 t_options = trace_options_init_dentry(); 5583 t_options = trace_options_init_dentry(tr);
4886 if (!t_options) 5584 if (!t_options)
4887 return NULL; 5585 return NULL;
4888 5586
@@ -4890,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)
4890 &trace_options_core_fops); 5588 &trace_options_core_fops);
4891} 5589}
4892 5590
4893static __init void create_trace_options_dir(void) 5591static __init void create_trace_options_dir(struct trace_array *tr)
4894{ 5592{
4895 struct dentry *t_options; 5593 struct dentry *t_options;
4896 int i; 5594 int i;
4897 5595
4898 t_options = trace_options_init_dentry(); 5596 t_options = trace_options_init_dentry(tr);
4899 if (!t_options) 5597 if (!t_options)
4900 return; 5598 return;
4901 5599
4902 for (i = 0; trace_options[i]; i++) 5600 for (i = 0; trace_options[i]; i++)
4903 create_trace_option_core_file(trace_options[i], i); 5601 create_trace_option_core_file(tr, trace_options[i], i);
4904} 5602}
4905 5603
4906static ssize_t 5604static ssize_t
@@ -4908,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
4908 size_t cnt, loff_t *ppos) 5606 size_t cnt, loff_t *ppos)
4909{ 5607{
4910 struct trace_array *tr = filp->private_data; 5608 struct trace_array *tr = filp->private_data;
4911 struct ring_buffer *buffer = tr->buffer; 5609 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4912 char buf[64]; 5610 char buf[64];
4913 int r; 5611 int r;
4914 5612
@@ -4927,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4927 size_t cnt, loff_t *ppos) 5625 size_t cnt, loff_t *ppos)
4928{ 5626{
4929 struct trace_array *tr = filp->private_data; 5627 struct trace_array *tr = filp->private_data;
4930 struct ring_buffer *buffer = tr->buffer; 5628 struct ring_buffer *buffer = tr->trace_buffer.buffer;
4931 unsigned long val; 5629 unsigned long val;
4932 int ret; 5630 int ret;
4933 5631
@@ -4939,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
4939 mutex_lock(&trace_types_lock); 5637 mutex_lock(&trace_types_lock);
4940 if (val) { 5638 if (val) {
4941 ring_buffer_record_on(buffer); 5639 ring_buffer_record_on(buffer);
4942 if (current_trace->start) 5640 if (tr->current_trace->start)
4943 current_trace->start(tr); 5641 tr->current_trace->start(tr);
4944 } else { 5642 } else {
4945 ring_buffer_record_off(buffer); 5643 ring_buffer_record_off(buffer);
4946 if (current_trace->stop) 5644 if (tr->current_trace->stop)
4947 current_trace->stop(tr); 5645 tr->current_trace->stop(tr);
4948 } 5646 }
4949 mutex_unlock(&trace_types_lock); 5647 mutex_unlock(&trace_types_lock);
4950 } 5648 }
@@ -4961,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {
4961 .llseek = default_llseek, 5659 .llseek = default_llseek,
4962}; 5660};
4963 5661
5662struct dentry *trace_instance_dir;
5663
5664static void
5665init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
5666
5667static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
5668{
5669 int cpu;
5670
5671 for_each_tracing_cpu(cpu) {
5672 memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
5673 per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
5674 per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
5675 }
5676}
5677
5678static int
5679allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
5680{
5681 enum ring_buffer_flags rb_flags;
5682
5683 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5684
5685 buf->buffer = ring_buffer_alloc(size, rb_flags);
5686 if (!buf->buffer)
5687 return -ENOMEM;
5688
5689 buf->data = alloc_percpu(struct trace_array_cpu);
5690 if (!buf->data) {
5691 ring_buffer_free(buf->buffer);
5692 return -ENOMEM;
5693 }
5694
5695 init_trace_buffers(tr, buf);
5696
5697 /* Allocate the first page for all buffers */
5698 set_buffer_entries(&tr->trace_buffer,
5699 ring_buffer_size(tr->trace_buffer.buffer, 0));
5700
5701 return 0;
5702}
5703
5704static int allocate_trace_buffers(struct trace_array *tr, int size)
5705{
5706 int ret;
5707
5708 ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
5709 if (ret)
5710 return ret;
5711
5712#ifdef CONFIG_TRACER_MAX_TRACE
5713 ret = allocate_trace_buffer(tr, &tr->max_buffer,
5714 allocate_snapshot ? size : 1);
5715 if (WARN_ON(ret)) {
5716 ring_buffer_free(tr->trace_buffer.buffer);
5717 free_percpu(tr->trace_buffer.data);
5718 return -ENOMEM;
5719 }
5720 tr->allocated_snapshot = allocate_snapshot;
5721
5722 /*
5723 * Only the top level trace array gets its snapshot allocated
5724 * from the kernel command line.
5725 */
5726 allocate_snapshot = false;
5727#endif
5728 return 0;
5729}
5730
5731static int new_instance_create(const char *name)
5732{
5733 struct trace_array *tr;
5734 int ret;
5735
5736 mutex_lock(&trace_types_lock);
5737
5738 ret = -EEXIST;
5739 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5740 if (tr->name && strcmp(tr->name, name) == 0)
5741 goto out_unlock;
5742 }
5743
5744 ret = -ENOMEM;
5745 tr = kzalloc(sizeof(*tr), GFP_KERNEL);
5746 if (!tr)
5747 goto out_unlock;
5748
5749 tr->name = kstrdup(name, GFP_KERNEL);
5750 if (!tr->name)
5751 goto out_free_tr;
5752
5753 raw_spin_lock_init(&tr->start_lock);
5754
5755 tr->current_trace = &nop_trace;
5756
5757 INIT_LIST_HEAD(&tr->systems);
5758 INIT_LIST_HEAD(&tr->events);
5759
5760 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
5761 goto out_free_tr;
5762
5763 /* Holder for file callbacks */
5764 tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
5765 tr->trace_cpu.tr = tr;
5766
5767 tr->dir = debugfs_create_dir(name, trace_instance_dir);
5768 if (!tr->dir)
5769 goto out_free_tr;
5770
5771 ret = event_trace_add_tracer(tr->dir, tr);
5772 if (ret)
5773 goto out_free_tr;
5774
5775 init_tracer_debugfs(tr, tr->dir);
5776
5777 list_add(&tr->list, &ftrace_trace_arrays);
5778
5779 mutex_unlock(&trace_types_lock);
5780
5781 return 0;
5782
5783 out_free_tr:
5784 if (tr->trace_buffer.buffer)
5785 ring_buffer_free(tr->trace_buffer.buffer);
5786 kfree(tr->name);
5787 kfree(tr);
5788
5789 out_unlock:
5790 mutex_unlock(&trace_types_lock);
5791
5792 return ret;
5793
5794}
5795
5796static int instance_delete(const char *name)
5797{
5798 struct trace_array *tr;
5799 int found = 0;
5800 int ret;
5801
5802 mutex_lock(&trace_types_lock);
5803
5804 ret = -ENODEV;
5805 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
5806 if (tr->name && strcmp(tr->name, name) == 0) {
5807 found = 1;
5808 break;
5809 }
5810 }
5811 if (!found)
5812 goto out_unlock;
5813
5814 ret = -EBUSY;
5815 if (tr->ref)
5816 goto out_unlock;
5817
5818 list_del(&tr->list);
5819
5820 event_trace_del_tracer(tr);
5821 debugfs_remove_recursive(tr->dir);
5822 free_percpu(tr->trace_buffer.data);
5823 ring_buffer_free(tr->trace_buffer.buffer);
5824
5825 kfree(tr->name);
5826 kfree(tr);
5827
5828 ret = 0;
5829
5830 out_unlock:
5831 mutex_unlock(&trace_types_lock);
5832
5833 return ret;
5834}
5835
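
new_instance_create() and instance_delete() above manage the named multi-buffer instances under trace_types_lock: a duplicate name fails with -EEXIST, a half-built instance is torn down on error, and an instance with live readers (tr->ref) refuses to die with -EBUSY. A userspace model of that registry using a singly linked list and errno-style returns (the caller is assumed to hold the equivalent lock; none of these names are kernel API):

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct instance {
	char *name;
	int ref;                   /* bumped by open, dropped by release */
	struct instance *next;
};

static struct instance *instances;   /* walked only under the caller's lock */

static int instance_create(const char *name)
{
	struct instance *it;

	for (it = instances; it; it = it->next)
		if (strcmp(it->name, name) == 0)
			return -EEXIST;

	it = calloc(1, sizeof(*it));
	if (!it || !(it->name = strdup(name))) {
		free(it);
		return -ENOMEM;
	}
	it->next = instances;
	instances = it;
	return 0;
}

static int instance_delete_model(const char *name)
{
	struct instance **p, *it;

	for (p = &instances; (it = *p); p = &it->next) {
		if (strcmp(it->name, name) != 0)
			continue;
		if (it->ref)
			return -EBUSY;     /* a trace_pipe_raw reader is still attached */
		*p = it->next;
		free(it->name);
		free(it);
		return 0;
	}
	return -ENODEV;
}

The ref check is what makes the mkdir/rmdir interface safe: tracing_buffers_open() takes the reference, so rmdir on an instance someone is still splicing from is simply refused.
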
5836static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
5837{
5838 struct dentry *parent;
5839 int ret;
5840
5841 /* Paranoid: Make sure the parent is the "instances" directory */
5842 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5843 if (WARN_ON_ONCE(parent != trace_instance_dir))
5844 return -ENOENT;
5845
5846 /*
5847 * The inode mutex is locked, but debugfs_create_dir() will also
5848 * take the mutex. As the instances directory can not be destroyed
5849 * or changed in any other way, it is safe to unlock it, and
5850 * let the dentry try. If two users try to make the same dir at
5851 * the same time, then the new_instance_create() will determine the
5852 * winner.
5853 */
5854 mutex_unlock(&inode->i_mutex);
5855
5856 ret = new_instance_create(dentry->d_iname);
5857
5858 mutex_lock(&inode->i_mutex);
5859
5860 return ret;
5861}
5862
5863static int instance_rmdir(struct inode *inode, struct dentry *dentry)
5864{
5865 struct dentry *parent;
5866 int ret;
5867
5868 /* Paranoid: Make sure the parent is the "instances" directory */
5869 parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
5870 if (WARN_ON_ONCE(parent != trace_instance_dir))
5871 return -ENOENT;
5872
5873 /* The caller did a dget() on dentry */
5874 mutex_unlock(&dentry->d_inode->i_mutex);
5875
5876 /*
5877 * The inode mutex is locked, but debugfs_create_dir() will also
5878 * take the mutex. As the instances directory can not be destroyed
5879 * or changed in any other way, it is safe to unlock it, and
5880 * let the dentry try. If two users try to make the same dir at
5881 * the same time, then the instance_delete() will determine the
5882 * winner.
5883 */
5884 mutex_unlock(&inode->i_mutex);
5885
5886 ret = instance_delete(dentry->d_iname);
5887
5888 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
5889 mutex_lock(&dentry->d_inode->i_mutex);
5890
5891 return ret;
5892}
5893
5894static const struct inode_operations instance_dir_inode_operations = {
5895 .lookup = simple_lookup,
5896 .mkdir = instance_mkdir,
5897 .rmdir = instance_rmdir,
5898};
5899
5900static __init void create_trace_instances(struct dentry *d_tracer)
5901{
5902 trace_instance_dir = debugfs_create_dir("instances", d_tracer);
5903 if (WARN_ON(!trace_instance_dir))
5904 return;
5905
5906 /* Hijack the dir inode operations, to allow mkdir */
5907 trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
5908}
5909
5910static void
5911init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
5912{
5913 int cpu;
5914
5915 trace_create_file("trace_options", 0644, d_tracer,
5916 tr, &tracing_iter_fops);
5917
5918 trace_create_file("trace", 0644, d_tracer,
5919 (void *)&tr->trace_cpu, &tracing_fops);
5920
5921 trace_create_file("trace_pipe", 0444, d_tracer,
5922 (void *)&tr->trace_cpu, &tracing_pipe_fops);
5923
5924 trace_create_file("buffer_size_kb", 0644, d_tracer,
5925 (void *)&tr->trace_cpu, &tracing_entries_fops);
5926
5927 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5928 tr, &tracing_total_entries_fops);
5929
5930 trace_create_file("free_buffer", 0644, d_tracer,
5931 tr, &tracing_free_buffer_fops);
5932
5933 trace_create_file("trace_marker", 0220, d_tracer,
5934 tr, &tracing_mark_fops);
5935
5936 trace_create_file("trace_clock", 0644, d_tracer, tr,
5937 &trace_clock_fops);
5938
5939 trace_create_file("tracing_on", 0644, d_tracer,
5940 tr, &rb_simple_fops);
5941
5942#ifdef CONFIG_TRACER_SNAPSHOT
5943 trace_create_file("snapshot", 0644, d_tracer,
5944 (void *)&tr->trace_cpu, &snapshot_fops);
5945#endif
5946
5947 for_each_tracing_cpu(cpu)
5948 tracing_init_debugfs_percpu(tr, cpu);
5949
5950}
5951
4964static __init int tracer_init_debugfs(void) 5952static __init int tracer_init_debugfs(void)
4965{ 5953{
4966 struct dentry *d_tracer; 5954 struct dentry *d_tracer;
4967 int cpu;
4968 5955
4969 trace_access_lock_init(); 5956 trace_access_lock_init();
4970 5957
4971 d_tracer = tracing_init_dentry(); 5958 d_tracer = tracing_init_dentry();
5959 if (!d_tracer)
5960 return 0;
4972 5961
4973 trace_create_file("trace_options", 0644, d_tracer, 5962 init_tracer_debugfs(&global_trace, d_tracer);
4974 NULL, &tracing_iter_fops);
4975 5963
4976 trace_create_file("tracing_cpumask", 0644, d_tracer, 5964 trace_create_file("tracing_cpumask", 0644, d_tracer,
4977 NULL, &tracing_cpumask_fops); 5965 &global_trace, &tracing_cpumask_fops);
4978
4979 trace_create_file("trace", 0644, d_tracer,
4980 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
4981 5966
4982 trace_create_file("available_tracers", 0444, d_tracer, 5967 trace_create_file("available_tracers", 0444, d_tracer,
4983 &global_trace, &show_traces_fops); 5968 &global_trace, &show_traces_fops);
@@ -4996,44 +5981,17 @@ static __init int tracer_init_debugfs(void)
4996 trace_create_file("README", 0444, d_tracer, 5981 trace_create_file("README", 0444, d_tracer,
4997 NULL, &tracing_readme_fops); 5982 NULL, &tracing_readme_fops);
4998 5983
4999 trace_create_file("trace_pipe", 0444, d_tracer,
5000 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
5001
5002 trace_create_file("buffer_size_kb", 0644, d_tracer,
5003 (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
5004
5005 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
5006 &global_trace, &tracing_total_entries_fops);
5007
5008 trace_create_file("free_buffer", 0644, d_tracer,
5009 &global_trace, &tracing_free_buffer_fops);
5010
5011 trace_create_file("trace_marker", 0220, d_tracer,
5012 NULL, &tracing_mark_fops);
5013
5014 trace_create_file("saved_cmdlines", 0444, d_tracer, 5984 trace_create_file("saved_cmdlines", 0444, d_tracer,
5015 NULL, &tracing_saved_cmdlines_fops); 5985 NULL, &tracing_saved_cmdlines_fops);
5016 5986
5017 trace_create_file("trace_clock", 0644, d_tracer, NULL,
5018 &trace_clock_fops);
5019
5020 trace_create_file("tracing_on", 0644, d_tracer,
5021 &global_trace, &rb_simple_fops);
5022
5023#ifdef CONFIG_DYNAMIC_FTRACE 5987#ifdef CONFIG_DYNAMIC_FTRACE
5024 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 5988 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
5025 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5989 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
5026#endif 5990#endif
5027 5991
5028#ifdef CONFIG_TRACER_SNAPSHOT 5992 create_trace_instances(d_tracer);
5029 trace_create_file("snapshot", 0644, d_tracer,
5030 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5031#endif
5032
5033 create_trace_options_dir();
5034 5993
5035 for_each_tracing_cpu(cpu) 5994 create_trace_options_dir(&global_trace);
5036 tracing_init_debugfs_percpu(cpu);
5037 5995
5038 return 0; 5996 return 0;
5039} 5997}
@@ -5089,8 +6047,8 @@ void
5089trace_printk_seq(struct trace_seq *s) 6047trace_printk_seq(struct trace_seq *s)
5090{ 6048{
5091 /* Probably should print a warning here. */ 6049 /* Probably should print a warning here. */
5092 if (s->len >= 1000) 6050 if (s->len >= TRACE_MAX_PRINT)
5093 s->len = 1000; 6051 s->len = TRACE_MAX_PRINT;
5094 6052
5095 /* should be zero ended, but we are paranoid. */ 6053 /* should be zero ended, but we are paranoid. */
5096 s->buffer[s->len] = 0; 6054 s->buffer[s->len] = 0;
@@ -5103,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)
5103void trace_init_global_iter(struct trace_iterator *iter) 6061void trace_init_global_iter(struct trace_iterator *iter)
5104{ 6062{
5105 iter->tr = &global_trace; 6063 iter->tr = &global_trace;
5106 iter->trace = current_trace; 6064 iter->trace = iter->tr->current_trace;
5107 iter->cpu_file = TRACE_PIPE_ALL_CPU; 6065 iter->cpu_file = RING_BUFFER_ALL_CPUS;
6066 iter->trace_buffer = &global_trace.trace_buffer;
5108} 6067}
5109 6068
5110static void 6069void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5111__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5112{ 6070{
5113 static arch_spinlock_t ftrace_dump_lock =
5114 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
5115 /* use static because iter can be a bit big for the stack */ 6071 /* use static because iter can be a bit big for the stack */
5116 static struct trace_iterator iter; 6072 static struct trace_iterator iter;
6073 static atomic_t dump_running;
5117 unsigned int old_userobj; 6074 unsigned int old_userobj;
5118 static int dump_ran;
5119 unsigned long flags; 6075 unsigned long flags;
5120 int cnt = 0, cpu; 6076 int cnt = 0, cpu;
5121 6077
5122 /* only one dump */ 6078 /* Only allow one dump user at a time. */
5123 local_irq_save(flags); 6079 if (atomic_inc_return(&dump_running) != 1) {
5124 arch_spin_lock(&ftrace_dump_lock); 6080 atomic_dec(&dump_running);
5125 if (dump_ran) 6081 return;
5126 goto out; 6082 }
5127
5128 dump_ran = 1;
5129 6083
6084 /*
6085 * Always turn off tracing when we dump.
6086 * We don't need to show trace output of what happens
6087 * between multiple crashes.
6088 *
6089 * If the user does a sysrq-z, then they can re-enable
6090 * tracing with echo 1 > tracing_on.
6091 */
5130 tracing_off(); 6092 tracing_off();
5131 6093
5132 /* Did function tracer already get disabled? */ 6094 local_irq_save(flags);
5133 if (ftrace_is_dead()) {
5134 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
5135 printk("# MAY BE MISSING FUNCTION EVENTS\n");
5136 }
5137
5138 if (disable_tracing)
5139 ftrace_kill();
5140 6095
5141 /* Simulate the iterator */ 6096 /* Simulate the iterator */
5142 trace_init_global_iter(&iter); 6097 trace_init_global_iter(&iter);
5143 6098
5144 for_each_tracing_cpu(cpu) { 6099 for_each_tracing_cpu(cpu) {
5145 atomic_inc(&iter.tr->data[cpu]->disabled); 6100 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
5146 } 6101 }
5147 6102
5148 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6103 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
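The arch_spinlock_t/dump_ran pair is replaced by one atomic counter: only the caller that takes dump_running from 0 to 1 gets to dump, everyone else decrements and leaves. A rough userspace equivalent of that guard using C11 atomics (atomic_fetch_add() returns the old value, so old + 1 plays the role of the kernel's atomic_inc_return()):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int dump_running;

/* Only the caller that moves the counter from 0 to 1 may proceed. */
static void dump_once(const char *who)
{
        if (atomic_fetch_add(&dump_running, 1) + 1 != 1) {
                atomic_fetch_sub(&dump_running, 1);
                printf("%s: dump already in progress, bailing\n", who);
                return;
        }
        printf("%s: dumping the buffers...\n", who);
        /* ... walk the ring buffers here ... */
        atomic_fetch_sub(&dump_running, 1);
}

int main(void)
{
        dump_once("cpu0");
        dump_once("cpu1");
        return 0;
}

Unlike the old dump_ran flag, the counter is released at the end of the dump, so a later sysrq-z can dump again.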
@@ -5152,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5152 6107
5153 switch (oops_dump_mode) { 6108 switch (oops_dump_mode) {
5154 case DUMP_ALL: 6109 case DUMP_ALL:
5155 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6110 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5156 break; 6111 break;
5157 case DUMP_ORIG: 6112 case DUMP_ORIG:
5158 iter.cpu_file = raw_smp_processor_id(); 6113 iter.cpu_file = raw_smp_processor_id();
@@ -5161,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5161 goto out_enable; 6116 goto out_enable;
5162 default: 6117 default:
5163 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); 6118 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
5164 iter.cpu_file = TRACE_PIPE_ALL_CPU; 6119 iter.cpu_file = RING_BUFFER_ALL_CPUS;
5165 } 6120 }
5166 6121
5167 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 6122 printk(KERN_TRACE "Dumping ftrace buffer:\n");
5168 6123
6124 /* Did function tracer already get disabled? */
6125 if (ftrace_is_dead()) {
6126 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
6127 printk("# MAY BE MISSING FUNCTION EVENTS\n");
6128 }
6129
5169 /* 6130 /*
5170 * We need to stop all tracing on all CPUS to read the 6131 * We need to stop all tracing on all CPUS to read the
5171 * the next buffer. This is a bit expensive, but is 6132 * the next buffer. This is a bit expensive, but is
@@ -5205,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5205 printk(KERN_TRACE "---------------------------------\n"); 6166 printk(KERN_TRACE "---------------------------------\n");
5206 6167
5207 out_enable: 6168 out_enable:
5208 /* Re-enable tracing if requested */ 6169 trace_flags |= old_userobj;
5209 if (!disable_tracing) {
5210 trace_flags |= old_userobj;
5211 6170
5212 for_each_tracing_cpu(cpu) { 6171 for_each_tracing_cpu(cpu) {
5213 atomic_dec(&iter.tr->data[cpu]->disabled); 6172 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
5214 }
5215 tracing_on();
5216 } 6173 }
5217 6174 atomic_dec(&dump_running);
5218 out:
5219 arch_spin_unlock(&ftrace_dump_lock);
5220 local_irq_restore(flags); 6175 local_irq_restore(flags);
5221} 6176}
5222
5223/* By default: disable tracing after the dump */
5224void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
5225{
5226 __ftrace_dump(true, oops_dump_mode);
5227}
5228EXPORT_SYMBOL_GPL(ftrace_dump); 6177EXPORT_SYMBOL_GPL(ftrace_dump);
5229 6178
5230__init static int tracer_alloc_buffers(void) 6179__init static int tracer_alloc_buffers(void)
5231{ 6180{
5232 int ring_buf_size; 6181 int ring_buf_size;
5233 enum ring_buffer_flags rb_flags;
5234 int i;
5235 int ret = -ENOMEM; 6182 int ret = -ENOMEM;
5236 6183
5237 6184
@@ -5252,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)
5252 else 6199 else
5253 ring_buf_size = 1; 6200 ring_buf_size = 1;
5254 6201
5255 rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
5256
5257 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 6202 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
5258 cpumask_copy(tracing_cpumask, cpu_all_mask); 6203 cpumask_copy(tracing_cpumask, cpu_all_mask);
5259 6204
6205 raw_spin_lock_init(&global_trace.start_lock);
6206
5260 /* TODO: make the number of buffers hot pluggable with CPUS */ 6207 /* TODO: make the number of buffers hot pluggable with CPUS */
5261 global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); 6208 if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
5262 if (!global_trace.buffer) {
5263 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 6209 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
5264 WARN_ON(1); 6210 WARN_ON(1);
5265 goto out_free_cpumask; 6211 goto out_free_cpumask;
5266 } 6212 }
6213
5267 if (global_trace.buffer_disabled) 6214 if (global_trace.buffer_disabled)
5268 tracing_off(); 6215 tracing_off();
5269 6216
5270
5271#ifdef CONFIG_TRACER_MAX_TRACE
5272 max_tr.buffer = ring_buffer_alloc(1, rb_flags);
5273 if (!max_tr.buffer) {
5274 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
5275 WARN_ON(1);
5276 ring_buffer_free(global_trace.buffer);
5277 goto out_free_cpumask;
5278 }
5279#endif
5280
5281 /* Allocate the first page for all buffers */
5282 for_each_tracing_cpu(i) {
5283 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
5284 max_tr.data[i] = &per_cpu(max_tr_data, i);
5285 }
5286
5287 set_buffer_entries(&global_trace,
5288 ring_buffer_size(global_trace.buffer, 0));
5289#ifdef CONFIG_TRACER_MAX_TRACE
5290 set_buffer_entries(&max_tr, 1);
5291#endif
5292
5293 trace_init_cmdlines(); 6217 trace_init_cmdlines();
5294 init_irq_work(&trace_work_wakeup, trace_wake_up);
5295 6218
5296 register_tracer(&nop_trace); 6219 register_tracer(&nop_trace);
5297 6220
6221 global_trace.current_trace = &nop_trace;
6222
5298 /* All seems OK, enable tracing */ 6223 /* All seems OK, enable tracing */
5299 tracing_disabled = 0; 6224 tracing_disabled = 0;
5300 6225
@@ -5303,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)
5303 6228
5304 register_die_notifier(&trace_die_notifier); 6229 register_die_notifier(&trace_die_notifier);
5305 6230
6231 global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
6232
6233 /* Holder for file callbacks */
6234 global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
6235 global_trace.trace_cpu.tr = &global_trace;
6236
6237 INIT_LIST_HEAD(&global_trace.systems);
6238 INIT_LIST_HEAD(&global_trace.events);
6239 list_add(&global_trace.list, &ftrace_trace_arrays);
6240
5306 while (trace_boot_options) { 6241 while (trace_boot_options) {
5307 char *option; 6242 char *option;
5308 6243
5309 option = strsep(&trace_boot_options, ","); 6244 option = strsep(&trace_boot_options, ",");
5310 trace_set_options(option); 6245 trace_set_options(&global_trace, option);
5311 } 6246 }
5312 6247
6248 register_snapshot_cmd();
6249
5313 return 0; 6250 return 0;
5314 6251
5315out_free_cpumask: 6252out_free_cpumask:
6253 free_percpu(global_trace.trace_buffer.data);
6254#ifdef CONFIG_TRACER_MAX_TRACE
6255 free_percpu(global_trace.max_buffer.data);
6256#endif
5316 free_cpumask_var(tracing_cpumask); 6257 free_cpumask_var(tracing_cpumask);
5317out_free_buffer_mask: 6258out_free_buffer_mask:
5318 free_cpumask_var(tracing_buffer_mask); 6259 free_cpumask_var(tracing_buffer_mask);
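trace_set_options() now takes the target trace_array, but the boot-string handling is unchanged: trace_boot_options is split on commas with strsep() and each token is applied in turn. A self-contained sketch of that loop; the option strings and the set_option() helper are placeholders, not real tracing options:

#define _DEFAULT_SOURCE                  /* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

/* Hand each comma-separated token to a per-array option setter. */
static void set_option(const char *tr_name, const char *opt)
{
        printf("%s: set option '%s'\n", tr_name, opt);
}

int main(void)
{
        char  boot_options[] = "optA,optB,nooptC";   /* example string only */
        char *rest = boot_options;
        char *option;

        while ((option = strsep(&rest, ",")) != NULL)
                set_option("global_trace", option);
        return 0;
}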
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 57d7e5397d56..711ca7d3e7f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
13#include <linux/trace_seq.h> 13#include <linux/trace_seq.h>
14#include <linux/ftrace_event.h> 14#include <linux/ftrace_event.h>
15 15
16#ifdef CONFIG_FTRACE_SYSCALLS
17#include <asm/unistd.h> /* For NR_SYSCALLS */
18#include <asm/syscall.h> /* some archs define it here */
19#endif
20
16enum trace_type { 21enum trace_type {
17 __TRACE_FIRST_TYPE = 0, 22 __TRACE_FIRST_TYPE = 0,
18 23
@@ -29,6 +34,7 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 35 TRACE_USER_STACK,
31 TRACE_BLK, 36 TRACE_BLK,
37 TRACE_BPUTS,
32 38
33 __TRACE_LAST_TYPE, 39 __TRACE_LAST_TYPE,
34}; 40};
@@ -103,11 +109,6 @@ struct kretprobe_trace_entry_head {
103 unsigned long ret_ip; 109 unsigned long ret_ip;
104}; 110};
105 111
106struct uprobe_trace_entry_head {
107 struct trace_entry ent;
108 unsigned long ip;
109};
110
111/* 112/*
112 * trace_flag_type is an enumeration that holds different 113 * trace_flag_type is an enumeration that holds different
113 * states when a trace occurs. These are: 114 * states when a trace occurs. These are:
@@ -127,12 +128,21 @@ enum trace_flag_type {
127 128
128#define TRACE_BUF_SIZE 1024 129#define TRACE_BUF_SIZE 1024
129 130
131struct trace_array;
132
133struct trace_cpu {
134 struct trace_array *tr;
135 struct dentry *dir;
136 int cpu;
137};
138
130/* 139/*
131 * The CPU trace array - it consists of thousands of trace entries 140 * The CPU trace array - it consists of thousands of trace entries
132 * plus some other descriptor data: (for example which task started 141 * plus some other descriptor data: (for example which task started
133 * the trace, etc.) 142 * the trace, etc.)
134 */ 143 */
135struct trace_array_cpu { 144struct trace_array_cpu {
145 struct trace_cpu trace_cpu;
136 atomic_t disabled; 146 atomic_t disabled;
137 void *buffer_page; /* ring buffer spare */ 147 void *buffer_page; /* ring buffer spare */
138 148
@@ -151,20 +161,83 @@ struct trace_array_cpu {
151 char comm[TASK_COMM_LEN]; 161 char comm[TASK_COMM_LEN];
152}; 162};
153 163
164struct tracer;
165
166struct trace_buffer {
167 struct trace_array *tr;
168 struct ring_buffer *buffer;
169 struct trace_array_cpu __percpu *data;
170 cycle_t time_start;
171 int cpu;
172};
173
154/* 174/*
155 * The trace array - an array of per-CPU trace arrays. This is the 175 * The trace array - an array of per-CPU trace arrays. This is the
156 * highest level data structure that individual tracers deal with. 176 * highest level data structure that individual tracers deal with.
157 * They have on/off state as well: 177 * They have on/off state as well:
158 */ 178 */
159struct trace_array { 179struct trace_array {
160 struct ring_buffer *buffer; 180 struct list_head list;
161 int cpu; 181 char *name;
182 struct trace_buffer trace_buffer;
183#ifdef CONFIG_TRACER_MAX_TRACE
184 /*
185 * The max_buffer is used to snapshot the trace when a maximum
186 * latency is reached, or when the user initiates a snapshot.
187 * Some tracers will use this to store a maximum trace while
188 * it continues examining live traces.
189 *
190 * The buffers for the max_buffer are set up the same as the trace_buffer
191 * When a snapshot is taken, the buffer of the max_buffer is swapped
192 * with the buffer of the trace_buffer and the buffers are reset for
193 * the trace_buffer so the tracing can continue.
194 */
195 struct trace_buffer max_buffer;
196 bool allocated_snapshot;
197#endif
162 int buffer_disabled; 198 int buffer_disabled;
163 cycle_t time_start; 199 struct trace_cpu trace_cpu; /* place holder */
200#ifdef CONFIG_FTRACE_SYSCALLS
201 int sys_refcount_enter;
202 int sys_refcount_exit;
203 DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
204 DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
205#endif
206 int stop_count;
207 int clock_id;
208 struct tracer *current_trace;
209 unsigned int flags;
210 raw_spinlock_t start_lock;
211 struct dentry *dir;
212 struct dentry *options;
213 struct dentry *percpu_dir;
214 struct dentry *event_dir;
215 struct list_head systems;
216 struct list_head events;
164 struct task_struct *waiter; 217 struct task_struct *waiter;
165 struct trace_array_cpu *data[NR_CPUS]; 218 int ref;
219};
220
221enum {
222 TRACE_ARRAY_FL_GLOBAL = (1 << 0)
166}; 223};
167 224
225extern struct list_head ftrace_trace_arrays;
226
227/*
228 * The global tracer (top) should be the first trace array added,
229 * but we check the flag anyway.
230 */
231static inline struct trace_array *top_trace_array(void)
232{
233 struct trace_array *tr;
234
235 tr = list_entry(ftrace_trace_arrays.prev,
236 typeof(*tr), list);
237 WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
238 return tr;
239}
240
168#define FTRACE_CMP_TYPE(var, type) \ 241#define FTRACE_CMP_TYPE(var, type) \
169 __builtin_types_compatible_p(typeof(var), type *) 242 __builtin_types_compatible_p(typeof(var), type *)
170 243
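top_trace_array() leans on insertion order: the global tracer is added to ftrace_trace_arrays first, and since list_add() inserts at the head, the oldest entry ends up at head->prev, which list_entry() (container_of()) turns back into the enclosing trace_array. A userspace sketch of the same pointer trick with a hand-rolled circular list; this is not the kernel's <linux/list.h>, only enough of it to show the arithmetic:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct trace_array_like {
        const char      *name;
        struct list_head list;
};

/* Insert right after the head, like the kernel's list_add(). */
static void list_add_head(struct list_head *entry, struct list_head *head)
{
        entry->next = head->next;
        entry->prev = head;
        head->next->prev = entry;
        head->next = entry;
}

int main(void)
{
        struct list_head arrays = { &arrays, &arrays };
        struct trace_array_like global = { .name = "global" };
        struct trace_array_like inst   = { .name = "instance1" };

        list_add_head(&global.list, &arrays);  /* added first ... */
        list_add_head(&inst.list, &arrays);    /* ... later adds push it toward the tail */

        /* The oldest entry is head->prev, exactly what top_trace_array() relies on. */
        struct trace_array_like *top =
                container_of(arrays.prev, struct trace_array_like, list);
        printf("top trace array: %s\n", top->name);
        return 0;
}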
@@ -200,6 +273,7 @@ extern void __ftrace_bad_type(void);
200 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 273 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
201 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 274 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
202 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ 275 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
276 IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
203 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 277 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
204 TRACE_MMIO_RW); \ 278 TRACE_MMIO_RW); \
205 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 279 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -283,11 +357,16 @@ struct tracer {
283 enum print_line_t (*print_line)(struct trace_iterator *iter); 357 enum print_line_t (*print_line)(struct trace_iterator *iter);
284 /* If you handled the flag setting, return 0 */ 358 /* If you handled the flag setting, return 0 */
285 int (*set_flag)(u32 old_flags, u32 bit, int set); 359 int (*set_flag)(u32 old_flags, u32 bit, int set);
360 /* Return 0 if OK with change, else return non-zero */
361 int (*flag_changed)(struct tracer *tracer,
362 u32 mask, int set);
286 struct tracer *next; 363 struct tracer *next;
287 struct tracer_flags *flags; 364 struct tracer_flags *flags;
288 bool print_max; 365 bool print_max;
366 bool enabled;
367#ifdef CONFIG_TRACER_MAX_TRACE
289 bool use_max_tr; 368 bool use_max_tr;
290 bool allocated_snapshot; 369#endif
291}; 370};
292 371
293 372
@@ -423,8 +502,6 @@ static __always_inline void trace_clear_recursion(int bit)
423 current->trace_recursion = val; 502 current->trace_recursion = val;
424} 503}
425 504
426#define TRACE_PIPE_ALL_CPU -1
427
428static inline struct ring_buffer_iter * 505static inline struct ring_buffer_iter *
429trace_buffer_iter(struct trace_iterator *iter, int cpu) 506trace_buffer_iter(struct trace_iterator *iter, int cpu)
430{ 507{
@@ -435,10 +512,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
435 512
436int tracer_init(struct tracer *t, struct trace_array *tr); 513int tracer_init(struct tracer *t, struct trace_array *tr);
437int tracing_is_enabled(void); 514int tracing_is_enabled(void);
438void tracing_reset(struct trace_array *tr, int cpu); 515void tracing_reset(struct trace_buffer *buf, int cpu);
439void tracing_reset_online_cpus(struct trace_array *tr); 516void tracing_reset_online_cpus(struct trace_buffer *buf);
440void tracing_reset_current(int cpu); 517void tracing_reset_current(int cpu);
441void tracing_reset_current_online_cpus(void); 518void tracing_reset_all_online_cpus(void);
442int tracing_open_generic(struct inode *inode, struct file *filp); 519int tracing_open_generic(struct inode *inode, struct file *filp);
443struct dentry *trace_create_file(const char *name, 520struct dentry *trace_create_file(const char *name,
444 umode_t mode, 521 umode_t mode,
@@ -446,6 +523,7 @@ struct dentry *trace_create_file(const char *name,
446 void *data, 523 void *data,
447 const struct file_operations *fops); 524 const struct file_operations *fops);
448 525
526struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
449struct dentry *tracing_init_dentry(void); 527struct dentry *tracing_init_dentry(void);
450 528
451struct ring_buffer_event; 529struct ring_buffer_event;
@@ -579,7 +657,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
579#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 657#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
580extern int DYN_FTRACE_TEST_NAME2(void); 658extern int DYN_FTRACE_TEST_NAME2(void);
581 659
582extern int ring_buffer_expanded; 660extern bool ring_buffer_expanded;
583extern bool tracing_selftest_disabled; 661extern bool tracing_selftest_disabled;
584DECLARE_PER_CPU(int, ftrace_cpu_disabled); 662DECLARE_PER_CPU(int, ftrace_cpu_disabled);
585 663
@@ -615,6 +693,8 @@ trace_array_vprintk(struct trace_array *tr,
615 unsigned long ip, const char *fmt, va_list args); 693 unsigned long ip, const char *fmt, va_list args);
616int trace_array_printk(struct trace_array *tr, 694int trace_array_printk(struct trace_array *tr,
617 unsigned long ip, const char *fmt, ...); 695 unsigned long ip, const char *fmt, ...);
696int trace_array_printk_buf(struct ring_buffer *buffer,
697 unsigned long ip, const char *fmt, ...);
618void trace_printk_seq(struct trace_seq *s); 698void trace_printk_seq(struct trace_seq *s);
619enum print_line_t print_trace_line(struct trace_iterator *iter); 699enum print_line_t print_trace_line(struct trace_iterator *iter);
620 700
@@ -782,6 +862,7 @@ enum trace_iterator_flags {
782 TRACE_ITER_STOP_ON_FREE = 0x400000, 862 TRACE_ITER_STOP_ON_FREE = 0x400000,
783 TRACE_ITER_IRQ_INFO = 0x800000, 863 TRACE_ITER_IRQ_INFO = 0x800000,
784 TRACE_ITER_MARKERS = 0x1000000, 864 TRACE_ITER_MARKERS = 0x1000000,
865 TRACE_ITER_FUNCTION = 0x2000000,
785}; 866};
786 867
787/* 868/*
@@ -828,8 +909,8 @@ enum {
828 909
829struct ftrace_event_field { 910struct ftrace_event_field {
830 struct list_head link; 911 struct list_head link;
831 char *name; 912 const char *name;
832 char *type; 913 const char *type;
833 int filter_type; 914 int filter_type;
834 int offset; 915 int offset;
835 int size; 916 int size;
@@ -847,12 +928,19 @@ struct event_filter {
847struct event_subsystem { 928struct event_subsystem {
848 struct list_head list; 929 struct list_head list;
849 const char *name; 930 const char *name;
850 struct dentry *entry;
851 struct event_filter *filter; 931 struct event_filter *filter;
852 int nr_events;
853 int ref_count; 932 int ref_count;
854}; 933};
855 934
935struct ftrace_subsystem_dir {
936 struct list_head list;
937 struct event_subsystem *subsystem;
938 struct trace_array *tr;
939 struct dentry *entry;
940 int ref_count;
941 int nr_events;
942};
943
856#define FILTER_PRED_INVALID ((unsigned short)-1) 944#define FILTER_PRED_INVALID ((unsigned short)-1)
857#define FILTER_PRED_IS_RIGHT (1 << 15) 945#define FILTER_PRED_IS_RIGHT (1 << 15)
858#define FILTER_PRED_FOLD (1 << 15) 946#define FILTER_PRED_FOLD (1 << 15)
@@ -902,22 +990,20 @@ struct filter_pred {
902 unsigned short right; 990 unsigned short right;
903}; 991};
904 992
905extern struct list_head ftrace_common_fields;
906
907extern enum regex_type 993extern enum regex_type
908filter_parse_regex(char *buff, int len, char **search, int *not); 994filter_parse_regex(char *buff, int len, char **search, int *not);
909extern void print_event_filter(struct ftrace_event_call *call, 995extern void print_event_filter(struct ftrace_event_call *call,
910 struct trace_seq *s); 996 struct trace_seq *s);
911extern int apply_event_filter(struct ftrace_event_call *call, 997extern int apply_event_filter(struct ftrace_event_call *call,
912 char *filter_string); 998 char *filter_string);
913extern int apply_subsystem_event_filter(struct event_subsystem *system, 999extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
914 char *filter_string); 1000 char *filter_string);
915extern void print_subsystem_event_filter(struct event_subsystem *system, 1001extern void print_subsystem_event_filter(struct event_subsystem *system,
916 struct trace_seq *s); 1002 struct trace_seq *s);
917extern int filter_assign_type(const char *type); 1003extern int filter_assign_type(const char *type);
918 1004
919struct list_head * 1005struct ftrace_event_field *
920trace_get_fields(struct ftrace_event_call *event_call); 1006trace_find_event_field(struct ftrace_event_call *call, char *name);
921 1007
922static inline int 1008static inline int
923filter_check_discard(struct ftrace_event_call *call, void *rec, 1009filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -934,6 +1020,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
934} 1020}
935 1021
936extern void trace_event_enable_cmd_record(bool enable); 1022extern void trace_event_enable_cmd_record(bool enable);
1023extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
1024extern int event_trace_del_tracer(struct trace_array *tr);
937 1025
938extern struct mutex event_mutex; 1026extern struct mutex event_mutex;
939extern struct list_head ftrace_events; 1027extern struct list_head ftrace_events;
@@ -943,6 +1031,19 @@ extern const char *__stop___trace_bprintk_fmt[];
943 1031
944void trace_printk_init_buffers(void); 1032void trace_printk_init_buffers(void);
945void trace_printk_start_comm(void); 1033void trace_printk_start_comm(void);
1034int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
1035int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
1036
1037/*
1038 * Normal trace_printk() and friends allocates special buffers
1039 * to do the manipulation, as well as saves the print formats
1040 * into sections to display. But the trace infrastructure wants
1041 * to use these without the added overhead at the price of being
1042 * a bit slower (used mainly for warnings, where we don't care
1043 * about performance). The internal_trace_puts() is for such
1044 * a purpose.
1045 */
1046#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
946 1047
947#undef FTRACE_ENTRY 1048#undef FTRACE_ENTRY
948#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ 1049#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
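internal_trace_puts() skips the trace_printk() machinery by expanding, at the call site, into __trace_puts() with the caller's instruction pointer and a strlen() the compiler can fold for string literals. The same "capture the call site in a macro" idea in portable C, using __FILE__/__LINE__ in place of _THIS_IP_ (illustrative only, not the kernel API):

#include <stdio.h>
#include <string.h>

/* Low-level writer: takes an explicit call-site tag and a length. */
static int low_level_puts(const char *site, const char *str, size_t len)
{
        return printf("[%s] %.*s", site, (int)len, str);
}

#define STR_(x) #x
#define STR(x)  STR_(x)

/* Expanded where it is used, so __FILE__/__LINE__ name the caller and
 * strlen() on a string literal is folded at compile time. */
#define internal_puts(str) \
        low_level_puts(__FILE__ ":" STR(__LINE__), (str), strlen(str))

int main(void)
{
        internal_puts("hello from the call site\n");
        return 0;
}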
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch; 33 struct ftrace_event_call *call = &event_branch;
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct trace_array_cpu *data;
35 struct ring_buffer_event *event; 36 struct ring_buffer_event *event;
36 struct trace_branch *entry; 37 struct trace_branch *entry;
37 struct ring_buffer *buffer; 38 struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
51 52
52 local_irq_save(flags); 53 local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 54 cpu = raw_smp_processor_id();
54 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 55 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
56 if (atomic_inc_return(&data->disabled) != 1)
55 goto out; 57 goto out;
56 58
57 pc = preempt_count(); 59 pc = preempt_count();
58 buffer = tr->buffer; 60 buffer = tr->trace_buffer.buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, 61 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
60 sizeof(*entry), flags, pc); 62 sizeof(*entry), flags, pc);
61 if (!event) 63 if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
80 __buffer_unlock_commit(buffer, event); 82 __buffer_unlock_commit(buffer, event);
81 83
82 out: 84 out:
83 atomic_dec(&tr->data[cpu]->disabled); 85 atomic_dec(&data->disabled);
84 local_irq_restore(flags); 86 local_irq_restore(flags);
85} 87}
86 88
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
57 return local_clock(); 57 return local_clock();
58} 58}
59 59
60/*
61 * trace_jiffy_clock(): Simply use jiffies as a clock counter.
62 */
63u64 notrace trace_clock_jiffies(void)
64{
65 u64 jiffy = jiffies - INITIAL_JIFFIES;
66
67 /* Return nsecs */
68 return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
69}
60 70
61/* 71/*
62 * trace_clock_global(): special globally coherent trace clock 72 * trace_clock_global(): special globally coherent trace clock
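trace_clock_jiffies() converts a jiffy delta into nanoseconds via jiffies_to_usecs() * 1000. A minimal userspace version of that conversion, assuming a fixed tick rate that divides one million evenly (the real jiffies_to_usecs() also handles HZ values that do not):

#include <stdint.h>
#include <stdio.h>

#define TICK_HZ 250ULL                   /* example tick rate, not the kernel's HZ */

/* Microseconds per tick, then scale to nanoseconds like the trace clock. */
static uint64_t ticks_to_nsecs(uint64_t ticks)
{
        uint64_t usecs = ticks * (1000000ULL / TICK_HZ);

        return usecs * 1000ULL;
}

int main(void)
{
        uint64_t ticks = 1234;           /* e.g. jiffies - INITIAL_JIFFIES */

        printf("%llu ticks = %llu ns\n",
               (unsigned long long)ticks,
               (unsigned long long)ticks_to_nsecs(ticks));
        return 0;
}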
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
223 __dynamic_array( u32, buf ) 223 __dynamic_array( u32, buf )
224 ), 224 ),
225 225
226 F_printk("%08lx fmt:%p", 226 F_printk("%pf: %s",
227 __entry->ip, __entry->fmt), 227 (void *)__entry->ip, __entry->fmt),
228 228
229 FILTER_OTHER 229 FILTER_OTHER
230); 230);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
238 __dynamic_array( char, buf ) 238 __dynamic_array( char, buf )
239 ), 239 ),
240 240
241 F_printk("%08lx %s", 241 F_printk("%pf: %s",
242 __entry->ip, __entry->buf), 242 (void *)__entry->ip, __entry->buf),
243
244 FILTER_OTHER
245);
246
247FTRACE_ENTRY(bputs, bputs_entry,
248
249 TRACE_BPUTS,
250
251 F_STRUCT(
252 __field( unsigned long, ip )
253 __field( const char *, str )
254 ),
255
256 F_printk("%pf: %s",
257 (void *)__entry->ip, __entry->str),
243 258
244 FILTER_OTHER 259 FILTER_OTHER
245); 260);
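The new bputs entry follows the FTRACE_ENTRY() X-macro convention of this header: one description expands into a struct layout, a print format and filter information, depending on how FTRACE_ENTRY is defined before the header is included. A toy version of the pattern, with invented entry names and formats:

#include <stdio.h>

/* One place describes every entry ... */
#define TOY_ENTRIES(E)                          \
        E(bputs_like, "ip=%lu str=%s")          \
        E(print_like, "ip=%lu buf=%s")

/* ... and each expansion decides what to generate from the description. */
#define DEFINE_ENUM(name, fmt)  TOY_##name,
enum toy_type { TOY_ENTRIES(DEFINE_ENUM) TOY_MAX };

#define DEFINE_FMT(name, fmt)   [TOY_##name] = (fmt),
static const char *const toy_fmt[TOY_MAX] = { TOY_ENTRIES(DEFINE_FMT) };

int main(void)
{
        for (int i = 0; i < TOY_MAX; i++)
                printf("entry %d prints with \"%s\"\n", i, toy_fmt[i]);
        return 0;
}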
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..53582e982e51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];
34EXPORT_SYMBOL_GPL(event_storage); 34EXPORT_SYMBOL_GPL(event_storage);
35 35
36LIST_HEAD(ftrace_events); 36LIST_HEAD(ftrace_events);
37LIST_HEAD(ftrace_common_fields); 37static LIST_HEAD(ftrace_common_fields);
38 38
39struct list_head * 39#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
40
41static struct kmem_cache *field_cachep;
42static struct kmem_cache *file_cachep;
43
44/* Double loops, do not use break, only goto's work */
45#define do_for_each_event_file(tr, file) \
46 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
47 list_for_each_entry(file, &tr->events, list)
48
49#define do_for_each_event_file_safe(tr, file) \
50 list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
51 struct ftrace_event_file *___n; \
52 list_for_each_entry_safe(file, ___n, &tr->events, list)
53
54#define while_for_each_event_file() \
55 }
56
57static struct list_head *
40trace_get_fields(struct ftrace_event_call *event_call) 58trace_get_fields(struct ftrace_event_call *event_call)
41{ 59{
42 if (!event_call->class->get_fields) 60 if (!event_call->class->get_fields)
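do_for_each_event_file() hides a nested walk over every trace_array and the event files hanging off it, which is why the comment above insists on goto rather than break: break would only leave the inner list_for_each_entry(). A small standalone illustration of that pitfall, with counted loops standing in for the two lists:

#include <stdio.h>

#define do_for_each_item(i, j)                  \
        for (int i = 0; i < 3; i++) {           \
                for (int j = 0; j < 3; j++)

#define while_for_each_item()  }

int main(void)
{
        /* A break here would only end the inner loop; goto stops both. */
        do_for_each_item(tr, file) {
                printf("visiting tr=%d file=%d\n", tr, file);
                if (tr == 1 && file == 1)
                        goto done;
        } while_for_each_item();
 done:
        printf("stopped early via goto\n");
        return 0;
}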
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
44 return event_call->class->get_fields(event_call); 62 return event_call->class->get_fields(event_call);
45} 63}
46 64
65static struct ftrace_event_field *
66__find_event_field(struct list_head *head, char *name)
67{
68 struct ftrace_event_field *field;
69
70 list_for_each_entry(field, head, link) {
71 if (!strcmp(field->name, name))
72 return field;
73 }
74
75 return NULL;
76}
77
78struct ftrace_event_field *
79trace_find_event_field(struct ftrace_event_call *call, char *name)
80{
81 struct ftrace_event_field *field;
82 struct list_head *head;
83
84 field = __find_event_field(&ftrace_common_fields, name);
85 if (field)
86 return field;
87
88 head = trace_get_fields(call);
89 return __find_event_field(head, name);
90}
91
47static int __trace_define_field(struct list_head *head, const char *type, 92static int __trace_define_field(struct list_head *head, const char *type,
48 const char *name, int offset, int size, 93 const char *name, int offset, int size,
49 int is_signed, int filter_type) 94 int is_signed, int filter_type)
50{ 95{
51 struct ftrace_event_field *field; 96 struct ftrace_event_field *field;
52 97
53 field = kzalloc(sizeof(*field), GFP_KERNEL); 98 field = kmem_cache_alloc(field_cachep, GFP_TRACE);
54 if (!field) 99 if (!field)
55 goto err; 100 goto err;
56 101
57 field->name = kstrdup(name, GFP_KERNEL); 102 field->name = name;
58 if (!field->name) 103 field->type = type;
59 goto err;
60
61 field->type = kstrdup(type, GFP_KERNEL);
62 if (!field->type)
63 goto err;
64 104
65 if (filter_type == FILTER_OTHER) 105 if (filter_type == FILTER_OTHER)
66 field->filter_type = filter_assign_type(type); 106 field->filter_type = filter_assign_type(type);
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
76 return 0; 116 return 0;
77 117
78err: 118err:
79 if (field) 119 kmem_cache_free(field_cachep, field);
80 kfree(field->name);
81 kfree(field);
82 120
83 return -ENOMEM; 121 return -ENOMEM;
84} 122}
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void)
120 return ret; 158 return ret;
121} 159}
122 160
123void trace_destroy_fields(struct ftrace_event_call *call) 161static void trace_destroy_fields(struct ftrace_event_call *call)
124{ 162{
125 struct ftrace_event_field *field, *next; 163 struct ftrace_event_field *field, *next;
126 struct list_head *head; 164 struct list_head *head;
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
128 head = trace_get_fields(call); 166 head = trace_get_fields(call);
129 list_for_each_entry_safe(field, next, head, link) { 167 list_for_each_entry_safe(field, next, head, link) {
130 list_del(&field->link); 168 list_del(&field->link);
131 kfree(field->type); 169 kmem_cache_free(field_cachep, field);
132 kfree(field->name);
133 kfree(field);
134 } 170 }
135} 171}
136 172
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
149int ftrace_event_reg(struct ftrace_event_call *call, 185int ftrace_event_reg(struct ftrace_event_call *call,
150 enum trace_reg type, void *data) 186 enum trace_reg type, void *data)
151{ 187{
188 struct ftrace_event_file *file = data;
189
152 switch (type) { 190 switch (type) {
153 case TRACE_REG_REGISTER: 191 case TRACE_REG_REGISTER:
154 return tracepoint_probe_register(call->name, 192 return tracepoint_probe_register(call->name,
155 call->class->probe, 193 call->class->probe,
156 call); 194 file);
157 case TRACE_REG_UNREGISTER: 195 case TRACE_REG_UNREGISTER:
158 tracepoint_probe_unregister(call->name, 196 tracepoint_probe_unregister(call->name,
159 call->class->probe, 197 call->class->probe,
160 call); 198 file);
161 return 0; 199 return 0;
162 200
163#ifdef CONFIG_PERF_EVENTS 201#ifdef CONFIG_PERF_EVENTS
@@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
183 221
184void trace_event_enable_cmd_record(bool enable) 222void trace_event_enable_cmd_record(bool enable)
185{ 223{
186 struct ftrace_event_call *call; 224 struct ftrace_event_file *file;
225 struct trace_array *tr;
187 226
188 mutex_lock(&event_mutex); 227 mutex_lock(&event_mutex);
189 list_for_each_entry(call, &ftrace_events, list) { 228 do_for_each_event_file(tr, file) {
190 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) 229
230 if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
191 continue; 231 continue;
192 232
193 if (enable) { 233 if (enable) {
194 tracing_start_cmdline_record(); 234 tracing_start_cmdline_record();
195 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 235 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
196 } else { 236 } else {
197 tracing_stop_cmdline_record(); 237 tracing_stop_cmdline_record();
198 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 238 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
199 } 239 }
200 } 240 } while_for_each_event_file();
201 mutex_unlock(&event_mutex); 241 mutex_unlock(&event_mutex);
202} 242}
203 243
204static int ftrace_event_enable_disable(struct ftrace_event_call *call, 244static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
205 int enable) 245 int enable, int soft_disable)
206{ 246{
247 struct ftrace_event_call *call = file->event_call;
207 int ret = 0; 248 int ret = 0;
249 int disable;
208 250
209 switch (enable) { 251 switch (enable) {
210 case 0: 252 case 0:
211 if (call->flags & TRACE_EVENT_FL_ENABLED) { 253 /*
212 call->flags &= ~TRACE_EVENT_FL_ENABLED; 254 * When soft_disable is set and enable is cleared, we want
213 if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { 255 * to clear the SOFT_DISABLED flag but leave the event in the
256 * state that it was. That is, if the event was enabled and
257 * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
258 * is set we do not want the event to be enabled before we
259 * clear the bit.
260 *
261 * When soft_disable is not set but the SOFT_MODE flag is,
262 * we do nothing. Do not disable the tracepoint, otherwise
263 * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
264 */
265 if (soft_disable) {
266 disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
267 clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
268 } else
269 disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
270
271 if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
272 clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
273 if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
214 tracing_stop_cmdline_record(); 274 tracing_stop_cmdline_record();
215 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 275 clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
216 } 276 }
217 call->class->reg(call, TRACE_REG_UNREGISTER, NULL); 277 call->class->reg(call, TRACE_REG_UNREGISTER, file);
218 } 278 }
279 /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
280 if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
281 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
219 break; 282 break;
220 case 1: 283 case 1:
221 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { 284 /*
285 * When soft_disable is set and enable is set, we want to
286 * register the tracepoint for the event, but leave the event
287 * as is. That means, if the event was already enabled, we do
288 * nothing (but set SOFT_MODE). If the event is disabled, we
289 * set SOFT_DISABLED before enabling the event tracepoint, so
290 * it still seems to be disabled.
291 */
292 if (!soft_disable)
293 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
294 else
295 set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
296
297 if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
298
299 /* Keep the event disabled, when going to SOFT_MODE. */
300 if (soft_disable)
301 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
302
222 if (trace_flags & TRACE_ITER_RECORD_CMD) { 303 if (trace_flags & TRACE_ITER_RECORD_CMD) {
223 tracing_start_cmdline_record(); 304 tracing_start_cmdline_record();
224 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 305 set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
225 } 306 }
226 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); 307 ret = call->class->reg(call, TRACE_REG_REGISTER, file);
227 if (ret) { 308 if (ret) {
228 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
229 pr_info("event trace: Could not enable event " 310 pr_info("event trace: Could not enable event "
230 "%s\n", call->name); 311 "%s\n", call->name);
231 break; 312 break;
232 } 313 }
233 call->flags |= TRACE_EVENT_FL_ENABLED; 314 set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
315
316 /* WAS_ENABLED gets set but never cleared. */
317 call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
234 } 318 }
235 break; 319 break;
236 } 320 }
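The comments above describe three interacting bits: ENABLED (the tracepoint is registered), SOFT_MODE (something such as a trigger wants the event kept registered) and SOFT_DISABLED (the event is registered but must not record). The sketch below is a compact model of the transitions those comments describe, with shortened flag names; it mirrors the stated rules, not the kernel implementation itself:

#include <stdio.h>

enum {
        FL_ENABLED       = 1 << 0,   /* tracepoint registered */
        FL_SOFT_MODE     = 1 << 1,   /* kept registered on someone's behalf */
        FL_SOFT_DISABLED = 1 << 2,   /* registered, but must not record */
};

static unsigned int soft_enable(unsigned int flags)
{
        flags |= FL_SOFT_MODE;
        if (!(flags & FL_ENABLED))
                flags |= FL_SOFT_DISABLED | FL_ENABLED;  /* register, but stay quiet */
        return flags;
}

static unsigned int soft_disable(unsigned int flags)
{
        int only_soft = flags & FL_SOFT_DISABLED;        /* nobody hard-enabled it */

        flags &= ~FL_SOFT_MODE;
        if (only_soft && (flags & FL_ENABLED))
                flags &= ~FL_ENABLED;                    /* drop the registration */
        return flags;
}

int main(void)
{
        unsigned int flags = 0;

        flags = soft_enable(flags);
        printf("after soft enable:  %#x\n", flags);
        flags = soft_disable(flags);
        printf("after soft disable: %#x\n", flags);
        return 0;
}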
@@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
238 return ret; 322 return ret;
239} 323}
240 324
241static void ftrace_clear_events(void) 325static int ftrace_event_enable_disable(struct ftrace_event_file *file,
326 int enable)
242{ 327{
243 struct ftrace_event_call *call; 328 return __ftrace_event_enable_disable(file, enable, 0);
329}
330
331static void ftrace_clear_events(struct trace_array *tr)
332{
333 struct ftrace_event_file *file;
244 334
245 mutex_lock(&event_mutex); 335 mutex_lock(&event_mutex);
246 list_for_each_entry(call, &ftrace_events, list) { 336 list_for_each_entry(file, &tr->events, list) {
247 ftrace_event_enable_disable(call, 0); 337 ftrace_event_enable_disable(file, 0);
248 } 338 }
249 mutex_unlock(&event_mutex); 339 mutex_unlock(&event_mutex);
250} 340}
@@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)
257 if (--system->ref_count) 347 if (--system->ref_count)
258 return; 348 return;
259 349
350 list_del(&system->list);
351
260 if (filter) { 352 if (filter) {
261 kfree(filter->filter_string); 353 kfree(filter->filter_string);
262 kfree(filter); 354 kfree(filter);
263 } 355 }
264 kfree(system->name);
265 kfree(system); 356 kfree(system);
266} 357}
267 358
@@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)
271 system->ref_count++; 362 system->ref_count++;
272} 363}
273 364
274static void put_system(struct event_subsystem *system) 365static void __get_system_dir(struct ftrace_subsystem_dir *dir)
366{
367 WARN_ON_ONCE(dir->ref_count == 0);
368 dir->ref_count++;
369 __get_system(dir->subsystem);
370}
371
372static void __put_system_dir(struct ftrace_subsystem_dir *dir)
373{
374 WARN_ON_ONCE(dir->ref_count == 0);
375 /* If the subsystem is about to be freed, the dir must be too */
376 WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
377
378 __put_system(dir->subsystem);
379 if (!--dir->ref_count)
380 kfree(dir);
381}
382
383static void put_system(struct ftrace_subsystem_dir *dir)
275{ 384{
276 mutex_lock(&event_mutex); 385 mutex_lock(&event_mutex);
277 __put_system(system); 386 __put_system_dir(dir);
278 mutex_unlock(&event_mutex); 387 mutex_unlock(&event_mutex);
279} 388}
280 389
281/* 390/*
282 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 391 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
283 */ 392 */
284static int __ftrace_set_clr_event(const char *match, const char *sub, 393static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
285 const char *event, int set) 394 const char *sub, const char *event, int set)
286{ 395{
396 struct ftrace_event_file *file;
287 struct ftrace_event_call *call; 397 struct ftrace_event_call *call;
288 int ret = -EINVAL; 398 int ret = -EINVAL;
289 399
290 mutex_lock(&event_mutex); 400 mutex_lock(&event_mutex);
291 list_for_each_entry(call, &ftrace_events, list) { 401 list_for_each_entry(file, &tr->events, list) {
402
403 call = file->event_call;
292 404
293 if (!call->name || !call->class || !call->class->reg) 405 if (!call->name || !call->class || !call->class->reg)
294 continue; 406 continue;
@@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
307 if (event && strcmp(event, call->name) != 0) 419 if (event && strcmp(event, call->name) != 0)
308 continue; 420 continue;
309 421
310 ftrace_event_enable_disable(call, set); 422 ftrace_event_enable_disable(file, set);
311 423
312 ret = 0; 424 ret = 0;
313 } 425 }
@@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
316 return ret; 428 return ret;
317} 429}
318 430
319static int ftrace_set_clr_event(char *buf, int set) 431static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
320{ 432{
321 char *event = NULL, *sub = NULL, *match; 433 char *event = NULL, *sub = NULL, *match;
322 434
@@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)
344 event = NULL; 456 event = NULL;
345 } 457 }
346 458
347 return __ftrace_set_clr_event(match, sub, event, set); 459 return __ftrace_set_clr_event(tr, match, sub, event, set);
348} 460}
349 461
350/** 462/**
@@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)
361 */ 473 */
362int trace_set_clr_event(const char *system, const char *event, int set) 474int trace_set_clr_event(const char *system, const char *event, int set)
363{ 475{
364 return __ftrace_set_clr_event(NULL, system, event, set); 476 struct trace_array *tr = top_trace_array();
477
478 return __ftrace_set_clr_event(tr, NULL, system, event, set);
365} 479}
366EXPORT_SYMBOL_GPL(trace_set_clr_event); 480EXPORT_SYMBOL_GPL(trace_set_clr_event);
367 481
@@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
373 size_t cnt, loff_t *ppos) 487 size_t cnt, loff_t *ppos)
374{ 488{
375 struct trace_parser parser; 489 struct trace_parser parser;
490 struct seq_file *m = file->private_data;
491 struct trace_array *tr = m->private;
376 ssize_t read, ret; 492 ssize_t read, ret;
377 493
378 if (!cnt) 494 if (!cnt)
@@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
395 511
396 parser.buffer[parser.idx] = 0; 512 parser.buffer[parser.idx] = 0;
397 513
398 ret = ftrace_set_clr_event(parser.buffer + !set, set); 514 ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
399 if (ret) 515 if (ret)
400 goto out_put; 516 goto out_put;
401 } 517 }
@@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
411static void * 527static void *
412t_next(struct seq_file *m, void *v, loff_t *pos) 528t_next(struct seq_file *m, void *v, loff_t *pos)
413{ 529{
414 struct ftrace_event_call *call = v; 530 struct ftrace_event_file *file = v;
531 struct ftrace_event_call *call;
532 struct trace_array *tr = m->private;
415 533
416 (*pos)++; 534 (*pos)++;
417 535
418 list_for_each_entry_continue(call, &ftrace_events, list) { 536 list_for_each_entry_continue(file, &tr->events, list) {
537 call = file->event_call;
419 /* 538 /*
420 * The ftrace subsystem is for showing formats only. 539 * The ftrace subsystem is for showing formats only.
421 * They can not be enabled or disabled via the event files. 540 * They can not be enabled or disabled via the event files.
422 */ 541 */
423 if (call->class && call->class->reg) 542 if (call->class && call->class->reg)
424 return call; 543 return file;
425 } 544 }
426 545
427 return NULL; 546 return NULL;
@@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
429 548
430static void *t_start(struct seq_file *m, loff_t *pos) 549static void *t_start(struct seq_file *m, loff_t *pos)
431{ 550{
432 struct ftrace_event_call *call; 551 struct ftrace_event_file *file;
552 struct trace_array *tr = m->private;
433 loff_t l; 553 loff_t l;
434 554
435 mutex_lock(&event_mutex); 555 mutex_lock(&event_mutex);
436 556
437 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 557 file = list_entry(&tr->events, struct ftrace_event_file, list);
438 for (l = 0; l <= *pos; ) { 558 for (l = 0; l <= *pos; ) {
439 call = t_next(m, call, &l); 559 file = t_next(m, file, &l);
440 if (!call) 560 if (!file)
441 break; 561 break;
442 } 562 }
443 return call; 563 return file;
444} 564}
445 565
446static void * 566static void *
447s_next(struct seq_file *m, void *v, loff_t *pos) 567s_next(struct seq_file *m, void *v, loff_t *pos)
448{ 568{
449 struct ftrace_event_call *call = v; 569 struct ftrace_event_file *file = v;
570 struct trace_array *tr = m->private;
450 571
451 (*pos)++; 572 (*pos)++;
452 573
453 list_for_each_entry_continue(call, &ftrace_events, list) { 574 list_for_each_entry_continue(file, &tr->events, list) {
454 if (call->flags & TRACE_EVENT_FL_ENABLED) 575 if (file->flags & FTRACE_EVENT_FL_ENABLED)
455 return call; 576 return file;
456 } 577 }
457 578
458 return NULL; 579 return NULL;
@@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
460 581
461static void *s_start(struct seq_file *m, loff_t *pos) 582static void *s_start(struct seq_file *m, loff_t *pos)
462{ 583{
463 struct ftrace_event_call *call; 584 struct ftrace_event_file *file;
585 struct trace_array *tr = m->private;
464 loff_t l; 586 loff_t l;
465 587
466 mutex_lock(&event_mutex); 588 mutex_lock(&event_mutex);
467 589
468 call = list_entry(&ftrace_events, struct ftrace_event_call, list); 590 file = list_entry(&tr->events, struct ftrace_event_file, list);
469 for (l = 0; l <= *pos; ) { 591 for (l = 0; l <= *pos; ) {
470 call = s_next(m, call, &l); 592 file = s_next(m, file, &l);
471 if (!call) 593 if (!file)
472 break; 594 break;
473 } 595 }
474 return call; 596 return file;
475} 597}
476 598
477static int t_show(struct seq_file *m, void *v) 599static int t_show(struct seq_file *m, void *v)
478{ 600{
479 struct ftrace_event_call *call = v; 601 struct ftrace_event_file *file = v;
602 struct ftrace_event_call *call = file->event_call;
480 603
481 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 604 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
482 seq_printf(m, "%s:", call->class->system); 605 seq_printf(m, "%s:", call->class->system);
@@ -494,25 +617,31 @@ static ssize_t
494event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, 617event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
495 loff_t *ppos) 618 loff_t *ppos)
496{ 619{
497 struct ftrace_event_call *call = filp->private_data; 620 struct ftrace_event_file *file = filp->private_data;
498 char *buf; 621 char *buf;
499 622
500 if (call->flags & TRACE_EVENT_FL_ENABLED) 623 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
501 buf = "1\n"; 624 if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
502 else 625 buf = "0*\n";
626 else
627 buf = "1\n";
628 } else
503 buf = "0\n"; 629 buf = "0\n";
504 630
505 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); 631 return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
506} 632}
507 633
508static ssize_t 634static ssize_t
509event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 635event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
510 loff_t *ppos) 636 loff_t *ppos)
511{ 637{
512 struct ftrace_event_call *call = filp->private_data; 638 struct ftrace_event_file *file = filp->private_data;
513 unsigned long val; 639 unsigned long val;
514 int ret; 640 int ret;
515 641
642 if (!file)
643 return -EINVAL;
644
516 ret = kstrtoul_from_user(ubuf, cnt, 10, &val); 645 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
517 if (ret) 646 if (ret)
518 return ret; 647 return ret;
@@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
525 case 0: 654 case 0:
526 case 1: 655 case 1:
527 mutex_lock(&event_mutex); 656 mutex_lock(&event_mutex);
528 ret = ftrace_event_enable_disable(call, val); 657 ret = ftrace_event_enable_disable(file, val);
529 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
530 break; 659 break;
531 660
@@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
543 loff_t *ppos) 672 loff_t *ppos)
544{ 673{
545 const char set_to_char[4] = { '?', '0', '1', 'X' }; 674 const char set_to_char[4] = { '?', '0', '1', 'X' };
546 struct event_subsystem *system = filp->private_data; 675 struct ftrace_subsystem_dir *dir = filp->private_data;
676 struct event_subsystem *system = dir->subsystem;
547 struct ftrace_event_call *call; 677 struct ftrace_event_call *call;
678 struct ftrace_event_file *file;
679 struct trace_array *tr = dir->tr;
548 char buf[2]; 680 char buf[2];
549 int set = 0; 681 int set = 0;
550 int ret; 682 int ret;
551 683
552 mutex_lock(&event_mutex); 684 mutex_lock(&event_mutex);
553 list_for_each_entry(call, &ftrace_events, list) { 685 list_for_each_entry(file, &tr->events, list) {
686 call = file->event_call;
554 if (!call->name || !call->class || !call->class->reg) 687 if (!call->name || !call->class || !call->class->reg)
555 continue; 688 continue;
556 689
@@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
562 * or if all events or cleared, or if we have 695 * or if all events or cleared, or if we have
563 * a mixture. 696 * a mixture.
564 */ 697 */
565 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); 698 set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
566 699
567 /* 700 /*
568 * If we have a mixture, no need to look further. 701 * If we have a mixture, no need to look further.
@@ -584,7 +717,8 @@ static ssize_t
584system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, 717system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
585 loff_t *ppos) 718 loff_t *ppos)
586{ 719{
587 struct event_subsystem *system = filp->private_data; 720 struct ftrace_subsystem_dir *dir = filp->private_data;
721 struct event_subsystem *system = dir->subsystem;
588 const char *name = NULL; 722 const char *name = NULL;
589 unsigned long val; 723 unsigned long val;
590 ssize_t ret; 724 ssize_t ret;
@@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
607 if (system) 741 if (system)
608 name = system->name; 742 name = system->name;
609 743
610 ret = __ftrace_set_clr_event(NULL, name, NULL, val); 744 ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
611 if (ret) 745 if (ret)
612 goto out; 746 goto out;
613 747
@@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);
845static int subsystem_open(struct inode *inode, struct file *filp) 979static int subsystem_open(struct inode *inode, struct file *filp)
846{ 980{
847 struct event_subsystem *system = NULL; 981 struct event_subsystem *system = NULL;
982 struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
983 struct trace_array *tr;
848 int ret; 984 int ret;
849 985
850 if (!inode->i_private)
851 goto skip_search;
852
853 /* Make sure the system still exists */ 986 /* Make sure the system still exists */
854 mutex_lock(&event_mutex); 987 mutex_lock(&event_mutex);
855 list_for_each_entry(system, &event_subsystems, list) { 988 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
856 if (system == inode->i_private) { 989 list_for_each_entry(dir, &tr->systems, list) {
857 /* Don't open systems with no events */ 990 if (dir == inode->i_private) {
858 if (!system->nr_events) { 991 /* Don't open systems with no events */
859 system = NULL; 992 if (dir->nr_events) {
860 break; 993 __get_system_dir(dir);
994 system = dir->subsystem;
995 }
996 goto exit_loop;
861 } 997 }
862 __get_system(system);
863 break;
864 } 998 }
865 } 999 }
1000 exit_loop:
866 mutex_unlock(&event_mutex); 1001 mutex_unlock(&event_mutex);
867 1002
868 if (system != inode->i_private) 1003 if (!system)
869 return -ENODEV; 1004 return -ENODEV;
870 1005
871 skip_search: 1006 /* Some versions of gcc think dir can be uninitialized here */
1007 WARN_ON(!dir);
1008
872 ret = tracing_open_generic(inode, filp); 1009 ret = tracing_open_generic(inode, filp);
873 if (ret < 0 && system) 1010 if (ret < 0)
874 put_system(system); 1011 put_system(dir);
1012
1013 return ret;
1014}
1015
1016static int system_tr_open(struct inode *inode, struct file *filp)
1017{
1018 struct ftrace_subsystem_dir *dir;
1019 struct trace_array *tr = inode->i_private;
1020 int ret;
1021
1022 /* Make a temporary dir that has no system but points to tr */
1023 dir = kzalloc(sizeof(*dir), GFP_KERNEL);
1024 if (!dir)
1025 return -ENOMEM;
1026
1027 dir->tr = tr;
1028
1029 ret = tracing_open_generic(inode, filp);
1030 if (ret < 0)
1031 kfree(dir);
1032
1033 filp->private_data = dir;
875 1034
876 return ret; 1035 return ret;
877} 1036}
878 1037
879static int subsystem_release(struct inode *inode, struct file *file) 1038static int subsystem_release(struct inode *inode, struct file *file)
880{ 1039{
881 struct event_subsystem *system = inode->i_private; 1040 struct ftrace_subsystem_dir *dir = file->private_data;
882 1041
883 if (system) 1042 /*
884 put_system(system); 1043 * If dir->subsystem is NULL, then this is a temporary
1044 * descriptor that was made for a trace_array to enable
1045 * all subsystems.
1046 */
1047 if (dir->subsystem)
1048 put_system(dir);
1049 else
1050 kfree(dir);
885 1051
886 return 0; 1052 return 0;
887} 1053}
@@ -890,7 +1056,8 @@ static ssize_t
890subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, 1056subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
891 loff_t *ppos) 1057 loff_t *ppos)
892{ 1058{
893 struct event_subsystem *system = filp->private_data; 1059 struct ftrace_subsystem_dir *dir = filp->private_data;
1060 struct event_subsystem *system = dir->subsystem;
894 struct trace_seq *s; 1061 struct trace_seq *s;
895 int r; 1062 int r;
896 1063
@@ -915,7 +1082,7 @@ static ssize_t
915subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, 1082subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
916 loff_t *ppos) 1083 loff_t *ppos)
917{ 1084{
918 struct event_subsystem *system = filp->private_data; 1085 struct ftrace_subsystem_dir *dir = filp->private_data;
919 char *buf; 1086 char *buf;
920 int err; 1087 int err;
921 1088
@@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
932 } 1099 }
933 buf[cnt] = '\0'; 1100 buf[cnt] = '\0';
934 1101
935 err = apply_subsystem_event_filter(system, buf); 1102 err = apply_subsystem_event_filter(dir, buf);
936 free_page((unsigned long) buf); 1103 free_page((unsigned long) buf);
937 if (err < 0) 1104 if (err < 0)
938 return err; 1105 return err;
@@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {
1041 .release = subsystem_release, 1208 .release = subsystem_release,
1042}; 1209};
1043 1210
1211static const struct file_operations ftrace_tr_enable_fops = {
1212 .open = system_tr_open,
1213 .read = system_enable_read,
1214 .write = system_enable_write,
1215 .llseek = default_llseek,
1216 .release = subsystem_release,
1217};
1218
1044static const struct file_operations ftrace_show_header_fops = { 1219static const struct file_operations ftrace_show_header_fops = {
1045 .open = tracing_open_generic, 1220 .open = tracing_open_generic,
1046 .read = show_header, 1221 .read = show_header,
1047 .llseek = default_llseek, 1222 .llseek = default_llseek,
1048}; 1223};
1049 1224
1050static struct dentry *event_trace_events_dir(void) 1225static int
1226ftrace_event_open(struct inode *inode, struct file *file,
1227 const struct seq_operations *seq_ops)
1051{ 1228{
1052 static struct dentry *d_tracer; 1229 struct seq_file *m;
1053 static struct dentry *d_events; 1230 int ret;
1054
1055 if (d_events)
1056 return d_events;
1057
1058 d_tracer = tracing_init_dentry();
1059 if (!d_tracer)
1060 return NULL;
1061 1231
1062 d_events = debugfs_create_dir("events", d_tracer); 1232 ret = seq_open(file, seq_ops);
1063 if (!d_events) 1233 if (ret < 0)
1064 pr_warning("Could not create debugfs " 1234 return ret;
1065 "'events' directory\n"); 1235 m = file->private_data;
1236 /* copy tr over to seq ops */
1237 m->private = inode->i_private;
1066 1238
1067 return d_events; 1239 return ret;
1068} 1240}
1069 1241
1070static int 1242static int
@@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
1072{ 1244{
1073 const struct seq_operations *seq_ops = &show_event_seq_ops; 1245 const struct seq_operations *seq_ops = &show_event_seq_ops;
1074 1246
1075 return seq_open(file, seq_ops); 1247 return ftrace_event_open(inode, file, seq_ops);
1076} 1248}
1077 1249
1078static int 1250static int
1079ftrace_event_set_open(struct inode *inode, struct file *file) 1251ftrace_event_set_open(struct inode *inode, struct file *file)
1080{ 1252{
1081 const struct seq_operations *seq_ops = &show_set_event_seq_ops; 1253 const struct seq_operations *seq_ops = &show_set_event_seq_ops;
1254 struct trace_array *tr = inode->i_private;
1082 1255
1083 if ((file->f_mode & FMODE_WRITE) && 1256 if ((file->f_mode & FMODE_WRITE) &&
1084 (file->f_flags & O_TRUNC)) 1257 (file->f_flags & O_TRUNC))
1085 ftrace_clear_events(); 1258 ftrace_clear_events(tr);
1086 1259
1087 return seq_open(file, seq_ops); 1260 return ftrace_event_open(inode, file, seq_ops);
1261}
1262
1263static struct event_subsystem *
1264create_new_subsystem(const char *name)
1265{
1266 struct event_subsystem *system;
1267
1268 /* need to create new entry */
1269 system = kmalloc(sizeof(*system), GFP_KERNEL);
1270 if (!system)
1271 return NULL;
1272
1273 system->ref_count = 1;
1274 system->name = name;
1275
1276 system->filter = NULL;
1277
1278 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
1279 if (!system->filter)
1280 goto out_free;
1281
1282 list_add(&system->list, &event_subsystems);
1283
1284 return system;
1285
1286 out_free:
1287 kfree(system);
1288 return NULL;
1088} 1289}
1089 1290
1090static struct dentry * 1291static struct dentry *
1091event_subsystem_dir(const char *name, struct dentry *d_events) 1292event_subsystem_dir(struct trace_array *tr, const char *name,
1293 struct ftrace_event_file *file, struct dentry *parent)
1092{ 1294{
1295 struct ftrace_subsystem_dir *dir;
1093 struct event_subsystem *system; 1296 struct event_subsystem *system;
1094 struct dentry *entry; 1297 struct dentry *entry;
1095 1298
1096 /* First see if we did not already create this dir */ 1299 /* First see if we did not already create this dir */
1097 list_for_each_entry(system, &event_subsystems, list) { 1300 list_for_each_entry(dir, &tr->systems, list) {
1301 system = dir->subsystem;
1098 if (strcmp(system->name, name) == 0) { 1302 if (strcmp(system->name, name) == 0) {
1099 system->nr_events++; 1303 dir->nr_events++;
1100 return system->entry; 1304 file->system = dir;
1305 return dir->entry;
1101 } 1306 }
1102 } 1307 }
1103 1308
1104 /* need to create new entry */ 1309 /* Now see if the system itself exists. */
1105 system = kmalloc(sizeof(*system), GFP_KERNEL); 1310 list_for_each_entry(system, &event_subsystems, list) {
1106 if (!system) { 1311 if (strcmp(system->name, name) == 0)
1107 pr_warning("No memory to create event subsystem %s\n", 1312 break;
1108 name);
1109 return d_events;
1110 } 1313 }
1314 /* Reset system variable when not found */
1315 if (&system->list == &event_subsystems)
1316 system = NULL;
1111 1317
1112 system->entry = debugfs_create_dir(name, d_events); 1318 dir = kmalloc(sizeof(*dir), GFP_KERNEL);
1113 if (!system->entry) { 1319 if (!dir)
1114 pr_warning("Could not create event subsystem %s\n", 1320 goto out_fail;
1115 name);
1116 kfree(system);
1117 return d_events;
1118 }
1119 1321
1120 system->nr_events = 1; 1322 if (!system) {
1121 system->ref_count = 1; 1323 system = create_new_subsystem(name);
1122 system->name = kstrdup(name, GFP_KERNEL); 1324 if (!system)
1123 if (!system->name) { 1325 goto out_free;
1124 debugfs_remove(system->entry); 1326 } else
1125 kfree(system); 1327 __get_system(system);
1126 return d_events; 1328
1329 dir->entry = debugfs_create_dir(name, parent);
1330 if (!dir->entry) {
1331 pr_warning("Failed to create system directory %s\n", name);
1332 __put_system(system);
1333 goto out_free;
1127 } 1334 }
1128 1335
1129 list_add(&system->list, &event_subsystems); 1336 dir->tr = tr;
1130 1337 dir->ref_count = 1;
1131 system->filter = NULL; 1338 dir->nr_events = 1;
1132 1339 dir->subsystem = system;
1133 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); 1340 file->system = dir;
1134 if (!system->filter) {
1135 pr_warning("Could not allocate filter for subsystem "
1136 "'%s'\n", name);
1137 return system->entry;
1138 }
1139 1341
1140 entry = debugfs_create_file("filter", 0644, system->entry, system, 1342 entry = debugfs_create_file("filter", 0644, dir->entry, dir,
1141 &ftrace_subsystem_filter_fops); 1343 &ftrace_subsystem_filter_fops);
1142 if (!entry) { 1344 if (!entry) {
1143 kfree(system->filter); 1345 kfree(system->filter);
1144 system->filter = NULL; 1346 system->filter = NULL;
1145 pr_warning("Could not create debugfs " 1347 pr_warning("Could not create debugfs '%s/filter' entry\n", name);
1146 "'%s/filter' entry\n", name);
1147 } 1348 }
1148 1349
1149 trace_create_file("enable", 0644, system->entry, system, 1350 trace_create_file("enable", 0644, dir->entry, dir,
1150 &ftrace_system_enable_fops); 1351 &ftrace_system_enable_fops);
1151 1352
1152 return system->entry; 1353 list_add(&dir->list, &tr->systems);
1354
1355 return dir->entry;
1356
1357 out_free:
1358 kfree(dir);
1359 out_fail:
1360 /* Only print this message if failed on memory allocation */
1361 if (!dir || !system)
1362 pr_warning("No memory to create event subsystem %s\n",
1363 name);
1364 return NULL;
1153} 1365}
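
The "Reset system variable when not found" check above relies on a list_for_each_entry() property: when the loop runs to completion without a break, the cursor's embedded list_head ends up pointing at the list head itself. A generic illustration of the idiom, not tied to this file:

        struct event_subsystem *system;

        list_for_each_entry(system, &event_subsystems, list) {
                if (strcmp(system->name, name) == 0)
                        break;                          /* found: system is valid */
        }
        if (&system->list == &event_subsystems)         /* loop finished: no match */
                system = NULL;
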
1154 1366
1155static int 1367static int
1156event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, 1368event_create_dir(struct dentry *parent,
1369 struct ftrace_event_file *file,
1157 const struct file_operations *id, 1370 const struct file_operations *id,
1158 const struct file_operations *enable, 1371 const struct file_operations *enable,
1159 const struct file_operations *filter, 1372 const struct file_operations *filter,
1160 const struct file_operations *format) 1373 const struct file_operations *format)
1161{ 1374{
1375 struct ftrace_event_call *call = file->event_call;
1376 struct trace_array *tr = file->tr;
1162 struct list_head *head; 1377 struct list_head *head;
1378 struct dentry *d_events;
1163 int ret; 1379 int ret;
1164 1380
1165 /* 1381 /*
1166 * If the trace point header did not define TRACE_SYSTEM 1382 * If the trace point header did not define TRACE_SYSTEM
1167 * then the system would be called "TRACE_SYSTEM". 1383 * then the system would be called "TRACE_SYSTEM".
1168 */ 1384 */
1169 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) 1385 if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
1170 d_events = event_subsystem_dir(call->class->system, d_events); 1386 d_events = event_subsystem_dir(tr, call->class->system, file, parent);
1171 1387 if (!d_events)
1172 call->dir = debugfs_create_dir(call->name, d_events); 1388 return -ENOMEM;
1173 if (!call->dir) { 1389 } else
1174 pr_warning("Could not create debugfs " 1390 d_events = parent;
1175 "'%s' directory\n", call->name); 1391
1392 file->dir = debugfs_create_dir(call->name, d_events);
1393 if (!file->dir) {
1394 pr_warning("Could not create debugfs '%s' directory\n",
1395 call->name);
1176 return -1; 1396 return -1;
1177 } 1397 }
1178 1398
1179 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) 1399 if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
1180 trace_create_file("enable", 0644, call->dir, call, 1400 trace_create_file("enable", 0644, file->dir, file,
1181 enable); 1401 enable);
1182 1402
1183#ifdef CONFIG_PERF_EVENTS 1403#ifdef CONFIG_PERF_EVENTS
1184 if (call->event.type && call->class->reg) 1404 if (call->event.type && call->class->reg)
1185 trace_create_file("id", 0444, call->dir, call, 1405 trace_create_file("id", 0444, file->dir, call,
1186 id); 1406 id);
1187#endif 1407#endif
1188 1408
@@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
1196 if (ret < 0) { 1416 if (ret < 0) {
1197 pr_warning("Could not initialize trace point" 1417 pr_warning("Could not initialize trace point"
1198 " events/%s\n", call->name); 1418 " events/%s\n", call->name);
1199 return ret; 1419 return -1;
1200 } 1420 }
1201 } 1421 }
1202 trace_create_file("filter", 0644, call->dir, call, 1422 trace_create_file("filter", 0644, file->dir, call,
1203 filter); 1423 filter);
1204 1424
1205 trace_create_file("format", 0444, call->dir, call, 1425 trace_create_file("format", 0444, file->dir, call,
1206 format); 1426 format);
1207 1427
1208 return 0; 1428 return 0;
1209} 1429}
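
With the control files now hanging off file->dir instead of call->dir, every trace_array gets its own copy of the event hierarchy. An illustrative sketch of the resulting debugfs layout (paths assume the usual /sys/kernel/debug/tracing mount; names in angle brackets are placeholders):

        /*
         *  tracing/                        <- top trace array
         *  tracing/instances/<name>/       <- additional instances
         *      events/
         *          enable                  <- ftrace_tr_enable_fops (whole array)
         *          <system>/
         *              enable              <- ftrace_system_enable_fops
         *              filter              <- ftrace_subsystem_filter_fops
         *              <event>/
         *                  enable          <- per ftrace_event_file
         *                  id              <- only with CONFIG_PERF_EVENTS
         *                  filter
         *                  format
         */
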
1210 1430
1431static void remove_subsystem(struct ftrace_subsystem_dir *dir)
1432{
1433 if (!dir)
1434 return;
1435
1436 if (!--dir->nr_events) {
1437 debugfs_remove_recursive(dir->entry);
1438 list_del(&dir->list);
1439 __put_system_dir(dir);
1440 }
1441}
1442
1443static void remove_event_from_tracers(struct ftrace_event_call *call)
1444{
1445 struct ftrace_event_file *file;
1446 struct trace_array *tr;
1447
1448 do_for_each_event_file_safe(tr, file) {
1449
1450 if (file->event_call != call)
1451 continue;
1452
1453 list_del(&file->list);
1454 debugfs_remove_recursive(file->dir);
1455 remove_subsystem(file->system);
1456 kmem_cache_free(file_cachep, file);
1457
1458 /*
1459 * The do_for_each_event_file_safe() is
1460 * a double loop. After finding the call for this
1461 * trace_array, we use break to jump to the next
1462 * trace_array.
1463 */
1464 break;
1465 } while_for_each_event_file();
1466}
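
The comment about the "double loop" refers to helper macros from trace.h. A sketch close to their definition, shown here for orientation: the outer loop walks ftrace_trace_arrays, the inner loop walks that array's event files, so the break above only advances to the next trace_array.

        #define do_for_each_event_file(tr, file)                        \
                list_for_each_entry(tr, &ftrace_trace_arrays, list) {   \
                        list_for_each_entry(file, &tr->events, list)

        #define do_for_each_event_file_safe(tr, file)                   \
                list_for_each_entry(tr, &ftrace_trace_arrays, list) {   \
                        struct ftrace_event_file *___n;                 \
                        list_for_each_entry_safe(file, ___n, &tr->events, list)

        #define while_for_each_event_file()                             \
                }
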
1467
1211static void event_remove(struct ftrace_event_call *call) 1468static void event_remove(struct ftrace_event_call *call)
1212{ 1469{
1213 ftrace_event_enable_disable(call, 0); 1470 struct trace_array *tr;
1471 struct ftrace_event_file *file;
1472
1473 do_for_each_event_file(tr, file) {
1474 if (file->event_call != call)
1475 continue;
1476 ftrace_event_enable_disable(file, 0);
1477 /*
1478 * The do_for_each_event_file() is
1479 * a double loop. After finding the call for this
1480 * trace_array, we use break to jump to the next
1481 * trace_array.
1482 */
1483 break;
1484 } while_for_each_event_file();
1485
1214 if (call->event.funcs) 1486 if (call->event.funcs)
1215 __unregister_ftrace_event(&call->event); 1487 __unregister_ftrace_event(&call->event);
1488 remove_event_from_tracers(call);
1216 list_del(&call->list); 1489 list_del(&call->list);
1217} 1490}
1218 1491
@@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)
1234} 1507}
1235 1508
1236static int 1509static int
1237__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, 1510__register_event(struct ftrace_event_call *call, struct module *mod)
1238 const struct file_operations *id,
1239 const struct file_operations *enable,
1240 const struct file_operations *filter,
1241 const struct file_operations *format)
1242{ 1511{
1243 struct dentry *d_events;
1244 int ret; 1512 int ret;
1245 1513
1246 ret = event_init(call); 1514 ret = event_init(call);
1247 if (ret < 0) 1515 if (ret < 0)
1248 return ret; 1516 return ret;
1249 1517
1250 d_events = event_trace_events_dir(); 1518 list_add(&call->list, &ftrace_events);
1251 if (!d_events)
1252 return -ENOENT;
1253
1254 ret = event_create_dir(call, d_events, id, enable, filter, format);
1255 if (!ret)
1256 list_add(&call->list, &ftrace_events);
1257 call->mod = mod; 1519 call->mod = mod;
1258 1520
1259 return ret; 1521 return 0;
1522}
1523
1524/* Add an event to a trace directory */
1525static int
1526__trace_add_new_event(struct ftrace_event_call *call,
1527 struct trace_array *tr,
1528 const struct file_operations *id,
1529 const struct file_operations *enable,
1530 const struct file_operations *filter,
1531 const struct file_operations *format)
1532{
1533 struct ftrace_event_file *file;
1534
1535 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1536 if (!file)
1537 return -ENOMEM;
1538
1539 file->event_call = call;
1540 file->tr = tr;
1541 list_add(&file->list, &tr->events);
1542
1543 return event_create_dir(tr->event_dir, file, id, enable, filter, format);
1260} 1544}
1261 1545
1546/*
 1547 * Just create a descriptor for early init. A descriptor is required
1548 * for enabling events at boot. We want to enable events before
1549 * the filesystem is initialized.
1550 */
1551static __init int
1552__trace_early_add_new_event(struct ftrace_event_call *call,
1553 struct trace_array *tr)
1554{
1555 struct ftrace_event_file *file;
1556
1557 file = kmem_cache_alloc(file_cachep, GFP_TRACE);
1558 if (!file)
1559 return -ENOMEM;
1560
1561 file->event_call = call;
1562 file->tr = tr;
1563 list_add(&file->list, &tr->events);
1564
1565 return 0;
1566}
1567
1568struct ftrace_module_file_ops;
1569static void __add_event_to_tracers(struct ftrace_event_call *call,
1570 struct ftrace_module_file_ops *file_ops);
1571
1262/* Add an additional event_call dynamically */ 1572/* Add an additional event_call dynamically */
1263int trace_add_event_call(struct ftrace_event_call *call) 1573int trace_add_event_call(struct ftrace_event_call *call)
1264{ 1574{
1265 int ret; 1575 int ret;
1266 mutex_lock(&event_mutex); 1576 mutex_lock(&event_mutex);
1267 ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
1268 &ftrace_enable_fops,
1269 &ftrace_event_filter_fops,
1270 &ftrace_event_format_fops);
1271 mutex_unlock(&event_mutex);
1272 return ret;
1273}
1274 1577
1275static void remove_subsystem_dir(const char *name) 1578 ret = __register_event(call, NULL);
1276{ 1579 if (ret >= 0)
1277 struct event_subsystem *system; 1580 __add_event_to_tracers(call, NULL);
1278
1279 if (strcmp(name, TRACE_SYSTEM) == 0)
1280 return;
1281 1581
1282 list_for_each_entry(system, &event_subsystems, list) { 1582 mutex_unlock(&event_mutex);
1283 if (strcmp(system->name, name) == 0) { 1583 return ret;
1284 if (!--system->nr_events) {
1285 debugfs_remove_recursive(system->entry);
1286 list_del(&system->list);
1287 __put_system(system);
1288 }
1289 break;
1290 }
1291 }
1292} 1584}
1293 1585
1294/* 1586/*
1295 * Must be called under locking both of event_mutex and trace_event_mutex. 1587 * Must be called under locking both of event_mutex and trace_event_sem.
1296 */ 1588 */
1297static void __trace_remove_event_call(struct ftrace_event_call *call) 1589static void __trace_remove_event_call(struct ftrace_event_call *call)
1298{ 1590{
1299 event_remove(call); 1591 event_remove(call);
1300 trace_destroy_fields(call); 1592 trace_destroy_fields(call);
1301 destroy_preds(call); 1593 destroy_preds(call);
1302 debugfs_remove_recursive(call->dir);
1303 remove_subsystem_dir(call->class->system);
1304} 1594}
1305 1595
1306/* Remove an event_call */ 1596/* Remove an event_call */
1307void trace_remove_event_call(struct ftrace_event_call *call) 1597void trace_remove_event_call(struct ftrace_event_call *call)
1308{ 1598{
1309 mutex_lock(&event_mutex); 1599 mutex_lock(&event_mutex);
1310 down_write(&trace_event_mutex); 1600 down_write(&trace_event_sem);
1311 __trace_remove_event_call(call); 1601 __trace_remove_event_call(call);
1312 up_write(&trace_event_mutex); 1602 up_write(&trace_event_sem);
1313 mutex_unlock(&event_mutex); 1603 mutex_unlock(&event_mutex);
1314} 1604}
1315 1605
@@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {
1336}; 1626};
1337 1627
1338static struct ftrace_module_file_ops * 1628static struct ftrace_module_file_ops *
1629find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1630{
1631 /*
1632 * As event_calls are added in groups by module,
1633 * when we find one file_ops, we don't need to search for
1634 * each call in that module, as the rest should be the
1635 * same. Only search for a new one if the last one did
1636 * not match.
1637 */
1638 if (file_ops && mod == file_ops->mod)
1639 return file_ops;
1640
1641 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1642 if (file_ops->mod == mod)
1643 return file_ops;
1644 }
1645 return NULL;
1646}
1647
1648static struct ftrace_module_file_ops *
1339trace_create_file_ops(struct module *mod) 1649trace_create_file_ops(struct module *mod)
1340{ 1650{
1341 struct ftrace_module_file_ops *file_ops; 1651 struct ftrace_module_file_ops *file_ops;
@@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)
1386 return; 1696 return;
1387 1697
1388 for_each_event(call, start, end) { 1698 for_each_event(call, start, end) {
1389 __trace_add_event_call(*call, mod, 1699 __register_event(*call, mod);
1390 &file_ops->id, &file_ops->enable, 1700 __add_event_to_tracers(*call, file_ops);
1391 &file_ops->filter, &file_ops->format);
1392 } 1701 }
1393} 1702}
1394 1703
@@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)
1396{ 1705{
1397 struct ftrace_module_file_ops *file_ops; 1706 struct ftrace_module_file_ops *file_ops;
1398 struct ftrace_event_call *call, *p; 1707 struct ftrace_event_call *call, *p;
1399 bool found = false; 1708 bool clear_trace = false;
1400 1709
1401 down_write(&trace_event_mutex); 1710 down_write(&trace_event_sem);
1402 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1711 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1403 if (call->mod == mod) { 1712 if (call->mod == mod) {
1404 found = true; 1713 if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
1714 clear_trace = true;
1405 __trace_remove_event_call(call); 1715 __trace_remove_event_call(call);
1406 } 1716 }
1407 } 1717 }
@@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)
1415 list_del(&file_ops->list); 1725 list_del(&file_ops->list);
1416 kfree(file_ops); 1726 kfree(file_ops);
1417 } 1727 }
1728 up_write(&trace_event_sem);
1418 1729
1419 /* 1730 /*
1420 * It is safest to reset the ring buffer if the module being unloaded 1731 * It is safest to reset the ring buffer if the module being unloaded
1421 * registered any events. 1732 * registered any events that were used. The only worry is if
1733 * a new module gets loaded, and takes on the same id as the events
1734 * of this module. When printing out the buffer, traced events left
1735 * over from this module may be passed to the new module events and
1736 * unexpected results may occur.
1422 */ 1737 */
1423 if (found) 1738 if (clear_trace)
1424 tracing_reset_current_online_cpus(); 1739 tracing_reset_all_online_cpus();
1425 up_write(&trace_event_mutex);
1426} 1740}
1427 1741
1428static int trace_module_notify(struct notifier_block *self, 1742static int trace_module_notify(struct notifier_block *self,
@@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,
1443 1757
1444 return 0; 1758 return 0;
1445} 1759}
1760
1761static int
1762__trace_add_new_mod_event(struct ftrace_event_call *call,
1763 struct trace_array *tr,
1764 struct ftrace_module_file_ops *file_ops)
1765{
1766 return __trace_add_new_event(call, tr,
1767 &file_ops->id, &file_ops->enable,
1768 &file_ops->filter, &file_ops->format);
1769}
1770
1446#else 1771#else
1447static int trace_module_notify(struct notifier_block *self, 1772static inline struct ftrace_module_file_ops *
1448 unsigned long val, void *data) 1773find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
1774{
1775 return NULL;
1776}
1777static inline int trace_module_notify(struct notifier_block *self,
1778 unsigned long val, void *data)
1449{ 1779{
1450 return 0; 1780 return 0;
1451} 1781}
1782static inline int
1783__trace_add_new_mod_event(struct ftrace_event_call *call,
1784 struct trace_array *tr,
1785 struct ftrace_module_file_ops *file_ops)
1786{
1787 return -ENODEV;
1788}
1452#endif /* CONFIG_MODULES */ 1789#endif /* CONFIG_MODULES */
1453 1790
1791/* Create a new event directory structure for a trace directory. */
1792static void
1793__trace_add_event_dirs(struct trace_array *tr)
1794{
1795 struct ftrace_module_file_ops *file_ops = NULL;
1796 struct ftrace_event_call *call;
1797 int ret;
1798
1799 list_for_each_entry(call, &ftrace_events, list) {
1800 if (call->mod) {
1801 /*
1802 * Directories for events by modules need to
1803 * keep module ref counts when opened (as we don't
1804 * want the module to disappear when reading one
1805 * of these files). The file_ops keep account of
1806 * the module ref count.
1807 */
1808 file_ops = find_ftrace_file_ops(file_ops, call->mod);
1809 if (!file_ops)
1810 continue; /* Warn? */
1811 ret = __trace_add_new_mod_event(call, tr, file_ops);
1812 if (ret < 0)
1813 pr_warning("Could not create directory for event %s\n",
1814 call->name);
1815 continue;
1816 }
1817 ret = __trace_add_new_event(call, tr,
1818 &ftrace_event_id_fops,
1819 &ftrace_enable_fops,
1820 &ftrace_event_filter_fops,
1821 &ftrace_event_format_fops);
1822 if (ret < 0)
1823 pr_warning("Could not create directory for event %s\n",
1824 call->name);
1825 }
1826}
1827
1828#ifdef CONFIG_DYNAMIC_FTRACE
1829
1830/* Avoid typos */
1831#define ENABLE_EVENT_STR "enable_event"
1832#define DISABLE_EVENT_STR "disable_event"
1833
1834struct event_probe_data {
1835 struct ftrace_event_file *file;
1836 unsigned long count;
1837 int ref;
1838 bool enable;
1839};
1840
1841static struct ftrace_event_file *
1842find_event_file(struct trace_array *tr, const char *system, const char *event)
1843{
1844 struct ftrace_event_file *file;
1845 struct ftrace_event_call *call;
1846
1847 list_for_each_entry(file, &tr->events, list) {
1848
1849 call = file->event_call;
1850
1851 if (!call->name || !call->class || !call->class->reg)
1852 continue;
1853
1854 if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
1855 continue;
1856
1857 if (strcmp(event, call->name) == 0 &&
1858 strcmp(system, call->class->system) == 0)
1859 return file;
1860 }
1861 return NULL;
1862}
1863
1864static void
1865event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1866{
1867 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1868 struct event_probe_data *data = *pdata;
1869
1870 if (!data)
1871 return;
1872
1873 if (data->enable)
1874 clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1875 else
1876 set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
1877}
1878
1879static void
1880event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
1881{
1882 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1883 struct event_probe_data *data = *pdata;
1884
1885 if (!data)
1886 return;
1887
1888 if (!data->count)
1889 return;
1890
1891 /* Skip if the event is in a state we want to switch to */
1892 if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
1893 return;
1894
1895 if (data->count != -1)
1896 (data->count)--;
1897
1898 event_enable_probe(ip, parent_ip, _data);
1899}
1900
1901static int
1902event_enable_print(struct seq_file *m, unsigned long ip,
1903 struct ftrace_probe_ops *ops, void *_data)
1904{
1905 struct event_probe_data *data = _data;
1906
1907 seq_printf(m, "%ps:", (void *)ip);
1908
1909 seq_printf(m, "%s:%s:%s",
1910 data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
1911 data->file->event_call->class->system,
1912 data->file->event_call->name);
1913
1914 if (data->count == -1)
1915 seq_printf(m, ":unlimited\n");
1916 else
1917 seq_printf(m, ":count=%ld\n", data->count);
1918
1919 return 0;
1920}
1921
1922static int
1923event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
1924 void **_data)
1925{
1926 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1927 struct event_probe_data *data = *pdata;
1928
1929 data->ref++;
1930 return 0;
1931}
1932
1933static void
1934event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
1935 void **_data)
1936{
1937 struct event_probe_data **pdata = (struct event_probe_data **)_data;
1938 struct event_probe_data *data = *pdata;
1939
1940 if (WARN_ON_ONCE(data->ref <= 0))
1941 return;
1942
1943 data->ref--;
1944 if (!data->ref) {
1945 /* Remove the SOFT_MODE flag */
1946 __ftrace_event_enable_disable(data->file, 0, 1);
1947 module_put(data->file->event_call->mod);
1948 kfree(data);
1949 }
1950 *pdata = NULL;
1951}
1952
1953static struct ftrace_probe_ops event_enable_probe_ops = {
1954 .func = event_enable_probe,
1955 .print = event_enable_print,
1956 .init = event_enable_init,
1957 .free = event_enable_free,
1958};
1959
1960static struct ftrace_probe_ops event_enable_count_probe_ops = {
1961 .func = event_enable_count_probe,
1962 .print = event_enable_print,
1963 .init = event_enable_init,
1964 .free = event_enable_free,
1965};
1966
1967static struct ftrace_probe_ops event_disable_probe_ops = {
1968 .func = event_enable_probe,
1969 .print = event_enable_print,
1970 .init = event_enable_init,
1971 .free = event_enable_free,
1972};
1973
1974static struct ftrace_probe_ops event_disable_count_probe_ops = {
1975 .func = event_enable_count_probe,
1976 .print = event_enable_print,
1977 .init = event_enable_init,
1978 .free = event_enable_free,
1979};
1980
1981static int
1982event_enable_func(struct ftrace_hash *hash,
1983 char *glob, char *cmd, char *param, int enabled)
1984{
1985 struct trace_array *tr = top_trace_array();
1986 struct ftrace_event_file *file;
1987 struct ftrace_probe_ops *ops;
1988 struct event_probe_data *data;
1989 const char *system;
1990 const char *event;
1991 char *number;
1992 bool enable;
1993 int ret;
1994
1995 /* hash funcs only work with set_ftrace_filter */
1996 if (!enabled)
1997 return -EINVAL;
1998
1999 if (!param)
2000 return -EINVAL;
2001
2002 system = strsep(&param, ":");
2003 if (!param)
2004 return -EINVAL;
2005
2006 event = strsep(&param, ":");
2007
2008 mutex_lock(&event_mutex);
2009
2010 ret = -EINVAL;
2011 file = find_event_file(tr, system, event);
2012 if (!file)
2013 goto out;
2014
2015 enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
2016
2017 if (enable)
2018 ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
2019 else
2020 ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
2021
2022 if (glob[0] == '!') {
2023 unregister_ftrace_function_probe_func(glob+1, ops);
2024 ret = 0;
2025 goto out;
2026 }
2027
2028 ret = -ENOMEM;
2029 data = kzalloc(sizeof(*data), GFP_KERNEL);
2030 if (!data)
2031 goto out;
2032
2033 data->enable = enable;
2034 data->count = -1;
2035 data->file = file;
2036
2037 if (!param)
2038 goto out_reg;
2039
2040 number = strsep(&param, ":");
2041
2042 ret = -EINVAL;
2043 if (!strlen(number))
2044 goto out_free;
2045
2046 /*
2047 * We use the callback data field (which is a pointer)
2048 * as our counter.
2049 */
2050 ret = kstrtoul(number, 0, &data->count);
2051 if (ret)
2052 goto out_free;
2053
2054 out_reg:
2055 /* Don't let event modules unload while probe registered */
2056 ret = try_module_get(file->event_call->mod);
2057 if (!ret)
2058 goto out_free;
2059
2060 ret = __ftrace_event_enable_disable(file, 1, 1);
2061 if (ret < 0)
2062 goto out_put;
2063 ret = register_ftrace_function_probe(glob, ops, data);
2064 if (!ret)
2065 goto out_disable;
2066 out:
2067 mutex_unlock(&event_mutex);
2068 return ret;
2069
2070 out_disable:
2071 __ftrace_event_enable_disable(file, 0, 1);
2072 out_put:
2073 module_put(file->event_call->mod);
2074 out_free:
2075 kfree(data);
2076 goto out;
2077}
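
event_enable_func() runs when something like "schedule:enable_event:sched:sched_switch:5" is written to set_ftrace_filter: glob is the function pattern, cmd is "enable_event" or "disable_event", and param carries the rest. A self-contained illustration of the strsep()/kstrtoul() parsing used above (the system, event and count values are made up for the example):

        char buf[] = "sched:sched_switch:5";    /* what param would look like */
        char *param = buf;
        const char *system, *event;
        char *number;
        unsigned long count = -1;               /* -1 means "fire on every hit" */

        system = strsep(&param, ":");           /* "sched"        */
        event  = strsep(&param, ":");           /* "sched_switch" */
        number = strsep(&param, ":");           /* "5"; NULL when no count given */
        if (number && *number) {
                if (kstrtoul(number, 0, &count))
                        count = -1;             /* parse error: treat as unlimited */
        }
        /* count == 5 for this input */
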
2078
2079static struct ftrace_func_command event_enable_cmd = {
2080 .name = ENABLE_EVENT_STR,
2081 .func = event_enable_func,
2082};
2083
2084static struct ftrace_func_command event_disable_cmd = {
2085 .name = DISABLE_EVENT_STR,
2086 .func = event_enable_func,
2087};
2088
2089static __init int register_event_cmds(void)
2090{
2091 int ret;
2092
2093 ret = register_ftrace_command(&event_enable_cmd);
2094 if (WARN_ON(ret < 0))
2095 return ret;
2096 ret = register_ftrace_command(&event_disable_cmd);
2097 if (WARN_ON(ret < 0))
2098 unregister_ftrace_command(&event_enable_cmd);
2099 return ret;
2100}
2101#else
2102static inline int register_event_cmds(void) { return 0; }
2103#endif /* CONFIG_DYNAMIC_FTRACE */
2104
2105/*
2106 * The top level array has already had its ftrace_event_file
2107 * descriptors created in order to allow for early events to
2108 * be recorded. This function is called after the debugfs has been
2109 * initialized, and we now have to create the files associated
2110 * to the events.
2111 */
2112static __init void
2113__trace_early_add_event_dirs(struct trace_array *tr)
2114{
2115 struct ftrace_event_file *file;
2116 int ret;
2117
2118
2119 list_for_each_entry(file, &tr->events, list) {
2120 ret = event_create_dir(tr->event_dir, file,
2121 &ftrace_event_id_fops,
2122 &ftrace_enable_fops,
2123 &ftrace_event_filter_fops,
2124 &ftrace_event_format_fops);
2125 if (ret < 0)
2126 pr_warning("Could not create directory for event %s\n",
2127 file->event_call->name);
2128 }
2129}
2130
2131/*
 2132 * For early boot up, the top trace array needs to have
2133 * a list of events that can be enabled. This must be done before
2134 * the filesystem is set up in order to allow events to be traced
2135 * early.
2136 */
2137static __init void
2138__trace_early_add_events(struct trace_array *tr)
2139{
2140 struct ftrace_event_call *call;
2141 int ret;
2142
2143 list_for_each_entry(call, &ftrace_events, list) {
2144 /* Early boot up should not have any modules loaded */
2145 if (WARN_ON_ONCE(call->mod))
2146 continue;
2147
2148 ret = __trace_early_add_new_event(call, tr);
2149 if (ret < 0)
2150 pr_warning("Could not create early event %s\n",
2151 call->name);
2152 }
2153}
2154
2155/* Remove the event directory structure for a trace directory. */
2156static void
2157__trace_remove_event_dirs(struct trace_array *tr)
2158{
2159 struct ftrace_event_file *file, *next;
2160
2161 list_for_each_entry_safe(file, next, &tr->events, list) {
2162 list_del(&file->list);
2163 debugfs_remove_recursive(file->dir);
2164 remove_subsystem(file->system);
2165 kmem_cache_free(file_cachep, file);
2166 }
2167}
2168
2169static void
2170__add_event_to_tracers(struct ftrace_event_call *call,
2171 struct ftrace_module_file_ops *file_ops)
2172{
2173 struct trace_array *tr;
2174
2175 list_for_each_entry(tr, &ftrace_trace_arrays, list) {
2176 if (file_ops)
2177 __trace_add_new_mod_event(call, tr, file_ops);
2178 else
2179 __trace_add_new_event(call, tr,
2180 &ftrace_event_id_fops,
2181 &ftrace_enable_fops,
2182 &ftrace_event_filter_fops,
2183 &ftrace_event_format_fops);
2184 }
2185}
2186
1454static struct notifier_block trace_module_nb = { 2187static struct notifier_block trace_module_nb = {
1455 .notifier_call = trace_module_notify, 2188 .notifier_call = trace_module_notify,
1456 .priority = 0, 2189 .priority = 0,
@@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1464static __init int setup_trace_event(char *str) 2197static __init int setup_trace_event(char *str)
1465{ 2198{
1466 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); 2199 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1467 ring_buffer_expanded = 1; 2200 ring_buffer_expanded = true;
1468 tracing_selftest_disabled = 1; 2201 tracing_selftest_disabled = true;
1469 2202
1470 return 1; 2203 return 1;
1471} 2204}
1472__setup("trace_event=", setup_trace_event); 2205__setup("trace_event=", setup_trace_event);
1473 2206
2207/* Expects to have event_mutex held when called */
2208static int
2209create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
2210{
2211 struct dentry *d_events;
2212 struct dentry *entry;
2213
2214 entry = debugfs_create_file("set_event", 0644, parent,
2215 tr, &ftrace_set_event_fops);
2216 if (!entry) {
2217 pr_warning("Could not create debugfs 'set_event' entry\n");
2218 return -ENOMEM;
2219 }
2220
2221 d_events = debugfs_create_dir("events", parent);
2222 if (!d_events) {
2223 pr_warning("Could not create debugfs 'events' directory\n");
2224 return -ENOMEM;
2225 }
2226
2227 /* ring buffer internal formats */
2228 trace_create_file("header_page", 0444, d_events,
2229 ring_buffer_print_page_header,
2230 &ftrace_show_header_fops);
2231
2232 trace_create_file("header_event", 0444, d_events,
2233 ring_buffer_print_entry_header,
2234 &ftrace_show_header_fops);
2235
2236 trace_create_file("enable", 0644, d_events,
2237 tr, &ftrace_tr_enable_fops);
2238
2239 tr->event_dir = d_events;
2240
2241 return 0;
2242}
2243
2244/**
 2245 * event_trace_add_tracer - add an instance of a trace_array to events
2246 * @parent: The parent dentry to place the files/directories for events in
2247 * @tr: The trace array associated with these events
2248 *
2249 * When a new instance is created, it needs to set up its events
2250 * directory, as well as other files associated with events. It also
 2251 * creates the event hierarchy in the @parent/events directory.
2252 *
2253 * Returns 0 on success.
2254 */
2255int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
2256{
2257 int ret;
2258
2259 mutex_lock(&event_mutex);
2260
2261 ret = create_event_toplevel_files(parent, tr);
2262 if (ret)
2263 goto out_unlock;
2264
2265 down_write(&trace_event_sem);
2266 __trace_add_event_dirs(tr);
2267 up_write(&trace_event_sem);
2268
2269 out_unlock:
2270 mutex_unlock(&event_mutex);
2271
2272 return ret;
2273}
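
Both the add and remove paths above keep the same lock ordering: event_mutex is taken first, then trace_event_sem is held for write around the directory walk, matching trace_remove_event_call() earlier in the file. Reduced to its skeleton:

        mutex_lock(&event_mutex);
        down_write(&trace_event_sem);
        /* create or tear down this trace_array's per-event directories */
        up_write(&trace_event_sem);
        mutex_unlock(&event_mutex);
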
2274
2275/*
2276 * The top trace array already had its file descriptors created.
2277 * Now the files themselves need to be created.
2278 */
2279static __init int
2280early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
2281{
2282 int ret;
2283
2284 mutex_lock(&event_mutex);
2285
2286 ret = create_event_toplevel_files(parent, tr);
2287 if (ret)
2288 goto out_unlock;
2289
2290 down_write(&trace_event_sem);
2291 __trace_early_add_event_dirs(tr);
2292 up_write(&trace_event_sem);
2293
2294 out_unlock:
2295 mutex_unlock(&event_mutex);
2296
2297 return ret;
2298}
2299
2300int event_trace_del_tracer(struct trace_array *tr)
2301{
2302 /* Disable any running events */
2303 __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
2304
2305 mutex_lock(&event_mutex);
2306
2307 down_write(&trace_event_sem);
2308 __trace_remove_event_dirs(tr);
2309 debugfs_remove_recursive(tr->event_dir);
2310 up_write(&trace_event_sem);
2311
2312 tr->event_dir = NULL;
2313
2314 mutex_unlock(&event_mutex);
2315
2316 return 0;
2317}
2318
2319static __init int event_trace_memsetup(void)
2320{
2321 field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
2322 file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
2323 return 0;
2324}
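
KMEM_CACHE() is shorthand for kmem_cache_create() keyed on the struct's name, size and alignment; SLAB_PANIC makes boot fail loudly if the cache cannot be set up, which is why the return values go unchecked here. The file_cachep line expands to roughly:

        file_cachep = kmem_cache_create("ftrace_event_file",
                                        sizeof(struct ftrace_event_file),
                                        __alignof__(struct ftrace_event_file),
                                        SLAB_PANIC, NULL);
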
2325
1474static __init int event_trace_enable(void) 2326static __init int event_trace_enable(void)
1475{ 2327{
2328 struct trace_array *tr = top_trace_array();
1476 struct ftrace_event_call **iter, *call; 2329 struct ftrace_event_call **iter, *call;
1477 char *buf = bootup_event_buf; 2330 char *buf = bootup_event_buf;
1478 char *token; 2331 char *token;
@@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)
1486 list_add(&call->list, &ftrace_events); 2339 list_add(&call->list, &ftrace_events);
1487 } 2340 }
1488 2341
2342 /*
2343 * We need the top trace array to have a working set of trace
2344 * points at early init, before the debug files and directories
2345 * are created. Create the file entries now, and attach them
2346 * to the actual file dentries later.
2347 */
2348 __trace_early_add_events(tr);
2349
1489 while (true) { 2350 while (true) {
1490 token = strsep(&buf, ","); 2351 token = strsep(&buf, ",");
1491 2352
@@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)
1494 if (!*token) 2355 if (!*token)
1495 continue; 2356 continue;
1496 2357
1497 ret = ftrace_set_clr_event(token, 1); 2358 ret = ftrace_set_clr_event(tr, token, 1);
1498 if (ret) 2359 if (ret)
1499 pr_warn("Failed to enable trace event: %s\n", token); 2360 pr_warn("Failed to enable trace event: %s\n", token);
1500 } 2361 }
1501 2362
1502 trace_printk_start_comm(); 2363 trace_printk_start_comm();
1503 2364
2365 register_event_cmds();
2366
1504 return 0; 2367 return 0;
1505} 2368}
1506 2369
1507static __init int event_trace_init(void) 2370static __init int event_trace_init(void)
1508{ 2371{
1509 struct ftrace_event_call *call; 2372 struct trace_array *tr;
1510 struct dentry *d_tracer; 2373 struct dentry *d_tracer;
1511 struct dentry *entry; 2374 struct dentry *entry;
1512 struct dentry *d_events;
1513 int ret; 2375 int ret;
1514 2376
2377 tr = top_trace_array();
2378
1515 d_tracer = tracing_init_dentry(); 2379 d_tracer = tracing_init_dentry();
1516 if (!d_tracer) 2380 if (!d_tracer)
1517 return 0; 2381 return 0;
1518 2382
1519 entry = debugfs_create_file("available_events", 0444, d_tracer, 2383 entry = debugfs_create_file("available_events", 0444, d_tracer,
1520 NULL, &ftrace_avail_fops); 2384 tr, &ftrace_avail_fops);
1521 if (!entry) 2385 if (!entry)
1522 pr_warning("Could not create debugfs " 2386 pr_warning("Could not create debugfs "
1523 "'available_events' entry\n"); 2387 "'available_events' entry\n");
1524 2388
1525 entry = debugfs_create_file("set_event", 0644, d_tracer,
1526 NULL, &ftrace_set_event_fops);
1527 if (!entry)
1528 pr_warning("Could not create debugfs "
1529 "'set_event' entry\n");
1530
1531 d_events = event_trace_events_dir();
1532 if (!d_events)
1533 return 0;
1534
1535 /* ring buffer internal formats */
1536 trace_create_file("header_page", 0444, d_events,
1537 ring_buffer_print_page_header,
1538 &ftrace_show_header_fops);
1539
1540 trace_create_file("header_event", 0444, d_events,
1541 ring_buffer_print_entry_header,
1542 &ftrace_show_header_fops);
1543
1544 trace_create_file("enable", 0644, d_events,
1545 NULL, &ftrace_system_enable_fops);
1546
1547 if (trace_define_common_fields()) 2389 if (trace_define_common_fields())
1548 pr_warning("tracing: Failed to allocate common fields"); 2390 pr_warning("tracing: Failed to allocate common fields");
1549 2391
1550 /* 2392 ret = early_event_add_tracer(d_tracer, tr);
1551 * Early initialization already enabled ftrace event. 2393 if (ret)
1552 * Now it's only necessary to create the event directory. 2394 return ret;
1553 */
1554 list_for_each_entry(call, &ftrace_events, list) {
1555
1556 ret = event_create_dir(call, d_events,
1557 &ftrace_event_id_fops,
1558 &ftrace_enable_fops,
1559 &ftrace_event_filter_fops,
1560 &ftrace_event_format_fops);
1561 if (ret < 0)
1562 event_remove(call);
1563 }
1564 2395
1565 ret = register_module_notifier(&trace_module_nb); 2396 ret = register_module_notifier(&trace_module_nb);
1566 if (ret) 2397 if (ret)
@@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)
1568 2399
1569 return 0; 2400 return 0;
1570} 2401}
2402early_initcall(event_trace_memsetup);
1571core_initcall(event_trace_enable); 2403core_initcall(event_trace_enable);
1572fs_initcall(event_trace_init); 2404fs_initcall(event_trace_init);
1573 2405
@@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)
1627 */ 2459 */
1628static __init void event_trace_self_tests(void) 2460static __init void event_trace_self_tests(void)
1629{ 2461{
2462 struct ftrace_subsystem_dir *dir;
2463 struct ftrace_event_file *file;
1630 struct ftrace_event_call *call; 2464 struct ftrace_event_call *call;
1631 struct event_subsystem *system; 2465 struct event_subsystem *system;
2466 struct trace_array *tr;
1632 int ret; 2467 int ret;
1633 2468
2469 tr = top_trace_array();
2470
1634 pr_info("Running tests on trace events:\n"); 2471 pr_info("Running tests on trace events:\n");
1635 2472
1636 list_for_each_entry(call, &ftrace_events, list) { 2473 list_for_each_entry(file, &tr->events, list) {
2474
2475 call = file->event_call;
1637 2476
1638 /* Only test those that have a probe */ 2477 /* Only test those that have a probe */
1639 if (!call->class || !call->class->probe) 2478 if (!call->class || !call->class->probe)
@@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)
1657 * If an event is already enabled, someone is using 2496 * If an event is already enabled, someone is using
1658 * it and the self test should not be on. 2497 * it and the self test should not be on.
1659 */ 2498 */
1660 if (call->flags & TRACE_EVENT_FL_ENABLED) { 2499 if (file->flags & FTRACE_EVENT_FL_ENABLED) {
1661 pr_warning("Enabled event during self test!\n"); 2500 pr_warning("Enabled event during self test!\n");
1662 WARN_ON_ONCE(1); 2501 WARN_ON_ONCE(1);
1663 continue; 2502 continue;
1664 } 2503 }
1665 2504
1666 ftrace_event_enable_disable(call, 1); 2505 ftrace_event_enable_disable(file, 1);
1667 event_test_stuff(); 2506 event_test_stuff();
1668 ftrace_event_enable_disable(call, 0); 2507 ftrace_event_enable_disable(file, 0);
1669 2508
1670 pr_cont("OK\n"); 2509 pr_cont("OK\n");
1671 } 2510 }
@@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)
1674 2513
1675 pr_info("Running tests on trace event systems:\n"); 2514 pr_info("Running tests on trace event systems:\n");
1676 2515
1677 list_for_each_entry(system, &event_subsystems, list) { 2516 list_for_each_entry(dir, &tr->systems, list) {
2517
2518 system = dir->subsystem;
1678 2519
1679 /* the ftrace system is special, skip it */ 2520 /* the ftrace system is special, skip it */
1680 if (strcmp(system->name, "ftrace") == 0) 2521 if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)
1682 2523
1683 pr_info("Testing event system %s: ", system->name); 2524 pr_info("Testing event system %s: ", system->name);
1684 2525
1685 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); 2526 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
1686 if (WARN_ON_ONCE(ret)) { 2527 if (WARN_ON_ONCE(ret)) {
1687 pr_warning("error enabling system %s\n", 2528 pr_warning("error enabling system %s\n",
1688 system->name); 2529 system->name);
@@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)
1691 2532
1692 event_test_stuff(); 2533 event_test_stuff();
1693 2534
1694 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); 2535 ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
1695 if (WARN_ON_ONCE(ret)) { 2536 if (WARN_ON_ONCE(ret)) {
1696 pr_warning("error disabling system %s\n", 2537 pr_warning("error disabling system %s\n",
1697 system->name); 2538 system->name);
@@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)
1706 pr_info("Running tests on all trace events:\n"); 2547 pr_info("Running tests on all trace events:\n");
1707 pr_info("Testing all events: "); 2548 pr_info("Testing all events: ");
1708 2549
1709 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); 2550 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
1710 if (WARN_ON_ONCE(ret)) { 2551 if (WARN_ON_ONCE(ret)) {
1711 pr_warning("error enabling all events\n"); 2552 pr_warning("error enabling all events\n");
1712 return; 2553 return;
@@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)
1715 event_test_stuff(); 2556 event_test_stuff();
1716 2557
1717 /* reset sysname */ 2558 /* reset sysname */
1718 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); 2559 ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
1719 if (WARN_ON_ONCE(ret)) { 2560 if (WARN_ON_ONCE(ret)) {
1720 pr_warning("error disabling all events\n"); 2561 pr_warning("error disabling all events\n");
1721 return; 2562 return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..a6361178de5a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
658 mutex_unlock(&event_mutex); 658 mutex_unlock(&event_mutex);
659} 659}
660 660
661static struct ftrace_event_field *
662__find_event_field(struct list_head *head, char *name)
663{
664 struct ftrace_event_field *field;
665
666 list_for_each_entry(field, head, link) {
667 if (!strcmp(field->name, name))
668 return field;
669 }
670
671 return NULL;
672}
673
674static struct ftrace_event_field *
675find_event_field(struct ftrace_event_call *call, char *name)
676{
677 struct ftrace_event_field *field;
678 struct list_head *head;
679
680 field = __find_event_field(&ftrace_common_fields, name);
681 if (field)
682 return field;
683
684 head = trace_get_fields(call);
685 return __find_event_field(head, name);
686}
687
688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 661static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
689{ 662{
690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); 663 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1337 return NULL; 1310 return NULL;
1338 } 1311 }
1339 1312
1340 field = find_event_field(call, operand1); 1313 field = trace_find_event_field(call, operand1);
1341 if (!field) { 1314 if (!field) {
1342 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); 1315 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1343 return NULL; 1316 return NULL;
@@ -1907,16 +1880,17 @@ out_unlock:
1907 return err; 1880 return err;
1908} 1881}
1909 1882
1910int apply_subsystem_event_filter(struct event_subsystem *system, 1883int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
1911 char *filter_string) 1884 char *filter_string)
1912{ 1885{
1886 struct event_subsystem *system = dir->subsystem;
1913 struct event_filter *filter; 1887 struct event_filter *filter;
1914 int err = 0; 1888 int err = 0;
1915 1889
1916 mutex_lock(&event_mutex); 1890 mutex_lock(&event_mutex);
1917 1891
1918 /* Make sure the system still has events */ 1892 /* Make sure the system still has events */
1919 if (!system->nr_events) { 1893 if (!dir->nr_events) {
1920 err = -ENODEV; 1894 err = -ENODEV;
1921 goto out_unlock; 1895 goto out_unlock;
1922 } 1896 }
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
129 129
130#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
132int \ 132static int __init \
133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
134{ \ 134{ \
135 struct struct_name field; \ 135 struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \ 169 regfn) \
170 \ 170 \
171struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class __refdata event_class_ftrace_##call = { \
172 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
173 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
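
Marking ftrace_define_fields_##name as __init lets its text be discarded after boot, but event_class_ftrace_##call is ordinary runtime data that stores a pointer to it, which would normally trigger a modpost section-mismatch warning. __refdata annotates the structure so that reference is accepted. A hypothetical instance of the pattern (the "example" names are illustrative only):

        static int __init ftrace_define_fields_example(struct ftrace_event_call *call);

        struct ftrace_event_class __refdata event_class_ftrace_example = {
                .system         = "ftrace",
                .define_fields  = ftrace_define_fields_example,
        };
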
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
28static int function_trace_init(struct trace_array *tr) 28static int function_trace_init(struct trace_array *tr)
29{ 29{
30 func_trace = tr; 30 func_trace = tr;
31 tr->cpu = get_cpu(); 31 tr->trace_buffer.cpu = get_cpu();
32 put_cpu(); 32 put_cpu();
33 33
34 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
44 44
45static void function_trace_start(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
46{ 46{
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(&tr->trace_buffer);
48} 48}
49 49
50/* Our option */ 50/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
76 goto out; 76 goto out;
77 77
78 cpu = smp_processor_id(); 78 cpu = smp_processor_id();
79 data = tr->data[cpu]; 79 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
80 if (!atomic_read(&data->disabled)) { 80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags); 81 local_save_flags(flags);
82 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
107 */ 107 */
108 local_irq_save(flags); 108 local_irq_save(flags);
109 cpu = raw_smp_processor_id(); 109 cpu = raw_smp_processor_id();
110 data = tr->data[cpu]; 110 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
111 disabled = atomic_inc_return(&data->disabled); 111 disabled = atomic_inc_return(&data->disabled);
112 112
113 if (likely(disabled == 1)) { 113 if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
214}; 214};
215 215
216#ifdef CONFIG_DYNAMIC_FTRACE 216#ifdef CONFIG_DYNAMIC_FTRACE
217static void 217static int update_count(void **data)
218ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
219{ 218{
220 long *count = (long *)data; 219 unsigned long *count = (long *)data;
221
222 if (tracing_is_on())
223 return;
224 220
225 if (!*count) 221 if (!*count)
226 return; 222 return 0;
227 223
228 if (*count != -1) 224 if (*count != -1)
229 (*count)--; 225 (*count)--;
230 226
231 tracing_on(); 227 return 1;
232} 228}
233 229
234static void 230static void
235ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) 231ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
236{ 232{
237 long *count = (long *)data; 233 if (tracing_is_on())
234 return;
235
236 if (update_count(data))
237 tracing_on();
238}
238 239
240static void
241ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
242{
239 if (!tracing_is_on()) 243 if (!tracing_is_on())
240 return; 244 return;
241 245
242 if (!*count) 246 if (update_count(data))
247 tracing_off();
248}
249
250static void
251ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
252{
253 if (tracing_is_on())
243 return; 254 return;
244 255
245 if (*count != -1) 256 tracing_on();
246 (*count)--; 257}
258
259static void
260ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
261{
262 if (!tracing_is_on())
263 return;
247 264
248 tracing_off(); 265 tracing_off();
249} 266}
250 267
251static int 268/*
252ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 269 * Skip 4:
253 struct ftrace_probe_ops *ops, void *data); 270 * ftrace_stacktrace()
271 * function_trace_probe_call()
272 * ftrace_ops_list_func()
273 * ftrace_call()
274 */
275#define STACK_SKIP 4
254 276
255static struct ftrace_probe_ops traceon_probe_ops = { 277static void
256 .func = ftrace_traceon, 278ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
257 .print = ftrace_trace_onoff_print, 279{
258}; 280 trace_dump_stack(STACK_SKIP);
281}
259 282
260static struct ftrace_probe_ops traceoff_probe_ops = { 283static void
261 .func = ftrace_traceoff, 284ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
262 .print = ftrace_trace_onoff_print, 285{
263}; 286 if (!tracing_is_on())
287 return;
288
289 if (update_count(data))
290 trace_dump_stack(STACK_SKIP);
291}
264 292
265static int 293static int
266ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 294ftrace_probe_print(const char *name, struct seq_file *m,
267 struct ftrace_probe_ops *ops, void *data) 295 unsigned long ip, void *data)
268{ 296{
269 long count = (long)data; 297 long count = (long)data;
270 298
271 seq_printf(m, "%ps:", (void *)ip); 299 seq_printf(m, "%ps:%s", (void *)ip, name);
272
273 if (ops == &traceon_probe_ops)
274 seq_printf(m, "traceon");
275 else
276 seq_printf(m, "traceoff");
277 300
278 if (count == -1) 301 if (count == -1)
279 seq_printf(m, ":unlimited\n"); 302 seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
284} 307}
285 308
286static int 309static int
287ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) 310ftrace_traceon_print(struct seq_file *m, unsigned long ip,
311 struct ftrace_probe_ops *ops, void *data)
288{ 312{
289 struct ftrace_probe_ops *ops; 313 return ftrace_probe_print("traceon", m, ip, data);
290 314}
291 /* we register both traceon and traceoff to this callback */
292 if (strcmp(cmd, "traceon") == 0)
293 ops = &traceon_probe_ops;
294 else
295 ops = &traceoff_probe_ops;
296 315
297 unregister_ftrace_function_probe_func(glob, ops); 316static int
317ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
318 struct ftrace_probe_ops *ops, void *data)
319{
320 return ftrace_probe_print("traceoff", m, ip, data);
321}
298 322
299 return 0; 323static int
324ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
325 struct ftrace_probe_ops *ops, void *data)
326{
327 return ftrace_probe_print("stacktrace", m, ip, data);
300} 328}
301 329
330static struct ftrace_probe_ops traceon_count_probe_ops = {
331 .func = ftrace_traceon_count,
332 .print = ftrace_traceon_print,
333};
334
335static struct ftrace_probe_ops traceoff_count_probe_ops = {
336 .func = ftrace_traceoff_count,
337 .print = ftrace_traceoff_print,
338};
339
340static struct ftrace_probe_ops stacktrace_count_probe_ops = {
341 .func = ftrace_stacktrace_count,
342 .print = ftrace_stacktrace_print,
343};
344
345static struct ftrace_probe_ops traceon_probe_ops = {
346 .func = ftrace_traceon,
347 .print = ftrace_traceon_print,
348};
349
350static struct ftrace_probe_ops traceoff_probe_ops = {
351 .func = ftrace_traceoff,
352 .print = ftrace_traceoff_print,
353};
354
355static struct ftrace_probe_ops stacktrace_probe_ops = {
356 .func = ftrace_stacktrace,
357 .print = ftrace_stacktrace_print,
358};
359
302static int 360static int
303ftrace_trace_onoff_callback(struct ftrace_hash *hash, 361ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
304 char *glob, char *cmd, char *param, int enable) 362 struct ftrace_hash *hash, char *glob,
363 char *cmd, char *param, int enable)
305{ 364{
306 struct ftrace_probe_ops *ops;
307 void *count = (void *)-1; 365 void *count = (void *)-1;
308 char *number; 366 char *number;
309 int ret; 367 int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
312 if (!enable) 370 if (!enable)
313 return -EINVAL; 371 return -EINVAL;
314 372
315 if (glob[0] == '!') 373 if (glob[0] == '!') {
316 return ftrace_trace_onoff_unreg(glob+1, cmd, param); 374 unregister_ftrace_function_probe_func(glob+1, ops);
317 375 return 0;
318 /* we register both traceon and traceoff to this callback */ 376 }
319 if (strcmp(cmd, "traceon") == 0)
320 ops = &traceon_probe_ops;
321 else
322 ops = &traceoff_probe_ops;
323 377
324 if (!param) 378 if (!param)
325 goto out_reg; 379 goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
343 return ret < 0 ? ret : 0; 397 return ret < 0 ? ret : 0;
344} 398}
345 399
400static int
401ftrace_trace_onoff_callback(struct ftrace_hash *hash,
402 char *glob, char *cmd, char *param, int enable)
403{
404 struct ftrace_probe_ops *ops;
405
406 /* we register both traceon and traceoff to this callback */
407 if (strcmp(cmd, "traceon") == 0)
408 ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
409 else
410 ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
411
412 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
413 param, enable);
414}
415
416static int
417ftrace_stacktrace_callback(struct ftrace_hash *hash,
418 char *glob, char *cmd, char *param, int enable)
419{
420 struct ftrace_probe_ops *ops;
421
422 ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
423
424 return ftrace_trace_probe_callback(ops, hash, glob, cmd,
425 param, enable);
426}
427
346static struct ftrace_func_command ftrace_traceon_cmd = { 428static struct ftrace_func_command ftrace_traceon_cmd = {
347 .name = "traceon", 429 .name = "traceon",
348 .func = ftrace_trace_onoff_callback, 430 .func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
353 .func = ftrace_trace_onoff_callback, 435 .func = ftrace_trace_onoff_callback,
354}; 436};
355 437
438static struct ftrace_func_command ftrace_stacktrace_cmd = {
439 .name = "stacktrace",
440 .func = ftrace_stacktrace_callback,
441};
442
356static int __init init_func_cmd_traceon(void) 443static int __init init_func_cmd_traceon(void)
357{ 444{
358 int ret; 445 int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
364 ret = register_ftrace_command(&ftrace_traceon_cmd); 451 ret = register_ftrace_command(&ftrace_traceon_cmd);
365 if (ret) 452 if (ret)
366 unregister_ftrace_command(&ftrace_traceoff_cmd); 453 unregister_ftrace_command(&ftrace_traceoff_cmd);
454
455 ret = register_ftrace_command(&ftrace_stacktrace_cmd);
456 if (ret) {
457 unregister_ftrace_command(&ftrace_traceoff_cmd);
458 unregister_ftrace_command(&ftrace_traceon_cmd);
459 }
367 return ret; 460 return ret;
368} 461}
369#else 462#else
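
ftrace_stacktrace_callback() picks the counted or the plain probe_ops depending on whether a ":count" parameter was supplied. A usage sketch for the new command (the function name and count are only examples):

        /*
         *   echo 'kfree:stacktrace:2' > set_ftrace_filter
         *
         * arms stacktrace_count_probe_ops on kfree(); while tracing is on,
         * ftrace_stacktrace_count() calls trace_dump_stack(STACK_SKIP) on the
         * next two hits, with update_count() decrementing the counter, and
         *
         *   echo '!kfree:stacktrace' > set_ftrace_filter
         *
         * removes the probe again via unregister_ftrace_function_probe_func().
         */
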
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
218{ 218{
219 struct ftrace_event_call *call = &event_funcgraph_entry; 219 struct ftrace_event_call *call = &event_funcgraph_entry;
220 struct ring_buffer_event *event; 220 struct ring_buffer_event *event;
221 struct ring_buffer *buffer = tr->buffer; 221 struct ring_buffer *buffer = tr->trace_buffer.buffer;
222 struct ftrace_graph_ent_entry *entry; 222 struct ftrace_graph_ent_entry *entry;
223 223
224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 224 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
265 265
266 local_irq_save(flags); 266 local_irq_save(flags);
267 cpu = raw_smp_processor_id(); 267 cpu = raw_smp_processor_id();
268 data = tr->data[cpu]; 268 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
269 disabled = atomic_inc_return(&data->disabled); 269 disabled = atomic_inc_return(&data->disabled);
270 if (likely(disabled == 1)) { 270 if (likely(disabled == 1)) {
271 pc = preempt_count(); 271 pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
323{ 323{
324 struct ftrace_event_call *call = &event_funcgraph_exit; 324 struct ftrace_event_call *call = &event_funcgraph_exit;
325 struct ring_buffer_event *event; 325 struct ring_buffer_event *event;
326 struct ring_buffer *buffer = tr->buffer; 326 struct ring_buffer *buffer = tr->trace_buffer.buffer;
327 struct ftrace_graph_ret_entry *entry; 327 struct ftrace_graph_ret_entry *entry;
328 328
329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) 329 if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
350 350
351 local_irq_save(flags); 351 local_irq_save(flags);
352 cpu = raw_smp_processor_id(); 352 cpu = raw_smp_processor_id();
353 data = tr->data[cpu]; 353 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
354 disabled = atomic_inc_return(&data->disabled); 354 disabled = atomic_inc_return(&data->disabled);
355 if (likely(disabled == 1)) { 355 if (likely(disabled == 1)) {
356 pc = preempt_count(); 356 pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
560 * We need to consume the current entry to see 560 * We need to consume the current entry to see
561 * the next one. 561 * the next one.
562 */ 562 */
563 ring_buffer_consume(iter->tr->buffer, iter->cpu, 563 ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
564 NULL, NULL); 564 NULL, NULL);
565 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 565 event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
566 NULL, NULL); 566 NULL, NULL);
567 } 567 }
568 568
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac4881..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,8 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_flags;
36static bool function_enabled;
36 37
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph); 38static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph); 39static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
121 if (!irqs_disabled_flags(*flags)) 122 if (!irqs_disabled_flags(*flags))
122 return 0; 123 return 0;
123 124
124 *data = tr->data[cpu]; 125 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
125 disabled = atomic_inc_return(&(*data)->disabled); 126 disabled = atomic_inc_return(&(*data)->disabled);
126 127
127 if (likely(disabled == 1)) 128 if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
175 per_cpu(tracing_cpu, cpu) = 0; 176 per_cpu(tracing_cpu, cpu) = 0;
176 177
177 tracing_max_latency = 0; 178 tracing_max_latency = 0;
178 tracing_reset_online_cpus(irqsoff_trace); 179 tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
179 180
180 return start_irqsoff_tracer(irqsoff_trace, set); 181 return start_irqsoff_tracer(irqsoff_trace, set);
181} 182}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
380 if (per_cpu(tracing_cpu, cpu)) 381 if (per_cpu(tracing_cpu, cpu))
381 return; 382 return;
382 383
383 data = tr->data[cpu]; 384 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
384 385
385 if (unlikely(!data) || atomic_read(&data->disabled)) 386 if (unlikely(!data) || atomic_read(&data->disabled))
386 return; 387 return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
418 if (!tracer_enabled) 419 if (!tracer_enabled)
419 return; 420 return;
420 421
421 data = tr->data[cpu]; 422 data = per_cpu_ptr(tr->trace_buffer.data, cpu);
422 423
423 if (unlikely(!data) || 424 if (unlikely(!data) ||
424 !data->critical_start || atomic_read(&data->disabled)) 425 !data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
528} 529}
529#endif /* CONFIG_PREEMPT_TRACER */ 530#endif /* CONFIG_PREEMPT_TRACER */
530 531
531static int start_irqsoff_tracer(struct trace_array *tr, int graph) 532static int register_irqsoff_function(int graph, int set)
532{ 533{
533 int ret = 0; 534 int ret;
534 535
535 if (!graph) 536 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
536 ret = register_ftrace_function(&trace_ops); 537 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
537 else 538 return 0;
539
540 if (graph)
538 ret = register_ftrace_graph(&irqsoff_graph_return, 541 ret = register_ftrace_graph(&irqsoff_graph_return,
539 &irqsoff_graph_entry); 542 &irqsoff_graph_entry);
543 else
544 ret = register_ftrace_function(&trace_ops);
545
546 if (!ret)
547 function_enabled = true;
548
549 return ret;
550}
551
552static void unregister_irqsoff_function(int graph)
553{
554 if (!function_enabled)
555 return;
556
557 if (graph)
558 unregister_ftrace_graph();
559 else
560 unregister_ftrace_function(&trace_ops);
561
562 function_enabled = false;
563}
564
565static void irqsoff_function_set(int set)
566{
567 if (set)
568 register_irqsoff_function(is_graph(), 1);
569 else
570 unregister_irqsoff_function(is_graph());
571}
572
573static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
574{
575 if (mask & TRACE_ITER_FUNCTION)
576 irqsoff_function_set(set);
577
578 return trace_keep_overwrite(tracer, mask, set);
579}
580
581static int start_irqsoff_tracer(struct trace_array *tr, int graph)
582{
583 int ret;
584
585 ret = register_irqsoff_function(graph, 0);
540 586
541 if (!ret && tracing_is_enabled()) 587 if (!ret && tracing_is_enabled())
542 tracer_enabled = 1; 588 tracer_enabled = 1;
@@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
550{ 596{
551 tracer_enabled = 0; 597 tracer_enabled = 0;
552 598
553 if (!graph) 599 unregister_irqsoff_function(graph);
554 unregister_ftrace_function(&trace_ops);
555 else
556 unregister_ftrace_graph();
557} 600}
558 601
559static void __irqsoff_tracer_init(struct trace_array *tr) 602static void __irqsoff_tracer_init(struct trace_array *tr)
560{ 603{
561 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 604 save_flags = trace_flags;
562 trace_flags |= TRACE_ITER_LATENCY_FMT; 605
606 /* non overwrite screws up the latency tracers */
607 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
608 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
563 609
564 tracing_max_latency = 0; 610 tracing_max_latency = 0;
565 irqsoff_trace = tr; 611 irqsoff_trace = tr;
566 /* make sure that the tracer is visible */ 612 /* make sure that the tracer is visible */
567 smp_wmb(); 613 smp_wmb();
568 tracing_reset_online_cpus(tr); 614 tracing_reset_online_cpus(&tr->trace_buffer);
569 615
570 if (start_irqsoff_tracer(tr, is_graph())) 616 if (start_irqsoff_tracer(tr, is_graph()))
571 printk(KERN_ERR "failed to start irqsoff tracer\n"); 617 printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
573 619
574static void irqsoff_tracer_reset(struct trace_array *tr) 620static void irqsoff_tracer_reset(struct trace_array *tr)
575{ 621{
622 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
623 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
624
576 stop_irqsoff_tracer(tr, is_graph()); 625 stop_irqsoff_tracer(tr, is_graph());
577 626
578 if (!save_lat_flag) 627 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
579 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 628 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
580} 629}
581 630
582static void irqsoff_tracer_start(struct trace_array *tr) 631static void irqsoff_tracer_start(struct trace_array *tr)
@@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
609 .print_line = irqsoff_print_line, 658 .print_line = irqsoff_print_line,
610 .flags = &tracer_flags, 659 .flags = &tracer_flags,
611 .set_flag = irqsoff_set_flag, 660 .set_flag = irqsoff_set_flag,
661 .flag_changed = irqsoff_flag_changed,
612#ifdef CONFIG_FTRACE_SELFTEST 662#ifdef CONFIG_FTRACE_SELFTEST
613 .selftest = trace_selftest_startup_irqsoff, 663 .selftest = trace_selftest_startup_irqsoff,
614#endif 664#endif
@@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
642 .print_line = irqsoff_print_line, 692 .print_line = irqsoff_print_line,
643 .flags = &tracer_flags, 693 .flags = &tracer_flags,
644 .set_flag = irqsoff_set_flag, 694 .set_flag = irqsoff_set_flag,
695 .flag_changed = irqsoff_flag_changed,
645#ifdef CONFIG_FTRACE_SELFTEST 696#ifdef CONFIG_FTRACE_SELFTEST
646 .selftest = trace_selftest_startup_preemptoff, 697 .selftest = trace_selftest_startup_preemptoff,
647#endif 698#endif
@@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
677 .print_line = irqsoff_print_line, 728 .print_line = irqsoff_print_line,
678 .flags = &tracer_flags, 729 .flags = &tracer_flags,
679 .set_flag = irqsoff_set_flag, 730 .set_flag = irqsoff_set_flag,
731 .flag_changed = irqsoff_flag_changed,
680#ifdef CONFIG_FTRACE_SELFTEST 732#ifdef CONFIG_FTRACE_SELFTEST
681 .selftest = trace_selftest_startup_preemptirqsoff, 733 .selftest = trace_selftest_startup_preemptirqsoff,
682#endif 734#endif
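
The new register_irqsoff_function()/unregister_irqsoff_function() pair and the flag_changed callback above make function-probe registration idempotent and tie it to the TRACE_ITER_FUNCTION flag, so toggling the flag at run time attaches or detaches the probe exactly once (trace_sched_wakeup.c below gets the same treatment). The stand-alone sketch that follows models that enable-once/disable-once flow; every name in it is an illustrative stand-in, not a kernel symbol.

/*
 * Sketch of the enable-once/disable-once pattern introduced by
 * register_irqsoff_function()/unregister_irqsoff_function() above.
 */
#include <stdbool.h>
#include <stdio.h>

#define ITER_FUNCTION 0x1       /* models TRACE_ITER_FUNCTION */

static unsigned int flags;      /* models trace_flags          */
static bool function_enabled;   /* probe currently registered? */

static int register_probe(int graph, int set)
{
	/* 'set' is nonzero when the FUNCTION flag is about to be set */
	if (function_enabled || (!set && !(flags & ITER_FUNCTION)))
		return 0;                       /* nothing to do */

	printf("register %s probe\n", graph ? "graph" : "function");
	function_enabled = true;
	return 0;
}

static void unregister_probe(int graph)
{
	if (!function_enabled)
		return;
	printf("unregister %s probe\n", graph ? "graph" : "function");
	function_enabled = false;
}

/* models the new ->flag_changed callback */
static void flag_changed(unsigned int mask, int set)
{
	if (!(mask & ITER_FUNCTION))
		return;
	if (set)
		register_probe(0, 1);
	else
		unregister_probe(0);
}

int main(void)
{
	flag_changed(ITER_FUNCTION, 1);  /* attaches the probe     */
	flag_changed(ITER_FUNCTION, 1);  /* second call is a no-op */
	flag_changed(ITER_FUNCTION, 0);  /* detaches it again      */
	return 0;
}
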
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
26 trace_init_global_iter(&iter); 26 trace_init_global_iter(&iter);
27 27
28 for_each_tracing_cpu(cpu) { 28 for_each_tracing_cpu(cpu) {
29 atomic_inc(&iter.tr->data[cpu]->disabled); 29 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
30 } 30 }
31 31
32 old_userobj = trace_flags; 32 old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
43 iter.iter_flags |= TRACE_FILE_LAT_FMT; 43 iter.iter_flags |= TRACE_FILE_LAT_FMT;
44 iter.pos = -1; 44 iter.pos = -1;
45 45
46 if (cpu_file == TRACE_PIPE_ALL_CPU) { 46 if (cpu_file == RING_BUFFER_ALL_CPUS) {
47 for_each_tracing_cpu(cpu) { 47 for_each_tracing_cpu(cpu) {
48 iter.buffer_iter[cpu] = 48 iter.buffer_iter[cpu] =
49 ring_buffer_read_prepare(iter.tr->buffer, cpu); 49 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
50 ring_buffer_read_start(iter.buffer_iter[cpu]); 50 ring_buffer_read_start(iter.buffer_iter[cpu]);
51 tracing_iter_reset(&iter, cpu); 51 tracing_iter_reset(&iter, cpu);
52 } 52 }
53 } else { 53 } else {
54 iter.cpu_file = cpu_file; 54 iter.cpu_file = cpu_file;
55 iter.buffer_iter[cpu_file] = 55 iter.buffer_iter[cpu_file] =
56 ring_buffer_read_prepare(iter.tr->buffer, cpu_file); 56 ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
57 ring_buffer_read_start(iter.buffer_iter[cpu_file]); 57 ring_buffer_read_start(iter.buffer_iter[cpu_file]);
58 tracing_iter_reset(&iter, cpu_file); 58 tracing_iter_reset(&iter, cpu_file);
59 } 59 }
@@ -83,7 +83,7 @@ out:
83 trace_flags = old_userobj; 83 trace_flags = old_userobj;
84 84
85 for_each_tracing_cpu(cpu) { 85 for_each_tracing_cpu(cpu) {
86 atomic_dec(&iter.tr->data[cpu]->disabled); 86 atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
87 } 87 }
88 88
89 for_each_tracing_cpu(cpu) 89 for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
115 !cpu_online(cpu_file)) 115 !cpu_online(cpu_file))
116 return KDB_BADINT; 116 return KDB_BADINT;
117 } else { 117 } else {
118 cpu_file = TRACE_PIPE_ALL_CPU; 118 cpu_file = RING_BUFFER_ALL_CPUS;
119 } 119 }
120 120
121 kdb_trap_printk++; 121 kdb_trap_printk++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
31 overrun_detected = false; 31 overrun_detected = false;
32 prev_overruns = 0; 32 prev_overruns = 0;
33 33
34 tracing_reset_online_cpus(tr); 34 tracing_reset_online_cpus(&tr->trace_buffer);
35} 35}
36 36
37static int mmio_trace_init(struct trace_array *tr) 37static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
128static unsigned long count_overruns(struct trace_iterator *iter) 128static unsigned long count_overruns(struct trace_iterator *iter)
129{ 129{
130 unsigned long cnt = atomic_xchg(&dropped_count, 0); 130 unsigned long cnt = atomic_xchg(&dropped_count, 0);
131 unsigned long over = ring_buffer_overruns(iter->tr->buffer); 131 unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
132 132
133 if (over > prev_overruns) 133 if (over > prev_overruns)
134 cnt += over - prev_overruns; 134 cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
309 struct mmiotrace_rw *rw) 309 struct mmiotrace_rw *rw)
310{ 310{
311 struct ftrace_event_call *call = &event_mmiotrace_rw; 311 struct ftrace_event_call *call = &event_mmiotrace_rw;
312 struct ring_buffer *buffer = tr->buffer; 312 struct ring_buffer *buffer = tr->trace_buffer.buffer;
313 struct ring_buffer_event *event; 313 struct ring_buffer_event *event;
314 struct trace_mmiotrace_rw *entry; 314 struct trace_mmiotrace_rw *entry;
315 int pc = preempt_count(); 315 int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
330void mmio_trace_rw(struct mmiotrace_rw *rw) 330void mmio_trace_rw(struct mmiotrace_rw *rw)
331{ 331{
332 struct trace_array *tr = mmio_trace_array; 332 struct trace_array *tr = mmio_trace_array;
333 struct trace_array_cpu *data = tr->data[smp_processor_id()]; 333 struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
334 __trace_mmiotrace_rw(tr, data, rw); 334 __trace_mmiotrace_rw(tr, data, rw);
335} 335}
336 336
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
339 struct mmiotrace_map *map) 339 struct mmiotrace_map *map)
340{ 340{
341 struct ftrace_event_call *call = &event_mmiotrace_map; 341 struct ftrace_event_call *call = &event_mmiotrace_map;
342 struct ring_buffer *buffer = tr->buffer; 342 struct ring_buffer *buffer = tr->trace_buffer.buffer;
343 struct ring_buffer_event *event; 343 struct ring_buffer_event *event;
344 struct trace_mmiotrace_map *entry; 344 struct trace_mmiotrace_map *entry;
345 int pc = preempt_count(); 345 int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
363 struct trace_array_cpu *data; 363 struct trace_array_cpu *data;
364 364
365 preempt_disable(); 365 preempt_disable();
366 data = tr->data[smp_processor_id()]; 366 data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
367 __trace_mmiotrace_map(tr, data, map); 367 __trace_mmiotrace_map(tr, data, map);
368 preempt_enable(); 368 preempt_enable();
369} 369}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 697e88d13907..bb922d9ee51b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17DECLARE_RWSEM(trace_event_mutex); 17DECLARE_RWSEM(trace_event_sem);
18 18
19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 19static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
20 20
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
37 return ret; 37 return ret;
38} 38}
39 39
40enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
41{
42 struct trace_seq *s = &iter->seq;
43 struct trace_entry *entry = iter->ent;
44 struct bputs_entry *field;
45 int ret;
46
47 trace_assign_type(field, entry);
48
49 ret = trace_seq_puts(s, field->str);
50 if (!ret)
51 return TRACE_TYPE_PARTIAL_LINE;
52
53 return TRACE_TYPE_HANDLED;
54}
55
40enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 56enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
41{ 57{
42 struct trace_seq *s = &iter->seq; 58 struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
397} 413}
398EXPORT_SYMBOL(ftrace_print_hex_seq); 414EXPORT_SYMBOL(ftrace_print_hex_seq);
399 415
416int ftrace_raw_output_prep(struct trace_iterator *iter,
417 struct trace_event *trace_event)
418{
419 struct ftrace_event_call *event;
420 struct trace_seq *s = &iter->seq;
421 struct trace_seq *p = &iter->tmp_seq;
422 struct trace_entry *entry;
423 int ret;
424
425 event = container_of(trace_event, struct ftrace_event_call, event);
426 entry = iter->ent;
427
428 if (entry->type != event->event.type) {
429 WARN_ON_ONCE(1);
430 return TRACE_TYPE_UNHANDLED;
431 }
432
433 trace_seq_init(p);
434 ret = trace_seq_printf(s, "%s: ", event->name);
435 if (!ret)
436 return TRACE_TYPE_PARTIAL_LINE;
437
438 return 0;
439}
440EXPORT_SYMBOL(ftrace_raw_output_prep);
441
400#ifdef CONFIG_KRETPROBES 442#ifdef CONFIG_KRETPROBES
401static inline const char *kretprobed(const char *name) 443static inline const char *kretprobed(const char *name)
402{ 444{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
617{ 659{
618 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; 660 unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
619 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; 661 unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
620 unsigned long long abs_ts = iter->ts - iter->tr->time_start; 662 unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
621 unsigned long long rel_ts = next_ts - iter->ts; 663 unsigned long long rel_ts = next_ts - iter->ts;
622 struct trace_seq *s = &iter->seq; 664 struct trace_seq *s = &iter->seq;
623 665
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
783 825
784void trace_event_read_lock(void) 826void trace_event_read_lock(void)
785{ 827{
786 down_read(&trace_event_mutex); 828 down_read(&trace_event_sem);
787} 829}
788 830
789void trace_event_read_unlock(void) 831void trace_event_read_unlock(void)
790{ 832{
791 up_read(&trace_event_mutex); 833 up_read(&trace_event_sem);
792} 834}
793 835
794/** 836/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
811 unsigned key; 853 unsigned key;
812 int ret = 0; 854 int ret = 0;
813 855
814 down_write(&trace_event_mutex); 856 down_write(&trace_event_sem);
815 857
816 if (WARN_ON(!event)) 858 if (WARN_ON(!event))
817 goto out; 859 goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
866 908
867 ret = event->type; 909 ret = event->type;
868 out: 910 out:
869 up_write(&trace_event_mutex); 911 up_write(&trace_event_sem);
870 912
871 return ret; 913 return ret;
872} 914}
873EXPORT_SYMBOL_GPL(register_ftrace_event); 915EXPORT_SYMBOL_GPL(register_ftrace_event);
874 916
875/* 917/*
876 * Used by module code with the trace_event_mutex held for write. 918 * Used by module code with the trace_event_sem held for write.
877 */ 919 */
878int __unregister_ftrace_event(struct trace_event *event) 920int __unregister_ftrace_event(struct trace_event *event)
879{ 921{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
888 */ 930 */
889int unregister_ftrace_event(struct trace_event *event) 931int unregister_ftrace_event(struct trace_event *event)
890{ 932{
891 down_write(&trace_event_mutex); 933 down_write(&trace_event_sem);
892 __unregister_ftrace_event(event); 934 __unregister_ftrace_event(event);
893 up_write(&trace_event_mutex); 935 up_write(&trace_event_sem);
894 936
895 return 0; 937 return 0;
896} 938}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
1217 .funcs = &trace_user_stack_funcs, 1259 .funcs = &trace_user_stack_funcs,
1218}; 1260};
1219 1261
1262/* TRACE_BPUTS */
1263static enum print_line_t
1264trace_bputs_print(struct trace_iterator *iter, int flags,
1265 struct trace_event *event)
1266{
1267 struct trace_entry *entry = iter->ent;
1268 struct trace_seq *s = &iter->seq;
1269 struct bputs_entry *field;
1270
1271 trace_assign_type(field, entry);
1272
1273 if (!seq_print_ip_sym(s, field->ip, flags))
1274 goto partial;
1275
1276 if (!trace_seq_puts(s, ": "))
1277 goto partial;
1278
1279 if (!trace_seq_puts(s, field->str))
1280 goto partial;
1281
1282 return TRACE_TYPE_HANDLED;
1283
1284 partial:
1285 return TRACE_TYPE_PARTIAL_LINE;
1286}
1287
1288
1289static enum print_line_t
1290trace_bputs_raw(struct trace_iterator *iter, int flags,
1291 struct trace_event *event)
1292{
1293 struct bputs_entry *field;
1294 struct trace_seq *s = &iter->seq;
1295
1296 trace_assign_type(field, iter->ent);
1297
1298 if (!trace_seq_printf(s, ": %lx : ", field->ip))
1299 goto partial;
1300
1301 if (!trace_seq_puts(s, field->str))
1302 goto partial;
1303
1304 return TRACE_TYPE_HANDLED;
1305
1306 partial:
1307 return TRACE_TYPE_PARTIAL_LINE;
1308}
1309
1310static struct trace_event_functions trace_bputs_funcs = {
1311 .trace = trace_bputs_print,
1312 .raw = trace_bputs_raw,
1313};
1314
1315static struct trace_event trace_bputs_event = {
1316 .type = TRACE_BPUTS,
1317 .funcs = &trace_bputs_funcs,
1318};
1319
1220/* TRACE_BPRINT */ 1320/* TRACE_BPRINT */
1221static enum print_line_t 1321static enum print_line_t
1222trace_bprint_print(struct trace_iterator *iter, int flags, 1322trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
1329 &trace_wake_event, 1429 &trace_wake_event,
1330 &trace_stack_event, 1430 &trace_stack_event,
1331 &trace_user_stack_event, 1431 &trace_user_stack_event,
1432 &trace_bputs_event,
1332 &trace_bprint_event, 1433 &trace_bprint_event,
1333 &trace_print_event, 1434 &trace_print_event,
1334 NULL 1435 NULL
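
The TRACE_BPUTS support above adds a struct trace_event whose trace_event_functions supply a human-readable printer and a raw printer, then registers it through the static events[] table. The sketch below is a minimal user-space model of that event-type/printer-table pairing; all types, callbacks, and the type number are invented for illustration and are not the kernel's.

/*
 * Minimal model of the trace_event / trace_event_functions pairing
 * used by the new TRACE_BPUTS entry above: one event type, a table of
 * printers, and a caller that picks .trace or .raw output.
 */
#include <stdio.h>

struct entry {                  /* stands in for struct bputs_entry */
	unsigned long ip;
	const char *str;
};

struct event_functions {
	int (*trace)(const struct entry *e);
	int (*raw)(const struct entry *e);
};

struct event {
	int type;
	const struct event_functions *funcs;
};

static int bputs_trace(const struct entry *e)
{
	return printf("%#lx: %s", e->ip, e->str);   /* readable line */
}

static int bputs_raw(const struct entry *e)
{
	return printf(": %lx : %s", e->ip, e->str); /* raw line */
}

static const struct event_functions bputs_funcs = {
	.trace = bputs_trace,
	.raw   = bputs_raw,
};

static const struct event bputs_event = { .type = 42, .funcs = &bputs_funcs };

int main(void)
{
	struct entry e = { .ip = 0xdeadbeefUL, .str = "hello trace\n" };

	bputs_event.funcs->trace(&e);
	bputs_event.funcs->raw(&e);
	return 0;
}
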
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
5#include "trace.h" 5#include "trace.h"
6 6
7extern enum print_line_t 7extern enum print_line_t
8trace_print_bputs_msg_only(struct trace_iterator *iter);
9extern enum print_line_t
8trace_print_bprintk_msg_only(struct trace_iterator *iter); 10trace_print_bprintk_msg_only(struct trace_iterator *iter);
9extern enum print_line_t 11extern enum print_line_t
10trace_print_printk_msg_only(struct trace_iterator *iter); 12trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 33
32/* used by module unregistering */ 34/* used by module unregistering */
33extern int __unregister_ftrace_event(struct trace_event *event); 35extern int __unregister_ftrace_event(struct trace_event *event);
34extern struct rw_semaphore trace_event_mutex; 36extern struct rw_semaphore trace_event_sem;
35 37
36#define MAX_MEMHEX_BYTES 8 38#define MAX_MEMHEX_BYTES 8
37#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 39#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
28 unsigned long flags, int pc) 28 unsigned long flags, int pc)
29{ 29{
30 struct ftrace_event_call *call = &event_context_switch; 30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer; 31 struct ring_buffer *buffer = tr->trace_buffer.buffer;
32 struct ring_buffer_event *event; 32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry; 33 struct ctx_switch_entry *entry;
34 34
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
69 pc = preempt_count(); 69 pc = preempt_count();
70 local_irq_save(flags); 70 local_irq_save(flags);
71 cpu = raw_smp_processor_id(); 71 cpu = raw_smp_processor_id();
72 data = ctx_trace->data[cpu]; 72 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
73 73
74 if (likely(!atomic_read(&data->disabled))) 74 if (likely(!atomic_read(&data->disabled)))
75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); 75 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
86 struct ftrace_event_call *call = &event_wakeup; 86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event; 87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry; 88 struct ctx_switch_entry *entry;
89 struct ring_buffer *buffer = tr->buffer; 89 struct ring_buffer *buffer = tr->trace_buffer.buffer;
90 90
91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, 91 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
92 sizeof(*entry), flags, pc); 92 sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
123 pc = preempt_count(); 123 pc = preempt_count();
124 local_irq_save(flags); 124 local_irq_save(flags);
125 cpu = raw_smp_processor_id(); 125 cpu = raw_smp_processor_id();
126 data = ctx_trace->data[cpu]; 126 data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
127 127
128 if (likely(!atomic_read(&data->disabled))) 128 if (likely(!atomic_read(&data->disabled)))
129 tracing_sched_wakeup_trace(ctx_trace, wakee, current, 129 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 75aa97fbe1a1..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace); 36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace); 37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
38 38
39static int save_lat_flag; 39static int save_flags;
40static bool function_enabled;
40 41
41#define TRACE_DISPLAY_GRAPH 1 42#define TRACE_DISPLAY_GRAPH 1
42 43
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
89 if (cpu != wakeup_current_cpu) 90 if (cpu != wakeup_current_cpu)
90 goto out_enable; 91 goto out_enable;
91 92
92 *data = tr->data[cpu]; 93 *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
93 disabled = atomic_inc_return(&(*data)->disabled); 94 disabled = atomic_inc_return(&(*data)->disabled);
94 if (unlikely(disabled != 1)) 95 if (unlikely(disabled != 1))
95 goto out; 96 goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
134}; 135};
135#endif /* CONFIG_FUNCTION_TRACER */ 136#endif /* CONFIG_FUNCTION_TRACER */
136 137
137static int start_func_tracer(int graph) 138static int register_wakeup_function(int graph, int set)
138{ 139{
139 int ret; 140 int ret;
140 141
141 if (!graph) 142 /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
142 ret = register_ftrace_function(&trace_ops); 143 if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
143 else 144 return 0;
145
146 if (graph)
144 ret = register_ftrace_graph(&wakeup_graph_return, 147 ret = register_ftrace_graph(&wakeup_graph_return,
145 &wakeup_graph_entry); 148 &wakeup_graph_entry);
149 else
150 ret = register_ftrace_function(&trace_ops);
151
152 if (!ret)
153 function_enabled = true;
154
155 return ret;
156}
157
158static void unregister_wakeup_function(int graph)
159{
160 if (!function_enabled)
161 return;
162
163 if (graph)
164 unregister_ftrace_graph();
165 else
166 unregister_ftrace_function(&trace_ops);
167
168 function_enabled = false;
169}
170
171static void wakeup_function_set(int set)
172{
173 if (set)
174 register_wakeup_function(is_graph(), 1);
175 else
176 unregister_wakeup_function(is_graph());
177}
178
179static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
180{
181 if (mask & TRACE_ITER_FUNCTION)
182 wakeup_function_set(set);
183
184 return trace_keep_overwrite(tracer, mask, set);
185}
186
187static int start_func_tracer(int graph)
188{
189 int ret;
190
191 ret = register_wakeup_function(graph, 0);
146 192
147 if (!ret && tracing_is_enabled()) 193 if (!ret && tracing_is_enabled())
148 tracer_enabled = 1; 194 tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
156{ 202{
157 tracer_enabled = 0; 203 tracer_enabled = 0;
158 204
159 if (!graph) 205 unregister_wakeup_function(graph);
160 unregister_ftrace_function(&trace_ops);
161 else
162 unregister_ftrace_graph();
163} 206}
164 207
165#ifdef CONFIG_FUNCTION_GRAPH_TRACER 208#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
353 396
354 /* disable local data, not wakeup_cpu data */ 397 /* disable local data, not wakeup_cpu data */
355 cpu = raw_smp_processor_id(); 398 cpu = raw_smp_processor_id();
356 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 399 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
357 if (likely(disabled != 1)) 400 if (likely(disabled != 1))
358 goto out; 401 goto out;
359 402
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
365 goto out_unlock; 408 goto out_unlock;
366 409
367 /* The task we are waiting for is waking up */ 410 /* The task we are waiting for is waking up */
368 data = wakeup_trace->data[wakeup_cpu]; 411 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
369 412
370 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 413 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
371 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 414 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
387 arch_spin_unlock(&wakeup_lock); 430 arch_spin_unlock(&wakeup_lock);
388 local_irq_restore(flags); 431 local_irq_restore(flags);
389out: 432out:
390 atomic_dec(&wakeup_trace->data[cpu]->disabled); 433 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
391} 434}
392 435
393static void __wakeup_reset(struct trace_array *tr) 436static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
405{ 448{
406 unsigned long flags; 449 unsigned long flags;
407 450
408 tracing_reset_online_cpus(tr); 451 tracing_reset_online_cpus(&tr->trace_buffer);
409 452
410 local_irq_save(flags); 453 local_irq_save(flags);
411 arch_spin_lock(&wakeup_lock); 454 arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
435 return; 478 return;
436 479
437 pc = preempt_count(); 480 pc = preempt_count();
438 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 481 disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
439 if (unlikely(disabled != 1)) 482 if (unlikely(disabled != 1))
440 goto out; 483 goto out;
441 484
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
458 501
459 local_save_flags(flags); 502 local_save_flags(flags);
460 503
461 data = wakeup_trace->data[wakeup_cpu]; 504 data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
462 data->preempt_timestamp = ftrace_now(cpu); 505 data->preempt_timestamp = ftrace_now(cpu);
463 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); 506 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
464 507
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
472out_locked: 515out_locked:
473 arch_spin_unlock(&wakeup_lock); 516 arch_spin_unlock(&wakeup_lock);
474out: 517out:
475 atomic_dec(&wakeup_trace->data[cpu]->disabled); 518 atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
476} 519}
477 520
478static void start_wakeup_tracer(struct trace_array *tr) 521static void start_wakeup_tracer(struct trace_array *tr)
@@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
540 583
541static int __wakeup_tracer_init(struct trace_array *tr) 584static int __wakeup_tracer_init(struct trace_array *tr)
542{ 585{
543 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; 586 save_flags = trace_flags;
544 trace_flags |= TRACE_ITER_LATENCY_FMT; 587
588 /* non overwrite screws up the latency tracers */
589 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
590 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
545 591
546 tracing_max_latency = 0; 592 tracing_max_latency = 0;
547 wakeup_trace = tr; 593 wakeup_trace = tr;
@@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
563 609
564static void wakeup_tracer_reset(struct trace_array *tr) 610static void wakeup_tracer_reset(struct trace_array *tr)
565{ 611{
612 int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
613 int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
614
566 stop_wakeup_tracer(tr); 615 stop_wakeup_tracer(tr);
567 /* make sure we put back any tasks we are tracing */ 616 /* make sure we put back any tasks we are tracing */
568 wakeup_reset(tr); 617 wakeup_reset(tr);
569 618
570 if (!save_lat_flag) 619 set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
571 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 620 set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
572} 621}
573 622
574static void wakeup_tracer_start(struct trace_array *tr) 623static void wakeup_tracer_start(struct trace_array *tr)
@@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
594 .print_line = wakeup_print_line, 643 .print_line = wakeup_print_line,
595 .flags = &tracer_flags, 644 .flags = &tracer_flags,
596 .set_flag = wakeup_set_flag, 645 .set_flag = wakeup_set_flag,
646 .flag_changed = wakeup_flag_changed,
597#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
598 .selftest = trace_selftest_startup_wakeup, 648 .selftest = trace_selftest_startup_wakeup,
599#endif 649#endif
@@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
615 .print_line = wakeup_print_line, 665 .print_line = wakeup_print_line,
616 .flags = &tracer_flags, 666 .flags = &tracer_flags,
617 .set_flag = wakeup_set_flag, 667 .set_flag = wakeup_set_flag,
668 .flag_changed = wakeup_flag_changed,
618#ifdef CONFIG_FTRACE_SELFTEST 669#ifdef CONFIG_FTRACE_SELFTEST
619 .selftest = trace_selftest_startup_wakeup, 670 .selftest = trace_selftest_startup_wakeup,
620#endif 671#endif
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..55e2cf66967b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
21 return 0; 21 return 0;
22} 22}
23 23
24static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) 24static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
25{ 25{
26 struct ring_buffer_event *event; 26 struct ring_buffer_event *event;
27 struct trace_entry *entry; 27 struct trace_entry *entry;
28 unsigned int loops = 0; 28 unsigned int loops = 0;
29 29
30 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { 30 while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
31 entry = ring_buffer_event_data(event); 31 entry = ring_buffer_event_data(event);
32 32
33 /* 33 /*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
58 * Test the trace buffer to see if all the elements 58 * Test the trace buffer to see if all the elements
59 * are still sane. 59 * are still sane.
60 */ 60 */
61static int trace_test_buffer(struct trace_array *tr, unsigned long *count) 61static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
62{ 62{
63 unsigned long flags, cnt = 0; 63 unsigned long flags, cnt = 0;
64 int cpu, ret = 0; 64 int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 local_irq_save(flags); 67 local_irq_save(flags);
68 arch_spin_lock(&ftrace_max_lock); 68 arch_spin_lock(&ftrace_max_lock);
69 69
70 cnt = ring_buffer_entries(tr->buffer); 70 cnt = ring_buffer_entries(buf->buffer);
71 71
72 /* 72 /*
73 * The trace_test_buffer_cpu runs a while loop to consume all data. 73 * The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
78 */ 78 */
79 tracing_off(); 79 tracing_off();
80 for_each_possible_cpu(cpu) { 80 for_each_possible_cpu(cpu) {
81 ret = trace_test_buffer_cpu(tr, cpu); 81 ret = trace_test_buffer_cpu(buf, cpu);
82 if (ret) 82 if (ret)
83 break; 83 break;
84 } 84 }
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
355 msleep(100); 355 msleep(100);
356 356
357 /* we should have nothing in the buffer */ 357 /* we should have nothing in the buffer */
358 ret = trace_test_buffer(tr, &count); 358 ret = trace_test_buffer(&tr->trace_buffer, &count);
359 if (ret) 359 if (ret)
360 goto out; 360 goto out;
361 361
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
376 ftrace_enabled = 0; 376 ftrace_enabled = 0;
377 377
378 /* check the trace buffer */ 378 /* check the trace buffer */
379 ret = trace_test_buffer(tr, &count); 379 ret = trace_test_buffer(&tr->trace_buffer, &count);
380 tracing_start(); 380 tracing_start();
381 381
382 /* we should only have one item */ 382 /* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
666 ftrace_enabled = 0; 666 ftrace_enabled = 0;
667 667
668 /* check the trace buffer */ 668 /* check the trace buffer */
669 ret = trace_test_buffer(tr, &count); 669 ret = trace_test_buffer(&tr->trace_buffer, &count);
670 trace->reset(tr); 670 trace->reset(tr);
671 tracing_start(); 671 tracing_start();
672 672
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
703/* Maximum number of functions to trace before diagnosing a hang */ 703/* Maximum number of functions to trace before diagnosing a hang */
704#define GRAPH_MAX_FUNC_TEST 100000000 704#define GRAPH_MAX_FUNC_TEST 100000000
705 705
706static void
707__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
708static unsigned int graph_hang_thresh; 706static unsigned int graph_hang_thresh;
709 707
710/* Wrap the real function entry probe to avoid possible hanging */ 708/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
714 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { 712 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
715 ftrace_graph_stop(); 713 ftrace_graph_stop();
716 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 714 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
717 if (ftrace_dump_on_oops) 715 if (ftrace_dump_on_oops) {
718 __ftrace_dump(false, DUMP_ALL); 716 ftrace_dump(DUMP_ALL);
717 /* ftrace_dump() disables tracing */
718 tracing_on();
719 }
719 return 0; 720 return 0;
720 } 721 }
721 722
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
737 * Simulate the init() callback but we attach a watchdog callback 738 * Simulate the init() callback but we attach a watchdog callback
738 * to detect and recover from possible hangs 739 * to detect and recover from possible hangs
739 */ 740 */
740 tracing_reset_online_cpus(tr); 741 tracing_reset_online_cpus(&tr->trace_buffer);
741 set_graph_array(tr); 742 set_graph_array(tr);
742 ret = register_ftrace_graph(&trace_graph_return, 743 ret = register_ftrace_graph(&trace_graph_return,
743 &trace_graph_entry_watchdog); 744 &trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
760 tracing_stop(); 761 tracing_stop();
761 762
762 /* check the trace buffer */ 763 /* check the trace buffer */
763 ret = trace_test_buffer(tr, &count); 764 ret = trace_test_buffer(&tr->trace_buffer, &count);
764 765
765 trace->reset(tr); 766 trace->reset(tr);
766 tracing_start(); 767 tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
815 /* stop the tracing. */ 816 /* stop the tracing. */
816 tracing_stop(); 817 tracing_stop();
817 /* check both trace buffers */ 818 /* check both trace buffers */
818 ret = trace_test_buffer(tr, NULL); 819 ret = trace_test_buffer(&tr->trace_buffer, NULL);
819 if (!ret) 820 if (!ret)
820 ret = trace_test_buffer(&max_tr, &count); 821 ret = trace_test_buffer(&tr->max_buffer, &count);
821 trace->reset(tr); 822 trace->reset(tr);
822 tracing_start(); 823 tracing_start();
823 824
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
877 /* stop the tracing. */ 878 /* stop the tracing. */
878 tracing_stop(); 879 tracing_stop();
879 /* check both trace buffers */ 880 /* check both trace buffers */
880 ret = trace_test_buffer(tr, NULL); 881 ret = trace_test_buffer(&tr->trace_buffer, NULL);
881 if (!ret) 882 if (!ret)
882 ret = trace_test_buffer(&max_tr, &count); 883 ret = trace_test_buffer(&tr->max_buffer, &count);
883 trace->reset(tr); 884 trace->reset(tr);
884 tracing_start(); 885 tracing_start();
885 886
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
943 /* stop the tracing. */ 944 /* stop the tracing. */
944 tracing_stop(); 945 tracing_stop();
945 /* check both trace buffers */ 946 /* check both trace buffers */
946 ret = trace_test_buffer(tr, NULL); 947 ret = trace_test_buffer(&tr->trace_buffer, NULL);
947 if (ret) 948 if (ret)
948 goto out; 949 goto out;
949 950
950 ret = trace_test_buffer(&max_tr, &count); 951 ret = trace_test_buffer(&tr->max_buffer, &count);
951 if (ret) 952 if (ret)
952 goto out; 953 goto out;
953 954
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
973 /* stop the tracing. */ 974 /* stop the tracing. */
974 tracing_stop(); 975 tracing_stop();
975 /* check both trace buffers */ 976 /* check both trace buffers */
976 ret = trace_test_buffer(tr, NULL); 977 ret = trace_test_buffer(&tr->trace_buffer, NULL);
977 if (ret) 978 if (ret)
978 goto out; 979 goto out;
979 980
980 ret = trace_test_buffer(&max_tr, &count); 981 ret = trace_test_buffer(&tr->max_buffer, &count);
981 982
982 if (!ret && !count) { 983 if (!ret && !count) {
983 printk(KERN_CONT ".. no entries found .."); 984 printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1084 /* stop the tracing. */ 1085 /* stop the tracing. */
1085 tracing_stop(); 1086 tracing_stop();
1086 /* check both trace buffers */ 1087 /* check both trace buffers */
1087 ret = trace_test_buffer(tr, NULL); 1088 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1088 printk("ret = %d\n", ret); 1089 printk("ret = %d\n", ret);
1089 if (!ret) 1090 if (!ret)
1090 ret = trace_test_buffer(&max_tr, &count); 1091 ret = trace_test_buffer(&tr->max_buffer, &count);
1091 1092
1092 1093
1093 trace->reset(tr); 1094 trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
1126 /* stop the tracing. */ 1127 /* stop the tracing. */
1127 tracing_stop(); 1128 tracing_stop();
1128 /* check the trace buffer */ 1129 /* check the trace buffer */
1129 ret = trace_test_buffer(tr, &count); 1130 ret = trace_test_buffer(&tr->trace_buffer, &count);
1130 trace->reset(tr); 1131 trace->reset(tr);
1131 tracing_start(); 1132 tracing_start();
1132 1133
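
Passing a struct trace_buffer into trace_test_buffer() (instead of the whole trace_array) lets the selftests above run the same checker over both the live buffer and the max/snapshot buffer. The sketch below models that refactoring with simplified stand-in types; the names and counts are illustrative only.

/*
 * Why the checker now takes the buffer rather than the trace array:
 * the same routine can be pointed at either tr->trace_buffer or
 * tr->max_buffer.
 */
#include <stdio.h>

struct buf {
	const char *name;
	unsigned long entries;
};

struct array {
	struct buf trace_buffer;    /* live buffer      */
	struct buf max_buffer;      /* latency snapshot */
};

/* old form took the array; now any buffer can be handed in */
static int test_buffer(const struct buf *b, unsigned long *count)
{
	if (count)
		*count = b->entries;
	printf("checked %s: %lu entries\n", b->name, b->entries);
	return 0;
}

int main(void)
{
	struct array tr = {
		.trace_buffer = { "trace_buffer", 120 },
		.max_buffer   = { "max_buffer",   7   },
	};
	unsigned long count;

	test_buffer(&tr.trace_buffer, NULL);
	test_buffer(&tr.max_buffer, &count);
	printf("max buffer count=%lu\n", count);
	return 0;
}
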
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..b20428c5efe2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
20 20
21#define STACK_TRACE_ENTRIES 500 21#define STACK_TRACE_ENTRIES 500
22 22
23#ifdef CC_USING_FENTRY
24# define fentry 1
25#else
26# define fentry 0
27#endif
28
23static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 29static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
24 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 30 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
25static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 31static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
26 32
33/*
34 * Reserve one entry for the passed in ip. This will allow
35 * us to remove most or all of the stack size overhead
36 * added by the stack tracer itself.
37 */
27static struct stack_trace max_stack_trace = { 38static struct stack_trace max_stack_trace = {
28 .max_entries = STACK_TRACE_ENTRIES, 39 .max_entries = STACK_TRACE_ENTRIES - 1,
29 .entries = stack_dump_trace, 40 .entries = &stack_dump_trace[1],
30}; 41};
31 42
32static unsigned long max_stack_size; 43static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
39int stack_tracer_enabled; 50int stack_tracer_enabled;
40static int last_stack_tracer_enabled; 51static int last_stack_tracer_enabled;
41 52
42static inline void check_stack(void) 53static inline void
54check_stack(unsigned long ip, unsigned long *stack)
43{ 55{
44 unsigned long this_size, flags; 56 unsigned long this_size, flags;
45 unsigned long *p, *top, *start; 57 unsigned long *p, *top, *start;
58 static int tracer_frame;
59 int frame_size = ACCESS_ONCE(tracer_frame);
46 int i; 60 int i;
47 61
48 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); 62 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
49 this_size = THREAD_SIZE - this_size; 63 this_size = THREAD_SIZE - this_size;
64 /* Remove the frame of the tracer */
65 this_size -= frame_size;
50 66
51 if (this_size <= max_stack_size) 67 if (this_size <= max_stack_size)
52 return; 68 return;
53 69
54 /* we do not handle interrupt stacks yet */ 70 /* we do not handle interrupt stacks yet */
55 if (!object_is_on_stack(&this_size)) 71 if (!object_is_on_stack(stack))
56 return; 72 return;
57 73
58 local_irq_save(flags); 74 local_irq_save(flags);
59 arch_spin_lock(&max_stack_lock); 75 arch_spin_lock(&max_stack_lock);
60 76
77 /* In case another CPU set the tracer_frame on us */
78 if (unlikely(!frame_size))
79 this_size -= tracer_frame;
80
61 /* a race could have already updated it */ 81 /* a race could have already updated it */
62 if (this_size <= max_stack_size) 82 if (this_size <= max_stack_size)
63 goto out; 83 goto out;
@@ -70,10 +90,18 @@ static inline void check_stack(void)
70 save_stack_trace(&max_stack_trace); 90 save_stack_trace(&max_stack_trace);
71 91
72 /* 92 /*
93 * Add the passed in ip from the function tracer.
94 * Searching for this on the stack will skip over
95 * most of the overhead from the stack tracer itself.
96 */
97 stack_dump_trace[0] = ip;
98 max_stack_trace.nr_entries++;
99
100 /*
73 * Now find where in the stack these are. 101 * Now find where in the stack these are.
74 */ 102 */
75 i = 0; 103 i = 0;
76 start = &this_size; 104 start = stack;
77 top = (unsigned long *) 105 top = (unsigned long *)
78 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 106 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
79 107
@@ -97,6 +125,18 @@ static inline void check_stack(void)
97 found = 1; 125 found = 1;
98 /* Start the search from here */ 126 /* Start the search from here */
99 start = p + 1; 127 start = p + 1;
128 /*
129 * We do not want to show the overhead
130 * of the stack tracer stack in the
131 * max stack. If we haven't figured
132 * out what that is, then figure it out
133 * now.
134 */
135 if (unlikely(!tracer_frame) && i == 1) {
136 tracer_frame = (p - stack) *
137 sizeof(unsigned long);
138 max_stack_size -= tracer_frame;
139 }
100 } 140 }
101 } 141 }
102 142
@@ -113,6 +153,7 @@ static void
113stack_trace_call(unsigned long ip, unsigned long parent_ip, 153stack_trace_call(unsigned long ip, unsigned long parent_ip,
114 struct ftrace_ops *op, struct pt_regs *pt_regs) 154 struct ftrace_ops *op, struct pt_regs *pt_regs)
115{ 155{
156 unsigned long stack;
116 int cpu; 157 int cpu;
117 158
118 preempt_disable_notrace(); 159 preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
122 if (per_cpu(trace_active, cpu)++ != 0) 163 if (per_cpu(trace_active, cpu)++ != 0)
123 goto out; 164 goto out;
124 165
125 check_stack(); 166 /*
167 * When fentry is used, the traced function does not get
168 * its stack frame set up, and we lose the parent.
169 * The ip is pretty useless because the function tracer
170 * was called before that function set up its stack frame.
171 * In this case, we use the parent ip.
172 *
173 * By adding the return address of either the parent ip
174 * or the current ip we can disregard most of the stack usage
175 * caused by the stack tracer itself.
176 *
177 * The function tracer always reports the address of where the
178 * mcount call was, but the stack will hold the return address.
179 */
180 if (fentry)
181 ip = parent_ip;
182 else
183 ip += MCOUNT_INSN_SIZE;
184
185 check_stack(ip, &stack);
126 186
127 out: 187 out:
128 per_cpu(trace_active, cpu)--; 188 per_cpu(trace_active, cpu)--;
@@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
322 .open = stack_trace_filter_open, 382 .open = stack_trace_filter_open,
323 .read = seq_read, 383 .read = seq_read,
324 .write = ftrace_filter_write, 384 .write = ftrace_filter_write,
325 .llseek = ftrace_regex_lseek, 385 .llseek = ftrace_filter_lseek,
326 .release = ftrace_regex_release, 386 .release = ftrace_regex_release,
327}; 387};
328 388
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
371 struct dentry *d_tracer; 431 struct dentry *d_tracer;
372 432
373 d_tracer = tracing_init_dentry(); 433 d_tracer = tracing_init_dentry();
434 if (!d_tracer)
435 return 0;
374 436
375 trace_create_file("stack_max_size", 0644, d_tracer, 437 trace_create_file("stack_max_size", 0644, d_tracer,
376 &max_stack_size, &stack_max_size_fops); 438 &max_stack_size, &stack_max_size_fops);
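
The check_stack() changes above compute stack usage as THREAD_SIZE minus the stack pointer's offset within its THREAD_SIZE-aligned region, then subtract the stack tracer's own frame once its size has been learned from the first in-stack match (with the traced ip, or the parent ip under fentry, seeded as entry 0). The stand-alone sketch below reproduces only that arithmetic, with made-up values for THREAD_SIZE, the stack pointer, and the tracer frame; it is not the kernel implementation.

/*
 * Model of the stack-depth arithmetic in check_stack(): the stack
 * grows down inside a THREAD_SIZE-aligned region, so bytes in use are
 * THREAD_SIZE minus the stack pointer's offset in that region, less
 * the tracer's own frame once it has been measured.
 */
#include <stdio.h>

#define THREAD_SIZE 8192UL      /* assumed thread-stack size */

static unsigned long tracer_frame;   /* learned on first hit */

static unsigned long stack_in_use(unsigned long sp)
{
	unsigned long this_size = sp & (THREAD_SIZE - 1); /* offset in region */

	this_size = THREAD_SIZE - this_size;  /* bytes in use below the top */
	this_size -= tracer_frame;            /* drop the tracer's overhead */
	return this_size;
}

int main(void)
{
	unsigned long sp = 0xffffc90000407e40UL;  /* pretend stack pointer */

	printf("raw usage:      %lu bytes\n", stack_in_use(sp));

	/* pretend the first in-stack match showed 3 words of tracer frame */
	tracer_frame = 3 * sizeof(unsigned long);
	printf("adjusted usage: %lu bytes\n", stack_in_use(sp));
	return 0;
}
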
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
307 struct dentry *d_tracing; 307 struct dentry *d_tracing;
308 308
309 d_tracing = tracing_init_dentry(); 309 d_tracing = tracing_init_dentry();
310 if (!d_tracing)
311 return 0;
310 312
311 stat_dir = debugfs_create_dir("trace_stat", d_tracing); 313 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
312 if (!stat_dir) 314 if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
12#include "trace.h" 12#include "trace.h"
13 13
14static DEFINE_MUTEX(syscall_trace_lock); 14static DEFINE_MUTEX(syscall_trace_lock);
15static int sys_refcount_enter;
16static int sys_refcount_exit;
17static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
18static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
19 15
20static int syscall_enter_register(struct ftrace_event_call *event, 16static int syscall_enter_register(struct ftrace_event_call *event,
21 enum trace_reg type, void *data); 17 enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
41 /* 37 /*
42 * Only compare after the "sys" prefix. Archs that use 38 * Only compare after the "sys" prefix. Archs that use
43 * syscall wrappers may have syscalls symbols aliases prefixed 39 * syscall wrappers may have syscalls symbols aliases prefixed
44 * with "SyS" instead of "sys", leading to an unwanted 40 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
45 * mismatch. 41 * mismatch.
46 */ 42 */
47 return !strcmp(sym + 3, name + 3); 43 return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
265 kfree(call->print_fmt); 261 kfree(call->print_fmt);
266} 262}
267 263
268static int syscall_enter_define_fields(struct ftrace_event_call *call) 264static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
269{ 265{
270 struct syscall_trace_enter trace; 266 struct syscall_trace_enter trace;
271 struct syscall_metadata *meta = call->data; 267 struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
288 return ret; 284 return ret;
289} 285}
290 286
291static int syscall_exit_define_fields(struct ftrace_event_call *call) 287static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
292{ 288{
293 struct syscall_trace_exit trace; 289 struct syscall_trace_exit trace;
294 int ret; 290 int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
303 return ret; 299 return ret;
304} 300}
305 301
306static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 302static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
307{ 303{
304 struct trace_array *tr = data;
308 struct syscall_trace_enter *entry; 305 struct syscall_trace_enter *entry;
309 struct syscall_metadata *sys_data; 306 struct syscall_metadata *sys_data;
310 struct ring_buffer_event *event; 307 struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
315 syscall_nr = trace_get_syscall_nr(current, regs); 312 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 313 if (syscall_nr < 0)
317 return; 314 return;
318 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 315 if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
319 return; 316 return;
320 317
321 sys_data = syscall_nr_to_meta(syscall_nr); 318 sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
324 321
325 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 322 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
326 323
327 event = trace_current_buffer_lock_reserve(&buffer, 324 buffer = tr->trace_buffer.buffer;
325 event = trace_buffer_lock_reserve(buffer,
328 sys_data->enter_event->event.type, size, 0, 0); 326 sys_data->enter_event->event.type, size, 0, 0);
329 if (!event) 327 if (!event)
330 return; 328 return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
338 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 336 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
339} 337}
340 338
341static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 339static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
342{ 340{
341 struct trace_array *tr = data;
343 struct syscall_trace_exit *entry; 342 struct syscall_trace_exit *entry;
344 struct syscall_metadata *sys_data; 343 struct syscall_metadata *sys_data;
345 struct ring_buffer_event *event; 344 struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
349 syscall_nr = trace_get_syscall_nr(current, regs); 348 syscall_nr = trace_get_syscall_nr(current, regs);
350 if (syscall_nr < 0) 349 if (syscall_nr < 0)
351 return; 350 return;
352 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 351 if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
353 return; 352 return;
354 353
355 sys_data = syscall_nr_to_meta(syscall_nr); 354 sys_data = syscall_nr_to_meta(syscall_nr);
356 if (!sys_data) 355 if (!sys_data)
357 return; 356 return;
358 357
359 event = trace_current_buffer_lock_reserve(&buffer, 358 buffer = tr->trace_buffer.buffer;
359 event = trace_buffer_lock_reserve(buffer,
360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 360 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
361 if (!event) 361 if (!event)
362 return; 362 return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
370 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 370 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
371} 371}
372 372
373static int reg_event_syscall_enter(struct ftrace_event_call *call) 373static int reg_event_syscall_enter(struct ftrace_event_file *file,
374 struct ftrace_event_call *call)
374{ 375{
376 struct trace_array *tr = file->tr;
375 int ret = 0; 377 int ret = 0;
376 int num; 378 int num;
377 379
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
379 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 381 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
380 return -ENOSYS; 382 return -ENOSYS;
381 mutex_lock(&syscall_trace_lock); 383 mutex_lock(&syscall_trace_lock);
382 if (!sys_refcount_enter) 384 if (!tr->sys_refcount_enter)
383 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 385 ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
384 if (!ret) { 386 if (!ret) {
385 set_bit(num, enabled_enter_syscalls); 387 set_bit(num, tr->enabled_enter_syscalls);
386 sys_refcount_enter++; 388 tr->sys_refcount_enter++;
387 } 389 }
388 mutex_unlock(&syscall_trace_lock); 390 mutex_unlock(&syscall_trace_lock);
389 return ret; 391 return ret;
390} 392}
391 393
392static void unreg_event_syscall_enter(struct ftrace_event_call *call) 394static void unreg_event_syscall_enter(struct ftrace_event_file *file,
395 struct ftrace_event_call *call)
393{ 396{
397 struct trace_array *tr = file->tr;
394 int num; 398 int num;
395 399
396 num = ((struct syscall_metadata *)call->data)->syscall_nr; 400 num = ((struct syscall_metadata *)call->data)->syscall_nr;
397 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 401 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
398 return; 402 return;
399 mutex_lock(&syscall_trace_lock); 403 mutex_lock(&syscall_trace_lock);
400 sys_refcount_enter--; 404 tr->sys_refcount_enter--;
401 clear_bit(num, enabled_enter_syscalls); 405 clear_bit(num, tr->enabled_enter_syscalls);
402 if (!sys_refcount_enter) 406 if (!tr->sys_refcount_enter)
403 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 407 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
404 mutex_unlock(&syscall_trace_lock); 408 mutex_unlock(&syscall_trace_lock);
405} 409}
406 410
407static int reg_event_syscall_exit(struct ftrace_event_call *call) 411static int reg_event_syscall_exit(struct ftrace_event_file *file,
412 struct ftrace_event_call *call)
408{ 413{
414 struct trace_array *tr = file->tr;
409 int ret = 0; 415 int ret = 0;
410 int num; 416 int num;
411 417
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
413 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 419 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
414 return -ENOSYS; 420 return -ENOSYS;
415 mutex_lock(&syscall_trace_lock); 421 mutex_lock(&syscall_trace_lock);
416 if (!sys_refcount_exit) 422 if (!tr->sys_refcount_exit)
417 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 423 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
418 if (!ret) { 424 if (!ret) {
419 set_bit(num, enabled_exit_syscalls); 425 set_bit(num, tr->enabled_exit_syscalls);
420 sys_refcount_exit++; 426 tr->sys_refcount_exit++;
421 } 427 }
422 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
423 return ret; 429 return ret;
424} 430}
425 431
426static void unreg_event_syscall_exit(struct ftrace_event_call *call) 432static void unreg_event_syscall_exit(struct ftrace_event_file *file,
433 struct ftrace_event_call *call)
427{ 434{
435 struct trace_array *tr = file->tr;
428 int num; 436 int num;
429 437
430 num = ((struct syscall_metadata *)call->data)->syscall_nr; 438 num = ((struct syscall_metadata *)call->data)->syscall_nr;
431 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 439 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
432 return; 440 return;
433 mutex_lock(&syscall_trace_lock); 441 mutex_lock(&syscall_trace_lock);
434 sys_refcount_exit--; 442 tr->sys_refcount_exit--;
435 clear_bit(num, enabled_exit_syscalls); 443 clear_bit(num, tr->enabled_exit_syscalls);
436 if (!sys_refcount_exit) 444 if (!tr->sys_refcount_exit)
437 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 445 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
438 mutex_unlock(&syscall_trace_lock); 446 mutex_unlock(&syscall_trace_lock);
439} 447}
440 448
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
471 .trace = print_syscall_exit, 479 .trace = print_syscall_exit,
472}; 480};
473 481
474struct ftrace_event_class event_class_syscall_enter = { 482struct ftrace_event_class __refdata event_class_syscall_enter = {
475 .system = "syscalls", 483 .system = "syscalls",
476 .reg = syscall_enter_register, 484 .reg = syscall_enter_register,
477 .define_fields = syscall_enter_define_fields, 485 .define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
479 .raw_init = init_syscall_trace, 487 .raw_init = init_syscall_trace,
480}; 488};
481 489
482struct ftrace_event_class event_class_syscall_exit = { 490struct ftrace_event_class __refdata event_class_syscall_exit = {
483 .system = "syscalls", 491 .system = "syscalls",
484 .reg = syscall_exit_register, 492 .reg = syscall_exit_register,
485 .define_fields = syscall_exit_define_fields, 493 .define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
685static int syscall_enter_register(struct ftrace_event_call *event, 693static int syscall_enter_register(struct ftrace_event_call *event,
686 enum trace_reg type, void *data) 694 enum trace_reg type, void *data)
687{ 695{
696 struct ftrace_event_file *file = data;
697
688 switch (type) { 698 switch (type) {
689 case TRACE_REG_REGISTER: 699 case TRACE_REG_REGISTER:
690 return reg_event_syscall_enter(event); 700 return reg_event_syscall_enter(file, event);
691 case TRACE_REG_UNREGISTER: 701 case TRACE_REG_UNREGISTER:
692 unreg_event_syscall_enter(event); 702 unreg_event_syscall_enter(file, event);
693 return 0; 703 return 0;
694 704
695#ifdef CONFIG_PERF_EVENTS 705#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
711static int syscall_exit_register(struct ftrace_event_call *event, 721static int syscall_exit_register(struct ftrace_event_call *event,
712 enum trace_reg type, void *data) 722 enum trace_reg type, void *data)
713{ 723{
724 struct ftrace_event_file *file = data;
725
714 switch (type) { 726 switch (type) {
715 case TRACE_REG_REGISTER: 727 case TRACE_REG_REGISTER:
716 return reg_event_syscall_exit(event); 728 return reg_event_syscall_exit(file, event);
717 case TRACE_REG_UNREGISTER: 729 case TRACE_REG_UNREGISTER:
718 unreg_event_syscall_exit(event); 730 unreg_event_syscall_exit(file, event);
719 return 0; 731 return 0;
720 732
721#ifdef CONFIG_PERF_EVENTS 733#ifdef CONFIG_PERF_EVENTS
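
The trace_syscalls.c hunks above replace the file-scope enabled_*_syscalls bitmaps and sys_refcount_* counters with per-trace_array state, passing the trace_array through the tracepoint callback's data pointer. Below is a compilable userspace sketch of that per-instance state, assuming struct trace_array gains the four fields the hunks reference; the NR_SYSCALLS value and bitmap helpers are stand-ins, not the kernel's definitions.

#include <stdio.h>
#include <string.h>
#include <limits.h>

#define NR_SYSCALLS	440	/* stand-in for the arch's NR_syscalls */
#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)
#define BITMAP_LONGS(n)	(((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* reduced stand-in for the fields the patch reads through tr-> */
struct trace_array {
	int		sys_refcount_enter;
	int		sys_refcount_exit;
	unsigned long	enabled_enter_syscalls[BITMAP_LONGS(NR_SYSCALLS)];
	unsigned long	enabled_exit_syscalls[BITMAP_LONGS(NR_SYSCALLS)];
};

static void set_bit_ul(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static int test_bit_ul(int nr, const unsigned long *map)
{
	return !!(map[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG)));
}

int main(void)
{
	struct trace_array tr;

	memset(&tr, 0, sizeof(tr));

	/* what reg_event_syscall_enter() now does for one instance */
	set_bit_ul(42, tr.enabled_enter_syscalls);
	tr.sys_refcount_enter++;

	/* what ftrace_syscall_enter() now tests per instance */
	printf("syscall 42 enabled: %d\n", test_bit_ul(42, tr.enabled_enter_syscalls));
	printf("syscall 43 enabled: %d\n", test_bit_ul(43, tr.enabled_enter_syscalls));
	return 0;
}
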
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8dad2a92dee9..32494fb0ee64 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,6 +28,18 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct uprobe_trace_entry_head {
32 struct trace_entry ent;
33 unsigned long vaddr[];
34};
35
36#define SIZEOF_TRACE_ENTRY(is_return) \
37 (sizeof(struct uprobe_trace_entry_head) + \
38 sizeof(unsigned long) * (is_return ? 2 : 1))
39
40#define DATAOF_TRACE_ENTRY(entry, is_return) \
41 ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
42
31struct trace_uprobe_filter { 43struct trace_uprobe_filter {
32 rwlock_t rwlock; 44 rwlock_t rwlock;
33 int nr_systemwide; 45 int nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
64static LIST_HEAD(uprobe_list); 76static LIST_HEAD(uprobe_list);
65 77
66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 78static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
79static int uretprobe_dispatcher(struct uprobe_consumer *con,
80 unsigned long func, struct pt_regs *regs);
67 81
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) 82static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{ 83{
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
77 return !filter->nr_systemwide && list_empty(&filter->perf_events); 91 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78} 92}
79 93
94static inline bool is_ret_probe(struct trace_uprobe *tu)
95{
96 return tu->consumer.ret_handler != NULL;
97}
98
80/* 99/*
81 * Allocate new trace_uprobe and initialize it (including uprobes). 100 * Allocate new trace_uprobe and initialize it (including uprobes).
82 */ 101 */
83static struct trace_uprobe * 102static struct trace_uprobe *
84alloc_trace_uprobe(const char *group, const char *event, int nargs) 103alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
85{ 104{
86 struct trace_uprobe *tu; 105 struct trace_uprobe *tu;
87 106
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
106 125
107 INIT_LIST_HEAD(&tu->list); 126 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher; 127 tu->consumer.handler = uprobe_dispatcher;
128 if (is_ret)
129 tu->consumer.ret_handler = uretprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter); 130 init_trace_uprobe_filter(&tu->filter);
110 return tu; 131 return tu;
111 132
@@ -180,7 +201,7 @@ end:
180 201
181/* 202/*
182 * Argument syntax: 203 * Argument syntax:
183 * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] 204 * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
184 * 205 *
185 * - Remove uprobe: -:[GRP/]EVENT 206 * - Remove uprobe: -:[GRP/]EVENT
186 */ 207 */
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
192 char buf[MAX_EVENT_NAME_LEN]; 213 char buf[MAX_EVENT_NAME_LEN];
193 struct path path; 214 struct path path;
194 unsigned long offset; 215 unsigned long offset;
195 bool is_delete; 216 bool is_delete, is_return;
196 int i, ret; 217 int i, ret;
197 218
198 inode = NULL; 219 inode = NULL;
199 ret = 0; 220 ret = 0;
200 is_delete = false; 221 is_delete = false;
222 is_return = false;
201 event = NULL; 223 event = NULL;
202 group = NULL; 224 group = NULL;
203 225
204 /* argc must be >= 1 */ 226 /* argc must be >= 1 */
205 if (argv[0][0] == '-') 227 if (argv[0][0] == '-')
206 is_delete = true; 228 is_delete = true;
229 else if (argv[0][0] == 'r')
230 is_return = true;
207 else if (argv[0][0] != 'p') { 231 else if (argv[0][0] != 'p') {
208 pr_info("Probe definition must be started with 'p' or '-'.\n"); 232 pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
209 return -EINVAL; 233 return -EINVAL;
210 } 234 }
211 235
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
303 kfree(tail); 327 kfree(tail);
304 } 328 }
305 329
306 tu = alloc_trace_uprobe(group, event, argc); 330 tu = alloc_trace_uprobe(group, event, argc, is_return);
307 if (IS_ERR(tu)) { 331 if (IS_ERR(tu)) {
308 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); 332 pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
309 ret = PTR_ERR(tu); 333 ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
414static int probes_seq_show(struct seq_file *m, void *v) 438static int probes_seq_show(struct seq_file *m, void *v)
415{ 439{
416 struct trace_uprobe *tu = v; 440 struct trace_uprobe *tu = v;
441 char c = is_ret_probe(tu) ? 'r' : 'p';
417 int i; 442 int i;
418 443
419 seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); 444 seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
420 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 445 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
421 446
422 for (i = 0; i < tu->nr_args; i++) 447 for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
485 .release = seq_release, 510 .release = seq_release,
486}; 511};
487 512
488/* uprobe handler */ 513static void uprobe_trace_print(struct trace_uprobe *tu,
489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 514 unsigned long func, struct pt_regs *regs)
490{ 515{
491 struct uprobe_trace_entry_head *entry; 516 struct uprobe_trace_entry_head *entry;
492 struct ring_buffer_event *event; 517 struct ring_buffer_event *event;
493 struct ring_buffer *buffer; 518 struct ring_buffer *buffer;
494 u8 *data; 519 void *data;
495 int size, i, pc; 520 int size, i;
496 unsigned long irq_flags;
497 struct ftrace_event_call *call = &tu->call; 521 struct ftrace_event_call *call = &tu->call;
498 522
499 local_save_flags(irq_flags); 523 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
500 pc = preempt_count();
501
502 size = sizeof(*entry) + tu->size;
503
504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 524 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
505 size, irq_flags, pc); 525 size + tu->size, 0, 0);
506 if (!event) 526 if (!event)
507 return 0; 527 return;
508 528
509 entry = ring_buffer_event_data(event); 529 entry = ring_buffer_event_data(event);
510 entry->ip = instruction_pointer(task_pt_regs(current)); 530 if (is_ret_probe(tu)) {
511 data = (u8 *)&entry[1]; 531 entry->vaddr[0] = func;
532 entry->vaddr[1] = instruction_pointer(regs);
533 data = DATAOF_TRACE_ENTRY(entry, true);
534 } else {
535 entry->vaddr[0] = instruction_pointer(regs);
536 data = DATAOF_TRACE_ENTRY(entry, false);
537 }
538
512 for (i = 0; i < tu->nr_args; i++) 539 for (i = 0; i < tu->nr_args; i++)
513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 540 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
514 541
515 if (!filter_current_check_discard(buffer, call, entry, event)) 542 if (!filter_current_check_discard(buffer, call, entry, event))
516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 543 trace_buffer_unlock_commit(buffer, event, 0, 0);
544}
517 545
546/* uprobe handler */
547static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
548{
549 if (!is_ret_probe(tu))
550 uprobe_trace_print(tu, 0, regs);
518 return 0; 551 return 0;
519} 552}
520 553
554static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
555 struct pt_regs *regs)
556{
557 uprobe_trace_print(tu, func, regs);
558}
559
521/* Event entry printers */ 560/* Event entry printers */
522static enum print_line_t 561static enum print_line_t
523print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) 562print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
524{ 563{
525 struct uprobe_trace_entry_head *field; 564 struct uprobe_trace_entry_head *entry;
526 struct trace_seq *s = &iter->seq; 565 struct trace_seq *s = &iter->seq;
527 struct trace_uprobe *tu; 566 struct trace_uprobe *tu;
528 u8 *data; 567 u8 *data;
529 int i; 568 int i;
530 569
531 field = (struct uprobe_trace_entry_head *)iter->ent; 570 entry = (struct uprobe_trace_entry_head *)iter->ent;
532 tu = container_of(event, struct trace_uprobe, call.event); 571 tu = container_of(event, struct trace_uprobe, call.event);
533 572
534 if (!trace_seq_printf(s, "%s: (", tu->call.name)) 573 if (is_ret_probe(tu)) {
535 goto partial; 574 if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
536 575 entry->vaddr[1], entry->vaddr[0]))
537 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) 576 goto partial;
538 goto partial; 577 data = DATAOF_TRACE_ENTRY(entry, true);
539 578 } else {
540 if (!trace_seq_puts(s, ")")) 579 if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
541 goto partial; 580 entry->vaddr[0]))
581 goto partial;
582 data = DATAOF_TRACE_ENTRY(entry, false);
583 }
542 584
543 data = (u8 *)&field[1];
544 for (i = 0; i < tu->nr_args; i++) { 585 for (i = 0; i < tu->nr_args; i++) {
545 if (!tu->args[i].type->print(s, tu->args[i].name, 586 if (!tu->args[i].type->print(s, tu->args[i].name,
546 data + tu->args[i].offset, field)) 587 data + tu->args[i].offset, entry))
547 goto partial; 588 goto partial;
548 } 589 }
549 590
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
595 636
596static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 637static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
597{ 638{
598 int ret, i; 639 int ret, i, size;
599 struct uprobe_trace_entry_head field; 640 struct uprobe_trace_entry_head field;
600 struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; 641 struct trace_uprobe *tu = event_call->data;
601 642
602 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 643 if (is_ret_probe(tu)) {
644 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
645 DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
646 size = SIZEOF_TRACE_ENTRY(true);
647 } else {
648 DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
649 size = SIZEOF_TRACE_ENTRY(false);
650 }
603 /* Set argument names as fields */ 651 /* Set argument names as fields */
604 for (i = 0; i < tu->nr_args; i++) { 652 for (i = 0; i < tu->nr_args; i++) {
605 ret = trace_define_field(event_call, tu->args[i].type->fmttype, 653 ret = trace_define_field(event_call, tu->args[i].type->fmttype,
606 tu->args[i].name, 654 tu->args[i].name,
607 sizeof(field) + tu->args[i].offset, 655 size + tu->args[i].offset,
608 tu->args[i].type->size, 656 tu->args[i].type->size,
609 tu->args[i].type->is_signed, 657 tu->args[i].type->is_signed,
610 FILTER_OTHER); 658 FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
622 int i; 670 int i;
623 int pos = 0; 671 int pos = 0;
624 672
625 fmt = "(%lx)"; 673 if (is_ret_probe(tu)) {
626 arg = "REC->" FIELD_STRING_IP; 674 fmt = "(%lx <- %lx)";
675 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
676 } else {
677 fmt = "(%lx)";
678 arg = "REC->" FIELD_STRING_IP;
679 }
627 680
628 /* When len=0, we just calculate the needed length */ 681 /* When len=0, we just calculate the needed length */
629 682
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
752 return ret; 805 return ret;
753} 806}
754 807
755/* uprobe profile handler */ 808static void uprobe_perf_print(struct trace_uprobe *tu,
756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 809 unsigned long func, struct pt_regs *regs)
757{ 810{
758 struct ftrace_event_call *call = &tu->call; 811 struct ftrace_event_call *call = &tu->call;
759 struct uprobe_trace_entry_head *entry; 812 struct uprobe_trace_entry_head *entry;
760 struct hlist_head *head; 813 struct hlist_head *head;
761 u8 *data; 814 void *data;
762 int size, __size, i; 815 int size, rctx, i;
763 int rctx;
764 816
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) 817 size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
766 return UPROBE_HANDLER_REMOVE; 818 size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
767
768 __size = sizeof(*entry) + tu->size;
769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
770 size -= sizeof(u32);
771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 819 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
772 return 0; 820 return;
773 821
774 preempt_disable(); 822 preempt_disable();
823 head = this_cpu_ptr(call->perf_events);
824 if (hlist_empty(head))
825 goto out;
775 826
776 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 827 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
777 if (!entry) 828 if (!entry)
778 goto out; 829 goto out;
779 830
780 entry->ip = instruction_pointer(task_pt_regs(current)); 831 if (is_ret_probe(tu)) {
781 data = (u8 *)&entry[1]; 832 entry->vaddr[0] = func;
833 entry->vaddr[1] = instruction_pointer(regs);
834 data = DATAOF_TRACE_ENTRY(entry, true);
835 } else {
836 entry->vaddr[0] = instruction_pointer(regs);
837 data = DATAOF_TRACE_ENTRY(entry, false);
838 }
839
782 for (i = 0; i < tu->nr_args; i++) 840 for (i = 0; i < tu->nr_args; i++)
783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 841 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
784 842
785 head = this_cpu_ptr(call->perf_events); 843 perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
786 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
787
788 out: 844 out:
789 preempt_enable(); 845 preempt_enable();
846}
847
848/* uprobe profile handler */
849static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
850{
851 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
852 return UPROBE_HANDLER_REMOVE;
853
854 if (!is_ret_probe(tu))
855 uprobe_perf_print(tu, 0, regs);
790 return 0; 856 return 0;
791} 857}
858
859static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
860 struct pt_regs *regs)
861{
862 uprobe_perf_print(tu, func, regs);
863}
792#endif /* CONFIG_PERF_EVENTS */ 864#endif /* CONFIG_PERF_EVENTS */
793 865
794static 866static
795int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) 867int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
796{ 868{
797 struct trace_uprobe *tu = (struct trace_uprobe *)event->data; 869 struct trace_uprobe *tu = event->data;
798 870
799 switch (type) { 871 switch (type) {
800 case TRACE_REG_REGISTER: 872 case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
843 return ret; 915 return ret;
844} 916}
845 917
918static int uretprobe_dispatcher(struct uprobe_consumer *con,
919 unsigned long func, struct pt_regs *regs)
920{
921 struct trace_uprobe *tu;
922
923 tu = container_of(con, struct trace_uprobe, consumer);
924
925 if (tu->flags & TP_FLAG_TRACE)
926 uretprobe_trace_func(tu, func, regs);
927
928#ifdef CONFIG_PERF_EVENTS
929 if (tu->flags & TP_FLAG_PROFILE)
930 uretprobe_perf_func(tu, func, regs);
931#endif
932 return 0;
933}
934
846static struct trace_event_functions uprobe_funcs = { 935static struct trace_event_functions uprobe_funcs = {
847 .trace = print_uprobe_event 936 .trace = print_uprobe_event
848}; 937};
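
The uretprobe changes above record one vaddr (the probed address) for 'p' events and two (called function and return address) for 'r' events, locating the fetched arguments with SIZEOF_TRACE_ENTRY()/DATAOF_TRACE_ENTRY(). The following standalone sketch reproduces that arithmetic with a mocked struct trace_entry, so the absolute sizes are illustrative only; the relative layout is the point.

#include <stdio.h>
#include <stdbool.h>

/* mocked header; the real struct trace_entry lives in the kernel */
struct trace_entry { unsigned short type; unsigned char flags, preempt_count; int pid; };

struct uprobe_trace_entry_head {
	struct trace_entry	ent;
	unsigned long		vaddr[];
};

#define SIZEOF_TRACE_ENTRY(is_return)			\
	(sizeof(struct uprobe_trace_entry_head) +	\
	 sizeof(unsigned long) * ((is_return) ? 2 : 1))

/* char * used here for portable pointer arithmetic */
#define DATAOF_TRACE_ENTRY(entry, is_return)		\
	((void *)((char *)(entry) + SIZEOF_TRACE_ENTRY(is_return)))

int main(void)
{
	unsigned char buf[64];
	struct uprobe_trace_entry_head *entry = (void *)buf;

	printf("entry probe:  header %zu bytes, args at offset %td\n",
	       SIZEOF_TRACE_ENTRY(false),
	       (char *)DATAOF_TRACE_ENTRY(entry, false) - (char *)entry);
	printf("return probe: header %zu bytes, args at offset %td\n",
	       SIZEOF_TRACE_ENTRY(true),
	       (char *)DATAOF_TRACE_ENTRY(entry, true) - (char *)entry);
	return 0;
}
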
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a4592047..29f26540e9c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
112 int nr_probes = 0; 112 int nr_probes = 0;
113 struct tracepoint_func *old, *new; 113 struct tracepoint_func *old, *new;
114 114
115 WARN_ON(!probe); 115 if (WARN_ON(!probe))
116 return ERR_PTR(-EINVAL);
116 117
117 debug_print_probes(entry); 118 debug_print_probes(entry);
118 old = entry->funcs; 119 old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
152 153
153 debug_print_probes(entry); 154 debug_print_probes(entry);
154 /* (N -> M), (N > 1, M >= 0) probes */ 155 /* (N -> M), (N > 1, M >= 0) probes */
155 for (nr_probes = 0; old[nr_probes].func; nr_probes++) { 156 if (probe) {
156 if (!probe || 157 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
157 (old[nr_probes].func == probe && 158 if (old[nr_probes].func == probe &&
158 old[nr_probes].data == data)) 159 old[nr_probes].data == data)
159 nr_del++; 160 nr_del++;
161 }
160 } 162 }
161 163
164 /*
165 * If probe is NULL, then nr_probes = nr_del = 0, and then the
166 * entire entry will be removed.
167 */
162 if (nr_probes - nr_del == 0) { 168 if (nr_probes - nr_del == 0) {
163 /* N -> 0, (N > 1) */ 169 /* N -> 0, (N > 1) */
164 entry->funcs = NULL; 170 entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
173 if (new == NULL) 179 if (new == NULL)
174 return ERR_PTR(-ENOMEM); 180 return ERR_PTR(-ENOMEM);
175 for (i = 0; old[i].func; i++) 181 for (i = 0; old[i].func; i++)
176 if (probe && 182 if (old[i].func != probe || old[i].data != data)
177 (old[i].func != probe || old[i].data != data))
178 new[j++] = old[i]; 183 new[j++] = old[i];
179 new[nr_probes - nr_del].func = NULL; 184 new[nr_probes - nr_del].func = NULL;
180 entry->refcount = nr_probes - nr_del; 185 entry->refcount = nr_probes - nr_del;
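
After the tracepoint.c change above, passing a NULL probe removes every probe registered on the entry: the counting loop is skipped, nr_probes == nr_del == 0, and the whole funcs array is released. A minimal standalone model of just that accounting (the kernel's reallocation and refcounting are omitted):

#include <stdio.h>
#include <stddef.h>

typedef void (*probe_fn)(void);

struct tracepoint_func { probe_fn func; void *data; };

static void probe_a(void) { }
static void probe_b(void) { }

/* Returns how many probes would remain after removing (probe, data). */
static int remaining_probes(const struct tracepoint_func *old,
			    probe_fn probe, void *data)
{
	int nr_probes = 0, nr_del = 0;

	if (probe) {
		for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
			if (old[nr_probes].func == probe &&
			    old[nr_probes].data == data)
				nr_del++;
		}
	}
	/* probe == NULL: nr_probes == nr_del == 0, so everything goes */
	return nr_probes - nr_del;
}

int main(void)
{
	struct tracepoint_func funcs[] = {
		{ probe_a, NULL }, { probe_b, NULL }, { NULL, NULL },
	};

	printf("remove probe_a: %d left\n", remaining_probes(funcs, probe_a, NULL));
	printf("remove NULL:    %d left\n", remaining_probes(funcs, NULL, NULL));
	return 0;
}
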
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb10225..f6c83d7ef000 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,67 +18,43 @@
18 18
19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 19SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
20{ 20{
21 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 21 return sys_chown(filename, low2highuid(user), low2highgid(group));
22 /* avoid REGPARM breakage on x86: */
23 asmlinkage_protect(3, ret, filename, user, group);
24 return ret;
25} 22}
26 23
27SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) 24SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
28{ 25{
29 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 26 return sys_lchown(filename, low2highuid(user), low2highgid(group));
30 /* avoid REGPARM breakage on x86: */
31 asmlinkage_protect(3, ret, filename, user, group);
32 return ret;
33} 27}
34 28
35SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) 29SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
36{ 30{
37 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 31 return sys_fchown(fd, low2highuid(user), low2highgid(group));
38 /* avoid REGPARM breakage on x86: */
39 asmlinkage_protect(3, ret, fd, user, group);
40 return ret;
41} 32}
42 33
43SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) 34SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
44{ 35{
45 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 36 return sys_setregid(low2highgid(rgid), low2highgid(egid));
46 /* avoid REGPARM breakage on x86: */
47 asmlinkage_protect(2, ret, rgid, egid);
48 return ret;
49} 37}
50 38
51SYSCALL_DEFINE1(setgid16, old_gid_t, gid) 39SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
52{ 40{
53 long ret = sys_setgid(low2highgid(gid)); 41 return sys_setgid(low2highgid(gid));
54 /* avoid REGPARM breakage on x86: */
55 asmlinkage_protect(1, ret, gid);
56 return ret;
57} 42}
58 43
59SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) 44SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
60{ 45{
61 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 46 return sys_setreuid(low2highuid(ruid), low2highuid(euid));
62 /* avoid REGPARM breakage on x86: */
63 asmlinkage_protect(2, ret, ruid, euid);
64 return ret;
65} 47}
66 48
67SYSCALL_DEFINE1(setuid16, old_uid_t, uid) 49SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
68{ 50{
69 long ret = sys_setuid(low2highuid(uid)); 51 return sys_setuid(low2highuid(uid));
70 /* avoid REGPARM breakage on x86: */
71 asmlinkage_protect(1, ret, uid);
72 return ret;
73} 52}
74 53
75SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) 54SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
76{ 55{
77 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 56 return sys_setresuid(low2highuid(ruid), low2highuid(euid),
78 low2highuid(suid)); 57 low2highuid(suid));
79 /* avoid REGPARM breakage on x86: */
80 asmlinkage_protect(3, ret, ruid, euid, suid);
81 return ret;
82} 58}
83 59
84SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) 60SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
@@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
100 76
101SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) 77SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
102{ 78{
103 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 79 return sys_setresgid(low2highgid(rgid), low2highgid(egid),
104 low2highgid(sgid)); 80 low2highgid(sgid));
105 /* avoid REGPARM breakage on x86: */
106 asmlinkage_protect(3, ret, rgid, egid, sgid);
107 return ret;
108} 81}
109 82
110 83
@@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
127 100
128SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) 101SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
129{ 102{
130 long ret = sys_setfsuid(low2highuid(uid)); 103 return sys_setfsuid(low2highuid(uid));
131 /* avoid REGPARM breakage on x86: */
132 asmlinkage_protect(1, ret, uid);
133 return ret;
134} 104}
135 105
136SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) 106SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
137{ 107{
138 long ret = sys_setfsgid(low2highgid(gid)); 108 return sys_setfsgid(low2highgid(gid));
139 /* avoid REGPARM breakage on x86: */
140 asmlinkage_protect(1, ret, gid);
141 return ret;
142} 109}
143 110
144static int groups16_to_user(old_gid_t __user *grouplist, 111static int groups16_to_user(old_gid_t __user *grouplist,
diff --git a/kernel/user.c b/kernel/user.c
index e81978e8c03b..69b4c3d48cde 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/export.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include <linux/proc_fs.h> 19#include <linux/proc_ns.h>
20 20
21/* 21/*
22 * userns count is 1 for root user, 1 for init_uts_ns, 22 * userns count is 1 for root user, 1 for init_uts_ns,
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
51 .owner = GLOBAL_ROOT_UID, 51 .owner = GLOBAL_ROOT_UID,
52 .group = GLOBAL_ROOT_GID, 52 .group = GLOBAL_ROOT_GID,
53 .proc_inum = PROC_USER_INIT_INO, 53 .proc_inum = PROC_USER_INIT_INO,
54 .may_mount_sysfs = true,
55 .may_mount_proc = true,
54}; 56};
55EXPORT_SYMBOL_GPL(init_user_ns); 57EXPORT_SYMBOL_GPL(init_user_ns);
56 58
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8b650837083e..d8c30db06c5b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,7 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/proc_fs.h> 12#include <linux/proc_ns.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/cred.h> 14#include <linux/cred.h>
15#include <linux/securebits.h> 15#include <linux/securebits.h>
@@ -21,10 +21,12 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/projid.h> 23#include <linux/projid.h>
24#include <linux/fs_struct.h>
24 25
25static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
26 27
27static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 28static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid,
28 struct uid_gid_map *map); 30 struct uid_gid_map *map);
29 31
30static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) 32static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)
60 kgid_t group = new->egid; 62 kgid_t group = new->egid;
61 int ret; 63 int ret;
62 64
65 /*
 66 * Verify that we cannot violate the file-access policy
 67 * specified by the root directory, by verifying that the
 68 * root directory is at the root of the mount namespace,
 69 * which allows all files to be accessed.
70 */
71 if (current_chrooted())
72 return -EPERM;
73
63 /* The creator needs a mapping in the parent user namespace 74 /* The creator needs a mapping in the parent user namespace
64 * or else we won't be able to reasonably tell userspace who 75 * or else we won't be able to reasonably tell userspace who
65 * created a user_namespace. 76 * created a user_namespace.
@@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)
86 97
87 set_cred_user_ns(new, ns); 98 set_cred_user_ns(new, ns);
88 99
100 update_mnt_policy(ns);
101
89 return 0; 102 return 0;
90} 103}
91 104
@@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
600 if (map->nr_extents != 0) 613 if (map->nr_extents != 0)
601 goto out; 614 goto out;
602 615
603 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID 616 /*
604 * over the user namespace in order to set the id mapping. 617 * Adjusting namespace settings requires capabilities on the target.
605 */ 618 */
606 if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) 619 if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
607 goto out; 620 goto out;
608 621
609 /* Get a buffer */ 622 /* Get a buffer */
@@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
688 701
689 ret = -EPERM; 702 ret = -EPERM;
690 /* Validate the user is allowed to use user id's mapped to. */ 703 /* Validate the user is allowed to use user id's mapped to. */
691 if (!new_idmap_permitted(ns, cap_setid, &new_map)) 704 if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
692 goto out; 705 goto out;
693 706
694 /* Map the lower ids from the parent user namespace to the 707 /* Map the lower ids from the parent user namespace to the
@@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
775 &ns->projid_map, &ns->parent->projid_map); 788 &ns->projid_map, &ns->parent->projid_map);
776} 789}
777 790
778static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, 791static bool new_idmap_permitted(const struct file *file,
792 struct user_namespace *ns, int cap_setid,
779 struct uid_gid_map *new_map) 793 struct uid_gid_map *new_map)
780{ 794{
781 /* Allow mapping to your own filesystem ids */ 795 /* Allow mapping to your own filesystem ids */
@@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
783 u32 id = new_map->extent[0].lower_first; 797 u32 id = new_map->extent[0].lower_first;
784 if (cap_setid == CAP_SETUID) { 798 if (cap_setid == CAP_SETUID) {
785 kuid_t uid = make_kuid(ns->parent, id); 799 kuid_t uid = make_kuid(ns->parent, id);
786 if (uid_eq(uid, current_fsuid())) 800 if (uid_eq(uid, file->f_cred->fsuid))
787 return true; 801 return true;
788 } 802 }
789 else if (cap_setid == CAP_SETGID) { 803 else if (cap_setid == CAP_SETGID) {
790 kgid_t gid = make_kgid(ns->parent, id); 804 kgid_t gid = make_kgid(ns->parent, id);
791 if (gid_eq(gid, current_fsgid())) 805 if (gid_eq(gid, file->f_cred->fsgid))
792 return true; 806 return true;
793 } 807 }
794 } 808 }
@@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
799 813
800 /* Allow the specified ids if we have the appropriate capability 814 /* Allow the specified ids if we have the appropriate capability
801 * (CAP_SETUID or CAP_SETGID) over the parent user namespace. 815 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
 816 * The opener of the id file must also have had the appropriate capability.
802 */ 817 */
803 if (ns_capable(ns->parent, cap_setid)) 818 if (ns_capable(ns->parent, cap_setid) &&
819 file_ns_capable(file, ns->parent, cap_setid))
804 return true; 820 return true;
805 821
806 return false; 822 return false;
@@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
837 if (atomic_read(&current->mm->mm_users) > 1) 853 if (atomic_read(&current->mm->mm_users) > 1)
838 return -EINVAL; 854 return -EINVAL;
839 855
856 if (current->fs->users != 1)
857 return -EINVAL;
858
840 if (!ns_capable(user_ns, CAP_SYS_ADMIN)) 859 if (!ns_capable(user_ns, CAP_SYS_ADMIN))
841 return -EPERM; 860 return -EPERM;
842 861
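
The map_write()/new_idmap_permitted() changes above tie the permission checks to the credentials of whoever opened the map file (file->f_cred) rather than to current. A reduced, hypothetical model of the "map a single id to your own fsuid" fast path is sketched below; kuid_t, the gid variant, and the capability fallback are omitted, and the struct layouts are simplified stand-ins.

#include <stdio.h>
#include <stdbool.h>

struct uid_gid_extent { unsigned first, lower_first, count; };
struct uid_gid_map { unsigned nr_extents; struct uid_gid_extent extent[5]; };
struct cred { unsigned fsuid; };
struct file { struct cred f_cred; };	/* opener's creds, not current's */

static bool new_uidmap_permitted(const struct file *file,
				 const struct uid_gid_map *new_map)
{
	/* Allow mapping a single id to the opener's own fsuid. */
	if (new_map->nr_extents == 1 && new_map->extent[0].count == 1 &&
	    new_map->extent[0].lower_first == file->f_cred.fsuid)
		return true;
	return false;
}

int main(void)
{
	struct file f = { .f_cred = { .fsuid = 1000 } };
	struct uid_gid_map own  = { 1, { { 0, 1000, 1 } } };
	struct uid_gid_map root = { 1, { { 0, 0, 1 } } };

	printf("map to own fsuid: %s\n", new_uidmap_permitted(&f, &own)  ? "allowed" : "denied");
	printf("map to uid 0:     %s\n", new_uidmap_permitted(&f, &root) ? "allowed" : "denied");
	return 0;
}
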
diff --git a/kernel/utsname.c b/kernel/utsname.c
index a47fc5de3113..2fc8576efaa8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,7 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h> 18#include <linux/proc_ns.h>
19 19
20static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
21{ 21{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4a944676358e..05039e348f07 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,
517 return ret; 517 return ret;
518 518
519 set_sample_period(); 519 set_sample_period();
520 /*
 521 * Watchdog threads shouldn't be started if the watchdog is
 522 * disabled; the 'watchdog_disabled' check in the
 523 * watchdog_*_all_cpus() functions takes care of this.
524 */
520 if (watchdog_enabled && watchdog_thresh) 525 if (watchdog_enabled && watchdog_thresh)
521 watchdog_enable_all_cpus(); 526 watchdog_enable_all_cpus();
522 else 527 else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 81f2457811eb..4aa9f5bc6b2d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,7 +41,12 @@
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/lockdep.h> 42#include <linux/lockdep.h>
43#include <linux/idr.h> 43#include <linux/idr.h>
44#include <linux/jhash.h>
44#include <linux/hashtable.h> 45#include <linux/hashtable.h>
46#include <linux/rculist.h>
47#include <linux/nodemask.h>
48#include <linux/moduleparam.h>
49#include <linux/uaccess.h>
45 50
46#include "workqueue_internal.h" 51#include "workqueue_internal.h"
47 52
@@ -58,12 +63,11 @@ enum {
58 * %WORKER_UNBOUND set and concurrency management disabled, and may 63 * %WORKER_UNBOUND set and concurrency management disabled, and may
59 * be executing on any CPU. The pool behaves as an unbound one. 64 * be executing on any CPU. The pool behaves as an unbound one.
60 * 65 *
61 * Note that DISASSOCIATED can be flipped only while holding 66 * Note that DISASSOCIATED should be flipped only while holding
62 * assoc_mutex to avoid changing binding state while 67 * manager_mutex to avoid changing binding state while
63 * create_worker() is in progress. 68 * create_worker() is in progress.
64 */ 69 */
65 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ 70 POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
66 POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
67 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ 71 POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
68 POOL_FREEZING = 1 << 3, /* freeze in progress */ 72 POOL_FREEZING = 1 << 3, /* freeze in progress */
69 73
@@ -74,12 +78,14 @@ enum {
74 WORKER_PREP = 1 << 3, /* preparing to run works */ 78 WORKER_PREP = 1 << 3, /* preparing to run works */
75 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ 79 WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
76 WORKER_UNBOUND = 1 << 7, /* worker is unbound */ 80 WORKER_UNBOUND = 1 << 7, /* worker is unbound */
81 WORKER_REBOUND = 1 << 8, /* worker was rebound */
77 82
78 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | 83 WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
79 WORKER_CPU_INTENSIVE, 84 WORKER_UNBOUND | WORKER_REBOUND,
80 85
81 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ 86 NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
82 87
88 UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
83 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ 89 BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
84 90
85 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ 91 MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
@@ -97,6 +103,8 @@ enum {
97 */ 103 */
98 RESCUER_NICE_LEVEL = -20, 104 RESCUER_NICE_LEVEL = -20,
99 HIGHPRI_NICE_LEVEL = -20, 105 HIGHPRI_NICE_LEVEL = -20,
106
107 WQ_NAME_LEN = 24,
100}; 108};
101 109
102/* 110/*
@@ -115,16 +123,26 @@ enum {
115 * cpu or grabbing pool->lock is enough for read access. If 123 * cpu or grabbing pool->lock is enough for read access. If
116 * POOL_DISASSOCIATED is set, it's identical to L. 124 * POOL_DISASSOCIATED is set, it's identical to L.
117 * 125 *
118 * F: wq->flush_mutex protected. 126 * MG: pool->manager_mutex and pool->lock protected. Writes require both
127 * locks. Reads can happen under either lock.
128 *
129 * PL: wq_pool_mutex protected.
130 *
131 * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
132 *
133 * WQ: wq->mutex protected.
119 * 134 *
120 * W: workqueue_lock protected. 135 * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
136 *
137 * MD: wq_mayday_lock protected.
121 */ 138 */
122 139
123/* struct worker is defined in workqueue_internal.h */ 140/* struct worker is defined in workqueue_internal.h */
124 141
125struct worker_pool { 142struct worker_pool {
126 spinlock_t lock; /* the pool lock */ 143 spinlock_t lock; /* the pool lock */
127 unsigned int cpu; /* I: the associated cpu */ 144 int cpu; /* I: the associated cpu */
145 int node; /* I: the associated node ID */
128 int id; /* I: pool ID */ 146 int id; /* I: pool ID */
129 unsigned int flags; /* X: flags */ 147 unsigned int flags; /* X: flags */
130 148
@@ -138,12 +156,18 @@ struct worker_pool {
138 struct timer_list idle_timer; /* L: worker idle timeout */ 156 struct timer_list idle_timer; /* L: worker idle timeout */
139 struct timer_list mayday_timer; /* L: SOS timer for workers */ 157 struct timer_list mayday_timer; /* L: SOS timer for workers */
140 158
 141 /* workers are chained either in busy_hash or idle_list */ 159 /* a worker is either on busy_hash or idle_list, or the manager */
142 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); 160 DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
143 /* L: hash of busy workers */ 161 /* L: hash of busy workers */
144 162
145 struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ 163 /* see manage_workers() for details on the two manager mutexes */
146 struct ida worker_ida; /* L: for worker IDs */ 164 struct mutex manager_arb; /* manager arbitration */
165 struct mutex manager_mutex; /* manager exclusion */
166 struct idr worker_idr; /* MG: worker IDs and iteration */
167
168 struct workqueue_attrs *attrs; /* I: worker attributes */
169 struct hlist_node hash_node; /* PL: unbound_pool_hash node */
170 int refcnt; /* PL: refcnt for unbound pools */
147 171
148 /* 172 /*
149 * The current concurrency level. As it's likely to be accessed 173 * The current concurrency level. As it's likely to be accessed
@@ -151,6 +175,12 @@ struct worker_pool {
151 * cacheline. 175 * cacheline.
152 */ 176 */
153 atomic_t nr_running ____cacheline_aligned_in_smp; 177 atomic_t nr_running ____cacheline_aligned_in_smp;
178
179 /*
180 * Destruction of pool is sched-RCU protected to allow dereferences
181 * from get_work_pool().
182 */
183 struct rcu_head rcu;
154} ____cacheline_aligned_in_smp; 184} ____cacheline_aligned_in_smp;
155 185
156/* 186/*
@@ -164,75 +194,107 @@ struct pool_workqueue {
164 struct workqueue_struct *wq; /* I: the owning workqueue */ 194 struct workqueue_struct *wq; /* I: the owning workqueue */
165 int work_color; /* L: current color */ 195 int work_color; /* L: current color */
166 int flush_color; /* L: flushing color */ 196 int flush_color; /* L: flushing color */
197 int refcnt; /* L: reference count */
167 int nr_in_flight[WORK_NR_COLORS]; 198 int nr_in_flight[WORK_NR_COLORS];
168 /* L: nr of in_flight works */ 199 /* L: nr of in_flight works */
169 int nr_active; /* L: nr of active works */ 200 int nr_active; /* L: nr of active works */
170 int max_active; /* L: max active works */ 201 int max_active; /* L: max active works */
171 struct list_head delayed_works; /* L: delayed works */ 202 struct list_head delayed_works; /* L: delayed works */
172}; 203 struct list_head pwqs_node; /* WR: node on wq->pwqs */
204 struct list_head mayday_node; /* MD: node on wq->maydays */
205
206 /*
207 * Release of unbound pwq is punted to system_wq. See put_pwq()
208 * and pwq_unbound_release_workfn() for details. pool_workqueue
209 * itself is also sched-RCU protected so that the first pwq can be
210 * determined without grabbing wq->mutex.
211 */
212 struct work_struct unbound_release_work;
213 struct rcu_head rcu;
214} __aligned(1 << WORK_STRUCT_FLAG_BITS);
173 215
174/* 216/*
175 * Structure used to wait for workqueue flush. 217 * Structure used to wait for workqueue flush.
176 */ 218 */
177struct wq_flusher { 219struct wq_flusher {
178 struct list_head list; /* F: list of flushers */ 220 struct list_head list; /* WQ: list of flushers */
179 int flush_color; /* F: flush color waiting for */ 221 int flush_color; /* WQ: flush color waiting for */
180 struct completion done; /* flush completion */ 222 struct completion done; /* flush completion */
181}; 223};
182 224
183/* 225struct wq_device;
184 * All cpumasks are assumed to be always set on UP and thus can't be
185 * used to determine whether there's something to be done.
186 */
187#ifdef CONFIG_SMP
188typedef cpumask_var_t mayday_mask_t;
189#define mayday_test_and_set_cpu(cpu, mask) \
190 cpumask_test_and_set_cpu((cpu), (mask))
191#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
192#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
193#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
194#define free_mayday_mask(mask) free_cpumask_var((mask))
195#else
196typedef unsigned long mayday_mask_t;
197#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
198#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
199#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
200#define alloc_mayday_mask(maskp, gfp) true
201#define free_mayday_mask(mask) do { } while (0)
202#endif
203 226
204/* 227/*
205 * The externally visible workqueue abstraction is an array of 228 * The externally visible workqueue. It relays the issued work items to
206 * per-CPU workqueues: 229 * the appropriate worker_pool through its pool_workqueues.
207 */ 230 */
208struct workqueue_struct { 231struct workqueue_struct {
209 unsigned int flags; /* W: WQ_* flags */ 232 struct list_head pwqs; /* WR: all pwqs of this wq */
210 union { 233 struct list_head list; /* PL: list of all workqueues */
211 struct pool_workqueue __percpu *pcpu; 234
212 struct pool_workqueue *single; 235 struct mutex mutex; /* protects this wq */
213 unsigned long v; 236 int work_color; /* WQ: current work color */
214 } pool_wq; /* I: pwq's */ 237 int flush_color; /* WQ: current flush color */
215 struct list_head list; /* W: list of all workqueues */
216
217 struct mutex flush_mutex; /* protects wq flushing */
218 int work_color; /* F: current work color */
219 int flush_color; /* F: current flush color */
220 atomic_t nr_pwqs_to_flush; /* flush in progress */ 238 atomic_t nr_pwqs_to_flush; /* flush in progress */
221 struct wq_flusher *first_flusher; /* F: first flusher */ 239 struct wq_flusher *first_flusher; /* WQ: first flusher */
222 struct list_head flusher_queue; /* F: flush waiters */ 240 struct list_head flusher_queue; /* WQ: flush waiters */
223 struct list_head flusher_overflow; /* F: flush overflow list */ 241 struct list_head flusher_overflow; /* WQ: flush overflow list */
224 242
225 mayday_mask_t mayday_mask; /* cpus requesting rescue */ 243 struct list_head maydays; /* MD: pwqs requesting rescue */
226 struct worker *rescuer; /* I: rescue worker */ 244 struct worker *rescuer; /* I: rescue worker */
227 245
228 int nr_drainers; /* W: drain in progress */ 246 int nr_drainers; /* WQ: drain in progress */
229 int saved_max_active; /* W: saved pwq max_active */ 247 int saved_max_active; /* WQ: saved pwq max_active */
248
249 struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
250 struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
251
252#ifdef CONFIG_SYSFS
253 struct wq_device *wq_dev; /* I: for sysfs interface */
254#endif
230#ifdef CONFIG_LOCKDEP 255#ifdef CONFIG_LOCKDEP
231 struct lockdep_map lockdep_map; 256 struct lockdep_map lockdep_map;
232#endif 257#endif
233 char name[]; /* I: workqueue name */ 258 char name[WQ_NAME_LEN]; /* I: workqueue name */
259
260 /* hot fields used during command issue, aligned to cacheline */
261 unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
262 struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
263 struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
234}; 264};
235 265
266static struct kmem_cache *pwq_cache;
267
268static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
269static cpumask_var_t *wq_numa_possible_cpumask;
270 /* possible CPUs of each node */
271
272static bool wq_disable_numa;
273module_param_named(disable_numa, wq_disable_numa, bool, 0444);
274
275static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
276
277/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
278static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
279
280static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
281static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
282
283static LIST_HEAD(workqueues); /* PL: list of all workqueues */
284static bool workqueue_freezing; /* PL: have wqs started freezing? */
285
286/* the per-cpu worker pools */
287static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
288 cpu_worker_pools);
289
290static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
291
292/* PL: hash of all unbound pools keyed by pool->attrs */
293static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
294
295/* I: attributes used when instantiating standard unbound pools on demand */
296static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
297
236struct workqueue_struct *system_wq __read_mostly; 298struct workqueue_struct *system_wq __read_mostly;
237EXPORT_SYMBOL_GPL(system_wq); 299EXPORT_SYMBOL_GPL(system_wq);
238struct workqueue_struct *system_highpri_wq __read_mostly; 300struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -244,64 +306,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq);
244struct workqueue_struct *system_freezable_wq __read_mostly; 306struct workqueue_struct *system_freezable_wq __read_mostly;
245EXPORT_SYMBOL_GPL(system_freezable_wq); 307EXPORT_SYMBOL_GPL(system_freezable_wq);
246 308
309static int worker_thread(void *__worker);
310static void copy_workqueue_attrs(struct workqueue_attrs *to,
311 const struct workqueue_attrs *from);
312
247#define CREATE_TRACE_POINTS 313#define CREATE_TRACE_POINTS
248#include <trace/events/workqueue.h> 314#include <trace/events/workqueue.h>
249 315
250#define for_each_std_worker_pool(pool, cpu) \ 316#define assert_rcu_or_pool_mutex() \
251 for ((pool) = &std_worker_pools(cpu)[0]; \ 317 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
252 (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) 318 lockdep_is_held(&wq_pool_mutex), \
319 "sched RCU or wq_pool_mutex should be held")
253 320
254#define for_each_busy_worker(worker, i, pool) \ 321#define assert_rcu_or_wq_mutex(wq) \
255 hash_for_each(pool->busy_hash, i, worker, hentry) 322 rcu_lockdep_assert(rcu_read_lock_sched_held() || \
323 lockdep_is_held(&wq->mutex), \
324 "sched RCU or wq->mutex should be held")
256 325
257static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, 326#ifdef CONFIG_LOCKDEP
258 unsigned int sw) 327#define assert_manager_or_pool_lock(pool) \
259{ 328 WARN_ONCE(debug_locks && \
260 if (cpu < nr_cpu_ids) { 329 !lockdep_is_held(&(pool)->manager_mutex) && \
261 if (sw & 1) { 330 !lockdep_is_held(&(pool)->lock), \
262 cpu = cpumask_next(cpu, mask); 331 "pool->manager_mutex or ->lock should be held")
263 if (cpu < nr_cpu_ids) 332#else
264 return cpu; 333#define assert_manager_or_pool_lock(pool) do { } while (0)
265 } 334#endif
266 if (sw & 2)
267 return WORK_CPU_UNBOUND;
268 }
269 return WORK_CPU_END;
270}
271 335
272static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, 336#define for_each_cpu_worker_pool(pool, cpu) \
273 struct workqueue_struct *wq) 337 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
274{ 338 (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
275 return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); 339 (pool)++)
276}
277 340
278/* 341/**
279 * CPU iterators 342 * for_each_pool - iterate through all worker_pools in the system
343 * @pool: iteration cursor
344 * @pi: integer used for iteration
280 * 345 *
281 * An extra cpu number is defined using an invalid cpu number 346 * This must be called either with wq_pool_mutex held or sched RCU read
282 * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any 347 * locked. If the pool needs to be used beyond the locking in effect, the
283 * specific CPU. The following iterators are similar to for_each_*_cpu() 348 * caller is responsible for guaranteeing that the pool stays online.
284 * iterators but also considers the unbound CPU.
285 * 349 *
286 * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND 350 * The if/else clause exists only for the lockdep assertion and can be
287 * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND 351 * ignored.
288 * for_each_pwq_cpu() : possible CPUs for bound workqueues,
289 * WORK_CPU_UNBOUND for unbound workqueues
290 */ 352 */
291#define for_each_wq_cpu(cpu) \ 353#define for_each_pool(pool, pi) \
292 for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ 354 idr_for_each_entry(&worker_pool_idr, pool, pi) \
293 (cpu) < WORK_CPU_END; \ 355 if (({ assert_rcu_or_pool_mutex(); false; })) { } \
294 (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) 356 else
295 357
296#define for_each_online_wq_cpu(cpu) \ 358/**
297 for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ 359 * for_each_pool_worker - iterate through all workers of a worker_pool
298 (cpu) < WORK_CPU_END; \ 360 * @worker: iteration cursor
299 (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) 361 * @wi: integer used for iteration
362 * @pool: worker_pool to iterate workers of
363 *
364 * This must be called with either @pool->manager_mutex or ->lock held.
365 *
366 * The if/else clause exists only for the lockdep assertion and can be
367 * ignored.
368 */
369#define for_each_pool_worker(worker, wi, pool) \
370 idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
371 if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
372 else
300 373
301#define for_each_pwq_cpu(cpu, wq) \ 374/**
302 for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ 375 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
303 (cpu) < WORK_CPU_END; \ 376 * @pwq: iteration cursor
304 (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) 377 * @wq: the target workqueue
378 *
379 * This must be called either with wq->mutex held or sched RCU read locked.
380 * If the pwq needs to be used beyond the locking in effect, the caller is
381 * responsible for guaranteeing that the pwq stays online.
382 *
383 * The if/else clause exists only for the lockdep assertion and can be
384 * ignored.
385 */
386#define for_each_pwq(pwq, wq) \
387 list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
388 if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
389 else
305 390
306#ifdef CONFIG_DEBUG_OBJECTS_WORK 391#ifdef CONFIG_DEBUG_OBJECTS_WORK
307 392
@@ -419,76 +504,35 @@ static inline void debug_work_activate(struct work_struct *work) { }
419static inline void debug_work_deactivate(struct work_struct *work) { } 504static inline void debug_work_deactivate(struct work_struct *work) { }
420#endif 505#endif
421 506
422/* Serializes the accesses to the list of workqueues. */
423static DEFINE_SPINLOCK(workqueue_lock);
424static LIST_HEAD(workqueues);
425static bool workqueue_freezing; /* W: have wqs started freezing? */
426
427/*
428 * The CPU and unbound standard worker pools. The unbound ones have
429 * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
430 */
431static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
432 cpu_std_worker_pools);
433static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
434
435/* idr of all pools */
436static DEFINE_MUTEX(worker_pool_idr_mutex);
437static DEFINE_IDR(worker_pool_idr);
438
439static int worker_thread(void *__worker);
440
441static struct worker_pool *std_worker_pools(int cpu)
442{
443 if (cpu != WORK_CPU_UNBOUND)
444 return per_cpu(cpu_std_worker_pools, cpu);
445 else
446 return unbound_std_worker_pools;
447}
448
449static int std_worker_pool_pri(struct worker_pool *pool)
450{
451 return pool - std_worker_pools(pool->cpu);
452}
453
454/* allocate ID and assign it to @pool */ 507/* allocate ID and assign it to @pool */
455static int worker_pool_assign_id(struct worker_pool *pool) 508static int worker_pool_assign_id(struct worker_pool *pool)
456{ 509{
457 int ret; 510 int ret;
458 511
459 mutex_lock(&worker_pool_idr_mutex); 512 lockdep_assert_held(&wq_pool_mutex);
460 idr_pre_get(&worker_pool_idr, GFP_KERNEL);
461 ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
462 mutex_unlock(&worker_pool_idr_mutex);
463 513
514 ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
515 if (ret >= 0) {
516 pool->id = ret;
517 return 0;
518 }
464 return ret; 519 return ret;
465} 520}
466 521
467/* 522/**
468 * Lookup worker_pool by id. The idr currently is built during boot and 523 * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
469 * never modified. Don't worry about locking for now. 524 * @wq: the target workqueue
525 * @node: the node ID
526 *
527 * This must be called either with pwq_lock held or sched RCU read locked.
528 * If the pwq needs to be used beyond the locking in effect, the caller is
529 * responsible for guaranteeing that the pwq stays online.
470 */ 530 */
471static struct worker_pool *worker_pool_by_id(int pool_id) 531static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
532 int node)
472{ 533{
473 return idr_find(&worker_pool_idr, pool_id); 534 assert_rcu_or_wq_mutex(wq);
474} 535 return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
475
476static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
477{
478 struct worker_pool *pools = std_worker_pools(cpu);
479
480 return &pools[highpri];
481}
482
483static struct pool_workqueue *get_pwq(unsigned int cpu,
484 struct workqueue_struct *wq)
485{
486 if (!(wq->flags & WQ_UNBOUND)) {
487 if (likely(cpu < nr_cpu_ids))
488 return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
489 } else if (likely(cpu == WORK_CPU_UNBOUND))
490 return wq->pool_wq.single;
491 return NULL;
492} 536}
493 537
494static unsigned int work_color_to_flags(int color) 538static unsigned int work_color_to_flags(int color)
@@ -530,7 +574,7 @@ static int work_next_color(int color)
530static inline void set_work_data(struct work_struct *work, unsigned long data, 574static inline void set_work_data(struct work_struct *work, unsigned long data,
531 unsigned long flags) 575 unsigned long flags)
532{ 576{
533 BUG_ON(!work_pending(work)); 577 WARN_ON_ONCE(!work_pending(work));
534 atomic_long_set(&work->data, data | flags | work_static(work)); 578 atomic_long_set(&work->data, data | flags | work_static(work));
535} 579}
536 580
@@ -582,13 +626,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
582 * @work: the work item of interest 626 * @work: the work item of interest
583 * 627 *
584 * Return the worker_pool @work was last associated with. %NULL if none. 628 * Return the worker_pool @work was last associated with. %NULL if none.
629 *
 630 * Pools are created and destroyed under wq_pool_mutex, and read access
 631 * is allowed under sched-RCU read lock. As such, this function should be
632 * called under wq_pool_mutex or with preemption disabled.
633 *
634 * All fields of the returned pool are accessible as long as the above
635 * mentioned locking is in effect. If the returned pool needs to be used
636 * beyond the critical section, the caller is responsible for ensuring the
637 * returned pool is and stays online.
585 */ 638 */
586static struct worker_pool *get_work_pool(struct work_struct *work) 639static struct worker_pool *get_work_pool(struct work_struct *work)
587{ 640{
588 unsigned long data = atomic_long_read(&work->data); 641 unsigned long data = atomic_long_read(&work->data);
589 struct worker_pool *pool;
590 int pool_id; 642 int pool_id;
591 643
644 assert_rcu_or_pool_mutex();
645
592 if (data & WORK_STRUCT_PWQ) 646 if (data & WORK_STRUCT_PWQ)
593 return ((struct pool_workqueue *) 647 return ((struct pool_workqueue *)
594 (data & WORK_STRUCT_WQ_DATA_MASK))->pool; 648 (data & WORK_STRUCT_WQ_DATA_MASK))->pool;
@@ -597,9 +651,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
597 if (pool_id == WORK_OFFQ_POOL_NONE) 651 if (pool_id == WORK_OFFQ_POOL_NONE)
598 return NULL; 652 return NULL;
599 653
600 pool = worker_pool_by_id(pool_id); 654 return idr_find(&worker_pool_idr, pool_id);
601 WARN_ON_ONCE(!pool);
602 return pool;
603} 655}
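
To make the contract above concrete, a caller that only needs to peek at the pool can rely on sched-RCU rather than wq_pool_mutex. Illustrative sketch only; report_last_pool() is a made-up name:

static void report_last_pool(struct work_struct *work)
{
	struct worker_pool *pool;

	rcu_read_lock_sched();		/* keeps the pool from being freed */
	pool = get_work_pool(work);
	if (pool)
		pr_info("work %p last ran on pool %d\n", work, pool->id);
	rcu_read_unlock_sched();	/* pool must not be used past this point */
}
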
604 656
605/** 657/**
@@ -688,7 +740,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
688/* Do we have too many workers and should some go away? */ 740/* Do we have too many workers and should some go away? */
689static bool too_many_workers(struct worker_pool *pool) 741static bool too_many_workers(struct worker_pool *pool)
690{ 742{
691 bool managing = pool->flags & POOL_MANAGING_WORKERS; 743 bool managing = mutex_is_locked(&pool->manager_arb);
692 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ 744 int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
693 int nr_busy = pool->nr_workers - nr_idle; 745 int nr_busy = pool->nr_workers - nr_idle;
694 746
@@ -743,7 +795,7 @@ static void wake_up_worker(struct worker_pool *pool)
743 * CONTEXT: 795 * CONTEXT:
744 * spin_lock_irq(rq->lock) 796 * spin_lock_irq(rq->lock)
745 */ 797 */
746void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) 798void wq_worker_waking_up(struct task_struct *task, int cpu)
747{ 799{
748 struct worker *worker = kthread_data(task); 800 struct worker *worker = kthread_data(task);
749 801
@@ -768,8 +820,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
768 * RETURNS: 820 * RETURNS:
769 * Worker task on @cpu to wake up, %NULL if none. 821 * Worker task on @cpu to wake up, %NULL if none.
770 */ 822 */
771struct task_struct *wq_worker_sleeping(struct task_struct *task, 823struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
772 unsigned int cpu)
773{ 824{
774 struct worker *worker = kthread_data(task), *to_wakeup = NULL; 825 struct worker *worker = kthread_data(task), *to_wakeup = NULL;
775 struct worker_pool *pool; 826 struct worker_pool *pool;
@@ -785,7 +836,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
785 pool = worker->pool; 836 pool = worker->pool;
786 837
787 /* this can only happen on the local cpu */ 838 /* this can only happen on the local cpu */
788 BUG_ON(cpu != raw_smp_processor_id()); 839 if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
840 return NULL;
789 841
790 /* 842 /*
791 * The counterpart of the following dec_and_test, implied mb, 843 * The counterpart of the following dec_and_test, implied mb,
@@ -890,13 +942,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
890 * recycled work item as currently executing and make it wait until the 942 * recycled work item as currently executing and make it wait until the
891 * current execution finishes, introducing an unwanted dependency. 943 * current execution finishes, introducing an unwanted dependency.
892 * 944 *
893 * This function checks the work item address, work function and workqueue 945 * This function checks the work item address and work function to avoid
894 * to avoid false positives. Note that this isn't complete as one may 946 * false positives. Note that this isn't complete as one may construct a
895 * construct a work function which can introduce dependency onto itself 947 * work function which can introduce dependency onto itself through a
896 * through a recycled work item. Well, if somebody wants to shoot oneself 948 * recycled work item. Well, if somebody wants to shoot oneself in the
897 * in the foot that badly, there's only so much we can do, and if such 949 * foot that badly, there's only so much we can do, and if such deadlock
898 * deadlock actually occurs, it should be easy to locate the culprit work 950 * actually occurs, it should be easy to locate the culprit work function.
899 * function.
900 * 951 *
901 * CONTEXT: 952 * CONTEXT:
902 * spin_lock_irq(pool->lock). 953 * spin_lock_irq(pool->lock).
@@ -960,6 +1011,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
960 *nextp = n; 1011 *nextp = n;
961} 1012}
962 1013
1014/**
1015 * get_pwq - get an extra reference on the specified pool_workqueue
1016 * @pwq: pool_workqueue to get
1017 *
1018 * Obtain an extra reference on @pwq. The caller should guarantee that
1019 * @pwq has positive refcnt and be holding the matching pool->lock.
1020 */
1021static void get_pwq(struct pool_workqueue *pwq)
1022{
1023 lockdep_assert_held(&pwq->pool->lock);
1024 WARN_ON_ONCE(pwq->refcnt <= 0);
1025 pwq->refcnt++;
1026}
1027
1028/**
1029 * put_pwq - put a pool_workqueue reference
1030 * @pwq: pool_workqueue to put
1031 *
1032 * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
1033 * destruction. The caller should be holding the matching pool->lock.
1034 */
1035static void put_pwq(struct pool_workqueue *pwq)
1036{
1037 lockdep_assert_held(&pwq->pool->lock);
1038 if (likely(--pwq->refcnt))
1039 return;
1040 if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
1041 return;
1042 /*
1043 * @pwq can't be released under pool->lock, bounce to
1044 * pwq_unbound_release_workfn(). This never recurses on the same
1045 * pool->lock as this path is taken only for unbound workqueues and
1046 * the release work item is scheduled on a per-cpu workqueue. To
1047 * avoid lockdep warning, unbound pool->locks are given lockdep
1048 * subclass of 1 in get_unbound_pool().
1049 */
1050 schedule_work(&pwq->unbound_release_work);
1051}
1052
1053/**
1054 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
1055 * @pwq: pool_workqueue to put (can be %NULL)
1056 *
1057 * put_pwq() with locking. This function also allows %NULL @pwq.
1058 */
1059static void put_pwq_unlocked(struct pool_workqueue *pwq)
1060{
1061 if (pwq) {
1062 /*
1063 * As both pwqs and pools are sched-RCU protected, the
1064 * following lock operations are safe.
1065 */
1066 spin_lock_irq(&pwq->pool->lock);
1067 put_pwq(pwq);
1068 spin_unlock_irq(&pwq->pool->lock);
1069 }
1070}
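
These three helpers give every queued work item a reference on its pool_workqueue: insert_work() takes one with get_pwq() after pool->lock is acquired, and pwq_dec_nr_in_flight() drops it once the item has run, so an unbound pwq can only be released when nothing is queued on or running from it. A condensed sketch of the pairing, assuming the helpers above (not a real code path by itself):

static void pwq_ref_pairing_sketch(struct pool_workqueue *pwq,
				   struct work_struct *work)
{
	spin_lock_irq(&pwq->pool->lock);
	get_pwq(pwq);				/* queued item pins the pwq */
	list_add_tail(&work->entry, &pwq->pool->worklist);
	spin_unlock_irq(&pwq->pool->lock);

	/* ... a worker picks the item up and runs it ... */

	spin_lock_irq(&pwq->pool->lock);
	/* drops the reference taken above via put_pwq() */
	pwq_dec_nr_in_flight(pwq, get_work_color(work));
	spin_unlock_irq(&pwq->pool->lock);
}
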
1071
963static void pwq_activate_delayed_work(struct work_struct *work) 1072static void pwq_activate_delayed_work(struct work_struct *work)
964{ 1073{
965 struct pool_workqueue *pwq = get_work_pwq(work); 1074 struct pool_workqueue *pwq = get_work_pwq(work);
@@ -991,9 +1100,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
991 */ 1100 */
992static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) 1101static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
993{ 1102{
994 /* ignore uncolored works */ 1103 /* uncolored work items don't participate in flushing or nr_active */
995 if (color == WORK_NO_COLOR) 1104 if (color == WORK_NO_COLOR)
996 return; 1105 goto out_put;
997 1106
998 pwq->nr_in_flight[color]--; 1107 pwq->nr_in_flight[color]--;
999 1108
@@ -1006,11 +1115,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1006 1115
1007 /* is flush in progress and are we at the flushing tip? */ 1116 /* is flush in progress and are we at the flushing tip? */
1008 if (likely(pwq->flush_color != color)) 1117 if (likely(pwq->flush_color != color))
1009 return; 1118 goto out_put;
1010 1119
1011 /* are there still in-flight works? */ 1120 /* are there still in-flight works? */
1012 if (pwq->nr_in_flight[color]) 1121 if (pwq->nr_in_flight[color])
1013 return; 1122 goto out_put;
1014 1123
1015 /* this pwq is done, clear flush_color */ 1124 /* this pwq is done, clear flush_color */
1016 pwq->flush_color = -1; 1125 pwq->flush_color = -1;
@@ -1021,6 +1130,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
1021 */ 1130 */
1022 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) 1131 if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
1023 complete(&pwq->wq->first_flusher->done); 1132 complete(&pwq->wq->first_flusher->done);
1133out_put:
1134 put_pwq(pwq);
1024} 1135}
1025 1136
1026/** 1137/**
@@ -1143,11 +1254,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
1143 /* we own @work, set data and link */ 1254 /* we own @work, set data and link */
1144 set_work_pwq(work, pwq, extra_flags); 1255 set_work_pwq(work, pwq, extra_flags);
1145 list_add_tail(&work->entry, head); 1256 list_add_tail(&work->entry, head);
1257 get_pwq(pwq);
1146 1258
1147 /* 1259 /*
1148 * Ensure either worker_sched_deactivated() sees the above 1260 * Ensure either wq_worker_sleeping() sees the above
1149 * list_add_tail() or we see zero nr_running to avoid workers 1261 * list_add_tail() or we see zero nr_running to avoid workers lying
1150 * lying around lazily while there are works to be processed. 1262 * around lazily while there are works to be processed.
1151 */ 1263 */
1152 smp_mb(); 1264 smp_mb();
1153 1265
@@ -1171,10 +1283,11 @@ static bool is_chained_work(struct workqueue_struct *wq)
1171 return worker && worker->current_pwq->wq == wq; 1283 return worker && worker->current_pwq->wq == wq;
1172} 1284}
1173 1285
1174static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, 1286static void __queue_work(int cpu, struct workqueue_struct *wq,
1175 struct work_struct *work) 1287 struct work_struct *work)
1176{ 1288{
1177 struct pool_workqueue *pwq; 1289 struct pool_workqueue *pwq;
1290 struct worker_pool *last_pool;
1178 struct list_head *worklist; 1291 struct list_head *worklist;
1179 unsigned int work_flags; 1292 unsigned int work_flags;
1180 unsigned int req_cpu = cpu; 1293 unsigned int req_cpu = cpu;
@@ -1190,48 +1303,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
1190 debug_work_activate(work); 1303 debug_work_activate(work);
1191 1304
1192 /* if dying, only works from the same workqueue are allowed */ 1305 /* if dying, only works from the same workqueue are allowed */
1193 if (unlikely(wq->flags & WQ_DRAINING) && 1306 if (unlikely(wq->flags & __WQ_DRAINING) &&
1194 WARN_ON_ONCE(!is_chained_work(wq))) 1307 WARN_ON_ONCE(!is_chained_work(wq)))
1195 return; 1308 return;
1309retry:
1310 if (req_cpu == WORK_CPU_UNBOUND)
1311 cpu = raw_smp_processor_id();
1196 1312
1197 /* determine the pwq to use */ 1313 /* pwq which will be used unless @work is executing elsewhere */
1198 if (!(wq->flags & WQ_UNBOUND)) { 1314 if (!(wq->flags & WQ_UNBOUND))
1199 struct worker_pool *last_pool; 1315 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
1200 1316 else
1201 if (cpu == WORK_CPU_UNBOUND) 1317 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
1202 cpu = raw_smp_processor_id();
1203
1204 /*
1205 * It's multi cpu. If @work was previously on a different
1206 * cpu, it might still be running there, in which case the
1207 * work needs to be queued on that cpu to guarantee
1208 * non-reentrancy.
1209 */
1210 pwq = get_pwq(cpu, wq);
1211 last_pool = get_work_pool(work);
1212 1318
1213 if (last_pool && last_pool != pwq->pool) { 1319 /*
1214 struct worker *worker; 1320 * If @work was previously on a different pool, it might still be
1321 * running there, in which case the work needs to be queued on that
1322 * pool to guarantee non-reentrancy.
1323 */
1324 last_pool = get_work_pool(work);
1325 if (last_pool && last_pool != pwq->pool) {
1326 struct worker *worker;
1215 1327
1216 spin_lock(&last_pool->lock); 1328 spin_lock(&last_pool->lock);
1217 1329
1218 worker = find_worker_executing_work(last_pool, work); 1330 worker = find_worker_executing_work(last_pool, work);
1219 1331
1220 if (worker && worker->current_pwq->wq == wq) { 1332 if (worker && worker->current_pwq->wq == wq) {
1221 pwq = get_pwq(last_pool->cpu, wq); 1333 pwq = worker->current_pwq;
1222 } else {
1223 /* meh... not running there, queue here */
1224 spin_unlock(&last_pool->lock);
1225 spin_lock(&pwq->pool->lock);
1226 }
1227 } else { 1334 } else {
1335 /* meh... not running there, queue here */
1336 spin_unlock(&last_pool->lock);
1228 spin_lock(&pwq->pool->lock); 1337 spin_lock(&pwq->pool->lock);
1229 } 1338 }
1230 } else { 1339 } else {
1231 pwq = get_pwq(WORK_CPU_UNBOUND, wq);
1232 spin_lock(&pwq->pool->lock); 1340 spin_lock(&pwq->pool->lock);
1233 } 1341 }
1234 1342
1343 /*
1344 * pwq is determined and locked. For unbound pools, we could have
1345 * raced with pwq release and it could already be dead. If its
 1346 * refcnt is zero, repeat pwq selection. Note that a pwq never dies
 1347 * without another pwq replacing it in the numa_pwq_tbl or while
 1348 * work items are executing on it, so the retrying is guaranteed to
 1349 * make forward progress.
1350 */
1351 if (unlikely(!pwq->refcnt)) {
1352 if (wq->flags & WQ_UNBOUND) {
1353 spin_unlock(&pwq->pool->lock);
1354 cpu_relax();
1355 goto retry;
1356 }
1357 /* oops */
1358 WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
1359 wq->name, cpu);
1360 }
1361
1235 /* pwq determined, queue */ 1362 /* pwq determined, queue */
1236 trace_workqueue_queue_work(req_cpu, pwq, work); 1363 trace_workqueue_queue_work(req_cpu, pwq, work);
1237 1364
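
Stripped to its control flow, the rewritten queueing path is an optimistic retry loop: pick a pwq without any wq-wide lock, grab its pool->lock, and if the pwq turned out to be dying (zero refcnt, which can only happen for unbound workqueues) back off and pick again. A reduced sketch; select_pwq() is a made-up stand-in for the per-cpu/NUMA lookup shown above:

retry:
	pwq = select_pwq(wq, cpu);		/* cpu_pwqs or numa_pwq_tbl[] */
	spin_lock(&pwq->pool->lock);
	if (unlikely(!pwq->refcnt) && (wq->flags & WQ_UNBOUND)) {
		spin_unlock(&pwq->pool->lock);
		cpu_relax();
		goto retry;		/* a replacement pwq is guaranteed to appear */
	}
	insert_work(pwq, work, worklist, work_flags);
	spin_unlock(&pwq->pool->lock);
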
@@ -1286,22 +1413,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
1286} 1413}
1287EXPORT_SYMBOL_GPL(queue_work_on); 1414EXPORT_SYMBOL_GPL(queue_work_on);
1288 1415
1289/**
1290 * queue_work - queue work on a workqueue
1291 * @wq: workqueue to use
1292 * @work: work to queue
1293 *
1294 * Returns %false if @work was already on a queue, %true otherwise.
1295 *
1296 * We queue the work to the CPU on which it was submitted, but if the CPU dies
1297 * it can be processed by another CPU.
1298 */
1299bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
1300{
1301 return queue_work_on(WORK_CPU_UNBOUND, wq, work);
1302}
1303EXPORT_SYMBOL_GPL(queue_work);
1304
1305void delayed_work_timer_fn(unsigned long __data) 1416void delayed_work_timer_fn(unsigned long __data)
1306{ 1417{
1307 struct delayed_work *dwork = (struct delayed_work *)__data; 1418 struct delayed_work *dwork = (struct delayed_work *)__data;
@@ -1377,21 +1488,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
1377EXPORT_SYMBOL_GPL(queue_delayed_work_on); 1488EXPORT_SYMBOL_GPL(queue_delayed_work_on);
1378 1489
1379/** 1490/**
1380 * queue_delayed_work - queue work on a workqueue after delay
1381 * @wq: workqueue to use
1382 * @dwork: delayable work to queue
1383 * @delay: number of jiffies to wait before queueing
1384 *
1385 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
1386 */
1387bool queue_delayed_work(struct workqueue_struct *wq,
1388 struct delayed_work *dwork, unsigned long delay)
1389{
1390 return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1391}
1392EXPORT_SYMBOL_GPL(queue_delayed_work);
1393
1394/**
1395 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU 1491 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
1396 * @cpu: CPU number to execute work on 1492 * @cpu: CPU number to execute work on
1397 * @wq: workqueue to use 1493 * @wq: workqueue to use
@@ -1430,21 +1526,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
1430EXPORT_SYMBOL_GPL(mod_delayed_work_on); 1526EXPORT_SYMBOL_GPL(mod_delayed_work_on);
1431 1527
1432/** 1528/**
1433 * mod_delayed_work - modify delay of or queue a delayed work
1434 * @wq: workqueue to use
1435 * @dwork: work to queue
1436 * @delay: number of jiffies to wait before queueing
1437 *
1438 * mod_delayed_work_on() on local CPU.
1439 */
1440bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
1441 unsigned long delay)
1442{
1443 return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
1444}
1445EXPORT_SYMBOL_GPL(mod_delayed_work);
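
queue_work(), queue_delayed_work() and mod_delayed_work() disappear from workqueue.c because they are one-line wrappers around their _on() variants with WORK_CPU_UNBOUND; presumably they live on as static inlines in include/linux/workqueue.h (the header change is not shown here). Their equivalents are simply:

static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

static inline bool queue_delayed_work(struct workqueue_struct *wq,
				      struct delayed_work *dwork,
				      unsigned long delay)
{
	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}

static inline bool mod_delayed_work(struct workqueue_struct *wq,
				    struct delayed_work *dwork,
				    unsigned long delay)
{
	return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}
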
1446
1447/**
1448 * worker_enter_idle - enter idle state 1529 * worker_enter_idle - enter idle state
1449 * @worker: worker which is entering idle state 1530 * @worker: worker which is entering idle state
1450 * 1531 *
@@ -1458,9 +1539,10 @@ static void worker_enter_idle(struct worker *worker)
1458{ 1539{
1459 struct worker_pool *pool = worker->pool; 1540 struct worker_pool *pool = worker->pool;
1460 1541
1461 BUG_ON(worker->flags & WORKER_IDLE); 1542 if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
1462 BUG_ON(!list_empty(&worker->entry) && 1543 WARN_ON_ONCE(!list_empty(&worker->entry) &&
1463 (worker->hentry.next || worker->hentry.pprev)); 1544 (worker->hentry.next || worker->hentry.pprev)))
1545 return;
1464 1546
1465 /* can't use worker_set_flags(), also called from start_worker() */ 1547 /* can't use worker_set_flags(), also called from start_worker() */
1466 worker->flags |= WORKER_IDLE; 1548 worker->flags |= WORKER_IDLE;
@@ -1497,22 +1579,25 @@ static void worker_leave_idle(struct worker *worker)
1497{ 1579{
1498 struct worker_pool *pool = worker->pool; 1580 struct worker_pool *pool = worker->pool;
1499 1581
1500 BUG_ON(!(worker->flags & WORKER_IDLE)); 1582 if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
1583 return;
1501 worker_clr_flags(worker, WORKER_IDLE); 1584 worker_clr_flags(worker, WORKER_IDLE);
1502 pool->nr_idle--; 1585 pool->nr_idle--;
1503 list_del_init(&worker->entry); 1586 list_del_init(&worker->entry);
1504} 1587}
1505 1588
1506/** 1589/**
1507 * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool 1590 * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
1508 * @worker: self 1591 * @pool: target worker_pool
1592 *
1593 * Bind %current to the cpu of @pool if it is associated and lock @pool.
1509 * 1594 *
1510 * Works which are scheduled while the cpu is online must at least be 1595 * Works which are scheduled while the cpu is online must at least be
1511 * scheduled to a worker which is bound to the cpu so that if they are 1596 * scheduled to a worker which is bound to the cpu so that if they are
1512 * flushed from cpu callbacks while cpu is going down, they are 1597 * flushed from cpu callbacks while cpu is going down, they are
1513 * guaranteed to execute on the cpu. 1598 * guaranteed to execute on the cpu.
1514 * 1599 *
1515 * This function is to be used by rogue workers and rescuers to bind 1600 * This function is to be used by unbound workers and rescuers to bind
1516 * themselves to the target cpu and may race with cpu going down or 1601 * themselves to the target cpu and may race with cpu going down or
1517 * coming online. kthread_bind() can't be used because it may put the 1602 * coming online. kthread_bind() can't be used because it may put the
1518 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used 1603 * worker to already dead cpu and set_cpus_allowed_ptr() can't be used
@@ -1533,12 +1618,9 @@ static void worker_leave_idle(struct worker *worker)
1533 * %true if the associated pool is online (@worker is successfully 1618 * %true if the associated pool is online (@worker is successfully
1534 * bound), %false if offline. 1619 * bound), %false if offline.
1535 */ 1620 */
1536static bool worker_maybe_bind_and_lock(struct worker *worker) 1621static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
1537__acquires(&pool->lock) 1622__acquires(&pool->lock)
1538{ 1623{
1539 struct worker_pool *pool = worker->pool;
1540 struct task_struct *task = worker->task;
1541
1542 while (true) { 1624 while (true) {
1543 /* 1625 /*
1544 * The following call may fail, succeed or succeed 1626 * The following call may fail, succeed or succeed
@@ -1547,14 +1629,13 @@ __acquires(&pool->lock)
1547 * against POOL_DISASSOCIATED. 1629 * against POOL_DISASSOCIATED.
1548 */ 1630 */
1549 if (!(pool->flags & POOL_DISASSOCIATED)) 1631 if (!(pool->flags & POOL_DISASSOCIATED))
1550 set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); 1632 set_cpus_allowed_ptr(current, pool->attrs->cpumask);
1551 1633
1552 spin_lock_irq(&pool->lock); 1634 spin_lock_irq(&pool->lock);
1553 if (pool->flags & POOL_DISASSOCIATED) 1635 if (pool->flags & POOL_DISASSOCIATED)
1554 return false; 1636 return false;
1555 if (task_cpu(task) == pool->cpu && 1637 if (task_cpu(current) == pool->cpu &&
1556 cpumask_equal(&current->cpus_allowed, 1638 cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
1557 get_cpu_mask(pool->cpu)))
1558 return true; 1639 return true;
1559 spin_unlock_irq(&pool->lock); 1640 spin_unlock_irq(&pool->lock);
1560 1641
@@ -1569,108 +1650,6 @@ __acquires(&pool->lock)
1569 } 1650 }
1570} 1651}
1571 1652
1572/*
1573 * Rebind an idle @worker to its CPU. worker_thread() will test
1574 * list_empty(@worker->entry) before leaving idle and call this function.
1575 */
1576static void idle_worker_rebind(struct worker *worker)
1577{
1578 /* CPU may go down again inbetween, clear UNBOUND only on success */
1579 if (worker_maybe_bind_and_lock(worker))
1580 worker_clr_flags(worker, WORKER_UNBOUND);
1581
1582 /* rebind complete, become available again */
1583 list_add(&worker->entry, &worker->pool->idle_list);
1584 spin_unlock_irq(&worker->pool->lock);
1585}
1586
1587/*
1588 * Function for @worker->rebind.work used to rebind unbound busy workers to
1589 * the associated cpu which is coming back online. This is scheduled by
1590 * cpu up but can race with other cpu hotplug operations and may be
1591 * executed twice without intervening cpu down.
1592 */
1593static void busy_worker_rebind_fn(struct work_struct *work)
1594{
1595 struct worker *worker = container_of(work, struct worker, rebind_work);
1596
1597 if (worker_maybe_bind_and_lock(worker))
1598 worker_clr_flags(worker, WORKER_UNBOUND);
1599
1600 spin_unlock_irq(&worker->pool->lock);
1601}
1602
1603/**
1604 * rebind_workers - rebind all workers of a pool to the associated CPU
1605 * @pool: pool of interest
1606 *
1607 * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
1608 * is different for idle and busy ones.
1609 *
1610 * Idle ones will be removed from the idle_list and woken up. They will
1611 * add themselves back after completing rebind. This ensures that the
1612 * idle_list doesn't contain any unbound workers when re-bound busy workers
1613 * try to perform local wake-ups for concurrency management.
1614 *
1615 * Busy workers can rebind after they finish their current work items.
1616 * Queueing the rebind work item at the head of the scheduled list is
1617 * enough. Note that nr_running will be properly bumped as busy workers
1618 * rebind.
1619 *
1620 * On return, all non-manager workers are scheduled for rebind - see
1621 * manage_workers() for the manager special case. Any idle worker
1622 * including the manager will not appear on @idle_list until rebind is
1623 * complete, making local wake-ups safe.
1624 */
1625static void rebind_workers(struct worker_pool *pool)
1626{
1627 struct worker *worker, *n;
1628 int i;
1629
1630 lockdep_assert_held(&pool->assoc_mutex);
1631 lockdep_assert_held(&pool->lock);
1632
1633 /* dequeue and kick idle ones */
1634 list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
1635 /*
1636 * idle workers should be off @pool->idle_list until rebind
1637 * is complete to avoid receiving premature local wake-ups.
1638 */
1639 list_del_init(&worker->entry);
1640
1641 /*
1642 * worker_thread() will see the above dequeuing and call
1643 * idle_worker_rebind().
1644 */
1645 wake_up_process(worker->task);
1646 }
1647
1648 /* rebind busy workers */
1649 for_each_busy_worker(worker, i, pool) {
1650 struct work_struct *rebind_work = &worker->rebind_work;
1651 struct workqueue_struct *wq;
1652
1653 if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
1654 work_data_bits(rebind_work)))
1655 continue;
1656
1657 debug_work_activate(rebind_work);
1658
1659 /*
1660 * wq doesn't really matter but let's keep @worker->pool
1661 * and @pwq->pool consistent for sanity.
1662 */
1663 if (std_worker_pool_pri(worker->pool))
1664 wq = system_highpri_wq;
1665 else
1666 wq = system_wq;
1667
1668 insert_work(get_pwq(pool->cpu, wq), rebind_work,
1669 worker->scheduled.next,
1670 work_color_to_flags(WORK_NO_COLOR));
1671 }
1672}
1673
1674static struct worker *alloc_worker(void) 1653static struct worker *alloc_worker(void)
1675{ 1654{
1676 struct worker *worker; 1655 struct worker *worker;
@@ -1679,7 +1658,6 @@ static struct worker *alloc_worker(void)
1679 if (worker) { 1658 if (worker) {
1680 INIT_LIST_HEAD(&worker->entry); 1659 INIT_LIST_HEAD(&worker->entry);
1681 INIT_LIST_HEAD(&worker->scheduled); 1660 INIT_LIST_HEAD(&worker->scheduled);
1682 INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
1683 /* on creation a worker is in !idle && prep state */ 1661 /* on creation a worker is in !idle && prep state */
1684 worker->flags = WORKER_PREP; 1662 worker->flags = WORKER_PREP;
1685 } 1663 }
@@ -1702,18 +1680,25 @@ static struct worker *alloc_worker(void)
1702 */ 1680 */
1703static struct worker *create_worker(struct worker_pool *pool) 1681static struct worker *create_worker(struct worker_pool *pool)
1704{ 1682{
1705 const char *pri = std_worker_pool_pri(pool) ? "H" : "";
1706 struct worker *worker = NULL; 1683 struct worker *worker = NULL;
1707 int id = -1; 1684 int id = -1;
1685 char id_buf[16];
1686
1687 lockdep_assert_held(&pool->manager_mutex);
1708 1688
1689 /*
1690 * ID is needed to determine kthread name. Allocate ID first
1691 * without installing the pointer.
1692 */
1693 idr_preload(GFP_KERNEL);
1709 spin_lock_irq(&pool->lock); 1694 spin_lock_irq(&pool->lock);
1710 while (ida_get_new(&pool->worker_ida, &id)) { 1695
1711 spin_unlock_irq(&pool->lock); 1696 id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
1712 if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) 1697
1713 goto fail;
1714 spin_lock_irq(&pool->lock);
1715 }
1716 spin_unlock_irq(&pool->lock); 1698 spin_unlock_irq(&pool->lock);
1699 idr_preload_end();
1700 if (id < 0)
1701 goto fail;
1717 1702
1718 worker = alloc_worker(); 1703 worker = alloc_worker();
1719 if (!worker) 1704 if (!worker)
@@ -1722,40 +1707,46 @@ static struct worker *create_worker(struct worker_pool *pool)
1722 worker->pool = pool; 1707 worker->pool = pool;
1723 worker->id = id; 1708 worker->id = id;
1724 1709
1725 if (pool->cpu != WORK_CPU_UNBOUND) 1710 if (pool->cpu >= 0)
1726 worker->task = kthread_create_on_node(worker_thread, 1711 snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
1727 worker, cpu_to_node(pool->cpu), 1712 pool->attrs->nice < 0 ? "H" : "");
1728 "kworker/%u:%d%s", pool->cpu, id, pri);
1729 else 1713 else
1730 worker->task = kthread_create(worker_thread, worker, 1714 snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
1731 "kworker/u:%d%s", id, pri); 1715
1716 worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
1717 "kworker/%s", id_buf);
1732 if (IS_ERR(worker->task)) 1718 if (IS_ERR(worker->task))
1733 goto fail; 1719 goto fail;
1734 1720
1735 if (std_worker_pool_pri(pool)) 1721 /*
1736 set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); 1722 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
1723 * online CPUs. It'll be re-applied when any of the CPUs come up.
1724 */
1725 set_user_nice(worker->task, pool->attrs->nice);
1726 set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
1727
1728 /* prevent userland from meddling with cpumask of workqueue workers */
1729 worker->task->flags |= PF_NO_SETAFFINITY;
1737 1730
1738 /* 1731 /*
1739 * Determine CPU binding of the new worker depending on 1732 * The caller is responsible for ensuring %POOL_DISASSOCIATED
1740 * %POOL_DISASSOCIATED. The caller is responsible for ensuring the 1733 * remains stable across this function. See the comments above the
1741 * flag remains stable across this function. See the comments 1734 * flag definition for details.
1742 * above the flag definition for details.
1743 *
1744 * As an unbound worker may later become a regular one if CPU comes
1745 * online, make sure every worker has %PF_THREAD_BOUND set.
1746 */ 1735 */
1747 if (!(pool->flags & POOL_DISASSOCIATED)) { 1736 if (pool->flags & POOL_DISASSOCIATED)
1748 kthread_bind(worker->task, pool->cpu);
1749 } else {
1750 worker->task->flags |= PF_THREAD_BOUND;
1751 worker->flags |= WORKER_UNBOUND; 1737 worker->flags |= WORKER_UNBOUND;
1752 } 1738
1739 /* successful, commit the pointer to idr */
1740 spin_lock_irq(&pool->lock);
1741 idr_replace(&pool->worker_idr, worker, worker->id);
1742 spin_unlock_irq(&pool->lock);
1753 1743
1754 return worker; 1744 return worker;
1745
1755fail: 1746fail:
1756 if (id >= 0) { 1747 if (id >= 0) {
1757 spin_lock_irq(&pool->lock); 1748 spin_lock_irq(&pool->lock);
1758 ida_remove(&pool->worker_ida, id); 1749 idr_remove(&pool->worker_idr, id);
1759 spin_unlock_irq(&pool->lock); 1750 spin_unlock_irq(&pool->lock);
1760 } 1751 }
1761 kfree(worker); 1752 kfree(worker);
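
The ID handling above is the standard idiom for allocating from an idr while holding a spinlock: idr_preload(GFP_KERNEL) charges the per-cpu preload cache while sleeping is still allowed, idr_alloc(..., GFP_NOWAIT) then reserves the ID under pool->lock with a NULL placeholder, and idr_replace() publishes the real pointer only once the worker is fully constructed. The same three-step pattern in generic form; struct foo, foo_lock and foo_idr are made-up names:

static int foo_register(struct foo *f)
{
	int id;

	idr_preload(GFP_KERNEL);		/* may sleep, fills preload cache */
	spin_lock_irq(&foo_lock);
	id = idr_alloc(&foo_idr, NULL, 0, 0, GFP_NOWAIT);	/* reserve ID */
	spin_unlock_irq(&foo_lock);
	idr_preload_end();
	if (id < 0)
		return id;

	f->id = id;				/* finish setup outside the lock */

	spin_lock_irq(&foo_lock);
	idr_replace(&foo_idr, f, id);		/* commit the real pointer */
	spin_unlock_irq(&foo_lock);
	return 0;
}
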
@@ -1780,6 +1771,30 @@ static void start_worker(struct worker *worker)
1780} 1771}
1781 1772
1782/** 1773/**
1774 * create_and_start_worker - create and start a worker for a pool
1775 * @pool: the target pool
1776 *
1777 * Grab the managership of @pool and create and start a new worker for it.
1778 */
1779static int create_and_start_worker(struct worker_pool *pool)
1780{
1781 struct worker *worker;
1782
1783 mutex_lock(&pool->manager_mutex);
1784
1785 worker = create_worker(pool);
1786 if (worker) {
1787 spin_lock_irq(&pool->lock);
1788 start_worker(worker);
1789 spin_unlock_irq(&pool->lock);
1790 }
1791
1792 mutex_unlock(&pool->manager_mutex);
1793
1794 return worker ? 0 : -ENOMEM;
1795}
1796
1797/**
1783 * destroy_worker - destroy a workqueue worker 1798 * destroy_worker - destroy a workqueue worker
1784 * @worker: worker to be destroyed 1799 * @worker: worker to be destroyed
1785 * 1800 *
@@ -1791,11 +1806,14 @@ static void start_worker(struct worker *worker)
1791static void destroy_worker(struct worker *worker) 1806static void destroy_worker(struct worker *worker)
1792{ 1807{
1793 struct worker_pool *pool = worker->pool; 1808 struct worker_pool *pool = worker->pool;
1794 int id = worker->id; 1809
1810 lockdep_assert_held(&pool->manager_mutex);
1811 lockdep_assert_held(&pool->lock);
1795 1812
1796 /* sanity check frenzy */ 1813 /* sanity check frenzy */
1797 BUG_ON(worker->current_work); 1814 if (WARN_ON(worker->current_work) ||
1798 BUG_ON(!list_empty(&worker->scheduled)); 1815 WARN_ON(!list_empty(&worker->scheduled)))
1816 return;
1799 1817
1800 if (worker->flags & WORKER_STARTED) 1818 if (worker->flags & WORKER_STARTED)
1801 pool->nr_workers--; 1819 pool->nr_workers--;
@@ -1805,13 +1823,14 @@ static void destroy_worker(struct worker *worker)
1805 list_del_init(&worker->entry); 1823 list_del_init(&worker->entry);
1806 worker->flags |= WORKER_DIE; 1824 worker->flags |= WORKER_DIE;
1807 1825
1826 idr_remove(&pool->worker_idr, worker->id);
1827
1808 spin_unlock_irq(&pool->lock); 1828 spin_unlock_irq(&pool->lock);
1809 1829
1810 kthread_stop(worker->task); 1830 kthread_stop(worker->task);
1811 kfree(worker); 1831 kfree(worker);
1812 1832
1813 spin_lock_irq(&pool->lock); 1833 spin_lock_irq(&pool->lock);
1814 ida_remove(&pool->worker_ida, id);
1815} 1834}
1816 1835
1817static void idle_worker_timeout(unsigned long __pool) 1836static void idle_worker_timeout(unsigned long __pool)
@@ -1840,23 +1859,21 @@ static void idle_worker_timeout(unsigned long __pool)
1840 spin_unlock_irq(&pool->lock); 1859 spin_unlock_irq(&pool->lock);
1841} 1860}
1842 1861
1843static bool send_mayday(struct work_struct *work) 1862static void send_mayday(struct work_struct *work)
1844{ 1863{
1845 struct pool_workqueue *pwq = get_work_pwq(work); 1864 struct pool_workqueue *pwq = get_work_pwq(work);
1846 struct workqueue_struct *wq = pwq->wq; 1865 struct workqueue_struct *wq = pwq->wq;
1847 unsigned int cpu;
1848 1866
1849 if (!(wq->flags & WQ_RESCUER)) 1867 lockdep_assert_held(&wq_mayday_lock);
1850 return false; 1868
1869 if (!wq->rescuer)
1870 return;
1851 1871
1852 /* mayday mayday mayday */ 1872 /* mayday mayday mayday */
1853 cpu = pwq->pool->cpu; 1873 if (list_empty(&pwq->mayday_node)) {
1854 /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ 1874 list_add_tail(&pwq->mayday_node, &wq->maydays);
1855 if (cpu == WORK_CPU_UNBOUND)
1856 cpu = 0;
1857 if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
1858 wake_up_process(wq->rescuer->task); 1875 wake_up_process(wq->rescuer->task);
1859 return true; 1876 }
1860} 1877}
1861 1878
1862static void pool_mayday_timeout(unsigned long __pool) 1879static void pool_mayday_timeout(unsigned long __pool)
@@ -1864,7 +1881,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1864 struct worker_pool *pool = (void *)__pool; 1881 struct worker_pool *pool = (void *)__pool;
1865 struct work_struct *work; 1882 struct work_struct *work;
1866 1883
1867 spin_lock_irq(&pool->lock); 1884 spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
1885 spin_lock(&pool->lock);
1868 1886
1869 if (need_to_create_worker(pool)) { 1887 if (need_to_create_worker(pool)) {
1870 /* 1888 /*
@@ -1877,7 +1895,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1877 send_mayday(work); 1895 send_mayday(work);
1878 } 1896 }
1879 1897
1880 spin_unlock_irq(&pool->lock); 1898 spin_unlock(&pool->lock);
1899 spin_unlock_irq(&wq_mayday_lock);
1881 1900
1882 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); 1901 mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
1883} 1902}
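
The per-cpu mayday_mask is gone: a starving pwq now puts itself on wq->maydays under wq_mayday_lock, and the list_empty() test keeps it from being queued twice while an earlier mayday is still pending (the rescuer re-arms it with list_del_init()). The same queue-once idiom in isolation, with made-up names (struct node, pending_list, queue_lock, consumer_task):

static void request_help(struct node *n)
{
	spin_lock(&queue_lock);
	/* list_empty() works because the consumer uses list_del_init() */
	if (list_empty(&n->link)) {
		list_add_tail(&n->link, &pending_list);
		wake_up_process(consumer_task);
	}
	spin_unlock(&queue_lock);
}
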
@@ -1892,8 +1911,8 @@ static void pool_mayday_timeout(unsigned long __pool)
1892 * sent to all rescuers with works scheduled on @pool to resolve 1911 * sent to all rescuers with works scheduled on @pool to resolve
1893 * possible allocation deadlock. 1912 * possible allocation deadlock.
1894 * 1913 *
1895 * On return, need_to_create_worker() is guaranteed to be false and 1914 * On return, need_to_create_worker() is guaranteed to be %false and
1896 * may_start_working() true. 1915 * may_start_working() %true.
1897 * 1916 *
1898 * LOCKING: 1917 * LOCKING:
1899 * spin_lock_irq(pool->lock) which may be released and regrabbed 1918 * spin_lock_irq(pool->lock) which may be released and regrabbed
@@ -1901,7 +1920,7 @@ static void pool_mayday_timeout(unsigned long __pool)
1901 * manager. 1920 * manager.
1902 * 1921 *
1903 * RETURNS: 1922 * RETURNS:
1904 * false if no action was taken and pool->lock stayed locked, true 1923 * %false if no action was taken and pool->lock stayed locked, %true
1905 * otherwise. 1924 * otherwise.
1906 */ 1925 */
1907static bool maybe_create_worker(struct worker_pool *pool) 1926static bool maybe_create_worker(struct worker_pool *pool)
@@ -1924,7 +1943,8 @@ restart:
1924 del_timer_sync(&pool->mayday_timer); 1943 del_timer_sync(&pool->mayday_timer);
1925 spin_lock_irq(&pool->lock); 1944 spin_lock_irq(&pool->lock);
1926 start_worker(worker); 1945 start_worker(worker);
1927 BUG_ON(need_to_create_worker(pool)); 1946 if (WARN_ON_ONCE(need_to_create_worker(pool)))
1947 goto restart;
1928 return true; 1948 return true;
1929 } 1949 }
1930 1950
@@ -1957,7 +1977,7 @@ restart:
1957 * multiple times. Called only from manager. 1977 * multiple times. Called only from manager.
1958 * 1978 *
1959 * RETURNS: 1979 * RETURNS:
1960 * false if no action was taken and pool->lock stayed locked, true 1980 * %false if no action was taken and pool->lock stayed locked, %true
1961 * otherwise. 1981 * otherwise.
1962 */ 1982 */
1963static bool maybe_destroy_workers(struct worker_pool *pool) 1983static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2008,42 +2028,37 @@ static bool manage_workers(struct worker *worker)
2008 struct worker_pool *pool = worker->pool; 2028 struct worker_pool *pool = worker->pool;
2009 bool ret = false; 2029 bool ret = false;
2010 2030
2011 if (pool->flags & POOL_MANAGING_WORKERS) 2031 /*
2032 * Managership is governed by two mutexes - manager_arb and
2033 * manager_mutex. manager_arb handles arbitration of manager role.
2034 * Anyone who successfully grabs manager_arb wins the arbitration
2035 * and becomes the manager. mutex_trylock() on pool->manager_arb
2036 * failure while holding pool->lock reliably indicates that someone
2037 * else is managing the pool and the worker which failed trylock
2038 * can proceed to executing work items. This means that anyone
2039 * grabbing manager_arb is responsible for actually performing
2040 * manager duties. If manager_arb is grabbed and released without
2041 * actual management, the pool may stall indefinitely.
2042 *
2043 * manager_mutex is used for exclusion of actual management
 2044 * operations. The holder of manager_mutex can be sure that no
 2045 * management operations, including creation and destruction of
 2046 * workers, will take place until the mutex is released. Because
 2047 * manager_mutex doesn't interfere with manager role arbitration,
 2048 * it is guaranteed that the pool's management, while it may be
2049 * delayed, won't be disturbed by someone else grabbing
2050 * manager_mutex.
2051 */
2052 if (!mutex_trylock(&pool->manager_arb))
2012 return ret; 2053 return ret;
2013 2054
2014 pool->flags |= POOL_MANAGING_WORKERS;
2015
2016 /* 2055 /*
2017 * To simplify both worker management and CPU hotplug, hold off 2056 * With manager arbitration won, manager_mutex would be free in
2018 * management while hotplug is in progress. CPU hotplug path can't 2057 * most cases. trylock first without dropping @pool->lock.
2019 * grab %POOL_MANAGING_WORKERS to achieve this because that can
2020 * lead to idle worker depletion (all become busy thinking someone
2021 * else is managing) which in turn can result in deadlock under
2022 * extreme circumstances. Use @pool->assoc_mutex to synchronize
2023 * manager against CPU hotplug.
2024 *
2025 * assoc_mutex would always be free unless CPU hotplug is in
2026 * progress. trylock first without dropping @pool->lock.
2027 */ 2058 */
2028 if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { 2059 if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
2029 spin_unlock_irq(&pool->lock); 2060 spin_unlock_irq(&pool->lock);
2030 mutex_lock(&pool->assoc_mutex); 2061 mutex_lock(&pool->manager_mutex);
2031 /*
2032 * CPU hotplug could have happened while we were waiting
2033 * for assoc_mutex. Hotplug itself can't handle us
2034 * because manager isn't either on idle or busy list, and
2035 * @pool's state and ours could have deviated.
2036 *
2037 * As hotplug is now excluded via assoc_mutex, we can
2038 * simply try to bind. It will succeed or fail depending
2039 * on @pool's current state. Try it and adjust
2040 * %WORKER_UNBOUND accordingly.
2041 */
2042 if (worker_maybe_bind_and_lock(worker))
2043 worker->flags &= ~WORKER_UNBOUND;
2044 else
2045 worker->flags |= WORKER_UNBOUND;
2046
2047 ret = true; 2062 ret = true;
2048 } 2063 }
2049 2064
@@ -2056,8 +2071,8 @@ static bool manage_workers(struct worker *worker)
2056 ret |= maybe_destroy_workers(pool); 2071 ret |= maybe_destroy_workers(pool);
2057 ret |= maybe_create_worker(pool); 2072 ret |= maybe_create_worker(pool);
2058 2073
2059 pool->flags &= ~POOL_MANAGING_WORKERS; 2074 mutex_unlock(&pool->manager_mutex);
2060 mutex_unlock(&pool->assoc_mutex); 2075 mutex_unlock(&pool->manager_arb);
2061 return ret; 2076 return ret;
2062} 2077}
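
The long comment above boils down to a split between role arbitration and operation exclusion: a failed mutex_trylock(&pool->manager_arb) under pool->lock means somebody else is already managing and this worker can go back to executing work items, while manager_mutex is what actually excludes concurrent management (worker creation/destruction, CPU hotplug). A generic sketch of the two-mutex shape, ignoring the pool->lock juggling shown above; foo_pool and foo_manage() are made up:

static bool foo_try_manage(struct foo_pool *pool)
{
	if (!mutex_trylock(&pool->manager_arb))
		return false;			/* somebody else won the role */

	mutex_lock(&pool->manager_mutex);	/* exclude other management ops */
	foo_manage(pool);			/* e.g. create/destroy workers */
	mutex_unlock(&pool->manager_mutex);

	mutex_unlock(&pool->manager_arb);
	return true;
}
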
2063 2078
@@ -2183,6 +2198,7 @@ __acquires(&pool->lock)
2183 worker->current_work = NULL; 2198 worker->current_work = NULL;
2184 worker->current_func = NULL; 2199 worker->current_func = NULL;
2185 worker->current_pwq = NULL; 2200 worker->current_pwq = NULL;
2201 worker->desc_valid = false;
2186 pwq_dec_nr_in_flight(pwq, work_color); 2202 pwq_dec_nr_in_flight(pwq, work_color);
2187} 2203}
2188 2204
@@ -2211,11 +2227,11 @@ static void process_scheduled_works(struct worker *worker)
2211 * worker_thread - the worker thread function 2227 * worker_thread - the worker thread function
2212 * @__worker: self 2228 * @__worker: self
2213 * 2229 *
2214 * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools 2230 * The worker thread function. All workers belong to a worker_pool -
2215 * of these per each cpu. These workers process all works regardless of 2231 * either a per-cpu one or dynamic unbound one. These workers process all
2216 * their specific target workqueue. The only exception is works which 2232 * work items regardless of their specific target workqueue. The only
2217 * belong to workqueues with a rescuer which will be explained in 2233 * exception is work items which belong to workqueues with a rescuer which
2218 * rescuer_thread(). 2234 * will be explained in rescuer_thread().
2219 */ 2235 */
2220static int worker_thread(void *__worker) 2236static int worker_thread(void *__worker)
2221{ 2237{
@@ -2227,19 +2243,12 @@ static int worker_thread(void *__worker)
2227woke_up: 2243woke_up:
2228 spin_lock_irq(&pool->lock); 2244 spin_lock_irq(&pool->lock);
2229 2245
2230 /* we are off idle list if destruction or rebind is requested */ 2246 /* am I supposed to die? */
2231 if (unlikely(list_empty(&worker->entry))) { 2247 if (unlikely(worker->flags & WORKER_DIE)) {
2232 spin_unlock_irq(&pool->lock); 2248 spin_unlock_irq(&pool->lock);
2233 2249 WARN_ON_ONCE(!list_empty(&worker->entry));
2234 /* if DIE is set, destruction is requested */ 2250 worker->task->flags &= ~PF_WQ_WORKER;
2235 if (worker->flags & WORKER_DIE) { 2251 return 0;
2236 worker->task->flags &= ~PF_WQ_WORKER;
2237 return 0;
2238 }
2239
2240 /* otherwise, rebind */
2241 idle_worker_rebind(worker);
2242 goto woke_up;
2243 } 2252 }
2244 2253
2245 worker_leave_idle(worker); 2254 worker_leave_idle(worker);
@@ -2257,14 +2266,16 @@ recheck:
2257 * preparing to process a work or actually processing it. 2266 * preparing to process a work or actually processing it.
2258 * Make sure nobody diddled with it while I was sleeping. 2267 * Make sure nobody diddled with it while I was sleeping.
2259 */ 2268 */
2260 BUG_ON(!list_empty(&worker->scheduled)); 2269 WARN_ON_ONCE(!list_empty(&worker->scheduled));
2261 2270
2262 /* 2271 /*
2263 * When control reaches this point, we're guaranteed to have 2272 * Finish PREP stage. We're guaranteed to have at least one idle
2264 * at least one idle worker or that someone else has already 2273 * worker or that someone else has already assumed the manager
2265 * assumed the manager role. 2274 * role. This is where @worker starts participating in concurrency
2275 * management if applicable and concurrency management is restored
2276 * after being rebound. See rebind_workers() for details.
2266 */ 2277 */
2267 worker_clr_flags(worker, WORKER_PREP); 2278 worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
2268 2279
2269 do { 2280 do {
2270 struct work_struct *work = 2281 struct work_struct *work =
@@ -2306,7 +2317,7 @@ sleep:
2306 * @__rescuer: self 2317 * @__rescuer: self
2307 * 2318 *
2308 * Workqueue rescuer thread function. There's one rescuer for each 2319 * Workqueue rescuer thread function. There's one rescuer for each
2309 * workqueue which has WQ_RESCUER set. 2320 * workqueue which has WQ_MEM_RECLAIM set.
2310 * 2321 *
2311 * Regular work processing on a pool may block trying to create a new 2322 * Regular work processing on a pool may block trying to create a new
2312 * worker which uses GFP_KERNEL allocation which has slight chance of 2323 * worker which uses GFP_KERNEL allocation which has slight chance of
@@ -2325,8 +2336,6 @@ static int rescuer_thread(void *__rescuer)
2325 struct worker *rescuer = __rescuer; 2336 struct worker *rescuer = __rescuer;
2326 struct workqueue_struct *wq = rescuer->rescue_wq; 2337 struct workqueue_struct *wq = rescuer->rescue_wq;
2327 struct list_head *scheduled = &rescuer->scheduled; 2338 struct list_head *scheduled = &rescuer->scheduled;
2328 bool is_unbound = wq->flags & WQ_UNBOUND;
2329 unsigned int cpu;
2330 2339
2331 set_user_nice(current, RESCUER_NICE_LEVEL); 2340 set_user_nice(current, RESCUER_NICE_LEVEL);
2332 2341
@@ -2344,28 +2353,29 @@ repeat:
2344 return 0; 2353 return 0;
2345 } 2354 }
2346 2355
2347 /* 2356 /* see whether any pwq is asking for help */
2348 * See whether any cpu is asking for help. Unbounded 2357 spin_lock_irq(&wq_mayday_lock);
2349 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. 2358
2350 */ 2359 while (!list_empty(&wq->maydays)) {
2351 for_each_mayday_cpu(cpu, wq->mayday_mask) { 2360 struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
2352 unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; 2361 struct pool_workqueue, mayday_node);
2353 struct pool_workqueue *pwq = get_pwq(tcpu, wq);
2354 struct worker_pool *pool = pwq->pool; 2362 struct worker_pool *pool = pwq->pool;
2355 struct work_struct *work, *n; 2363 struct work_struct *work, *n;
2356 2364
2357 __set_current_state(TASK_RUNNING); 2365 __set_current_state(TASK_RUNNING);
2358 mayday_clear_cpu(cpu, wq->mayday_mask); 2366 list_del_init(&pwq->mayday_node);
2367
2368 spin_unlock_irq(&wq_mayday_lock);
2359 2369
2360 /* migrate to the target cpu if possible */ 2370 /* migrate to the target cpu if possible */
2371 worker_maybe_bind_and_lock(pool);
2361 rescuer->pool = pool; 2372 rescuer->pool = pool;
2362 worker_maybe_bind_and_lock(rescuer);
2363 2373
2364 /* 2374 /*
2365 * Slurp in all works issued via this workqueue and 2375 * Slurp in all works issued via this workqueue and
2366 * process'em. 2376 * process'em.
2367 */ 2377 */
2368 BUG_ON(!list_empty(&rescuer->scheduled)); 2378 WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
2369 list_for_each_entry_safe(work, n, &pool->worklist, entry) 2379 list_for_each_entry_safe(work, n, &pool->worklist, entry)
2370 if (get_work_pwq(work) == pwq) 2380 if (get_work_pwq(work) == pwq)
2371 move_linked_works(work, scheduled, &n); 2381 move_linked_works(work, scheduled, &n);
@@ -2380,9 +2390,13 @@ repeat:
2380 if (keep_working(pool)) 2390 if (keep_working(pool))
2381 wake_up_worker(pool); 2391 wake_up_worker(pool);
2382 2392
2383 spin_unlock_irq(&pool->lock); 2393 rescuer->pool = NULL;
2394 spin_unlock(&pool->lock);
2395 spin_lock(&wq_mayday_lock);
2384 } 2396 }
2385 2397
2398 spin_unlock_irq(&wq_mayday_lock);
2399
2386 /* rescuers should never participate in concurrency management */ 2400 /* rescuers should never participate in concurrency management */
2387 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); 2401 WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
2388 schedule(); 2402 schedule();
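
On the consuming side the rescuer now pops pwqs off wq->maydays one at a time: detach the first entry with list_del_init() under wq_mayday_lock, drop the lock, bind to the pwq's pool and process the matching work items, then retake the lock and look again. Reduced to its locking skeleton (illustrative, with simplified irq handling):

	spin_lock_irq(&wq_mayday_lock);
	while (!list_empty(&wq->maydays)) {
		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
					struct pool_workqueue, mayday_node);

		list_del_init(&pwq->mayday_node);	/* re-arms send_mayday() */
		spin_unlock_irq(&wq_mayday_lock);

		/* bind to pwq->pool and drain the work items belonging to pwq */

		spin_lock_irq(&wq_mayday_lock);
	}
	spin_unlock_irq(&wq_mayday_lock);
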
@@ -2486,7 +2500,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
2486 * advanced to @work_color. 2500 * advanced to @work_color.
2487 * 2501 *
2488 * CONTEXT: 2502 * CONTEXT:
2489 * mutex_lock(wq->flush_mutex). 2503 * mutex_lock(wq->mutex).
2490 * 2504 *
2491 * RETURNS: 2505 * RETURNS:
2492 * %true if @flush_color >= 0 and there's something to flush. %false 2506 * %true if @flush_color >= 0 and there's something to flush. %false
@@ -2496,21 +2510,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2496 int flush_color, int work_color) 2510 int flush_color, int work_color)
2497{ 2511{
2498 bool wait = false; 2512 bool wait = false;
2499 unsigned int cpu; 2513 struct pool_workqueue *pwq;
2500 2514
2501 if (flush_color >= 0) { 2515 if (flush_color >= 0) {
2502 BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); 2516 WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
2503 atomic_set(&wq->nr_pwqs_to_flush, 1); 2517 atomic_set(&wq->nr_pwqs_to_flush, 1);
2504 } 2518 }
2505 2519
2506 for_each_pwq_cpu(cpu, wq) { 2520 for_each_pwq(pwq, wq) {
2507 struct pool_workqueue *pwq = get_pwq(cpu, wq);
2508 struct worker_pool *pool = pwq->pool; 2521 struct worker_pool *pool = pwq->pool;
2509 2522
2510 spin_lock_irq(&pool->lock); 2523 spin_lock_irq(&pool->lock);
2511 2524
2512 if (flush_color >= 0) { 2525 if (flush_color >= 0) {
2513 BUG_ON(pwq->flush_color != -1); 2526 WARN_ON_ONCE(pwq->flush_color != -1);
2514 2527
2515 if (pwq->nr_in_flight[flush_color]) { 2528 if (pwq->nr_in_flight[flush_color]) {
2516 pwq->flush_color = flush_color; 2529 pwq->flush_color = flush_color;
@@ -2520,7 +2533,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2520 } 2533 }
2521 2534
2522 if (work_color >= 0) { 2535 if (work_color >= 0) {
2523 BUG_ON(work_color != work_next_color(pwq->work_color)); 2536 WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
2524 pwq->work_color = work_color; 2537 pwq->work_color = work_color;
2525 } 2538 }
2526 2539
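
Under the hood, the flush machinery is a biased completion counter: flush_workqueue_prep_pwqs() sets nr_pwqs_to_flush to 1 as a bias, adds one for every pwq that still has work of the flush color in flight, and the bias is dropped once all pwqs have been armed (in the tail of this function, not shown here); pwq_dec_nr_in_flight() above completes first_flusher->done when the count reaches zero. The bare pattern, detached from workqueues; flush_count and flush_done are made-up names:

static atomic_t flush_count;
static DECLARE_COMPLETION(flush_done);

/* flusher side: count busy targets plus a +1 bias so the completion
 * cannot fire while counting is still in progress */
static void start_flush(int nr_busy_targets)
{
	atomic_set(&flush_count, nr_busy_targets + 1);
	if (atomic_dec_and_test(&flush_count))	/* drop the bias */
		complete(&flush_done);		/* nothing was in flight */
	wait_for_completion(&flush_done);
}

/* target side: called once per target as its last in-flight item retires */
static void target_done(void)
{
	if (atomic_dec_and_test(&flush_count))
		complete(&flush_done);
}
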
@@ -2537,11 +2550,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
2537 * flush_workqueue - ensure that any scheduled work has run to completion. 2550 * flush_workqueue - ensure that any scheduled work has run to completion.
2538 * @wq: workqueue to flush 2551 * @wq: workqueue to flush
2539 * 2552 *
2540 * Forces execution of the workqueue and blocks until its completion. 2553 * This function sleeps until all work items which were queued on entry
2541 * This is typically used in driver shutdown handlers. 2554 * have finished execution, but it is not livelocked by new incoming ones.
2542 *
2543 * We sleep until all works which were queued on entry have been handled,
2544 * but we are not livelocked by new incoming ones.
2545 */ 2555 */
2546void flush_workqueue(struct workqueue_struct *wq) 2556void flush_workqueue(struct workqueue_struct *wq)
2547{ 2557{
@@ -2555,7 +2565,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2555 lock_map_acquire(&wq->lockdep_map); 2565 lock_map_acquire(&wq->lockdep_map);
2556 lock_map_release(&wq->lockdep_map); 2566 lock_map_release(&wq->lockdep_map);
2557 2567
2558 mutex_lock(&wq->flush_mutex); 2568 mutex_lock(&wq->mutex);
2559 2569
2560 /* 2570 /*
2561 * Start-to-wait phase 2571 * Start-to-wait phase
@@ -2568,13 +2578,13 @@ void flush_workqueue(struct workqueue_struct *wq)
2568 * becomes our flush_color and work_color is advanced 2578 * becomes our flush_color and work_color is advanced
2569 * by one. 2579 * by one.
2570 */ 2580 */
2571 BUG_ON(!list_empty(&wq->flusher_overflow)); 2581 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
2572 this_flusher.flush_color = wq->work_color; 2582 this_flusher.flush_color = wq->work_color;
2573 wq->work_color = next_color; 2583 wq->work_color = next_color;
2574 2584
2575 if (!wq->first_flusher) { 2585 if (!wq->first_flusher) {
2576 /* no flush in progress, become the first flusher */ 2586 /* no flush in progress, become the first flusher */
2577 BUG_ON(wq->flush_color != this_flusher.flush_color); 2587 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2578 2588
2579 wq->first_flusher = &this_flusher; 2589 wq->first_flusher = &this_flusher;
2580 2590
@@ -2587,7 +2597,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2587 } 2597 }
2588 } else { 2598 } else {
2589 /* wait in queue */ 2599 /* wait in queue */
2590 BUG_ON(wq->flush_color == this_flusher.flush_color); 2600 WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
2591 list_add_tail(&this_flusher.list, &wq->flusher_queue); 2601 list_add_tail(&this_flusher.list, &wq->flusher_queue);
2592 flush_workqueue_prep_pwqs(wq, -1, wq->work_color); 2602 flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
2593 } 2603 }
@@ -2600,7 +2610,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2600 list_add_tail(&this_flusher.list, &wq->flusher_overflow); 2610 list_add_tail(&this_flusher.list, &wq->flusher_overflow);
2601 } 2611 }
2602 2612
2603 mutex_unlock(&wq->flush_mutex); 2613 mutex_unlock(&wq->mutex);
2604 2614
2605 wait_for_completion(&this_flusher.done); 2615 wait_for_completion(&this_flusher.done);
2606 2616
@@ -2613,7 +2623,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2613 if (wq->first_flusher != &this_flusher) 2623 if (wq->first_flusher != &this_flusher)
2614 return; 2624 return;
2615 2625
2616 mutex_lock(&wq->flush_mutex); 2626 mutex_lock(&wq->mutex);
2617 2627
2618 /* we might have raced, check again with mutex held */ 2628 /* we might have raced, check again with mutex held */
2619 if (wq->first_flusher != &this_flusher) 2629 if (wq->first_flusher != &this_flusher)
@@ -2621,8 +2631,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2621 2631
2622 wq->first_flusher = NULL; 2632 wq->first_flusher = NULL;
2623 2633
2624 BUG_ON(!list_empty(&this_flusher.list)); 2634 WARN_ON_ONCE(!list_empty(&this_flusher.list));
2625 BUG_ON(wq->flush_color != this_flusher.flush_color); 2635 WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
2626 2636
2627 while (true) { 2637 while (true) {
2628 struct wq_flusher *next, *tmp; 2638 struct wq_flusher *next, *tmp;
@@ -2635,8 +2645,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2635 complete(&next->done); 2645 complete(&next->done);
2636 } 2646 }
2637 2647
2638 BUG_ON(!list_empty(&wq->flusher_overflow) && 2648 WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
2639 wq->flush_color != work_next_color(wq->work_color)); 2649 wq->flush_color != work_next_color(wq->work_color));
2640 2650
2641 /* this flush_color is finished, advance by one */ 2651 /* this flush_color is finished, advance by one */
2642 wq->flush_color = work_next_color(wq->flush_color); 2652 wq->flush_color = work_next_color(wq->flush_color);
@@ -2660,7 +2670,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2660 } 2670 }
2661 2671
2662 if (list_empty(&wq->flusher_queue)) { 2672 if (list_empty(&wq->flusher_queue)) {
2663 BUG_ON(wq->flush_color != wq->work_color); 2673 WARN_ON_ONCE(wq->flush_color != wq->work_color);
2664 break; 2674 break;
2665 } 2675 }
2666 2676
@@ -2668,8 +2678,8 @@ void flush_workqueue(struct workqueue_struct *wq)
2668 * Need to flush more colors. Make the next flusher 2678 * Need to flush more colors. Make the next flusher
2669 * the new first flusher and arm pwqs. 2679 * the new first flusher and arm pwqs.
2670 */ 2680 */
2671 BUG_ON(wq->flush_color == wq->work_color); 2681 WARN_ON_ONCE(wq->flush_color == wq->work_color);
2672 BUG_ON(wq->flush_color != next->flush_color); 2682 WARN_ON_ONCE(wq->flush_color != next->flush_color);
2673 2683
2674 list_del_init(&next->list); 2684 list_del_init(&next->list);
2675 wq->first_flusher = next; 2685 wq->first_flusher = next;
@@ -2685,7 +2695,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2685 } 2695 }
2686 2696
2687out_unlock: 2697out_unlock:
2688 mutex_unlock(&wq->flush_mutex); 2698 mutex_unlock(&wq->mutex);
2689} 2699}
2690EXPORT_SYMBOL_GPL(flush_workqueue); 2700EXPORT_SYMBOL_GPL(flush_workqueue);
2691 2701
@@ -2703,22 +2713,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
2703void drain_workqueue(struct workqueue_struct *wq) 2713void drain_workqueue(struct workqueue_struct *wq)
2704{ 2714{
2705 unsigned int flush_cnt = 0; 2715 unsigned int flush_cnt = 0;
2706 unsigned int cpu; 2716 struct pool_workqueue *pwq;
2707 2717
2708 /* 2718 /*
2709 * __queue_work() needs to test whether there are drainers, is much 2719 * __queue_work() needs to test whether there are drainers, is much
2710 * hotter than drain_workqueue() and already looks at @wq->flags. 2720 * hotter than drain_workqueue() and already looks at @wq->flags.
2711 * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. 2721 * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
2712 */ 2722 */
2713 spin_lock(&workqueue_lock); 2723 mutex_lock(&wq->mutex);
2714 if (!wq->nr_drainers++) 2724 if (!wq->nr_drainers++)
2715 wq->flags |= WQ_DRAINING; 2725 wq->flags |= __WQ_DRAINING;
2716 spin_unlock(&workqueue_lock); 2726 mutex_unlock(&wq->mutex);
2717reflush: 2727reflush:
2718 flush_workqueue(wq); 2728 flush_workqueue(wq);
2719 2729
2720 for_each_pwq_cpu(cpu, wq) { 2730 mutex_lock(&wq->mutex);
2721 struct pool_workqueue *pwq = get_pwq(cpu, wq); 2731
2732 for_each_pwq(pwq, wq) {
2722 bool drained; 2733 bool drained;
2723 2734
2724 spin_lock_irq(&pwq->pool->lock); 2735 spin_lock_irq(&pwq->pool->lock);
@@ -2730,15 +2741,16 @@ reflush:
2730 2741
2731 if (++flush_cnt == 10 || 2742 if (++flush_cnt == 10 ||
2732 (flush_cnt % 100 == 0 && flush_cnt <= 1000)) 2743 (flush_cnt % 100 == 0 && flush_cnt <= 1000))
2733 pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", 2744 pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
2734 wq->name, flush_cnt); 2745 wq->name, flush_cnt);
2746
2747 mutex_unlock(&wq->mutex);
2735 goto reflush; 2748 goto reflush;
2736 } 2749 }
2737 2750
2738 spin_lock(&workqueue_lock);
2739 if (!--wq->nr_drainers) 2751 if (!--wq->nr_drainers)
2740 wq->flags &= ~WQ_DRAINING; 2752 wq->flags &= ~__WQ_DRAINING;
2741 spin_unlock(&workqueue_lock); 2753 mutex_unlock(&wq->mutex);
2742} 2754}
2743EXPORT_SYMBOL_GPL(drain_workqueue); 2755EXPORT_SYMBOL_GPL(drain_workqueue);
2744 2756
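drain_workqueue() goes further than a single flush: while __WQ_DRAINING is set it keeps re-flushing, so work items that requeue themselves are also waited out (with the warning above firing if that takes too many passes). A hedged sketch of the pattern it is meant for; the names and the stop flag are hypothetical:

#include <linux/workqueue.h>
#include <linux/compiler.h>

static struct workqueue_struct *my_wq;
static bool stopping;

static void requeue_fn(struct work_struct *work)
{
	/* chain-queue until told to stop */
	if (!ACCESS_ONCE(stopping))
		queue_work(my_wq, work);
}

static DECLARE_WORK(requeue_work, requeue_fn);

static void example_drain(void)
{
	ACCESS_ONCE(stopping) = true;

	/* repeatedly flushes until my_wq has no work left, queued or chained */
	drain_workqueue(my_wq);
}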
@@ -2749,11 +2761,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2749 struct pool_workqueue *pwq; 2761 struct pool_workqueue *pwq;
2750 2762
2751 might_sleep(); 2763 might_sleep();
2764
2765 local_irq_disable();
2752 pool = get_work_pool(work); 2766 pool = get_work_pool(work);
2753 if (!pool) 2767 if (!pool) {
2768 local_irq_enable();
2754 return false; 2769 return false;
2770 }
2755 2771
2756 spin_lock_irq(&pool->lock); 2772 spin_lock(&pool->lock);
2757 /* see the comment in try_to_grab_pending() with the same code */ 2773 /* see the comment in try_to_grab_pending() with the same code */
2758 pwq = get_work_pwq(work); 2774 pwq = get_work_pwq(work);
2759 if (pwq) { 2775 if (pwq) {
@@ -2775,7 +2791,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
2775 * flusher is not running on the same workqueue by verifying write 2791 * flusher is not running on the same workqueue by verifying write
2776 * access. 2792 * access.
2777 */ 2793 */
2778 if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) 2794 if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
2779 lock_map_acquire(&pwq->wq->lockdep_map); 2795 lock_map_acquire(&pwq->wq->lockdep_map);
2780 else 2796 else
2781 lock_map_acquire_read(&pwq->wq->lockdep_map); 2797 lock_map_acquire_read(&pwq->wq->lockdep_map);
@@ -2932,66 +2948,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
2932EXPORT_SYMBOL(cancel_delayed_work_sync); 2948EXPORT_SYMBOL(cancel_delayed_work_sync);
2933 2949
2934/** 2950/**
2935 * schedule_work_on - put work task on a specific cpu
2936 * @cpu: cpu to put the work task on
2937 * @work: job to be done
2938 *
2939 * This puts a job on a specific cpu
2940 */
2941bool schedule_work_on(int cpu, struct work_struct *work)
2942{
2943 return queue_work_on(cpu, system_wq, work);
2944}
2945EXPORT_SYMBOL(schedule_work_on);
2946
2947/**
2948 * schedule_work - put work task in global workqueue
2949 * @work: job to be done
2950 *
2951 * Returns %false if @work was already on the kernel-global workqueue and
2952 * %true otherwise.
2953 *
2954 * This puts a job in the kernel-global workqueue if it was not already
2955 * queued and leaves it in the same position on the kernel-global
2956 * workqueue otherwise.
2957 */
2958bool schedule_work(struct work_struct *work)
2959{
2960 return queue_work(system_wq, work);
2961}
2962EXPORT_SYMBOL(schedule_work);
2963
2964/**
2965 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
2966 * @cpu: cpu to use
2967 * @dwork: job to be done
2968 * @delay: number of jiffies to wait
2969 *
2970 * After waiting for a given time this puts a job in the kernel-global
2971 * workqueue on the specified CPU.
2972 */
2973bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2974 unsigned long delay)
2975{
2976 return queue_delayed_work_on(cpu, system_wq, dwork, delay);
2977}
2978EXPORT_SYMBOL(schedule_delayed_work_on);
2979
2980/**
2981 * schedule_delayed_work - put work task in global workqueue after delay
2982 * @dwork: job to be done
2983 * @delay: number of jiffies to wait or 0 for immediate execution
2984 *
2985 * After waiting for a given time this puts a job in the kernel-global
2986 * workqueue.
2987 */
2988bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
2989{
2990 return queue_delayed_work(system_wq, dwork, delay);
2991}
2992EXPORT_SYMBOL(schedule_delayed_work);
2993
2994/**
2995 * schedule_on_each_cpu - execute a function synchronously on each online CPU 2951 * schedule_on_each_cpu - execute a function synchronously on each online CPU
2996 * @func: the function to call 2952 * @func: the function to call
2997 * 2953 *
@@ -3084,51 +3040,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
3084} 3040}
3085EXPORT_SYMBOL_GPL(execute_in_process_context); 3041EXPORT_SYMBOL_GPL(execute_in_process_context);
3086 3042
3087int keventd_up(void) 3043#ifdef CONFIG_SYSFS
3044/*
 3045 * Workqueues with the WQ_SYSFS flag set are visible to userland via
3046 * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
3047 * following attributes.
3048 *
3049 * per_cpu RO bool : whether the workqueue is per-cpu or unbound
3050 * max_active RW int : maximum number of in-flight work items
3051 *
3052 * Unbound workqueues have the following extra attributes.
3053 *
3054 * id RO int : the associated pool ID
3055 * nice RW int : nice value of the workers
3056 * cpumask RW mask : bitmask of allowed CPUs for the workers
3057 */
3058struct wq_device {
3059 struct workqueue_struct *wq;
3060 struct device dev;
3061};
3062
3063static struct workqueue_struct *dev_to_wq(struct device *dev)
3064{
3065 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3066
3067 return wq_dev->wq;
3068}
3069
3070static ssize_t wq_per_cpu_show(struct device *dev,
3071 struct device_attribute *attr, char *buf)
3072{
3073 struct workqueue_struct *wq = dev_to_wq(dev);
3074
3075 return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
3076}
3077
3078static ssize_t wq_max_active_show(struct device *dev,
3079 struct device_attribute *attr, char *buf)
3080{
3081 struct workqueue_struct *wq = dev_to_wq(dev);
3082
3083 return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
3084}
3085
3086static ssize_t wq_max_active_store(struct device *dev,
3087 struct device_attribute *attr,
3088 const char *buf, size_t count)
3088{ 3089{
3089 return system_wq != NULL; 3090 struct workqueue_struct *wq = dev_to_wq(dev);
3091 int val;
3092
3093 if (sscanf(buf, "%d", &val) != 1 || val <= 0)
3094 return -EINVAL;
3095
3096 workqueue_set_max_active(wq, val);
3097 return count;
3090} 3098}
3091 3099
3092static int alloc_pwqs(struct workqueue_struct *wq) 3100static struct device_attribute wq_sysfs_attrs[] = {
3101 __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
3102 __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
3103 __ATTR_NULL,
3104};
3105
3106static ssize_t wq_pool_ids_show(struct device *dev,
3107 struct device_attribute *attr, char *buf)
3093{ 3108{
3109 struct workqueue_struct *wq = dev_to_wq(dev);
3110 const char *delim = "";
3111 int node, written = 0;
3112
3113 rcu_read_lock_sched();
3114 for_each_node(node) {
3115 written += scnprintf(buf + written, PAGE_SIZE - written,
3116 "%s%d:%d", delim, node,
3117 unbound_pwq_by_node(wq, node)->pool->id);
3118 delim = " ";
3119 }
3120 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3121 rcu_read_unlock_sched();
3122
3123 return written;
3124}
3125
3126static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
3127 char *buf)
3128{
3129 struct workqueue_struct *wq = dev_to_wq(dev);
3130 int written;
3131
3132 mutex_lock(&wq->mutex);
3133 written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
3134 mutex_unlock(&wq->mutex);
3135
3136 return written;
3137}
3138
3139/* prepare workqueue_attrs for sysfs store operations */
3140static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
3141{
3142 struct workqueue_attrs *attrs;
3143
3144 attrs = alloc_workqueue_attrs(GFP_KERNEL);
3145 if (!attrs)
3146 return NULL;
3147
3148 mutex_lock(&wq->mutex);
3149 copy_workqueue_attrs(attrs, wq->unbound_attrs);
3150 mutex_unlock(&wq->mutex);
3151 return attrs;
3152}
3153
3154static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
3155 const char *buf, size_t count)
3156{
3157 struct workqueue_struct *wq = dev_to_wq(dev);
3158 struct workqueue_attrs *attrs;
3159 int ret;
3160
3161 attrs = wq_sysfs_prep_attrs(wq);
3162 if (!attrs)
3163 return -ENOMEM;
3164
3165 if (sscanf(buf, "%d", &attrs->nice) == 1 &&
3166 attrs->nice >= -20 && attrs->nice <= 19)
3167 ret = apply_workqueue_attrs(wq, attrs);
3168 else
3169 ret = -EINVAL;
3170
3171 free_workqueue_attrs(attrs);
3172 return ret ?: count;
3173}
3174
3175static ssize_t wq_cpumask_show(struct device *dev,
3176 struct device_attribute *attr, char *buf)
3177{
3178 struct workqueue_struct *wq = dev_to_wq(dev);
3179 int written;
3180
3181 mutex_lock(&wq->mutex);
3182 written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
3183 mutex_unlock(&wq->mutex);
3184
3185 written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
3186 return written;
3187}
3188
3189static ssize_t wq_cpumask_store(struct device *dev,
3190 struct device_attribute *attr,
3191 const char *buf, size_t count)
3192{
3193 struct workqueue_struct *wq = dev_to_wq(dev);
3194 struct workqueue_attrs *attrs;
3195 int ret;
3196
3197 attrs = wq_sysfs_prep_attrs(wq);
3198 if (!attrs)
3199 return -ENOMEM;
3200
3201 ret = cpumask_parse(buf, attrs->cpumask);
3202 if (!ret)
3203 ret = apply_workqueue_attrs(wq, attrs);
3204
3205 free_workqueue_attrs(attrs);
3206 return ret ?: count;
3207}
3208
3209static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
3210 char *buf)
3211{
3212 struct workqueue_struct *wq = dev_to_wq(dev);
3213 int written;
3214
3215 mutex_lock(&wq->mutex);
3216 written = scnprintf(buf, PAGE_SIZE, "%d\n",
3217 !wq->unbound_attrs->no_numa);
3218 mutex_unlock(&wq->mutex);
3219
3220 return written;
3221}
3222
3223static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
3224 const char *buf, size_t count)
3225{
3226 struct workqueue_struct *wq = dev_to_wq(dev);
3227 struct workqueue_attrs *attrs;
3228 int v, ret;
3229
3230 attrs = wq_sysfs_prep_attrs(wq);
3231 if (!attrs)
3232 return -ENOMEM;
3233
3234 ret = -EINVAL;
3235 if (sscanf(buf, "%d", &v) == 1) {
3236 attrs->no_numa = !v;
3237 ret = apply_workqueue_attrs(wq, attrs);
3238 }
3239
3240 free_workqueue_attrs(attrs);
3241 return ret ?: count;
3242}
3243
3244static struct device_attribute wq_sysfs_unbound_attrs[] = {
3245 __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
3246 __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
3247 __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
3248 __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
3249 __ATTR_NULL,
3250};
3251
3252static struct bus_type wq_subsys = {
3253 .name = "workqueue",
3254 .dev_attrs = wq_sysfs_attrs,
3255};
3256
3257static int __init wq_sysfs_init(void)
3258{
3259 return subsys_virtual_register(&wq_subsys, NULL);
3260}
3261core_initcall(wq_sysfs_init);
3262
3263static void wq_device_release(struct device *dev)
3264{
3265 struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
3266
3267 kfree(wq_dev);
3268}
3269
3270/**
3271 * workqueue_sysfs_register - make a workqueue visible in sysfs
3272 * @wq: the workqueue to register
3273 *
3274 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
3275 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
3276 * which is the preferred method.
3277 *
3278 * Workqueue user should use this function directly iff it wants to apply
3279 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
3280 * apply_workqueue_attrs() may race against userland updating the
3281 * attributes.
3282 *
3283 * Returns 0 on success, -errno on failure.
3284 */
3285int workqueue_sysfs_register(struct workqueue_struct *wq)
3286{
3287 struct wq_device *wq_dev;
3288 int ret;
3289
3094 /* 3290 /*
 3095 * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. 3291 * Adjusting max_active or creating new pwqs by applying
3096 * Make sure that the alignment isn't lower than that of 3292 * attributes breaks ordering guarantee. Disallow exposing ordered
3097 * unsigned long long. 3293 * workqueues.
3098 */ 3294 */
3099 const size_t size = sizeof(struct pool_workqueue); 3295 if (WARN_ON(wq->flags & __WQ_ORDERED))
3100 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 3296 return -EINVAL;
3101 __alignof__(unsigned long long));
3102 3297
3103 if (!(wq->flags & WQ_UNBOUND)) 3298 wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
3104 wq->pool_wq.pcpu = __alloc_percpu(size, align); 3299 if (!wq_dev)
3105 else { 3300 return -ENOMEM;
3106 void *ptr; 3301
3302 wq_dev->wq = wq;
3303 wq_dev->dev.bus = &wq_subsys;
3304 wq_dev->dev.init_name = wq->name;
3305 wq_dev->dev.release = wq_device_release;
3306
3307 /*
3308 * unbound_attrs are created separately. Suppress uevent until
3309 * everything is ready.
3310 */
3311 dev_set_uevent_suppress(&wq_dev->dev, true);
3312
3313 ret = device_register(&wq_dev->dev);
3314 if (ret) {
3315 kfree(wq_dev);
3316 wq->wq_dev = NULL;
3317 return ret;
3318 }
3319
3320 if (wq->flags & WQ_UNBOUND) {
3321 struct device_attribute *attr;
3322
3323 for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
3324 ret = device_create_file(&wq_dev->dev, attr);
3325 if (ret) {
3326 device_unregister(&wq_dev->dev);
3327 wq->wq_dev = NULL;
3328 return ret;
3329 }
3330 }
3331 }
3332
3333 kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
3334 return 0;
3335}
3336
3337/**
3338 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
3339 * @wq: the workqueue to unregister
3340 *
3341 * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
3342 */
3343static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
3344{
3345 struct wq_device *wq_dev = wq->wq_dev;
3346
3347 if (!wq->wq_dev)
3348 return;
3349
3350 wq->wq_dev = NULL;
3351 device_unregister(&wq_dev->dev);
3352}
3353#else /* CONFIG_SYSFS */
3354static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
3355#endif /* CONFIG_SYSFS */
3356
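Putting the sysfs interface above together: a workqueue only appears under /sys/bus/workqueue/devices/ when it is created with WQ_SYSFS (or registered explicitly via workqueue_sysfs_register()). A minimal sketch with a hypothetical workqueue name; writes to the resulting max_active, nice and cpumask files land in the store callbacks defined above:

#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *stats_wq;

static int example_init(void)
{
	/*
	 * Shows up as /sys/bus/workqueue/devices/example_stats with
	 * per_cpu and max_active; being WQ_UNBOUND it also gets
	 * pool_ids, nice, cpumask and numa.
	 */
	stats_wq = alloc_workqueue("example_stats",
				   WQ_UNBOUND | WQ_SYSFS, 0);
	return stats_wq ? 0 : -ENOMEM;
}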
3357/**
3358 * free_workqueue_attrs - free a workqueue_attrs
3359 * @attrs: workqueue_attrs to free
3360 *
3361 * Undo alloc_workqueue_attrs().
3362 */
3363void free_workqueue_attrs(struct workqueue_attrs *attrs)
3364{
3365 if (attrs) {
3366 free_cpumask_var(attrs->cpumask);
3367 kfree(attrs);
3368 }
3369}
3370
3371/**
3372 * alloc_workqueue_attrs - allocate a workqueue_attrs
3373 * @gfp_mask: allocation mask to use
3374 *
3375 * Allocate a new workqueue_attrs, initialize with default settings and
3376 * return it. Returns NULL on failure.
3377 */
3378struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
3379{
3380 struct workqueue_attrs *attrs;
3381
3382 attrs = kzalloc(sizeof(*attrs), gfp_mask);
3383 if (!attrs)
3384 goto fail;
3385 if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
3386 goto fail;
3387
3388 cpumask_copy(attrs->cpumask, cpu_possible_mask);
3389 return attrs;
3390fail:
3391 free_workqueue_attrs(attrs);
3392 return NULL;
3393}
3394
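alloc_workqueue_attrs(), apply_workqueue_attrs() and free_workqueue_attrs() are the in-kernel counterpart of the sysfs knobs above. A hedged sketch of tuning an unbound workqueue from code; the nice value and CPU choice are arbitrary examples:

#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int example_tune(struct workqueue_struct *unbound_wq)
{
	struct workqueue_attrs *attrs;
	int ret;

	attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!attrs)
		return -ENOMEM;

	attrs->nice = -5;				/* higher-priority workers */
	cpumask_copy(attrs->cpumask, cpumask_of(0));	/* restrict to CPU 0 */

	/* only valid for WQ_UNBOUND, non-ordered workqueues */
	ret = apply_workqueue_attrs(unbound_wq, attrs);

	/* apply_workqueue_attrs() copies @attrs, so it can be freed here */
	free_workqueue_attrs(attrs);
	return ret;
}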
3395static void copy_workqueue_attrs(struct workqueue_attrs *to,
3396 const struct workqueue_attrs *from)
3397{
3398 to->nice = from->nice;
3399 cpumask_copy(to->cpumask, from->cpumask);
3400}
3401
3402/* hash value of the content of @attr */
3403static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
3404{
3405 u32 hash = 0;
3406
3407 hash = jhash_1word(attrs->nice, hash);
3408 hash = jhash(cpumask_bits(attrs->cpumask),
3409 BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
3410 return hash;
3411}
3412
3413/* content equality test */
3414static bool wqattrs_equal(const struct workqueue_attrs *a,
3415 const struct workqueue_attrs *b)
3416{
3417 if (a->nice != b->nice)
3418 return false;
3419 if (!cpumask_equal(a->cpumask, b->cpumask))
3420 return false;
3421 return true;
3422}
3423
3424/**
3425 * init_worker_pool - initialize a newly zalloc'd worker_pool
3426 * @pool: worker_pool to initialize
3427 *
 3428 * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs.
3429 * Returns 0 on success, -errno on failure. Even on failure, all fields
3430 * inside @pool proper are initialized and put_unbound_pool() can be called
3431 * on @pool safely to release it.
3432 */
3433static int init_worker_pool(struct worker_pool *pool)
3434{
3435 spin_lock_init(&pool->lock);
3436 pool->id = -1;
3437 pool->cpu = -1;
3438 pool->node = NUMA_NO_NODE;
3439 pool->flags |= POOL_DISASSOCIATED;
3440 INIT_LIST_HEAD(&pool->worklist);
3441 INIT_LIST_HEAD(&pool->idle_list);
3442 hash_init(pool->busy_hash);
3443
3444 init_timer_deferrable(&pool->idle_timer);
3445 pool->idle_timer.function = idle_worker_timeout;
3446 pool->idle_timer.data = (unsigned long)pool;
3447
3448 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3449 (unsigned long)pool);
3450
3451 mutex_init(&pool->manager_arb);
3452 mutex_init(&pool->manager_mutex);
3453 idr_init(&pool->worker_idr);
3454
3455 INIT_HLIST_NODE(&pool->hash_node);
3456 pool->refcnt = 1;
3457
3458 /* shouldn't fail above this point */
3459 pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
3460 if (!pool->attrs)
3461 return -ENOMEM;
3462 return 0;
3463}
3464
3465static void rcu_free_pool(struct rcu_head *rcu)
3466{
3467 struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
3468
3469 idr_destroy(&pool->worker_idr);
3470 free_workqueue_attrs(pool->attrs);
3471 kfree(pool);
3472}
3473
3474/**
3475 * put_unbound_pool - put a worker_pool
3476 * @pool: worker_pool to put
3477 *
3478 * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
3479 * safe manner. get_unbound_pool() calls this function on its failure path
3480 * and this function should be able to release pools which went through,
3481 * successfully or not, init_worker_pool().
3482 *
3483 * Should be called with wq_pool_mutex held.
3484 */
3485static void put_unbound_pool(struct worker_pool *pool)
3486{
3487 struct worker *worker;
3488
3489 lockdep_assert_held(&wq_pool_mutex);
3490
3491 if (--pool->refcnt)
3492 return;
3493
3494 /* sanity checks */
3495 if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
3496 WARN_ON(!list_empty(&pool->worklist)))
3497 return;
3498
3499 /* release id and unhash */
3500 if (pool->id >= 0)
3501 idr_remove(&worker_pool_idr, pool->id);
3502 hash_del(&pool->hash_node);
3503
3504 /*
3505 * Become the manager and destroy all workers. Grabbing
3506 * manager_arb prevents @pool's workers from blocking on
3507 * manager_mutex.
3508 */
3509 mutex_lock(&pool->manager_arb);
3510 mutex_lock(&pool->manager_mutex);
3511 spin_lock_irq(&pool->lock);
3512
3513 while ((worker = first_worker(pool)))
3514 destroy_worker(worker);
3515 WARN_ON(pool->nr_workers || pool->nr_idle);
3516
3517 spin_unlock_irq(&pool->lock);
3518 mutex_unlock(&pool->manager_mutex);
3519 mutex_unlock(&pool->manager_arb);
3520
3521 /* shut down the timers */
3522 del_timer_sync(&pool->idle_timer);
3523 del_timer_sync(&pool->mayday_timer);
3524
3525 /* sched-RCU protected to allow dereferences from get_work_pool() */
3526 call_rcu_sched(&pool->rcu, rcu_free_pool);
3527}
3528
3529/**
3530 * get_unbound_pool - get a worker_pool with the specified attributes
3531 * @attrs: the attributes of the worker_pool to get
3532 *
3533 * Obtain a worker_pool which has the same attributes as @attrs, bump the
3534 * reference count and return it. If there already is a matching
3535 * worker_pool, it will be used; otherwise, this function attempts to
3536 * create a new one. On failure, returns NULL.
3537 *
3538 * Should be called with wq_pool_mutex held.
3539 */
3540static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
3541{
3542 u32 hash = wqattrs_hash(attrs);
3543 struct worker_pool *pool;
3544 int node;
3545
3546 lockdep_assert_held(&wq_pool_mutex);
3547
3548 /* do we already have a matching pool? */
3549 hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
3550 if (wqattrs_equal(pool->attrs, attrs)) {
3551 pool->refcnt++;
3552 goto out_unlock;
3553 }
3554 }
3555
3556 /* nope, create a new one */
3557 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
3558 if (!pool || init_worker_pool(pool) < 0)
3559 goto fail;
3560
3561 if (workqueue_freezing)
3562 pool->flags |= POOL_FREEZING;
3563
3564 lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
3565 copy_workqueue_attrs(pool->attrs, attrs);
3566
3567 /* if cpumask is contained inside a NUMA node, we belong to that node */
3568 if (wq_numa_enabled) {
3569 for_each_node(node) {
3570 if (cpumask_subset(pool->attrs->cpumask,
3571 wq_numa_possible_cpumask[node])) {
3572 pool->node = node;
3573 break;
3574 }
3575 }
3576 }
3577
3578 if (worker_pool_assign_id(pool) < 0)
3579 goto fail;
3580
3581 /* create and start the initial worker */
3582 if (create_and_start_worker(pool) < 0)
3583 goto fail;
3584
3585 /* install */
3586 hash_add(unbound_pool_hash, &pool->hash_node, hash);
3587out_unlock:
3588 return pool;
3589fail:
3590 if (pool)
3591 put_unbound_pool(pool);
3592 return NULL;
3593}
3594
3595static void rcu_free_pwq(struct rcu_head *rcu)
3596{
3597 kmem_cache_free(pwq_cache,
3598 container_of(rcu, struct pool_workqueue, rcu));
3599}
3600
3601/*
3602 * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
3603 * and needs to be destroyed.
3604 */
3605static void pwq_unbound_release_workfn(struct work_struct *work)
3606{
3607 struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
3608 unbound_release_work);
3609 struct workqueue_struct *wq = pwq->wq;
3610 struct worker_pool *pool = pwq->pool;
3611 bool is_last;
3612
3613 if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
3614 return;
3615
3616 /*
3617 * Unlink @pwq. Synchronization against wq->mutex isn't strictly
3618 * necessary on release but do it anyway. It's easier to verify
3619 * and consistent with the linking path.
3620 */
3621 mutex_lock(&wq->mutex);
3622 list_del_rcu(&pwq->pwqs_node);
3623 is_last = list_empty(&wq->pwqs);
3624 mutex_unlock(&wq->mutex);
3625
3626 mutex_lock(&wq_pool_mutex);
3627 put_unbound_pool(pool);
3628 mutex_unlock(&wq_pool_mutex);
3629
3630 call_rcu_sched(&pwq->rcu, rcu_free_pwq);
3631
3632 /*
3633 * If we're the last pwq going away, @wq is already dead and no one
3634 * is gonna access it anymore. Free it.
3635 */
3636 if (is_last) {
3637 free_workqueue_attrs(wq->unbound_attrs);
3638 kfree(wq);
3639 }
3640}
3641
3642/**
3643 * pwq_adjust_max_active - update a pwq's max_active to the current setting
3644 * @pwq: target pool_workqueue
3645 *
3646 * If @pwq isn't freezing, set @pwq->max_active to the associated
3647 * workqueue's saved_max_active and activate delayed work items
3648 * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
3649 */
3650static void pwq_adjust_max_active(struct pool_workqueue *pwq)
3651{
3652 struct workqueue_struct *wq = pwq->wq;
3653 bool freezable = wq->flags & WQ_FREEZABLE;
3654
3655 /* for @wq->saved_max_active */
3656 lockdep_assert_held(&wq->mutex);
3657
3658 /* fast exit for non-freezable wqs */
3659 if (!freezable && pwq->max_active == wq->saved_max_active)
3660 return;
3661
3662 spin_lock_irq(&pwq->pool->lock);
3663
3664 if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
3665 pwq->max_active = wq->saved_max_active;
3666
3667 while (!list_empty(&pwq->delayed_works) &&
3668 pwq->nr_active < pwq->max_active)
3669 pwq_activate_first_delayed(pwq);
3107 3670
3108 /* 3671 /*
 3109 * Allocate enough room to align pwq and put an extra 3672 * Need to kick a worker after a thaw or when an unbound wq's
3110 * pointer at the end pointing back to the originally 3673 * max_active is bumped. It's a slow path. Do it always.
3111 * allocated pointer which will be used for free.
3112 */ 3674 */
3113 ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); 3675 wake_up_worker(pwq->pool);
3114 if (ptr) { 3676 } else {
3115 wq->pool_wq.single = PTR_ALIGN(ptr, align); 3677 pwq->max_active = 0;
3116 *(void **)(wq->pool_wq.single + 1) = ptr; 3678 }
3679
3680 spin_unlock_irq(&pwq->pool->lock);
3681}
3682
3683/* initialize newly alloced @pwq which is associated with @wq and @pool */
3684static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
3685 struct worker_pool *pool)
3686{
3687 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3688
3689 memset(pwq, 0, sizeof(*pwq));
3690
3691 pwq->pool = pool;
3692 pwq->wq = wq;
3693 pwq->flush_color = -1;
3694 pwq->refcnt = 1;
3695 INIT_LIST_HEAD(&pwq->delayed_works);
3696 INIT_LIST_HEAD(&pwq->pwqs_node);
3697 INIT_LIST_HEAD(&pwq->mayday_node);
3698 INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
3699}
3700
3701/* sync @pwq with the current state of its associated wq and link it */
3702static void link_pwq(struct pool_workqueue *pwq)
3703{
3704 struct workqueue_struct *wq = pwq->wq;
3705
3706 lockdep_assert_held(&wq->mutex);
3707
3708 /* may be called multiple times, ignore if already linked */
3709 if (!list_empty(&pwq->pwqs_node))
3710 return;
3711
3712 /*
3713 * Set the matching work_color. This is synchronized with
3714 * wq->mutex to avoid confusing flush_workqueue().
3715 */
3716 pwq->work_color = wq->work_color;
3717
3718 /* sync max_active to the current setting */
3719 pwq_adjust_max_active(pwq);
3720
3721 /* link in @pwq */
3722 list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
3723}
3724
3725/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
3726static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
3727 const struct workqueue_attrs *attrs)
3728{
3729 struct worker_pool *pool;
3730 struct pool_workqueue *pwq;
3731
3732 lockdep_assert_held(&wq_pool_mutex);
3733
3734 pool = get_unbound_pool(attrs);
3735 if (!pool)
3736 return NULL;
3737
3738 pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
3739 if (!pwq) {
3740 put_unbound_pool(pool);
3741 return NULL;
3742 }
3743
3744 init_pwq(pwq, wq, pool);
3745 return pwq;
3746}
3747
3748/* undo alloc_unbound_pwq(), used only in the error path */
3749static void free_unbound_pwq(struct pool_workqueue *pwq)
3750{
3751 lockdep_assert_held(&wq_pool_mutex);
3752
3753 if (pwq) {
3754 put_unbound_pool(pwq->pool);
3755 kmem_cache_free(pwq_cache, pwq);
3756 }
3757}
3758
3759/**
3760 * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
3761 * @attrs: the wq_attrs of interest
3762 * @node: the target NUMA node
3763 * @cpu_going_down: if >= 0, the CPU to consider as offline
3764 * @cpumask: outarg, the resulting cpumask
3765 *
3766 * Calculate the cpumask a workqueue with @attrs should use on @node. If
3767 * @cpu_going_down is >= 0, that cpu is considered offline during
3768 * calculation. The result is stored in @cpumask. This function returns
3769 * %true if the resulting @cpumask is different from @attrs->cpumask,
3770 * %false if equal.
3771 *
3772 * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
3773 * enabled and @node has online CPUs requested by @attrs, the returned
3774 * cpumask is the intersection of the possible CPUs of @node and
3775 * @attrs->cpumask.
3776 *
3777 * The caller is responsible for ensuring that the cpumask of @node stays
3778 * stable.
3779 */
3780static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
3781 int cpu_going_down, cpumask_t *cpumask)
3782{
3783 if (!wq_numa_enabled || attrs->no_numa)
3784 goto use_dfl;
3785
3786 /* does @node have any online CPUs @attrs wants? */
3787 cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
3788 if (cpu_going_down >= 0)
3789 cpumask_clear_cpu(cpu_going_down, cpumask);
3790
3791 if (cpumask_empty(cpumask))
3792 goto use_dfl;
3793
3794 /* yeap, return possible CPUs in @node that @attrs wants */
3795 cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
3796 return !cpumask_equal(cpumask, attrs->cpumask);
3797
3798use_dfl:
3799 cpumask_copy(cpumask, attrs->cpumask);
3800 return false;
3801}
3802
3803/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
3804static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
3805 int node,
3806 struct pool_workqueue *pwq)
3807{
3808 struct pool_workqueue *old_pwq;
3809
3810 lockdep_assert_held(&wq->mutex);
3811
3812 /* link_pwq() can handle duplicate calls */
3813 link_pwq(pwq);
3814
3815 old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3816 rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
3817 return old_pwq;
3818}
3819
3820/**
3821 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
3822 * @wq: the target workqueue
3823 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
3824 *
3825 * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
3826 * machines, this function maps a separate pwq to each NUMA node with
 3827 * possible CPUs in @attrs->cpumask so that work items are affine to the
3828 * NUMA node it was issued on. Older pwqs are released as in-flight work
3829 * items finish. Note that a work item which repeatedly requeues itself
3830 * back-to-back will stay on its current pwq.
3831 *
3832 * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on
3833 * failure.
3834 */
3835int apply_workqueue_attrs(struct workqueue_struct *wq,
3836 const struct workqueue_attrs *attrs)
3837{
3838 struct workqueue_attrs *new_attrs, *tmp_attrs;
3839 struct pool_workqueue **pwq_tbl, *dfl_pwq;
3840 int node, ret;
3841
3842 /* only unbound workqueues can change attributes */
3843 if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
3844 return -EINVAL;
3845
3846 /* creating multiple pwqs breaks ordering guarantee */
3847 if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
3848 return -EINVAL;
3849
3850 pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
3851 new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3852 tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3853 if (!pwq_tbl || !new_attrs || !tmp_attrs)
3854 goto enomem;
3855
3856 /* make a copy of @attrs and sanitize it */
3857 copy_workqueue_attrs(new_attrs, attrs);
3858 cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
3859
3860 /*
3861 * We may create multiple pwqs with differing cpumasks. Make a
3862 * copy of @new_attrs which will be modified and used to obtain
3863 * pools.
3864 */
3865 copy_workqueue_attrs(tmp_attrs, new_attrs);
3866
3867 /*
3868 * CPUs should stay stable across pwq creations and installations.
3869 * Pin CPUs, determine the target cpumask for each node and create
3870 * pwqs accordingly.
3871 */
3872 get_online_cpus();
3873
3874 mutex_lock(&wq_pool_mutex);
3875
3876 /*
3877 * If something goes wrong during CPU up/down, we'll fall back to
3878 * the default pwq covering whole @attrs->cpumask. Always create
3879 * it even if we don't use it immediately.
3880 */
3881 dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
3882 if (!dfl_pwq)
3883 goto enomem_pwq;
3884
3885 for_each_node(node) {
3886 if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
3887 pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
3888 if (!pwq_tbl[node])
3889 goto enomem_pwq;
3890 } else {
3891 dfl_pwq->refcnt++;
3892 pwq_tbl[node] = dfl_pwq;
3117 } 3893 }
3118 } 3894 }
3119 3895
3120 /* just in case, make sure it's actually aligned */ 3896 mutex_unlock(&wq_pool_mutex);
3121 BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); 3897
3122 return wq->pool_wq.v ? 0 : -ENOMEM; 3898 /* all pwqs have been created successfully, let's install'em */
3899 mutex_lock(&wq->mutex);
3900
3901 copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
3902
3903 /* save the previous pwq and install the new one */
3904 for_each_node(node)
3905 pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
3906
3907 /* @dfl_pwq might not have been used, ensure it's linked */
3908 link_pwq(dfl_pwq);
3909 swap(wq->dfl_pwq, dfl_pwq);
3910
3911 mutex_unlock(&wq->mutex);
3912
3913 /* put the old pwqs */
3914 for_each_node(node)
3915 put_pwq_unlocked(pwq_tbl[node]);
3916 put_pwq_unlocked(dfl_pwq);
3917
3918 put_online_cpus();
3919 ret = 0;
3920 /* fall through */
3921out_free:
3922 free_workqueue_attrs(tmp_attrs);
3923 free_workqueue_attrs(new_attrs);
3924 kfree(pwq_tbl);
3925 return ret;
3926
3927enomem_pwq:
3928 free_unbound_pwq(dfl_pwq);
3929 for_each_node(node)
3930 if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
3931 free_unbound_pwq(pwq_tbl[node]);
3932 mutex_unlock(&wq_pool_mutex);
3933 put_online_cpus();
3934enomem:
3935 ret = -ENOMEM;
3936 goto out_free;
3123} 3937}
3124 3938
3125static void free_pwqs(struct workqueue_struct *wq) 3939/**
3940 * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
3941 * @wq: the target workqueue
3942 * @cpu: the CPU coming up or going down
3943 * @online: whether @cpu is coming up or going down
3944 *
3945 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
3946 * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
3947 * @wq accordingly.
3948 *
3949 * If NUMA affinity can't be adjusted due to memory allocation failure, it
3950 * falls back to @wq->dfl_pwq which may not be optimal but is always
3951 * correct.
3952 *
3953 * Note that when the last allowed CPU of a NUMA node goes offline for a
3954 * workqueue with a cpumask spanning multiple nodes, the workers which were
3955 * already executing the work items for the workqueue will lose their CPU
3956 * affinity and may execute on any CPU. This is similar to how per-cpu
3957 * workqueues behave on CPU_DOWN. If a workqueue user wants strict
3958 * affinity, it's the user's responsibility to flush the work item from
3959 * CPU_DOWN_PREPARE.
3960 */
3961static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
3962 bool online)
3126{ 3963{
3127 if (!(wq->flags & WQ_UNBOUND)) 3964 int node = cpu_to_node(cpu);
3128 free_percpu(wq->pool_wq.pcpu); 3965 int cpu_off = online ? -1 : cpu;
3129 else if (wq->pool_wq.single) { 3966 struct pool_workqueue *old_pwq = NULL, *pwq;
3130 /* the pointer to free is stored right after the pwq */ 3967 struct workqueue_attrs *target_attrs;
3131 kfree(*(void **)(wq->pool_wq.single + 1)); 3968 cpumask_t *cpumask;
3969
3970 lockdep_assert_held(&wq_pool_mutex);
3971
3972 if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
3973 return;
3974
3975 /*
3976 * We don't wanna alloc/free wq_attrs for each wq for each CPU.
3977 * Let's use a preallocated one. The following buf is protected by
3978 * CPU hotplug exclusion.
3979 */
3980 target_attrs = wq_update_unbound_numa_attrs_buf;
3981 cpumask = target_attrs->cpumask;
3982
3983 mutex_lock(&wq->mutex);
3984 if (wq->unbound_attrs->no_numa)
3985 goto out_unlock;
3986
3987 copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
3988 pwq = unbound_pwq_by_node(wq, node);
3989
3990 /*
3991 * Let's determine what needs to be done. If the target cpumask is
3992 * different from wq's, we need to compare it to @pwq's and create
3993 * a new one if they don't match. If the target cpumask equals
3994 * wq's, the default pwq should be used. If @pwq is already the
3995 * default one, nothing to do; otherwise, install the default one.
3996 */
3997 if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
3998 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
3999 goto out_unlock;
4000 } else {
4001 if (pwq == wq->dfl_pwq)
4002 goto out_unlock;
4003 else
4004 goto use_dfl_pwq;
4005 }
4006
4007 mutex_unlock(&wq->mutex);
4008
4009 /* create a new pwq */
4010 pwq = alloc_unbound_pwq(wq, target_attrs);
4011 if (!pwq) {
4012 pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
4013 wq->name);
4014 goto out_unlock;
4015 }
4016
4017 /*
4018 * Install the new pwq. As this function is called only from CPU
4019 * hotplug callbacks and applying a new attrs is wrapped with
4020 * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
4021 * inbetween.
4022 */
4023 mutex_lock(&wq->mutex);
4024 old_pwq = numa_pwq_tbl_install(wq, node, pwq);
4025 goto out_unlock;
4026
4027use_dfl_pwq:
4028 spin_lock_irq(&wq->dfl_pwq->pool->lock);
4029 get_pwq(wq->dfl_pwq);
4030 spin_unlock_irq(&wq->dfl_pwq->pool->lock);
4031 old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
4032out_unlock:
4033 mutex_unlock(&wq->mutex);
4034 put_pwq_unlocked(old_pwq);
4035}
4036
4037static int alloc_and_link_pwqs(struct workqueue_struct *wq)
4038{
4039 bool highpri = wq->flags & WQ_HIGHPRI;
4040 int cpu;
4041
4042 if (!(wq->flags & WQ_UNBOUND)) {
4043 wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
4044 if (!wq->cpu_pwqs)
4045 return -ENOMEM;
4046
4047 for_each_possible_cpu(cpu) {
4048 struct pool_workqueue *pwq =
4049 per_cpu_ptr(wq->cpu_pwqs, cpu);
4050 struct worker_pool *cpu_pools =
4051 per_cpu(cpu_worker_pools, cpu);
4052
4053 init_pwq(pwq, wq, &cpu_pools[highpri]);
4054
4055 mutex_lock(&wq->mutex);
4056 link_pwq(pwq);
4057 mutex_unlock(&wq->mutex);
4058 }
4059 return 0;
4060 } else {
4061 return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
3132 } 4062 }
3133} 4063}
3134 4064
@@ -3150,30 +4080,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3150 struct lock_class_key *key, 4080 struct lock_class_key *key,
3151 const char *lock_name, ...) 4081 const char *lock_name, ...)
3152{ 4082{
3153 va_list args, args1; 4083 size_t tbl_size = 0;
4084 va_list args;
3154 struct workqueue_struct *wq; 4085 struct workqueue_struct *wq;
3155 unsigned int cpu; 4086 struct pool_workqueue *pwq;
3156 size_t namelen;
3157 4087
3158 /* determine namelen, allocate wq and format name */ 4088 /* allocate wq and format name */
3159 va_start(args, lock_name); 4089 if (flags & WQ_UNBOUND)
3160 va_copy(args1, args); 4090 tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
3161 namelen = vsnprintf(NULL, 0, fmt, args) + 1;
3162 4091
3163 wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); 4092 wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
3164 if (!wq) 4093 if (!wq)
3165 goto err; 4094 return NULL;
3166 4095
3167 vsnprintf(wq->name, namelen, fmt, args1); 4096 if (flags & WQ_UNBOUND) {
3168 va_end(args); 4097 wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
3169 va_end(args1); 4098 if (!wq->unbound_attrs)
4099 goto err_free_wq;
4100 }
3170 4101
3171 /* 4102 va_start(args, lock_name);
3172 * Workqueues which may be used during memory reclaim should 4103 vsnprintf(wq->name, sizeof(wq->name), fmt, args);
3173 * have a rescuer to guarantee forward progress. 4104 va_end(args);
3174 */
3175 if (flags & WQ_MEM_RECLAIM)
3176 flags |= WQ_RESCUER;
3177 4105
3178 max_active = max_active ?: WQ_DFL_ACTIVE; 4106 max_active = max_active ?: WQ_DFL_ACTIVE;
3179 max_active = wq_clamp_max_active(max_active, flags, wq->name); 4107 max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3181,71 +4109,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3181 /* init wq */ 4109 /* init wq */
3182 wq->flags = flags; 4110 wq->flags = flags;
3183 wq->saved_max_active = max_active; 4111 wq->saved_max_active = max_active;
3184 mutex_init(&wq->flush_mutex); 4112 mutex_init(&wq->mutex);
3185 atomic_set(&wq->nr_pwqs_to_flush, 0); 4113 atomic_set(&wq->nr_pwqs_to_flush, 0);
4114 INIT_LIST_HEAD(&wq->pwqs);
3186 INIT_LIST_HEAD(&wq->flusher_queue); 4115 INIT_LIST_HEAD(&wq->flusher_queue);
3187 INIT_LIST_HEAD(&wq->flusher_overflow); 4116 INIT_LIST_HEAD(&wq->flusher_overflow);
4117 INIT_LIST_HEAD(&wq->maydays);
3188 4118
3189 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 4119 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
3190 INIT_LIST_HEAD(&wq->list); 4120 INIT_LIST_HEAD(&wq->list);
3191 4121
3192 if (alloc_pwqs(wq) < 0) 4122 if (alloc_and_link_pwqs(wq) < 0)
3193 goto err; 4123 goto err_free_wq;
3194
3195 for_each_pwq_cpu(cpu, wq) {
3196 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3197
3198 BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
3199 pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
3200 pwq->wq = wq;
3201 pwq->flush_color = -1;
3202 pwq->max_active = max_active;
3203 INIT_LIST_HEAD(&pwq->delayed_works);
3204 }
3205 4124
3206 if (flags & WQ_RESCUER) { 4125 /*
4126 * Workqueues which may be used during memory reclaim should
4127 * have a rescuer to guarantee forward progress.
4128 */
4129 if (flags & WQ_MEM_RECLAIM) {
3207 struct worker *rescuer; 4130 struct worker *rescuer;
3208 4131
3209 if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) 4132 rescuer = alloc_worker();
3210 goto err;
3211
3212 wq->rescuer = rescuer = alloc_worker();
3213 if (!rescuer) 4133 if (!rescuer)
3214 goto err; 4134 goto err_destroy;
3215 4135
3216 rescuer->rescue_wq = wq; 4136 rescuer->rescue_wq = wq;
3217 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", 4137 rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
3218 wq->name); 4138 wq->name);
3219 if (IS_ERR(rescuer->task)) 4139 if (IS_ERR(rescuer->task)) {
3220 goto err; 4140 kfree(rescuer);
4141 goto err_destroy;
4142 }
3221 4143
3222 rescuer->task->flags |= PF_THREAD_BOUND; 4144 wq->rescuer = rescuer;
4145 rescuer->task->flags |= PF_NO_SETAFFINITY;
3223 wake_up_process(rescuer->task); 4146 wake_up_process(rescuer->task);
3224 } 4147 }
3225 4148
4149 if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
4150 goto err_destroy;
4151
3226 /* 4152 /*
3227 * workqueue_lock protects global freeze state and workqueues 4153 * wq_pool_mutex protects global freeze state and workqueues list.
3228 * list. Grab it, set max_active accordingly and add the new 4154 * Grab it, adjust max_active and add the new @wq to workqueues
3229 * workqueue to workqueues list. 4155 * list.
3230 */ 4156 */
3231 spin_lock(&workqueue_lock); 4157 mutex_lock(&wq_pool_mutex);
3232 4158
3233 if (workqueue_freezing && wq->flags & WQ_FREEZABLE) 4159 mutex_lock(&wq->mutex);
3234 for_each_pwq_cpu(cpu, wq) 4160 for_each_pwq(pwq, wq)
3235 get_pwq(cpu, wq)->max_active = 0; 4161 pwq_adjust_max_active(pwq);
4162 mutex_unlock(&wq->mutex);
3236 4163
3237 list_add(&wq->list, &workqueues); 4164 list_add(&wq->list, &workqueues);
3238 4165
3239 spin_unlock(&workqueue_lock); 4166 mutex_unlock(&wq_pool_mutex);
3240 4167
3241 return wq; 4168 return wq;
3242err: 4169
3243 if (wq) { 4170err_free_wq:
3244 free_pwqs(wq); 4171 free_workqueue_attrs(wq->unbound_attrs);
3245 free_mayday_mask(wq->mayday_mask); 4172 kfree(wq);
3246 kfree(wq->rescuer); 4173 return NULL;
3247 kfree(wq); 4174err_destroy:
3248 } 4175 destroy_workqueue(wq);
3249 return NULL; 4176 return NULL;
3250} 4177}
3251EXPORT_SYMBOL_GPL(__alloc_workqueue_key); 4178EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
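__alloc_workqueue_key() is normally reached through the alloc_workqueue() macro. A caller-side sketch of the common lifecycle under the reworked path; the names are hypothetical, and note that WQ_MEM_RECLAIM alone now requests the rescuer (the separate WQ_RESCUER flag is gone):

#include <linux/workqueue.h>
#include <linux/errno.h>

static void io_retry_fn(struct work_struct *work)
{
	/* work that may run on the memory-reclaim path */
}

static DECLARE_WORK(io_retry_work, io_retry_fn);
static struct workqueue_struct *io_wq;

static int example_setup(void)
{
	/* WQ_MEM_RECLAIM guarantees a rescuer for forward progress */
	io_wq = alloc_workqueue("example_io",
				WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
	if (!io_wq)
		return -ENOMEM;

	queue_work(io_wq, &io_retry_work);
	return 0;
}

static void example_teardown(void)
{
	/* drains the queue, then releases the pwqs and the wq itself */
	destroy_workqueue(io_wq);
}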
@@ -3258,60 +4185,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
3258 */ 4185 */
3259void destroy_workqueue(struct workqueue_struct *wq) 4186void destroy_workqueue(struct workqueue_struct *wq)
3260{ 4187{
3261 unsigned int cpu; 4188 struct pool_workqueue *pwq;
4189 int node;
3262 4190
3263 /* drain it before proceeding with destruction */ 4191 /* drain it before proceeding with destruction */
3264 drain_workqueue(wq); 4192 drain_workqueue(wq);
3265 4193
4194 /* sanity checks */
4195 mutex_lock(&wq->mutex);
4196 for_each_pwq(pwq, wq) {
4197 int i;
4198
4199 for (i = 0; i < WORK_NR_COLORS; i++) {
4200 if (WARN_ON(pwq->nr_in_flight[i])) {
4201 mutex_unlock(&wq->mutex);
4202 return;
4203 }
4204 }
4205
4206 if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
4207 WARN_ON(pwq->nr_active) ||
4208 WARN_ON(!list_empty(&pwq->delayed_works))) {
4209 mutex_unlock(&wq->mutex);
4210 return;
4211 }
4212 }
4213 mutex_unlock(&wq->mutex);
4214
3266 /* 4215 /*
3267 * wq list is used to freeze wq, remove from list after 4216 * wq list is used to freeze wq, remove from list after
3268 * flushing is complete in case freeze races us. 4217 * flushing is complete in case freeze races us.
3269 */ 4218 */
3270 spin_lock(&workqueue_lock); 4219 mutex_lock(&wq_pool_mutex);
3271 list_del(&wq->list); 4220 list_del_init(&wq->list);
3272 spin_unlock(&workqueue_lock); 4221 mutex_unlock(&wq_pool_mutex);
3273 4222
3274 /* sanity check */ 4223 workqueue_sysfs_unregister(wq);
3275 for_each_pwq_cpu(cpu, wq) {
3276 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3277 int i;
3278
3279 for (i = 0; i < WORK_NR_COLORS; i++)
3280 BUG_ON(pwq->nr_in_flight[i]);
3281 BUG_ON(pwq->nr_active);
3282 BUG_ON(!list_empty(&pwq->delayed_works));
3283 }
3284 4224
3285 if (wq->flags & WQ_RESCUER) { 4225 if (wq->rescuer) {
3286 kthread_stop(wq->rescuer->task); 4226 kthread_stop(wq->rescuer->task);
3287 free_mayday_mask(wq->mayday_mask);
3288 kfree(wq->rescuer); 4227 kfree(wq->rescuer);
4228 wq->rescuer = NULL;
3289 } 4229 }
3290 4230
3291 free_pwqs(wq); 4231 if (!(wq->flags & WQ_UNBOUND)) {
3292 kfree(wq); 4232 /*
3293} 4233 * The base ref is never dropped on per-cpu pwqs. Directly
3294EXPORT_SYMBOL_GPL(destroy_workqueue); 4234 * free the pwqs and wq.
3295 4235 */
3296/** 4236 free_percpu(wq->cpu_pwqs);
3297 * pwq_set_max_active - adjust max_active of a pwq 4237 kfree(wq);
3298 * @pwq: target pool_workqueue 4238 } else {
3299 * @max_active: new max_active value. 4239 /*
3300 * 4240 * We're the sole accessor of @wq at this point. Directly
3301 * Set @pwq->max_active to @max_active and activate delayed works if 4241 * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
3302 * increased. 4242 * @wq will be freed when the last pwq is released.
3303 * 4243 */
3304 * CONTEXT: 4244 for_each_node(node) {
3305 * spin_lock_irq(pool->lock). 4245 pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
3306 */ 4246 RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
3307static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) 4247 put_pwq_unlocked(pwq);
3308{ 4248 }
3309 pwq->max_active = max_active;
3310 4249
3311 while (!list_empty(&pwq->delayed_works) && 4250 /*
3312 pwq->nr_active < pwq->max_active) 4251 * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
3313 pwq_activate_first_delayed(pwq); 4252 * put. Don't access it afterwards.
4253 */
4254 pwq = wq->dfl_pwq;
4255 wq->dfl_pwq = NULL;
4256 put_pwq_unlocked(pwq);
4257 }
3314} 4258}
4259EXPORT_SYMBOL_GPL(destroy_workqueue);
3315 4260
3316/** 4261/**
3317 * workqueue_set_max_active - adjust max_active of a workqueue 4262 * workqueue_set_max_active - adjust max_active of a workqueue
@@ -3325,30 +4270,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
3325 */ 4270 */
3326void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) 4271void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
3327{ 4272{
3328 unsigned int cpu; 4273 struct pool_workqueue *pwq;
4274
4275 /* disallow meddling with max_active for ordered workqueues */
4276 if (WARN_ON(wq->flags & __WQ_ORDERED))
4277 return;
3329 4278
3330 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); 4279 max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
3331 4280
3332 spin_lock(&workqueue_lock); 4281 mutex_lock(&wq->mutex);
3333 4282
3334 wq->saved_max_active = max_active; 4283 wq->saved_max_active = max_active;
3335 4284
3336 for_each_pwq_cpu(cpu, wq) { 4285 for_each_pwq(pwq, wq)
3337 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4286 pwq_adjust_max_active(pwq);
3338 struct worker_pool *pool = pwq->pool;
3339 4287
3340 spin_lock_irq(&pool->lock); 4288 mutex_unlock(&wq->mutex);
3341 4289}
3342 if (!(wq->flags & WQ_FREEZABLE) || 4290EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3343 !(pool->flags & POOL_FREEZING))
3344 pwq_set_max_active(pwq, max_active);
3345 4291
3346 spin_unlock_irq(&pool->lock); 4292/**
3347 } 4293 * current_is_workqueue_rescuer - is %current workqueue rescuer?
4294 *
4295 * Determine whether %current is a workqueue rescuer. Can be used from
4296 * work functions to determine whether it's being run off the rescuer task.
4297 */
4298bool current_is_workqueue_rescuer(void)
4299{
4300 struct worker *worker = current_wq_worker();
3348 4301
3349 spin_unlock(&workqueue_lock); 4302 return worker && worker->rescue_wq;
3350} 4303}
3351EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3352 4304
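Two small user-facing pieces sit in this hunk: workqueue_set_max_active(), which now refuses ordered workqueues, and the new current_is_workqueue_rescuer() helper. A hedged sketch of how a driver might use them; the function names are hypothetical:

#include <linux/workqueue.h>

static void reclaim_fn(struct work_struct *work)
{
	/*
	 * When running off the rescuer, this is the only thread making
	 * progress for the whole workqueue, so skip optional extras.
	 */
	if (current_is_workqueue_rescuer())
		return;

	/* ... normal, possibly memory-hungry processing ... */
}

static void example_throttle(struct workqueue_struct *wq)
{
	/* rejected with a WARN_ON() for __WQ_ORDERED workqueues */
	workqueue_set_max_active(wq, 4);
}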
3353/** 4305/**
3354 * workqueue_congested - test whether a workqueue is congested 4306 * workqueue_congested - test whether a workqueue is congested
@@ -3362,11 +4314,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
3362 * RETURNS: 4314 * RETURNS:
3363 * %true if congested, %false otherwise. 4315 * %true if congested, %false otherwise.
3364 */ 4316 */
3365bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) 4317bool workqueue_congested(int cpu, struct workqueue_struct *wq)
3366{ 4318{
3367 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4319 struct pool_workqueue *pwq;
4320 bool ret;
4321
4322 rcu_read_lock_sched();
4323
4324 if (!(wq->flags & WQ_UNBOUND))
4325 pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
4326 else
4327 pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
3368 4328
3369 return !list_empty(&pwq->delayed_works); 4329 ret = !list_empty(&pwq->delayed_works);
4330 rcu_read_unlock_sched();
4331
4332 return ret;
3370} 4333}
3371EXPORT_SYMBOL_GPL(workqueue_congested); 4334EXPORT_SYMBOL_GPL(workqueue_congested);
3372 4335
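workqueue_congested() now answers per-CPU for bound workqueues and per-NUMA-node for unbound ones. A hedged sketch of an opportunistic producer; at this point in the series the caller is expected to pass a real CPU id (the WORK_CPU_UNBOUND shortcut is not handled here):

#include <linux/workqueue.h>

static bool example_try_queue_on(int cpu, struct workqueue_struct *wq,
				 struct work_struct *work)
{
	/* true means the relevant pwq already has delayed (throttled) work */
	if (workqueue_congested(cpu, wq))
		return false;	/* caller backs off or picks another CPU */

	return queue_work_on(cpu, wq, work);
}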
@@ -3383,24 +4346,104 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
3383 */ 4346 */
3384unsigned int work_busy(struct work_struct *work) 4347unsigned int work_busy(struct work_struct *work)
3385{ 4348{
3386 struct worker_pool *pool = get_work_pool(work); 4349 struct worker_pool *pool;
3387 unsigned long flags; 4350 unsigned long flags;
3388 unsigned int ret = 0; 4351 unsigned int ret = 0;
3389 4352
3390 if (work_pending(work)) 4353 if (work_pending(work))
3391 ret |= WORK_BUSY_PENDING; 4354 ret |= WORK_BUSY_PENDING;
3392 4355
4356 local_irq_save(flags);
4357 pool = get_work_pool(work);
3393 if (pool) { 4358 if (pool) {
3394 spin_lock_irqsave(&pool->lock, flags); 4359 spin_lock(&pool->lock);
3395 if (find_worker_executing_work(pool, work)) 4360 if (find_worker_executing_work(pool, work))
3396 ret |= WORK_BUSY_RUNNING; 4361 ret |= WORK_BUSY_RUNNING;
3397 spin_unlock_irqrestore(&pool->lock, flags); 4362 spin_unlock(&pool->lock);
3398 } 4363 }
4364 local_irq_restore(flags);
3399 4365
3400 return ret; 4366 return ret;
3401} 4367}
3402EXPORT_SYMBOL_GPL(work_busy); 4368EXPORT_SYMBOL_GPL(work_busy);
3403 4369
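work_busy() returns a bitmask of WORK_BUSY_PENDING and WORK_BUSY_RUNNING; its answer is advisory since the state can change immediately afterwards. A small sketch with a hypothetical reporting helper:

#include <linux/workqueue.h>
#include <linux/printk.h>

static void example_report(struct work_struct *work)
{
	unsigned int state = work_busy(work);

	/* advisory only: the state may change right after the call */
	pr_info("work %p: pending=%d running=%d\n", work,
		!!(state & WORK_BUSY_PENDING),
		!!(state & WORK_BUSY_RUNNING));
}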
4370/**
4371 * set_worker_desc - set description for the current work item
4372 * @fmt: printf-style format string
4373 * @...: arguments for the format string
4374 *
4375 * This function can be called by a running work function to describe what
4376 * the work item is about. If the worker task gets dumped, this
4377 * information will be printed out together to help debugging. The
4378 * description can be at most WORKER_DESC_LEN including the trailing '\0'.
4379 */
4380void set_worker_desc(const char *fmt, ...)
4381{
4382 struct worker *worker = current_wq_worker();
4383 va_list args;
4384
4385 if (worker) {
4386 va_start(args, fmt);
4387 vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
4388 va_end(args);
4389 worker->desc_valid = true;
4390 }
4391}
4392
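set_worker_desc() is meant to be called from inside a work function so that print_worker_info() below can show what the worker was doing when its task gets dumped. A hedged sketch; the request structure is hypothetical:

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct example_req {
	struct work_struct work;
	int id;
};

static void example_req_fn(struct work_struct *work)
{
	struct example_req *req = container_of(work, struct example_req, work);

	/* shown by print_worker_info() if this worker's task gets dumped */
	set_worker_desc("example req %d", req->id);

	/* ... process the request ... */
}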
4393/**
4394 * print_worker_info - print out worker information and description
4395 * @log_lvl: the log level to use when printing
4396 * @task: target task
4397 *
4398 * If @task is a worker and currently executing a work item, print out the
4399 * name of the workqueue being serviced and worker description set with
4400 * set_worker_desc() by the currently executing work item.
4401 *
4402 * This function can be safely called on any task as long as the
4403 * task_struct itself is accessible. While safe, this function isn't
 4404 * synchronized and may print out mixed-up or garbage data of limited length.
4405 */
4406void print_worker_info(const char *log_lvl, struct task_struct *task)
4407{
4408 work_func_t *fn = NULL;
4409 char name[WQ_NAME_LEN] = { };
4410 char desc[WORKER_DESC_LEN] = { };
4411 struct pool_workqueue *pwq = NULL;
4412 struct workqueue_struct *wq = NULL;
4413 bool desc_valid = false;
4414 struct worker *worker;
4415
4416 if (!(task->flags & PF_WQ_WORKER))
4417 return;
4418
4419 /*
4420 * This function is called without any synchronization and @task
4421 * could be in any state. Be careful with dereferences.
4422 */
4423 worker = probe_kthread_data(task);
4424
4425 /*
4426 * Carefully copy the associated workqueue's workfn and name. Keep
4427 * the original last '\0' in case the original contains garbage.
4428 */
4429 probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
4430 probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
4431 probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
4432 probe_kernel_read(name, wq->name, sizeof(name) - 1);
4433
4434 /* copy worker description */
4435 probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
4436 if (desc_valid)
4437 probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
4438
4439 if (fn || name[0] || desc[0]) {
4440 printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
4441 if (desc[0])
4442 pr_cont(" (%s)", desc);
4443 pr_cont("\n");
4444 }
4445}
4446
3404/* 4447/*
3405 * CPU hotplug. 4448 * CPU hotplug.
3406 * 4449 *
@@ -3421,53 +4464,153 @@ static void wq_unbind_fn(struct work_struct *work)
3421 int cpu = smp_processor_id(); 4464 int cpu = smp_processor_id();
3422 struct worker_pool *pool; 4465 struct worker_pool *pool;
3423 struct worker *worker; 4466 struct worker *worker;
3424 int i; 4467 int wi;
3425 4468
3426 for_each_std_worker_pool(pool, cpu) { 4469 for_each_cpu_worker_pool(pool, cpu) {
3427 BUG_ON(cpu != smp_processor_id()); 4470 WARN_ON_ONCE(cpu != smp_processor_id());
3428 4471
3429 mutex_lock(&pool->assoc_mutex); 4472 mutex_lock(&pool->manager_mutex);
3430 spin_lock_irq(&pool->lock); 4473 spin_lock_irq(&pool->lock);
3431 4474
3432 /* 4475 /*
3433 * We've claimed all manager positions. Make all workers 4476 * We've blocked all manager operations. Make all workers
3434 * unbound and set DISASSOCIATED. Before this, all workers 4477 * unbound and set DISASSOCIATED. Before this, all workers
3435 * except for the ones which are still executing works from 4478 * except for the ones which are still executing works from
3436 * before the last CPU down must be on the cpu. After 4479 * before the last CPU down must be on the cpu. After
3437 * this, they may become diasporas. 4480 * this, they may become diasporas.
3438 */ 4481 */
3439 list_for_each_entry(worker, &pool->idle_list, entry) 4482 for_each_pool_worker(worker, wi, pool)
3440 worker->flags |= WORKER_UNBOUND;
3441
3442 for_each_busy_worker(worker, i, pool)
3443 worker->flags |= WORKER_UNBOUND; 4483 worker->flags |= WORKER_UNBOUND;
3444 4484
3445 pool->flags |= POOL_DISASSOCIATED; 4485 pool->flags |= POOL_DISASSOCIATED;
3446 4486
3447 spin_unlock_irq(&pool->lock); 4487 spin_unlock_irq(&pool->lock);
3448 mutex_unlock(&pool->assoc_mutex); 4488 mutex_unlock(&pool->manager_mutex);
4489
4490 /*
4491 * Call schedule() so that we cross rq->lock and thus can
4492 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
4493 * This is necessary as scheduler callbacks may be invoked
4494 * from other cpus.
4495 */
4496 schedule();
4497
4498 /*
4499 * Sched callbacks are disabled now. Zap nr_running.
4500 * After this, nr_running stays zero and need_more_worker()
4501 * and keep_working() are always true as long as the
4502 * worklist is not empty. This pool now behaves as an
4503 * unbound (in terms of concurrency management) pool which
4504 * are served by workers tied to the pool.
4505 */
4506 atomic_set(&pool->nr_running, 0);
4507
4508 /*
4509 * With concurrency management just turned off, a busy
4510 * worker blocking could lead to lengthy stalls. Kick off
4511 * unbound chain execution of currently pending work items.
4512 */
4513 spin_lock_irq(&pool->lock);
4514 wake_up_worker(pool);
4515 spin_unlock_irq(&pool->lock);
3449 } 4516 }
4517}
3450 4518
3451 /* 4519/**
3452 * Call schedule() so that we cross rq->lock and thus can guarantee 4520 * rebind_workers - rebind all workers of a pool to the associated CPU
3453 * sched callbacks see the %WORKER_UNBOUND flag. This is necessary 4521 * @pool: pool of interest
3454 * as scheduler callbacks may be invoked from other cpus. 4522 *
3455 */ 4523 * @pool->cpu is coming online. Rebind all workers to the CPU.
3456 schedule(); 4524 */
4525static void rebind_workers(struct worker_pool *pool)
4526{
4527 struct worker *worker;
4528 int wi;
4529
4530 lockdep_assert_held(&pool->manager_mutex);
3457 4531
3458 /* 4532 /*
3459 * Sched callbacks are disabled now. Zap nr_running. After this, 4533 * Restore CPU affinity of all workers. As all idle workers should
3460 * nr_running stays zero and need_more_worker() and keep_working() 4534 * be on the run-queue of the associated CPU before any local
3461  * nr_running stays zero and need_more_worker() and keep_working()      4535  * wake-ups for concurrency management happen, restore CPU affinity
3462 * @cpu now behave as unbound (in terms of concurrency management) 4536 * of all workers first and then clear UNBOUND. As we're called
3463 * pools which are served by workers tied to the CPU. 4537 * from CPU_ONLINE, the following shouldn't fail.
3464 *
3465 * On return from this function, the current worker would trigger
3466 * unbound chain execution of pending work items if other workers
3467 * didn't already.
3468 */ 4538 */
3469 for_each_std_worker_pool(pool, cpu) 4539 for_each_pool_worker(worker, wi, pool)
3470 atomic_set(&pool->nr_running, 0); 4540 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4541 pool->attrs->cpumask) < 0);
4542
4543 spin_lock_irq(&pool->lock);
4544
4545 for_each_pool_worker(worker, wi, pool) {
4546 unsigned int worker_flags = worker->flags;
4547
4548 /*
4549 * A bound idle worker should actually be on the runqueue
4550 * of the associated CPU for local wake-ups targeting it to
4551 * work. Kick all idle workers so that they migrate to the
4552 * associated CPU. Doing this in the same loop as
4553 * replacing UNBOUND with REBOUND is safe as no worker will
4554 * be bound before @pool->lock is released.
4555 */
4556 if (worker_flags & WORKER_IDLE)
4557 wake_up_process(worker->task);
4558
4559 /*
4560 * We want to clear UNBOUND but can't directly call
4561 * worker_clr_flags() or adjust nr_running. Atomically
4562 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
4563 * @worker will clear REBOUND using worker_clr_flags() when
4564 * it initiates the next execution cycle thus restoring
4565 * concurrency management. Note that when or whether
4566 * @worker clears REBOUND doesn't affect correctness.
4567 *
4568 * ACCESS_ONCE() is necessary because @worker->flags may be
4569 * tested without holding any lock in
4570 * wq_worker_waking_up(). Without it, NOT_RUNNING test may
4571 * fail incorrectly leading to premature concurrency
4572 * management operations.
4573 */
4574 WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
4575 worker_flags |= WORKER_REBOUND;
4576 worker_flags &= ~WORKER_UNBOUND;
4577 ACCESS_ONCE(worker->flags) = worker_flags;
4578 }
4579
4580 spin_unlock_irq(&pool->lock);
4581}
4582
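The UNBOUND -> REBOUND handoff above computes the new flag word privately and publishes it with a single store, so a lockless reader (wq_worker_waking_up() tests worker->flags without the pool lock) never observes a window with neither NOT_RUNNING flag set. A userspace sketch of that publish pattern; the flag values and the C11 atomics are stand-ins for illustration, not the kernel's flags or its ACCESS_ONCE():

#include <stdatomic.h>
#include <stdio.h>

#define F_UNBOUND  (1u << 0)    /* stands in for WORKER_UNBOUND */
#define F_REBOUND  (1u << 1)    /* stands in for WORKER_REBOUND */

static _Atomic unsigned int worker_flags = F_UNBOUND;

static void rebind_one(void)
{
        unsigned int flags = atomic_load_explicit(&worker_flags,
                                                  memory_order_relaxed);

        /* compute the new value privately ... */
        flags |= F_REBOUND;
        flags &= ~F_UNBOUND;

        /* ... then publish it in one store, as ACCESS_ONCE(worker->flags) = flags does */
        atomic_store_explicit(&worker_flags, flags, memory_order_relaxed);
}

int main(void)
{
        rebind_one();
        printf("flags after rebind: %#x\n",
               atomic_load_explicit(&worker_flags, memory_order_relaxed));
        return 0;
}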
4583/**
4584 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
4585 * @pool: unbound pool of interest
4586 * @cpu: the CPU which is coming up
4587 *
4588 * An unbound pool may end up with a cpumask which doesn't have any online
 4589  * CPUs. When a worker of such a pool gets scheduled, the scheduler resets
4590 * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
4591 * online CPU before, cpus_allowed of all its workers should be restored.
4592 */
4593static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
4594{
4595 static cpumask_t cpumask;
4596 struct worker *worker;
4597 int wi;
4598
4599 lockdep_assert_held(&pool->manager_mutex);
4600
4601 /* is @cpu allowed for @pool? */
4602 if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
4603 return;
4604
4605 /* is @cpu the only online CPU? */
4606 cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
4607 if (cpumask_weight(&cpumask) != 1)
4608 return;
4609
4610 /* as we're called from CPU_ONLINE, the following shouldn't fail */
4611 for_each_pool_worker(worker, wi, pool)
4612 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
4613 pool->attrs->cpumask) < 0);
3471} 4614}
3472 4615
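restore_unbound_workers_cpumask() only acts when the incoming CPU is the first CPU of the pool's cpumask to come online, i.e. when the intersection of the pool's mask and the online mask has weight 1. A userspace sketch of that test with a 64-bit word in place of cpumask_t; the helper name is invented, and __builtin_popcountll is the GCC/Clang builtin:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool pool_needs_cpumask_restore(uint64_t pool_mask,
                                       uint64_t online_mask, int cpu)
{
        /* is @cpu allowed for the pool at all? */
        if (!(pool_mask & (1ULL << cpu)))
                return false;

        /* is @cpu the only online CPU in the pool's mask? */
        return __builtin_popcountll(pool_mask & online_mask) == 1;
}

int main(void)
{
        /* pool allows CPUs 2 and 3; only CPU 2 is online (it just came up) */
        printf("%d\n", pool_needs_cpumask_restore(0xc, 0x4, 2));  /* 1 */
        /* CPU 3 comes up while CPU 2 is already online: nothing to restore */
        printf("%d\n", pool_needs_cpumask_restore(0xc, 0xc, 3));  /* 0 */
        return 0;
}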
3473/* 4616/*
@@ -3478,39 +4621,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
3478 unsigned long action, 4621 unsigned long action,
3479 void *hcpu) 4622 void *hcpu)
3480{ 4623{
3481 unsigned int cpu = (unsigned long)hcpu; 4624 int cpu = (unsigned long)hcpu;
3482 struct worker_pool *pool; 4625 struct worker_pool *pool;
4626 struct workqueue_struct *wq;
4627 int pi;
3483 4628
3484 switch (action & ~CPU_TASKS_FROZEN) { 4629 switch (action & ~CPU_TASKS_FROZEN) {
3485 case CPU_UP_PREPARE: 4630 case CPU_UP_PREPARE:
3486 for_each_std_worker_pool(pool, cpu) { 4631 for_each_cpu_worker_pool(pool, cpu) {
3487 struct worker *worker;
3488
3489 if (pool->nr_workers) 4632 if (pool->nr_workers)
3490 continue; 4633 continue;
3491 4634 if (create_and_start_worker(pool) < 0)
3492 worker = create_worker(pool);
3493 if (!worker)
3494 return NOTIFY_BAD; 4635 return NOTIFY_BAD;
3495
3496 spin_lock_irq(&pool->lock);
3497 start_worker(worker);
3498 spin_unlock_irq(&pool->lock);
3499 } 4636 }
3500 break; 4637 break;
3501 4638
3502 case CPU_DOWN_FAILED: 4639 case CPU_DOWN_FAILED:
3503 case CPU_ONLINE: 4640 case CPU_ONLINE:
3504 for_each_std_worker_pool(pool, cpu) { 4641 mutex_lock(&wq_pool_mutex);
3505 mutex_lock(&pool->assoc_mutex);
3506 spin_lock_irq(&pool->lock);
3507 4642
3508 pool->flags &= ~POOL_DISASSOCIATED; 4643 for_each_pool(pool, pi) {
3509 rebind_workers(pool); 4644 mutex_lock(&pool->manager_mutex);
4645
4646 if (pool->cpu == cpu) {
4647 spin_lock_irq(&pool->lock);
4648 pool->flags &= ~POOL_DISASSOCIATED;
4649 spin_unlock_irq(&pool->lock);
3510 4650
3511 spin_unlock_irq(&pool->lock); 4651 rebind_workers(pool);
3512 mutex_unlock(&pool->assoc_mutex); 4652 } else if (pool->cpu < 0) {
4653 restore_unbound_workers_cpumask(pool, cpu);
4654 }
4655
4656 mutex_unlock(&pool->manager_mutex);
3513 } 4657 }
4658
4659 /* update NUMA affinity of unbound workqueues */
4660 list_for_each_entry(wq, &workqueues, list)
4661 wq_update_unbound_numa(wq, cpu, true);
4662
4663 mutex_unlock(&wq_pool_mutex);
3514 break; 4664 break;
3515 } 4665 }
3516 return NOTIFY_OK; 4666 return NOTIFY_OK;
@@ -3524,14 +4674,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
3524 unsigned long action, 4674 unsigned long action,
3525 void *hcpu) 4675 void *hcpu)
3526{ 4676{
3527 unsigned int cpu = (unsigned long)hcpu; 4677 int cpu = (unsigned long)hcpu;
3528 struct work_struct unbind_work; 4678 struct work_struct unbind_work;
4679 struct workqueue_struct *wq;
3529 4680
3530 switch (action & ~CPU_TASKS_FROZEN) { 4681 switch (action & ~CPU_TASKS_FROZEN) {
3531 case CPU_DOWN_PREPARE: 4682 case CPU_DOWN_PREPARE:
3532 /* unbinding should happen on the local CPU */ 4683 /* unbinding per-cpu workers should happen on the local CPU */
3533 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); 4684 INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
3534 queue_work_on(cpu, system_highpri_wq, &unbind_work); 4685 queue_work_on(cpu, system_highpri_wq, &unbind_work);
4686
4687 /* update NUMA affinity of unbound workqueues */
4688 mutex_lock(&wq_pool_mutex);
4689 list_for_each_entry(wq, &workqueues, list)
4690 wq_update_unbound_numa(wq, cpu, false);
4691 mutex_unlock(&wq_pool_mutex);
4692
4693 /* wait for per-cpu unbinding to finish */
3535 flush_work(&unbind_work); 4694 flush_work(&unbind_work);
3536 break; 4695 break;
3537 } 4696 }
@@ -3564,7 +4723,7 @@ static void work_for_cpu_fn(struct work_struct *work)
3564 * It is up to the caller to ensure that the cpu doesn't go offline. 4723 * It is up to the caller to ensure that the cpu doesn't go offline.
3565 * The caller must not hold any locks which would prevent @fn from completing. 4724 * The caller must not hold any locks which would prevent @fn from completing.
3566 */ 4725 */
3567long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 4726long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
3568{ 4727{
3569 struct work_for_cpu wfc = { .fn = fn, .arg = arg }; 4728 struct work_for_cpu wfc = { .fn = fn, .arg = arg };
3570 4729
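work_on_cpu() runs fn(arg) on the target CPU and waits for its return value by queueing a work item there and flushing it. A rough userspace analogue of that contract (not the kernel implementation) that pins a helper thread with glibc's pthread_setaffinity_np() and joins it; build with -pthread:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct work_for_cpu {
        long (*fn)(void *);
        void *arg;
        long ret;
        int cpu;
};

static void *work_for_cpu_fn(void *data)
{
        struct work_for_cpu *wfc = data;
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(wfc->cpu, &set);
        pthread_setaffinity_np(pthread_self(), sizeof(set), &set);

        wfc->ret = wfc->fn(wfc->arg);
        return NULL;
}

static long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        struct work_for_cpu wfc = { .fn = fn, .arg = arg, .cpu = cpu };
        pthread_t t;

        pthread_create(&t, NULL, work_for_cpu_fn, &wfc);
        pthread_join(t, NULL);          /* analogous to flush_work() */
        return wfc.ret;
}

static long where_am_i(void *arg)
{
        (void)arg;
        return sched_getcpu();
}

int main(void)
{
        printf("ran on CPU %ld\n", work_on_cpu(0, where_am_i, NULL));
        return 0;
}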
@@ -3582,44 +4741,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
3582 * freeze_workqueues_begin - begin freezing workqueues 4741 * freeze_workqueues_begin - begin freezing workqueues
3583 * 4742 *
3584 * Start freezing workqueues. After this function returns, all freezable 4743 * Start freezing workqueues. After this function returns, all freezable
3585 * workqueues will queue new works to their frozen_works list instead of 4744 * workqueues will queue new works to their delayed_works list instead of
3586 * pool->worklist. 4745 * pool->worklist.
3587 * 4746 *
3588 * CONTEXT: 4747 * CONTEXT:
3589 * Grabs and releases workqueue_lock and pool->lock's. 4748 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3590 */ 4749 */
3591void freeze_workqueues_begin(void) 4750void freeze_workqueues_begin(void)
3592{ 4751{
3593 unsigned int cpu; 4752 struct worker_pool *pool;
4753 struct workqueue_struct *wq;
4754 struct pool_workqueue *pwq;
4755 int pi;
3594 4756
3595 spin_lock(&workqueue_lock); 4757 mutex_lock(&wq_pool_mutex);
3596 4758
3597 BUG_ON(workqueue_freezing); 4759 WARN_ON_ONCE(workqueue_freezing);
3598 workqueue_freezing = true; 4760 workqueue_freezing = true;
3599 4761
3600 for_each_wq_cpu(cpu) { 4762 /* set FREEZING */
3601 struct worker_pool *pool; 4763 for_each_pool(pool, pi) {
3602 struct workqueue_struct *wq; 4764 spin_lock_irq(&pool->lock);
3603 4765 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3604 for_each_std_worker_pool(pool, cpu) { 4766 pool->flags |= POOL_FREEZING;
3605 spin_lock_irq(&pool->lock); 4767 spin_unlock_irq(&pool->lock);
3606 4768 }
3607 WARN_ON_ONCE(pool->flags & POOL_FREEZING);
3608 pool->flags |= POOL_FREEZING;
3609
3610 list_for_each_entry(wq, &workqueues, list) {
3611 struct pool_workqueue *pwq = get_pwq(cpu, wq);
3612
3613 if (pwq && pwq->pool == pool &&
3614 (wq->flags & WQ_FREEZABLE))
3615 pwq->max_active = 0;
3616 }
3617 4769
3618 spin_unlock_irq(&pool->lock); 4770 list_for_each_entry(wq, &workqueues, list) {
3619 } 4771 mutex_lock(&wq->mutex);
4772 for_each_pwq(pwq, wq)
4773 pwq_adjust_max_active(pwq);
4774 mutex_unlock(&wq->mutex);
3620 } 4775 }
3621 4776
3622 spin_unlock(&workqueue_lock); 4777 mutex_unlock(&wq_pool_mutex);
3623} 4778}
3624 4779
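freeze_workqueues_begin() and thaw_workqueues() both funnel through pwq_adjust_max_active(): freezing forces a freezable pwq's max_active to 0 so new work parks on the delayed list, and thawing restores the saved value and promotes the parked items. A toy userspace model of that bookkeeping; the struct and helper names are invented, and the real code does this per pool_workqueue under pool->lock:

#include <stdbool.h>
#include <stdio.h>

struct toy_pwq {
        int saved_max_active;   /* what the user asked for */
        int max_active;         /* forced to 0 while frozen */
        int nr_active;          /* items currently runnable */
        int nr_delayed;         /* items parked while frozen/over limit */
        bool frozen;
};

static void toy_adjust_max_active(struct toy_pwq *pwq)
{
        pwq->max_active = pwq->frozen ? 0 : pwq->saved_max_active;

        /* on thaw, promote delayed items up to the restored limit */
        while (!pwq->frozen && pwq->nr_delayed &&
               pwq->nr_active < pwq->max_active) {
                pwq->nr_delayed--;
                pwq->nr_active++;
        }
}

static void toy_queue(struct toy_pwq *pwq)
{
        if (pwq->nr_active < pwq->max_active)
                pwq->nr_active++;
        else
                pwq->nr_delayed++;
}

int main(void)
{
        struct toy_pwq pwq = { .saved_max_active = 2, .max_active = 2 };

        pwq.frozen = true;
        toy_adjust_max_active(&pwq);    /* freeze: max_active -> 0 */
        toy_queue(&pwq);
        toy_queue(&pwq);
        printf("frozen: active=%d delayed=%d\n", pwq.nr_active, pwq.nr_delayed);

        pwq.frozen = false;
        toy_adjust_max_active(&pwq);    /* thaw: restore and repopulate */
        printf("thawed: active=%d delayed=%d\n", pwq.nr_active, pwq.nr_delayed);
        return 0;
}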
3625/** 4780/**
@@ -3629,7 +4784,7 @@ void freeze_workqueues_begin(void)
3629 * between freeze_workqueues_begin() and thaw_workqueues(). 4784 * between freeze_workqueues_begin() and thaw_workqueues().
3630 * 4785 *
3631 * CONTEXT: 4786 * CONTEXT:
3632 * Grabs and releases workqueue_lock. 4787 * Grabs and releases wq_pool_mutex.
3633 * 4788 *
3634 * RETURNS: 4789 * RETURNS:
3635 * %true if some freezable workqueues are still busy. %false if freezing 4790 * %true if some freezable workqueues are still busy. %false if freezing
@@ -3637,34 +4792,34 @@ void freeze_workqueues_begin(void)
3637 */ 4792 */
3638bool freeze_workqueues_busy(void) 4793bool freeze_workqueues_busy(void)
3639{ 4794{
3640 unsigned int cpu;
3641 bool busy = false; 4795 bool busy = false;
4796 struct workqueue_struct *wq;
4797 struct pool_workqueue *pwq;
3642 4798
3643 spin_lock(&workqueue_lock); 4799 mutex_lock(&wq_pool_mutex);
3644 4800
3645 BUG_ON(!workqueue_freezing); 4801 WARN_ON_ONCE(!workqueue_freezing);
3646 4802
3647 for_each_wq_cpu(cpu) { 4803 list_for_each_entry(wq, &workqueues, list) {
3648 struct workqueue_struct *wq; 4804 if (!(wq->flags & WQ_FREEZABLE))
4805 continue;
3649 /* 4806 /*
3650 * nr_active is monotonically decreasing. It's safe 4807 * nr_active is monotonically decreasing. It's safe
3651 * to peek without lock. 4808 * to peek without lock.
3652 */ 4809 */
3653 list_for_each_entry(wq, &workqueues, list) { 4810 rcu_read_lock_sched();
3654 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4811 for_each_pwq(pwq, wq) {
3655 4812 WARN_ON_ONCE(pwq->nr_active < 0);
3656 if (!pwq || !(wq->flags & WQ_FREEZABLE))
3657 continue;
3658
3659 BUG_ON(pwq->nr_active < 0);
3660 if (pwq->nr_active) { 4813 if (pwq->nr_active) {
3661 busy = true; 4814 busy = true;
4815 rcu_read_unlock_sched();
3662 goto out_unlock; 4816 goto out_unlock;
3663 } 4817 }
3664 } 4818 }
4819 rcu_read_unlock_sched();
3665 } 4820 }
3666out_unlock: 4821out_unlock:
3667 spin_unlock(&workqueue_lock); 4822 mutex_unlock(&wq_pool_mutex);
3668 return busy; 4823 return busy;
3669} 4824}
3670 4825
@@ -3675,104 +4830,141 @@ out_unlock:
3675 * frozen works are transferred to their respective pool worklists. 4830 * frozen works are transferred to their respective pool worklists.
3676 * 4831 *
3677 * CONTEXT: 4832 * CONTEXT:
3678 * Grabs and releases workqueue_lock and pool->lock's. 4833 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
3679 */ 4834 */
3680void thaw_workqueues(void) 4835void thaw_workqueues(void)
3681{ 4836{
3682 unsigned int cpu; 4837 struct workqueue_struct *wq;
4838 struct pool_workqueue *pwq;
4839 struct worker_pool *pool;
4840 int pi;
3683 4841
3684 spin_lock(&workqueue_lock); 4842 mutex_lock(&wq_pool_mutex);
3685 4843
3686 if (!workqueue_freezing) 4844 if (!workqueue_freezing)
3687 goto out_unlock; 4845 goto out_unlock;
3688 4846
3689 for_each_wq_cpu(cpu) { 4847 /* clear FREEZING */
3690 struct worker_pool *pool; 4848 for_each_pool(pool, pi) {
3691 struct workqueue_struct *wq; 4849 spin_lock_irq(&pool->lock);
4850 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
4851 pool->flags &= ~POOL_FREEZING;
4852 spin_unlock_irq(&pool->lock);
4853 }
3692 4854
3693 for_each_std_worker_pool(pool, cpu) { 4855 /* restore max_active and repopulate worklist */
3694 spin_lock_irq(&pool->lock); 4856 list_for_each_entry(wq, &workqueues, list) {
4857 mutex_lock(&wq->mutex);
4858 for_each_pwq(pwq, wq)
4859 pwq_adjust_max_active(pwq);
4860 mutex_unlock(&wq->mutex);
4861 }
3695 4862
3696 WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); 4863 workqueue_freezing = false;
3697 pool->flags &= ~POOL_FREEZING; 4864out_unlock:
4865 mutex_unlock(&wq_pool_mutex);
4866}
4867#endif /* CONFIG_FREEZER */
3698 4868
3699 list_for_each_entry(wq, &workqueues, list) { 4869static void __init wq_numa_init(void)
3700 struct pool_workqueue *pwq = get_pwq(cpu, wq); 4870{
4871 cpumask_var_t *tbl;
4872 int node, cpu;
3701 4873
3702 if (!pwq || pwq->pool != pool || 4874 /* determine NUMA pwq table len - highest node id + 1 */
3703 !(wq->flags & WQ_FREEZABLE)) 4875 for_each_node(node)
3704 continue; 4876 wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
3705 4877
3706 /* restore max_active and repopulate worklist */ 4878 if (num_possible_nodes() <= 1)
3707 pwq_set_max_active(pwq, wq->saved_max_active); 4879 return;
3708 }
3709 4880
3710 wake_up_worker(pool); 4881 if (wq_disable_numa) {
4882 pr_info("workqueue: NUMA affinity support disabled\n");
4883 return;
4884 }
4885
4886 wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
4887 BUG_ON(!wq_update_unbound_numa_attrs_buf);
3711 4888
3712 spin_unlock_irq(&pool->lock); 4889 /*
4890 * We want masks of possible CPUs of each node which isn't readily
4891 * available. Build one from cpu_to_node() which should have been
4892 * fully initialized by now.
4893 */
4894 tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
4895 BUG_ON(!tbl);
4896
4897 for_each_node(node)
4898 BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
4899
4900 for_each_possible_cpu(cpu) {
4901 node = cpu_to_node(cpu);
4902 if (WARN_ON(node == NUMA_NO_NODE)) {
4903 pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
4904 /* happens iff arch is bonkers, let's just proceed */
4905 return;
3713 } 4906 }
4907 cpumask_set_cpu(cpu, tbl[node]);
3714 } 4908 }
3715 4909
3716 workqueue_freezing = false; 4910 wq_numa_possible_cpumask = tbl;
3717out_unlock: 4911 wq_numa_enabled = true;
3718 spin_unlock(&workqueue_lock);
3719} 4912}
3720#endif /* CONFIG_FREEZER */
3721 4913
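wq_numa_init() sizes the per-node table from the highest node id plus one and then buckets every possible CPU into its node's mask via cpu_to_node(). A small userspace sketch of that construction, with a made-up cpu_to_node[] mapping and plain bitmasks in place of cpumask_var_t:

#include <stdio.h>

#define NR_CPUS 8

static const int cpu_to_node[NR_CPUS] = { 0, 0, 1, 1, 0, 0, 3, 3 };

int main(void)
{
        int tbl_len = 0;
        int cpu;

        /* table length = highest node id referenced + 1 (nodes may be sparse) */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpu_to_node[cpu] + 1 > tbl_len)
                        tbl_len = cpu_to_node[cpu] + 1;

        /* per-node CPU masks, one bit per possible CPU */
        unsigned int tbl[tbl_len];
        for (int node = 0; node < tbl_len; node++)
                tbl[node] = 0;
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                tbl[cpu_to_node[cpu]] |= 1u << cpu;

        for (int node = 0; node < tbl_len; node++)
                printf("node %d: cpumask %#x\n", node, tbl[node]);
        return 0;
}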
3722static int __init init_workqueues(void) 4914static int __init init_workqueues(void)
3723{ 4915{
3724 unsigned int cpu; 4916 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
4917 int i, cpu;
3725 4918
3726 /* make sure we have enough bits for OFFQ pool ID */ 4919 /* make sure we have enough bits for OFFQ pool ID */
3727 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < 4920 BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
3728 WORK_CPU_END * NR_STD_WORKER_POOLS); 4921 WORK_CPU_END * NR_STD_WORKER_POOLS);
3729 4922
4923 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
4924
4925 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
4926
3730 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); 4927 cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
3731 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); 4928 hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
3732 4929
4930 wq_numa_init();
4931
3733 /* initialize CPU pools */ 4932 /* initialize CPU pools */
3734 for_each_wq_cpu(cpu) { 4933 for_each_possible_cpu(cpu) {
3735 struct worker_pool *pool; 4934 struct worker_pool *pool;
3736 4935
3737 for_each_std_worker_pool(pool, cpu) { 4936 i = 0;
3738 spin_lock_init(&pool->lock); 4937 for_each_cpu_worker_pool(pool, cpu) {
4938 BUG_ON(init_worker_pool(pool));
3739 pool->cpu = cpu; 4939 pool->cpu = cpu;
3740 pool->flags |= POOL_DISASSOCIATED; 4940 cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
3741 INIT_LIST_HEAD(&pool->worklist); 4941 pool->attrs->nice = std_nice[i++];
3742 INIT_LIST_HEAD(&pool->idle_list); 4942 pool->node = cpu_to_node(cpu);
3743 hash_init(pool->busy_hash);
3744
3745 init_timer_deferrable(&pool->idle_timer);
3746 pool->idle_timer.function = idle_worker_timeout;
3747 pool->idle_timer.data = (unsigned long)pool;
3748
3749 setup_timer(&pool->mayday_timer, pool_mayday_timeout,
3750 (unsigned long)pool);
3751
3752 mutex_init(&pool->assoc_mutex);
3753 ida_init(&pool->worker_ida);
3754 4943
3755 /* alloc pool ID */ 4944 /* alloc pool ID */
4945 mutex_lock(&wq_pool_mutex);
3756 BUG_ON(worker_pool_assign_id(pool)); 4946 BUG_ON(worker_pool_assign_id(pool));
4947 mutex_unlock(&wq_pool_mutex);
3757 } 4948 }
3758 } 4949 }
3759 4950
3760 /* create the initial worker */ 4951 /* create the initial worker */
3761 for_each_online_wq_cpu(cpu) { 4952 for_each_online_cpu(cpu) {
3762 struct worker_pool *pool; 4953 struct worker_pool *pool;
3763 4954
3764 for_each_std_worker_pool(pool, cpu) { 4955 for_each_cpu_worker_pool(pool, cpu) {
3765 struct worker *worker; 4956 pool->flags &= ~POOL_DISASSOCIATED;
4957 BUG_ON(create_and_start_worker(pool) < 0);
4958 }
4959 }
3766 4960
3767 if (cpu != WORK_CPU_UNBOUND) 4961 /* create default unbound wq attrs */
3768 pool->flags &= ~POOL_DISASSOCIATED; 4962 for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
4963 struct workqueue_attrs *attrs;
3769 4964
3770 worker = create_worker(pool); 4965 BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
3771 BUG_ON(!worker); 4966 attrs->nice = std_nice[i];
3772 spin_lock_irq(&pool->lock); 4967 unbound_std_wq_attrs[i] = attrs;
3773 start_worker(worker);
3774 spin_unlock_irq(&pool->lock);
3775 }
3776 } 4968 }
3777 4969
3778 system_wq = alloc_workqueue("events", 0, 0); 4970 system_wq = alloc_workqueue("events", 0, 0);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 07650264ec15..ad83c96b2ece 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -29,16 +29,24 @@ struct worker {
29 struct work_struct *current_work; /* L: work being processed */ 29 struct work_struct *current_work; /* L: work being processed */
30 work_func_t current_func; /* L: current_work's fn */ 30 work_func_t current_func; /* L: current_work's fn */
31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */ 31 struct pool_workqueue *current_pwq; /* L: current_work's pwq */
32 bool desc_valid; /* ->desc is valid */
32 struct list_head scheduled; /* L: scheduled works */ 33 struct list_head scheduled; /* L: scheduled works */
34
35 /* 64 bytes boundary on 64bit, 32 on 32bit */
36
33 struct task_struct *task; /* I: worker task */ 37 struct task_struct *task; /* I: worker task */
34 struct worker_pool *pool; /* I: the associated pool */ 38 struct worker_pool *pool; /* I: the associated pool */
35 /* 64 bytes boundary on 64bit, 32 on 32bit */ 39 /* L: for rescuers */
40
36 unsigned long last_active; /* L: last active timestamp */ 41 unsigned long last_active; /* L: last active timestamp */
37 unsigned int flags; /* X: flags */ 42 unsigned int flags; /* X: flags */
38 int id; /* I: worker id */ 43 int id; /* I: worker id */
39 44
40 /* for rebinding worker to CPU */ 45 /*
41         struct work_struct      rebind_work;    /* L: for busy worker */        46         * Opaque string set with set_worker_desc(). Printed out with task
47 * dump for debugging - WARN, BUG, panic or sysrq.
48 */
49 char desc[WORKER_DESC_LEN];
42 50
43 /* used only by rescuers to point to the target workqueue */ 51 /* used only by rescuers to point to the target workqueue */
44 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ 52 struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
@@ -58,8 +66,7 @@ static inline struct worker *current_wq_worker(void)
58 * Scheduler hooks for concurrency managed workqueue. Only to be used from 66 * Scheduler hooks for concurrency managed workqueue. Only to be used from
59 * sched.c and workqueue.c. 67 * sched.c and workqueue.c.
60 */ 68 */
61void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); 69void wq_worker_waking_up(struct task_struct *task, int cpu);
62struct task_struct *wq_worker_sleeping(struct task_struct *task, 70struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
63 unsigned int cpu);
64 71
65#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ 72#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */