aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt25
-rw-r--r--kernel/Makefile12
-rw-r--r--kernel/acct.c7
-rw-r--r--kernel/async.c323
-rw-r--r--kernel/audit.h5
-rw-r--r--kernel/audit_tree.c3
-rw-r--r--kernel/auditfilter.c325
-rw-r--r--kernel/auditsc.c930
-rw-r--r--kernel/capability.c288
-rw-r--r--kernel/cgroup.c333
-rw-r--r--kernel/compat.c54
-rw-r--r--kernel/cpu.c157
-rw-r--r--kernel/cpuset.c285
-rw-r--r--kernel/cred-internals.h21
-rw-r--r--kernel/cred.c588
-rw-r--r--kernel/delayacct.c2
-rw-r--r--kernel/dma-coherent.c42
-rw-r--r--kernel/exit.c53
-rw-r--r--kernel/extable.c21
-rw-r--r--kernel/fork.c101
-rw-r--r--kernel/futex.c433
-rw-r--r--kernel/futex_compat.c7
-rw-r--r--kernel/hrtimer.c390
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/autoprobe.c5
-rw-r--r--kernel/irq/chip.c18
-rw-r--r--kernel/irq/handle.c205
-rw-r--r--kernel/irq/internals.h5
-rw-r--r--kernel/irq/manage.c58
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/numa_migrate.c119
-rw-r--r--kernel/irq/proc.c63
-rw-r--r--kernel/kallsyms.c16
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c34
-rw-r--r--kernel/kprobes.c281
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c3
-rw-r--r--kernel/lockdep.c61
-rw-r--r--kernel/lockdep_proc.c28
-rw-r--r--kernel/marker.c192
-rw-r--r--kernel/module.c107
-rw-r--r--kernel/mutex.c10
-rw-r--r--kernel/notifier.c8
-rw-r--r--kernel/ns_cgroup.c2
-rw-r--r--kernel/nsproxy.c15
-rw-r--r--kernel/panic.c34
-rw-r--r--kernel/pid.c8
-rw-r--r--kernel/posix-cpu-timers.c10
-rw-r--r--kernel/posix-timers.c40
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c11
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/profile.c41
-rw-r--r--kernel/ptrace.c41
-rw-r--r--kernel/rcuclassic.c36
-rw-r--r--kernel/rcupdate.c11
-rw-r--r--kernel/rcupreempt.c40
-rw-r--r--kernel/rcupreempt_trace.c10
-rw-r--r--kernel/rcutorture.c91
-rw-r--r--kernel/rcutree.c1532
-rw-r--r--kernel/rcutree_trace.c271
-rw-r--r--kernel/res_counter.c44
-rw-r--r--kernel/resource.c70
-rw-r--r--kernel/sched.c1481
-rw-r--r--kernel/sched_clock.c5
-rw-r--r--kernel/sched_cpupri.c39
-rw-r--r--kernel/sched_cpupri.h5
-rw-r--r--kernel/sched_debug.c57
-rw-r--r--kernel/sched_fair.c73
-rw-r--r--kernel/sched_rt.c83
-rw-r--r--kernel/sched_stats.h8
-rw-r--r--kernel/signal.c65
-rw-r--r--kernel/smp.c145
-rw-r--r--kernel/softirq.c41
-rw-r--r--kernel/softlockup.c12
-rw-r--r--kernel/stacktrace.c11
-rw-r--r--kernel/stop_machine.c63
-rw-r--r--kernel/sys.c592
-rw-r--r--kernel/sysctl.c63
-rw-r--r--kernel/sysctl_check.c1
-rw-r--r--kernel/taskstats.c41
-rw-r--r--kernel/test_kprobes.c210
-rw-r--r--kernel/time.c4
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c9
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/tick-broadcast.c113
-rw-r--r--kernel/time/tick-common.c18
-rw-r--r--kernel/time/tick-sched.c66
-rw-r--r--kernel/time/timekeeping.c7
-rw-r--r--kernel/timer.c23
-rw-r--r--kernel/trace/Kconfig115
-rw-r--r--kernel/trace/Makefile9
-rw-r--r--kernel/trace/ftrace.c929
-rw-r--r--kernel/trace/ring_buffer.c787
-rw-r--r--kernel/trace/trace.c1043
-rw-r--r--kernel/trace/trace.h265
-rw-r--r--kernel/trace/trace_boot.c160
-rw-r--r--kernel/trace/trace_branch.c342
-rw-r--r--kernel/trace/trace_functions.c30
-rw-r--r--kernel/trace/trace_functions_graph.c669
-rw-r--r--kernel/trace/trace_hw_branches.c195
-rw-r--r--kernel/trace/trace_irqsoff.c61
-rw-r--r--kernel/trace/trace_mmiotrace.c33
-rw-r--r--kernel/trace/trace_nop.c65
-rw-r--r--kernel/trace/trace_power.c179
-rw-r--r--kernel/trace/trace_sched_switch.c121
-rw-r--r--kernel/trace/trace_sched_wakeup.c72
-rw-r--r--kernel/trace/trace_selftest.c173
-rw-r--r--kernel/trace/trace_stack.c70
-rw-r--r--kernel/trace/trace_sysprof.c45
-rw-r--r--kernel/tracepoint.c295
-rw-r--r--kernel/tsacct.c10
-rw-r--r--kernel/uid16.c31
-rw-r--r--kernel/user.c98
-rw-r--r--kernel/user_namespace.c65
-rw-r--r--kernel/workqueue.c34
120 files changed, 12357 insertions, 4747 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
69config RCU_TRACE
70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
72 select DEBUG_FS
73 default y
74 help
75 This option provides tracing in RCU which presents stats
76 in debugfs for debugging RCU implementation.
77
78 Say Y here if you want to enable RCU tracing
79 Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d..2921d90ce32f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o
13 14
14ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -19,7 +20,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
19CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
20CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
21CFLAGS_REMOVE_sched_clock.o = -pg 22CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_sched.o = -pg
23endif 23endif
24 24
25obj-$(CONFIG_FREEZER) += freezer.o 25obj-$(CONFIG_FREEZER) += freezer.o
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
74obj-$(CONFIG_SECCOMP) += seccomp.o 74obj-$(CONFIG_SECCOMP) += seccomp.o
75obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 75obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
76obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o 76obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
77obj-$(CONFIG_TREE_RCU) += rcutree.o
77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 78obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
78ifeq ($(CONFIG_PREEMPT_RCU),y) 79obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
79obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o 80obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
80endif
81obj-$(CONFIG_RELAY) += relay.o 81obj-$(CONFIG_RELAY) += relay.o
82obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 82obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -90,7 +90,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
90obj-$(CONFIG_TRACING) += trace/ 90obj-$(CONFIG_TRACING) += trace/
91obj-$(CONFIG_SMP) += sched_cpupri.o 91obj-$(CONFIG_SMP) += sched_cpupri.o
92 92
93ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 93ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
95# needed for x86 only. Why this used to be enabled for all architectures is beyond 95# needed for x86 only. Why this used to be enabled for all architectures is beyond
96# me. I suspect most platforms don't need this, but until we know that for sure 96# me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/acct.c b/kernel/acct.c
index f6006a60df5d..d57b7cbb98b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -530,15 +530,14 @@ static void do_acct_process(struct bsd_acct_struct *acct,
530 do_div(elapsed, AHZ); 530 do_div(elapsed, AHZ);
531 ac.ac_btime = get_seconds() - elapsed; 531 ac.ac_btime = get_seconds() - elapsed;
532 /* we really need to bite the bullet and change layout */ 532 /* we really need to bite the bullet and change layout */
533 ac.ac_uid = current->uid; 533 current_uid_gid(&ac.ac_uid, &ac.ac_gid);
534 ac.ac_gid = current->gid;
535#if ACCT_VERSION==2 534#if ACCT_VERSION==2
536 ac.ac_ahz = AHZ; 535 ac.ac_ahz = AHZ;
537#endif 536#endif
538#if ACCT_VERSION==1 || ACCT_VERSION==2 537#if ACCT_VERSION==1 || ACCT_VERSION==2
539 /* backward-compatible 16 bit fields */ 538 /* backward-compatible 16 bit fields */
540 ac.ac_uid16 = current->uid; 539 ac.ac_uid16 = ac.ac_uid;
541 ac.ac_gid16 = current->gid; 540 ac.ac_gid16 = ac.ac_gid;
542#endif 541#endif
543#if ACCT_VERSION==3 542#if ACCT_VERSION==3
544 ac.ac_pid = task_tgid_nr_ns(current, ns); 543 ac.ac_pid = task_tgid_nr_ns(current, ns);
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..64cc916299a5
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,323 @@
1/*
2 * async.c: Asynchronous function calls for boot performance
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13
14/*
15
16Goals and Theory of Operation
17
18The primary goal of this feature is to reduce the kernel boot time,
19by doing various independent hardware delays and discovery operations
20decoupled and not strictly serialized.
21
22More specifically, the asynchronous function call concept allows
23certain operations (primarily during system boot) to happen
24asynchronously, out of order, while these operations still
25have their externally visible parts happen sequentially and in-order.
26(not unlike how out-of-order CPUs retire their instructions in order)
27
28Key to the asynchronous function call implementation is the concept of
29a "sequence cookie" (which, although it has an abstracted type, can be
30thought of as a monotonically incrementing number).
31
32The async core will assign each scheduled event such a sequence cookie and
33pass this to the called functions.
34
35The asynchronously called function should before doing a globally visible
36operation, such as registering device numbers, call the
37async_synchronize_cookie() function and pass in its own cookie. The
38async_synchronize_cookie() function will make sure that all asynchronous
39operations that were scheduled prior to the operation corresponding with the
40cookie have completed.
41
42Subsystem/driver initialization code that scheduled asynchronous probe
43functions, but which shares global resources with other drivers/subsystems
44that do not use the asynchronous call feature, need to do a full
45synchronization with the async_synchronize_full() function, before returning
46from their init function. This is to maintain strict ordering between the
47asynchronous and synchronous parts of the kernel.
48
49*/
50
51#include <linux/async.h>
52#include <linux/module.h>
53#include <linux/wait.h>
54#include <linux/sched.h>
55#include <linux/init.h>
56#include <linux/kthread.h>
57#include <asm/atomic.h>
58
59static async_cookie_t next_cookie = 1;
60
61#define MAX_THREADS 256
62#define MAX_WORK 32768
63
64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running);
66static DEFINE_SPINLOCK(async_lock);
67
68struct async_entry {
69 struct list_head list;
70 async_cookie_t cookie;
71 async_func_ptr *func;
72 void *data;
73 struct list_head *running;
74};
75
76static DECLARE_WAIT_QUEUE_HEAD(async_done);
77static DECLARE_WAIT_QUEUE_HEAD(async_new);
78
79static atomic_t entry_count;
80static atomic_t thread_count;
81
82extern int initcall_debug;
83
84
85/*
86 * MUST be called with the lock held!
87 */
88static async_cookie_t __lowest_in_progress(struct list_head *running)
89{
90 struct async_entry *entry;
91 if (!list_empty(&async_pending)) {
92 entry = list_first_entry(&async_pending,
93 struct async_entry, list);
94 return entry->cookie;
95 } else if (!list_empty(running)) {
96 entry = list_first_entry(running,
97 struct async_entry, list);
98 return entry->cookie;
99 } else {
100 /* nothing in progress... next_cookie is "infinity" */
101 return next_cookie;
102 }
103
104}
105/*
106 * pick the first pending entry and run it
107 */
108static void run_one_entry(void)
109{
110 unsigned long flags;
111 struct async_entry *entry;
112 ktime_t calltime, delta, rettime;
113
114 /* 1) pick one task from the pending queue */
115
116 spin_lock_irqsave(&async_lock, flags);
117 if (list_empty(&async_pending))
118 goto out;
119 entry = list_first_entry(&async_pending, struct async_entry, list);
120
121 /* 2) move it to the running queue */
122 list_del(&entry->list);
123 list_add_tail(&entry->list, &async_running);
124 spin_unlock_irqrestore(&async_lock, flags);
125
126 /* 3) run it (and print duration)*/
127 if (initcall_debug && system_state == SYSTEM_BOOTING) {
128 printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
129 calltime = ktime_get();
130 }
131 entry->func(entry->data, entry->cookie);
132 if (initcall_debug && system_state == SYSTEM_BOOTING) {
133 rettime = ktime_get();
134 delta = ktime_sub(rettime, calltime);
135 printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
136 entry->func, ktime_to_ns(delta) >> 10);
137 }
138
139 /* 4) remove it from the running queue */
140 spin_lock_irqsave(&async_lock, flags);
141 list_del(&entry->list);
142
143 /* 5) free the entry */
144 kfree(entry);
145 atomic_dec(&entry_count);
146
147 spin_unlock_irqrestore(&async_lock, flags);
148
149 /* 6) wake up any waiters. */
150 wake_up(&async_done);
151 return;
152
153out:
154 spin_unlock_irqrestore(&async_lock, flags);
155}
156
157
158static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
159{
160 struct async_entry *entry;
161 unsigned long flags;
162 async_cookie_t newcookie;
163
164
165 /* allow irq-off callers */
166 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
167
168 /*
169 * If we're out of memory or if there's too much work
170 * pending already, we execute synchronously.
171 */
172 if (!entry || atomic_read(&entry_count) > MAX_WORK) {
173 kfree(entry);
174 spin_lock_irqsave(&async_lock, flags);
175 newcookie = next_cookie++;
176 spin_unlock_irqrestore(&async_lock, flags);
177
178 /* low on memory.. run synchronously */
179 ptr(data, newcookie);
180 return newcookie;
181 }
182 entry->func = ptr;
183 entry->data = data;
184 entry->running = running;
185
186 spin_lock_irqsave(&async_lock, flags);
187 newcookie = entry->cookie = next_cookie++;
188 list_add_tail(&entry->list, &async_pending);
189 atomic_inc(&entry_count);
190 spin_unlock_irqrestore(&async_lock, flags);
191 wake_up(&async_new);
192 return newcookie;
193}
194
195async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
196{
197 return __async_schedule(ptr, data, &async_pending);
198}
199EXPORT_SYMBOL_GPL(async_schedule);
200
201async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
202{
203 return __async_schedule(ptr, data, running);
204}
205EXPORT_SYMBOL_GPL(async_schedule_special);
206
207void async_synchronize_full(void)
208{
209 do {
210 async_synchronize_cookie(next_cookie);
211 } while (!list_empty(&async_running) || !list_empty(&async_pending));
212}
213EXPORT_SYMBOL_GPL(async_synchronize_full);
214
215void async_synchronize_full_special(struct list_head *list)
216{
217 async_synchronize_cookie_special(next_cookie, list);
218}
219EXPORT_SYMBOL_GPL(async_synchronize_full_special);
220
221void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
222{
223 ktime_t starttime, delta, endtime;
224
225 if (initcall_debug && system_state == SYSTEM_BOOTING) {
226 printk("async_waiting @ %i\n", task_pid_nr(current));
227 starttime = ktime_get();
228 }
229
230 wait_event(async_done, __lowest_in_progress(running) >= cookie);
231
232 if (initcall_debug && system_state == SYSTEM_BOOTING) {
233 endtime = ktime_get();
234 delta = ktime_sub(endtime, starttime);
235
236 printk("async_continuing @ %i after %lli usec\n",
237 task_pid_nr(current), ktime_to_ns(delta) >> 10);
238 }
239}
240EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
241
242void async_synchronize_cookie(async_cookie_t cookie)
243{
244 async_synchronize_cookie_special(cookie, &async_running);
245}
246EXPORT_SYMBOL_GPL(async_synchronize_cookie);
247
248
249static int async_thread(void *unused)
250{
251 DECLARE_WAITQUEUE(wq, current);
252 add_wait_queue(&async_new, &wq);
253
254 while (!kthread_should_stop()) {
255 int ret = HZ;
256 set_current_state(TASK_INTERRUPTIBLE);
257 /*
258 * check the list head without lock.. false positives
259 * are dealt with inside run_one_entry() while holding
260 * the lock.
261 */
262 rmb();
263 if (!list_empty(&async_pending))
264 run_one_entry();
265 else
266 ret = schedule_timeout(HZ);
267
268 if (ret == 0) {
269 /*
270 * we timed out, this means we as thread are redundant.
271 * we sign off and die, but we to avoid any races there
272 * is a last-straw check to see if work snuck in.
273 */
274 atomic_dec(&thread_count);
275 wmb(); /* manager must see our departure first */
276 if (list_empty(&async_pending))
277 break;
278 /*
279 * woops work came in between us timing out and us
280 * signing off; we need to stay alive and keep working.
281 */
282 atomic_inc(&thread_count);
283 }
284 }
285 remove_wait_queue(&async_new, &wq);
286
287 return 0;
288}
289
290static int async_manager_thread(void *unused)
291{
292 DECLARE_WAITQUEUE(wq, current);
293 add_wait_queue(&async_new, &wq);
294
295 while (!kthread_should_stop()) {
296 int tc, ec;
297
298 set_current_state(TASK_INTERRUPTIBLE);
299
300 tc = atomic_read(&thread_count);
301 rmb();
302 ec = atomic_read(&entry_count);
303
304 while (tc < ec && tc < MAX_THREADS) {
305 kthread_run(async_thread, NULL, "async/%i", tc);
306 atomic_inc(&thread_count);
307 tc++;
308 }
309
310 schedule();
311 }
312 remove_wait_queue(&async_new, &wq);
313
314 return 0;
315}
316
317static int __init async_init(void)
318{
319 kthread_run(async_manager_thread, NULL, "async/mgr");
320 return 0;
321}
322
323core_initcall(async_init);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9d6717412fec..16f18cac661b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
159 return __audit_signal_info(sig, t); 159 return __audit_signal_info(sig, t);
160 return 0; 160 return 0;
161} 161}
162extern enum audit_state audit_filter_inodes(struct task_struct *, 162extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
163 struct audit_context *);
164extern void audit_set_auditable(struct audit_context *);
165#else 163#else
166#define audit_signal_info(s,t) AUDIT_DISABLED 164#define audit_signal_info(s,t) AUDIT_DISABLED
167#define audit_filter_inodes(t,c) AUDIT_DISABLED 165#define audit_filter_inodes(t,c) AUDIT_DISABLED
168#define audit_set_auditable(c)
169#endif 166#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8b509441f49a..8ad9545b8db9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree)
450 audit_log_end(ab); 450 audit_log_end(ab);
451 rule->tree = NULL; 451 rule->tree = NULL;
452 list_del_rcu(&entry->list); 452 list_del_rcu(&entry->list);
453 list_del(&entry->rule.list);
453 call_rcu(&entry->rcu, audit_free_rule_rcu); 454 call_rcu(&entry->rcu, audit_free_rule_rcu);
454 } 455 }
455 } 456 }
@@ -617,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
617 618
618 if (pathname[0] != '/' || 619 if (pathname[0] != '/' ||
619 rule->listnr != AUDIT_FILTER_EXIT || 620 rule->listnr != AUDIT_FILTER_EXIT ||
620 op & ~AUDIT_EQUAL || 621 op != Audit_equal ||
621 rule->inode_f || rule->watch || rule->tree) 622 rule->inode_f || rule->watch || rule->tree)
622 return -EINVAL; 623 return -EINVAL;
623 rule->tree = alloc_tree(pathname); 624 rule->tree = alloc_tree(pathname);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9fd85a4640a0..fbf24d121d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
86#error Fix audit_filter_list initialiser 86#error Fix audit_filter_list initialiser
87#endif 87#endif
88}; 88};
89static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
90 LIST_HEAD_INIT(audit_rules_list[0]),
91 LIST_HEAD_INIT(audit_rules_list[1]),
92 LIST_HEAD_INIT(audit_rules_list[2]),
93 LIST_HEAD_INIT(audit_rules_list[3]),
94 LIST_HEAD_INIT(audit_rules_list[4]),
95 LIST_HEAD_INIT(audit_rules_list[5]),
96};
89 97
90DEFINE_MUTEX(audit_filter_mutex); 98DEFINE_MUTEX(audit_filter_mutex);
91 99
@@ -244,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule,
244 struct audit_field *f) 252 struct audit_field *f)
245{ 253{
246 if (krule->listnr != AUDIT_FILTER_EXIT || 254 if (krule->listnr != AUDIT_FILTER_EXIT ||
247 krule->watch || krule->inode_f || krule->tree) 255 krule->watch || krule->inode_f || krule->tree ||
256 (f->op != Audit_equal && f->op != Audit_not_equal))
248 return -EINVAL; 257 return -EINVAL;
249 258
250 krule->inode_f = f; 259 krule->inode_f = f;
@@ -262,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
262 271
263 if (path[0] != '/' || path[len-1] == '/' || 272 if (path[0] != '/' || path[len-1] == '/' ||
264 krule->listnr != AUDIT_FILTER_EXIT || 273 krule->listnr != AUDIT_FILTER_EXIT ||
265 op & ~AUDIT_EQUAL || 274 op != Audit_equal ||
266 krule->inode_f || krule->watch || krule->tree) 275 krule->inode_f || krule->watch || krule->tree)
267 return -EINVAL; 276 return -EINVAL;
268 277
@@ -412,12 +421,32 @@ exit_err:
412 return ERR_PTR(err); 421 return ERR_PTR(err);
413} 422}
414 423
424static u32 audit_ops[] =
425{
426 [Audit_equal] = AUDIT_EQUAL,
427 [Audit_not_equal] = AUDIT_NOT_EQUAL,
428 [Audit_bitmask] = AUDIT_BIT_MASK,
429 [Audit_bittest] = AUDIT_BIT_TEST,
430 [Audit_lt] = AUDIT_LESS_THAN,
431 [Audit_gt] = AUDIT_GREATER_THAN,
432 [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
433 [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
434};
435
436static u32 audit_to_op(u32 op)
437{
438 u32 n;
439 for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
440 ;
441 return n;
442}
443
444
415/* Translate struct audit_rule to kernel's rule respresentation. 445/* Translate struct audit_rule to kernel's rule respresentation.
416 * Exists for backward compatibility with userspace. */ 446 * Exists for backward compatibility with userspace. */
417static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 447static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
418{ 448{
419 struct audit_entry *entry; 449 struct audit_entry *entry;
420 struct audit_field *ino_f;
421 int err = 0; 450 int err = 0;
422 int i; 451 int i;
423 452
@@ -427,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
427 456
428 for (i = 0; i < rule->field_count; i++) { 457 for (i = 0; i < rule->field_count; i++) {
429 struct audit_field *f = &entry->rule.fields[i]; 458 struct audit_field *f = &entry->rule.fields[i];
459 u32 n;
460
461 n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
462
463 /* Support for legacy operators where
464 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
465 if (n & AUDIT_NEGATE)
466 f->op = Audit_not_equal;
467 else if (!n)
468 f->op = Audit_equal;
469 else
470 f->op = audit_to_op(n);
471
472 entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
430 473
431 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
432 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 474 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
433 f->val = rule->values[i]; 475 f->val = rule->values[i];
434 476
435 err = -EINVAL; 477 err = -EINVAL;
478 if (f->op == Audit_bad)
479 goto exit_free;
480
436 switch(f->type) { 481 switch(f->type) {
437 default: 482 default:
438 goto exit_free; 483 goto exit_free;
@@ -454,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
454 case AUDIT_EXIT: 499 case AUDIT_EXIT:
455 case AUDIT_SUCCESS: 500 case AUDIT_SUCCESS:
456 /* bit ops are only useful on syscall args */ 501 /* bit ops are only useful on syscall args */
457 if (f->op == AUDIT_BIT_MASK || 502 if (f->op == Audit_bitmask || f->op == Audit_bittest)
458 f->op == AUDIT_BIT_TEST) {
459 err = -EINVAL;
460 goto exit_free; 503 goto exit_free;
461 }
462 break; 504 break;
463 case AUDIT_ARG0: 505 case AUDIT_ARG0:
464 case AUDIT_ARG1: 506 case AUDIT_ARG1:
@@ -467,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
467 break; 509 break;
468 /* arch is only allowed to be = or != */ 510 /* arch is only allowed to be = or != */
469 case AUDIT_ARCH: 511 case AUDIT_ARCH:
470 if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) 512 if (f->op != Audit_not_equal && f->op != Audit_equal)
471 && (f->op != AUDIT_NEGATE) && (f->op)) {
472 err = -EINVAL;
473 goto exit_free; 513 goto exit_free;
474 }
475 entry->rule.arch_f = f; 514 entry->rule.arch_f = f;
476 break; 515 break;
477 case AUDIT_PERM: 516 case AUDIT_PERM:
@@ -488,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
488 goto exit_free; 527 goto exit_free;
489 break; 528 break;
490 } 529 }
491
492 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
493
494 /* Support for legacy operators where
495 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
496 if (f->op & AUDIT_NEGATE)
497 f->op = AUDIT_NOT_EQUAL;
498 else if (!f->op)
499 f->op = AUDIT_EQUAL;
500 else if (f->op == AUDIT_OPERATORS) {
501 err = -EINVAL;
502 goto exit_free;
503 }
504 } 530 }
505 531
506 ino_f = entry->rule.inode_f; 532 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
507 if (ino_f) { 533 entry->rule.inode_f = NULL;
508 switch(ino_f->op) {
509 case AUDIT_NOT_EQUAL:
510 entry->rule.inode_f = NULL;
511 case AUDIT_EQUAL:
512 break;
513 default:
514 err = -EINVAL;
515 goto exit_free;
516 }
517 }
518 534
519exit_nofree: 535exit_nofree:
520 return entry; 536 return entry;
@@ -530,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
530{ 546{
531 int err = 0; 547 int err = 0;
532 struct audit_entry *entry; 548 struct audit_entry *entry;
533 struct audit_field *ino_f;
534 void *bufp; 549 void *bufp;
535 size_t remain = datasz - sizeof(struct audit_rule_data); 550 size_t remain = datasz - sizeof(struct audit_rule_data);
536 int i; 551 int i;
@@ -546,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
546 struct audit_field *f = &entry->rule.fields[i]; 561 struct audit_field *f = &entry->rule.fields[i];
547 562
548 err = -EINVAL; 563 err = -EINVAL;
549 if (!(data->fieldflags[i] & AUDIT_OPERATORS) || 564
550 data->fieldflags[i] & ~AUDIT_OPERATORS) 565 f->op = audit_to_op(data->fieldflags[i]);
566 if (f->op == Audit_bad)
551 goto exit_free; 567 goto exit_free;
552 568
553 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
554 f->type = data->fields[i]; 569 f->type = data->fields[i];
555 f->val = data->values[i]; 570 f->val = data->values[i];
556 f->lsm_str = NULL; 571 f->lsm_str = NULL;
@@ -662,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
662 } 677 }
663 } 678 }
664 679
665 ino_f = entry->rule.inode_f; 680 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
666 if (ino_f) { 681 entry->rule.inode_f = NULL;
667 switch(ino_f->op) {
668 case AUDIT_NOT_EQUAL:
669 entry->rule.inode_f = NULL;
670 case AUDIT_EQUAL:
671 break;
672 default:
673 err = -EINVAL;
674 goto exit_free;
675 }
676 }
677 682
678exit_nofree: 683exit_nofree:
679 return entry; 684 return entry;
@@ -713,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
713 rule->fields[i] = krule->fields[i].type; 718 rule->fields[i] = krule->fields[i].type;
714 719
715 if (krule->vers_ops == 1) { 720 if (krule->vers_ops == 1) {
716 if (krule->fields[i].op & AUDIT_NOT_EQUAL) 721 if (krule->fields[i].op == Audit_not_equal)
717 rule->fields[i] |= AUDIT_NEGATE; 722 rule->fields[i] |= AUDIT_NEGATE;
718 } else { 723 } else {
719 rule->fields[i] |= krule->fields[i].op; 724 rule->fields[i] |= audit_ops[krule->fields[i].op];
720 } 725 }
721 } 726 }
722 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; 727 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
@@ -744,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
744 struct audit_field *f = &krule->fields[i]; 749 struct audit_field *f = &krule->fields[i];
745 750
746 data->fields[i] = f->type; 751 data->fields[i] = f->type;
747 data->fieldflags[i] = f->op; 752 data->fieldflags[i] = audit_ops[f->op];
748 switch(f->type) { 753 switch(f->type) {
749 case AUDIT_SUBJ_USER: 754 case AUDIT_SUBJ_USER:
750 case AUDIT_SUBJ_ROLE: 755 case AUDIT_SUBJ_ROLE:
@@ -919,6 +924,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
919 new->action = old->action; 924 new->action = old->action;
920 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 925 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
921 new->mask[i] = old->mask[i]; 926 new->mask[i] = old->mask[i];
927 new->prio = old->prio;
922 new->buflen = old->buflen; 928 new->buflen = old->buflen;
923 new->inode_f = old->inode_f; 929 new->inode_f = old->inode_f;
924 new->watch = NULL; 930 new->watch = NULL;
@@ -987,9 +993,8 @@ static void audit_update_watch(struct audit_parent *parent,
987 993
988 /* If the update involves invalidating rules, do the inode-based 994 /* If the update involves invalidating rules, do the inode-based
989 * filtering now, so we don't omit records. */ 995 * filtering now, so we don't omit records. */
990 if (invalidating && current->audit_context && 996 if (invalidating && current->audit_context)
991 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) 997 audit_filter_inodes(current, current->audit_context);
992 audit_set_auditable(current->audit_context);
993 998
994 nwatch = audit_dupe_watch(owatch); 999 nwatch = audit_dupe_watch(owatch);
995 if (IS_ERR(nwatch)) { 1000 if (IS_ERR(nwatch)) {
@@ -1007,12 +1012,15 @@ static void audit_update_watch(struct audit_parent *parent,
1007 list_del_rcu(&oentry->list); 1012 list_del_rcu(&oentry->list);
1008 1013
1009 nentry = audit_dupe_rule(&oentry->rule, nwatch); 1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1010 if (IS_ERR(nentry)) 1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1011 audit_panic("error updating watch, removing"); 1017 audit_panic("error updating watch, removing");
1012 else { 1018 } else {
1013 int h = audit_hash_ino((u32)ino); 1019 int h = audit_hash_ino((u32)ino);
1014 list_add(&nentry->rule.rlist, &nwatch->rules); 1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1015 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1016 } 1024 }
1017 1025
1018 call_rcu(&oentry->rcu, audit_free_rule_rcu); 1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
@@ -1077,6 +1085,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1077 audit_log_end(ab); 1085 audit_log_end(ab);
1078 } 1086 }
1079 list_del(&r->rlist); 1087 list_del(&r->rlist);
1088 list_del(&r->list);
1080 list_del_rcu(&e->list); 1089 list_del_rcu(&e->list);
1081 call_rcu(&e->rcu, audit_free_rule_rcu); 1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1082 } 1091 }
@@ -1102,12 +1111,16 @@ static void audit_inotify_unregister(struct list_head *in_list)
1102/* Find an existing audit rule. 1111/* Find an existing audit rule.
1103 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1104static struct audit_entry *audit_find_rule(struct audit_entry *entry, 1113static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1105 struct list_head *list) 1114 struct list_head **p)
1106{ 1115{
1107 struct audit_entry *e, *found = NULL; 1116 struct audit_entry *e, *found = NULL;
1117 struct list_head *list;
1108 int h; 1118 int h;
1109 1119
1110 if (entry->rule.watch) { 1120 if (entry->rule.inode_f) {
1121 h = audit_hash_ino(entry->rule.inode_f->val);
1122 *p = list = &audit_inode_hash[h];
1123 } else if (entry->rule.watch) {
1111 /* we don't know the inode number, so must walk entire hash */ 1124 /* we don't know the inode number, so must walk entire hash */
1112 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { 1125 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
1113 list = &audit_inode_hash[h]; 1126 list = &audit_inode_hash[h];
@@ -1118,6 +1131,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1118 } 1131 }
1119 } 1132 }
1120 goto out; 1133 goto out;
1134 } else {
1135 *p = list = &audit_filter_list[entry->rule.listnr];
1121 } 1136 }
1122 1137
1123 list_for_each_entry(e, list, list) 1138 list_for_each_entry(e, list, list)
@@ -1258,15 +1273,17 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1258 return ret; 1273 return ret;
1259} 1274}
1260 1275
1276static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1;
1278
1261/* Add rule to given filterlist if not a duplicate. */ 1279/* Add rule to given filterlist if not a duplicate. */
1262static inline int audit_add_rule(struct audit_entry *entry, 1280static inline int audit_add_rule(struct audit_entry *entry)
1263 struct list_head *list)
1264{ 1281{
1265 struct audit_entry *e; 1282 struct audit_entry *e;
1266 struct audit_field *inode_f = entry->rule.inode_f;
1267 struct audit_watch *watch = entry->rule.watch; 1283 struct audit_watch *watch = entry->rule.watch;
1268 struct audit_tree *tree = entry->rule.tree; 1284 struct audit_tree *tree = entry->rule.tree;
1269 struct nameidata *ndp = NULL, *ndw = NULL; 1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list;
1270 int h, err; 1287 int h, err;
1271#ifdef CONFIG_AUDITSYSCALL 1288#ifdef CONFIG_AUDITSYSCALL
1272 int dont_count = 0; 1289 int dont_count = 0;
@@ -1277,13 +1294,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
1277 dont_count = 1; 1294 dont_count = 1;
1278#endif 1295#endif
1279 1296
1280 if (inode_f) {
1281 h = audit_hash_ino(inode_f->val);
1282 list = &audit_inode_hash[h];
1283 }
1284
1285 mutex_lock(&audit_filter_mutex); 1297 mutex_lock(&audit_filter_mutex);
1286 e = audit_find_rule(entry, list); 1298 e = audit_find_rule(entry, &list);
1287 mutex_unlock(&audit_filter_mutex); 1299 mutex_unlock(&audit_filter_mutex);
1288 if (e) { 1300 if (e) {
1289 err = -EEXIST; 1301 err = -EEXIST;
@@ -1319,10 +1331,22 @@ static inline int audit_add_rule(struct audit_entry *entry,
1319 } 1331 }
1320 } 1332 }
1321 1333
1334 entry->rule.prio = ~0ULL;
1335 if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
1336 if (entry->rule.flags & AUDIT_FILTER_PREPEND)
1337 entry->rule.prio = ++prio_high;
1338 else
1339 entry->rule.prio = --prio_low;
1340 }
1341
1322 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1342 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
1343 list_add(&entry->rule.list,
1344 &audit_rules_list[entry->rule.listnr]);
1323 list_add_rcu(&entry->list, list); 1345 list_add_rcu(&entry->list, list);
1324 entry->rule.flags &= ~AUDIT_FILTER_PREPEND; 1346 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
1325 } else { 1347 } else {
1348 list_add_tail(&entry->rule.list,
1349 &audit_rules_list[entry->rule.listnr]);
1326 list_add_tail_rcu(&entry->list, list); 1350 list_add_tail_rcu(&entry->list, list);
1327 } 1351 }
1328#ifdef CONFIG_AUDITSYSCALL 1352#ifdef CONFIG_AUDITSYSCALL
@@ -1345,15 +1369,14 @@ error:
1345} 1369}
1346 1370
1347/* Remove an existing rule from filterlist. */ 1371/* Remove an existing rule from filterlist. */
1348static inline int audit_del_rule(struct audit_entry *entry, 1372static inline int audit_del_rule(struct audit_entry *entry)
1349 struct list_head *list)
1350{ 1373{
1351 struct audit_entry *e; 1374 struct audit_entry *e;
1352 struct audit_field *inode_f = entry->rule.inode_f;
1353 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1354 struct audit_tree *tree = entry->rule.tree; 1376 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list;
1355 LIST_HEAD(inotify_list); 1378 LIST_HEAD(inotify_list);
1356 int h, ret = 0; 1379 int ret = 0;
1357#ifdef CONFIG_AUDITSYSCALL 1380#ifdef CONFIG_AUDITSYSCALL
1358 int dont_count = 0; 1381 int dont_count = 0;
1359 1382
@@ -1363,13 +1386,8 @@ static inline int audit_del_rule(struct audit_entry *entry,
1363 dont_count = 1; 1386 dont_count = 1;
1364#endif 1387#endif
1365 1388
1366 if (inode_f) {
1367 h = audit_hash_ino(inode_f->val);
1368 list = &audit_inode_hash[h];
1369 }
1370
1371 mutex_lock(&audit_filter_mutex); 1389 mutex_lock(&audit_filter_mutex);
1372 e = audit_find_rule(entry, list); 1390 e = audit_find_rule(entry, &list);
1373 if (!e) { 1391 if (!e) {
1374 mutex_unlock(&audit_filter_mutex); 1392 mutex_unlock(&audit_filter_mutex);
1375 ret = -ENOENT; 1393 ret = -ENOENT;
@@ -1404,6 +1422,7 @@ static inline int audit_del_rule(struct audit_entry *entry,
1404 audit_remove_tree_rule(&e->rule); 1422 audit_remove_tree_rule(&e->rule);
1405 1423
1406 list_del_rcu(&e->list); 1424 list_del_rcu(&e->list);
1425 list_del(&e->rule.list);
1407 call_rcu(&e->rcu, audit_free_rule_rcu); 1426 call_rcu(&e->rcu, audit_free_rule_rcu);
1408 1427
1409#ifdef CONFIG_AUDITSYSCALL 1428#ifdef CONFIG_AUDITSYSCALL
@@ -1432,30 +1451,16 @@ out:
1432static void audit_list(int pid, int seq, struct sk_buff_head *q) 1451static void audit_list(int pid, int seq, struct sk_buff_head *q)
1433{ 1452{
1434 struct sk_buff *skb; 1453 struct sk_buff *skb;
1435 struct audit_entry *entry; 1454 struct audit_krule *r;
1436 int i; 1455 int i;
1437 1456
1438 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1457 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1439 * iterator to sync with list writers. */ 1458 * iterator to sync with list writers. */
1440 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1459 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1441 list_for_each_entry(entry, &audit_filter_list[i], list) { 1460 list_for_each_entry(r, &audit_rules_list[i], list) {
1442 struct audit_rule *rule;
1443
1444 rule = audit_krule_to_rule(&entry->rule);
1445 if (unlikely(!rule))
1446 break;
1447 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1448 rule, sizeof(*rule));
1449 if (skb)
1450 skb_queue_tail(q, skb);
1451 kfree(rule);
1452 }
1453 }
1454 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
1455 list_for_each_entry(entry, &audit_inode_hash[i], list) {
1456 struct audit_rule *rule; 1461 struct audit_rule *rule;
1457 1462
1458 rule = audit_krule_to_rule(&entry->rule); 1463 rule = audit_krule_to_rule(r);
1459 if (unlikely(!rule)) 1464 if (unlikely(!rule))
1460 break; 1465 break;
1461 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, 1466 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
@@ -1474,30 +1479,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
1474static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 1479static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1475{ 1480{
1476 struct sk_buff *skb; 1481 struct sk_buff *skb;
1477 struct audit_entry *e; 1482 struct audit_krule *r;
1478 int i; 1483 int i;
1479 1484
1480 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1485 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1481 * iterator to sync with list writers. */ 1486 * iterator to sync with list writers. */
1482 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1487 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1483 list_for_each_entry(e, &audit_filter_list[i], list) { 1488 list_for_each_entry(r, &audit_rules_list[i], list) {
1484 struct audit_rule_data *data;
1485
1486 data = audit_krule_to_data(&e->rule);
1487 if (unlikely(!data))
1488 break;
1489 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1490 data, sizeof(*data) + data->buflen);
1491 if (skb)
1492 skb_queue_tail(q, skb);
1493 kfree(data);
1494 }
1495 }
1496 for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
1497 list_for_each_entry(e, &audit_inode_hash[i], list) {
1498 struct audit_rule_data *data; 1489 struct audit_rule_data *data;
1499 1490
1500 data = audit_krule_to_data(&e->rule); 1491 data = audit_krule_to_data(r);
1501 if (unlikely(!data)) 1492 if (unlikely(!data))
1502 break; 1493 break;
1503 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1494 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
@@ -1603,8 +1594,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1603 if (IS_ERR(entry)) 1594 if (IS_ERR(entry))
1604 return PTR_ERR(entry); 1595 return PTR_ERR(entry);
1605 1596
1606 err = audit_add_rule(entry, 1597 err = audit_add_rule(entry);
1607 &audit_filter_list[entry->rule.listnr]);
1608 audit_log_rule_change(loginuid, sessionid, sid, "add", 1598 audit_log_rule_change(loginuid, sessionid, sid, "add",
1609 &entry->rule, !err); 1599 &entry->rule, !err);
1610 1600
@@ -1620,8 +1610,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1620 if (IS_ERR(entry)) 1610 if (IS_ERR(entry))
1621 return PTR_ERR(entry); 1611 return PTR_ERR(entry);
1622 1612
1623 err = audit_del_rule(entry, 1613 err = audit_del_rule(entry);
1624 &audit_filter_list[entry->rule.listnr]);
1625 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1614 audit_log_rule_change(loginuid, sessionid, sid, "remove",
1626 &entry->rule, !err); 1615 &entry->rule, !err);
1627 1616
@@ -1634,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1634 return err; 1623 return err;
1635} 1624}
1636 1625
1637int audit_comparator(const u32 left, const u32 op, const u32 right) 1626int audit_comparator(u32 left, u32 op, u32 right)
1638{ 1627{
1639 switch (op) { 1628 switch (op) {
1640 case AUDIT_EQUAL: 1629 case Audit_equal:
1641 return (left == right); 1630 return (left == right);
1642 case AUDIT_NOT_EQUAL: 1631 case Audit_not_equal:
1643 return (left != right); 1632 return (left != right);
1644 case AUDIT_LESS_THAN: 1633 case Audit_lt:
1645 return (left < right); 1634 return (left < right);
1646 case AUDIT_LESS_THAN_OR_EQUAL: 1635 case Audit_le:
1647 return (left <= right); 1636 return (left <= right);
1648 case AUDIT_GREATER_THAN: 1637 case Audit_gt:
1649 return (left > right); 1638 return (left > right);
1650 case AUDIT_GREATER_THAN_OR_EQUAL: 1639 case Audit_ge:
1651 return (left >= right); 1640 return (left >= right);
1652 case AUDIT_BIT_MASK: 1641 case Audit_bitmask:
1653 return (left & right); 1642 return (left & right);
1654 case AUDIT_BIT_TEST: 1643 case Audit_bittest:
1655 return ((left & right) == right); 1644 return ((left & right) == right);
1645 default:
1646 BUG();
1647 return 0;
1656 } 1648 }
1657 BUG();
1658 return 0;
1659} 1649}
1660 1650
1661/* Compare given dentry name with last component in given path, 1651/* Compare given dentry name with last component in given path,
@@ -1778,6 +1768,43 @@ unlock_and_return:
1778 return result; 1768 return result;
1779} 1769}
1780 1770
1771static int update_lsm_rule(struct audit_krule *r)
1772{
1773 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1774 struct audit_entry *nentry;
1775 struct audit_watch *watch;
1776 struct audit_tree *tree;
1777 int err = 0;
1778
1779 if (!security_audit_rule_known(r))
1780 return 0;
1781
1782 watch = r->watch;
1783 tree = r->tree;
1784 nentry = audit_dupe_rule(r, watch);
1785 if (IS_ERR(nentry)) {
1786 /* save the first error encountered for the
1787 * return value */
1788 err = PTR_ERR(nentry);
1789 audit_panic("error updating LSM filters");
1790 if (watch)
1791 list_del(&r->rlist);
1792 list_del_rcu(&entry->list);
1793 list_del(&r->list);
1794 } else {
1795 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules);
1797 list_del(&r->rlist);
1798 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist);
1800 list_replace_rcu(&entry->list, &nentry->list);
1801 list_replace(&r->list, &nentry->rule.list);
1802 }
1803 call_rcu(&entry->rcu, audit_free_rule_rcu);
1804
1805 return err;
1806}
1807
1781/* This function will re-initialize the lsm_rule field of all applicable rules. 1808/* This function will re-initialize the lsm_rule field of all applicable rules.
1782 * It will traverse the filter lists serarching for rules that contain LSM 1809 * It will traverse the filter lists serarching for rules that contain LSM
1783 * specific filter fields. When such a rule is found, it is copied, the 1810 * specific filter fields. When such a rule is found, it is copied, the
@@ -1785,45 +1812,19 @@ unlock_and_return:
1785 * updated rule. */ 1812 * updated rule. */
1786int audit_update_lsm_rules(void) 1813int audit_update_lsm_rules(void)
1787{ 1814{
1788 struct audit_entry *entry, *n, *nentry; 1815 struct audit_krule *r, *n;
1789 struct audit_watch *watch;
1790 struct audit_tree *tree;
1791 int i, err = 0; 1816 int i, err = 0;
1792 1817
1793 /* audit_filter_mutex synchronizes the writers */ 1818 /* audit_filter_mutex synchronizes the writers */
1794 mutex_lock(&audit_filter_mutex); 1819 mutex_lock(&audit_filter_mutex);
1795 1820
1796 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1821 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
1797 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1822 list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
1798 if (!security_audit_rule_known(&entry->rule)) 1823 int res = update_lsm_rule(r);
1799 continue; 1824 if (!err)
1800 1825 err = res;
1801 watch = entry->rule.watch;
1802 tree = entry->rule.tree;
1803 nentry = audit_dupe_rule(&entry->rule, watch);
1804 if (IS_ERR(nentry)) {
1805 /* save the first error encountered for the
1806 * return value */
1807 if (!err)
1808 err = PTR_ERR(nentry);
1809 audit_panic("error updating LSM filters");
1810 if (watch)
1811 list_del(&entry->rule.rlist);
1812 list_del_rcu(&entry->list);
1813 } else {
1814 if (watch) {
1815 list_add(&nentry->rule.rlist,
1816 &watch->rules);
1817 list_del(&entry->rule.rlist);
1818 } else if (tree)
1819 list_replace_init(&entry->rule.rlist,
1820 &nentry->rule.rlist);
1821 list_replace_rcu(&entry->list, &nentry->list);
1822 }
1823 call_rcu(&entry->rcu, audit_free_rule_rcu);
1824 } 1826 }
1825 } 1827 }
1826
1827 mutex_unlock(&audit_filter_mutex); 1828 mutex_unlock(&audit_filter_mutex);
1828 1829
1829 return err; 1830 return err;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2a3f0afc4d2a..8cbddff6c283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -65,6 +65,7 @@
65#include <linux/highmem.h> 65#include <linux/highmem.h>
66#include <linux/syscalls.h> 66#include <linux/syscalls.h>
67#include <linux/inotify.h> 67#include <linux/inotify.h>
68#include <linux/capability.h>
68 69
69#include "audit.h" 70#include "audit.h"
70 71
@@ -84,6 +85,15 @@ int audit_n_rules;
84/* determines whether we collect data for signals sent */ 85/* determines whether we collect data for signals sent */
85int audit_signals; 86int audit_signals;
86 87
88struct audit_cap_data {
89 kernel_cap_t permitted;
90 kernel_cap_t inheritable;
91 union {
92 unsigned int fE; /* effective bit of a file capability */
93 kernel_cap_t effective; /* effective set of a process */
94 };
95};
96
87/* When fs/namei.c:getname() is called, we store the pointer in name and 97/* When fs/namei.c:getname() is called, we store the pointer in name and
88 * we don't let putname() free it (instead we free all of the saved 98 * we don't let putname() free it (instead we free all of the saved
89 * pointers at syscall exit time). 99 * pointers at syscall exit time).
@@ -100,6 +110,8 @@ struct audit_names {
100 gid_t gid; 110 gid_t gid;
101 dev_t rdev; 111 dev_t rdev;
102 u32 osid; 112 u32 osid;
113 struct audit_cap_data fcap;
114 unsigned int fcap_ver;
103}; 115};
104 116
105struct audit_aux_data { 117struct audit_aux_data {
@@ -112,43 +124,6 @@ struct audit_aux_data {
112/* Number of target pids per aux struct. */ 124/* Number of target pids per aux struct. */
113#define AUDIT_AUX_PIDS 16 125#define AUDIT_AUX_PIDS 16
114 126
115struct audit_aux_data_mq_open {
116 struct audit_aux_data d;
117 int oflag;
118 mode_t mode;
119 struct mq_attr attr;
120};
121
122struct audit_aux_data_mq_sendrecv {
123 struct audit_aux_data d;
124 mqd_t mqdes;
125 size_t msg_len;
126 unsigned int msg_prio;
127 struct timespec abs_timeout;
128};
129
130struct audit_aux_data_mq_notify {
131 struct audit_aux_data d;
132 mqd_t mqdes;
133 struct sigevent notification;
134};
135
136struct audit_aux_data_mq_getsetattr {
137 struct audit_aux_data d;
138 mqd_t mqdes;
139 struct mq_attr mqstat;
140};
141
142struct audit_aux_data_ipcctl {
143 struct audit_aux_data d;
144 struct ipc_perm p;
145 unsigned long qbytes;
146 uid_t uid;
147 gid_t gid;
148 mode_t mode;
149 u32 osid;
150};
151
152struct audit_aux_data_execve { 127struct audit_aux_data_execve {
153 struct audit_aux_data d; 128 struct audit_aux_data d;
154 int argc; 129 int argc;
@@ -156,23 +131,6 @@ struct audit_aux_data_execve {
156 struct mm_struct *mm; 131 struct mm_struct *mm;
157}; 132};
158 133
159struct audit_aux_data_socketcall {
160 struct audit_aux_data d;
161 int nargs;
162 unsigned long args[0];
163};
164
165struct audit_aux_data_sockaddr {
166 struct audit_aux_data d;
167 int len;
168 char a[0];
169};
170
171struct audit_aux_data_fd_pair {
172 struct audit_aux_data d;
173 int fd[2];
174};
175
176struct audit_aux_data_pids { 134struct audit_aux_data_pids {
177 struct audit_aux_data d; 135 struct audit_aux_data d;
178 pid_t target_pid[AUDIT_AUX_PIDS]; 136 pid_t target_pid[AUDIT_AUX_PIDS];
@@ -184,6 +142,20 @@ struct audit_aux_data_pids {
184 int pid_count; 142 int pid_count;
185}; 143};
186 144
145struct audit_aux_data_bprm_fcaps {
146 struct audit_aux_data d;
147 struct audit_cap_data fcap;
148 unsigned int fcap_ver;
149 struct audit_cap_data old_pcap;
150 struct audit_cap_data new_pcap;
151};
152
153struct audit_aux_data_capset {
154 struct audit_aux_data d;
155 pid_t pid;
156 struct audit_cap_data cap;
157};
158
187struct audit_tree_refs { 159struct audit_tree_refs {
188 struct audit_tree_refs *next; 160 struct audit_tree_refs *next;
189 struct audit_chunk *c[31]; 161 struct audit_chunk *c[31];
@@ -193,14 +165,14 @@ struct audit_tree_refs {
193struct audit_context { 165struct audit_context {
194 int dummy; /* must be the first element */ 166 int dummy; /* must be the first element */
195 int in_syscall; /* 1 if task is in a syscall */ 167 int in_syscall; /* 1 if task is in a syscall */
196 enum audit_state state; 168 enum audit_state state, current_state;
197 unsigned int serial; /* serial number for record */ 169 unsigned int serial; /* serial number for record */
198 struct timespec ctime; /* time of syscall entry */ 170 struct timespec ctime; /* time of syscall entry */
199 int major; /* syscall number */ 171 int major; /* syscall number */
200 unsigned long argv[4]; /* syscall arguments */ 172 unsigned long argv[4]; /* syscall arguments */
201 int return_valid; /* return code is valid */ 173 int return_valid; /* return code is valid */
202 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
203 int auditable; /* 1 if record should be written */ 175 u64 prio;
204 int name_count; 176 int name_count;
205 struct audit_names names[AUDIT_NAMES]; 177 struct audit_names names[AUDIT_NAMES];
206 char * filterkey; /* key for rule that triggered record */ 178 char * filterkey; /* key for rule that triggered record */
@@ -208,7 +180,8 @@ struct audit_context {
208 struct audit_context *previous; /* For nested syscalls */ 180 struct audit_context *previous; /* For nested syscalls */
209 struct audit_aux_data *aux; 181 struct audit_aux_data *aux;
210 struct audit_aux_data *aux_pids; 182 struct audit_aux_data *aux_pids;
211 183 struct sockaddr_storage *sockaddr;
184 size_t sockaddr_len;
212 /* Save things to print about task_struct */ 185 /* Save things to print about task_struct */
213 pid_t pid, ppid; 186 pid_t pid, ppid;
214 uid_t uid, euid, suid, fsuid; 187 uid_t uid, euid, suid, fsuid;
@@ -226,6 +199,49 @@ struct audit_context {
226 struct audit_tree_refs *trees, *first_trees; 199 struct audit_tree_refs *trees, *first_trees;
227 int tree_count; 200 int tree_count;
228 201
202 int type;
203 union {
204 struct {
205 int nargs;
206 long args[6];
207 } socketcall;
208 struct {
209 uid_t uid;
210 gid_t gid;
211 mode_t mode;
212 u32 osid;
213 int has_perm;
214 uid_t perm_uid;
215 gid_t perm_gid;
216 mode_t perm_mode;
217 unsigned long qbytes;
218 } ipc;
219 struct {
220 mqd_t mqdes;
221 struct mq_attr mqstat;
222 } mq_getsetattr;
223 struct {
224 mqd_t mqdes;
225 int sigev_signo;
226 } mq_notify;
227 struct {
228 mqd_t mqdes;
229 size_t msg_len;
230 unsigned int msg_prio;
231 struct timespec abs_timeout;
232 } mq_sendrecv;
233 struct {
234 int oflag;
235 mode_t mode;
236 struct mq_attr attr;
237 } mq_open;
238 struct {
239 pid_t pid;
240 struct audit_cap_data cap;
241 } capset;
242 };
243 int fds[2];
244
229#if AUDIT_DEBUG 245#if AUDIT_DEBUG
230 int put_count; 246 int put_count;
231 int ino_count; 247 int ino_count;
@@ -421,6 +437,7 @@ static int audit_filter_rules(struct task_struct *tsk,
421 struct audit_names *name, 437 struct audit_names *name,
422 enum audit_state *state) 438 enum audit_state *state)
423{ 439{
440 const struct cred *cred = get_task_cred(tsk);
424 int i, j, need_sid = 1; 441 int i, j, need_sid = 1;
425 u32 sid; 442 u32 sid;
426 443
@@ -440,28 +457,28 @@ static int audit_filter_rules(struct task_struct *tsk,
440 } 457 }
441 break; 458 break;
442 case AUDIT_UID: 459 case AUDIT_UID:
443 result = audit_comparator(tsk->uid, f->op, f->val); 460 result = audit_comparator(cred->uid, f->op, f->val);
444 break; 461 break;
445 case AUDIT_EUID: 462 case AUDIT_EUID:
446 result = audit_comparator(tsk->euid, f->op, f->val); 463 result = audit_comparator(cred->euid, f->op, f->val);
447 break; 464 break;
448 case AUDIT_SUID: 465 case AUDIT_SUID:
449 result = audit_comparator(tsk->suid, f->op, f->val); 466 result = audit_comparator(cred->suid, f->op, f->val);
450 break; 467 break;
451 case AUDIT_FSUID: 468 case AUDIT_FSUID:
452 result = audit_comparator(tsk->fsuid, f->op, f->val); 469 result = audit_comparator(cred->fsuid, f->op, f->val);
453 break; 470 break;
454 case AUDIT_GID: 471 case AUDIT_GID:
455 result = audit_comparator(tsk->gid, f->op, f->val); 472 result = audit_comparator(cred->gid, f->op, f->val);
456 break; 473 break;
457 case AUDIT_EGID: 474 case AUDIT_EGID:
458 result = audit_comparator(tsk->egid, f->op, f->val); 475 result = audit_comparator(cred->egid, f->op, f->val);
459 break; 476 break;
460 case AUDIT_SGID: 477 case AUDIT_SGID:
461 result = audit_comparator(tsk->sgid, f->op, f->val); 478 result = audit_comparator(cred->sgid, f->op, f->val);
462 break; 479 break;
463 case AUDIT_FSGID: 480 case AUDIT_FSGID:
464 result = audit_comparator(tsk->fsgid, f->op, f->val); 481 result = audit_comparator(cred->fsgid, f->op, f->val);
465 break; 482 break;
466 case AUDIT_PERS: 483 case AUDIT_PERS:
467 result = audit_comparator(tsk->personality, f->op, f->val); 484 result = audit_comparator(tsk->personality, f->op, f->val);
@@ -581,19 +598,12 @@ static int audit_filter_rules(struct task_struct *tsk,
581 } 598 }
582 } 599 }
583 /* Find ipc objects that match */ 600 /* Find ipc objects that match */
584 if (ctx) { 601 if (!ctx || ctx->type != AUDIT_IPC)
585 struct audit_aux_data *aux; 602 break;
586 for (aux = ctx->aux; aux; 603 if (security_audit_rule_match(ctx->ipc.osid,
587 aux = aux->next) { 604 f->type, f->op,
588 if (aux->type == AUDIT_IPC) { 605 f->lsm_rule, ctx))
589 struct audit_aux_data_ipcctl *axi = (void *)aux; 606 ++result;
590 if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
591 ++result;
592 break;
593 }
594 }
595 }
596 }
597 } 607 }
598 break; 608 break;
599 case AUDIT_ARG0: 609 case AUDIT_ARG0:
@@ -615,15 +625,26 @@ static int audit_filter_rules(struct task_struct *tsk,
615 break; 625 break;
616 } 626 }
617 627
618 if (!result) 628 if (!result) {
629 put_cred(cred);
619 return 0; 630 return 0;
631 }
632 }
633
634 if (ctx) {
635 if (rule->prio <= ctx->prio)
636 return 0;
637 if (rule->filterkey) {
638 kfree(ctx->filterkey);
639 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
640 }
641 ctx->prio = rule->prio;
620 } 642 }
621 if (rule->filterkey && ctx)
622 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
623 switch (rule->action) { 643 switch (rule->action) {
624 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 644 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
625 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 645 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
626 } 646 }
647 put_cred(cred);
627 return 1; 648 return 1;
628} 649}
629 650
@@ -631,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk,
631 * completely disabled for this task. Since we only have the task 652 * completely disabled for this task. Since we only have the task
632 * structure at this point, we can only check uid and gid. 653 * structure at this point, we can only check uid and gid.
633 */ 654 */
634static enum audit_state audit_filter_task(struct task_struct *tsk) 655static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
635{ 656{
636 struct audit_entry *e; 657 struct audit_entry *e;
637 enum audit_state state; 658 enum audit_state state;
@@ -639,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
639 rcu_read_lock(); 660 rcu_read_lock();
640 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 661 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
641 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 662 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
663 if (state == AUDIT_RECORD_CONTEXT)
664 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
642 rcu_read_unlock(); 665 rcu_read_unlock();
643 return state; 666 return state;
644 } 667 }
@@ -672,6 +695,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
672 audit_filter_rules(tsk, &e->rule, ctx, NULL, 695 audit_filter_rules(tsk, &e->rule, ctx, NULL,
673 &state)) { 696 &state)) {
674 rcu_read_unlock(); 697 rcu_read_unlock();
698 ctx->current_state = state;
675 return state; 699 return state;
676 } 700 }
677 } 701 }
@@ -685,15 +709,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
685 * buckets applicable to the inode numbers in audit_names[]. 709 * buckets applicable to the inode numbers in audit_names[].
686 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 710 * Regarding audit_state, same rules apply as for audit_filter_syscall().
687 */ 711 */
688enum audit_state audit_filter_inodes(struct task_struct *tsk, 712void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
689 struct audit_context *ctx)
690{ 713{
691 int i; 714 int i;
692 struct audit_entry *e; 715 struct audit_entry *e;
693 enum audit_state state; 716 enum audit_state state;
694 717
695 if (audit_pid && tsk->tgid == audit_pid) 718 if (audit_pid && tsk->tgid == audit_pid)
696 return AUDIT_DISABLED; 719 return;
697 720
698 rcu_read_lock(); 721 rcu_read_lock();
699 for (i = 0; i < ctx->name_count; i++) { 722 for (i = 0; i < ctx->name_count; i++) {
@@ -710,17 +733,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk,
710 if ((e->rule.mask[word] & bit) == bit && 733 if ((e->rule.mask[word] & bit) == bit &&
711 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 734 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
712 rcu_read_unlock(); 735 rcu_read_unlock();
713 return state; 736 ctx->current_state = state;
737 return;
714 } 738 }
715 } 739 }
716 } 740 }
717 rcu_read_unlock(); 741 rcu_read_unlock();
718 return AUDIT_BUILD_CONTEXT;
719} 742}
720 743
721void audit_set_auditable(struct audit_context *ctx) 744static void audit_set_auditable(struct audit_context *ctx)
722{ 745{
723 ctx->auditable = 1; 746 if (!ctx->prio) {
747 ctx->prio = 1;
748 ctx->current_state = AUDIT_RECORD_CONTEXT;
749 }
724} 750}
725 751
726static inline struct audit_context *audit_get_context(struct task_struct *tsk, 752static inline struct audit_context *audit_get_context(struct task_struct *tsk,
@@ -751,23 +777,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
751 else 777 else
752 context->return_code = return_code; 778 context->return_code = return_code;
753 779
754 if (context->in_syscall && !context->dummy && !context->auditable) { 780 if (context->in_syscall && !context->dummy) {
755 enum audit_state state; 781 audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
756 782 audit_filter_inodes(tsk, context);
757 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
758 if (state == AUDIT_RECORD_CONTEXT) {
759 context->auditable = 1;
760 goto get_context;
761 }
762
763 state = audit_filter_inodes(tsk, context);
764 if (state == AUDIT_RECORD_CONTEXT)
765 context->auditable = 1;
766
767 } 783 }
768 784
769get_context:
770
771 tsk->audit_context = NULL; 785 tsk->audit_context = NULL;
772 return context; 786 return context;
773} 787}
@@ -777,8 +791,7 @@ static inline void audit_free_names(struct audit_context *context)
777 int i; 791 int i;
778 792
779#if AUDIT_DEBUG == 2 793#if AUDIT_DEBUG == 2
780 if (context->auditable 794 if (context->put_count + context->ino_count != context->name_count) {
781 ||context->put_count + context->ino_count != context->name_count) {
782 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" 795 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
783 " name_count=%d put_count=%d" 796 " name_count=%d put_count=%d"
784 " ino_count=%d [NOT freeing]\n", 797 " ino_count=%d [NOT freeing]\n",
@@ -829,6 +842,7 @@ static inline void audit_zero_context(struct audit_context *context,
829{ 842{
830 memset(context, 0, sizeof(*context)); 843 memset(context, 0, sizeof(*context));
831 context->state = state; 844 context->state = state;
845 context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
832} 846}
833 847
834static inline struct audit_context *audit_alloc_context(enum audit_state state) 848static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -854,18 +868,21 @@ int audit_alloc(struct task_struct *tsk)
854{ 868{
855 struct audit_context *context; 869 struct audit_context *context;
856 enum audit_state state; 870 enum audit_state state;
871 char *key = NULL;
857 872
858 if (likely(!audit_ever_enabled)) 873 if (likely(!audit_ever_enabled))
859 return 0; /* Return if not auditing. */ 874 return 0; /* Return if not auditing. */
860 875
861 state = audit_filter_task(tsk); 876 state = audit_filter_task(tsk, &key);
862 if (likely(state == AUDIT_DISABLED)) 877 if (likely(state == AUDIT_DISABLED))
863 return 0; 878 return 0;
864 879
865 if (!(context = audit_alloc_context(state))) { 880 if (!(context = audit_alloc_context(state))) {
881 kfree(key);
866 audit_log_lost("out of memory in audit_alloc"); 882 audit_log_lost("out of memory in audit_alloc");
867 return -ENOMEM; 883 return -ENOMEM;
868 } 884 }
885 context->filterkey = key;
869 886
870 tsk->audit_context = context; 887 tsk->audit_context = context;
871 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); 888 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
@@ -891,6 +908,7 @@ static inline void audit_free_context(struct audit_context *context)
891 free_tree_refs(context); 908 free_tree_refs(context);
892 audit_free_aux(context); 909 audit_free_aux(context);
893 kfree(context->filterkey); 910 kfree(context->filterkey);
911 kfree(context->sockaddr);
894 kfree(context); 912 kfree(context);
895 context = previous; 913 context = previous;
896 } while (context); 914 } while (context);
@@ -1171,8 +1189,129 @@ static void audit_log_execve_info(struct audit_context *context,
1171 kfree(buf); 1189 kfree(buf);
1172} 1190}
1173 1191
1192static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1193{
1194 int i;
1195
1196 audit_log_format(ab, " %s=", prefix);
1197 CAP_FOR_EACH_U32(i) {
1198 audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
1199 }
1200}
1201
1202static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1203{
1204 kernel_cap_t *perm = &name->fcap.permitted;
1205 kernel_cap_t *inh = &name->fcap.inheritable;
1206 int log = 0;
1207
1208 if (!cap_isclear(*perm)) {
1209 audit_log_cap(ab, "cap_fp", perm);
1210 log = 1;
1211 }
1212 if (!cap_isclear(*inh)) {
1213 audit_log_cap(ab, "cap_fi", inh);
1214 log = 1;
1215 }
1216
1217 if (log)
1218 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1219}
1220
1221static void show_special(struct audit_context *context, int *call_panic)
1222{
1223 struct audit_buffer *ab;
1224 int i;
1225
1226 ab = audit_log_start(context, GFP_KERNEL, context->type);
1227 if (!ab)
1228 return;
1229
1230 switch (context->type) {
1231 case AUDIT_SOCKETCALL: {
1232 int nargs = context->socketcall.nargs;
1233 audit_log_format(ab, "nargs=%d", nargs);
1234 for (i = 0; i < nargs; i++)
1235 audit_log_format(ab, " a%d=%lx", i,
1236 context->socketcall.args[i]);
1237 break; }
1238 case AUDIT_IPC: {
1239 u32 osid = context->ipc.osid;
1240
1241 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
1242 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1243 if (osid) {
1244 char *ctx = NULL;
1245 u32 len;
1246 if (security_secid_to_secctx(osid, &ctx, &len)) {
1247 audit_log_format(ab, " osid=%u", osid);
1248 *call_panic = 1;
1249 } else {
1250 audit_log_format(ab, " obj=%s", ctx);
1251 security_release_secctx(ctx, len);
1252 }
1253 }
1254 if (context->ipc.has_perm) {
1255 audit_log_end(ab);
1256 ab = audit_log_start(context, GFP_KERNEL,
1257 AUDIT_IPC_SET_PERM);
1258 audit_log_format(ab,
1259 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1260 context->ipc.qbytes,
1261 context->ipc.perm_uid,
1262 context->ipc.perm_gid,
1263 context->ipc.perm_mode);
1264 if (!ab)
1265 return;
1266 }
1267 break; }
1268 case AUDIT_MQ_OPEN: {
1269 audit_log_format(ab,
1270 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1271 "mq_msgsize=%ld mq_curmsgs=%ld",
1272 context->mq_open.oflag, context->mq_open.mode,
1273 context->mq_open.attr.mq_flags,
1274 context->mq_open.attr.mq_maxmsg,
1275 context->mq_open.attr.mq_msgsize,
1276 context->mq_open.attr.mq_curmsgs);
1277 break; }
1278 case AUDIT_MQ_SENDRECV: {
1279 audit_log_format(ab,
1280 "mqdes=%d msg_len=%zd msg_prio=%u "
1281 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1282 context->mq_sendrecv.mqdes,
1283 context->mq_sendrecv.msg_len,
1284 context->mq_sendrecv.msg_prio,
1285 context->mq_sendrecv.abs_timeout.tv_sec,
1286 context->mq_sendrecv.abs_timeout.tv_nsec);
1287 break; }
1288 case AUDIT_MQ_NOTIFY: {
1289 audit_log_format(ab, "mqdes=%d sigev_signo=%d",
1290 context->mq_notify.mqdes,
1291 context->mq_notify.sigev_signo);
1292 break; }
1293 case AUDIT_MQ_GETSETATTR: {
1294 struct mq_attr *attr = &context->mq_getsetattr.mqstat;
1295 audit_log_format(ab,
1296 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1297 "mq_curmsgs=%ld ",
1298 context->mq_getsetattr.mqdes,
1299 attr->mq_flags, attr->mq_maxmsg,
1300 attr->mq_msgsize, attr->mq_curmsgs);
1301 break; }
1302 case AUDIT_CAPSET: {
1303 audit_log_format(ab, "pid=%d", context->capset.pid);
1304 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; }
1308 }
1309 audit_log_end(ab);
1310}
1311
1174static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1312static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1175{ 1313{
1314 const struct cred *cred;
1176 int i, call_panic = 0; 1315 int i, call_panic = 0;
1177 struct audit_buffer *ab; 1316 struct audit_buffer *ab;
1178 struct audit_aux_data *aux; 1317 struct audit_aux_data *aux;
@@ -1182,14 +1321,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1182 context->pid = tsk->pid; 1321 context->pid = tsk->pid;
1183 if (!context->ppid) 1322 if (!context->ppid)
1184 context->ppid = sys_getppid(); 1323 context->ppid = sys_getppid();
1185 context->uid = tsk->uid; 1324 cred = current_cred();
1186 context->gid = tsk->gid; 1325 context->uid = cred->uid;
1187 context->euid = tsk->euid; 1326 context->gid = cred->gid;
1188 context->suid = tsk->suid; 1327 context->euid = cred->euid;
1189 context->fsuid = tsk->fsuid; 1328 context->suid = cred->suid;
1190 context->egid = tsk->egid; 1329 context->fsuid = cred->fsuid;
1191 context->sgid = tsk->sgid; 1330 context->egid = cred->egid;
1192 context->fsgid = tsk->fsgid; 1331 context->sgid = cred->sgid;
1332 context->fsgid = cred->fsgid;
1193 context->personality = tsk->personality; 1333 context->personality = tsk->personality;
1194 1334
1195 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 1335 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
@@ -1246,96 +1386,50 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1246 continue; /* audit_panic has been called */ 1386 continue; /* audit_panic has been called */
1247 1387
1248 switch (aux->type) { 1388 switch (aux->type) {
1249 case AUDIT_MQ_OPEN: {
1250 struct audit_aux_data_mq_open *axi = (void *)aux;
1251 audit_log_format(ab,
1252 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1253 "mq_msgsize=%ld mq_curmsgs=%ld",
1254 axi->oflag, axi->mode, axi->attr.mq_flags,
1255 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
1256 axi->attr.mq_curmsgs);
1257 break; }
1258
1259 case AUDIT_MQ_SENDRECV: {
1260 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
1261 audit_log_format(ab,
1262 "mqdes=%d msg_len=%zd msg_prio=%u "
1263 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1264 axi->mqdes, axi->msg_len, axi->msg_prio,
1265 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
1266 break; }
1267
1268 case AUDIT_MQ_NOTIFY: {
1269 struct audit_aux_data_mq_notify *axi = (void *)aux;
1270 audit_log_format(ab,
1271 "mqdes=%d sigev_signo=%d",
1272 axi->mqdes,
1273 axi->notification.sigev_signo);
1274 break; }
1275
1276 case AUDIT_MQ_GETSETATTR: {
1277 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
1278 audit_log_format(ab,
1279 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1280 "mq_curmsgs=%ld ",
1281 axi->mqdes,
1282 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
1283 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
1284 break; }
1285
1286 case AUDIT_IPC: {
1287 struct audit_aux_data_ipcctl *axi = (void *)aux;
1288 audit_log_format(ab,
1289 "ouid=%u ogid=%u mode=%#o",
1290 axi->uid, axi->gid, axi->mode);
1291 if (axi->osid != 0) {
1292 char *ctx = NULL;
1293 u32 len;
1294 if (security_secid_to_secctx(
1295 axi->osid, &ctx, &len)) {
1296 audit_log_format(ab, " osid=%u",
1297 axi->osid);
1298 call_panic = 1;
1299 } else {
1300 audit_log_format(ab, " obj=%s", ctx);
1301 security_release_secctx(ctx, len);
1302 }
1303 }
1304 break; }
1305
1306 case AUDIT_IPC_SET_PERM: {
1307 struct audit_aux_data_ipcctl *axi = (void *)aux;
1308 audit_log_format(ab,
1309 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1310 axi->qbytes, axi->uid, axi->gid, axi->mode);
1311 break; }
1312 1389
1313 case AUDIT_EXECVE: { 1390 case AUDIT_EXECVE: {
1314 struct audit_aux_data_execve *axi = (void *)aux; 1391 struct audit_aux_data_execve *axi = (void *)aux;
1315 audit_log_execve_info(context, &ab, axi); 1392 audit_log_execve_info(context, &ab, axi);
1316 break; } 1393 break; }
1317 1394
1318 case AUDIT_SOCKETCALL: { 1395 case AUDIT_BPRM_FCAPS: {
1319 struct audit_aux_data_socketcall *axs = (void *)aux; 1396 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1320 audit_log_format(ab, "nargs=%d", axs->nargs); 1397 audit_log_format(ab, "fver=%x", axs->fcap_ver);
1321 for (i=0; i<axs->nargs; i++) 1398 audit_log_cap(ab, "fp", &axs->fcap.permitted);
1322 audit_log_format(ab, " a%d=%lx", i, axs->args[i]); 1399 audit_log_cap(ab, "fi", &axs->fcap.inheritable);
1400 audit_log_format(ab, " fe=%d", axs->fcap.fE);
1401 audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
1402 audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
1403 audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
1404 audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
1405 audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
1406 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1323 break; } 1407 break; }
1324 1408
1325 case AUDIT_SOCKADDR: { 1409 }
1326 struct audit_aux_data_sockaddr *axs = (void *)aux; 1410 audit_log_end(ab);
1411 }
1327 1412
1328 audit_log_format(ab, "saddr="); 1413 if (context->type)
1329 audit_log_n_hex(ab, axs->a, axs->len); 1414 show_special(context, &call_panic);
1330 break; }
1331 1415
1332 case AUDIT_FD_PAIR: { 1416 if (context->fds[0] >= 0) {
1333 struct audit_aux_data_fd_pair *axs = (void *)aux; 1417 ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
1334 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]); 1418 if (ab) {
1335 break; } 1419 audit_log_format(ab, "fd0=%d fd1=%d",
1420 context->fds[0], context->fds[1]);
1421 audit_log_end(ab);
1422 }
1423 }
1336 1424
1425 if (context->sockaddr_len) {
1426 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
1427 if (ab) {
1428 audit_log_format(ab, "saddr=");
1429 audit_log_n_hex(ab, (void *)context->sockaddr,
1430 context->sockaddr_len);
1431 audit_log_end(ab);
1337 } 1432 }
1338 audit_log_end(ab);
1339 } 1433 }
1340 1434
1341 for (aux = context->aux_pids; aux; aux = aux->next) { 1435 for (aux = context->aux_pids; aux; aux = aux->next) {
@@ -1421,6 +1515,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1421 } 1515 }
1422 } 1516 }
1423 1517
1518 audit_log_fcaps(ab, n);
1519
1424 audit_log_end(ab); 1520 audit_log_end(ab);
1425 } 1521 }
1426 1522
@@ -1451,7 +1547,7 @@ void audit_free(struct task_struct *tsk)
1451 * We use GFP_ATOMIC here because we might be doing this 1547 * We use GFP_ATOMIC here because we might be doing this
1452 * in the context of the idle thread */ 1548 * in the context of the idle thread */
1453 /* that can happen only if we are called from do_exit() */ 1549 /* that can happen only if we are called from do_exit() */
1454 if (context->in_syscall && context->auditable) 1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1455 audit_log_exit(context, tsk); 1551 audit_log_exit(context, tsk);
1456 1552
1457 audit_free_context(context); 1553 audit_free_context(context);
@@ -1535,15 +1631,17 @@ void audit_syscall_entry(int arch, int major,
1535 1631
1536 state = context->state; 1632 state = context->state;
1537 context->dummy = !audit_n_rules; 1633 context->dummy = !audit_n_rules;
1538 if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) 1634 if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
1635 context->prio = 0;
1539 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1636 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1637 }
1540 if (likely(state == AUDIT_DISABLED)) 1638 if (likely(state == AUDIT_DISABLED))
1541 return; 1639 return;
1542 1640
1543 context->serial = 0; 1641 context->serial = 0;
1544 context->ctime = CURRENT_TIME; 1642 context->ctime = CURRENT_TIME;
1545 context->in_syscall = 1; 1643 context->in_syscall = 1;
1546 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 1644 context->current_state = state;
1547 context->ppid = 0; 1645 context->ppid = 0;
1548} 1646}
1549 1647
@@ -1551,17 +1649,20 @@ void audit_finish_fork(struct task_struct *child)
1551{ 1649{
1552 struct audit_context *ctx = current->audit_context; 1650 struct audit_context *ctx = current->audit_context;
1553 struct audit_context *p = child->audit_context; 1651 struct audit_context *p = child->audit_context;
1554 if (!p || !ctx || !ctx->auditable) 1652 if (!p || !ctx)
1653 return;
1654 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1555 return; 1655 return;
1556 p->arch = ctx->arch; 1656 p->arch = ctx->arch;
1557 p->major = ctx->major; 1657 p->major = ctx->major;
1558 memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); 1658 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1559 p->ctime = ctx->ctime; 1659 p->ctime = ctx->ctime;
1560 p->dummy = ctx->dummy; 1660 p->dummy = ctx->dummy;
1561 p->auditable = ctx->auditable;
1562 p->in_syscall = ctx->in_syscall; 1661 p->in_syscall = ctx->in_syscall;
1563 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); 1662 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1564 p->ppid = current->pid; 1663 p->ppid = current->pid;
1664 p->prio = ctx->prio;
1665 p->current_state = ctx->current_state;
1565} 1666}
1566 1667
1567/** 1668/**
@@ -1585,11 +1686,11 @@ void audit_syscall_exit(int valid, long return_code)
1585 if (likely(!context)) 1686 if (likely(!context))
1586 return; 1687 return;
1587 1688
1588 if (context->in_syscall && context->auditable) 1689 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1589 audit_log_exit(context, tsk); 1690 audit_log_exit(context, tsk);
1590 1691
1591 context->in_syscall = 0; 1692 context->in_syscall = 0;
1592 context->auditable = 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1593 1694
1594 if (context->previous) { 1695 if (context->previous) {
1595 struct audit_context *new_context = context->previous; 1696 struct audit_context *new_context = context->previous;
@@ -1604,8 +1705,13 @@ void audit_syscall_exit(int valid, long return_code)
1604 context->aux_pids = NULL; 1705 context->aux_pids = NULL;
1605 context->target_pid = 0; 1706 context->target_pid = 0;
1606 context->target_sid = 0; 1707 context->target_sid = 0;
1607 kfree(context->filterkey); 1708 context->sockaddr_len = 0;
1608 context->filterkey = NULL; 1709 context->type = 0;
1710 context->fds[0] = -1;
1711 if (context->state != AUDIT_RECORD_CONTEXT) {
1712 kfree(context->filterkey);
1713 context->filterkey = NULL;
1714 }
1609 tsk->audit_context = context; 1715 tsk->audit_context = context;
1610 } 1716 }
1611} 1717}
@@ -1802,8 +1908,36 @@ static int audit_inc_name_count(struct audit_context *context,
1802 return 0; 1908 return 0;
1803} 1909}
1804 1910
1911
1912static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
1913{
1914 struct cpu_vfs_cap_data caps;
1915 int rc;
1916
1917 memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
1918 memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
1919 name->fcap.fE = 0;
1920 name->fcap_ver = 0;
1921
1922 if (!dentry)
1923 return 0;
1924
1925 rc = get_vfs_caps_from_disk(dentry, &caps);
1926 if (rc)
1927 return rc;
1928
1929 name->fcap.permitted = caps.permitted;
1930 name->fcap.inheritable = caps.inheritable;
1931 name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
1932 name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
1933
1934 return 0;
1935}
1936
1937
1805/* Copy inode data into an audit_names. */ 1938/* Copy inode data into an audit_names. */
1806static void audit_copy_inode(struct audit_names *name, const struct inode *inode) 1939static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
1940 const struct inode *inode)
1807{ 1941{
1808 name->ino = inode->i_ino; 1942 name->ino = inode->i_ino;
1809 name->dev = inode->i_sb->s_dev; 1943 name->dev = inode->i_sb->s_dev;
@@ -1812,6 +1946,7 @@ static void audit_copy_inode(struct audit_names *name, const struct inode *inode
1812 name->gid = inode->i_gid; 1946 name->gid = inode->i_gid;
1813 name->rdev = inode->i_rdev; 1947 name->rdev = inode->i_rdev;
1814 security_inode_getsecid(inode, &name->osid); 1948 security_inode_getsecid(inode, &name->osid);
1949 audit_copy_fcaps(name, dentry);
1815} 1950}
1816 1951
1817/** 1952/**
@@ -1846,7 +1981,7 @@ void __audit_inode(const char *name, const struct dentry *dentry)
1846 context->names[idx].name = NULL; 1981 context->names[idx].name = NULL;
1847 } 1982 }
1848 handle_path(dentry); 1983 handle_path(dentry);
1849 audit_copy_inode(&context->names[idx], inode); 1984 audit_copy_inode(&context->names[idx], dentry, inode);
1850} 1985}
1851 1986
1852/** 1987/**
@@ -1907,7 +2042,7 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry,
1907 if (!strcmp(dname, n->name) || 2042 if (!strcmp(dname, n->name) ||
1908 !audit_compare_dname_path(dname, n->name, &dirlen)) { 2043 !audit_compare_dname_path(dname, n->name, &dirlen)) {
1909 if (inode) 2044 if (inode)
1910 audit_copy_inode(n, inode); 2045 audit_copy_inode(n, NULL, inode);
1911 else 2046 else
1912 n->ino = (unsigned long)-1; 2047 n->ino = (unsigned long)-1;
1913 found_child = n->name; 2048 found_child = n->name;
@@ -1921,7 +2056,7 @@ add_names:
1921 return; 2056 return;
1922 idx = context->name_count - 1; 2057 idx = context->name_count - 1;
1923 context->names[idx].name = NULL; 2058 context->names[idx].name = NULL;
1924 audit_copy_inode(&context->names[idx], parent); 2059 audit_copy_inode(&context->names[idx], NULL, parent);
1925 } 2060 }
1926 2061
1927 if (!found_child) { 2062 if (!found_child) {
@@ -1942,7 +2077,7 @@ add_names:
1942 } 2077 }
1943 2078
1944 if (inode) 2079 if (inode)
1945 audit_copy_inode(&context->names[idx], inode); 2080 audit_copy_inode(&context->names[idx], NULL, inode);
1946 else 2081 else
1947 context->names[idx].ino = (unsigned long)-1; 2082 context->names[idx].ino = (unsigned long)-1;
1948 } 2083 }
@@ -1967,7 +2102,10 @@ int auditsc_get_stamp(struct audit_context *ctx,
1967 t->tv_sec = ctx->ctime.tv_sec; 2102 t->tv_sec = ctx->ctime.tv_sec;
1968 t->tv_nsec = ctx->ctime.tv_nsec; 2103 t->tv_nsec = ctx->ctime.tv_nsec;
1969 *serial = ctx->serial; 2104 *serial = ctx->serial;
1970 ctx->auditable = 1; 2105 if (!ctx->prio) {
2106 ctx->prio = 1;
2107 ctx->current_state = AUDIT_RECORD_CONTEXT;
2108 }
1971 return 1; 2109 return 1;
1972} 2110}
1973 2111
@@ -1996,7 +2134,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1996 audit_log_format(ab, "login pid=%d uid=%u " 2134 audit_log_format(ab, "login pid=%d uid=%u "
1997 "old auid=%u new auid=%u" 2135 "old auid=%u new auid=%u"
1998 " old ses=%u new ses=%u", 2136 " old ses=%u new ses=%u",
1999 task->pid, task->uid, 2137 task->pid, task_uid(task),
2000 task->loginuid, loginuid, 2138 task->loginuid, loginuid,
2001 task->sessionid, sessionid); 2139 task->sessionid, sessionid);
2002 audit_log_end(ab); 2140 audit_log_end(ab);
@@ -2013,132 +2151,46 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2013 * @mode: mode bits 2151 * @mode: mode bits
2014 * @u_attr: queue attributes 2152 * @u_attr: queue attributes
2015 * 2153 *
2016 * Returns 0 for success or NULL context or < 0 on error.
2017 */ 2154 */
2018int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) 2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
2019{ 2156{
2020 struct audit_aux_data_mq_open *ax;
2021 struct audit_context *context = current->audit_context; 2157 struct audit_context *context = current->audit_context;
2022 2158
2023 if (!audit_enabled) 2159 if (attr)
2024 return 0; 2160 memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
2025 2161 else
2026 if (likely(!context)) 2162 memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
2027 return 0;
2028
2029 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2030 if (!ax)
2031 return -ENOMEM;
2032
2033 if (u_attr != NULL) {
2034 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
2035 kfree(ax);
2036 return -EFAULT;
2037 }
2038 } else
2039 memset(&ax->attr, 0, sizeof(ax->attr));
2040 2163
2041 ax->oflag = oflag; 2164 context->mq_open.oflag = oflag;
2042 ax->mode = mode; 2165 context->mq_open.mode = mode;
2043 2166
2044 ax->d.type = AUDIT_MQ_OPEN; 2167 context->type = AUDIT_MQ_OPEN;
2045 ax->d.next = context->aux;
2046 context->aux = (void *)ax;
2047 return 0;
2048} 2168}
2049 2169
2050/** 2170/**
2051 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send 2171 * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
2052 * @mqdes: MQ descriptor 2172 * @mqdes: MQ descriptor
2053 * @msg_len: Message length 2173 * @msg_len: Message length
2054 * @msg_prio: Message priority 2174 * @msg_prio: Message priority
2055 * @u_abs_timeout: Message timeout in absolute time 2175 * @abs_timeout: Message timeout in absolute time
2056 * 2176 *
2057 * Returns 0 for success or NULL context or < 0 on error.
2058 */ 2177 */
2059int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, 2178void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
2060 const struct timespec __user *u_abs_timeout) 2179 const struct timespec *abs_timeout)
2061{ 2180{
2062 struct audit_aux_data_mq_sendrecv *ax;
2063 struct audit_context *context = current->audit_context; 2181 struct audit_context *context = current->audit_context;
2182 struct timespec *p = &context->mq_sendrecv.abs_timeout;
2064 2183
2065 if (!audit_enabled) 2184 if (abs_timeout)
2066 return 0; 2185 memcpy(p, abs_timeout, sizeof(struct timespec));
2067 2186 else
2068 if (likely(!context)) 2187 memset(p, 0, sizeof(struct timespec));
2069 return 0;
2070
2071 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2072 if (!ax)
2073 return -ENOMEM;
2074
2075 if (u_abs_timeout != NULL) {
2076 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2077 kfree(ax);
2078 return -EFAULT;
2079 }
2080 } else
2081 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2082
2083 ax->mqdes = mqdes;
2084 ax->msg_len = msg_len;
2085 ax->msg_prio = msg_prio;
2086
2087 ax->d.type = AUDIT_MQ_SENDRECV;
2088 ax->d.next = context->aux;
2089 context->aux = (void *)ax;
2090 return 0;
2091}
2092
2093/**
2094 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
2095 * @mqdes: MQ descriptor
2096 * @msg_len: Message length
2097 * @u_msg_prio: Message priority
2098 * @u_abs_timeout: Message timeout in absolute time
2099 *
2100 * Returns 0 for success or NULL context or < 0 on error.
2101 */
2102int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2103 unsigned int __user *u_msg_prio,
2104 const struct timespec __user *u_abs_timeout)
2105{
2106 struct audit_aux_data_mq_sendrecv *ax;
2107 struct audit_context *context = current->audit_context;
2108
2109 if (!audit_enabled)
2110 return 0;
2111
2112 if (likely(!context))
2113 return 0;
2114
2115 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2116 if (!ax)
2117 return -ENOMEM;
2118
2119 if (u_msg_prio != NULL) {
2120 if (get_user(ax->msg_prio, u_msg_prio)) {
2121 kfree(ax);
2122 return -EFAULT;
2123 }
2124 } else
2125 ax->msg_prio = 0;
2126
2127 if (u_abs_timeout != NULL) {
2128 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2129 kfree(ax);
2130 return -EFAULT;
2131 }
2132 } else
2133 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2134 2188
2135 ax->mqdes = mqdes; 2189 context->mq_sendrecv.mqdes = mqdes;
2136 ax->msg_len = msg_len; 2190 context->mq_sendrecv.msg_len = msg_len;
2191 context->mq_sendrecv.msg_prio = msg_prio;
2137 2192
2138 ax->d.type = AUDIT_MQ_SENDRECV; 2193 context->type = AUDIT_MQ_SENDRECV;
2139 ax->d.next = context->aux;
2140 context->aux = (void *)ax;
2141 return 0;
2142} 2194}
2143 2195
2144/** 2196/**
@@ -2146,38 +2198,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2146 * @mqdes: MQ descriptor 2198 * @mqdes: MQ descriptor
2147 * @u_notification: Notification event 2199 * @u_notification: Notification event
2148 * 2200 *
2149 * Returns 0 for success or NULL context or < 0 on error.
2150 */ 2201 */
2151 2202
2152int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) 2203void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
2153{ 2204{
2154 struct audit_aux_data_mq_notify *ax;
2155 struct audit_context *context = current->audit_context; 2205 struct audit_context *context = current->audit_context;
2156 2206
2157 if (!audit_enabled) 2207 if (notification)
2158 return 0; 2208 context->mq_notify.sigev_signo = notification->sigev_signo;
2159 2209 else
2160 if (likely(!context)) 2210 context->mq_notify.sigev_signo = 0;
2161 return 0;
2162
2163 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2164 if (!ax)
2165 return -ENOMEM;
2166
2167 if (u_notification != NULL) {
2168 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
2169 kfree(ax);
2170 return -EFAULT;
2171 }
2172 } else
2173 memset(&ax->notification, 0, sizeof(ax->notification));
2174
2175 ax->mqdes = mqdes;
2176 2211
2177 ax->d.type = AUDIT_MQ_NOTIFY; 2212 context->mq_notify.mqdes = mqdes;
2178 ax->d.next = context->aux; 2213 context->type = AUDIT_MQ_NOTIFY;
2179 context->aux = (void *)ax;
2180 return 0;
2181} 2214}
2182 2215
2183/** 2216/**
@@ -2185,55 +2218,29 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
2185 * @mqdes: MQ descriptor 2218 * @mqdes: MQ descriptor
2186 * @mqstat: MQ flags 2219 * @mqstat: MQ flags
2187 * 2220 *
2188 * Returns 0 for success or NULL context or < 0 on error.
2189 */ 2221 */
2190int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) 2222void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
2191{ 2223{
2192 struct audit_aux_data_mq_getsetattr *ax;
2193 struct audit_context *context = current->audit_context; 2224 struct audit_context *context = current->audit_context;
2194 2225 context->mq_getsetattr.mqdes = mqdes;
2195 if (!audit_enabled) 2226 context->mq_getsetattr.mqstat = *mqstat;
2196 return 0; 2227 context->type = AUDIT_MQ_GETSETATTR;
2197
2198 if (likely(!context))
2199 return 0;
2200
2201 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2202 if (!ax)
2203 return -ENOMEM;
2204
2205 ax->mqdes = mqdes;
2206 ax->mqstat = *mqstat;
2207
2208 ax->d.type = AUDIT_MQ_GETSETATTR;
2209 ax->d.next = context->aux;
2210 context->aux = (void *)ax;
2211 return 0;
2212} 2228}
2213 2229
2214/** 2230/**
2215 * audit_ipc_obj - record audit data for ipc object 2231 * audit_ipc_obj - record audit data for ipc object
2216 * @ipcp: ipc permissions 2232 * @ipcp: ipc permissions
2217 * 2233 *
2218 * Returns 0 for success or NULL context or < 0 on error.
2219 */ 2234 */
2220int __audit_ipc_obj(struct kern_ipc_perm *ipcp) 2235void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2221{ 2236{
2222 struct audit_aux_data_ipcctl *ax;
2223 struct audit_context *context = current->audit_context; 2237 struct audit_context *context = current->audit_context;
2224 2238 context->ipc.uid = ipcp->uid;
2225 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2239 context->ipc.gid = ipcp->gid;
2226 if (!ax) 2240 context->ipc.mode = ipcp->mode;
2227 return -ENOMEM; 2241 context->ipc.has_perm = 0;
2228 2242 security_ipc_getsecid(ipcp, &context->ipc.osid);
2229 ax->uid = ipcp->uid; 2243 context->type = AUDIT_IPC;
2230 ax->gid = ipcp->gid;
2231 ax->mode = ipcp->mode;
2232 security_ipc_getsecid(ipcp, &ax->osid);
2233 ax->d.type = AUDIT_IPC;
2234 ax->d.next = context->aux;
2235 context->aux = (void *)ax;
2236 return 0;
2237} 2244}
2238 2245
2239/** 2246/**
@@ -2243,26 +2250,17 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2243 * @gid: msgq group id 2250 * @gid: msgq group id
2244 * @mode: msgq mode (permissions) 2251 * @mode: msgq mode (permissions)
2245 * 2252 *
2246 * Returns 0 for success or NULL context or < 0 on error. 2253 * Called only after audit_ipc_obj().
2247 */ 2254 */
2248int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2255void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
2249{ 2256{
2250 struct audit_aux_data_ipcctl *ax;
2251 struct audit_context *context = current->audit_context; 2257 struct audit_context *context = current->audit_context;
2252 2258
2253 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2259 context->ipc.qbytes = qbytes;
2254 if (!ax) 2260 context->ipc.perm_uid = uid;
2255 return -ENOMEM; 2261 context->ipc.perm_gid = gid;
2256 2262 context->ipc.perm_mode = mode;
2257 ax->qbytes = qbytes; 2263 context->ipc.has_perm = 1;
2258 ax->uid = uid;
2259 ax->gid = gid;
2260 ax->mode = mode;
2261
2262 ax->d.type = AUDIT_IPC_SET_PERM;
2263 ax->d.next = context->aux;
2264 context->aux = (void *)ax;
2265 return 0;
2266} 2264}
2267 2265
2268int audit_bprm(struct linux_binprm *bprm) 2266int audit_bprm(struct linux_binprm *bprm)
@@ -2292,27 +2290,17 @@ int audit_bprm(struct linux_binprm *bprm)
2292 * @nargs: number of args 2290 * @nargs: number of args
2293 * @args: args array 2291 * @args: args array
2294 * 2292 *
2295 * Returns 0 for success or NULL context or < 0 on error.
2296 */ 2293 */
2297int audit_socketcall(int nargs, unsigned long *args) 2294void audit_socketcall(int nargs, unsigned long *args)
2298{ 2295{
2299 struct audit_aux_data_socketcall *ax;
2300 struct audit_context *context = current->audit_context; 2296 struct audit_context *context = current->audit_context;
2301 2297
2302 if (likely(!context || context->dummy)) 2298 if (likely(!context || context->dummy))
2303 return 0; 2299 return;
2304
2305 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
2306 if (!ax)
2307 return -ENOMEM;
2308
2309 ax->nargs = nargs;
2310 memcpy(ax->args, args, nargs * sizeof(unsigned long));
2311 2300
2312 ax->d.type = AUDIT_SOCKETCALL; 2301 context->type = AUDIT_SOCKETCALL;
2313 ax->d.next = context->aux; 2302 context->socketcall.nargs = nargs;
2314 context->aux = (void *)ax; 2303 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
2315 return 0;
2316} 2304}
2317 2305
2318/** 2306/**
@@ -2320,29 +2308,12 @@ int audit_socketcall(int nargs, unsigned long *args)
2320 * @fd1: the first file descriptor 2308 * @fd1: the first file descriptor
2321 * @fd2: the second file descriptor 2309 * @fd2: the second file descriptor
2322 * 2310 *
2323 * Returns 0 for success or NULL context or < 0 on error.
2324 */ 2311 */
2325int __audit_fd_pair(int fd1, int fd2) 2312void __audit_fd_pair(int fd1, int fd2)
2326{ 2313{
2327 struct audit_context *context = current->audit_context; 2314 struct audit_context *context = current->audit_context;
2328 struct audit_aux_data_fd_pair *ax; 2315 context->fds[0] = fd1;
2329 2316 context->fds[1] = fd2;
2330 if (likely(!context)) {
2331 return 0;
2332 }
2333
2334 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2335 if (!ax) {
2336 return -ENOMEM;
2337 }
2338
2339 ax->fd[0] = fd1;
2340 ax->fd[1] = fd2;
2341
2342 ax->d.type = AUDIT_FD_PAIR;
2343 ax->d.next = context->aux;
2344 context->aux = (void *)ax;
2345 return 0;
2346} 2317}
2347 2318
2348/** 2319/**
@@ -2354,22 +2325,20 @@ int __audit_fd_pair(int fd1, int fd2)
2354 */ 2325 */
2355int audit_sockaddr(int len, void *a) 2326int audit_sockaddr(int len, void *a)
2356{ 2327{
2357 struct audit_aux_data_sockaddr *ax;
2358 struct audit_context *context = current->audit_context; 2328 struct audit_context *context = current->audit_context;
2359 2329
2360 if (likely(!context || context->dummy)) 2330 if (likely(!context || context->dummy))
2361 return 0; 2331 return 0;
2362 2332
2363 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); 2333 if (!context->sockaddr) {
2364 if (!ax) 2334 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2365 return -ENOMEM; 2335 if (!p)
2366 2336 return -ENOMEM;
2367 ax->len = len; 2337 context->sockaddr = p;
2368 memcpy(ax->a, a, len); 2338 }
2369 2339
2370 ax->d.type = AUDIT_SOCKADDR; 2340 context->sockaddr_len = len;
2371 ax->d.next = context->aux; 2341 memcpy(context->sockaddr, a, len);
2372 context->aux = (void *)ax;
2373 return 0; 2342 return 0;
2374} 2343}
2375 2344
@@ -2379,7 +2348,7 @@ void __audit_ptrace(struct task_struct *t)
2379 2348
2380 context->target_pid = t->pid; 2349 context->target_pid = t->pid;
2381 context->target_auid = audit_get_loginuid(t); 2350 context->target_auid = audit_get_loginuid(t);
2382 context->target_uid = t->uid; 2351 context->target_uid = task_uid(t);
2383 context->target_sessionid = audit_get_sessionid(t); 2352 context->target_sessionid = audit_get_sessionid(t);
2384 security_task_getsecid(t, &context->target_sid); 2353 security_task_getsecid(t, &context->target_sid);
2385 memcpy(context->target_comm, t->comm, TASK_COMM_LEN); 2354 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
@@ -2398,6 +2367,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2398 struct audit_aux_data_pids *axp; 2367 struct audit_aux_data_pids *axp;
2399 struct task_struct *tsk = current; 2368 struct task_struct *tsk = current;
2400 struct audit_context *ctx = tsk->audit_context; 2369 struct audit_context *ctx = tsk->audit_context;
2370 uid_t uid = current_uid(), t_uid = task_uid(t);
2401 2371
2402 if (audit_pid && t->tgid == audit_pid) { 2372 if (audit_pid && t->tgid == audit_pid) {
2403 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { 2373 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
@@ -2405,7 +2375,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2405 if (tsk->loginuid != -1) 2375 if (tsk->loginuid != -1)
2406 audit_sig_uid = tsk->loginuid; 2376 audit_sig_uid = tsk->loginuid;
2407 else 2377 else
2408 audit_sig_uid = tsk->uid; 2378 audit_sig_uid = uid;
2409 security_task_getsecid(tsk, &audit_sig_sid); 2379 security_task_getsecid(tsk, &audit_sig_sid);
2410 } 2380 }
2411 if (!audit_signals || audit_dummy_context()) 2381 if (!audit_signals || audit_dummy_context())
@@ -2417,7 +2387,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2417 if (!ctx->target_pid) { 2387 if (!ctx->target_pid) {
2418 ctx->target_pid = t->tgid; 2388 ctx->target_pid = t->tgid;
2419 ctx->target_auid = audit_get_loginuid(t); 2389 ctx->target_auid = audit_get_loginuid(t);
2420 ctx->target_uid = t->uid; 2390 ctx->target_uid = t_uid;
2421 ctx->target_sessionid = audit_get_sessionid(t); 2391 ctx->target_sessionid = audit_get_sessionid(t);
2422 security_task_getsecid(t, &ctx->target_sid); 2392 security_task_getsecid(t, &ctx->target_sid);
2423 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); 2393 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
@@ -2438,7 +2408,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2438 2408
2439 axp->target_pid[axp->pid_count] = t->tgid; 2409 axp->target_pid[axp->pid_count] = t->tgid;
2440 axp->target_auid[axp->pid_count] = audit_get_loginuid(t); 2410 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2441 axp->target_uid[axp->pid_count] = t->uid; 2411 axp->target_uid[axp->pid_count] = t_uid;
2442 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); 2412 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2443 security_task_getsecid(t, &axp->target_sid[axp->pid_count]); 2413 security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
2444 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); 2414 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
@@ -2448,6 +2418,72 @@ int __audit_signal_info(int sig, struct task_struct *t)
2448} 2418}
2449 2419
2450/** 2420/**
2421 * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
2422 * @bprm: pointer to the bprm being processed
2423 * @new: the proposed new credentials
2424 * @old: the old credentials
2425 *
2426 * Simply check if the proc already has the caps given by the file and if not
2427 * store the priv escalation info for later auditing at the end of the syscall
2428 *
2429 * -Eric
2430 */
2431int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2432 const struct cred *new, const struct cred *old)
2433{
2434 struct audit_aux_data_bprm_fcaps *ax;
2435 struct audit_context *context = current->audit_context;
2436 struct cpu_vfs_cap_data vcaps;
2437 struct dentry *dentry;
2438
2439 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2440 if (!ax)
2441 return -ENOMEM;
2442
2443 ax->d.type = AUDIT_BPRM_FCAPS;
2444 ax->d.next = context->aux;
2445 context->aux = (void *)ax;
2446
2447 dentry = dget(bprm->file->f_dentry);
2448 get_vfs_caps_from_disk(dentry, &vcaps);
2449 dput(dentry);
2450
2451 ax->fcap.permitted = vcaps.permitted;
2452 ax->fcap.inheritable = vcaps.inheritable;
2453 ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
2454 ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
2455
2456 ax->old_pcap.permitted = old->cap_permitted;
2457 ax->old_pcap.inheritable = old->cap_inheritable;
2458 ax->old_pcap.effective = old->cap_effective;
2459
2460 ax->new_pcap.permitted = new->cap_permitted;
2461 ax->new_pcap.inheritable = new->cap_inheritable;
2462 ax->new_pcap.effective = new->cap_effective;
2463 return 0;
2464}
2465
2466/**
2467 * __audit_log_capset - store information about the arguments to the capset syscall
2468 * @pid: target pid of the capset call
2469 * @new: the new credentials
2470 * @old: the old (current) credentials
2471 *
2472 * Record the aguments userspace sent to sys_capset for later printing by the
2473 * audit system if applicable
2474 */
2475void __audit_log_capset(pid_t pid,
2476 const struct cred *new, const struct cred *old)
2477{
2478 struct audit_context *context = current->audit_context;
2479 context->capset.pid = pid;
2480 context->capset.cap.effective = new->cap_effective;
2481 context->capset.cap.inheritable = new->cap_effective;
2482 context->capset.cap.permitted = new->cap_permitted;
2483 context->type = AUDIT_CAPSET;
2484}
2485
2486/**
2451 * audit_core_dumps - record information about processes that end abnormally 2487 * audit_core_dumps - record information about processes that end abnormally
2452 * @signr: signal value 2488 * @signr: signal value
2453 * 2489 *
@@ -2458,7 +2494,8 @@ void audit_core_dumps(long signr)
2458{ 2494{
2459 struct audit_buffer *ab; 2495 struct audit_buffer *ab;
2460 u32 sid; 2496 u32 sid;
2461 uid_t auid = audit_get_loginuid(current); 2497 uid_t auid = audit_get_loginuid(current), uid;
2498 gid_t gid;
2462 unsigned int sessionid = audit_get_sessionid(current); 2499 unsigned int sessionid = audit_get_sessionid(current);
2463 2500
2464 if (!audit_enabled) 2501 if (!audit_enabled)
@@ -2468,8 +2505,9 @@ void audit_core_dumps(long signr)
2468 return; 2505 return;
2469 2506
2470 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2507 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2508 current_uid_gid(&uid, &gid);
2471 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", 2509 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2472 auid, current->uid, current->gid, sessionid); 2510 auid, uid, gid, sessionid);
2473 security_task_getsecid(current, &sid); 2511 security_task_getsecid(current, &sid);
2474 if (sid) { 2512 if (sid) {
2475 char *ctx = NULL; 2513 char *ctx = NULL;
diff --git a/kernel/capability.c b/kernel/capability.c
index 33e51e78c2d8..688926e496be 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -7,6 +7,7 @@
7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> 7 * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net>
8 */ 8 */
9 9
10#include <linux/audit.h>
10#include <linux/capability.h> 11#include <linux/capability.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
@@ -14,12 +15,7 @@
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
15#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18#include "cred-internals.h"
18/*
19 * This lock protects task->cap_* for all tasks including current.
20 * Locking rule: acquire this prior to tasklist_lock.
21 */
22static DEFINE_SPINLOCK(task_capability_lock);
23 19
24/* 20/*
25 * Leveraged for setting/resetting capabilities 21 * Leveraged for setting/resetting capabilities
@@ -33,6 +29,17 @@ EXPORT_SYMBOL(__cap_empty_set);
33EXPORT_SYMBOL(__cap_full_set); 29EXPORT_SYMBOL(__cap_full_set);
34EXPORT_SYMBOL(__cap_init_eff_set); 30EXPORT_SYMBOL(__cap_init_eff_set);
35 31
32#ifdef CONFIG_SECURITY_FILE_CAPABILITIES
33int file_caps_enabled = 1;
34
35static int __init file_caps_disable(char *str)
36{
37 file_caps_enabled = 0;
38 return 1;
39}
40__setup("no_file_caps", file_caps_disable);
41#endif
42
36/* 43/*
37 * More recent versions of libcap are available from: 44 * More recent versions of libcap are available from:
38 * 45 *
@@ -115,167 +122,12 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
115 return 0; 122 return 0;
116} 123}
117 124
118#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
119
120/* 125/*
121 * Without filesystem capability support, we nominally support one process 126 * The only thing that can change the capabilities of the current
122 * setting the capabilities of another 127 * process is the current process. As such, we can't be in this code
123 */ 128 * at the same time as we are in the process of setting capabilities
124static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, 129 * in this process. The net result is that we can limit our use of
125 kernel_cap_t *pIp, kernel_cap_t *pPp) 130 * locks to when we are reading the caps of another process.
126{
127 struct task_struct *target;
128 int ret;
129
130 spin_lock(&task_capability_lock);
131 read_lock(&tasklist_lock);
132
133 if (pid && pid != task_pid_vnr(current)) {
134 target = find_task_by_vpid(pid);
135 if (!target) {
136 ret = -ESRCH;
137 goto out;
138 }
139 } else
140 target = current;
141
142 ret = security_capget(target, pEp, pIp, pPp);
143
144out:
145 read_unlock(&tasklist_lock);
146 spin_unlock(&task_capability_lock);
147
148 return ret;
149}
150
151/*
152 * cap_set_pg - set capabilities for all processes in a given process
153 * group. We call this holding task_capability_lock and tasklist_lock.
154 */
155static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
156 kernel_cap_t *inheritable,
157 kernel_cap_t *permitted)
158{
159 struct task_struct *g, *target;
160 int ret = -EPERM;
161 int found = 0;
162 struct pid *pgrp;
163
164 spin_lock(&task_capability_lock);
165 read_lock(&tasklist_lock);
166
167 pgrp = find_vpid(pgrp_nr);
168 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
169 target = g;
170 while_each_thread(g, target) {
171 if (!security_capset_check(target, effective,
172 inheritable, permitted)) {
173 security_capset_set(target, effective,
174 inheritable, permitted);
175 ret = 0;
176 }
177 found = 1;
178 }
179 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
180
181 read_unlock(&tasklist_lock);
182 spin_unlock(&task_capability_lock);
183
184 if (!found)
185 ret = 0;
186 return ret;
187}
188
189/*
190 * cap_set_all - set capabilities for all processes other than init
191 * and self. We call this holding task_capability_lock and tasklist_lock.
192 */
193static inline int cap_set_all(kernel_cap_t *effective,
194 kernel_cap_t *inheritable,
195 kernel_cap_t *permitted)
196{
197 struct task_struct *g, *target;
198 int ret = -EPERM;
199 int found = 0;
200
201 spin_lock(&task_capability_lock);
202 read_lock(&tasklist_lock);
203
204 do_each_thread(g, target) {
205 if (target == current
206 || is_container_init(target->group_leader))
207 continue;
208 found = 1;
209 if (security_capset_check(target, effective, inheritable,
210 permitted))
211 continue;
212 ret = 0;
213 security_capset_set(target, effective, inheritable, permitted);
214 } while_each_thread(g, target);
215
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 if (!found)
220 ret = 0;
221
222 return ret;
223}
224
225/*
226 * Given the target pid does not refer to the current process we
227 * need more elaborate support... (This support is not present when
228 * filesystem capabilities are configured.)
229 */
230static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
231 kernel_cap_t *inheritable,
232 kernel_cap_t *permitted)
233{
234 struct task_struct *target;
235 int ret;
236
237 if (!capable(CAP_SETPCAP))
238 return -EPERM;
239
240 if (pid == -1) /* all procs other than current and init */
241 return cap_set_all(effective, inheritable, permitted);
242
243 else if (pid < 0) /* all procs in process group */
244 return cap_set_pg(-pid, effective, inheritable, permitted);
245
246 /* target != current */
247 spin_lock(&task_capability_lock);
248 read_lock(&tasklist_lock);
249
250 target = find_task_by_vpid(pid);
251 if (!target)
252 ret = -ESRCH;
253 else {
254 ret = security_capset_check(target, effective, inheritable,
255 permitted);
256
257 /* having verified that the proposed changes are legal,
258 we now put them into effect. */
259 if (!ret)
260 security_capset_set(target, effective, inheritable,
261 permitted);
262 }
263
264 read_unlock(&tasklist_lock);
265 spin_unlock(&task_capability_lock);
266
267 return ret;
268}
269
270#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
271
272/*
273 * If we have configured with filesystem capability support, then the
274 * only thing that can change the capabilities of the current process
275 * is the current process. As such, we can't be in this code at the
276 * same time as we are in the process of setting capabilities in this
277 * process. The net result is that we can limit our use of locks to
278 * when we are reading the caps of another process.
279 */ 131 */
280static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, 132static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
281 kernel_cap_t *pIp, kernel_cap_t *pPp) 133 kernel_cap_t *pIp, kernel_cap_t *pPp)
@@ -285,7 +137,6 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
285 if (pid && (pid != task_pid_vnr(current))) { 137 if (pid && (pid != task_pid_vnr(current))) {
286 struct task_struct *target; 138 struct task_struct *target;
287 139
288 spin_lock(&task_capability_lock);
289 read_lock(&tasklist_lock); 140 read_lock(&tasklist_lock);
290 141
291 target = find_task_by_vpid(pid); 142 target = find_task_by_vpid(pid);
@@ -295,50 +146,12 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
295 ret = security_capget(target, pEp, pIp, pPp); 146 ret = security_capget(target, pEp, pIp, pPp);
296 147
297 read_unlock(&tasklist_lock); 148 read_unlock(&tasklist_lock);
298 spin_unlock(&task_capability_lock);
299 } else 149 } else
300 ret = security_capget(current, pEp, pIp, pPp); 150 ret = security_capget(current, pEp, pIp, pPp);
301 151
302 return ret; 152 return ret;
303} 153}
304 154
305/*
306 * With filesystem capability support configured, the kernel does not
307 * permit the changing of capabilities in one process by another
308 * process. (CAP_SETPCAP has much less broad semantics when configured
309 * this way.)
310 */
311static inline int do_sys_capset_other_tasks(pid_t pid,
312 kernel_cap_t *effective,
313 kernel_cap_t *inheritable,
314 kernel_cap_t *permitted)
315{
316 return -EPERM;
317}
318
319#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
320
321/*
322 * Atomically modify the effective capabilities returning the original
323 * value. No permission check is performed here - it is assumed that the
324 * caller is permitted to set the desired effective capabilities.
325 */
326kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
327{
328 kernel_cap_t pE_old;
329
330 spin_lock(&task_capability_lock);
331
332 pE_old = current->cap_effective;
333 current->cap_effective = pE_new;
334
335 spin_unlock(&task_capability_lock);
336
337 return pE_old;
338}
339
340EXPORT_SYMBOL(cap_set_effective);
341
342/** 155/**
343 * sys_capget - get the capabilities of a given process. 156 * sys_capget - get the capabilities of a given process.
344 * @header: pointer to struct that contains capability version and 157 * @header: pointer to struct that contains capability version and
@@ -366,7 +179,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
366 return -EINVAL; 179 return -EINVAL;
367 180
368 ret = cap_get_target_pid(pid, &pE, &pI, &pP); 181 ret = cap_get_target_pid(pid, &pE, &pI, &pP);
369
370 if (!ret) { 182 if (!ret) {
371 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 183 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
372 unsigned i; 184 unsigned i;
@@ -412,16 +224,14 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
412 * @data: pointer to struct that contains the effective, permitted, 224 * @data: pointer to struct that contains the effective, permitted,
413 * and inheritable capabilities 225 * and inheritable capabilities
414 * 226 *
415 * Set capabilities for a given process, all processes, or all 227 * Set capabilities for the current process only. The ability to any other
416 * processes in a given process group. 228 * process(es) has been deprecated and removed.
417 * 229 *
418 * The restrictions on setting capabilities are specified as: 230 * The restrictions on setting capabilities are specified as:
419 * 231 *
420 * [pid is for the 'target' task. 'current' is the calling task.] 232 * I: any raised capabilities must be a subset of the old permitted
421 * 233 * P: any raised capabilities must be a subset of the old permitted
422 * I: any raised capabilities must be a subset of the (old current) permitted 234 * E: must be set to a subset of new permitted
423 * P: any raised capabilities must be a subset of the (old current) permitted
424 * E: must be set to a subset of (new target) permitted
425 * 235 *
426 * Returns 0 on success and < 0 on error. 236 * Returns 0 on success and < 0 on error.
427 */ 237 */
@@ -430,6 +240,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
430 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
431 unsigned i, tocopy; 241 unsigned i, tocopy;
432 kernel_cap_t inheritable, permitted, effective; 242 kernel_cap_t inheritable, permitted, effective;
243 struct cred *new;
433 int ret; 244 int ret;
434 pid_t pid; 245 pid_t pid;
435 246
@@ -440,10 +251,13 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
440 if (get_user(pid, &header->pid)) 251 if (get_user(pid, &header->pid))
441 return -EFAULT; 252 return -EFAULT;
442 253
443 if (copy_from_user(&kdata, data, tocopy 254 /* may only affect current now */
444 * sizeof(struct __user_cap_data_struct))) { 255 if (pid != 0 && pid != task_pid_vnr(current))
256 return -EPERM;
257
258 if (copy_from_user(&kdata, data,
259 tocopy * sizeof(struct __user_cap_data_struct)))
445 return -EFAULT; 260 return -EFAULT;
446 }
447 261
448 for (i = 0; i < tocopy; i++) { 262 for (i = 0; i < tocopy; i++) {
449 effective.cap[i] = kdata[i].effective; 263 effective.cap[i] = kdata[i].effective;
@@ -457,32 +271,21 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
457 i++; 271 i++;
458 } 272 }
459 273
460 if (pid && (pid != task_pid_vnr(current))) 274 new = prepare_creds();
461 ret = do_sys_capset_other_tasks(pid, &effective, &inheritable, 275 if (!new)
462 &permitted); 276 return -ENOMEM;
463 else {
464 /*
465 * This lock is required even when filesystem
466 * capability support is configured - it protects the
467 * sys_capget() call from returning incorrect data in
468 * the case that the targeted process is not the
469 * current one.
470 */
471 spin_lock(&task_capability_lock);
472 277
473 ret = security_capset_check(current, &effective, &inheritable, 278 ret = security_capset(new, current_cred(),
474 &permitted); 279 &effective, &inheritable, &permitted);
475 /* 280 if (ret < 0)
476 * Having verified that the proposed changes are 281 goto error;
477 * legal, we now put them into effect. 282
478 */ 283 audit_log_capset(pid, new, current_cred());
479 if (!ret)
480 security_capset_set(current, &effective, &inheritable,
481 &permitted);
482 spin_unlock(&task_capability_lock);
483 }
484 284
285 return commit_creds(new);
485 286
287error:
288 abort_creds(new);
486 return ret; 289 return ret;
487} 290}
488 291
@@ -498,7 +301,12 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
498 */ 301 */
499int capable(int cap) 302int capable(int cap)
500{ 303{
501 if (has_capability(current, cap)) { 304 if (unlikely(!cap_valid(cap))) {
305 printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
306 BUG();
307 }
308
309 if (security_capable(cap) == 0) {
502 current->flags |= PF_SUPERPRIV; 310 current->flags |= PF_SUPERPRIV;
503 return 1; 311 return 1;
504 } 312 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2606d0fb4e54..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
116 * be called. 116 * be called.
117 */ 117 */
118static int need_forkexit_callback __read_mostly; 118static int need_forkexit_callback __read_mostly;
119static int need_mm_owner_callback __read_mostly;
120 119
121/* convenient tests for these bits */ 120/* convenient tests for these bits */
122inline int cgroup_is_removed(const struct cgroup *cgrp) 121inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
149#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
150list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
151 150
152/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
153#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
154list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
155 154
156/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
272 271
273 rcu_read_lock(); 272 rcu_read_lock();
274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
275 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
276 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
277 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
278 if (taskexit) 277 if (taskexit)
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
385 return 0; 384 return 0;
386} 385}
387 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
388/* 406/*
389 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
390 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -400,7 +418,6 @@ static struct css_set *find_css_set(
400 int i; 418 int i;
401 419
402 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
403 struct cg_cgroup_link *link;
404 421
405 struct hlist_head *hhead; 422 struct hlist_head *hhead;
406 423
@@ -445,26 +462,11 @@ static struct css_set *find_css_set(
445 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
446 * hierarchy 463 * hierarchy
447 */ 464 */
448 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
449 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
450 link = list_entry(tmp_cg_links.next,
451 struct cg_cgroup_link,
452 cgrp_link_list);
453 list_del(&link->cgrp_link_list);
454 list_add(&link->cgrp_link_list, &cgrp->css_sets);
455 link->cg = res;
456 list_add(&link->cg_link_list, &res->cg_links);
457 }
458 }
459 if (list_empty(&rootnode.subsys_list)) {
460 link = list_entry(tmp_cg_links.next,
461 struct cg_cgroup_link,
462 cgrp_link_list);
463 list_del(&link->cgrp_link_list);
464 list_add(&link->cgrp_link_list, &dummytop->css_sets);
465 link->cg = res;
466 list_add(&link->cg_link_list, &res->cg_links);
467 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
468 470
469 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
470 472
@@ -571,9 +573,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
571 573
572 if (inode) { 574 if (inode) {
573 inode->i_mode = mode; 575 inode->i_mode = mode;
574 inode->i_uid = current->fsuid; 576 inode->i_uid = current_fsuid();
575 inode->i_gid = current->fsgid; 577 inode->i_gid = current_fsgid();
576 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 578 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 579 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 580 }
@@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
588{ 589{
589 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
590 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
591 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
592 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
593 return; 594 return;
594} 595}
595 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
596static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
597{ 605{
598 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
612 /* 620 /*
613 * Release the subsystem state objects. 621 * Release the subsystem state objects.
614 */ 622 */
615 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
616 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
617 ss->destroy(ss, cgrp);
618 }
619 625
620 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
621 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
622 628
623 /* Drop the active superblock reference that we took when we 629 /*
624 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
625 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
626 634
627 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
628 } 636 }
629 iput(inode); 637 iput(inode);
630} 638}
@@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
714 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
715 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
716 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
717 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
718 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
719 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
720 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
721 if (ss->bind) 730 if (ss->bind)
722 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
723 732 mutex_unlock(&ss->hierarchy_mutex);
724 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
725 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
726 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
727 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
728 if (ss->bind) 738 if (ss->bind)
729 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
730 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
731 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
732 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
733 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
734 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
735 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
736 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
992 root = NULL; 1003 root = NULL;
993 } else { 1004 } else {
994 /* New superblock */ 1005 /* New superblock */
995 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
996 struct inode *inode; 1007 struct inode *inode;
997 int i; 1008 int i;
998 1009
@@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1033 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1034 root_count++; 1045 root_count++;
1035 1046
1036 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1037 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1038 1049
1039 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1044 struct hlist_node *node; 1055 struct hlist_node *node;
1045 struct css_set *cg; 1056 struct css_set *cg;
1046 1057
1047 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1048 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1049
1050 BUG_ON(list_empty(&tmp_cg_links));
1051 link = list_entry(tmp_cg_links.next,
1052 struct cg_cgroup_link,
1053 cgrp_link_list);
1054 list_del(&link->cgrp_link_list);
1055 link->cg = cg;
1056 list_add(&link->cgrp_link_list,
1057 &root->top_cgroup.css_sets);
1058 list_add(&link->cg_link_list, &cg->cg_links);
1059 }
1060 } 1060 }
1061 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1062 1062
1063 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1064 1064
1065 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1066 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1067 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1068 1068
1069 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1070 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1071 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1072 } 1072 }
@@ -1115,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1115 } 1115 }
1116 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1117 1117
1118 if (!list_empty(&root->root_list)) { 1118 list_del(&root->root_list);
1119 list_del(&root->root_list); 1119 root_count--;
1120 root_count--; 1120
1121 }
1122 mutex_unlock(&cgroup_mutex); 1121 mutex_unlock(&cgroup_mutex);
1123 1122
1124 kfree(root); 1123 kfree(root);
@@ -1147,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1147 * @buf: the buffer to write the path into 1146 * @buf: the buffer to write the path into
1148 * @buflen: the length of the buffer 1147 * @buflen: the length of the buffer
1149 * 1148 *
1150 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1149 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1151 * Returns 0 on success, -errno on error. 1150 * reference. Writes path of cgroup into buf. Returns 0 on success,
1151 * -errno on error.
1152 */ 1152 */
1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1154{ 1154{
1155 char *start; 1155 char *start;
1156 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1156 1157
1157 if (cgrp == dummytop) { 1158 if (!dentry || cgrp == dummytop) {
1158 /* 1159 /*
1159 * Inactive subsystems have no dentry for their root 1160 * Inactive subsystems have no dentry for their root
1160 * cgroup 1161 * cgroup
@@ -1167,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1167 1168
1168 *--start = '\0'; 1169 *--start = '\0';
1169 for (;;) { 1170 for (;;) {
1170 int len = cgrp->dentry->d_name.len; 1171 int len = dentry->d_name.len;
1171 if ((start -= len) < buf) 1172 if ((start -= len) < buf)
1172 return -ENAMETOOLONG; 1173 return -ENAMETOOLONG;
1173 memcpy(start, cgrp->dentry->d_name.name, len); 1174 memcpy(start, cgrp->dentry->d_name.name, len);
1174 cgrp = cgrp->parent; 1175 cgrp = cgrp->parent;
1175 if (!cgrp) 1176 if (!cgrp)
1176 break; 1177 break;
1178 dentry = rcu_dereference(cgrp->dentry);
1177 if (!cgrp->parent) 1179 if (!cgrp->parent)
1178 continue; 1180 continue;
1179 if (--start < buf) 1181 if (--start < buf)
@@ -1218,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1218 int retval = 0; 1220 int retval = 0;
1219 struct cgroup_subsys *ss; 1221 struct cgroup_subsys *ss;
1220 struct cgroup *oldcgrp; 1222 struct cgroup *oldcgrp;
1221 struct css_set *cg = tsk->cgroups; 1223 struct css_set *cg;
1222 struct css_set *newcg; 1224 struct css_set *newcg;
1223 struct cgroupfs_root *root = cgrp->root; 1225 struct cgroupfs_root *root = cgrp->root;
1224 int subsys_id; 1226 int subsys_id;
@@ -1238,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1238 } 1240 }
1239 } 1241 }
1240 1242
1243 task_lock(tsk);
1244 cg = tsk->cgroups;
1245 get_css_set(cg);
1246 task_unlock(tsk);
1241 /* 1247 /*
1242 * Locate or allocate a new css_set for this task, 1248 * Locate or allocate a new css_set for this task,
1243 * based on its final set of cgroups 1249 * based on its final set of cgroups
1244 */ 1250 */
1245 newcg = find_css_set(cg, cgrp); 1251 newcg = find_css_set(cg, cgrp);
1252 put_css_set(cg);
1246 if (!newcg) 1253 if (!newcg)
1247 return -ENOMEM; 1254 return -ENOMEM;
1248 1255
@@ -1280,6 +1287,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1280static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) 1287static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1281{ 1288{
1282 struct task_struct *tsk; 1289 struct task_struct *tsk;
1290 const struct cred *cred = current_cred(), *tcred;
1283 int ret; 1291 int ret;
1284 1292
1285 if (pid) { 1293 if (pid) {
@@ -1289,14 +1297,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1289 rcu_read_unlock(); 1297 rcu_read_unlock();
1290 return -ESRCH; 1298 return -ESRCH;
1291 } 1299 }
1292 get_task_struct(tsk);
1293 rcu_read_unlock();
1294 1300
1295 if ((current->euid) && (current->euid != tsk->uid) 1301 tcred = __task_cred(tsk);
1296 && (current->euid != tsk->suid)) { 1302 if (cred->euid &&
1297 put_task_struct(tsk); 1303 cred->euid != tcred->uid &&
1304 cred->euid != tcred->suid) {
1305 rcu_read_unlock();
1298 return -EACCES; 1306 return -EACCES;
1299 } 1307 }
1308 get_task_struct(tsk);
1309 rcu_read_unlock();
1300 } else { 1310 } else {
1301 tsk = current; 1311 tsk = current;
1302 get_task_struct(tsk); 1312 get_task_struct(tsk);
@@ -1444,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1444 struct cftype *cft = __d_cft(file->f_dentry); 1454 struct cftype *cft = __d_cft(file->f_dentry);
1445 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1455 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1446 1456
1447 if (!cft || cgroup_is_removed(cgrp)) 1457 if (cgroup_is_removed(cgrp))
1448 return -ENODEV; 1458 return -ENODEV;
1449 if (cft->write) 1459 if (cft->write)
1450 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1460 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1489,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1489 struct cftype *cft = __d_cft(file->f_dentry); 1499 struct cftype *cft = __d_cft(file->f_dentry);
1490 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1500 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1491 1501
1492 if (!cft || cgroup_is_removed(cgrp)) 1502 if (cgroup_is_removed(cgrp))
1493 return -ENODEV; 1503 return -ENODEV;
1494 1504
1495 if (cft->read) 1505 if (cft->read)
@@ -1553,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1553 err = generic_file_open(inode, file); 1563 err = generic_file_open(inode, file);
1554 if (err) 1564 if (err)
1555 return err; 1565 return err;
1556
1557 cft = __d_cft(file->f_dentry); 1566 cft = __d_cft(file->f_dentry);
1558 if (!cft) 1567
1559 return -ENODEV;
1560 if (cft->read_map || cft->read_seq_string) { 1568 if (cft->read_map || cft->read_seq_string) {
1561 struct cgroup_seqfile_state *state = 1569 struct cgroup_seqfile_state *state =
1562 kzalloc(sizeof(*state), GFP_USER); 1570 kzalloc(sizeof(*state), GFP_USER);
@@ -1670,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1670 if (!error) { 1678 if (!error) {
1671 dentry->d_fsdata = cgrp; 1679 dentry->d_fsdata = cgrp;
1672 inc_nlink(parent->d_inode); 1680 inc_nlink(parent->d_inode);
1673 cgrp->dentry = dentry; 1681 rcu_assign_pointer(cgrp->dentry, dentry);
1674 dget(dentry); 1682 dget(dentry);
1675 } 1683 }
1676 dput(dentry); 1684 dput(dentry);
@@ -1811,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1811{ 1819{
1812 struct task_struct *res; 1820 struct task_struct *res;
1813 struct list_head *l = it->task; 1821 struct list_head *l = it->task;
1822 struct cg_cgroup_link *link;
1814 1823
1815 /* If the iterator cg is NULL, we have no tasks */ 1824 /* If the iterator cg is NULL, we have no tasks */
1816 if (!it->cg_link) 1825 if (!it->cg_link)
@@ -1818,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1818 res = list_entry(l, struct task_struct, cg_list); 1827 res = list_entry(l, struct task_struct, cg_list);
1819 /* Advance iterator to find next entry */ 1828 /* Advance iterator to find next entry */
1820 l = l->next; 1829 l = l->next;
1821 if (l == &res->cgroups->tasks) { 1830 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1831 if (l == &link->cg->tasks) {
1822 /* We reached the end of this task list - move on to 1832 /* We reached the end of this task list - move on to
1823 * the next cg_cgroup_link */ 1833 * the next cg_cgroup_link */
1824 cgroup_advance_iter(cgrp, it); 1834 cgroup_advance_iter(cgrp, it);
@@ -2012,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2012 */ 2022 */
2013static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2023static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2014{ 2024{
2015 int n = 0; 2025 int n = 0, pid;
2016 struct cgroup_iter it; 2026 struct cgroup_iter it;
2017 struct task_struct *tsk; 2027 struct task_struct *tsk;
2018 cgroup_iter_start(cgrp, &it); 2028 cgroup_iter_start(cgrp, &it);
2019 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2029 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2020 if (unlikely(n == npids)) 2030 if (unlikely(n == npids))
2021 break; 2031 break;
2022 pidarray[n++] = task_pid_vnr(tsk); 2032 pid = task_pid_vnr(tsk);
2033 if (pid > 0)
2034 pidarray[n++] = pid;
2023 } 2035 }
2024 cgroup_iter_end(cgrp, &it); 2036 cgroup_iter_end(cgrp, &it);
2025 return n; 2037 return n;
@@ -2051,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2051 2063
2052 ret = 0; 2064 ret = 0;
2053 cgrp = dentry->d_fsdata; 2065 cgrp = dentry->d_fsdata;
2054 rcu_read_lock();
2055 2066
2056 cgroup_iter_start(cgrp, &it); 2067 cgroup_iter_start(cgrp, &it);
2057 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2068 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2076,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2076 } 2087 }
2077 cgroup_iter_end(cgrp, &it); 2088 cgroup_iter_end(cgrp, &it);
2078 2089
2079 rcu_read_unlock();
2080err: 2090err:
2081 return ret; 2091 return ret;
2082} 2092}
@@ -2323,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2323 struct cgroup *cgrp) 2333 struct cgroup *cgrp)
2324{ 2334{
2325 css->cgroup = cgrp; 2335 css->cgroup = cgrp;
2326 atomic_set(&css->refcnt, 0); 2336 atomic_set(&css->refcnt, 1);
2327 css->flags = 0; 2337 css->flags = 0;
2328 if (cgrp == dummytop) 2338 if (cgrp == dummytop)
2329 set_bit(CSS_ROOT, &css->flags); 2339 set_bit(CSS_ROOT, &css->flags);
@@ -2331,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2331 cgrp->subsys[ss->subsys_id] = css; 2341 cgrp->subsys[ss->subsys_id] = css;
2332} 2342}
2333 2343
2344static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2345{
2346 /* We need to take each hierarchy_mutex in a consistent order */
2347 int i;
2348
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i);
2353 }
2354}
2355
2356static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2357{
2358 int i;
2359
2360 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2361 struct cgroup_subsys *ss = subsys[i];
2362 if (ss->root == root)
2363 mutex_unlock(&ss->hierarchy_mutex);
2364 }
2365}
2366
2334/* 2367/*
2335 * cgroup_create - create a cgroup 2368 * cgroup_create - create a cgroup
2336 * @parent: cgroup that will be parent of the new cgroup 2369 * @parent: cgroup that will be parent of the new cgroup
@@ -2379,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2379 init_cgroup_css(css, ss, cgrp); 2412 init_cgroup_css(css, ss, cgrp);
2380 } 2413 }
2381 2414
2415 cgroup_lock_hierarchy(root);
2382 list_add(&cgrp->sibling, &cgrp->parent->children); 2416 list_add(&cgrp->sibling, &cgrp->parent->children);
2417 cgroup_unlock_hierarchy(root);
2383 root->number_of_cgroups++; 2418 root->number_of_cgroups++;
2384 2419
2385 err = cgroup_create_dir(cgrp, dentry, mode); 2420 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2430,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2430{ 2465{
2431 /* Check the reference count on each subsystem. Since we 2466 /* Check the reference count on each subsystem. Since we
2432 * already established that there are no tasks in the 2467 * already established that there are no tasks in the
2433 * cgroup, if the css refcount is also 0, then there should 2468 * cgroup, if the css refcount is also 1, then there should
2434 * be no outstanding references, so the subsystem is safe to 2469 * be no outstanding references, so the subsystem is safe to
2435 * destroy. We scan across all subsystems rather than using 2470 * destroy. We scan across all subsystems rather than using
2436 * the per-hierarchy linked list of mounted subsystems since 2471 * the per-hierarchy linked list of mounted subsystems since
@@ -2451,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2451 * matter, since it can only happen if the cgroup 2486 * matter, since it can only happen if the cgroup
2452 * has been deleted and hence no longer needs the 2487 * has been deleted and hence no longer needs the
2453 * release agent to be called anyway. */ 2488 * release agent to be called anyway. */
2454 if (css && atomic_read(&css->refcnt)) 2489 if (css && (atomic_read(&css->refcnt) > 1))
2455 return 1; 2490 return 1;
2456 } 2491 }
2457 return 0; 2492 return 0;
2458} 2493}
2459 2494
2495/*
2496 * Atomically mark all (or else none) of the cgroup's CSS objects as
2497 * CSS_REMOVED. Return true on success, or false if the cgroup has
2498 * busy subsystems. Call with cgroup_mutex held
2499 */
2500
2501static int cgroup_clear_css_refs(struct cgroup *cgrp)
2502{
2503 struct cgroup_subsys *ss;
2504 unsigned long flags;
2505 bool failed = false;
2506 local_irq_save(flags);
2507 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt;
2510 do {
2511 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) {
2514 failed = true;
2515 goto done;
2516 }
2517 BUG_ON(!refcnt);
2518 /*
2519 * Drop the refcnt to 0 while we check other
2520 * subsystems. This will cause any racing
2521 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort
2523 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
2525 }
2526 done:
2527 for_each_subsys(cgrp->root, ss) {
2528 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2529 if (failed) {
2530 /*
2531 * Restore old refcnt if we previously managed
2532 * to clear it from 1 to 0
2533 */
2534 if (!atomic_read(&css->refcnt))
2535 atomic_set(&css->refcnt, 1);
2536 } else {
2537 /* Commit the fact that the CSS is removed */
2538 set_bit(CSS_REMOVED, &css->flags);
2539 }
2540 }
2541 local_irq_restore(flags);
2542 return !failed;
2543}
2544
2460static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2545static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2461{ 2546{
2462 struct cgroup *cgrp = dentry->d_fsdata; 2547 struct cgroup *cgrp = dentry->d_fsdata;
2463 struct dentry *d; 2548 struct dentry *d;
2464 struct cgroup *parent; 2549 struct cgroup *parent;
2465 struct super_block *sb;
2466 struct cgroupfs_root *root;
2467 2550
2468 /* the vfs holds both inode->i_mutex already */ 2551 /* the vfs holds both inode->i_mutex already */
2469 2552
@@ -2486,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2486 2569
2487 mutex_lock(&cgroup_mutex); 2570 mutex_lock(&cgroup_mutex);
2488 parent = cgrp->parent; 2571 parent = cgrp->parent;
2489 root = cgrp->root;
2490 sb = root->sb;
2491 2572
2492 if (atomic_read(&cgrp->count) 2573 if (atomic_read(&cgrp->count)
2493 || !list_empty(&cgrp->children) 2574 || !list_empty(&cgrp->children)
2494 || cgroup_has_css_refs(cgrp)) { 2575 || !cgroup_clear_css_refs(cgrp)) {
2495 mutex_unlock(&cgroup_mutex); 2576 mutex_unlock(&cgroup_mutex);
2496 return -EBUSY; 2577 return -EBUSY;
2497 } 2578 }
@@ -2501,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2501 if (!list_empty(&cgrp->release_list)) 2582 if (!list_empty(&cgrp->release_list))
2502 list_del(&cgrp->release_list); 2583 list_del(&cgrp->release_list);
2503 spin_unlock(&release_list_lock); 2584 spin_unlock(&release_list_lock);
2504 /* delete my sibling from parent->children */ 2585
2586 cgroup_lock_hierarchy(cgrp->root);
2587 /* delete this cgroup from parent->children */
2505 list_del(&cgrp->sibling); 2588 list_del(&cgrp->sibling);
2589 cgroup_unlock_hierarchy(cgrp->root);
2590
2506 spin_lock(&cgrp->dentry->d_lock); 2591 spin_lock(&cgrp->dentry->d_lock);
2507 d = dget(cgrp->dentry); 2592 d = dget(cgrp->dentry);
2508 spin_unlock(&d->d_lock); 2593 spin_unlock(&d->d_lock);
@@ -2524,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2524 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2609 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2525 2610
2526 /* Create the top cgroup state for this subsystem */ 2611 /* Create the top cgroup state for this subsystem */
2612 list_add(&ss->sibling, &rootnode.subsys_list);
2527 ss->root = &rootnode; 2613 ss->root = &rootnode;
2528 css = ss->create(ss, dummytop); 2614 css = ss->create(ss, dummytop);
2529 /* We don't handle early failures gracefully */ 2615 /* We don't handle early failures gracefully */
@@ -2537,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2537 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2623 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2538 2624
2539 need_forkexit_callback |= ss->fork || ss->exit; 2625 need_forkexit_callback |= ss->fork || ss->exit;
2540 need_mm_owner_callback |= !!ss->mm_owner_changed;
2541 2626
2542 /* At system boot, before all subsystems have been 2627 /* At system boot, before all subsystems have been
2543 * registered, no tasks have been forked, so we don't 2628 * registered, no tasks have been forked, so we don't
2544 * need to invoke fork callbacks here. */ 2629 * need to invoke fork callbacks here. */
2545 BUG_ON(!list_empty(&init_task.tasks)); 2630 BUG_ON(!list_empty(&init_task.tasks));
2546 2631
2632 mutex_init(&ss->hierarchy_mutex);
2547 ss->active = 1; 2633 ss->active = 1;
2548} 2634}
2549 2635
@@ -2562,7 +2648,6 @@ int __init cgroup_init_early(void)
2562 INIT_HLIST_NODE(&init_css_set.hlist); 2648 INIT_HLIST_NODE(&init_css_set.hlist);
2563 css_set_count = 1; 2649 css_set_count = 1;
2564 init_cgroup_root(&rootnode); 2650 init_cgroup_root(&rootnode);
2565 list_add(&rootnode.root_list, &roots);
2566 root_count = 1; 2651 root_count = 1;
2567 init_task.cgroups = &init_css_set; 2652 init_task.cgroups = &init_css_set;
2568 2653
@@ -2669,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2669 2754
2670 mutex_lock(&cgroup_mutex); 2755 mutex_lock(&cgroup_mutex);
2671 2756
2672 for_each_root(root) { 2757 for_each_active_root(root) {
2673 struct cgroup_subsys *ss; 2758 struct cgroup_subsys *ss;
2674 struct cgroup *cgrp; 2759 struct cgroup *cgrp;
2675 int subsys_id; 2760 int subsys_id;
2676 int count = 0; 2761 int count = 0;
2677 2762
2678 /* Skip this hierarchy if it has no active subsystems */
2679 if (!root->actual_subsys_bits)
2680 continue;
2681 seq_printf(m, "%lu:", root->subsys_bits); 2763 seq_printf(m, "%lu:", root->subsys_bits);
2682 for_each_subsys(root, ss) 2764 for_each_subsys(root, ss)
2683 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2765 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2787,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
2787 } 2869 }
2788} 2870}
2789 2871
2790#ifdef CONFIG_MM_OWNER
2791/**
2792 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2793 * @p: the new owner
2794 *
2795 * Called on every change to mm->owner. mm_init_owner() does not
2796 * invoke this routine, since it assigns the mm->owner the first time
2797 * and does not change it.
2798 *
2799 * The callbacks are invoked with mmap_sem held in read mode.
2800 */
2801void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2802{
2803 struct cgroup *oldcgrp, *newcgrp = NULL;
2804
2805 if (need_mm_owner_callback) {
2806 int i;
2807 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2808 struct cgroup_subsys *ss = subsys[i];
2809 oldcgrp = task_cgroup(old, ss->subsys_id);
2810 if (new)
2811 newcgrp = task_cgroup(new, ss->subsys_id);
2812 if (oldcgrp == newcgrp)
2813 continue;
2814 if (ss->mm_owner_changed)
2815 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2816 }
2817 }
2818}
2819#endif /* CONFIG_MM_OWNER */
2820
2821/** 2872/**
2822 * cgroup_post_fork - called on a new task after adding it to the task list 2873 * cgroup_post_fork - called on a new task after adding it to the task list
2823 * @child: the task in question 2874 * @child: the task in question
@@ -2831,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
2831{ 2882{
2832 if (use_task_css_set_links) { 2883 if (use_task_css_set_links) {
2833 write_lock(&css_set_lock); 2884 write_lock(&css_set_lock);
2885 task_lock(child);
2834 if (list_empty(&child->cg_list)) 2886 if (list_empty(&child->cg_list))
2835 list_add(&child->cg_list, &child->cgroups->tasks); 2887 list_add(&child->cg_list, &child->cgroups->tasks);
2888 task_unlock(child);
2836 write_unlock(&css_set_lock); 2889 write_unlock(&css_set_lock);
2837 } 2890 }
2838} 2891}
@@ -2938,14 +2991,20 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2938 mutex_unlock(&cgroup_mutex); 2991 mutex_unlock(&cgroup_mutex);
2939 return 0; 2992 return 0;
2940 } 2993 }
2994 task_lock(tsk);
2941 cg = tsk->cgroups; 2995 cg = tsk->cgroups;
2942 parent = task_cgroup(tsk, subsys->subsys_id); 2996 parent = task_cgroup(tsk, subsys->subsys_id);
2943 2997
2944 /* Pin the hierarchy */ 2998 /* Pin the hierarchy */
2945 atomic_inc(&parent->root->sb->s_active); 2999 if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
3000 /* We race with the final deactivate_super() */
3001 mutex_unlock(&cgroup_mutex);
3002 return 0;
3003 }
2946 3004
2947 /* Keep the cgroup alive */ 3005 /* Keep the cgroup alive */
2948 get_css_set(cg); 3006 get_css_set(cg);
3007 task_unlock(tsk);
2949 mutex_unlock(&cgroup_mutex); 3008 mutex_unlock(&cgroup_mutex);
2950 3009
2951 /* Now do the VFS work to create a cgroup */ 3010 /* Now do the VFS work to create a cgroup */
@@ -2964,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2964 } 3023 }
2965 3024
2966 /* Create the cgroup directory, which also creates the cgroup */ 3025 /* Create the cgroup directory, which also creates the cgroup */
2967 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3026 ret = vfs_mkdir(inode, dentry, 0755);
2968 child = __d_cgrp(dentry); 3027 child = __d_cgrp(dentry);
2969 dput(dentry); 3028 dput(dentry);
2970 if (ret) { 3029 if (ret) {
@@ -2974,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2974 goto out_release; 3033 goto out_release;
2975 } 3034 }
2976 3035
2977 if (!child) {
2978 printk(KERN_INFO
2979 "Couldn't find new cgroup %s\n", nodename);
2980 ret = -ENOMEM;
2981 goto out_release;
2982 }
2983
2984 /* The cgroup now exists. Retake cgroup_mutex and check 3036 /* The cgroup now exists. Retake cgroup_mutex and check
2985 * that we're still in the same state that we thought we 3037 * that we're still in the same state that we thought we
2986 * were. */ 3038 * were. */
@@ -3076,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
3076{ 3128{
3077 struct cgroup *cgrp = css->cgroup; 3129 struct cgroup *cgrp = css->cgroup;
3078 rcu_read_lock(); 3130 rcu_read_lock();
3079 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3131 if ((atomic_dec_return(&css->refcnt) == 1) &&
3132 notify_on_release(cgrp)) {
3080 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3133 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3081 check_for_release(cgrp); 3134 check_for_release(cgrp);
3082 } 3135 }
diff --git a/kernel/compat.c b/kernel/compat.c
index 8eafe3eb50d9..42d56544460f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h>
27 28
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29 30
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
229 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 230 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
230 return -EFAULT; 231 return -EFAULT;
231 } 232 }
233 force_successful_syscall_return();
232 return compat_jiffies_to_clock_t(jiffies); 234 return compat_jiffies_to_clock_t(jiffies);
233} 235}
234 236
@@ -454,16 +456,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
454} 456}
455 457
456static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, 458static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
457 unsigned len, cpumask_t *new_mask) 459 unsigned len, struct cpumask *new_mask)
458{ 460{
459 unsigned long *k; 461 unsigned long *k;
460 462
461 if (len < sizeof(cpumask_t)) 463 if (len < cpumask_size())
462 memset(new_mask, 0, sizeof(cpumask_t)); 464 memset(new_mask, 0, cpumask_size());
463 else if (len > sizeof(cpumask_t)) 465 else if (len > cpumask_size())
464 len = sizeof(cpumask_t); 466 len = cpumask_size();
465 467
466 k = cpus_addr(*new_mask); 468 k = cpumask_bits(new_mask);
467 return compat_get_bitmap(k, user_mask_ptr, len * 8); 469 return compat_get_bitmap(k, user_mask_ptr, len * 8);
468} 470}
469 471
@@ -471,40 +473,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
471 unsigned int len, 473 unsigned int len,
472 compat_ulong_t __user *user_mask_ptr) 474 compat_ulong_t __user *user_mask_ptr)
473{ 475{
474 cpumask_t new_mask; 476 cpumask_var_t new_mask;
475 int retval; 477 int retval;
476 478
477 retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); 479 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
480 return -ENOMEM;
481
482 retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
478 if (retval) 483 if (retval)
479 return retval; 484 goto out;
480 485
481 return sched_setaffinity(pid, &new_mask); 486 retval = sched_setaffinity(pid, new_mask);
487out:
488 free_cpumask_var(new_mask);
489 return retval;
482} 490}
483 491
484asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 492asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
485 compat_ulong_t __user *user_mask_ptr) 493 compat_ulong_t __user *user_mask_ptr)
486{ 494{
487 int ret; 495 int ret;
488 cpumask_t mask; 496 cpumask_var_t mask;
489 unsigned long *k; 497 unsigned long *k;
490 unsigned int min_length = sizeof(cpumask_t); 498 unsigned int min_length = cpumask_size();
491 499
492 if (NR_CPUS <= BITS_PER_COMPAT_LONG) 500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
493 min_length = sizeof(compat_ulong_t); 501 min_length = sizeof(compat_ulong_t);
494 502
495 if (len < min_length) 503 if (len < min_length)
496 return -EINVAL; 504 return -EINVAL;
497 505
498 ret = sched_getaffinity(pid, &mask); 506 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM;
508
509 ret = sched_getaffinity(pid, mask);
499 if (ret < 0) 510 if (ret < 0)
500 return ret; 511 goto out;
501 512
502 k = cpus_addr(mask); 513 k = cpumask_bits(mask);
503 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
504 if (ret) 515 if (ret == 0)
505 return ret; 516 ret = min_length;
506 517
507 return min_length; 518out:
519 free_cpumask_var(mask);
520 return ret;
508} 521}
509 522
510int get_compat_itimerspec(struct itimerspec *dst, 523int get_compat_itimerspec(struct itimerspec *dst,
@@ -883,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
883 896
884 if (tloc) { 897 if (tloc) {
885 if (put_user(i,tloc)) 898 if (put_user(i,tloc))
886 i = -EFAULT; 899 return -EFAULT;
887 } 900 }
901 force_successful_syscall_return();
888 return i; 902 return i;
889} 903}
890 904
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ea32e8d68b0..79e40f00dcb8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,29 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* 18#ifdef CONFIG_SMP
19 * Represents all cpu's present in the system 19/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
40/* Serializes the updates to cpu_online_map, cpu_present_map */
41static DEFINE_MUTEX(cpu_add_remove_lock); 20static DEFINE_MUTEX(cpu_add_remove_lock);
42 21
43static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -64,8 +43,6 @@ void __init cpu_hotplug_init(void)
64 cpu_hotplug.refcount = 0; 43 cpu_hotplug.refcount = 0;
65} 44}
66 45
67cpumask_t cpu_active_map;
68
69#ifdef CONFIG_HOTPLUG_CPU 46#ifdef CONFIG_HOTPLUG_CPU
70 47
71void get_online_cpus(void) 48void get_online_cpus(void)
@@ -96,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
96 73
97/* 74/*
98 * The following two API's must be used when attempting 75 * The following two API's must be used when attempting
99 * to serialize the updates to cpu_online_map, cpu_present_map. 76 * to serialize the updates to cpu_online_mask, cpu_present_mask.
100 */ 77 */
101void cpu_maps_update_begin(void) 78void cpu_maps_update_begin(void)
102{ 79{
@@ -217,7 +194,7 @@ static int __ref take_cpu_down(void *_param)
217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 194static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
218{ 195{
219 int err, nr_calls = 0; 196 int err, nr_calls = 0;
220 cpumask_t old_allowed, tmp; 197 cpumask_var_t old_allowed;
221 void *hcpu = (void *)(long)cpu; 198 void *hcpu = (void *)(long)cpu;
222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 199 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
223 struct take_cpu_down_param tcd_param = { 200 struct take_cpu_down_param tcd_param = {
@@ -231,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
231 if (!cpu_online(cpu)) 208 if (!cpu_online(cpu))
232 return -EINVAL; 209 return -EINVAL;
233 210
211 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
212 return -ENOMEM;
213
234 cpu_hotplug_begin(); 214 cpu_hotplug_begin();
235 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
236 hcpu, -1, &nr_calls); 216 hcpu, -1, &nr_calls);
@@ -245,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
245 } 225 }
246 226
247 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
248 old_allowed = current->cpus_allowed; 228 cpumask_copy(old_allowed, &current->cpus_allowed);
249 cpus_setall(tmp); 229 set_cpus_allowed_ptr(current,
250 cpu_clear(cpu, tmp); 230 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
253 231
254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp); 232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
255 if (err) { 233 if (err) {
256 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
@@ -277,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
277 check_for_tasks(cpu); 255 check_for_tasks(cpu);
278 256
279out_allowed: 257out_allowed:
280 set_cpus_allowed_ptr(current, &old_allowed); 258 set_cpus_allowed_ptr(current, old_allowed);
281out_release: 259out_release:
282 cpu_hotplug_done(); 260 cpu_hotplug_done();
283 if (!err) { 261 if (!err) {
@@ -285,13 +263,17 @@ out_release:
285 hcpu) == NOTIFY_BAD) 263 hcpu) == NOTIFY_BAD)
286 BUG(); 264 BUG();
287 } 265 }
266 free_cpumask_var(old_allowed);
288 return err; 267 return err;
289} 268}
290 269
291int __ref cpu_down(unsigned int cpu) 270int __ref cpu_down(unsigned int cpu)
292{ 271{
293 int err = 0; 272 int err;
294 273
274 err = stop_machine_create();
275 if (err)
276 return err;
295 cpu_maps_update_begin(); 277 cpu_maps_update_begin();
296 278
297 if (cpu_hotplug_disabled) { 279 if (cpu_hotplug_disabled) {
@@ -303,7 +285,7 @@ int __ref cpu_down(unsigned int cpu)
303 285
304 /* 286 /*
305 * Make sure the all cpus did the reschedule and are not 287 * Make sure the all cpus did the reschedule and are not
306 * using stale version of the cpu_active_map. 288 * using stale version of the cpu_active_mask.
307 * This is not strictly necessary becuase stop_machine() 289 * This is not strictly necessary becuase stop_machine()
308 * that we run down the line already provides the required 290 * that we run down the line already provides the required
309 * synchronization. But it's really a side effect and we do not 291 * synchronization. But it's really a side effect and we do not
@@ -318,6 +300,7 @@ int __ref cpu_down(unsigned int cpu)
318 300
319out: 301out:
320 cpu_maps_update_done(); 302 cpu_maps_update_done();
303 stop_machine_destroy();
321 return err; 304 return err;
322} 305}
323EXPORT_SYMBOL(cpu_down); 306EXPORT_SYMBOL(cpu_down);
@@ -367,7 +350,7 @@ out_notify:
367int __cpuinit cpu_up(unsigned int cpu) 350int __cpuinit cpu_up(unsigned int cpu)
368{ 351{
369 int err = 0; 352 int err = 0;
370 if (!cpu_isset(cpu, cpu_possible_map)) { 353 if (!cpu_possible(cpu)) {
371 printk(KERN_ERR "can't online cpu %d because it is not " 354 printk(KERN_ERR "can't online cpu %d because it is not "
372 "configured as may-hotadd at boot time\n", cpu); 355 "configured as may-hotadd at boot time\n", cpu);
373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 356#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
@@ -392,25 +375,28 @@ out:
392} 375}
393 376
394#ifdef CONFIG_PM_SLEEP_SMP 377#ifdef CONFIG_PM_SLEEP_SMP
395static cpumask_t frozen_cpus; 378static cpumask_var_t frozen_cpus;
396 379
397int disable_nonboot_cpus(void) 380int disable_nonboot_cpus(void)
398{ 381{
399 int cpu, first_cpu, error = 0; 382 int cpu, first_cpu, error;
400 383
384 error = stop_machine_create();
385 if (error)
386 return error;
401 cpu_maps_update_begin(); 387 cpu_maps_update_begin();
402 first_cpu = first_cpu(cpu_online_map); 388 first_cpu = cpumask_first(cpu_online_mask);
403 /* We take down all of the non-boot CPUs in one shot to avoid races 389 /* We take down all of the non-boot CPUs in one shot to avoid races
404 * with the userspace trying to use the CPU hotplug at the same time 390 * with the userspace trying to use the CPU hotplug at the same time
405 */ 391 */
406 cpus_clear(frozen_cpus); 392 cpumask_clear(frozen_cpus);
407 printk("Disabling non-boot CPUs ...\n"); 393 printk("Disabling non-boot CPUs ...\n");
408 for_each_online_cpu(cpu) { 394 for_each_online_cpu(cpu) {
409 if (cpu == first_cpu) 395 if (cpu == first_cpu)
410 continue; 396 continue;
411 error = _cpu_down(cpu, 1); 397 error = _cpu_down(cpu, 1);
412 if (!error) { 398 if (!error) {
413 cpu_set(cpu, frozen_cpus); 399 cpumask_set_cpu(cpu, frozen_cpus);
414 printk("CPU%d is down\n", cpu); 400 printk("CPU%d is down\n", cpu);
415 } else { 401 } else {
416 printk(KERN_ERR "Error taking CPU%d down: %d\n", 402 printk(KERN_ERR "Error taking CPU%d down: %d\n",
@@ -426,6 +412,7 @@ int disable_nonboot_cpus(void)
426 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 412 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
427 } 413 }
428 cpu_maps_update_done(); 414 cpu_maps_update_done();
415 stop_machine_destroy();
429 return error; 416 return error;
430} 417}
431 418
@@ -436,11 +423,11 @@ void __ref enable_nonboot_cpus(void)
436 /* Allow everyone to use the CPU hotplug again */ 423 /* Allow everyone to use the CPU hotplug again */
437 cpu_maps_update_begin(); 424 cpu_maps_update_begin();
438 cpu_hotplug_disabled = 0; 425 cpu_hotplug_disabled = 0;
439 if (cpus_empty(frozen_cpus)) 426 if (cpumask_empty(frozen_cpus))
440 goto out; 427 goto out;
441 428
442 printk("Enabling non-boot CPUs ...\n"); 429 printk("Enabling non-boot CPUs ...\n");
443 for_each_cpu_mask_nr(cpu, frozen_cpus) { 430 for_each_cpu(cpu, frozen_cpus) {
444 error = _cpu_up(cpu, 1); 431 error = _cpu_up(cpu, 1);
445 if (!error) { 432 if (!error) {
446 printk("CPU%d is up\n", cpu); 433 printk("CPU%d is up\n", cpu);
@@ -448,10 +435,18 @@ void __ref enable_nonboot_cpus(void)
448 } 435 }
449 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 436 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
450 } 437 }
451 cpus_clear(frozen_cpus); 438 cpumask_clear(frozen_cpus);
452out: 439out:
453 cpu_maps_update_done(); 440 cpu_maps_update_done();
454} 441}
442
443static int alloc_frozen_cpus(void)
444{
445 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
446 return -ENOMEM;
447 return 0;
448}
449core_initcall(alloc_frozen_cpus);
455#endif /* CONFIG_PM_SLEEP_SMP */ 450#endif /* CONFIG_PM_SLEEP_SMP */
456 451
457/** 452/**
@@ -467,7 +462,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
467 unsigned long val = CPU_STARTING; 462 unsigned long val = CPU_STARTING;
468 463
469#ifdef CONFIG_PM_SLEEP_SMP 464#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus)) 465 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN; 466 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */ 467#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 468 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
@@ -479,7 +474,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that 474 * cpu_bit_bitmap[] is a special, "compressed" data structure that
480 * represents all NR_CPUS bits binary values of 1<<nr. 475 * represents all NR_CPUS bits binary values of 1<<nr.
481 * 476 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU 477 * It is used by cpumask_of() to get a constant address to a CPU
483 * mask value that has a single bit set only. 478 * mask value that has a single bit set only.
484 */ 479 */
485 480
@@ -502,3 +497,71 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
502 497
503const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; 498const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
504EXPORT_SYMBOL(cpu_all_bits); 499EXPORT_SYMBOL(cpu_all_bits);
500
501#ifdef CONFIG_INIT_ALL_POSSIBLE
502static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
503 = CPU_BITS_ALL;
504#else
505static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
506#endif
507const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
508EXPORT_SYMBOL(cpu_possible_mask);
509
510static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
511const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
512EXPORT_SYMBOL(cpu_online_mask);
513
514static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
515const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
516EXPORT_SYMBOL(cpu_present_mask);
517
518static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
519const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
520EXPORT_SYMBOL(cpu_active_mask);
521
522void set_cpu_possible(unsigned int cpu, bool possible)
523{
524 if (possible)
525 cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
526 else
527 cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
528}
529
530void set_cpu_present(unsigned int cpu, bool present)
531{
532 if (present)
533 cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
534 else
535 cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
536}
537
538void set_cpu_online(unsigned int cpu, bool online)
539{
540 if (online)
541 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
542 else
543 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
544}
545
546void set_cpu_active(unsigned int cpu, bool active)
547{
548 if (active)
549 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
550 else
551 cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
552}
553
554void init_cpu_present(const struct cpumask *src)
555{
556 cpumask_copy(to_cpumask(cpu_present_bits), src);
557}
558
559void init_cpu_possible(const struct cpumask *src)
560{
561 cpumask_copy(to_cpumask(cpu_possible_bits), src);
562}
563
564void init_cpu_online(const struct cpumask *src)
565{
566 cpumask_copy(to_cpumask(cpu_online_bits), src);
567}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8cd..647c77a88fcb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset {
84 struct cgroup_subsys_state css; 84 struct cgroup_subsys_state css;
85 85
86 unsigned long flags; /* "unsigned long" so bitops work */ 86 unsigned long flags; /* "unsigned long" so bitops work */
87 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 89
90 struct cpuset *parent; /* my parent */ 90 struct cpuset *parent; /* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
195 195
196static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
198 .cpus_allowed = CPU_MASK_ALL,
199 .mems_allowed = NODE_MASK_ALL,
200}; 198};
201 199
202/* 200/*
@@ -240,6 +238,17 @@ static struct cpuset top_cpuset = {
240static DEFINE_MUTEX(callback_mutex); 238static DEFINE_MUTEX(callback_mutex);
241 239
242/* 240/*
241 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
242 * buffers. They are statically allocated to prevent using excess stack
243 * when calling cpuset_print_task_mems_allowed().
244 */
245#define CPUSET_NAME_LEN (128)
246#define CPUSET_NODELIST_LEN (256)
247static char cpuset_name[CPUSET_NAME_LEN];
248static char cpuset_nodelist[CPUSET_NODELIST_LEN];
249static DEFINE_SPINLOCK(cpuset_buffer_lock);
250
251/*
243 * This is ugly, but preserves the userspace API for existing cpuset 252 * This is ugly, but preserves the userspace API for existing cpuset
244 * users. If someone tries to mount the "cpuset" filesystem, we 253 * users. If someone tries to mount the "cpuset" filesystem, we
245 * silently switch it to mount "cgroup" instead 254 * silently switch it to mount "cgroup" instead
@@ -267,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
267}; 276};
268 277
269/* 278/*
270 * Return in *pmask the portion of a cpusets's cpus_allowed that 279 * Return in pmask the portion of a cpusets's cpus_allowed that
271 * are online. If none are online, walk up the cpuset hierarchy 280 * are online. If none are online, walk up the cpuset hierarchy
272 * until we find one that does have some online cpus. If we get 281 * until we find one that does have some online cpus. If we get
273 * all the way to the top and still haven't found any online cpus, 282 * all the way to the top and still haven't found any online cpus,
@@ -280,15 +289,16 @@ static struct file_system_type cpuset_fs_type = {
280 * Call with callback_mutex held. 289 * Call with callback_mutex held.
281 */ 290 */
282 291
283static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 292static void guarantee_online_cpus(const struct cpuset *cs,
293 struct cpumask *pmask)
284{ 294{
285 while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) 295 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
286 cs = cs->parent; 296 cs = cs->parent;
287 if (cs) 297 if (cs)
288 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); 298 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
289 else 299 else
290 *pmask = cpu_online_map; 300 cpumask_copy(pmask, cpu_online_mask);
291 BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); 301 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
292} 302}
293 303
294/* 304/*
@@ -364,14 +374,9 @@ void cpuset_update_task_memory_state(void)
364 struct task_struct *tsk = current; 374 struct task_struct *tsk = current;
365 struct cpuset *cs; 375 struct cpuset *cs;
366 376
367 if (task_cs(tsk) == &top_cpuset) { 377 rcu_read_lock();
368 /* Don't need rcu for top_cpuset. It's never freed. */ 378 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 379 rcu_read_unlock();
370 } else {
371 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock();
374 }
375 380
376 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 381 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
377 mutex_lock(&callback_mutex); 382 mutex_lock(&callback_mutex);
@@ -403,12 +408,43 @@ void cpuset_update_task_memory_state(void)
403 408
404static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 409static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
405{ 410{
406 return cpus_subset(p->cpus_allowed, q->cpus_allowed) && 411 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
407 nodes_subset(p->mems_allowed, q->mems_allowed) && 412 nodes_subset(p->mems_allowed, q->mems_allowed) &&
408 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 413 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
409 is_mem_exclusive(p) <= is_mem_exclusive(q); 414 is_mem_exclusive(p) <= is_mem_exclusive(q);
410} 415}
411 416
417/**
418 * alloc_trial_cpuset - allocate a trial cpuset
419 * @cs: the cpuset that the trial cpuset duplicates
420 */
421static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
422{
423 struct cpuset *trial;
424
425 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
426 if (!trial)
427 return NULL;
428
429 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
430 kfree(trial);
431 return NULL;
432 }
433 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
434
435 return trial;
436}
437
438/**
439 * free_trial_cpuset - free the trial cpuset
440 * @trial: the trial cpuset to be freed
441 */
442static void free_trial_cpuset(struct cpuset *trial)
443{
444 free_cpumask_var(trial->cpus_allowed);
445 kfree(trial);
446}
447
412/* 448/*
413 * validate_change() - Used to validate that any proposed cpuset change 449 * validate_change() - Used to validate that any proposed cpuset change
414 * follows the structural rules for cpusets. 450 * follows the structural rules for cpusets.
@@ -458,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
458 c = cgroup_cs(cont); 494 c = cgroup_cs(cont);
459 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 495 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
460 c != cur && 496 c != cur &&
461 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 497 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
462 return -EINVAL; 498 return -EINVAL;
463 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 499 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
464 c != cur && 500 c != cur &&
@@ -468,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
468 504
469 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 505 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
470 if (cgroup_task_count(cur->css.cgroup)) { 506 if (cgroup_task_count(cur->css.cgroup)) {
471 if (cpus_empty(trial->cpus_allowed) || 507 if (cpumask_empty(trial->cpus_allowed) ||
472 nodes_empty(trial->mems_allowed)) { 508 nodes_empty(trial->mems_allowed)) {
473 return -ENOSPC; 509 return -ENOSPC;
474 } 510 }
@@ -483,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
483 */ 519 */
484static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 520static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
485{ 521{
486 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 522 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
487} 523}
488 524
489static void 525static void
@@ -508,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
508 cp = list_first_entry(&q, struct cpuset, stack_list); 544 cp = list_first_entry(&q, struct cpuset, stack_list);
509 list_del(q.next); 545 list_del(q.next);
510 546
511 if (cpus_empty(cp->cpus_allowed)) 547 if (cpumask_empty(cp->cpus_allowed))
512 continue; 548 continue;
513 549
514 if (is_sched_load_balance(cp)) 550 if (is_sched_load_balance(cp))
@@ -575,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
575 * element of the partition (one sched domain) to be passed to 611 * element of the partition (one sched domain) to be passed to
576 * partition_sched_domains(). 612 * partition_sched_domains().
577 */ 613 */
578static int generate_sched_domains(cpumask_t **domains, 614/* FIXME: see the FIXME in partition_sched_domains() */
615static int generate_sched_domains(struct cpumask **domains,
579 struct sched_domain_attr **attributes) 616 struct sched_domain_attr **attributes)
580{ 617{
581 LIST_HEAD(q); /* queue of cpusets to be scanned */ 618 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -583,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
583 struct cpuset **csa; /* array of all cpuset ptrs */ 620 struct cpuset **csa; /* array of all cpuset ptrs */
584 int csn; /* how many cpuset ptrs in csa so far */ 621 int csn; /* how many cpuset ptrs in csa so far */
585 int i, j, k; /* indices for partition finding loops */ 622 int i, j, k; /* indices for partition finding loops */
586 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 623 struct cpumask *doms; /* resulting partition; i.e. sched domains */
587 struct sched_domain_attr *dattr; /* attributes for custom domains */ 624 struct sched_domain_attr *dattr; /* attributes for custom domains */
588 int ndoms = 0; /* number of sched domains in result */ 625 int ndoms = 0; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */ 626 int nslot; /* next empty doms[] struct cpumask slot */
590 627
591 doms = NULL; 628 doms = NULL;
592 dattr = NULL; 629 dattr = NULL;
@@ -594,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
594 631
595 /* Special case for the 99% of systems with one, full, sched domain */ 632 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) { 633 if (is_sched_load_balance(&top_cpuset)) {
597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 634 doms = kmalloc(cpumask_size(), GFP_KERNEL);
598 if (!doms) 635 if (!doms)
599 goto done; 636 goto done;
600 637
@@ -603,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains,
603 *dattr = SD_ATTR_INIT; 640 *dattr = SD_ATTR_INIT;
604 update_domain_attr_tree(dattr, &top_cpuset); 641 update_domain_attr_tree(dattr, &top_cpuset);
605 } 642 }
606 *doms = top_cpuset.cpus_allowed; 643 cpumask_copy(doms, top_cpuset.cpus_allowed);
607 644
608 ndoms = 1; 645 ndoms = 1;
609 goto done; 646 goto done;
@@ -622,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains,
622 cp = list_first_entry(&q, struct cpuset, stack_list); 659 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next); 660 list_del(q.next);
624 661
625 if (cpus_empty(cp->cpus_allowed)) 662 if (cpumask_empty(cp->cpus_allowed))
626 continue; 663 continue;
627 664
628 /* 665 /*
@@ -673,7 +710,7 @@ restart:
673 * Now we know how many domains to create. 710 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 711 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 712 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 713 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
677 if (!doms) 714 if (!doms)
678 goto done; 715 goto done;
679 716
@@ -685,7 +722,7 @@ restart:
685 722
686 for (nslot = 0, i = 0; i < csn; i++) { 723 for (nslot = 0, i = 0; i < csn; i++) {
687 struct cpuset *a = csa[i]; 724 struct cpuset *a = csa[i];
688 cpumask_t *dp; 725 struct cpumask *dp;
689 int apn = a->pn; 726 int apn = a->pn;
690 727
691 if (apn < 0) { 728 if (apn < 0) {
@@ -708,14 +745,14 @@ restart:
708 continue; 745 continue;
709 } 746 }
710 747
711 cpus_clear(*dp); 748 cpumask_clear(dp);
712 if (dattr) 749 if (dattr)
713 *(dattr + nslot) = SD_ATTR_INIT; 750 *(dattr + nslot) = SD_ATTR_INIT;
714 for (j = i; j < csn; j++) { 751 for (j = i; j < csn; j++) {
715 struct cpuset *b = csa[j]; 752 struct cpuset *b = csa[j];
716 753
717 if (apn == b->pn) { 754 if (apn == b->pn) {
718 cpus_or(*dp, *dp, b->cpus_allowed); 755 cpumask_or(dp, dp, b->cpus_allowed);
719 if (dattr) 756 if (dattr)
720 update_domain_attr_tree(dattr + nslot, b); 757 update_domain_attr_tree(dattr + nslot, b);
721 758
@@ -755,7 +792,7 @@ done:
755static void do_rebuild_sched_domains(struct work_struct *unused) 792static void do_rebuild_sched_domains(struct work_struct *unused)
756{ 793{
757 struct sched_domain_attr *attr; 794 struct sched_domain_attr *attr;
758 cpumask_t *doms; 795 struct cpumask *doms;
759 int ndoms; 796 int ndoms;
760 797
761 get_online_cpus(); 798 get_online_cpus();
@@ -824,7 +861,7 @@ void rebuild_sched_domains(void)
824static int cpuset_test_cpumask(struct task_struct *tsk, 861static int cpuset_test_cpumask(struct task_struct *tsk,
825 struct cgroup_scanner *scan) 862 struct cgroup_scanner *scan)
826{ 863{
827 return !cpus_equal(tsk->cpus_allowed, 864 return !cpumask_equal(&tsk->cpus_allowed,
828 (cgroup_cs(scan->cg))->cpus_allowed); 865 (cgroup_cs(scan->cg))->cpus_allowed);
829} 866}
830 867
@@ -842,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
842static void cpuset_change_cpumask(struct task_struct *tsk, 879static void cpuset_change_cpumask(struct task_struct *tsk,
843 struct cgroup_scanner *scan) 880 struct cgroup_scanner *scan)
844{ 881{
845 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 882 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
846} 883}
847 884
848/** 885/**
@@ -874,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
874 * @cs: the cpuset to consider 911 * @cs: the cpuset to consider
875 * @buf: buffer of cpu numbers written to this cpuset 912 * @buf: buffer of cpu numbers written to this cpuset
876 */ 913 */
877static int update_cpumask(struct cpuset *cs, const char *buf) 914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf)
878{ 916{
879 struct ptr_heap heap; 917 struct ptr_heap heap;
880 struct cpuset trialcs;
881 int retval; 918 int retval;
882 int is_load_balanced; 919 int is_load_balanced;
883 920
@@ -885,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
885 if (cs == &top_cpuset) 922 if (cs == &top_cpuset)
886 return -EACCES; 923 return -EACCES;
887 924
888 trialcs = *cs;
889
890 /* 925 /*
891 * An empty cpus_allowed is ok only if the cpuset has no tasks. 926 * An empty cpus_allowed is ok only if the cpuset has no tasks.
892 * Since cpulist_parse() fails on an empty mask, we special case 927 * Since cpulist_parse() fails on an empty mask, we special case
@@ -894,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
894 * with tasks have cpus. 929 * with tasks have cpus.
895 */ 930 */
896 if (!*buf) { 931 if (!*buf) {
897 cpus_clear(trialcs.cpus_allowed); 932 cpumask_clear(trialcs->cpus_allowed);
898 } else { 933 } else {
899 retval = cpulist_parse(buf, trialcs.cpus_allowed); 934 retval = cpulist_parse(buf, trialcs->cpus_allowed);
900 if (retval < 0) 935 if (retval < 0)
901 return retval; 936 return retval;
902 937
903 if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) 938 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
904 return -EINVAL; 939 return -EINVAL;
905 } 940 }
906 retval = validate_change(cs, &trialcs); 941 retval = validate_change(cs, trialcs);
907 if (retval < 0) 942 if (retval < 0)
908 return retval; 943 return retval;
909 944
910 /* Nothing to do if the cpus didn't change */ 945 /* Nothing to do if the cpus didn't change */
911 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 946 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
912 return 0; 947 return 0;
913 948
914 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 949 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
915 if (retval) 950 if (retval)
916 return retval; 951 return retval;
917 952
918 is_load_balanced = is_sched_load_balance(&trialcs); 953 is_load_balanced = is_sched_load_balance(trialcs);
919 954
920 mutex_lock(&callback_mutex); 955 mutex_lock(&callback_mutex);
921 cs->cpus_allowed = trialcs.cpus_allowed; 956 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
922 mutex_unlock(&callback_mutex); 957 mutex_unlock(&callback_mutex);
923 958
924 /* 959 /*
@@ -1006,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
1006 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1041 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1007 1042
1008 fudge = 10; /* spare mmarray[] slots */ 1043 fudge = 10; /* spare mmarray[] slots */
1009 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 1044 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
1010 retval = -ENOMEM; 1045 retval = -ENOMEM;
1011 1046
1012 /* 1047 /*
@@ -1093,9 +1128,9 @@ done:
1093 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1128 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1094 * their mempolicies to the cpusets new mems_allowed. 1129 * their mempolicies to the cpusets new mems_allowed.
1095 */ 1130 */
1096static int update_nodemask(struct cpuset *cs, const char *buf) 1131static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1132 const char *buf)
1097{ 1133{
1098 struct cpuset trialcs;
1099 nodemask_t oldmem; 1134 nodemask_t oldmem;
1100 int retval; 1135 int retval;
1101 1136
@@ -1106,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1106 if (cs == &top_cpuset) 1141 if (cs == &top_cpuset)
1107 return -EACCES; 1142 return -EACCES;
1108 1143
1109 trialcs = *cs;
1110
1111 /* 1144 /*
1112 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1145 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1113 * Since nodelist_parse() fails on an empty mask, we special case 1146 * Since nodelist_parse() fails on an empty mask, we special case
@@ -1115,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1115 * with tasks have memory. 1148 * with tasks have memory.
1116 */ 1149 */
1117 if (!*buf) { 1150 if (!*buf) {
1118 nodes_clear(trialcs.mems_allowed); 1151 nodes_clear(trialcs->mems_allowed);
1119 } else { 1152 } else {
1120 retval = nodelist_parse(buf, trialcs.mems_allowed); 1153 retval = nodelist_parse(buf, trialcs->mems_allowed);
1121 if (retval < 0) 1154 if (retval < 0)
1122 goto done; 1155 goto done;
1123 1156
1124 if (!nodes_subset(trialcs.mems_allowed, 1157 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) 1158 node_states[N_HIGH_MEMORY]))
1126 return -EINVAL; 1159 return -EINVAL;
1127 } 1160 }
1128 oldmem = cs->mems_allowed; 1161 oldmem = cs->mems_allowed;
1129 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 1162 if (nodes_equal(oldmem, trialcs->mems_allowed)) {
1130 retval = 0; /* Too easy - nothing to do */ 1163 retval = 0; /* Too easy - nothing to do */
1131 goto done; 1164 goto done;
1132 } 1165 }
1133 retval = validate_change(cs, &trialcs); 1166 retval = validate_change(cs, trialcs);
1134 if (retval < 0) 1167 if (retval < 0)
1135 goto done; 1168 goto done;
1136 1169
1137 mutex_lock(&callback_mutex); 1170 mutex_lock(&callback_mutex);
1138 cs->mems_allowed = trialcs.mems_allowed; 1171 cs->mems_allowed = trialcs->mems_allowed;
1139 cs->mems_generation = cpuset_mems_generation++; 1172 cs->mems_generation = cpuset_mems_generation++;
1140 mutex_unlock(&callback_mutex); 1173 mutex_unlock(&callback_mutex);
1141 1174
@@ -1156,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1156 1189
1157 if (val != cs->relax_domain_level) { 1190 if (val != cs->relax_domain_level) {
1158 cs->relax_domain_level = val; 1191 cs->relax_domain_level = val;
1159 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1192 if (!cpumask_empty(cs->cpus_allowed) &&
1193 is_sched_load_balance(cs))
1160 async_rebuild_sched_domains(); 1194 async_rebuild_sched_domains();
1161 } 1195 }
1162 1196
@@ -1175,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1175static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1209static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1176 int turning_on) 1210 int turning_on)
1177{ 1211{
1178 struct cpuset trialcs; 1212 struct cpuset *trialcs;
1179 int err; 1213 int err;
1180 int balance_flag_changed; 1214 int balance_flag_changed;
1181 1215
1182 trialcs = *cs; 1216 trialcs = alloc_trial_cpuset(cs);
1217 if (!trialcs)
1218 return -ENOMEM;
1219
1183 if (turning_on) 1220 if (turning_on)
1184 set_bit(bit, &trialcs.flags); 1221 set_bit(bit, &trialcs->flags);
1185 else 1222 else
1186 clear_bit(bit, &trialcs.flags); 1223 clear_bit(bit, &trialcs->flags);
1187 1224
1188 err = validate_change(cs, &trialcs); 1225 err = validate_change(cs, trialcs);
1189 if (err < 0) 1226 if (err < 0)
1190 return err; 1227 goto out;
1191 1228
1192 balance_flag_changed = (is_sched_load_balance(cs) != 1229 balance_flag_changed = (is_sched_load_balance(cs) !=
1193 is_sched_load_balance(&trialcs)); 1230 is_sched_load_balance(trialcs));
1194 1231
1195 mutex_lock(&callback_mutex); 1232 mutex_lock(&callback_mutex);
1196 cs->flags = trialcs.flags; 1233 cs->flags = trialcs->flags;
1197 mutex_unlock(&callback_mutex); 1234 mutex_unlock(&callback_mutex);
1198 1235
1199 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) 1236 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1200 async_rebuild_sched_domains(); 1237 async_rebuild_sched_domains();
1201 1238
1202 return 0; 1239out:
1240 free_trial_cpuset(trialcs);
1241 return err;
1203} 1242}
1204 1243
1205/* 1244/*
@@ -1300,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp)
1300 return val; 1339 return val;
1301} 1340}
1302 1341
1342/* Protected by cgroup_lock */
1343static cpumask_var_t cpus_attach;
1344
1303/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1345/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1304static int cpuset_can_attach(struct cgroup_subsys *ss, 1346static int cpuset_can_attach(struct cgroup_subsys *ss,
1305 struct cgroup *cont, struct task_struct *tsk) 1347 struct cgroup *cont, struct task_struct *tsk)
1306{ 1348{
1307 struct cpuset *cs = cgroup_cs(cont); 1349 struct cpuset *cs = cgroup_cs(cont);
1350 int ret = 0;
1308 1351
1309 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1352 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1310 return -ENOSPC; 1353 return -ENOSPC;
1311 if (tsk->flags & PF_THREAD_BOUND) {
1312 cpumask_t mask;
1313 1354
1355 if (tsk->flags & PF_THREAD_BOUND) {
1314 mutex_lock(&callback_mutex); 1356 mutex_lock(&callback_mutex);
1315 mask = cs->cpus_allowed; 1357 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
1358 ret = -EINVAL;
1316 mutex_unlock(&callback_mutex); 1359 mutex_unlock(&callback_mutex);
1317 if (!cpus_equal(tsk->cpus_allowed, mask))
1318 return -EINVAL;
1319 } 1360 }
1320 1361
1321 return security_task_setscheduler(tsk, 0, NULL); 1362 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
1322} 1363}
1323 1364
1324static void cpuset_attach(struct cgroup_subsys *ss, 1365static void cpuset_attach(struct cgroup_subsys *ss,
1325 struct cgroup *cont, struct cgroup *oldcont, 1366 struct cgroup *cont, struct cgroup *oldcont,
1326 struct task_struct *tsk) 1367 struct task_struct *tsk)
1327{ 1368{
1328 cpumask_t cpus;
1329 nodemask_t from, to; 1369 nodemask_t from, to;
1330 struct mm_struct *mm; 1370 struct mm_struct *mm;
1331 struct cpuset *cs = cgroup_cs(cont); 1371 struct cpuset *cs = cgroup_cs(cont);
1332 struct cpuset *oldcs = cgroup_cs(oldcont); 1372 struct cpuset *oldcs = cgroup_cs(oldcont);
1333 int err; 1373 int err;
1334 1374
1335 mutex_lock(&callback_mutex); 1375 if (cs == &top_cpuset) {
1336 guarantee_online_cpus(cs, &cpus); 1376 cpumask_copy(cpus_attach, cpu_possible_mask);
1337 err = set_cpus_allowed_ptr(tsk, &cpus); 1377 } else {
1338 mutex_unlock(&callback_mutex); 1378 mutex_lock(&callback_mutex);
1379 guarantee_online_cpus(cs, cpus_attach);
1380 mutex_unlock(&callback_mutex);
1381 }
1382 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1339 if (err) 1383 if (err)
1340 return; 1384 return;
1341 1385
@@ -1348,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1348 cpuset_migrate_mm(mm, &from, &to); 1392 cpuset_migrate_mm(mm, &from, &to);
1349 mmput(mm); 1393 mmput(mm);
1350 } 1394 }
1351
1352} 1395}
1353 1396
1354/* The various types of files and directories in a cpuset file system */ 1397/* The various types of files and directories in a cpuset file system */
@@ -1443,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1443 const char *buf) 1486 const char *buf)
1444{ 1487{
1445 int retval = 0; 1488 int retval = 0;
1489 struct cpuset *cs = cgroup_cs(cgrp);
1490 struct cpuset *trialcs;
1446 1491
1447 if (!cgroup_lock_live_group(cgrp)) 1492 if (!cgroup_lock_live_group(cgrp))
1448 return -ENODEV; 1493 return -ENODEV;
1449 1494
1495 trialcs = alloc_trial_cpuset(cs);
1496 if (!trialcs)
1497 return -ENOMEM;
1498
1450 switch (cft->private) { 1499 switch (cft->private) {
1451 case FILE_CPULIST: 1500 case FILE_CPULIST:
1452 retval = update_cpumask(cgroup_cs(cgrp), buf); 1501 retval = update_cpumask(cs, trialcs, buf);
1453 break; 1502 break;
1454 case FILE_MEMLIST: 1503 case FILE_MEMLIST:
1455 retval = update_nodemask(cgroup_cs(cgrp), buf); 1504 retval = update_nodemask(cs, trialcs, buf);
1456 break; 1505 break;
1457 default: 1506 default:
1458 retval = -EINVAL; 1507 retval = -EINVAL;
1459 break; 1508 break;
1460 } 1509 }
1510
1511 free_trial_cpuset(trialcs);
1461 cgroup_unlock(); 1512 cgroup_unlock();
1462 return retval; 1513 return retval;
1463} 1514}
@@ -1476,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1476 1527
1477static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1528static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1478{ 1529{
1479 cpumask_t mask; 1530 int ret;
1480 1531
1481 mutex_lock(&callback_mutex); 1532 mutex_lock(&callback_mutex);
1482 mask = cs->cpus_allowed; 1533 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1483 mutex_unlock(&callback_mutex); 1534 mutex_unlock(&callback_mutex);
1484 1535
1485 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1536 return ret;
1486} 1537}
1487 1538
1488static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1539static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1718,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1718 parent_cs = cgroup_cs(parent); 1769 parent_cs = cgroup_cs(parent);
1719 1770
1720 cs->mems_allowed = parent_cs->mems_allowed; 1771 cs->mems_allowed = parent_cs->mems_allowed;
1721 cs->cpus_allowed = parent_cs->cpus_allowed; 1772 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1722 return; 1773 return;
1723} 1774}
1724 1775
@@ -1744,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create(
1744 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1795 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1745 if (!cs) 1796 if (!cs)
1746 return ERR_PTR(-ENOMEM); 1797 return ERR_PTR(-ENOMEM);
1798 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1799 kfree(cs);
1800 return ERR_PTR(-ENOMEM);
1801 }
1747 1802
1748 cpuset_update_task_memory_state(); 1803 cpuset_update_task_memory_state();
1749 cs->flags = 0; 1804 cs->flags = 0;
@@ -1752,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create(
1752 if (is_spread_slab(parent)) 1807 if (is_spread_slab(parent))
1753 set_bit(CS_SPREAD_SLAB, &cs->flags); 1808 set_bit(CS_SPREAD_SLAB, &cs->flags);
1754 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1809 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1755 cpus_clear(cs->cpus_allowed); 1810 cpumask_clear(cs->cpus_allowed);
1756 nodes_clear(cs->mems_allowed); 1811 nodes_clear(cs->mems_allowed);
1757 cs->mems_generation = cpuset_mems_generation++; 1812 cs->mems_generation = cpuset_mems_generation++;
1758 fmeter_init(&cs->fmeter); 1813 fmeter_init(&cs->fmeter);
@@ -1779,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1779 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1834 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1780 1835
1781 number_of_cpusets--; 1836 number_of_cpusets--;
1837 free_cpumask_var(cs->cpus_allowed);
1782 kfree(cs); 1838 kfree(cs);
1783} 1839}
1784 1840
@@ -1802,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = {
1802 1858
1803int __init cpuset_init_early(void) 1859int __init cpuset_init_early(void)
1804{ 1860{
1861 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1862
1805 top_cpuset.mems_generation = cpuset_mems_generation++; 1863 top_cpuset.mems_generation = cpuset_mems_generation++;
1806 return 0; 1864 return 0;
1807} 1865}
@@ -1817,7 +1875,7 @@ int __init cpuset_init(void)
1817{ 1875{
1818 int err = 0; 1876 int err = 0;
1819 1877
1820 cpus_setall(top_cpuset.cpus_allowed); 1878 cpumask_setall(top_cpuset.cpus_allowed);
1821 nodes_setall(top_cpuset.mems_allowed); 1879 nodes_setall(top_cpuset.mems_allowed);
1822 1880
1823 fmeter_init(&top_cpuset.fmeter); 1881 fmeter_init(&top_cpuset.fmeter);
@@ -1829,6 +1887,9 @@ int __init cpuset_init(void)
1829 if (err < 0) 1887 if (err < 0)
1830 return err; 1888 return err;
1831 1889
1890 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1891 BUG();
1892
1832 number_of_cpusets = 1; 1893 number_of_cpusets = 1;
1833 return 0; 1894 return 0;
1834} 1895}
@@ -1903,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1903 * has online cpus, so can't be empty). 1964 * has online cpus, so can't be empty).
1904 */ 1965 */
1905 parent = cs->parent; 1966 parent = cs->parent;
1906 while (cpus_empty(parent->cpus_allowed) || 1967 while (cpumask_empty(parent->cpus_allowed) ||
1907 nodes_empty(parent->mems_allowed)) 1968 nodes_empty(parent->mems_allowed))
1908 parent = parent->parent; 1969 parent = parent->parent;
1909 1970
@@ -1944,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1944 } 2005 }
1945 2006
1946 /* Continue past cpusets with all cpus, mems online */ 2007 /* Continue past cpusets with all cpus, mems online */
1947 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 2008 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
1948 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2009 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1949 continue; 2010 continue;
1950 2011
@@ -1952,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1952 2013
1953 /* Remove offline cpus and mems from this cpuset. */ 2014 /* Remove offline cpus and mems from this cpuset. */
1954 mutex_lock(&callback_mutex); 2015 mutex_lock(&callback_mutex);
1955 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 2016 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2017 cpu_online_mask);
1956 nodes_and(cp->mems_allowed, cp->mems_allowed, 2018 nodes_and(cp->mems_allowed, cp->mems_allowed,
1957 node_states[N_HIGH_MEMORY]); 2019 node_states[N_HIGH_MEMORY]);
1958 mutex_unlock(&callback_mutex); 2020 mutex_unlock(&callback_mutex);
1959 2021
1960 /* Move tasks from the empty cpuset to a parent */ 2022 /* Move tasks from the empty cpuset to a parent */
1961 if (cpus_empty(cp->cpus_allowed) || 2023 if (cpumask_empty(cp->cpus_allowed) ||
1962 nodes_empty(cp->mems_allowed)) 2024 nodes_empty(cp->mems_allowed))
1963 remove_tasks_in_empty_cpuset(cp); 2025 remove_tasks_in_empty_cpuset(cp);
1964 else { 2026 else {
@@ -1984,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1984 unsigned long phase, void *unused_cpu) 2046 unsigned long phase, void *unused_cpu)
1985{ 2047{
1986 struct sched_domain_attr *attr; 2048 struct sched_domain_attr *attr;
1987 cpumask_t *doms; 2049 struct cpumask *doms;
1988 int ndoms; 2050 int ndoms;
1989 2051
1990 switch (phase) { 2052 switch (phase) {
@@ -1999,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1999 } 2061 }
2000 2062
2001 cgroup_lock(); 2063 cgroup_lock();
2002 top_cpuset.cpus_allowed = cpu_online_map; 2064 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2003 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2004 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2005 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2044,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2044 2106
2045void __init cpuset_init_smp(void) 2107void __init cpuset_init_smp(void)
2046{ 2108{
2047 top_cpuset.cpus_allowed = cpu_online_map; 2109 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2110 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2049 2111
2050 hotcpu_notifier(cpuset_track_online_cpus, 0); 2112 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2054,15 +2116,15 @@ void __init cpuset_init_smp(void)
2054/** 2116/**
2055 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2117 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2056 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2118 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2057 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2119 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2058 * 2120 *
2059 * Description: Returns the cpumask_t cpus_allowed of the cpuset 2121 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2060 * attached to the specified @tsk. Guaranteed to return some non-empty 2122 * attached to the specified @tsk. Guaranteed to return some non-empty
2061 * subset of cpu_online_map, even if this means going outside the 2123 * subset of cpu_online_map, even if this means going outside the
2062 * tasks cpuset. 2124 * tasks cpuset.
2063 **/ 2125 **/
2064 2126
2065void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) 2127void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2066{ 2128{
2067 mutex_lock(&callback_mutex); 2129 mutex_lock(&callback_mutex);
2068 cpuset_cpus_allowed_locked(tsk, pmask); 2130 cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2073,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
2073 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 2135 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2074 * Must be called with callback_mutex held. 2136 * Must be called with callback_mutex held.
2075 **/ 2137 **/
2076void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) 2138void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2077{ 2139{
2078 task_lock(tsk); 2140 task_lock(tsk);
2079 guarantee_online_cpus(task_cs(tsk), pmask); 2141 guarantee_online_cpus(task_cs(tsk), pmask);
@@ -2356,6 +2418,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2356 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2418 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2357} 2419}
2358 2420
2421/**
2422 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2423 * @task: pointer to task_struct of some task.
2424 *
2425 * Description: Prints @task's name, cpuset name, and cached copy of its
2426 * mems_allowed to the kernel log. Must hold task_lock(task) to allow
2427 * dereferencing task_cs(task).
2428 */
2429void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2430{
2431 struct dentry *dentry;
2432
2433 dentry = task_cs(tsk)->css.cgroup->dentry;
2434 spin_lock(&cpuset_buffer_lock);
2435 snprintf(cpuset_name, CPUSET_NAME_LEN,
2436 dentry ? (const char *)dentry->d_name.name : "/");
2437 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2438 tsk->mems_allowed);
2439 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2440 tsk->comm, cpuset_name, cpuset_nodelist);
2441 spin_unlock(&cpuset_buffer_lock);
2442}
2443
2359/* 2444/*
2360 * Collection of memory_pressure is suppressed unless 2445 * Collection of memory_pressure is suppressed unless
2361 * this flag is enabled by writing "1" to the special 2446 * this flag is enabled by writing "1" to the special
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
new file mode 100644
index 000000000000..2dc4fc2d0bf1
--- /dev/null
+++ b/kernel/cred-internals.h
@@ -0,0 +1,21 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
new file mode 100644
index 000000000000..ff7bc071991c
--- /dev/null
+++ b/kernel/cred.c
@@ -0,0 +1,588 @@
1/* Task credentials management - see Documentation/credentials.txt
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11#include <linux/module.h>
12#include <linux/cred.h>
13#include <linux/sched.h>
14#include <linux/key.h>
15#include <linux/keyctl.h>
16#include <linux/init_task.h>
17#include <linux/security.h>
18#include <linux/cn_proc.h>
19#include "cred-internals.h"
20
21static struct kmem_cache *cred_jar;
22
23/*
24 * The common credentials for the initial task's thread group
25 */
26#ifdef CONFIG_KEYS
27static struct thread_group_cred init_tgcred = {
28 .usage = ATOMIC_INIT(2),
29 .tgid = 0,
30 .lock = SPIN_LOCK_UNLOCKED,
31};
32#endif
33
34/*
35 * The initial credentials for the initial task
36 */
37struct cred init_cred = {
38 .usage = ATOMIC_INIT(4),
39 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET,
42 .cap_effective = CAP_INIT_EFF_SET,
43 .cap_bset = CAP_INIT_BSET,
44 .user = INIT_USER,
45 .group_info = &init_groups,
46#ifdef CONFIG_KEYS
47 .tgcred = &init_tgcred,
48#endif
49};
50
51/*
52 * Dispose of the shared task group credentials
53 */
54#ifdef CONFIG_KEYS
55static void release_tgcred_rcu(struct rcu_head *rcu)
56{
57 struct thread_group_cred *tgcred =
58 container_of(rcu, struct thread_group_cred, rcu);
59
60 BUG_ON(atomic_read(&tgcred->usage) != 0);
61
62 key_put(tgcred->session_keyring);
63 key_put(tgcred->process_keyring);
64 kfree(tgcred);
65}
66#endif
67
68/*
69 * Release a set of thread group credentials.
70 */
71static void release_tgcred(struct cred *cred)
72{
73#ifdef CONFIG_KEYS
74 struct thread_group_cred *tgcred = cred->tgcred;
75
76 if (atomic_dec_and_test(&tgcred->usage))
77 call_rcu(&tgcred->rcu, release_tgcred_rcu);
78#endif
79}
80
81/*
82 * The RCU callback to actually dispose of a set of credentials
83 */
84static void put_cred_rcu(struct rcu_head *rcu)
85{
86 struct cred *cred = container_of(rcu, struct cred, rcu);
87
88 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage));
91
92 security_cred_free(cred);
93 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth);
95 release_tgcred(cred);
96 put_group_info(cred->group_info);
97 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred);
99}
100
101/**
102 * __put_cred - Destroy a set of credentials
103 * @cred: The record to release
104 *
105 * Destroy a set of credentials on which no references remain.
106 */
107void __put_cred(struct cred *cred)
108{
109 BUG_ON(atomic_read(&cred->usage) != 0);
110
111 call_rcu(&cred->rcu, put_cred_rcu);
112}
113EXPORT_SYMBOL(__put_cred);
114
115/**
116 * prepare_creds - Prepare a new set of credentials for modification
117 *
118 * Prepare a new set of task credentials for modification. A task's creds
119 * shouldn't generally be modified directly, therefore this function is used to
120 * prepare a new copy, which the caller then modifies and then commits by
121 * calling commit_creds().
122 *
123 * Preparation involves making a copy of the objective creds for modification.
124 *
125 * Returns a pointer to the new creds-to-be if successful, NULL otherwise.
126 *
127 * Call commit_creds() or abort_creds() to clean up.
128 */
129struct cred *prepare_creds(void)
130{
131 struct task_struct *task = current;
132 const struct cred *old;
133 struct cred *new;
134
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1);
136
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new)
139 return NULL;
140
141 old = task->cred;
142 memcpy(new, old, sizeof(struct cred));
143
144 atomic_set(&new->usage, 1);
145 get_group_info(new->group_info);
146 get_uid(new->user);
147
148#ifdef CONFIG_KEYS
149 key_get(new->thread_keyring);
150 key_get(new->request_key_auth);
151 atomic_inc(&new->tgcred->usage);
152#endif
153
154#ifdef CONFIG_SECURITY
155 new->security = NULL;
156#endif
157
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error;
160 return new;
161
162error:
163 abort_creds(new);
164 return NULL;
165}
166EXPORT_SYMBOL(prepare_creds);
167
168/*
169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex
171 */
172struct cred *prepare_exec_creds(void)
173{
174 struct thread_group_cred *tgcred = NULL;
175 struct cred *new;
176
177#ifdef CONFIG_KEYS
178 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
179 if (!tgcred)
180 return NULL;
181#endif
182
183 new = prepare_creds();
184 if (!new) {
185 kfree(tgcred);
186 return new;
187 }
188
189#ifdef CONFIG_KEYS
190 /* newly exec'd tasks don't get a thread keyring */
191 key_put(new->thread_keyring);
192 new->thread_keyring = NULL;
193
194 /* create a new per-thread-group creds for all this set of threads to
195 * share */
196 memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
197
198 atomic_set(&tgcred->usage, 1);
199 spin_lock_init(&tgcred->lock);
200
201 /* inherit the session keyring; new process keyring */
202 key_get(tgcred->session_keyring);
203 tgcred->process_keyring = NULL;
204
205 release_tgcred(new);
206 new->tgcred = tgcred;
207#endif
208
209 return new;
210}
211
212/*
213 * prepare new credentials for the usermode helper dispatcher
214 */
215struct cred *prepare_usermodehelper_creds(void)
216{
217#ifdef CONFIG_KEYS
218 struct thread_group_cred *tgcred = NULL;
219#endif
220 struct cred *new;
221
222#ifdef CONFIG_KEYS
223 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
224 if (!tgcred)
225 return NULL;
226#endif
227
228 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
229 if (!new)
230 return NULL;
231
232 memcpy(new, &init_cred, sizeof(struct cred));
233
234 atomic_set(&new->usage, 1);
235 get_group_info(new->group_info);
236 get_uid(new->user);
237
238#ifdef CONFIG_KEYS
239 new->thread_keyring = NULL;
240 new->request_key_auth = NULL;
241 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
242
243 atomic_set(&tgcred->usage, 1);
244 spin_lock_init(&tgcred->lock);
245 new->tgcred = tgcred;
246#endif
247
248#ifdef CONFIG_SECURITY
249 new->security = NULL;
250#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error;
253
254 BUG_ON(atomic_read(&new->usage) != 1);
255 return new;
256
257error:
258 put_cred(new);
259 return NULL;
260}
261
262/*
263 * Copy credentials for the new process created by fork()
264 *
265 * We share if we can, but under some circumstances we have to generate a new
266 * set.
267 *
268 * The new process gets the current process's subjective credentials as its
269 * objective and subjective credentials
270 */
271int copy_creds(struct task_struct *p, unsigned long clone_flags)
272{
273#ifdef CONFIG_KEYS
274 struct thread_group_cred *tgcred;
275#endif
276 struct cred *new;
277 int ret;
278
279 mutex_init(&p->cred_exec_mutex);
280
281 if (
282#ifdef CONFIG_KEYS
283 !p->cred->thread_keyring &&
284#endif
285 clone_flags & CLONE_THREAD
286 ) {
287 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred);
289 atomic_inc(&p->cred->user->processes);
290 return 0;
291 }
292
293 new = prepare_creds();
294 if (!new)
295 return -ENOMEM;
296
297 if (clone_flags & CLONE_NEWUSER) {
298 ret = create_user_ns(new);
299 if (ret < 0)
300 goto error_put;
301 }
302
303#ifdef CONFIG_KEYS
304 /* new threads get their own thread keyrings if their parent already
305 * had one */
306 if (new->thread_keyring) {
307 key_put(new->thread_keyring);
308 new->thread_keyring = NULL;
309 if (clone_flags & CLONE_THREAD)
310 install_thread_keyring_to_cred(new);
311 }
312
313 /* we share the process and session keyrings between all the threads in
314 * a process - this is slightly icky as we violate COW credentials a
315 * bit */
316 if (!(clone_flags & CLONE_THREAD)) {
317 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
318 if (!tgcred) {
319 ret = -ENOMEM;
320 goto error_put;
321 }
322 atomic_set(&tgcred->usage, 1);
323 spin_lock_init(&tgcred->lock);
324 tgcred->process_keyring = NULL;
325 tgcred->session_keyring = key_get(new->tgcred->session_keyring);
326
327 release_tgcred(new);
328 new->tgcred = tgcred;
329 }
330#endif
331
332 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new);
334 return 0;
335
336error_put:
337 put_cred(new);
338 return ret;
339}
340
341/**
342 * commit_creds - Install new credentials upon the current task
343 * @new: The credentials to be assigned
344 *
345 * Install a new set of credentials to the current task, using RCU to replace
346 * the old set. Both the objective and the subjective credentials pointers are
347 * updated. This function may not be called if the subjective credentials are
348 * in an overridden state.
349 *
350 * This function eats the caller's reference to the new credentials.
351 *
352 * Always returns 0 thus allowing this function to be tail-called at the end
353 * of, say, sys_setgid().
354 */
355int commit_creds(struct cred *new)
356{
357 struct task_struct *task = current;
358 const struct cred *old;
359
360 BUG_ON(task->cred != task->real_cred);
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2);
362 BUG_ON(atomic_read(&new->usage) < 1);
363
364 old = task->real_cred;
365 security_commit_creds(new, old);
366
367 get_cred(new); /* we will require a ref for the subj creds too */
368
369 /* dumpability changes */
370 if (old->euid != new->euid ||
371 old->egid != new->egid ||
372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0;
377 smp_wmb();
378 }
379
380 /* alter the thread keyring */
381 if (new->fsuid != old->fsuid)
382 key_fsuid_changed(task);
383 if (new->fsgid != old->fsgid)
384 key_fsgid_changed(task);
385
386 /* do it
387 * - What if a process setreuid()'s and this brings the
388 * new uid over his NPROC rlimit? We can check this now
389 * cheaply with the new uid cache, so if it matters
390 * we should be checking for it. -DaveM
391 */
392 if (new->user != old->user)
393 atomic_inc(&new->user->processes);
394 rcu_assign_pointer(task->real_cred, new);
395 rcu_assign_pointer(task->cred, new);
396 if (new->user != old->user)
397 atomic_dec(&old->user->processes);
398
399 sched_switch_user(task);
400
401 /* send notifications */
402 if (new->uid != old->uid ||
403 new->euid != old->euid ||
404 new->suid != old->suid ||
405 new->fsuid != old->fsuid)
406 proc_id_connector(task, PROC_EVENT_UID);
407
408 if (new->gid != old->gid ||
409 new->egid != old->egid ||
410 new->sgid != old->sgid ||
411 new->fsgid != old->fsgid)
412 proc_id_connector(task, PROC_EVENT_GID);
413
414 /* release the old obj and subj refs both */
415 put_cred(old);
416 put_cred(old);
417 return 0;
418}
419EXPORT_SYMBOL(commit_creds);
420
421/**
422 * abort_creds - Discard a set of credentials and unlock the current task
423 * @new: The credentials that were going to be applied
424 *
425 * Discard a set of credentials that were under construction and unlock the
426 * current task.
427 */
428void abort_creds(struct cred *new)
429{
430 BUG_ON(atomic_read(&new->usage) < 1);
431 put_cred(new);
432}
433EXPORT_SYMBOL(abort_creds);
434
435/**
436 * override_creds - Override the current process's subjective credentials
437 * @new: The credentials to be assigned
438 *
439 * Install a set of temporary override subjective credentials on the current
440 * process, returning the old set for later reversion.
441 */
442const struct cred *override_creds(const struct cred *new)
443{
444 const struct cred *old = current->cred;
445
446 rcu_assign_pointer(current->cred, get_cred(new));
447 return old;
448}
449EXPORT_SYMBOL(override_creds);
450
451/**
452 * revert_creds - Revert a temporary subjective credentials override
453 * @old: The credentials to be restored
454 *
455 * Revert a temporary set of override subjective credentials to an old set,
456 * discarding the override set.
457 */
458void revert_creds(const struct cred *old)
459{
460 const struct cred *override = current->cred;
461
462 rcu_assign_pointer(current->cred, old);
463 put_cred(override);
464}
465EXPORT_SYMBOL(revert_creds);
466
467/*
468 * initialise the credentials stuff
469 */
470void __init cred_init(void)
471{
472 /* allocate a slab in which we can store credentials */
473 cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
474 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
475}
476
477/**
478 * prepare_kernel_cred - Prepare a set of credentials for a kernel service
479 * @daemon: A userspace daemon to be used as a reference
480 *
481 * Prepare a set of credentials for a kernel service. This can then be used to
482 * override a task's own credentials so that work can be done on behalf of that
483 * task that requires a different subjective context.
484 *
485 * @daemon is used to provide a base for the security record, but can be NULL.
486 * If @daemon is supplied, then the security data will be derived from that;
487 * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
488 *
489 * The caller may change these controls afterwards if desired.
490 *
491 * Returns the new credentials or NULL if out of memory.
492 *
493 * Does not take, and does not return holding current->cred_replace_mutex.
494 */
495struct cred *prepare_kernel_cred(struct task_struct *daemon)
496{
497 const struct cred *old;
498 struct cred *new;
499
500 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
501 if (!new)
502 return NULL;
503
504 if (daemon)
505 old = get_task_cred(daemon);
506 else
507 old = get_cred(&init_cred);
508
509 get_uid(new->user);
510 get_group_info(new->group_info);
511
512#ifdef CONFIG_KEYS
513 atomic_inc(&init_tgcred.usage);
514 new->tgcred = &init_tgcred;
515 new->request_key_auth = NULL;
516 new->thread_keyring = NULL;
517 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
518#endif
519
520#ifdef CONFIG_SECURITY
521 new->security = NULL;
522#endif
523 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
524 goto error;
525
526 atomic_set(&new->usage, 1);
527 put_cred(old);
528 return new;
529
530error:
531 put_cred(new);
532 return NULL;
533}
534EXPORT_SYMBOL(prepare_kernel_cred);
535
536/**
537 * set_security_override - Set the security ID in a set of credentials
538 * @new: The credentials to alter
539 * @secid: The LSM security ID to set
540 *
541 * Set the LSM security ID in a set of credentials so that the subjective
542 * security is overridden when an alternative set of credentials is used.
543 */
544int set_security_override(struct cred *new, u32 secid)
545{
546 return security_kernel_act_as(new, secid);
547}
548EXPORT_SYMBOL(set_security_override);
549
550/**
551 * set_security_override_from_ctx - Set the security ID in a set of credentials
552 * @new: The credentials to alter
553 * @secctx: The LSM security context to generate the security ID from.
554 *
555 * Set the LSM security ID in a set of credentials so that the subjective
556 * security is overridden when an alternative set of credentials is used. The
557 * security ID is specified in string form as a security context to be
558 * interpreted by the LSM.
559 */
560int set_security_override_from_ctx(struct cred *new, const char *secctx)
561{
562 u32 secid;
563 int ret;
564
565 ret = security_secctx_to_secid(secctx, strlen(secctx), &secid);
566 if (ret < 0)
567 return ret;
568
569 return set_security_override(new, secid);
570}
571EXPORT_SYMBOL(set_security_override_from_ctx);
572
573/**
574 * set_create_files_as - Set the LSM file create context in a set of credentials
575 * @new: The credentials to alter
576 * @inode: The inode to take the context from
577 *
578 * Change the LSM file creation context in a set of credentials to be the same
579 * as the object context of the specified inode, so that the new inodes have
580 * the same MAC context as that inode.
581 */
582int set_create_files_as(struct cred *new, struct inode *inode)
583{
584 new->fsuid = inode->i_uid;
585 new->fsgid = inode->i_gid;
586 return security_kernel_create_files_as(new, inode);
587}
588EXPORT_SYMBOL(set_create_files_as);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b3179dad71be..abb6e17505e2 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
127 */ 127 */
128 t1 = tsk->sched_info.pcount; 128 t1 = tsk->sched_info.pcount;
129 t2 = tsk->sched_info.run_delay; 129 t2 = tsk->sched_info.run_delay;
130 t3 = tsk->sched_info.cpu_time; 130 t3 = tsk->se.sum_exec_runtime;
131 131
132 d->cpu_count += t1; 132 d->cpu_count += t1;
133 133
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index f013a0c2e111..038707404b76 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
109int dma_alloc_from_coherent(struct device *dev, ssize_t size, 109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret) 110 dma_addr_t *dma_handle, void **ret)
111{ 111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; 112 struct dma_coherent_mem *mem;
113 int order = get_order(size); 113 int order = get_order(size);
114 int pageno;
114 115
115 if (mem) { 116 if (!dev)
116 int page = bitmap_find_free_region(mem->bitmap, mem->size, 117 return 0;
117 order); 118 mem = dev->dma_mem;
118 if (page >= 0) { 119 if (!mem)
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT); 120 return 0;
120 *ret = mem->virt_base + (page << PAGE_SHIFT); 121 if (unlikely(size > mem->size))
121 memset(*ret, 0, size); 122 return 0;
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) 123
123 *ret = NULL; 124 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
125 if (pageno >= 0) {
126 /*
127 * Memory was found in the per-device arena.
128 */
129 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
130 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
131 memset(*ret, 0, size);
132 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
133 /*
134 * The per-device arena is exhausted and we are not
135 * permitted to fall back to generic memory.
136 */
137 *ret = NULL;
138 } else {
139 /*
140 * The per-device arena is exhausted and we are
141 * permitted to fall back to generic memory.
142 */
143 return 0;
124 } 144 }
125 return (mem != NULL); 145 return 1;
126} 146}
127EXPORT_SYMBOL(dma_alloc_from_coherent); 147EXPORT_SYMBOL(dma_alloc_from_coherent);
128 148
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..c7740fa3252c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,12 +46,18 @@
46#include <linux/blkdev.h> 46#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 47#include <linux/task_io_accounting_ops.h>
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/init_task.h>
49#include <trace/sched.h> 50#include <trace/sched.h>
50 51
51#include <asm/uaccess.h> 52#include <asm/uaccess.h>
52#include <asm/unistd.h> 53#include <asm/unistd.h>
53#include <asm/pgtable.h> 54#include <asm/pgtable.h>
54#include <asm/mmu_context.h> 55#include <asm/mmu_context.h>
56#include "cred-internals.h"
57
58DEFINE_TRACE(sched_process_free);
59DEFINE_TRACE(sched_process_exit);
60DEFINE_TRACE(sched_process_wait);
55 61
56static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct * tsk);
57 63
@@ -164,7 +170,10 @@ void release_task(struct task_struct * p)
164 int zap_leader; 170 int zap_leader;
165repeat: 171repeat:
166 tracehook_prepare_release_task(p); 172 tracehook_prepare_release_task(p);
167 atomic_dec(&p->user->processes); 173 /* don't need to get the RCU readlock here - the process is dead and
174 * can't be modifying its own credentials */
175 atomic_dec(&__task_cred(p)->user->processes);
176
168 proc_flush_task(p); 177 proc_flush_task(p);
169 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
170 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
@@ -339,12 +348,12 @@ static void reparent_to_kthreadd(void)
339 /* cpus_allowed? */ 348 /* cpus_allowed? */
340 /* rt_priority? */ 349 /* rt_priority? */
341 /* signals? */ 350 /* signals? */
342 security_task_reparent_to_init(current);
343 memcpy(current->signal->rlim, init_task.signal->rlim, 351 memcpy(current->signal->rlim, init_task.signal->rlim,
344 sizeof(current->signal->rlim)); 352 sizeof(current->signal->rlim));
345 atomic_inc(&(INIT_USER->__count)); 353
354 atomic_inc(&init_cred.usage);
355 commit_creds(&init_cred);
346 write_unlock_irq(&tasklist_lock); 356 write_unlock_irq(&tasklist_lock);
347 switch_uid(INIT_USER);
348} 357}
349 358
350void __set_special_pids(struct pid *pid) 359void __set_special_pids(struct pid *pid)
@@ -633,35 +642,31 @@ retry:
633 /* 642 /*
634 * We found no owner yet mm_users > 1: this implies that we are 643 * We found no owner yet mm_users > 1: this implies that we are
635 * most likely racing with swapoff (try_to_unuse()) or /proc or 644 * most likely racing with swapoff (try_to_unuse()) or /proc or
636 * ptrace or page migration (get_task_mm()). Mark owner as NULL, 645 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
637 * so that subsystems can understand the callback and take action.
638 */ 646 */
639 down_write(&mm->mmap_sem);
640 cgroup_mm_owner_callbacks(mm->owner, NULL);
641 mm->owner = NULL; 647 mm->owner = NULL;
642 up_write(&mm->mmap_sem);
643 return; 648 return;
644 649
645assign_new_owner: 650assign_new_owner:
646 BUG_ON(c == p); 651 BUG_ON(c == p);
647 get_task_struct(c); 652 get_task_struct(c);
648 read_unlock(&tasklist_lock);
649 down_write(&mm->mmap_sem);
650 /* 653 /*
651 * The task_lock protects c->mm from changing. 654 * The task_lock protects c->mm from changing.
652 * We always want mm->owner->mm == mm 655 * We always want mm->owner->mm == mm
653 */ 656 */
654 task_lock(c); 657 task_lock(c);
658 /*
659 * Delay read_unlock() till we have the task_lock()
660 * to ensure that c does not slip away underneath us
661 */
662 read_unlock(&tasklist_lock);
655 if (c->mm != mm) { 663 if (c->mm != mm) {
656 task_unlock(c); 664 task_unlock(c);
657 up_write(&mm->mmap_sem);
658 put_task_struct(c); 665 put_task_struct(c);
659 goto retry; 666 goto retry;
660 } 667 }
661 cgroup_mm_owner_callbacks(mm->owner, c);
662 mm->owner = c; 668 mm->owner = c;
663 task_unlock(c); 669 task_unlock(c);
664 up_write(&mm->mmap_sem);
665 put_task_struct(c); 670 put_task_struct(c);
666} 671}
667#endif /* CONFIG_MM_OWNER */ 672#endif /* CONFIG_MM_OWNER */
@@ -1028,8 +1033,6 @@ NORET_TYPE void do_exit(long code)
1028 * task into the wait for ever nirwana as well. 1033 * task into the wait for ever nirwana as well.
1029 */ 1034 */
1030 tsk->flags |= PF_EXITPIDONE; 1035 tsk->flags |= PF_EXITPIDONE;
1031 if (tsk->io_context)
1032 exit_io_context();
1033 set_current_state(TASK_UNINTERRUPTIBLE); 1036 set_current_state(TASK_UNINTERRUPTIBLE);
1034 schedule(); 1037 schedule();
1035 } 1038 }
@@ -1048,10 +1051,7 @@ NORET_TYPE void do_exit(long code)
1048 preempt_count()); 1051 preempt_count());
1049 1052
1050 acct_update_integrals(tsk); 1053 acct_update_integrals(tsk);
1051 if (tsk->mm) { 1054
1052 update_hiwater_rss(tsk->mm);
1053 update_hiwater_vm(tsk->mm);
1054 }
1055 group_dead = atomic_dec_and_test(&tsk->signal->live); 1055 group_dead = atomic_dec_and_test(&tsk->signal->live);
1056 if (group_dead) { 1056 if (group_dead) {
1057 hrtimer_cancel(&tsk->signal->real_timer); 1057 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1078,7 +1078,6 @@ NORET_TYPE void do_exit(long code)
1078 check_stack_usage(); 1078 check_stack_usage();
1079 exit_thread(); 1079 exit_thread();
1080 cgroup_exit(tsk, 1); 1080 cgroup_exit(tsk, 1);
1081 exit_keys(tsk);
1082 1081
1083 if (group_dead && tsk->signal->leader) 1082 if (group_dead && tsk->signal->leader)
1084 disassociate_ctty(1); 1083 disassociate_ctty(1);
@@ -1123,7 +1122,6 @@ NORET_TYPE void do_exit(long code)
1123 preempt_disable(); 1122 preempt_disable();
1124 /* causes final put_task_struct in finish_task_switch(). */ 1123 /* causes final put_task_struct in finish_task_switch(). */
1125 tsk->state = TASK_DEAD; 1124 tsk->state = TASK_DEAD;
1126
1127 schedule(); 1125 schedule();
1128 BUG(); 1126 BUG();
1129 /* Avoid "noreturn function does return". */ 1127 /* Avoid "noreturn function does return". */
@@ -1263,12 +1261,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1263 unsigned long state; 1261 unsigned long state;
1264 int retval, status, traced; 1262 int retval, status, traced;
1265 pid_t pid = task_pid_vnr(p); 1263 pid_t pid = task_pid_vnr(p);
1264 uid_t uid = __task_cred(p)->uid;
1266 1265
1267 if (!likely(options & WEXITED)) 1266 if (!likely(options & WEXITED))
1268 return 0; 1267 return 0;
1269 1268
1270 if (unlikely(options & WNOWAIT)) { 1269 if (unlikely(options & WNOWAIT)) {
1271 uid_t uid = p->uid;
1272 int exit_code = p->exit_code; 1270 int exit_code = p->exit_code;
1273 int why, status; 1271 int why, status;
1274 1272
@@ -1321,10 +1319,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1321 * group, which consolidates times for all threads in the 1319 * group, which consolidates times for all threads in the
1322 * group including the group leader. 1320 * group including the group leader.
1323 */ 1321 */
1322 thread_group_cputime(p, &cputime);
1324 spin_lock_irq(&p->parent->sighand->siglock); 1323 spin_lock_irq(&p->parent->sighand->siglock);
1325 psig = p->parent->signal; 1324 psig = p->parent->signal;
1326 sig = p->signal; 1325 sig = p->signal;
1327 thread_group_cputime(p, &cputime);
1328 psig->cutime = 1326 psig->cutime =
1329 cputime_add(psig->cutime, 1327 cputime_add(psig->cutime,
1330 cputime_add(cputime.utime, 1328 cputime_add(cputime.utime,
@@ -1389,7 +1387,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1389 if (!retval && infop) 1387 if (!retval && infop)
1390 retval = put_user(pid, &infop->si_pid); 1388 retval = put_user(pid, &infop->si_pid);
1391 if (!retval && infop) 1389 if (!retval && infop)
1392 retval = put_user(p->uid, &infop->si_uid); 1390 retval = put_user(uid, &infop->si_uid);
1393 if (!retval) 1391 if (!retval)
1394 retval = pid; 1392 retval = pid;
1395 1393
@@ -1454,7 +1452,8 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1454 if (!unlikely(options & WNOWAIT)) 1452 if (!unlikely(options & WNOWAIT))
1455 p->exit_code = 0; 1453 p->exit_code = 0;
1456 1454
1457 uid = p->uid; 1455 /* don't need the RCU readlock here as we're holding a spinlock */
1456 uid = __task_cred(p)->uid;
1458unlock_sig: 1457unlock_sig:
1459 spin_unlock_irq(&p->sighand->siglock); 1458 spin_unlock_irq(&p->sighand->siglock);
1460 if (!exit_code) 1459 if (!exit_code)
@@ -1528,10 +1527,10 @@ static int wait_task_continued(struct task_struct *p, int options,
1528 } 1527 }
1529 if (!unlikely(options & WNOWAIT)) 1528 if (!unlikely(options & WNOWAIT))
1530 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1529 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1530 uid = __task_cred(p)->uid;
1531 spin_unlock_irq(&p->sighand->siglock); 1531 spin_unlock_irq(&p->sighand->siglock);
1532 1532
1533 pid = task_pid_vnr(p); 1533 pid = task_pid_vnr(p);
1534 uid = p->uid;
1535 get_task_struct(p); 1534 get_task_struct(p);
1536 read_unlock(&tasklist_lock); 1535 read_unlock(&tasklist_lock);
1537 1536
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
17*/ 17*/
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/ftrace.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
22 23
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 41 return e;
41} 42}
42 43
43int core_kernel_text(unsigned long addr) 44__notrace_funcgraph int core_kernel_text(unsigned long addr)
44{ 45{
45 if (addr >= (unsigned long)_stext && 46 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 47 addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
53 return 0; 54 return 0;
54} 55}
55 56
56int __kernel_text_address(unsigned long addr) 57__notrace_funcgraph int __kernel_text_address(unsigned long addr)
57{ 58{
58 if (core_kernel_text(addr)) 59 if (core_kernel_text(addr))
59 return 1; 60 return 1;
@@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr)
66 return 1; 67 return 1;
67 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
68} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
diff --git a/kernel/fork.c b/kernel/fork.c
index 495da2e9a8b4..4018308048cf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -47,6 +47,7 @@
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/audit.h> 48#include <linux/audit.h>
49#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
50#include <linux/ftrace.h>
50#include <linux/profile.h> 51#include <linux/profile.h>
51#include <linux/rmap.h> 52#include <linux/rmap.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
@@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
80 81
81__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 82__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
82 83
84DEFINE_TRACE(sched_process_fork);
85
83int nr_processes(void) 86int nr_processes(void)
84{ 87{
85 int cpu; 88 int cpu;
@@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk)
137 prop_local_destroy_single(&tsk->dirties); 140 prop_local_destroy_single(&tsk->dirties);
138 free_thread_info(tsk->stack); 141 free_thread_info(tsk->stack);
139 rt_mutex_debug_task_free(tsk); 142 rt_mutex_debug_task_free(tsk);
143 ftrace_graph_exit_task(tsk);
140 free_task_struct(tsk); 144 free_task_struct(tsk);
141} 145}
142EXPORT_SYMBOL(free_task); 146EXPORT_SYMBOL(free_task);
@@ -147,9 +151,8 @@ void __put_task_struct(struct task_struct *tsk)
147 WARN_ON(atomic_read(&tsk->usage)); 151 WARN_ON(atomic_read(&tsk->usage));
148 WARN_ON(tsk == current); 152 WARN_ON(tsk == current);
149 153
150 security_task_free(tsk); 154 put_cred(tsk->real_cred);
151 free_uid(tsk->user); 155 put_cred(tsk->cred);
152 put_group_info(tsk->group_info);
153 delayacct_tsk_free(tsk); 156 delayacct_tsk_free(tsk);
154 157
155 if (!profile_handoff_task(tsk)) 158 if (!profile_handoff_task(tsk))
@@ -397,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
397#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
398#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
399 402
403static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
404
405static int __init coredump_filter_setup(char *s)
406{
407 default_dump_filter =
408 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
409 MMF_DUMP_FILTER_MASK;
410 return 1;
411}
412
413__setup("coredump_filter=", coredump_filter_setup);
414
400#include <linux/init_task.h> 415#include <linux/init_task.h>
401 416
402static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 417static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@ -405,15 +420,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
405 atomic_set(&mm->mm_count, 1); 420 atomic_set(&mm->mm_count, 1);
406 init_rwsem(&mm->mmap_sem); 421 init_rwsem(&mm->mmap_sem);
407 INIT_LIST_HEAD(&mm->mmlist); 422 INIT_LIST_HEAD(&mm->mmlist);
408 mm->flags = (current->mm) ? current->mm->flags 423 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
409 : MMF_DUMP_FILTER_DEFAULT;
410 mm->core_state = NULL; 424 mm->core_state = NULL;
411 mm->nr_ptes = 0; 425 mm->nr_ptes = 0;
412 set_mm_counter(mm, file_rss, 0); 426 set_mm_counter(mm, file_rss, 0);
413 set_mm_counter(mm, anon_rss, 0); 427 set_mm_counter(mm, anon_rss, 0);
414 spin_lock_init(&mm->page_table_lock); 428 spin_lock_init(&mm->page_table_lock);
415 rwlock_init(&mm->ioctx_list_lock); 429 spin_lock_init(&mm->ioctx_lock);
416 mm->ioctx_list = NULL; 430 INIT_HLIST_HEAD(&mm->ioctx_list);
417 mm->free_area_cache = TASK_UNMAPPED_BASE; 431 mm->free_area_cache = TASK_UNMAPPED_BASE;
418 mm->cached_hole_size = ~0UL; 432 mm->cached_hole_size = ~0UL;
419 mm_init_owner(mm, p); 433 mm_init_owner(mm, p);
@@ -755,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
755{ 769{
756 struct sighand_struct *sig; 770 struct sighand_struct *sig;
757 771
758 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { 772 if (clone_flags & CLONE_SIGHAND) {
759 atomic_inc(&current->sighand->count); 773 atomic_inc(&current->sighand->count);
760 return 0; 774 return 0;
761 } 775 }
@@ -818,12 +832,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
818 if (!sig) 832 if (!sig)
819 return -ENOMEM; 833 return -ENOMEM;
820 834
821 ret = copy_thread_group_keys(tsk);
822 if (ret < 0) {
823 kmem_cache_free(signal_cachep, sig);
824 return ret;
825 }
826
827 atomic_set(&sig->count, 1); 835 atomic_set(&sig->count, 1);
828 atomic_set(&sig->live, 1); 836 atomic_set(&sig->live, 1);
829 init_waitqueue_head(&sig->wait_chldexit); 837 init_waitqueue_head(&sig->wait_chldexit);
@@ -868,7 +876,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
868void __cleanup_signal(struct signal_struct *sig) 876void __cleanup_signal(struct signal_struct *sig)
869{ 877{
870 thread_group_cputime_free(sig); 878 thread_group_cputime_free(sig);
871 exit_thread_group_keys(sig);
872 tty_kref_put(sig->tty); 879 tty_kref_put(sig->tty);
873 kmem_cache_free(signal_cachep, sig); 880 kmem_cache_free(signal_cachep, sig);
874} 881}
@@ -984,16 +991,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
984 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 991 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
985#endif 992#endif
986 retval = -EAGAIN; 993 retval = -EAGAIN;
987 if (atomic_read(&p->user->processes) >= 994 if (atomic_read(&p->real_cred->user->processes) >=
988 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 995 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
989 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && 996 if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
990 p->user != current->nsproxy->user_ns->root_user) 997 p->real_cred->user != INIT_USER)
991 goto bad_fork_free; 998 goto bad_fork_free;
992 } 999 }
993 1000
994 atomic_inc(&p->user->__count); 1001 retval = copy_creds(p, clone_flags);
995 atomic_inc(&p->user->processes); 1002 if (retval < 0)
996 get_group_info(p->group_info); 1003 goto bad_fork_free;
997 1004
998 /* 1005 /*
999 * If multiple threads are within copy_process(), then this check 1006 * If multiple threads are within copy_process(), then this check
@@ -1048,10 +1055,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1048 do_posix_clock_monotonic_gettime(&p->start_time); 1055 do_posix_clock_monotonic_gettime(&p->start_time);
1049 p->real_start_time = p->start_time; 1056 p->real_start_time = p->start_time;
1050 monotonic_to_bootbased(&p->real_start_time); 1057 monotonic_to_bootbased(&p->real_start_time);
1051#ifdef CONFIG_SECURITY
1052 p->security = NULL;
1053#endif
1054 p->cap_bset = current->cap_bset;
1055 p->io_context = NULL; 1058 p->io_context = NULL;
1056 p->audit_context = NULL; 1059 p->audit_context = NULL;
1057 cgroup_fork(p); 1060 cgroup_fork(p);
@@ -1092,14 +1095,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1092#ifdef CONFIG_DEBUG_MUTEXES 1095#ifdef CONFIG_DEBUG_MUTEXES
1093 p->blocked_on = NULL; /* not blocked yet */ 1096 p->blocked_on = NULL; /* not blocked yet */
1094#endif 1097#endif
1098 if (unlikely(ptrace_reparented(current)))
1099 ptrace_fork(p, clone_flags);
1095 1100
1096 /* Perform scheduler related setup. Assign this task to a CPU. */ 1101 /* Perform scheduler related setup. Assign this task to a CPU. */
1097 sched_fork(p, clone_flags); 1102 sched_fork(p, clone_flags);
1098 1103
1099 if ((retval = security_task_alloc(p)))
1100 goto bad_fork_cleanup_policy;
1101 if ((retval = audit_alloc(p))) 1104 if ((retval = audit_alloc(p)))
1102 goto bad_fork_cleanup_security; 1105 goto bad_fork_cleanup_policy;
1103 /* copy all the process information */ 1106 /* copy all the process information */
1104 if ((retval = copy_semundo(clone_flags, p))) 1107 if ((retval = copy_semundo(clone_flags, p)))
1105 goto bad_fork_cleanup_audit; 1108 goto bad_fork_cleanup_audit;
@@ -1113,10 +1116,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1113 goto bad_fork_cleanup_sighand; 1116 goto bad_fork_cleanup_sighand;
1114 if ((retval = copy_mm(clone_flags, p))) 1117 if ((retval = copy_mm(clone_flags, p)))
1115 goto bad_fork_cleanup_signal; 1118 goto bad_fork_cleanup_signal;
1116 if ((retval = copy_keys(clone_flags, p)))
1117 goto bad_fork_cleanup_mm;
1118 if ((retval = copy_namespaces(clone_flags, p))) 1119 if ((retval = copy_namespaces(clone_flags, p)))
1119 goto bad_fork_cleanup_keys; 1120 goto bad_fork_cleanup_mm;
1120 if ((retval = copy_io(clone_flags, p))) 1121 if ((retval = copy_io(clone_flags, p)))
1121 goto bad_fork_cleanup_namespaces; 1122 goto bad_fork_cleanup_namespaces;
1122 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1123 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
@@ -1125,17 +1126,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1125 1126
1126 if (pid != &init_struct_pid) { 1127 if (pid != &init_struct_pid) {
1127 retval = -ENOMEM; 1128 retval = -ENOMEM;
1128 pid = alloc_pid(task_active_pid_ns(p)); 1129 pid = alloc_pid(p->nsproxy->pid_ns);
1129 if (!pid) 1130 if (!pid)
1130 goto bad_fork_cleanup_io; 1131 goto bad_fork_cleanup_io;
1131 1132
1132 if (clone_flags & CLONE_NEWPID) { 1133 if (clone_flags & CLONE_NEWPID) {
1133 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1134 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1134 if (retval < 0) 1135 if (retval < 0)
1135 goto bad_fork_free_pid; 1136 goto bad_fork_free_pid;
1136 } 1137 }
1137 } 1138 }
1138 1139
1140 ftrace_graph_init_task(p);
1141
1139 p->pid = pid_nr(pid); 1142 p->pid = pid_nr(pid);
1140 p->tgid = p->pid; 1143 p->tgid = p->pid;
1141 if (clone_flags & CLONE_THREAD) 1144 if (clone_flags & CLONE_THREAD)
@@ -1144,7 +1147,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1144 if (current->nsproxy != p->nsproxy) { 1147 if (current->nsproxy != p->nsproxy) {
1145 retval = ns_cgroup_clone(p, pid); 1148 retval = ns_cgroup_clone(p, pid);
1146 if (retval) 1149 if (retval)
1147 goto bad_fork_free_pid; 1150 goto bad_fork_free_graph;
1148 } 1151 }
1149 1152
1150 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1153 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1237,7 +1240,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1237 spin_unlock(&current->sighand->siglock); 1240 spin_unlock(&current->sighand->siglock);
1238 write_unlock_irq(&tasklist_lock); 1241 write_unlock_irq(&tasklist_lock);
1239 retval = -ERESTARTNOINTR; 1242 retval = -ERESTARTNOINTR;
1240 goto bad_fork_free_pid; 1243 goto bad_fork_free_graph;
1241 } 1244 }
1242 1245
1243 if (clone_flags & CLONE_THREAD) { 1246 if (clone_flags & CLONE_THREAD) {
@@ -1274,6 +1277,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1274 cgroup_post_fork(p); 1277 cgroup_post_fork(p);
1275 return p; 1278 return p;
1276 1279
1280bad_fork_free_graph:
1281 ftrace_graph_exit_task(p);
1277bad_fork_free_pid: 1282bad_fork_free_pid:
1278 if (pid != &init_struct_pid) 1283 if (pid != &init_struct_pid)
1279 free_pid(pid); 1284 free_pid(pid);
@@ -1281,8 +1286,6 @@ bad_fork_cleanup_io:
1281 put_io_context(p->io_context); 1286 put_io_context(p->io_context);
1282bad_fork_cleanup_namespaces: 1287bad_fork_cleanup_namespaces:
1283 exit_task_namespaces(p); 1288 exit_task_namespaces(p);
1284bad_fork_cleanup_keys:
1285 exit_keys(p);
1286bad_fork_cleanup_mm: 1289bad_fork_cleanup_mm:
1287 if (p->mm) 1290 if (p->mm)
1288 mmput(p->mm); 1291 mmput(p->mm);
@@ -1298,8 +1301,6 @@ bad_fork_cleanup_semundo:
1298 exit_sem(p); 1301 exit_sem(p);
1299bad_fork_cleanup_audit: 1302bad_fork_cleanup_audit:
1300 audit_free(p); 1303 audit_free(p);
1301bad_fork_cleanup_security:
1302 security_task_free(p);
1303bad_fork_cleanup_policy: 1304bad_fork_cleanup_policy:
1304#ifdef CONFIG_NUMA 1305#ifdef CONFIG_NUMA
1305 mpol_put(p->mempolicy); 1306 mpol_put(p->mempolicy);
@@ -1312,9 +1313,9 @@ bad_fork_cleanup_cgroup:
1312bad_fork_cleanup_put_domain: 1313bad_fork_cleanup_put_domain:
1313 module_put(task_thread_info(p)->exec_domain->module); 1314 module_put(task_thread_info(p)->exec_domain->module);
1314bad_fork_cleanup_count: 1315bad_fork_cleanup_count:
1315 put_group_info(p->group_info); 1316 atomic_dec(&p->cred->user->processes);
1316 atomic_dec(&p->user->processes); 1317 put_cred(p->real_cred);
1317 free_uid(p->user); 1318 put_cred(p->cred);
1318bad_fork_free: 1319bad_fork_free:
1319 free_task(p); 1320 free_task(p);
1320fork_out: 1321fork_out:
@@ -1358,6 +1359,21 @@ long do_fork(unsigned long clone_flags,
1358 long nr; 1359 long nr;
1359 1360
1360 /* 1361 /*
1362 * Do some preliminary argument and permissions checking before we
1363 * actually start allocating stuff
1364 */
1365 if (clone_flags & CLONE_NEWUSER) {
1366 if (clone_flags & CLONE_THREAD)
1367 return -EINVAL;
1368 /* hopefully this check will go away when userns support is
1369 * complete
1370 */
1371 if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
1372 !capable(CAP_SETGID))
1373 return -EPERM;
1374 }
1375
1376 /*
1361 * We hope to recycle these flags after 2.6.26 1377 * We hope to recycle these flags after 2.6.26
1362 */ 1378 */
1363 if (unlikely(clone_flags & CLONE_STOPPED)) { 1379 if (unlikely(clone_flags & CLONE_STOPPED)) {
@@ -1605,8 +1621,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1605 err = -EINVAL; 1621 err = -EINVAL;
1606 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| 1622 if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
1607 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| 1623 CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
1608 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWUSER| 1624 CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
1609 CLONE_NEWNET))
1610 goto bad_unshare_out; 1625 goto bad_unshare_out;
1611 1626
1612 /* 1627 /*
diff --git a/kernel/futex.c b/kernel/futex.c
index 8af10027514b..002aa189eb09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -123,24 +124,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 125
125/* 126/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 127 * We hash on the keys returned from get_futex_key (see below).
145 */ 128 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 129static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +144,48 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 144 && key1->both.offset == key2->both.offset);
162} 145}
163 146
147/*
148 * Take a reference to the resource addressed by a key.
149 * Can be called while holding spinlocks.
150 *
151 */
152static void get_futex_key_refs(union futex_key *key)
153{
154 if (!key->both.ptr)
155 return;
156
157 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
158 case FUT_OFF_INODE:
159 atomic_inc(&key->shared.inode->i_count);
160 break;
161 case FUT_OFF_MMSHARED:
162 atomic_inc(&key->private.mm->mm_count);
163 break;
164 }
165}
166
167/*
168 * Drop a reference to the resource addressed by a key.
169 * The hash bucket spinlock must not be held.
170 */
171static void drop_futex_key_refs(union futex_key *key)
172{
173 if (!key->both.ptr) {
174 /* If we're here then we tried to put a key we failed to get */
175 WARN_ON_ONCE(1);
176 return;
177 }
178
179 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
180 case FUT_OFF_INODE:
181 iput(key->shared.inode);
182 break;
183 case FUT_OFF_MMSHARED:
184 mmdrop(key->private.mm);
185 break;
186 }
187}
188
164/** 189/**
165 * get_futex_key - Get parameters which are the keys for a futex. 190 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 191 * @uaddr: virtual address of the futex
@@ -179,12 +204,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 204 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 205 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 206 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 207static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 208{
185 unsigned long address = (unsigned long)uaddr; 209 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 210 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 211 struct page *page;
189 int err; 212 int err;
190 213
@@ -208,100 +231,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 231 return -EFAULT;
209 key->private.mm = mm; 232 key->private.mm = mm;
210 key->private.address = address; 233 key->private.address = address;
234 get_futex_key_refs(key);
211 return 0; 235 return 0;
212 } 236 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 237
221 /* 238again:
222 * Permissions. 239 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 240 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 241 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 242
243 lock_page(page);
244 if (!page->mapping) {
245 unlock_page(page);
246 put_page(page);
247 goto again;
248 }
226 249
227 /* 250 /*
228 * Private mappings are handled in a simple way. 251 * Private mappings are handled in a simple way.
229 * 252 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 253 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 254 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 255 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 256 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 257 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 258 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 259 key->private.mm = mm;
239 key->private.address = address; 260 key->private.address = address;
240 return 0; 261 } else {
241 } 262 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
242 263 key->shared.inode = page->mapping->host;
243 /* 264 key->shared.pgoff = page->index;
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 } 265 }
253 266
254 /* 267 get_futex_key_refs(key);
255 * We could walk the page table to read the non-linear
256 * pte, and get the page index without fetching the page
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269 268
270/* 269 unlock_page(page);
271 * Take a reference to the resource addressed by a key. 270 put_page(page);
272 * Can be called while holding spinlocks. 271 return 0;
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 272}
288 273
289/* 274static inline
290 * Drop a reference to the resource addressed by a key. 275void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 276{
295 if (!key->both.ptr) 277 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 278}
306 279
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 280static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +301,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 301
329/* 302/*
330 * Fault handling. 303 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 304 */
333static int futex_handle_fault(unsigned long address, 305static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 306{
336 struct vm_area_struct * vma; 307 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 308 struct mm_struct *mm = current->mm;
@@ -340,8 +311,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 311 if (attempt > 2)
341 return ret; 312 return ret;
342 313
343 if (!fshared) 314 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 315 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 316 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 317 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +331,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 331 current->min_flt++;
362 } 332 }
363 } 333 }
364 if (!fshared) 334 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 335 return ret;
367} 336}
368 337
@@ -385,6 +354,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 354 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 355 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 356 atomic_set(&pi_state->refcount, 1);
357 pi_state->key = FUTEX_KEY_INIT;
388 358
389 current->pi_state_cache = pi_state; 359 current->pi_state_cache = pi_state;
390 360
@@ -439,13 +409,20 @@ static void free_pi_state(struct futex_pi_state *pi_state)
439static struct task_struct * futex_find_get_task(pid_t pid) 409static struct task_struct * futex_find_get_task(pid_t pid)
440{ 410{
441 struct task_struct *p; 411 struct task_struct *p;
412 const struct cred *cred = current_cred(), *pcred;
442 413
443 rcu_read_lock(); 414 rcu_read_lock();
444 p = find_task_by_vpid(pid); 415 p = find_task_by_vpid(pid);
445 if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) 416 if (!p) {
446 p = ERR_PTR(-ESRCH); 417 p = ERR_PTR(-ESRCH);
447 else 418 } else {
448 get_task_struct(p); 419 pcred = __task_cred(p);
420 if (cred->euid != pcred->euid &&
421 cred->euid != pcred->uid)
422 p = ERR_PTR(-ESRCH);
423 else
424 get_task_struct(p);
425 }
449 426
450 rcu_read_unlock(); 427 rcu_read_unlock();
451 428
@@ -462,7 +439,7 @@ void exit_pi_state_list(struct task_struct *curr)
462 struct list_head *next, *head = &curr->pi_state_list; 439 struct list_head *next, *head = &curr->pi_state_list;
463 struct futex_pi_state *pi_state; 440 struct futex_pi_state *pi_state;
464 struct futex_hash_bucket *hb; 441 struct futex_hash_bucket *hb;
465 union futex_key key; 442 union futex_key key = FUTEX_KEY_INIT;
466 443
467 if (!futex_cmpxchg_enabled) 444 if (!futex_cmpxchg_enabled)
468 return; 445 return;
@@ -607,7 +584,7 @@ static void wake_futex(struct futex_q *q)
607 * The lock in wake_up_all() is a crucial memory barrier after the 584 * The lock in wake_up_all() is a crucial memory barrier after the
608 * plist_del() and also before assigning to q->lock_ptr. 585 * plist_del() and also before assigning to q->lock_ptr.
609 */ 586 */
610 wake_up_all(&q->waiters); 587 wake_up(&q->waiter);
611 /* 588 /*
612 * The waiting task can free the futex_q as soon as this is written, 589 * The waiting task can free the futex_q as soon as this is written,
613 * without taking any locks. This must come last. 590 * without taking any locks. This must come last.
@@ -719,20 +696,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
719 * Wake up all waiters hashed on the physical page that is mapped 696 * Wake up all waiters hashed on the physical page that is mapped
720 * to this virtual address: 697 * to this virtual address:
721 */ 698 */
722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 699static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
723 int nr_wake, u32 bitset)
724{ 700{
725 struct futex_hash_bucket *hb; 701 struct futex_hash_bucket *hb;
726 struct futex_q *this, *next; 702 struct futex_q *this, *next;
727 struct plist_head *head; 703 struct plist_head *head;
728 union futex_key key; 704 union futex_key key = FUTEX_KEY_INIT;
729 int ret; 705 int ret;
730 706
731 if (!bitset) 707 if (!bitset)
732 return -EINVAL; 708 return -EINVAL;
733 709
734 futex_lock_mm(fshared);
735
736 ret = get_futex_key(uaddr, fshared, &key); 710 ret = get_futex_key(uaddr, fshared, &key);
737 if (unlikely(ret != 0)) 711 if (unlikely(ret != 0))
738 goto out; 712 goto out;
@@ -759,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
759 } 733 }
760 734
761 spin_unlock(&hb->lock); 735 spin_unlock(&hb->lock);
736 put_futex_key(fshared, &key);
762out: 737out:
763 futex_unlock_mm(fshared);
764 return ret; 738 return ret;
765} 739}
766 740
@@ -769,25 +743,22 @@ out:
769 * to this virtual address: 743 * to this virtual address:
770 */ 744 */
771static int 745static int
772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 746futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
773 u32 __user *uaddr2,
774 int nr_wake, int nr_wake2, int op) 747 int nr_wake, int nr_wake2, int op)
775{ 748{
776 union futex_key key1, key2; 749 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
777 struct futex_hash_bucket *hb1, *hb2; 750 struct futex_hash_bucket *hb1, *hb2;
778 struct plist_head *head; 751 struct plist_head *head;
779 struct futex_q *this, *next; 752 struct futex_q *this, *next;
780 int ret, op_ret, attempt = 0; 753 int ret, op_ret, attempt = 0;
781 754
782retryfull: 755retryfull:
783 futex_lock_mm(fshared);
784
785 ret = get_futex_key(uaddr1, fshared, &key1); 756 ret = get_futex_key(uaddr1, fshared, &key1);
786 if (unlikely(ret != 0)) 757 if (unlikely(ret != 0))
787 goto out; 758 goto out;
788 ret = get_futex_key(uaddr2, fshared, &key2); 759 ret = get_futex_key(uaddr2, fshared, &key2);
789 if (unlikely(ret != 0)) 760 if (unlikely(ret != 0))
790 goto out; 761 goto out_put_key1;
791 762
792 hb1 = hash_futex(&key1); 763 hb1 = hash_futex(&key1);
793 hb2 = hash_futex(&key2); 764 hb2 = hash_futex(&key2);
@@ -809,12 +780,12 @@ retry:
809 * but we might get them from range checking 780 * but we might get them from range checking
810 */ 781 */
811 ret = op_ret; 782 ret = op_ret;
812 goto out; 783 goto out_put_keys;
813#endif 784#endif
814 785
815 if (unlikely(op_ret != -EFAULT)) { 786 if (unlikely(op_ret != -EFAULT)) {
816 ret = op_ret; 787 ret = op_ret;
817 goto out; 788 goto out_put_keys;
818 } 789 }
819 790
820 /* 791 /*
@@ -826,18 +797,12 @@ retry:
826 */ 797 */
827 if (attempt++) { 798 if (attempt++) {
828 ret = futex_handle_fault((unsigned long)uaddr2, 799 ret = futex_handle_fault((unsigned long)uaddr2,
829 fshared, attempt); 800 attempt);
830 if (ret) 801 if (ret)
831 goto out; 802 goto out_put_keys;
832 goto retry; 803 goto retry;
833 } 804 }
834 805
835 /*
836 * If we would have faulted, release mmap_sem,
837 * fault it in and start all over again.
838 */
839 futex_unlock_mm(fshared);
840
841 ret = get_user(dummy, uaddr2); 806 ret = get_user(dummy, uaddr2);
842 if (ret) 807 if (ret)
843 return ret; 808 return ret;
@@ -872,9 +837,11 @@ retry:
872 spin_unlock(&hb1->lock); 837 spin_unlock(&hb1->lock);
873 if (hb1 != hb2) 838 if (hb1 != hb2)
874 spin_unlock(&hb2->lock); 839 spin_unlock(&hb2->lock);
840out_put_keys:
841 put_futex_key(fshared, &key2);
842out_put_key1:
843 put_futex_key(fshared, &key1);
875out: 844out:
876 futex_unlock_mm(fshared);
877
878 return ret; 845 return ret;
879} 846}
880 847
@@ -882,25 +849,22 @@ out:
882 * Requeue all waiters hashed on one physical page to another 849 * Requeue all waiters hashed on one physical page to another
883 * physical page. 850 * physical page.
884 */ 851 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 852static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
886 u32 __user *uaddr2,
887 int nr_wake, int nr_requeue, u32 *cmpval) 853 int nr_wake, int nr_requeue, u32 *cmpval)
888{ 854{
889 union futex_key key1, key2; 855 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
890 struct futex_hash_bucket *hb1, *hb2; 856 struct futex_hash_bucket *hb1, *hb2;
891 struct plist_head *head1; 857 struct plist_head *head1;
892 struct futex_q *this, *next; 858 struct futex_q *this, *next;
893 int ret, drop_count = 0; 859 int ret, drop_count = 0;
894 860
895 retry: 861retry:
896 futex_lock_mm(fshared);
897
898 ret = get_futex_key(uaddr1, fshared, &key1); 862 ret = get_futex_key(uaddr1, fshared, &key1);
899 if (unlikely(ret != 0)) 863 if (unlikely(ret != 0))
900 goto out; 864 goto out;
901 ret = get_futex_key(uaddr2, fshared, &key2); 865 ret = get_futex_key(uaddr2, fshared, &key2);
902 if (unlikely(ret != 0)) 866 if (unlikely(ret != 0))
903 goto out; 867 goto out_put_key1;
904 868
905 hb1 = hash_futex(&key1); 869 hb1 = hash_futex(&key1);
906 hb2 = hash_futex(&key2); 870 hb2 = hash_futex(&key2);
@@ -917,18 +881,12 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
917 if (hb1 != hb2) 881 if (hb1 != hb2)
918 spin_unlock(&hb2->lock); 882 spin_unlock(&hb2->lock);
919 883
920 /*
921 * If we would have faulted, release mmap_sem, fault
922 * it in and start all over again.
923 */
924 futex_unlock_mm(fshared);
925
926 ret = get_user(curval, uaddr1); 884 ret = get_user(curval, uaddr1);
927 885
928 if (!ret) 886 if (!ret)
929 goto retry; 887 goto retry;
930 888
931 return ret; 889 goto out_put_keys;
932 } 890 }
933 if (curval != *cmpval) { 891 if (curval != *cmpval) {
934 ret = -EAGAIN; 892 ret = -EAGAIN;
@@ -973,8 +931,11 @@ out_unlock:
973 while (--drop_count >= 0) 931 while (--drop_count >= 0)
974 drop_futex_key_refs(&key1); 932 drop_futex_key_refs(&key1);
975 933
934out_put_keys:
935 put_futex_key(fshared, &key2);
936out_put_key1:
937 put_futex_key(fshared, &key1);
976out: 938out:
977 futex_unlock_mm(fshared);
978 return ret; 939 return ret;
979} 940}
980 941
@@ -983,7 +944,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
983{ 944{
984 struct futex_hash_bucket *hb; 945 struct futex_hash_bucket *hb;
985 946
986 init_waitqueue_head(&q->waiters); 947 init_waitqueue_head(&q->waiter);
987 948
988 get_futex_key_refs(&q->key); 949 get_futex_key_refs(&q->key);
989 hb = hash_futex(&q->key); 950 hb = hash_futex(&q->key);
@@ -1035,7 +996,7 @@ static int unqueue_me(struct futex_q *q)
1035 int ret = 0; 996 int ret = 0;
1036 997
1037 /* In the common case we don't take the spinlock, which is nice. */ 998 /* In the common case we don't take the spinlock, which is nice. */
1038 retry: 999retry:
1039 lock_ptr = q->lock_ptr; 1000 lock_ptr = q->lock_ptr;
1040 barrier(); 1001 barrier();
1041 if (lock_ptr != NULL) { 1002 if (lock_ptr != NULL) {
@@ -1096,8 +1057,7 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1057 * private futexes.
1097 */ 1058 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1059static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner, 1060 struct task_struct *newowner, int fshared)
1100 struct rw_semaphore *fshared)
1101{ 1061{
1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1062 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103 struct futex_pi_state *pi_state = q->pi_state; 1063 struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1136,7 @@ retry:
1176handle_fault: 1136handle_fault:
1177 spin_unlock(q->lock_ptr); 1137 spin_unlock(q->lock_ptr);
1178 1138
1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1139 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1180 1140
1181 spin_lock(q->lock_ptr); 1141 spin_lock(q->lock_ptr);
1182 1142
@@ -1196,12 +1156,13 @@ handle_fault:
1196 * In case we must use restart_block to restart a futex_wait, 1156 * In case we must use restart_block to restart a futex_wait,
1197 * we encode in the 'flags' shared capability 1157 * we encode in the 'flags' shared capability
1198 */ 1158 */
1199#define FLAGS_SHARED 1 1159#define FLAGS_SHARED 0x01
1160#define FLAGS_CLOCKRT 0x02
1200 1161
1201static long futex_wait_restart(struct restart_block *restart); 1162static long futex_wait_restart(struct restart_block *restart);
1202 1163
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1164static int futex_wait(u32 __user *uaddr, int fshared,
1204 u32 val, ktime_t *abs_time, u32 bitset) 1165 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1205{ 1166{
1206 struct task_struct *curr = current; 1167 struct task_struct *curr = current;
1207 DECLARE_WAITQUEUE(wait, curr); 1168 DECLARE_WAITQUEUE(wait, curr);
@@ -1217,12 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1217 1178
1218 q.pi_state = NULL; 1179 q.pi_state = NULL;
1219 q.bitset = bitset; 1180 q.bitset = bitset;
1220 retry: 1181retry:
1221 futex_lock_mm(fshared); 1182 q.key = FUTEX_KEY_INIT;
1222
1223 ret = get_futex_key(uaddr, fshared, &q.key); 1183 ret = get_futex_key(uaddr, fshared, &q.key);
1224 if (unlikely(ret != 0)) 1184 if (unlikely(ret != 0))
1225 goto out_release_sem; 1185 goto out;
1226 1186
1227 hb = queue_lock(&q); 1187 hb = queue_lock(&q);
1228 1188
@@ -1250,12 +1210,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1250 1210
1251 if (unlikely(ret)) { 1211 if (unlikely(ret)) {
1252 queue_unlock(&q, hb); 1212 queue_unlock(&q, hb);
1253 1213 put_futex_key(fshared, &q.key);
1254 /*
1255 * If we would have faulted, release mmap_sem, fault it in and
1256 * start all over again.
1257 */
1258 futex_unlock_mm(fshared);
1259 1214
1260 ret = get_user(uval, uaddr); 1215 ret = get_user(uval, uaddr);
1261 1216
@@ -1265,18 +1220,12 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1265 } 1220 }
1266 ret = -EWOULDBLOCK; 1221 ret = -EWOULDBLOCK;
1267 if (uval != val) 1222 if (uval != val)
1268 goto out_unlock_release_sem; 1223 goto out_unlock_put_key;
1269 1224
1270 /* Only actually queue if *uaddr contained val. */ 1225 /* Only actually queue if *uaddr contained val. */
1271 queue_me(&q, hb); 1226 queue_me(&q, hb);
1272 1227
1273 /* 1228 /*
1274 * Now the futex is queued and we have checked the data, we
1275 * don't want to hold mmap_sem while we sleep.
1276 */
1277 futex_unlock_mm(fshared);
1278
1279 /*
1280 * There might have been scheduling since the queue_me(), as we 1229 * There might have been scheduling since the queue_me(), as we
1281 * cannot hold a spinlock across the get_user() in case it 1230 * cannot hold a spinlock across the get_user() in case it
1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1231 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1287,7 +1236,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1287 1236
1288 /* add_wait_queue is the barrier after __set_current_state. */ 1237 /* add_wait_queue is the barrier after __set_current_state. */
1289 __set_current_state(TASK_INTERRUPTIBLE); 1238 __set_current_state(TASK_INTERRUPTIBLE);
1290 add_wait_queue(&q.waiters, &wait); 1239 add_wait_queue(&q.waiter, &wait);
1291 /* 1240 /*
1292 * !plist_node_empty() is safe here without any lock. 1241 * !plist_node_empty() is safe here without any lock.
1293 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1242 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1300,8 +1249,10 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1300 slack = current->timer_slack_ns; 1249 slack = current->timer_slack_ns;
1301 if (rt_task(current)) 1250 if (rt_task(current))
1302 slack = 0; 1251 slack = 0;
1303 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1252 hrtimer_init_on_stack(&t.timer,
1304 HRTIMER_MODE_ABS); 1253 clockrt ? CLOCK_REALTIME :
1254 CLOCK_MONOTONIC,
1255 HRTIMER_MODE_ABS);
1305 hrtimer_init_sleeper(&t, current); 1256 hrtimer_init_sleeper(&t, current);
1306 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1257 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1307 1258
@@ -1356,14 +1307,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1356 1307
1357 if (fshared) 1308 if (fshared)
1358 restart->futex.flags |= FLAGS_SHARED; 1309 restart->futex.flags |= FLAGS_SHARED;
1310 if (clockrt)
1311 restart->futex.flags |= FLAGS_CLOCKRT;
1359 return -ERESTART_RESTARTBLOCK; 1312 return -ERESTART_RESTARTBLOCK;
1360 } 1313 }
1361 1314
1362 out_unlock_release_sem: 1315out_unlock_put_key:
1363 queue_unlock(&q, hb); 1316 queue_unlock(&q, hb);
1317 put_futex_key(fshared, &q.key);
1364 1318
1365 out_release_sem: 1319out:
1366 futex_unlock_mm(fshared);
1367 return ret; 1320 return ret;
1368} 1321}
1369 1322
@@ -1371,15 +1324,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1371static long futex_wait_restart(struct restart_block *restart) 1324static long futex_wait_restart(struct restart_block *restart)
1372{ 1325{
1373 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1326 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1374 struct rw_semaphore *fshared = NULL; 1327 int fshared = 0;
1375 ktime_t t; 1328 ktime_t t;
1376 1329
1377 t.tv64 = restart->futex.time; 1330 t.tv64 = restart->futex.time;
1378 restart->fn = do_no_restart_syscall; 1331 restart->fn = do_no_restart_syscall;
1379 if (restart->futex.flags & FLAGS_SHARED) 1332 if (restart->futex.flags & FLAGS_SHARED)
1380 fshared = &current->mm->mmap_sem; 1333 fshared = 1;
1381 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1334 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1382 restart->futex.bitset); 1335 restart->futex.bitset,
1336 restart->futex.flags & FLAGS_CLOCKRT);
1383} 1337}
1384 1338
1385 1339
@@ -1389,7 +1343,7 @@ static long futex_wait_restart(struct restart_block *restart)
1389 * if there are waiters then it will block, it does PI, etc. (Due to 1343 * if there are waiters then it will block, it does PI, etc. (Due to
1390 * races the kernel might see a 0 value of the futex too.) 1344 * races the kernel might see a 0 value of the futex too.)
1391 */ 1345 */
1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1346static int futex_lock_pi(u32 __user *uaddr, int fshared,
1393 int detect, ktime_t *time, int trylock) 1347 int detect, ktime_t *time, int trylock)
1394{ 1348{
1395 struct hrtimer_sleeper timeout, *to = NULL; 1349 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1411,17 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1411 } 1365 }
1412 1366
1413 q.pi_state = NULL; 1367 q.pi_state = NULL;
1414 retry: 1368retry:
1415 futex_lock_mm(fshared); 1369 q.key = FUTEX_KEY_INIT;
1416
1417 ret = get_futex_key(uaddr, fshared, &q.key); 1370 ret = get_futex_key(uaddr, fshared, &q.key);
1418 if (unlikely(ret != 0)) 1371 if (unlikely(ret != 0))
1419 goto out_release_sem; 1372 goto out;
1420 1373
1421 retry_unlocked: 1374retry_unlocked:
1422 hb = queue_lock(&q); 1375 hb = queue_lock(&q);
1423 1376
1424 retry_locked: 1377retry_locked:
1425 ret = lock_taken = 0; 1378 ret = lock_taken = 0;
1426 1379
1427 /* 1380 /*
@@ -1442,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1442 */ 1395 */
1443 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { 1396 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1444 ret = -EDEADLK; 1397 ret = -EDEADLK;
1445 goto out_unlock_release_sem; 1398 goto out_unlock_put_key;
1446 } 1399 }
1447 1400
1448 /* 1401 /*
1449 * Surprise - we got the lock. Just return to userspace: 1402 * Surprise - we got the lock. Just return to userspace:
1450 */ 1403 */
1451 if (unlikely(!curval)) 1404 if (unlikely(!curval))
1452 goto out_unlock_release_sem; 1405 goto out_unlock_put_key;
1453 1406
1454 uval = curval; 1407 uval = curval;
1455 1408
@@ -1485,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1485 * We took the lock due to owner died take over. 1438 * We took the lock due to owner died take over.
1486 */ 1439 */
1487 if (unlikely(lock_taken)) 1440 if (unlikely(lock_taken))
1488 goto out_unlock_release_sem; 1441 goto out_unlock_put_key;
1489 1442
1490 /* 1443 /*
1491 * We dont have the lock. Look up the PI state (or create it if 1444 * We dont have the lock. Look up the PI state (or create it if
@@ -1502,7 +1455,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1502 * exit to complete. 1455 * exit to complete.
1503 */ 1456 */
1504 queue_unlock(&q, hb); 1457 queue_unlock(&q, hb);
1505 futex_unlock_mm(fshared);
1506 cond_resched(); 1458 cond_resched();
1507 goto retry; 1459 goto retry;
1508 1460
@@ -1525,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1525 goto retry_locked; 1477 goto retry_locked;
1526 } 1478 }
1527 default: 1479 default:
1528 goto out_unlock_release_sem; 1480 goto out_unlock_put_key;
1529 } 1481 }
1530 } 1482 }
1531 1483
@@ -1534,12 +1486,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1534 */ 1486 */
1535 queue_me(&q, hb); 1487 queue_me(&q, hb);
1536 1488
1537 /*
1538 * Now the futex is queued and we have checked the data, we
1539 * don't want to hold mmap_sem while we sleep.
1540 */
1541 futex_unlock_mm(fshared);
1542
1543 WARN_ON(!q.pi_state); 1489 WARN_ON(!q.pi_state);
1544 /* 1490 /*
1545 * Block on the PI mutex: 1491 * Block on the PI mutex:
@@ -1552,7 +1498,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1552 ret = ret ? 0 : -EWOULDBLOCK; 1498 ret = ret ? 0 : -EWOULDBLOCK;
1553 } 1499 }
1554 1500
1555 futex_lock_mm(fshared);
1556 spin_lock(q.lock_ptr); 1501 spin_lock(q.lock_ptr);
1557 1502
1558 if (!ret) { 1503 if (!ret) {
@@ -1618,44 +1563,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1618 1563
1619 /* Unqueue and drop the lock */ 1564 /* Unqueue and drop the lock */
1620 unqueue_me_pi(&q); 1565 unqueue_me_pi(&q);
1621 futex_unlock_mm(fshared);
1622 1566
1623 if (to) 1567 if (to)
1624 destroy_hrtimer_on_stack(&to->timer); 1568 destroy_hrtimer_on_stack(&to->timer);
1625 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1626 1570
1627 out_unlock_release_sem: 1571out_unlock_put_key:
1628 queue_unlock(&q, hb); 1572 queue_unlock(&q, hb);
1629 1573
1630 out_release_sem: 1574out_put_key:
1631 futex_unlock_mm(fshared); 1575 put_futex_key(fshared, &q.key);
1576out:
1632 if (to) 1577 if (to)
1633 destroy_hrtimer_on_stack(&to->timer); 1578 destroy_hrtimer_on_stack(&to->timer);
1634 return ret; 1579 return ret;
1635 1580
1636 uaddr_faulted: 1581uaddr_faulted:
1637 /* 1582 /*
1638 * We have to r/w *(int __user *)uaddr, but we can't modify it 1583 * We have to r/w *(int __user *)uaddr, and we have to modify it
1639 * non-atomically. Therefore, if get_user below is not 1584 * atomically. Therefore, if we continue to fault after get_user()
1640 * enough, we need to handle the fault ourselves, while 1585 * below, we need to handle the fault ourselves, while still holding
1641 * still holding the mmap_sem. 1586 * the mmap_sem. This can occur if the uaddr is under contention as
1642 * 1587 * we have to drop the mmap_sem in order to call get_user().
1643 * ... and hb->lock. :-) --ANK
1644 */ 1588 */
1645 queue_unlock(&q, hb); 1589 queue_unlock(&q, hb);
1646 1590
1647 if (attempt++) { 1591 if (attempt++) {
1648 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1592 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1649 attempt);
1650 if (ret) 1593 if (ret)
1651 goto out_release_sem; 1594 goto out_put_key;
1652 goto retry_unlocked; 1595 goto retry_unlocked;
1653 } 1596 }
1654 1597
1655 futex_unlock_mm(fshared);
1656
1657 ret = get_user(uval, uaddr); 1598 ret = get_user(uval, uaddr);
1658 if (!ret && (uval != -EFAULT)) 1599 if (!ret)
1659 goto retry; 1600 goto retry;
1660 1601
1661 if (to) 1602 if (to)
@@ -1668,13 +1609,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1668 * This is the in-kernel slowpath: we look up the PI state (if any), 1609 * This is the in-kernel slowpath: we look up the PI state (if any),
1669 * and do the rt-mutex unlock. 1610 * and do the rt-mutex unlock.
1670 */ 1611 */
1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1612static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1672{ 1613{
1673 struct futex_hash_bucket *hb; 1614 struct futex_hash_bucket *hb;
1674 struct futex_q *this, *next; 1615 struct futex_q *this, *next;
1675 u32 uval; 1616 u32 uval;
1676 struct plist_head *head; 1617 struct plist_head *head;
1677 union futex_key key; 1618 union futex_key key = FUTEX_KEY_INIT;
1678 int ret, attempt = 0; 1619 int ret, attempt = 0;
1679 1620
1680retry: 1621retry:
@@ -1685,10 +1626,6 @@ retry:
1685 */ 1626 */
1686 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1627 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1687 return -EPERM; 1628 return -EPERM;
1688 /*
1689 * First take all the futex related locks:
1690 */
1691 futex_lock_mm(fshared);
1692 1629
1693 ret = get_futex_key(uaddr, fshared, &key); 1630 ret = get_futex_key(uaddr, fshared, &key);
1694 if (unlikely(ret != 0)) 1631 if (unlikely(ret != 0))
@@ -1746,35 +1683,31 @@ retry_unlocked:
1746 1683
1747out_unlock: 1684out_unlock:
1748 spin_unlock(&hb->lock); 1685 spin_unlock(&hb->lock);
1749out: 1686 put_futex_key(fshared, &key);
1750 futex_unlock_mm(fshared);
1751 1687
1688out:
1752 return ret; 1689 return ret;
1753 1690
1754pi_faulted: 1691pi_faulted:
1755 /* 1692 /*
1756 * We have to r/w *(int __user *)uaddr, but we can't modify it 1693 * We have to r/w *(int __user *)uaddr, and we have to modify it
1757 * non-atomically. Therefore, if get_user below is not 1694 * atomically. Therefore, if we continue to fault after get_user()
1758 * enough, we need to handle the fault ourselves, while 1695 * below, we need to handle the fault ourselves, while still holding
1759 * still holding the mmap_sem. 1696 * the mmap_sem. This can occur if the uaddr is under contention as
1760 * 1697 * we have to drop the mmap_sem in order to call get_user().
1761 * ... and hb->lock. --ANK
1762 */ 1698 */
1763 spin_unlock(&hb->lock); 1699 spin_unlock(&hb->lock);
1764 1700
1765 if (attempt++) { 1701 if (attempt++) {
1766 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1702 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1767 attempt);
1768 if (ret) 1703 if (ret)
1769 goto out; 1704 goto out;
1770 uval = 0; 1705 uval = 0;
1771 goto retry_unlocked; 1706 goto retry_unlocked;
1772 } 1707 }
1773 1708
1774 futex_unlock_mm(fshared);
1775
1776 ret = get_user(uval, uaddr); 1709 ret = get_user(uval, uaddr);
1777 if (!ret && (uval != -EFAULT)) 1710 if (!ret)
1778 goto retry; 1711 goto retry;
1779 1712
1780 return ret; 1713 return ret;
@@ -1829,6 +1762,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1829{ 1762{
1830 struct robust_list_head __user *head; 1763 struct robust_list_head __user *head;
1831 unsigned long ret; 1764 unsigned long ret;
1765 const struct cred *cred = current_cred(), *pcred;
1832 1766
1833 if (!futex_cmpxchg_enabled) 1767 if (!futex_cmpxchg_enabled)
1834 return -ENOSYS; 1768 return -ENOSYS;
@@ -1844,8 +1778,10 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
1844 if (!p) 1778 if (!p)
1845 goto err_unlock; 1779 goto err_unlock;
1846 ret = -EPERM; 1780 ret = -EPERM;
1847 if ((current->euid != p->euid) && (current->euid != p->uid) && 1781 pcred = __task_cred(p);
1848 !capable(CAP_SYS_PTRACE)) 1782 if (cred->euid != pcred->euid &&
1783 cred->euid != pcred->uid &&
1784 !capable(CAP_SYS_PTRACE))
1849 goto err_unlock; 1785 goto err_unlock;
1850 head = p->robust_list; 1786 head = p->robust_list;
1851 rcu_read_unlock(); 1787 rcu_read_unlock();
@@ -1898,8 +1834,7 @@ retry:
1898 * PI futexes happens in exit_pi_state(): 1834 * PI futexes happens in exit_pi_state():
1899 */ 1835 */
1900 if (!pi && (uval & FUTEX_WAITERS)) 1836 if (!pi && (uval & FUTEX_WAITERS))
1901 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1837 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1902 FUTEX_BITSET_MATCH_ANY);
1903 } 1838 }
1904 return 0; 1839 return 0;
1905} 1840}
@@ -1993,18 +1928,22 @@ void exit_robust_list(struct task_struct *curr)
1993long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1928long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1994 u32 __user *uaddr2, u32 val2, u32 val3) 1929 u32 __user *uaddr2, u32 val2, u32 val3)
1995{ 1930{
1996 int ret = -ENOSYS; 1931 int clockrt, ret = -ENOSYS;
1997 int cmd = op & FUTEX_CMD_MASK; 1932 int cmd = op & FUTEX_CMD_MASK;
1998 struct rw_semaphore *fshared = NULL; 1933 int fshared = 0;
1999 1934
2000 if (!(op & FUTEX_PRIVATE_FLAG)) 1935 if (!(op & FUTEX_PRIVATE_FLAG))
2001 fshared = &current->mm->mmap_sem; 1936 fshared = 1;
1937
1938 clockrt = op & FUTEX_CLOCK_REALTIME;
1939 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1940 return -ENOSYS;
2002 1941
2003 switch (cmd) { 1942 switch (cmd) {
2004 case FUTEX_WAIT: 1943 case FUTEX_WAIT:
2005 val3 = FUTEX_BITSET_MATCH_ANY; 1944 val3 = FUTEX_BITSET_MATCH_ANY;
2006 case FUTEX_WAIT_BITSET: 1945 case FUTEX_WAIT_BITSET:
2007 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1946 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
2008 break; 1947 break;
2009 case FUTEX_WAKE: 1948 case FUTEX_WAKE:
2010 val3 = FUTEX_BITSET_MATCH_ANY; 1949 val3 = FUTEX_BITSET_MATCH_ANY;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 04ac3a9e42cf..d607a5b9ee29 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -135,6 +135,7 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
135{ 135{
136 struct compat_robust_list_head __user *head; 136 struct compat_robust_list_head __user *head;
137 unsigned long ret; 137 unsigned long ret;
138 const struct cred *cred = current_cred(), *pcred;
138 139
139 if (!futex_cmpxchg_enabled) 140 if (!futex_cmpxchg_enabled)
140 return -ENOSYS; 141 return -ENOSYS;
@@ -150,8 +151,10 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
150 if (!p) 151 if (!p)
151 goto err_unlock; 152 goto err_unlock;
152 ret = -EPERM; 153 ret = -EPERM;
153 if ((current->euid != p->euid) && (current->euid != p->uid) && 154 pcred = __task_cred(p);
154 !capable(CAP_SYS_PTRACE)) 155 if (cred->euid != pcred->euid &&
156 cred->euid != pcred->uid &&
157 !capable(CAP_SYS_PTRACE))
155 goto err_unlock; 158 goto err_unlock;
156 head = p->compat_robust_list; 159 head = p->compat_robust_list;
157 read_unlock(&tasklist_lock); 160 read_unlock(&tasklist_lock);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b2..1455b7651b6b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,6 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
36#include <linux/module.h> 35#include <linux/module.h>
37#include <linux/percpu.h> 36#include <linux/percpu.h>
38#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
@@ -442,22 +441,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 441static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
443#endif 442#endif
444 443
445/*
446 * Check, whether the timer is on the callback pending list
447 */
448static inline int hrtimer_cb_pending(const struct hrtimer *timer)
449{
450 return timer->state & HRTIMER_STATE_PENDING;
451}
452
453/*
454 * Remove a timer from the callback pending list
455 */
456static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
457{
458 list_del_init(&timer->cb_entry);
459}
460
461/* High resolution timer related functions */ 444/* High resolution timer related functions */
462#ifdef CONFIG_HIGH_RES_TIMERS 445#ifdef CONFIG_HIGH_RES_TIMERS
463 446
@@ -651,6 +634,7 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651{ 634{
652} 635}
653 636
637
654/* 638/*
655 * When High resolution timers are active, try to reprogram. Note, that in case 639 * When High resolution timers are active, try to reprogram. Note, that in case
656 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 640 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,31 +645,10 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
661 struct hrtimer_clock_base *base) 645 struct hrtimer_clock_base *base)
662{ 646{
663 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
664 648 spin_unlock(&base->cpu_base->lock);
665 /* Timer is expired, act upon the callback mode */ 649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
666 switch(timer->cb_mode) { 650 spin_lock(&base->cpu_base->lock);
667 case HRTIMER_CB_IRQSAFE_PERCPU: 651 return 1;
668 case HRTIMER_CB_IRQSAFE_UNLOCKED:
669 /*
670 * This is solely for the sched tick emulation with
671 * dynamic tick support to ensure that we do not
672 * restart the tick right on the edge and end up with
673 * the tick timer in the softirq ! The calling site
674 * takes care of this. Also used for hrtimer sleeper !
675 */
676 debug_hrtimer_deactivate(timer);
677 return 1;
678 case HRTIMER_CB_SOFTIRQ:
679 /*
680 * Move everything else into the softirq pending list !
681 */
682 list_add_tail(&timer->cb_entry,
683 &base->cpu_base->cb_pending);
684 timer->state = HRTIMER_STATE_PENDING;
685 return 1;
686 default:
687 BUG();
688 }
689 } 652 }
690 return 0; 653 return 0;
691} 654}
@@ -724,11 +687,6 @@ static int hrtimer_switch_to_hres(void)
724 return 1; 687 return 1;
725} 688}
726 689
727static inline void hrtimer_raise_softirq(void)
728{
729 raise_softirq(HRTIMER_SOFTIRQ);
730}
731
732#else 690#else
733 691
734static inline int hrtimer_hres_active(void) { return 0; } 692static inline int hrtimer_hres_active(void) { return 0; }
@@ -742,12 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
742} 700}
743static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 701static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
744static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 702static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
745static inline int hrtimer_reprogram(struct hrtimer *timer,
746 struct hrtimer_clock_base *base)
747{
748 return 0;
749}
750static inline void hrtimer_raise_softirq(void) { }
751 703
752#endif /* CONFIG_HIGH_RES_TIMERS */ 704#endif /* CONFIG_HIGH_RES_TIMERS */
753 705
@@ -818,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
818 * 770 *
819 * The timer is inserted in expiry order. Insertion into the 771 * The timer is inserted in expiry order. Insertion into the
820 * red black tree is O(log(n)). Must hold the base lock. 772 * red black tree is O(log(n)). Must hold the base lock.
773 *
774 * Returns 1 when the new timer is the leftmost timer in the tree.
821 */ 775 */
822static void enqueue_hrtimer(struct hrtimer *timer, 776static int enqueue_hrtimer(struct hrtimer *timer,
823 struct hrtimer_clock_base *base, int reprogram) 777 struct hrtimer_clock_base *base)
824{ 778{
825 struct rb_node **link = &base->active.rb_node; 779 struct rb_node **link = &base->active.rb_node;
826 struct rb_node *parent = NULL; 780 struct rb_node *parent = NULL;
@@ -852,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
852 * Insert the timer to the rbtree and check whether it 806 * Insert the timer to the rbtree and check whether it
853 * replaces the first pending timer 807 * replaces the first pending timer
854 */ 808 */
855 if (leftmost) { 809 if (leftmost)
856 /*
857 * Reprogram the clock event device. When the timer is already
858 * expired hrtimer_enqueue_reprogram has either called the
859 * callback or added it to the pending list and raised the
860 * softirq.
861 *
862 * This is a NOP for !HIGHRES
863 */
864 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
865 return;
866
867 base->first = &timer->node; 810 base->first = &timer->node;
868 }
869 811
870 rb_link_node(&timer->node, parent, link); 812 rb_link_node(&timer->node, parent, link);
871 rb_insert_color(&timer->node, &base->active); 813 rb_insert_color(&timer->node, &base->active);
@@ -874,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
874 * state of a possibly running callback. 816 * state of a possibly running callback.
875 */ 817 */
876 timer->state |= HRTIMER_STATE_ENQUEUED; 818 timer->state |= HRTIMER_STATE_ENQUEUED;
819
820 return leftmost;
877} 821}
878 822
879/* 823/*
@@ -890,10 +834,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
890 struct hrtimer_clock_base *base, 834 struct hrtimer_clock_base *base,
891 unsigned long newstate, int reprogram) 835 unsigned long newstate, int reprogram)
892{ 836{
893 /* High res. callback list. NOP for !HIGHRES */ 837 if (timer->state & HRTIMER_STATE_ENQUEUED) {
894 if (hrtimer_cb_pending(timer))
895 hrtimer_remove_cb_pending(timer);
896 else {
897 /* 838 /*
898 * Remove the timer from the rbtree and replace the 839 * Remove the timer from the rbtree and replace the
899 * first entry pointer if necessary. 840 * first entry pointer if necessary.
@@ -953,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
953{ 894{
954 struct hrtimer_clock_base *base, *new_base; 895 struct hrtimer_clock_base *base, *new_base;
955 unsigned long flags; 896 unsigned long flags;
956 int ret, raise; 897 int ret, leftmost;
957 898
958 base = lock_hrtimer_base(timer, &flags); 899 base = lock_hrtimer_base(timer, &flags);
959 900
@@ -981,33 +922,19 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
981 922
982 timer_stats_hrtimer_set_start_info(timer); 923 timer_stats_hrtimer_set_start_info(timer);
983 924
925 leftmost = enqueue_hrtimer(timer, new_base);
926
984 /* 927 /*
985 * Only allow reprogramming if the new base is on this CPU. 928 * Only allow reprogramming if the new base is on this CPU.
986 * (it might still be on another CPU if the timer was pending) 929 * (it might still be on another CPU if the timer was pending)
930 *
931 * XXX send_remote_softirq() ?
987 */ 932 */
988 enqueue_hrtimer(timer, new_base, 933 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
989 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 934 hrtimer_enqueue_reprogram(timer, new_base);
990
991 /*
992 * The timer may be expired and moved to the cb_pending
993 * list. We can not raise the softirq with base lock held due
994 * to a possible deadlock with runqueue lock.
995 */
996 raise = timer->state == HRTIMER_STATE_PENDING;
997
998 /*
999 * We use preempt_disable to prevent this task from migrating after
1000 * setting up the softirq and raising it. Otherwise, if me migrate
1001 * we will raise the softirq on the wrong CPU.
1002 */
1003 preempt_disable();
1004 935
1005 unlock_hrtimer_base(timer, &flags); 936 unlock_hrtimer_base(timer, &flags);
1006 937
1007 if (raise)
1008 hrtimer_raise_softirq();
1009 preempt_enable();
1010
1011 return ret; 938 return ret;
1012} 939}
1013EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 940EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1192,75 +1119,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1192} 1119}
1193EXPORT_SYMBOL_GPL(hrtimer_get_res); 1120EXPORT_SYMBOL_GPL(hrtimer_get_res);
1194 1121
1195static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1196{
1197 spin_lock_irq(&cpu_base->lock);
1198
1199 while (!list_empty(&cpu_base->cb_pending)) {
1200 enum hrtimer_restart (*fn)(struct hrtimer *);
1201 struct hrtimer *timer;
1202 int restart;
1203 int emulate_hardirq_ctx = 0;
1204
1205 timer = list_entry(cpu_base->cb_pending.next,
1206 struct hrtimer, cb_entry);
1207
1208 debug_hrtimer_deactivate(timer);
1209 timer_stats_account_hrtimer(timer);
1210
1211 fn = timer->function;
1212 /*
1213 * A timer might have been added to the cb_pending list
1214 * when it was migrated during a cpu-offline operation.
1215 * Emulate hardirq context for such timers.
1216 */
1217 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1218 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
1219 emulate_hardirq_ctx = 1;
1220
1221 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1222 spin_unlock_irq(&cpu_base->lock);
1223
1224 if (unlikely(emulate_hardirq_ctx)) {
1225 local_irq_disable();
1226 restart = fn(timer);
1227 local_irq_enable();
1228 } else
1229 restart = fn(timer);
1230
1231 spin_lock_irq(&cpu_base->lock);
1232
1233 timer->state &= ~HRTIMER_STATE_CALLBACK;
1234 if (restart == HRTIMER_RESTART) {
1235 BUG_ON(hrtimer_active(timer));
1236 /*
1237 * Enqueue the timer, allow reprogramming of the event
1238 * device
1239 */
1240 enqueue_hrtimer(timer, timer->base, 1);
1241 } else if (hrtimer_active(timer)) {
1242 /*
1243 * If the timer was rearmed on another CPU, reprogram
1244 * the event device.
1245 */
1246 struct hrtimer_clock_base *base = timer->base;
1247
1248 if (base->first == &timer->node &&
1249 hrtimer_reprogram(timer, base)) {
1250 /*
1251 * Timer is expired. Thus move it from tree to
1252 * pending list again.
1253 */
1254 __remove_hrtimer(timer, base,
1255 HRTIMER_STATE_PENDING, 0);
1256 list_add_tail(&timer->cb_entry,
1257 &base->cpu_base->cb_pending);
1258 }
1259 }
1260 }
1261 spin_unlock_irq(&cpu_base->lock);
1262}
1263
1264static void __run_hrtimer(struct hrtimer *timer) 1122static void __run_hrtimer(struct hrtimer *timer)
1265{ 1123{
1266 struct hrtimer_clock_base *base = timer->base; 1124 struct hrtimer_clock_base *base = timer->base;
@@ -1268,34 +1126,30 @@ static void __run_hrtimer(struct hrtimer *timer)
1268 enum hrtimer_restart (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
1269 int restart; 1127 int restart;
1270 1128
1129 WARN_ON(!irqs_disabled());
1130
1271 debug_hrtimer_deactivate(timer); 1131 debug_hrtimer_deactivate(timer);
1272 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1132 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1273 timer_stats_account_hrtimer(timer); 1133 timer_stats_account_hrtimer(timer);
1274
1275 fn = timer->function; 1134 fn = timer->function;
1276 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1277 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1278 /*
1279 * Used for scheduler timers, avoid lock inversion with
1280 * rq->lock and tasklist_lock.
1281 *
1282 * These timers are required to deal with enqueue expiry
1283 * themselves and are not allowed to migrate.
1284 */
1285 spin_unlock(&cpu_base->lock);
1286 restart = fn(timer);
1287 spin_lock(&cpu_base->lock);
1288 } else
1289 restart = fn(timer);
1290 1135
1291 /* 1136 /*
1292 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1137 * Because we run timers from hardirq context, there is no chance
1293 * reprogramming of the event hardware. This happens at the end of this 1138 * they get migrated to another cpu, therefore its safe to unlock
1294 * function anyway. 1139 * the timer base.
1140 */
1141 spin_unlock(&cpu_base->lock);
1142 restart = fn(timer);
1143 spin_lock(&cpu_base->lock);
1144
1145 /*
1146 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
1147 * we do not reprogramm the event hardware. Happens either in
1148 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1295 */ 1149 */
1296 if (restart != HRTIMER_NORESTART) { 1150 if (restart != HRTIMER_NORESTART) {
1297 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1151 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1298 enqueue_hrtimer(timer, base, 0); 1152 enqueue_hrtimer(timer, base);
1299 } 1153 }
1300 timer->state &= ~HRTIMER_STATE_CALLBACK; 1154 timer->state &= ~HRTIMER_STATE_CALLBACK;
1301} 1155}
@@ -1311,7 +1165,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1311 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1165 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1312 struct hrtimer_clock_base *base; 1166 struct hrtimer_clock_base *base;
1313 ktime_t expires_next, now; 1167 ktime_t expires_next, now;
1314 int i, raise = 0; 1168 int i;
1315 1169
1316 BUG_ON(!cpu_base->hres_active); 1170 BUG_ON(!cpu_base->hres_active);
1317 cpu_base->nr_events++; 1171 cpu_base->nr_events++;
@@ -1360,16 +1214,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1360 break; 1214 break;
1361 } 1215 }
1362 1216
1363 /* Move softirq callbacks to the pending list */
1364 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1365 __remove_hrtimer(timer, base,
1366 HRTIMER_STATE_PENDING, 0);
1367 list_add_tail(&timer->cb_entry,
1368 &base->cpu_base->cb_pending);
1369 raise = 1;
1370 continue;
1371 }
1372
1373 __run_hrtimer(timer); 1217 __run_hrtimer(timer);
1374 } 1218 }
1375 spin_unlock(&cpu_base->lock); 1219 spin_unlock(&cpu_base->lock);
@@ -1383,10 +1227,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1383 if (tick_program_event(expires_next, 0)) 1227 if (tick_program_event(expires_next, 0))
1384 goto retry; 1228 goto retry;
1385 } 1229 }
1230}
1231
1232/*
1233 * local version of hrtimer_peek_ahead_timers() called with interrupts
1234 * disabled.
1235 */
1236static void __hrtimer_peek_ahead_timers(void)
1237{
1238 struct tick_device *td;
1239
1240 if (!hrtimer_hres_active())
1241 return;
1386 1242
1387 /* Raise softirq ? */ 1243 td = &__get_cpu_var(tick_cpu_device);
1388 if (raise) 1244 if (td && td->evtdev)
1389 raise_softirq(HRTIMER_SOFTIRQ); 1245 hrtimer_interrupt(td->evtdev);
1390} 1246}
1391 1247
1392/** 1248/**
@@ -1400,25 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1400 */ 1256 */
1401void hrtimer_peek_ahead_timers(void) 1257void hrtimer_peek_ahead_timers(void)
1402{ 1258{
1403 struct tick_device *td;
1404 unsigned long flags; 1259 unsigned long flags;
1405 1260
1406 if (!hrtimer_hres_active())
1407 return;
1408
1409 local_irq_save(flags); 1261 local_irq_save(flags);
1410 td = &__get_cpu_var(tick_cpu_device); 1262 __hrtimer_peek_ahead_timers();
1411 if (td && td->evtdev)
1412 hrtimer_interrupt(td->evtdev);
1413 local_irq_restore(flags); 1263 local_irq_restore(flags);
1414} 1264}
1415 1265
1416static void run_hrtimer_softirq(struct softirq_action *h) 1266static void run_hrtimer_softirq(struct softirq_action *h)
1417{ 1267{
1418 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); 1268 hrtimer_peek_ahead_timers();
1419} 1269}
1420 1270
1421#endif /* CONFIG_HIGH_RES_TIMERS */ 1271#else /* CONFIG_HIGH_RES_TIMERS */
1272
1273static inline void __hrtimer_peek_ahead_timers(void) { }
1274
1275#endif /* !CONFIG_HIGH_RES_TIMERS */
1422 1276
1423/* 1277/*
1424 * Called from timer softirq every jiffy, expire hrtimers: 1278 * Called from timer softirq every jiffy, expire hrtimers:
@@ -1429,8 +1283,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
1429 */ 1283 */
1430void hrtimer_run_pending(void) 1284void hrtimer_run_pending(void)
1431{ 1285{
1432 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1433
1434 if (hrtimer_hres_active()) 1286 if (hrtimer_hres_active())
1435 return; 1287 return;
1436 1288
@@ -1444,8 +1296,6 @@ void hrtimer_run_pending(void)
1444 */ 1296 */
1445 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) 1297 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1446 hrtimer_switch_to_hres(); 1298 hrtimer_switch_to_hres();
1447
1448 run_hrtimer_pending(cpu_base);
1449} 1299}
1450 1300
1451/* 1301/*
@@ -1482,14 +1332,6 @@ void hrtimer_run_queues(void)
1482 hrtimer_get_expires_tv64(timer)) 1332 hrtimer_get_expires_tv64(timer))
1483 break; 1333 break;
1484 1334
1485 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1486 __remove_hrtimer(timer, base,
1487 HRTIMER_STATE_PENDING, 0);
1488 list_add_tail(&timer->cb_entry,
1489 &base->cpu_base->cb_pending);
1490 continue;
1491 }
1492
1493 __run_hrtimer(timer); 1335 __run_hrtimer(timer);
1494 } 1336 }
1495 spin_unlock(&cpu_base->lock); 1337 spin_unlock(&cpu_base->lock);
@@ -1516,9 +1358,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1516{ 1358{
1517 sl->timer.function = hrtimer_wakeup; 1359 sl->timer.function = hrtimer_wakeup;
1518 sl->task = task; 1360 sl->task = task;
1519#ifdef CONFIG_HIGH_RES_TIMERS
1520 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1521#endif
1522} 1361}
1523 1362
1524static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1363static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1655,18 +1494,16 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1494 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1656 cpu_base->clock_base[i].cpu_base = cpu_base; 1495 cpu_base->clock_base[i].cpu_base = cpu_base;
1657 1496
1658 INIT_LIST_HEAD(&cpu_base->cb_pending);
1659 hrtimer_init_hres(cpu_base); 1497 hrtimer_init_hres(cpu_base);
1660} 1498}
1661 1499
1662#ifdef CONFIG_HOTPLUG_CPU 1500#ifdef CONFIG_HOTPLUG_CPU
1663 1501
1664static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1502static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1665 struct hrtimer_clock_base *new_base, int dcpu) 1503 struct hrtimer_clock_base *new_base)
1666{ 1504{
1667 struct hrtimer *timer; 1505 struct hrtimer *timer;
1668 struct rb_node *node; 1506 struct rb_node *node;
1669 int raise = 0;
1670 1507
1671 while ((node = rb_first(&old_base->active))) { 1508 while ((node = rb_first(&old_base->active))) {
1672 timer = rb_entry(node, struct hrtimer, node); 1509 timer = rb_entry(node, struct hrtimer, node);
@@ -1674,18 +1511,6 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1674 debug_hrtimer_deactivate(timer); 1511 debug_hrtimer_deactivate(timer);
1675 1512
1676 /* 1513 /*
1677 * Should not happen. Per CPU timers should be
1678 * canceled _before_ the migration code is called
1679 */
1680 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1681 __remove_hrtimer(timer, old_base,
1682 HRTIMER_STATE_INACTIVE, 0);
1683 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1684 timer, timer->function, dcpu);
1685 continue;
1686 }
1687
1688 /*
1689 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1514 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1690 * timer could be seen as !active and just vanish away 1515 * timer could be seen as !active and just vanish away
1691 * under us on another CPU 1516 * under us on another CPU
@@ -1693,112 +1518,73 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1693 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1518 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1694 timer->base = new_base; 1519 timer->base = new_base;
1695 /* 1520 /*
1696 * Enqueue the timer. Allow reprogramming of the event device 1521 * Enqueue the timers on the new cpu. This does not
1522 * reprogram the event device in case the timer
1523 * expires before the earliest on this CPU, but we run
1524 * hrtimer_interrupt after we migrated everything to
1525 * sort out already expired timers and reprogram the
1526 * event device.
1697 */ 1527 */
1698 enqueue_hrtimer(timer, new_base, 1); 1528 enqueue_hrtimer(timer, new_base);
1699 1529
1700#ifdef CONFIG_HIGH_RES_TIMERS
1701 /*
1702 * Happens with high res enabled when the timer was
1703 * already expired and the callback mode is
1704 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1705 * enqueue code does not move them to the soft irq
1706 * pending list for performance/latency reasons, but
1707 * in the migration state, we need to do that
1708 * otherwise we end up with a stale timer.
1709 */
1710 if (timer->state == HRTIMER_STATE_MIGRATE) {
1711 timer->state = HRTIMER_STATE_PENDING;
1712 list_add_tail(&timer->cb_entry,
1713 &new_base->cpu_base->cb_pending);
1714 raise = 1;
1715 }
1716#endif
1717 /* Clear the migration state bit */ 1530 /* Clear the migration state bit */
1718 timer->state &= ~HRTIMER_STATE_MIGRATE; 1531 timer->state &= ~HRTIMER_STATE_MIGRATE;
1719 } 1532 }
1720 return raise;
1721}
1722
1723#ifdef CONFIG_HIGH_RES_TIMERS
1724static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1725 struct hrtimer_cpu_base *new_base)
1726{
1727 struct hrtimer *timer;
1728 int raise = 0;
1729
1730 while (!list_empty(&old_base->cb_pending)) {
1731 timer = list_entry(old_base->cb_pending.next,
1732 struct hrtimer, cb_entry);
1733
1734 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1735 timer->base = &new_base->clock_base[timer->base->index];
1736 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1737 raise = 1;
1738 }
1739 return raise;
1740}
1741#else
1742static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1743 struct hrtimer_cpu_base *new_base)
1744{
1745 return 0;
1746} 1533}
1747#endif
1748 1534
1749static void migrate_hrtimers(int cpu) 1535static void migrate_hrtimers(int scpu)
1750{ 1536{
1751 struct hrtimer_cpu_base *old_base, *new_base; 1537 struct hrtimer_cpu_base *old_base, *new_base;
1752 int i, raise = 0; 1538 int i;
1753 1539
1754 BUG_ON(cpu_online(cpu)); 1540 BUG_ON(cpu_online(scpu));
1755 old_base = &per_cpu(hrtimer_bases, cpu); 1541 tick_cancel_sched_timer(scpu);
1756 new_base = &get_cpu_var(hrtimer_bases);
1757 1542
1758 tick_cancel_sched_timer(cpu); 1543 local_irq_disable();
1544 old_base = &per_cpu(hrtimer_bases, scpu);
1545 new_base = &__get_cpu_var(hrtimer_bases);
1759 /* 1546 /*
1760 * The caller is globally serialized and nobody else 1547 * The caller is globally serialized and nobody else
1761 * takes two locks at once, deadlock is not possible. 1548 * takes two locks at once, deadlock is not possible.
1762 */ 1549 */
1763 spin_lock_irq(&new_base->lock); 1550 spin_lock(&new_base->lock);
1764 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1551 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1765 1552
1766 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1553 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1767 if (migrate_hrtimer_list(&old_base->clock_base[i], 1554 migrate_hrtimer_list(&old_base->clock_base[i],
1768 &new_base->clock_base[i], cpu)) 1555 &new_base->clock_base[i]);
1769 raise = 1;
1770 } 1556 }
1771 1557
1772 if (migrate_hrtimer_pending(old_base, new_base))
1773 raise = 1;
1774
1775 spin_unlock(&old_base->lock); 1558 spin_unlock(&old_base->lock);
1776 spin_unlock_irq(&new_base->lock); 1559 spin_unlock(&new_base->lock);
1777 put_cpu_var(hrtimer_bases);
1778 1560
1779 if (raise) 1561 /* Check, if we got expired work to do */
1780 hrtimer_raise_softirq(); 1562 __hrtimer_peek_ahead_timers();
1563 local_irq_enable();
1781} 1564}
1565
1782#endif /* CONFIG_HOTPLUG_CPU */ 1566#endif /* CONFIG_HOTPLUG_CPU */
1783 1567
1784static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1568static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1785 unsigned long action, void *hcpu) 1569 unsigned long action, void *hcpu)
1786{ 1570{
1787 unsigned int cpu = (long)hcpu; 1571 int scpu = (long)hcpu;
1788 1572
1789 switch (action) { 1573 switch (action) {
1790 1574
1791 case CPU_UP_PREPARE: 1575 case CPU_UP_PREPARE:
1792 case CPU_UP_PREPARE_FROZEN: 1576 case CPU_UP_PREPARE_FROZEN:
1793 init_hrtimers_cpu(cpu); 1577 init_hrtimers_cpu(scpu);
1794 break; 1578 break;
1795 1579
1796#ifdef CONFIG_HOTPLUG_CPU 1580#ifdef CONFIG_HOTPLUG_CPU
1797 case CPU_DEAD: 1581 case CPU_DEAD:
1798 case CPU_DEAD_FROZEN: 1582 case CPU_DEAD_FROZEN:
1799 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1583 {
1800 migrate_hrtimers(cpu); 1584 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1585 migrate_hrtimers(scpu);
1801 break; 1586 break;
1587 }
1802#endif 1588#endif
1803 1589
1804 default: 1590 default:
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 681c52dbfe22..4dd5b1edac98 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..1de9700f416e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/async.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
34 unsigned int status; 35 unsigned int status;
35 int i; 36 int i;
36 37
38 /*
39 * quiesce the kernel, or at least the asynchronous portion
40 */
41 async_synchronize_full();
37 mutex_lock(&probing_active); 42 mutex_lock(&probing_active);
38 /* 43 /*
39 * something may have generated an irq long ago and we want to 44 * something may have generated an irq long ago and we want to
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bfe..f63c706d25e1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,9 +24,10 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc = irq_to_desc(irq); 27 struct irq_desc *desc;
28 unsigned long flags; 28 unsigned long flags;
29 29
30 desc = irq_to_desc(irq);
30 if (!desc) { 31 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 32 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 33 return;
@@ -45,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
45 desc->irq_count = 0; 46 desc->irq_count = 0;
46 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
47#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
48 cpus_setall(desc->affinity); 49 cpumask_setall(&desc->affinity);
49#endif 50#endif
50 spin_unlock_irqrestore(&desc->lock, flags); 51 spin_unlock_irqrestore(&desc->lock, flags);
51} 52}
@@ -124,6 +125,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
124 return -ENODEV; 125 return -ENODEV;
125 } 126 }
126 127
128 type &= IRQ_TYPE_SENSE_MASK;
127 if (type == IRQ_TYPE_NONE) 129 if (type == IRQ_TYPE_NONE)
128 return 0; 130 return 0;
129 131
@@ -352,6 +354,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
352 354
353 spin_lock(&desc->lock); 355 spin_lock(&desc->lock);
354 mask_ack_irq(desc, irq); 356 mask_ack_irq(desc, irq);
357 desc = irq_remap_to_desc(irq, desc);
355 358
356 if (unlikely(desc->status & IRQ_INPROGRESS)) 359 if (unlikely(desc->status & IRQ_INPROGRESS))
357 goto out_unlock; 360 goto out_unlock;
@@ -429,6 +432,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
429 desc->status &= ~IRQ_INPROGRESS; 432 desc->status &= ~IRQ_INPROGRESS;
430out: 433out:
431 desc->chip->eoi(irq); 434 desc->chip->eoi(irq);
435 desc = irq_remap_to_desc(irq, desc);
432 436
433 spin_unlock(&desc->lock); 437 spin_unlock(&desc->lock);
434} 438}
@@ -465,12 +469,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
465 !desc->action)) { 469 !desc->action)) {
466 desc->status |= (IRQ_PENDING | IRQ_MASKED); 470 desc->status |= (IRQ_PENDING | IRQ_MASKED);
467 mask_ack_irq(desc, irq); 471 mask_ack_irq(desc, irq);
472 desc = irq_remap_to_desc(irq, desc);
468 goto out_unlock; 473 goto out_unlock;
469 } 474 }
470 kstat_incr_irqs_this_cpu(irq, desc); 475 kstat_incr_irqs_this_cpu(irq, desc);
471 476
472 /* Start handling the irq */ 477 /* Start handling the irq */
473 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 desc = irq_remap_to_desc(irq, desc);
474 480
475 /* Mark the IRQ currently in progress.*/ 481 /* Mark the IRQ currently in progress.*/
476 desc->status |= IRQ_INPROGRESS; 482 desc->status |= IRQ_INPROGRESS;
@@ -531,8 +537,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
531 if (!noirqdebug) 537 if (!noirqdebug)
532 note_interrupt(irq, desc, action_ret); 538 note_interrupt(irq, desc, action_ret);
533 539
534 if (desc->chip->eoi) 540 if (desc->chip->eoi) {
535 desc->chip->eoi(irq); 541 desc->chip->eoi(irq);
542 desc = irq_remap_to_desc(irq, desc);
543 }
536} 544}
537 545
538void 546void
@@ -567,8 +575,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
567 575
568 /* Uninstall? */ 576 /* Uninstall? */
569 if (handle == handle_bad_irq) { 577 if (handle == handle_bad_irq) {
570 if (desc->chip != &no_irq_chip) 578 if (desc->chip != &no_irq_chip) {
571 mask_ack_irq(desc, irq); 579 mask_ack_irq(desc, irq);
580 desc = irq_remap_to_desc(irq, desc);
581 }
572 desc->status |= IRQ_DISABLED; 582 desc->status |= IRQ_DISABLED;
573 desc->depth = 1; 583 desc->depth = 1;
574 } 584 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c815b42d0f5b..c20db0be9173 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h>
19#include <linux/hash.h>
18 20
19#include "internals.h" 21#include "internals.h"
20 22
23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */
26struct lock_class_key irq_desc_lock_class;
27
21/** 28/**
22 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 30 * @irq: the interrupt number
@@ -49,6 +56,150 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
49int nr_irqs = NR_IRQS; 56int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs); 57EXPORT_SYMBOL_GPL(nr_irqs);
51 58
59#ifdef CONFIG_SPARSE_IRQ
60static struct irq_desc irq_desc_init = {
61 .irq = -1,
62 .status = IRQ_DISABLED,
63 .chip = &no_irq_chip,
64 .handle_irq = handle_bad_irq,
65 .depth = 1,
66 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
67#ifdef CONFIG_SMP
68 .affinity = CPU_MASK_ALL
69#endif
70};
71
72void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
73{
74 unsigned long bytes;
75 char *ptr;
76 int node;
77
78 /* Compute how many bytes we need per irq and allocate them */
79 bytes = nr * sizeof(unsigned int);
80
81 node = cpu_to_node(cpu);
82 ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
83 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
84
85 if (ptr)
86 desc->kstat_irqs = (unsigned int *)ptr;
87}
88
89static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
90{
91 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
92
93 spin_lock_init(&desc->lock);
94 desc->irq = irq;
95#ifdef CONFIG_SMP
96 desc->cpu = cpu;
97#endif
98 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
99 init_kstat_irqs(desc, cpu, nr_cpu_ids);
100 if (!desc->kstat_irqs) {
101 printk(KERN_ERR "can not alloc kstat_irqs\n");
102 BUG_ON(1);
103 }
104 arch_init_chip_data(desc, cpu);
105}
106
107/*
108 * Protect the sparse_irqs:
109 */
110DEFINE_SPINLOCK(sparse_irq_lock);
111
112struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
113
114static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
115 [0 ... NR_IRQS_LEGACY-1] = {
116 .irq = -1,
117 .status = IRQ_DISABLED,
118 .chip = &no_irq_chip,
119 .handle_irq = handle_bad_irq,
120 .depth = 1,
121 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
122#ifdef CONFIG_SMP
123 .affinity = CPU_MASK_ALL
124#endif
125 }
126};
127
128/* FIXME: use bootmem alloc ...*/
129static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
130
131int __init early_irq_init(void)
132{
133 struct irq_desc *desc;
134 int legacy_count;
135 int i;
136
137 desc = irq_desc_legacy;
138 legacy_count = ARRAY_SIZE(irq_desc_legacy);
139
140 for (i = 0; i < legacy_count; i++) {
141 desc[i].irq = i;
142 desc[i].kstat_irqs = kstat_irqs_legacy[i];
143 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
144
145 irq_desc_ptrs[i] = desc + i;
146 }
147
148 for (i = legacy_count; i < NR_IRQS; i++)
149 irq_desc_ptrs[i] = NULL;
150
151 return arch_early_irq_init();
152}
153
154struct irq_desc *irq_to_desc(unsigned int irq)
155{
156 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
157}
158
159struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
160{
161 struct irq_desc *desc;
162 unsigned long flags;
163 int node;
164
165 if (irq >= NR_IRQS) {
166 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
167 irq, NR_IRQS);
168 WARN_ON(1);
169 return NULL;
170 }
171
172 desc = irq_desc_ptrs[irq];
173 if (desc)
174 return desc;
175
176 spin_lock_irqsave(&sparse_irq_lock, flags);
177
178 /* We have to check it to avoid races with another CPU */
179 desc = irq_desc_ptrs[irq];
180 if (desc)
181 goto out_unlock;
182
183 node = cpu_to_node(cpu);
184 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
185 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
186 irq, cpu, node);
187 if (!desc) {
188 printk(KERN_ERR "can not alloc irq_desc\n");
189 BUG_ON(1);
190 }
191 init_one_irq_desc(irq, desc, cpu);
192
193 irq_desc_ptrs[irq] = desc;
194
195out_unlock:
196 spin_unlock_irqrestore(&sparse_irq_lock, flags);
197
198 return desc;
199}
200
201#else /* !CONFIG_SPARSE_IRQ */
202
52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 203struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
53 [0 ... NR_IRQS-1] = { 204 [0 ... NR_IRQS-1] = {
54 .status = IRQ_DISABLED, 205 .status = IRQ_DISABLED,
@@ -62,6 +213,32 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
62 } 213 }
63}; 214};
64 215
216int __init early_irq_init(void)
217{
218 struct irq_desc *desc;
219 int count;
220 int i;
221
222 desc = irq_desc;
223 count = ARRAY_SIZE(irq_desc);
224
225 for (i = 0; i < count; i++)
226 desc[i].irq = i;
227
228 return arch_early_irq_init();
229}
230
231struct irq_desc *irq_to_desc(unsigned int irq)
232{
233 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
234}
235
236struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
237{
238 return irq_to_desc(irq);
239}
240#endif /* !CONFIG_SPARSE_IRQ */
241
65/* 242/*
66 * What should we do if we get a hw irq event on an illegal vector? 243 * What should we do if we get a hw irq event on an illegal vector?
67 * Each architecture has to answer this themself. 244 * Each architecture has to answer this themself.
@@ -179,8 +356,11 @@ unsigned int __do_IRQ(unsigned int irq)
179 /* 356 /*
180 * No locking required for CPU-local interrupts: 357 * No locking required for CPU-local interrupts:
181 */ 358 */
182 if (desc->chip->ack) 359 if (desc->chip->ack) {
183 desc->chip->ack(irq); 360 desc->chip->ack(irq);
361 /* get new one */
362 desc = irq_remap_to_desc(irq, desc);
363 }
184 if (likely(!(desc->status & IRQ_DISABLED))) { 364 if (likely(!(desc->status & IRQ_DISABLED))) {
185 action_ret = handle_IRQ_event(irq, desc->action); 365 action_ret = handle_IRQ_event(irq, desc->action);
186 if (!noirqdebug) 366 if (!noirqdebug)
@@ -191,8 +371,10 @@ unsigned int __do_IRQ(unsigned int irq)
191 } 371 }
192 372
193 spin_lock(&desc->lock); 373 spin_lock(&desc->lock);
194 if (desc->chip->ack) 374 if (desc->chip->ack) {
195 desc->chip->ack(irq); 375 desc->chip->ack(irq);
376 desc = irq_remap_to_desc(irq, desc);
377 }
196 /* 378 /*
197 * REPLAY is when Linux resends an IRQ that was dropped earlier 379 * REPLAY is when Linux resends an IRQ that was dropped earlier
198 * WAITING is used by probe to mark irqs that are being tested 380 * WAITING is used by probe to mark irqs that are being tested
@@ -259,19 +441,22 @@ out:
259} 441}
260#endif 442#endif
261 443
262
263#ifdef CONFIG_TRACE_IRQFLAGS
264/*
265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
266 */
267static struct lock_class_key irq_desc_lock_class;
268
269void early_init_irq_lock_class(void) 444void early_init_irq_lock_class(void)
270{ 445{
271 struct irq_desc *desc; 446 struct irq_desc *desc;
272 int i; 447 int i;
273 448
274 for_each_irq_desc(i, desc) 449 for_each_irq_desc(i, desc) {
275 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 450 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
451 }
452}
453
454#ifdef CONFIG_SPARSE_IRQ
455unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
456{
457 struct irq_desc *desc = irq_to_desc(irq);
458 return desc ? desc->kstat_irqs[cpu] : 0;
276} 459}
277#endif 460#endif
461EXPORT_SYMBOL(kstat_irqs_cpu);
462
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 64c1c7253dae..e6d0a43cc125 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15 15
16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock;
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
20
16#ifdef CONFIG_PROC_FS 21#ifdef CONFIG_PROC_FS
17extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
18extern void register_handler_proc(unsigned int irq, struct irqaction *action); 23extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c43..cd0cd8dcb345 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,15 @@
16#include "internals.h" 16#include "internals.h"
17 17
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19cpumask_var_t irq_default_affinity;
19 20
20cpumask_t irq_default_affinity = CPU_MASK_ALL; 21static int init_irq_default_affinity(void)
22{
23 alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
24 cpumask_setall(irq_default_affinity);
25 return 0;
26}
27core_initcall(init_irq_default_affinity);
21 28
22/** 29/**
23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 30 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -79,7 +86,7 @@ int irq_can_set_affinity(unsigned int irq)
79 * @cpumask: cpumask 86 * @cpumask: cpumask
80 * 87 *
81 */ 88 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 89int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
83{ 90{
84 struct irq_desc *desc = irq_to_desc(irq); 91 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 92 unsigned long flags;
@@ -91,14 +98,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
91 98
92#ifdef CONFIG_GENERIC_PENDING_IRQ 99#ifdef CONFIG_GENERIC_PENDING_IRQ
93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 100 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
94 desc->affinity = cpumask; 101 cpumask_copy(&desc->affinity, cpumask);
95 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
96 } else { 103 } else {
97 desc->status |= IRQ_MOVE_PENDING; 104 desc->status |= IRQ_MOVE_PENDING;
98 desc->pending_mask = cpumask; 105 cpumask_copy(&desc->pending_mask, cpumask);
99 } 106 }
100#else 107#else
101 desc->affinity = cpumask; 108 cpumask_copy(&desc->affinity, cpumask);
102 desc->chip->set_affinity(irq, cpumask); 109 desc->chip->set_affinity(irq, cpumask);
103#endif 110#endif
104 desc->status |= IRQ_AFFINITY_SET; 111 desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +119,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
112 */ 119 */
113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 120int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
114{ 121{
115 cpumask_t mask;
116
117 if (!irq_can_set_affinity(irq)) 122 if (!irq_can_set_affinity(irq))
118 return 0; 123 return 0;
119 124
120 cpus_and(mask, cpu_online_map, irq_default_affinity);
121
122 /* 125 /*
123 * Preserve an userspace affinity setup, but make sure that 126 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online. 127 * one of the targets is online.
125 */ 128 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 129 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map)) 130 if (cpumask_any_and(&desc->affinity, cpu_online_mask)
128 mask = desc->affinity; 131 < nr_cpu_ids)
132 goto set_affinity;
129 else 133 else
130 desc->status &= ~IRQ_AFFINITY_SET; 134 desc->status &= ~IRQ_AFFINITY_SET;
131 } 135 }
132 136
133 desc->affinity = mask; 137 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
134 desc->chip->set_affinity(irq, mask); 138set_affinity:
139 desc->chip->set_affinity(irq, &desc->affinity);
135 140
136 return 0; 141 return 0;
137} 142}
@@ -370,16 +375,18 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
370 return 0; 375 return 0;
371 } 376 }
372 377
373 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); 378 /* caller masked out all except trigger mode flags */
379 ret = chip->set_type(irq, flags);
374 380
375 if (ret) 381 if (ret)
376 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 382 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
377 (int)(flags & IRQF_TRIGGER_MASK), 383 (int)flags, irq, chip->set_type);
378 irq, chip->set_type);
379 else { 384 else {
385 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
386 flags |= IRQ_LEVEL;
380 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 387 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
381 desc->status &= ~IRQ_TYPE_SENSE_MASK; 388 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
382 desc->status |= flags & IRQ_TYPE_SENSE_MASK; 389 desc->status |= flags;
383 } 390 }
384 391
385 return ret; 392 return ret;
@@ -459,7 +466,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
459 466
460 /* Setup the type (level, edge polarity) if configured: */ 467 /* Setup the type (level, edge polarity) if configured: */
461 if (new->flags & IRQF_TRIGGER_MASK) { 468 if (new->flags & IRQF_TRIGGER_MASK) {
462 ret = __irq_set_trigger(desc, irq, new->flags); 469 ret = __irq_set_trigger(desc, irq,
470 new->flags & IRQF_TRIGGER_MASK);
463 471
464 if (ret) { 472 if (ret) {
465 spin_unlock_irqrestore(&desc->lock, flags); 473 spin_unlock_irqrestore(&desc->lock, flags);
@@ -673,6 +681,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
673 struct irq_desc *desc; 681 struct irq_desc *desc;
674 int retval; 682 int retval;
675 683
684 /*
685 * handle_IRQ_event() always ignores IRQF_DISABLED except for
686 * the _first_ irqaction (sigh). That can cause oopsing, but
687 * the behavior is classified as "will not fix" so we need to
688 * start nudging drivers away from using that idiom.
689 */
690 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
691 == (IRQF_SHARED|IRQF_DISABLED))
692 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
693 "guaranteed on shared IRQs\n",
694 irq, devname);
695
676#ifdef CONFIG_LOCKDEP 696#ifdef CONFIG_LOCKDEP
677 /* 697 /*
678 * Lockdep wants atomic interrupt handlers: 698 * Lockdep wants atomic interrupt handlers:
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d95814..bd72329e630c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
4void move_masked_irq(int irq) 4void move_masked_irq(int irq)
5{ 5{
6 struct irq_desc *desc = irq_to_desc(irq); 6 struct irq_desc *desc = irq_to_desc(irq);
7 cpumask_t tmp;
8 7
9 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 8 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
10 return; 9 return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
19 18
20 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
21 20
22 if (unlikely(cpus_empty(desc->pending_mask))) 21 if (unlikely(cpumask_empty(&desc->pending_mask)))
23 return; 22 return;
24 23
25 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
27 26
28 assert_spin_locked(&desc->lock); 27 assert_spin_locked(&desc->lock);
29 28
30 cpus_and(tmp, desc->pending_mask, cpu_online_map);
31
32 /* 29 /*
33 * If there was a valid mask to work with, please 30 * If there was a valid mask to work with, please
34 * do the disable, re-program, enable sequence. 31 * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
41 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
42 * masking the irqs. 39 * masking the irqs.
43 */ 40 */
44 if (likely(!cpus_empty(tmp))) { 41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
45 desc->chip->set_affinity(irq,tmp); 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity,
44 &desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity);
46 } 46 }
47 cpus_clear(desc->pending_mask); 47 cpumask_clear(&desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644
index 000000000000..ecf765c6a77a
--- /dev/null
+++ b/kernel/irq/numa_migrate.c
@@ -0,0 +1,119 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/module.h>
10#include <linux/random.h>
11#include <linux/interrupt.h>
12#include <linux/kernel_stat.h>
13
14#include "internals.h"
15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc,
18 int cpu, int nr)
19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr);
23
24 if (desc->kstat_irqs != old_desc->kstat_irqs) {
25 /* Compute how many bytes we need per irq and allocate them */
26 bytes = nr * sizeof(unsigned int);
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30}
31
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
33{
34 if (old_desc->kstat_irqs == desc->kstat_irqs)
35 return;
36
37 kfree(old_desc->kstat_irqs);
38 old_desc->kstat_irqs = NULL;
39}
40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu)
43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
49 arch_init_copy_chip_data(old_desc, desc, cpu);
50}
51
52static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
53{
54 free_kstat_irqs(old_desc, desc);
55 arch_free_chip_data(old_desc, desc);
56}
57
58static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
59 int cpu)
60{
61 struct irq_desc *desc;
62 unsigned int irq;
63 unsigned long flags;
64 int node;
65
66 irq = old_desc->irq;
67
68 spin_lock_irqsave(&sparse_irq_lock, flags);
69
70 /* We have to check it to avoid races with another CPU */
71 desc = irq_desc_ptrs[irq];
72
73 if (desc && old_desc != desc)
74 goto out_unlock;
75
76 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
78 if (!desc) {
79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
80 /* still use old one */
81 desc = old_desc;
82 goto out_unlock;
83 }
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85
86 irq_desc_ptrs[irq] = desc;
87
88 /* free the old one */
89 free_one_irq_desc(old_desc, desc);
90 kfree(old_desc);
91
92out_unlock:
93 spin_unlock_irqrestore(&sparse_irq_lock, flags);
94
95 return desc;
96}
97
98struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
99{
100 int old_cpu;
101 int node, old_node;
102
103 /* those all static, do move them */
104 if (desc->irq < NR_IRQS_LEGACY)
105 return desc;
106
107 old_cpu = desc->cpu;
108 if (old_cpu != cpu) {
109 node = cpu_to_node(cpu);
110 old_node = cpu_to_node(old_cpu);
111 if (old_node != node)
112 desc = __real_move_irq_desc(desc, cpu);
113 else
114 desc->cpu = cpu;
115 }
116
117 return desc;
118}
119
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d257e7d6a8a4..aae3f742bcec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 const struct cpumask *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
40 const char __user *buffer, size_t count, loff_t *pos) 40 const char __user *buffer, size_t count, loff_t *pos)
41{ 41{
42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
43 cpumask_t new_value; 43 cpumask_var_t new_value;
44 int err; 44 int err;
45 45
46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
47 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
48 return -EIO; 48 return -EIO;
49 49
50 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
51 return -ENOMEM;
52
50 err = cpumask_parse_user(buffer, count, new_value); 53 err = cpumask_parse_user(buffer, count, new_value);
51 if (err) 54 if (err)
52 return err; 55 goto free_cpumask;
53 56
54 if (!is_affinity_mask_valid(new_value)) 57 if (!is_affinity_mask_valid(new_value)) {
55 return -EINVAL; 58 err = -EINVAL;
59 goto free_cpumask;
60 }
56 61
57 /* 62 /*
58 * Do not allow disabling IRQs completely - it's a too easy 63 * Do not allow disabling IRQs completely - it's a too easy
59 * way to make the system unusable accidentally :-) At least 64 * way to make the system unusable accidentally :-) At least
60 * one online CPU still has to be targeted. 65 * one online CPU still has to be targeted.
61 */ 66 */
62 if (!cpus_intersects(new_value, cpu_online_map)) 67 if (!cpumask_intersects(new_value, cpu_online_mask)) {
63 /* Special case for empty set - allow the architecture 68 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 69 code to set default SMP affinity. */
65 return irq_select_affinity_usr(irq) ? -EINVAL : count; 70 err = irq_select_affinity_usr(irq) ? -EINVAL : count;
66 71 } else {
67 irq_set_affinity(irq, new_value); 72 irq_set_affinity(irq, new_value);
73 err = count;
74 }
68 75
69 return count; 76free_cpumask:
77 free_cpumask_var(new_value);
78 return err;
70} 79}
71 80
72static int irq_affinity_proc_open(struct inode *inode, struct file *file) 81static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -84,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
84 93
85static int default_affinity_show(struct seq_file *m, void *v) 94static int default_affinity_show(struct seq_file *m, void *v)
86{ 95{
87 seq_cpumask(m, &irq_default_affinity); 96 seq_cpumask(m, irq_default_affinity);
88 seq_putc(m, '\n'); 97 seq_putc(m, '\n');
89 return 0; 98 return 0;
90} 99}
@@ -92,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
92static ssize_t default_affinity_write(struct file *file, 101static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos) 102 const char __user *buffer, size_t count, loff_t *ppos)
94{ 103{
95 cpumask_t new_value; 104 cpumask_var_t new_value;
96 int err; 105 int err;
97 106
107 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
108 return -ENOMEM;
109
98 err = cpumask_parse_user(buffer, count, new_value); 110 err = cpumask_parse_user(buffer, count, new_value);
99 if (err) 111 if (err)
100 return err; 112 goto out;
101 113
102 if (!is_affinity_mask_valid(new_value)) 114 if (!is_affinity_mask_valid(new_value)) {
103 return -EINVAL; 115 err = -EINVAL;
116 goto out;
117 }
104 118
105 /* 119 /*
106 * Do not allow disabling IRQs completely - it's a too easy 120 * Do not allow disabling IRQs completely - it's a too easy
107 * way to make the system unusable accidentally :-) At least 121 * way to make the system unusable accidentally :-) At least
108 * one online CPU still has to be targeted. 122 * one online CPU still has to be targeted.
109 */ 123 */
110 if (!cpus_intersects(new_value, cpu_online_map)) 124 if (!cpumask_intersects(new_value, cpu_online_mask)) {
111 return -EINVAL; 125 err = -EINVAL;
126 goto out;
127 }
112 128
113 irq_default_affinity = new_value; 129 cpumask_copy(irq_default_affinity, new_value);
130 err = count;
114 131
115 return count; 132out:
133 free_cpumask_var(new_value);
134 return err;
116} 135}
117 136
118static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
@@ -243,7 +262,11 @@ void init_irq_proc(void)
243 /* 262 /*
244 * Create entries for all existing IRQs. 263 * Create entries for all existing IRQs.
245 */ 264 */
246 for_each_irq_desc(irq, desc) 265 for_each_irq_desc(irq, desc) {
266 if (!desc)
267 continue;
268
247 register_irq_proc(irq, desc); 269 register_irq_proc(irq, desc);
270 }
248} 271}
249 272
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..e694afa0eb8c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,20 +30,19 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33extern const unsigned long kallsyms_addresses[];
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const u8 kallsyms_names[];
35extern const u8 kallsyms_names[] __attribute__((weak));
36 35
37/* tell the compiler that the count isn't in the small data section if the arch 36/* tell the compiler that the count isn't in the small data section if the arch
38 * has one (eg: FRV) 37 * has one (eg: FRV)
39 */ 38 */
40extern const unsigned long kallsyms_num_syms 39extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 40 __attribute__((__section__(".rodata")));
42 41
43extern const u8 kallsyms_token_table[] __attribute__((weak)); 42extern const u8 kallsyms_token_table[];
44extern const u16 kallsyms_token_index[] __attribute__((weak)); 43extern const u16 kallsyms_token_index[];
45 44
46extern const unsigned long kallsyms_markers[] __attribute__((weak)); 45extern const unsigned long kallsyms_markers[];
47 46
48static inline int is_kernel_inittext(unsigned long addr) 47static inline int is_kernel_inittext(unsigned long addr)
49{ 48{
@@ -168,9 +167,6 @@ static unsigned long get_symbol_pos(unsigned long addr,
168 unsigned long symbol_start = 0, symbol_end = 0; 167 unsigned long symbol_start = 0, symbol_end = 0;
169 unsigned long i, low, high, mid; 168 unsigned long i, low, high, mid;
170 169
171 /* This kernel should never had been booted. */
172 BUG_ON(!kallsyms_addresses);
173
174 /* do a binary search on the sorted kallsyms_addresses array */ 170 /* do a binary search on the sorted kallsyms_addresses array */
175 low = 0; 171 low = 0;
176 high = kallsyms_num_syms; 172 high = kallsyms_num_syms;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ac0fde7b54d0..3fb855ad6aa0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1116 struct elf_prstatus prstatus; 1116 struct elf_prstatus prstatus;
1117 u32 *buf; 1117 u32 *buf;
1118 1118
1119 if ((cpu < 0) || (cpu >= NR_CPUS)) 1119 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1120 return; 1120 return;
1121 1121
1122 /* Using ELF notes here is opportunistic. 1122 /* Using ELF notes here is opportunistic.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3d3c3ea3a023..a27a5f64443d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * request_module - try to load a kernel module
54 * @fmt: printf style format string for the name of the module 54 * @fmt: printf style format string for the name of the module
55 * @varargs: arguements as specified in the format string 55 * @...: arguments as specified in the format string
56 * 56 *
57 * Load a module using the user mode module loader. The function returns 57 * Load a module using the user mode module loader. The function returns
58 * zero on success or a negative errno code on failure. Note that a 58 * zero on success or a negative errno code on failure. Note that a
@@ -118,10 +118,10 @@ EXPORT_SYMBOL(request_module);
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
120 struct completion *complete; 120 struct completion *complete;
121 struct cred *cred;
121 char *path; 122 char *path;
122 char **argv; 123 char **argv;
123 char **envp; 124 char **envp;
124 struct key *ring;
125 enum umh_wait wait; 125 enum umh_wait wait;
126 int retval; 126 int retval;
127 struct file *stdin; 127 struct file *stdin;
@@ -134,19 +134,20 @@ struct subprocess_info {
134static int ____call_usermodehelper(void *data) 134static int ____call_usermodehelper(void *data)
135{ 135{
136 struct subprocess_info *sub_info = data; 136 struct subprocess_info *sub_info = data;
137 struct key *new_session, *old_session;
138 int retval; 137 int retval;
139 138
140 /* Unblock all signals and set the session keyring. */ 139 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
141 new_session = key_get(sub_info->ring); 140
141 /* Unblock all signals */
142 spin_lock_irq(&current->sighand->siglock); 142 spin_lock_irq(&current->sighand->siglock);
143 old_session = __install_session_keyring(current, new_session);
144 flush_signal_handlers(current, 1); 143 flush_signal_handlers(current, 1);
145 sigemptyset(&current->blocked); 144 sigemptyset(&current->blocked);
146 recalc_sigpending(); 145 recalc_sigpending();
147 spin_unlock_irq(&current->sighand->siglock); 146 spin_unlock_irq(&current->sighand->siglock);
148 147
149 key_put(old_session); 148 /* Install the credentials */
149 commit_creds(sub_info->cred);
150 sub_info->cred = NULL;
150 151
151 /* Install input pipe when needed */ 152 /* Install input pipe when needed */
152 if (sub_info->stdin) { 153 if (sub_info->stdin) {
@@ -185,6 +186,8 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
185{ 186{
186 if (info->cleanup) 187 if (info->cleanup)
187 (*info->cleanup)(info->argv, info->envp); 188 (*info->cleanup)(info->argv, info->envp);
189 if (info->cred)
190 put_cred(info->cred);
188 kfree(info); 191 kfree(info);
189} 192}
190EXPORT_SYMBOL(call_usermodehelper_freeinfo); 193EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -240,6 +243,8 @@ static void __call_usermodehelper(struct work_struct *work)
240 pid_t pid; 243 pid_t pid;
241 enum umh_wait wait = sub_info->wait; 244 enum umh_wait wait = sub_info->wait;
242 245
246 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
247
243 /* CLONE_VFORK: wait until the usermode helper has execve'd 248 /* CLONE_VFORK: wait until the usermode helper has execve'd
244 * successfully We need the data structures to stay around 249 * successfully We need the data structures to stay around
245 * until that is done. */ 250 * until that is done. */
@@ -362,6 +367,9 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
362 sub_info->path = path; 367 sub_info->path = path;
363 sub_info->argv = argv; 368 sub_info->argv = argv;
364 sub_info->envp = envp; 369 sub_info->envp = envp;
370 sub_info->cred = prepare_usermodehelper_creds();
371 if (!sub_info->cred)
372 return NULL;
365 373
366 out: 374 out:
367 return sub_info; 375 return sub_info;
@@ -376,7 +384,13 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
376void call_usermodehelper_setkeys(struct subprocess_info *info, 384void call_usermodehelper_setkeys(struct subprocess_info *info,
377 struct key *session_keyring) 385 struct key *session_keyring)
378{ 386{
379 info->ring = session_keyring; 387#ifdef CONFIG_KEYS
388 struct thread_group_cred *tgcred = info->cred->tgcred;
389 key_put(tgcred->session_keyring);
390 tgcred->session_keyring = key_get(session_keyring);
391#else
392 BUG();
393#endif
380} 394}
381EXPORT_SYMBOL(call_usermodehelper_setkeys); 395EXPORT_SYMBOL(call_usermodehelper_setkeys);
382 396
@@ -444,6 +458,8 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
444 DECLARE_COMPLETION_ONSTACK(done); 458 DECLARE_COMPLETION_ONSTACK(done);
445 int retval = 0; 459 int retval = 0;
446 460
461 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
462
447 helper_lock(); 463 helper_lock();
448 if (sub_info->path[0] == '\0') 464 if (sub_info->path[0] == '\0')
449 goto out; 465 goto out;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f8a3f25259a..1b9cbdc0127a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69/* NOTE: change this value only with kprobe_mutex held */ 69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 70static bool kprobe_enabled;
71 71
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned_in_smp; 75 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -115,6 +115,7 @@ enum kprobe_slot_state {
115 SLOT_USED = 2, 115 SLOT_USED = 2,
116}; 116};
117 117
118static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
118static struct hlist_head kprobe_insn_pages; 119static struct hlist_head kprobe_insn_pages;
119static int kprobe_garbage_slots; 120static int kprobe_garbage_slots;
120static int collect_garbage_slots(void); 121static int collect_garbage_slots(void);
@@ -144,10 +145,10 @@ loop_end:
144} 145}
145 146
146/** 147/**
147 * get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
148 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
149 */ 150 */
150kprobe_opcode_t __kprobes *get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(void)
151{ 152{
152 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
153 struct hlist_node *pos; 154 struct hlist_node *pos;
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
196 return kip->insns; 197 return kip->insns;
197} 198}
198 199
200kprobe_opcode_t __kprobes *get_insn_slot(void)
201{
202 kprobe_opcode_t *ret;
203 mutex_lock(&kprobe_insn_mutex);
204 ret = __get_insn_slot();
205 mutex_unlock(&kprobe_insn_mutex);
206 return ret;
207}
208
199/* Return 1 if all garbages are collected, otherwise 0. */ 209/* Return 1 if all garbages are collected, otherwise 0. */
200static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 210static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
201{ 211{
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void)
226{ 236{
227 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
228 struct hlist_node *pos, *next; 238 struct hlist_node *pos, *next;
239 int safety;
229 240
230 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
231 if (check_safety() != 0) 242 mutex_unlock(&kprobe_insn_mutex);
243 safety = check_safety();
244 mutex_lock(&kprobe_insn_mutex);
245 if (safety != 0)
232 return -EAGAIN; 246 return -EAGAIN;
233 247
234 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 248 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
251 struct kprobe_insn_page *kip; 265 struct kprobe_insn_page *kip;
252 struct hlist_node *pos; 266 struct hlist_node *pos;
253 267
268 mutex_lock(&kprobe_insn_mutex);
254 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 269 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
255 if (kip->insns <= slot && 270 if (kip->insns <= slot &&
256 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 271 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
267 282
268 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 283 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
269 collect_garbage_slots(); 284 collect_garbage_slots();
285
286 mutex_unlock(&kprobe_insn_mutex);
270} 287}
271#endif 288#endif
272 289
@@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
310 struct kprobe *kp; 327 struct kprobe *kp;
311 328
312 list_for_each_entry_rcu(kp, &p->list, list) { 329 list_for_each_entry_rcu(kp, &p->list, list) {
313 if (kp->pre_handler) { 330 if (kp->pre_handler && !kprobe_gone(kp)) {
314 set_kprobe_instance(kp); 331 set_kprobe_instance(kp);
315 if (kp->pre_handler(kp, regs)) 332 if (kp->pre_handler(kp, regs))
316 return 1; 333 return 1;
@@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
326 struct kprobe *kp; 343 struct kprobe *kp;
327 344
328 list_for_each_entry_rcu(kp, &p->list, list) { 345 list_for_each_entry_rcu(kp, &p->list, list) {
329 if (kp->post_handler) { 346 if (kp->post_handler && !kprobe_gone(kp)) {
330 set_kprobe_instance(kp); 347 set_kprobe_instance(kp);
331 kp->post_handler(kp, regs, flags); 348 kp->post_handler(kp, regs, flags);
332 reset_kprobe_instance(); 349 reset_kprobe_instance();
@@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
393 hlist_add_head(&ri->hlist, head); 410 hlist_add_head(&ri->hlist, head);
394} 411}
395 412
396void kretprobe_hash_lock(struct task_struct *tsk, 413void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags) 414 struct hlist_head **head, unsigned long *flags)
398{ 415{
399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 416 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 421 spin_lock_irqsave(hlist_lock, *flags);
405} 422}
406 423
407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 424static void __kprobes kretprobe_table_lock(unsigned long hash,
425 unsigned long *flags)
408{ 426{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 427 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 428 spin_lock_irqsave(hlist_lock, *flags);
411} 429}
412 430
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) 431void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
432 unsigned long *flags)
414{ 433{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 434 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock; 435 spinlock_t *hlist_lock;
@@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
419 spin_unlock_irqrestore(hlist_lock, *flags); 438 spin_unlock_irqrestore(hlist_lock, *flags);
420} 439}
421 440
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 441void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{ 442{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 443 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags); 444 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
526 ap->addr = p->addr; 545 ap->addr = p->addr;
527 ap->pre_handler = aggr_pre_handler; 546 ap->pre_handler = aggr_pre_handler;
528 ap->fault_handler = aggr_fault_handler; 547 ap->fault_handler = aggr_fault_handler;
529 if (p->post_handler) 548 /* We don't care the kprobe which has gone. */
549 if (p->post_handler && !kprobe_gone(p))
530 ap->post_handler = aggr_post_handler; 550 ap->post_handler = aggr_post_handler;
531 if (p->break_handler) 551 if (p->break_handler && !kprobe_gone(p))
532 ap->break_handler = aggr_break_handler; 552 ap->break_handler = aggr_break_handler;
533 553
534 INIT_LIST_HEAD(&ap->list); 554 INIT_LIST_HEAD(&ap->list);
@@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
547 int ret = 0; 567 int ret = 0;
548 struct kprobe *ap; 568 struct kprobe *ap;
549 569
570 if (kprobe_gone(old_p)) {
571 /*
572 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe.
576 */
577 ret = arch_prepare_kprobe(old_p);
578 if (ret)
579 return ret;
580 }
550 if (old_p->pre_handler == aggr_pre_handler) { 581 if (old_p->pre_handler == aggr_pre_handler) {
551 copy_kprobe(old_p, p); 582 copy_kprobe(old_p, p);
552 ret = add_new_kprobe(old_p, p); 583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
553 } else { 585 } else {
554 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
555 if (!ap) 587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
556 return -ENOMEM; 590 return -ENOMEM;
591 }
557 add_aggr_kprobe(ap, old_p); 592 add_aggr_kprobe(ap, old_p);
558 copy_kprobe(ap, p); 593 copy_kprobe(ap, p);
559 ret = add_new_kprobe(ap, p); 594 ret = add_new_kprobe(ap, p);
560 } 595 }
596 if (kprobe_gone(old_p)) {
597 /*
598 * If the old_p has gone, its breakpoint has been disarmed.
599 * We have to arm it again after preparing real kprobes.
600 */
601 ap->flags &= ~KPROBE_FLAG_GONE;
602 if (kprobe_enabled)
603 arch_arm_kprobe(ap);
604 }
561 return ret; 605 return ret;
562} 606}
563 607
@@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
600 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 644 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
601} 645}
602 646
603static int __kprobes __register_kprobe(struct kprobe *p, 647int __kprobes register_kprobe(struct kprobe *p)
604 unsigned long called_from)
605{ 648{
606 int ret = 0; 649 int ret = 0;
607 struct kprobe *old_p; 650 struct kprobe *old_p;
@@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p,
620 return -EINVAL; 663 return -EINVAL;
621 } 664 }
622 665
623 p->mod_refcounted = 0; 666 p->flags = 0;
624
625 /* 667 /*
626 * Check if are we probing a module. 668 * Check if are we probing a module.
627 */ 669 */
628 probed_mod = __module_text_address((unsigned long) p->addr); 670 probed_mod = __module_text_address((unsigned long) p->addr);
629 if (probed_mod) { 671 if (probed_mod) {
630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
632 /* 672 /*
633 * We must allow modules to probe themself and in this case 673 * We must hold a refcount of the probed module while updating
634 * avoid incrementing the module refcount, so as to allow 674 * its code to prohibit unexpected unloading.
635 * unloading of self probing modules.
636 */ 675 */
637 if (calling_mod && calling_mod != probed_mod) { 676 if (unlikely(!try_module_get(probed_mod))) {
638 if (unlikely(!try_module_get(probed_mod))) { 677 preempt_enable();
639 preempt_enable(); 678 return -EINVAL;
640 return -EINVAL; 679 }
641 } 680 /*
642 p->mod_refcounted = 1; 681 * If the module freed .init.text, we couldn't insert
643 } else 682 * kprobes in there.
644 probed_mod = NULL; 683 */
684 if (within_module_init((unsigned long)p->addr, probed_mod) &&
685 probed_mod->state != MODULE_STATE_COMING) {
686 module_put(probed_mod);
687 preempt_enable();
688 return -EINVAL;
689 }
645 } 690 }
646 preempt_enable(); 691 preempt_enable();
647 692
@@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
668out: 713out:
669 mutex_unlock(&kprobe_mutex); 714 mutex_unlock(&kprobe_mutex);
670 715
671 if (ret && probed_mod) 716 if (probed_mod)
672 module_put(probed_mod); 717 module_put(probed_mod);
718
673 return ret; 719 return ret;
674} 720}
675 721
@@ -697,16 +743,16 @@ valid_p:
697 list_is_singular(&old_p->list))) { 743 list_is_singular(&old_p->list))) {
698 /* 744 /*
699 * Only probe on the hash list. Disarm only if kprobes are 745 * Only probe on the hash list. Disarm only if kprobes are
700 * enabled - otherwise, the breakpoint would already have 746 * enabled and not gone - otherwise, the breakpoint would
701 * been removed. We save on flushing icache. 747 * already have been removed. We save on flushing icache.
702 */ 748 */
703 if (kprobe_enabled) 749 if (kprobe_enabled && !kprobe_gone(old_p))
704 arch_disarm_kprobe(p); 750 arch_disarm_kprobe(p);
705 hlist_del_rcu(&old_p->hlist); 751 hlist_del_rcu(&old_p->hlist);
706 } else { 752 } else {
707 if (p->break_handler) 753 if (p->break_handler && !kprobe_gone(p))
708 old_p->break_handler = NULL; 754 old_p->break_handler = NULL;
709 if (p->post_handler) { 755 if (p->post_handler && !kprobe_gone(p)) {
710 list_for_each_entry_rcu(list_p, &old_p->list, list) { 756 list_for_each_entry_rcu(list_p, &old_p->list, list) {
711 if ((list_p != p) && (list_p->post_handler)) 757 if ((list_p != p) && (list_p->post_handler))
712 goto noclean; 758 goto noclean;
@@ -721,39 +767,27 @@ noclean:
721 767
722static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 768static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
723{ 769{
724 struct module *mod;
725 struct kprobe *old_p; 770 struct kprobe *old_p;
726 771
727 if (p->mod_refcounted) { 772 if (list_empty(&p->list))
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
732 mod = module_text_address((unsigned long)p->addr);
733 if (mod)
734 module_put(mod);
735 }
736
737 if (list_empty(&p->list) || list_is_singular(&p->list)) {
738 if (!list_empty(&p->list)) {
739 /* "p" is the last child of an aggr_kprobe */
740 old_p = list_entry(p->list.next, struct kprobe, list);
741 list_del(&p->list);
742 kfree(old_p);
743 }
744 arch_remove_kprobe(p); 773 arch_remove_kprobe(p);
774 else if (list_is_singular(&p->list)) {
775 /* "p" is the last child of an aggr_kprobe */
776 old_p = list_entry(p->list.next, struct kprobe, list);
777 list_del(&p->list);
778 arch_remove_kprobe(old_p);
779 kfree(old_p);
745 } 780 }
746} 781}
747 782
748static int __register_kprobes(struct kprobe **kps, int num, 783int __kprobes register_kprobes(struct kprobe **kps, int num)
749 unsigned long called_from)
750{ 784{
751 int i, ret = 0; 785 int i, ret = 0;
752 786
753 if (num <= 0) 787 if (num <= 0)
754 return -EINVAL; 788 return -EINVAL;
755 for (i = 0; i < num; i++) { 789 for (i = 0; i < num; i++) {
756 ret = __register_kprobe(kps[i], called_from); 790 ret = register_kprobe(kps[i]);
757 if (ret < 0) { 791 if (ret < 0) {
758 if (i > 0) 792 if (i > 0)
759 unregister_kprobes(kps, i); 793 unregister_kprobes(kps, i);
@@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num,
763 return ret; 797 return ret;
764} 798}
765 799
766/*
767 * Registration and unregistration functions for kprobe.
768 */
769int __kprobes register_kprobe(struct kprobe *p)
770{
771 return __register_kprobes(&p, 1,
772 (unsigned long)__builtin_return_address(0));
773}
774
775void __kprobes unregister_kprobe(struct kprobe *p) 800void __kprobes unregister_kprobe(struct kprobe *p)
776{ 801{
777 unregister_kprobes(&p, 1); 802 unregister_kprobes(&p, 1);
778} 803}
779 804
780int __kprobes register_kprobes(struct kprobe **kps, int num)
781{
782 return __register_kprobes(kps, num,
783 (unsigned long)__builtin_return_address(0));
784}
785
786void __kprobes unregister_kprobes(struct kprobe **kps, int num) 805void __kprobes unregister_kprobes(struct kprobe **kps, int num)
787{ 806{
788 int i; 807 int i;
@@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
811 return (unsigned long)entry; 830 return (unsigned long)entry;
812} 831}
813 832
814static int __register_jprobes(struct jprobe **jps, int num, 833int __kprobes register_jprobes(struct jprobe **jps, int num)
815 unsigned long called_from)
816{ 834{
817 struct jprobe *jp; 835 struct jprobe *jp;
818 int ret = 0, i; 836 int ret = 0, i;
@@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
830 /* Todo: Verify probepoint is a function entry point */ 848 /* Todo: Verify probepoint is a function entry point */
831 jp->kp.pre_handler = setjmp_pre_handler; 849 jp->kp.pre_handler = setjmp_pre_handler;
832 jp->kp.break_handler = longjmp_break_handler; 850 jp->kp.break_handler = longjmp_break_handler;
833 ret = __register_kprobe(&jp->kp, called_from); 851 ret = register_kprobe(&jp->kp);
834 } 852 }
835 if (ret < 0) { 853 if (ret < 0) {
836 if (i > 0) 854 if (i > 0)
@@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
843 861
844int __kprobes register_jprobe(struct jprobe *jp) 862int __kprobes register_jprobe(struct jprobe *jp)
845{ 863{
846 return __register_jprobes(&jp, 1, 864 return register_jprobes(&jp, 1);
847 (unsigned long)__builtin_return_address(0));
848} 865}
849 866
850void __kprobes unregister_jprobe(struct jprobe *jp) 867void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
852 unregister_jprobes(&jp, 1); 869 unregister_jprobes(&jp, 1);
853} 870}
854 871
855int __kprobes register_jprobes(struct jprobe **jps, int num)
856{
857 return __register_jprobes(jps, num,
858 (unsigned long)__builtin_return_address(0));
859}
860
861void __kprobes unregister_jprobes(struct jprobe **jps, int num) 872void __kprobes unregister_jprobes(struct jprobe **jps, int num)
862{ 873{
863 int i; 874 int i;
@@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
920 return 0; 931 return 0;
921} 932}
922 933
923static int __kprobes __register_kretprobe(struct kretprobe *rp, 934int __kprobes register_kretprobe(struct kretprobe *rp)
924 unsigned long called_from)
925{ 935{
926 int ret = 0; 936 int ret = 0;
927 struct kretprobe_instance *inst; 937 struct kretprobe_instance *inst;
@@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
967 977
968 rp->nmissed = 0; 978 rp->nmissed = 0;
969 /* Establish function entry probe point */ 979 /* Establish function entry probe point */
970 ret = __register_kprobe(&rp->kp, called_from); 980 ret = register_kprobe(&rp->kp);
971 if (ret != 0) 981 if (ret != 0)
972 free_rp_inst(rp); 982 free_rp_inst(rp);
973 return ret; 983 return ret;
974} 984}
975 985
976static int __register_kretprobes(struct kretprobe **rps, int num, 986int __kprobes register_kretprobes(struct kretprobe **rps, int num)
977 unsigned long called_from)
978{ 987{
979 int ret = 0, i; 988 int ret = 0, i;
980 989
981 if (num <= 0) 990 if (num <= 0)
982 return -EINVAL; 991 return -EINVAL;
983 for (i = 0; i < num; i++) { 992 for (i = 0; i < num; i++) {
984 ret = __register_kretprobe(rps[i], called_from); 993 ret = register_kretprobe(rps[i]);
985 if (ret < 0) { 994 if (ret < 0) {
986 if (i > 0) 995 if (i > 0)
987 unregister_kretprobes(rps, i); 996 unregister_kretprobes(rps, i);
@@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
991 return ret; 1000 return ret;
992} 1001}
993 1002
994int __kprobes register_kretprobe(struct kretprobe *rp)
995{
996 return __register_kretprobes(&rp, 1,
997 (unsigned long)__builtin_return_address(0));
998}
999
1000void __kprobes unregister_kretprobe(struct kretprobe *rp) 1003void __kprobes unregister_kretprobe(struct kretprobe *rp)
1001{ 1004{
1002 unregister_kretprobes(&rp, 1); 1005 unregister_kretprobes(&rp, 1);
1003} 1006}
1004 1007
1005int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1006{
1007 return __register_kretprobes(rps, num,
1008 (unsigned long)__builtin_return_address(0));
1009}
1010
1011void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1012{ 1009{
1013 int i; 1010 int i;
@@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1055 1052
1056#endif /* CONFIG_KRETPROBES */ 1053#endif /* CONFIG_KRETPROBES */
1057 1054
1055/* Set the kprobe gone and remove its instruction buffer. */
1056static void __kprobes kill_kprobe(struct kprobe *p)
1057{
1058 struct kprobe *kp;
1059 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) {
1061 /*
1062 * If this is an aggr_kprobe, we have to list all the
1063 * chained probes and mark them GONE.
1064 */
1065 list_for_each_entry_rcu(kp, &p->list, list)
1066 kp->flags |= KPROBE_FLAG_GONE;
1067 p->post_handler = NULL;
1068 p->break_handler = NULL;
1069 }
1070 /*
1071 * Here, we can remove insn_slot safely, because no thread calls
1072 * the original probed function (which will be freed soon) any more.
1073 */
1074 arch_remove_kprobe(p);
1075}
1076
1077/* Module notifier call back, checking kprobes on the module */
1078static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1079 unsigned long val, void *data)
1080{
1081 struct module *mod = data;
1082 struct hlist_head *head;
1083 struct hlist_node *node;
1084 struct kprobe *p;
1085 unsigned int i;
1086 int checkcore = (val == MODULE_STATE_GOING);
1087
1088 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
1089 return NOTIFY_DONE;
1090
1091 /*
1092 * When MODULE_STATE_GOING was notified, both of module .text and
1093 * .init.text sections would be freed. When MODULE_STATE_LIVE was
1094 * notified, only .init.text section would be freed. We need to
1095 * disable kprobes which have been inserted in the sections.
1096 */
1097 mutex_lock(&kprobe_mutex);
1098 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1099 head = &kprobe_table[i];
1100 hlist_for_each_entry_rcu(p, node, head, hlist)
1101 if (within_module_init((unsigned long)p->addr, mod) ||
1102 (checkcore &&
1103 within_module_core((unsigned long)p->addr, mod))) {
1104 /*
1105 * The vaddr this probe is installed will soon
1106 * be vfreed buy not synced to disk. Hence,
1107 * disarming the breakpoint isn't needed.
1108 */
1109 kill_kprobe(p);
1110 }
1111 }
1112 mutex_unlock(&kprobe_mutex);
1113 return NOTIFY_DONE;
1114}
1115
1116static struct notifier_block kprobe_module_nb = {
1117 .notifier_call = kprobes_module_callback,
1118 .priority = 0
1119};
1120
1058static int __init init_kprobes(void) 1121static int __init init_kprobes(void)
1059{ 1122{
1060 int i, err = 0; 1123 int i, err = 0;
@@ -1111,6 +1174,9 @@ static int __init init_kprobes(void)
1111 err = arch_init_kprobes(); 1174 err = arch_init_kprobes();
1112 if (!err) 1175 if (!err)
1113 err = register_die_notifier(&kprobe_exceptions_nb); 1176 err = register_die_notifier(&kprobe_exceptions_nb);
1177 if (!err)
1178 err = register_module_notifier(&kprobe_module_nb);
1179
1114 kprobes_initialized = (err == 0); 1180 kprobes_initialized = (err == 0);
1115 1181
1116 if (!err) 1182 if (!err)
@@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1131 else 1197 else
1132 kprobe_type = "k"; 1198 kprobe_type = "k";
1133 if (sym) 1199 if (sym)
1134 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, 1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
1135 sym, offset, (modname ? modname : " ")); 1201 sym, offset, (modname ? modname : " "),
1202 (kprobe_gone(p) ? "[GONE]" : ""));
1136 else 1203 else
1137 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); 1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
1205 (kprobe_gone(p) ? "[GONE]" : ""));
1138} 1206}
1139 1207
1140static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void)
1215 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1216 head = &kprobe_table[i]; 1284 head = &kprobe_table[i];
1217 hlist_for_each_entry_rcu(p, node, head, hlist) 1285 hlist_for_each_entry_rcu(p, node, head, hlist)
1218 arch_arm_kprobe(p); 1286 if (!kprobe_gone(p))
1287 arch_arm_kprobe(p);
1219 } 1288 }
1220 1289
1221 kprobe_enabled = true; 1290 kprobe_enabled = true;
@@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void)
1244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1245 head = &kprobe_table[i]; 1314 head = &kprobe_table[i];
1246 hlist_for_each_entry_rcu(p, node, head, hlist) { 1315 hlist_for_each_entry_rcu(p, node, head, hlist) {
1247 if (!arch_trampoline_kprobe(p)) 1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
1248 arch_disarm_kprobe(p); 1317 arch_disarm_kprobe(p);
1249 } 1318 }
1250 } 1319 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 08dd8ed86c77..528dd78e7e7e 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
24static struct kobj_attribute _name##_attr = \ 24static struct kobj_attribute _name##_attr = \
25 __ATTR(_name, 0644, _name##_show, _name##_store) 25 __ATTR(_name, 0644, _name##_show, _name##_store)
26 26
27#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 27#if defined(CONFIG_HOTPLUG)
28/* current uevent sequence number */ 28/* current uevent sequence number */
29static ssize_t uevent_seqnum_show(struct kobject *kobj, 29static ssize_t uevent_seqnum_show(struct kobject *kobj,
30 struct kobj_attribute *attr, char *buf) 30 struct kobj_attribute *attr, char *buf)
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj;
137EXPORT_SYMBOL_GPL(kernel_kobj); 137EXPORT_SYMBOL_GPL(kernel_kobj);
138 138
139static struct attribute * kernel_attrs[] = { 139static struct attribute * kernel_attrs[] = {
140#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 140#if defined(CONFIG_HOTPLUG)
141 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
142 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
143#endif 143#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0a..4fbc456f393d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
23 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
24struct kthread_create_info 27struct kthread_create_info
25{ 28{
26 /* Information passed to kthread() from kthreadd. */ 29 /* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 46a404173db2..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -25,6 +25,7 @@
25 * Thanks to Arjan van de Ven for coming up with the initial idea of 25 * Thanks to Arjan van de Ven for coming up with the initial idea of
26 * mapping lock dependencies runtime. 26 * mapping lock dependencies runtime.
27 */ 27 */
28#define DISABLE_BRANCH_PROFILING
28#include <linux/mutex.h> 29#include <linux/mutex.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/delay.h> 31#include <linux/delay.h>
@@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
136#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
138 139
139static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
140{ 141{
141 int i; 142 int i;
142 143
143 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
144 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
145 class->contention_point[i] = ip; 146 points[i] = ip;
146 break; 147 break;
147 } 148 }
148 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
149 break; 150 break;
150 } 151 }
151 152
@@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
186 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
187 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
188 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
189 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
190 194
@@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
209 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
210 } 214 }
211 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
212} 217}
213 218
214static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -287,14 +292,12 @@ void lockdep_off(void)
287{ 292{
288 current->lockdep_recursion++; 293 current->lockdep_recursion++;
289} 294}
290
291EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
292 296
293void lockdep_on(void) 297void lockdep_on(void)
294{ 298{
295 current->lockdep_recursion--; 299 current->lockdep_recursion--;
296} 300}
297
298EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
299 302
300/* 303/*
@@ -576,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
576/* 579/*
577 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
578 */ 581 */
579static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
580{ 584{
581 struct lock_list *entry; 585 struct lock_list *entry;
582 586
@@ -2508,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2508 if (subclass) 2512 if (subclass)
2509 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2510} 2514}
2511
2512EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2513 2516
2514/* 2517/*
@@ -2689,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2689} 2692}
2690 2693
2691static int 2694static int
2692__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2693 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2694{ 2698{
2695 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2696 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2717,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2717 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2718 2722
2719found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2720 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2721 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2722 2727
@@ -2901,9 +2906,9 @@ static void check_flags(unsigned long flags)
2901#endif 2906#endif
2902} 2907}
2903 2908
2904void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2905lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2906 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2907{ 2912{
2908 unsigned long flags; 2913 unsigned long flags;
2909 2914
@@ -2913,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2913 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2914 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2915 check_flags(flags); 2920 check_flags(flags);
2916 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2917 check_chain_key(current); 2922 check_chain_key(current);
2918 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2919 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2920} 2925}
2921 2926EXPORT_SYMBOL_GPL(lock_set_class);
2922EXPORT_SYMBOL_GPL(lock_set_subclass);
2923 2927
2924/* 2928/*
2925 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2943,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2943 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2944 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2945} 2949}
2946
2947EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2948 2951
2949void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2961,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2961 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2963} 2966}
2964
2965EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2966 2968
2967#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -2999,7 +3001,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2999 struct held_lock *hlock, *prev_hlock; 3001 struct held_lock *hlock, *prev_hlock;
3000 struct lock_class_stats *stats; 3002 struct lock_class_stats *stats;
3001 unsigned int depth; 3003 unsigned int depth;
3002 int i, point; 3004 int i, contention_point, contending_point;
3003 3005
3004 depth = curr->lockdep_depth; 3006 depth = curr->lockdep_depth;
3005 if (DEBUG_LOCKS_WARN_ON(!depth)) 3007 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3023,18 +3025,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3023found_it: 3025found_it:
3024 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
3025 3027
3026 point = lock_contention_point(hlock_class(hlock), ip); 3028 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3029 contending_point = lock_point(hlock_class(hlock)->contending_point,
3030 lock->ip);
3027 3031
3028 stats = get_lock_stats(hlock_class(hlock)); 3032 stats = get_lock_stats(hlock_class(hlock));
3029 if (point < ARRAY_SIZE(stats->contention_point)) 3033 if (contention_point < LOCKSTAT_POINTS)
3030 stats->contention_point[point]++; 3034 stats->contention_point[contention_point]++;
3035 if (contending_point < LOCKSTAT_POINTS)
3036 stats->contending_point[contending_point]++;
3031 if (lock->cpu != smp_processor_id()) 3037 if (lock->cpu != smp_processor_id())
3032 stats->bounces[bounce_contended + !!hlock->read]++; 3038 stats->bounces[bounce_contended + !!hlock->read]++;
3033 put_lock_stats(stats); 3039 put_lock_stats(stats);
3034} 3040}
3035 3041
3036static void 3042static void
3037__lock_acquired(struct lockdep_map *lock) 3043__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3038{ 3044{
3039 struct task_struct *curr = current; 3045 struct task_struct *curr = current;
3040 struct held_lock *hlock, *prev_hlock; 3046 struct held_lock *hlock, *prev_hlock;
@@ -3083,6 +3089,7 @@ found_it:
3083 put_lock_stats(stats); 3089 put_lock_stats(stats);
3084 3090
3085 lock->cpu = cpu; 3091 lock->cpu = cpu;
3092 lock->ip = ip;
3086} 3093}
3087 3094
3088void lock_contended(struct lockdep_map *lock, unsigned long ip) 3095void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3104,7 +3111,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3104} 3111}
3105EXPORT_SYMBOL_GPL(lock_contended); 3112EXPORT_SYMBOL_GPL(lock_contended);
3106 3113
3107void lock_acquired(struct lockdep_map *lock) 3114void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3108{ 3115{
3109 unsigned long flags; 3116 unsigned long flags;
3110 3117
@@ -3117,7 +3124,7 @@ void lock_acquired(struct lockdep_map *lock)
3117 raw_local_irq_save(flags); 3124 raw_local_irq_save(flags);
3118 check_flags(flags); 3125 check_flags(flags);
3119 current->lockdep_recursion = 1; 3126 current->lockdep_recursion = 1;
3120 __lock_acquired(lock); 3127 __lock_acquired(lock, ip);
3121 current->lockdep_recursion = 0; 3128 current->lockdep_recursion = 0;
3122 raw_local_irq_restore(flags); 3129 raw_local_irq_restore(flags);
3123} 3130}
@@ -3441,7 +3448,6 @@ retry:
3441 if (unlock) 3448 if (unlock)
3442 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3443} 3450}
3444
3445EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3446 3452
3447/* 3453/*
@@ -3462,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3462{ 3468{
3463 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3464} 3470}
3465
3466EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3467 3472
3468void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc9400..ea54f2647868 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
43 */ 43 */
44#define MARKER_HASH_BITS 6 44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
64 void *oldptr; 65 void *oldptr;
65 int rcu_pending; 66 int rcu_pending;
66 unsigned char ptype:1; 67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 69 char name[0]; /* Contains name'\0'format'\0' */
68}; 70};
69 71
70static struct hlist_head marker_table[MARKER_TABLE_SIZE];
71
72/** 72/**
73 * __mark_empty_function - Empty probe callback 73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data 74 * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
81 * though the function pointer change and the marker enabling are two distinct 81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code. 82 * operations that modifies the execution flow of preemptible code.
83 */ 83 */
84void __mark_empty_function(void *probe_private, void *call_private, 84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args) 85 const char *fmt, va_list *args)
86{ 86{
87} 87}
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
97 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
99 */ 99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
101{ 102{
102 va_list args; 103 va_list args;
103 char ptype; 104 char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
107 * sure the teardown of the callbacks can be done correctly when they 108 * sure the teardown of the callbacks can be done correctly when they
108 * are in modules and they insure RCU read coherency. 109 * are in modules and they insure RCU read coherency.
109 */ 110 */
110 rcu_read_lock_sched(); 111 rcu_read_lock_sched_notrace();
111 ptype = mdata->ptype; 112 ptype = mdata->ptype;
112 if (likely(!ptype)) { 113 if (likely(!ptype)) {
113 marker_probe_func *func; 114 marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 146 va_end(args);
146 } 147 }
147 } 148 }
148 rcu_read_unlock_sched(); 149 rcu_read_unlock_sched_notrace();
149} 150}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 151EXPORT_SYMBOL_GPL(marker_probe_cb);
151 152
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
157 * 158 *
158 * Should be connected to markers "MARK_NOARGS". 159 * Should be connected to markers "MARK_NOARGS".
159 */ 160 */
160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
161{ 163{
162 va_list args; /* not initialized */ 164 va_list args; /* not initialized */
163 char ptype; 165 char ptype;
164 166
165 rcu_read_lock_sched(); 167 rcu_read_lock_sched_notrace();
166 ptype = mdata->ptype; 168 ptype = mdata->ptype;
167 if (likely(!ptype)) { 169 if (likely(!ptype)) {
168 marker_probe_func *func; 170 marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 197 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 198 mdata->format, &args);
197 } 199 }
198 rcu_read_unlock_sched(); 200 rcu_read_unlock_sched_notrace();
199} 201}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 202
202static void free_old_closure(struct rcu_head *head) 203static void free_old_closure(struct rcu_head *head)
203{ 204{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
416 e->single.probe_private = NULL; 417 e->single.probe_private = NULL;
417 e->multi = NULL; 418 e->multi = NULL;
418 e->ptype = 0; 419 e->ptype = 0;
420 e->format_allocated = 0;
419 e->refcount = 0; 421 e->refcount = 0;
420 e->rcu_pending = 0; 422 e->rcu_pending = 0;
421 hlist_add_head(&e->hlist, head); 423 hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
447 if (e->single.func != __mark_empty_function) 449 if (e->single.func != __mark_empty_function)
448 return -EBUSY; 450 return -EBUSY;
449 hlist_del(&e->hlist); 451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
450 /* Make sure the call_rcu has been executed */ 454 /* Make sure the call_rcu has been executed */
451 if (e->rcu_pending) 455 if (e->rcu_pending)
452 rcu_barrier_sched(); 456 rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
457/* 461/*
458 * Set the mark_entry format to the format found in the element. 462 * Set the mark_entry format to the format found in the element.
459 */ 463 */
460static int marker_set_format(struct marker_entry **entry, const char *format) 464static int marker_set_format(struct marker_entry *entry, const char *format)
461{ 465{
462 struct marker_entry *e; 466 entry->format = kstrdup(format, GFP_KERNEL);
463 size_t name_len = strlen((*entry)->name) + 1; 467 if (!entry->format)
464 size_t format_len = strlen(format) + 1;
465
466
467 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
468 GFP_KERNEL);
469 if (!e)
470 return -ENOMEM; 468 return -ENOMEM;
471 memcpy(&e->name[0], (*entry)->name, name_len); 469 entry->format_allocated = 1;
472 e->format = &e->name[name_len]; 470
473 memcpy(e->format, format, format_len);
474 if (strcmp(e->format, MARK_NOARGS) == 0)
475 e->call = marker_probe_cb_noarg;
476 else
477 e->call = marker_probe_cb;
478 e->single = (*entry)->single;
479 e->multi = (*entry)->multi;
480 e->ptype = (*entry)->ptype;
481 e->refcount = (*entry)->refcount;
482 e->rcu_pending = 0;
483 hlist_add_before(&e->hlist, &(*entry)->hlist);
484 hlist_del(&(*entry)->hlist);
485 /* Make sure the call_rcu has been executed */
486 if ((*entry)->rcu_pending)
487 rcu_barrier_sched();
488 kfree(*entry);
489 *entry = e;
490 trace_mark(core_marker_format, "name %s format %s", 471 trace_mark(core_marker_format, "name %s format %s",
491 e->name, e->format); 472 entry->name, entry->format);
492 return 0; 473 return 0;
493} 474}
494 475
495/* 476/*
496 * Sets the probe callback corresponding to one marker. 477 * Sets the probe callback corresponding to one marker.
497 */ 478 */
498static int set_marker(struct marker_entry **entry, struct marker *elem, 479static int set_marker(struct marker_entry *entry, struct marker *elem,
499 int active) 480 int active)
500{ 481{
501 int ret; 482 int ret = 0;
502 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 483 WARN_ON(strcmp(entry->name, elem->name) != 0);
503 484
504 if ((*entry)->format) { 485 if (entry->format) {
505 if (strcmp((*entry)->format, elem->format) != 0) { 486 if (strcmp(entry->format, elem->format) != 0) {
506 printk(KERN_NOTICE 487 printk(KERN_NOTICE
507 "Format mismatch for probe %s " 488 "Format mismatch for probe %s "
508 "(%s), marker (%s)\n", 489 "(%s), marker (%s)\n",
509 (*entry)->name, 490 entry->name,
510 (*entry)->format, 491 entry->format,
511 elem->format); 492 elem->format);
512 return -EPERM; 493 return -EPERM;
513 } 494 }
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
523 * pass from a "safe" callback (with argument) to an "unsafe" 504 * pass from a "safe" callback (with argument) to an "unsafe"
524 * callback (does not set arguments). 505 * callback (does not set arguments).
525 */ 506 */
526 elem->call = (*entry)->call; 507 elem->call = entry->call;
527 /* 508 /*
528 * Sanity check : 509 * Sanity check :
529 * We only update the single probe private data when the ptr is 510 * We only update the single probe private data when the ptr is
530 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
531 */ 512 */
532 WARN_ON(elem->single.func != __mark_empty_function 513 WARN_ON(elem->single.func != __mark_empty_function
533 && elem->single.probe_private 514 && elem->single.probe_private != entry->single.probe_private
534 != (*entry)->single.probe_private && 515 && !elem->ptype);
535 !elem->ptype); 516 elem->single.probe_private = entry->single.probe_private;
536 elem->single.probe_private = (*entry)->single.probe_private;
537 /* 517 /*
538 * Make sure the private data is valid when we update the 518 * Make sure the private data is valid when we update the
539 * single probe ptr. 519 * single probe ptr.
540 */ 520 */
541 smp_wmb(); 521 smp_wmb();
542 elem->single.func = (*entry)->single.func; 522 elem->single.func = entry->single.func;
543 /* 523 /*
544 * We also make sure that the new probe callbacks array is consistent 524 * We also make sure that the new probe callbacks array is consistent
545 * before setting a pointer to it. 525 * before setting a pointer to it.
546 */ 526 */
547 rcu_assign_pointer(elem->multi, (*entry)->multi); 527 rcu_assign_pointer(elem->multi, entry->multi);
548 /* 528 /*
549 * Update the function or multi probe array pointer before setting the 529 * Update the function or multi probe array pointer before setting the
550 * ptype. 530 * ptype.
551 */ 531 */
552 smp_wmb(); 532 smp_wmb();
553 elem->ptype = (*entry)->ptype; 533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
554 elem->state = active; 565 elem->state = active;
555 566
556 return 0; 567 return ret;
557} 568}
558 569
559/* 570/*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
564 */ 575 */
565static void disable_marker(struct marker *elem) 576static void disable_marker(struct marker *elem)
566{ 577{
578 int ret;
579
567 /* leave "call" as is. It is known statically. */ 580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
568 elem->state = 0; 596 elem->state = 0;
569 elem->single.func = __mark_empty_function; 597 elem->single.func = __mark_empty_function;
570 /* Update the function before setting the ptype */ 598 /* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
594 for (iter = begin; iter < end; iter++) { 622 for (iter = begin; iter < end; iter++) {
595 mark_entry = get_marker(iter->name); 623 mark_entry = get_marker(iter->name);
596 if (mark_entry) { 624 if (mark_entry) {
597 set_marker(&mark_entry, iter, 625 set_marker(mark_entry, iter, !!mark_entry->refcount);
598 !!mark_entry->refcount);
599 /* 626 /*
600 * ignore error, continue 627 * ignore error, continue
601 */ 628 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
629 marker_update_probe_range(__start___markers, __stop___markers); 656 marker_update_probe_range(__start___markers, __stop___markers);
630 /* Markers in modules. */ 657 /* Markers in modules. */
631 module_update_markers(); 658 module_update_markers();
659 tracepoint_probe_update_all();
632} 660}
633 661
634/** 662/**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
657 ret = PTR_ERR(entry); 685 ret = PTR_ERR(entry);
658 } else if (format) { 686 } else if (format) {
659 if (!entry->format) 687 if (!entry->format)
660 ret = marker_set_format(&entry, format); 688 ret = marker_set_format(entry, format);
661 else if (strcmp(entry->format, format)) 689 else if (strcmp(entry->format, format))
662 ret = -EPERM; 690 ret = -EPERM;
663 } 691 }
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
676 goto end; 704 goto end;
677 } 705 }
678 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
679 marker_update_probes(); /* may update entry */ 707 marker_update_probes();
680 mutex_lock(&markers_mutex); 708 mutex_lock(&markers_mutex);
681 entry = get_marker(name); 709 entry = get_marker(name);
682 WARN_ON(!entry); 710 if (!entry)
711 goto end;
683 if (entry->rcu_pending) 712 if (entry->rcu_pending)
684 rcu_barrier_sched(); 713 rcu_barrier_sched();
685 entry->oldptr = old; 714 entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
720 rcu_barrier_sched(); 749 rcu_barrier_sched();
721 old = marker_entry_remove_probe(entry, probe, probe_private); 750 old = marker_entry_remove_probe(entry, probe, probe_private);
722 mutex_unlock(&markers_mutex); 751 mutex_unlock(&markers_mutex);
723 marker_update_probes(); /* may update entry */ 752 marker_update_probes();
724 mutex_lock(&markers_mutex); 753 mutex_lock(&markers_mutex);
725 entry = get_marker(name); 754 entry = get_marker(name);
726 if (!entry) 755 if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
801 rcu_barrier_sched(); 830 rcu_barrier_sched();
802 old = marker_entry_remove_probe(entry, NULL, probe_private); 831 old = marker_entry_remove_probe(entry, NULL, probe_private);
803 mutex_unlock(&markers_mutex); 832 mutex_unlock(&markers_mutex);
804 marker_update_probes(); /* may update entry */ 833 marker_update_probes();
805 mutex_lock(&markers_mutex); 834 mutex_lock(&markers_mutex);
806 entry = get_marker_from_private_data(probe, probe_private); 835 entry = get_marker_from_private_data(probe, probe_private);
807 WARN_ON(!entry); 836 if (!entry)
837 goto end;
808 if (entry->rcu_pending) 838 if (entry->rcu_pending)
809 rcu_barrier_sched(); 839 rcu_barrier_sched();
810 entry->oldptr = old; 840 entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
848 if (!e->ptype) { 878 if (!e->ptype) {
849 if (num == 0 && e->single.func == probe) 879 if (num == 0 && e->single.func == probe)
850 return e->single.probe_private; 880 return e->single.probe_private;
851 else
852 break;
853 } else { 881 } else {
854 struct marker_probe_closure *closure; 882 struct marker_probe_closure *closure;
855 int match = 0; 883 int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
861 return closure[i].probe_private; 889 return closure[i].probe_private;
862 } 890 }
863 } 891 }
892 break;
864 } 893 }
865 } 894 }
866 return ERR_PTR(-ENOENT); 895 return ERR_PTR(-ENOENT);
867} 896}
868EXPORT_SYMBOL_GPL(marker_get_private_data); 897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c20..c9332c90d5a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/device.h> 43#include <linux/device.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/unwind.h>
47#include <linux/rculist.h> 46#include <linux/rculist.h>
48#include <asm/uaccess.h> 47#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
@@ -51,6 +50,7 @@
51#include <asm/sections.h> 50#include <asm/sections.h>
52#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
53#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h>
54 54
55#if 0 55#if 0
56#define DEBUGP printk 56#define DEBUGP printk
@@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
757 return -EFAULT; 757 return -EFAULT;
758 name[MODULE_NAME_LEN-1] = '\0'; 758 name[MODULE_NAME_LEN-1] = '\0';
759 759
760 if (mutex_lock_interruptible(&module_mutex) != 0) 760 /* Create stop_machine threads since free_module relies on
761 return -EINTR; 761 * a non-failing stop_machine call. */
762 ret = stop_machine_create();
763 if (ret)
764 return ret;
765
766 if (mutex_lock_interruptible(&module_mutex) != 0) {
767 ret = -EINTR;
768 goto out_stop;
769 }
762 770
763 mod = find_module(name); 771 mod = find_module(name);
764 if (!mod) { 772 if (!mod) {
@@ -809,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
809 mod->exit(); 817 mod->exit();
810 blocking_notifier_call_chain(&module_notify_list, 818 blocking_notifier_call_chain(&module_notify_list,
811 MODULE_STATE_GOING, mod); 819 MODULE_STATE_GOING, mod);
820 async_synchronize_full();
812 mutex_lock(&module_mutex); 821 mutex_lock(&module_mutex);
813 /* Store the name of the last unloaded module for diagnostic purposes */ 822 /* Store the name of the last unloaded module for diagnostic purposes */
814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
@@ -817,10 +826,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
817 826
818 out: 827 out:
819 mutex_unlock(&module_mutex); 828 mutex_unlock(&module_mutex);
829out_stop:
830 stop_machine_destroy();
820 return ret; 831 return ret;
821} 832}
822 833
823static void print_unload_info(struct seq_file *m, struct module *mod) 834static inline void print_unload_info(struct seq_file *m, struct module *mod)
824{ 835{
825 struct module_use *use; 836 struct module_use *use;
826 int printed_something = 0; 837 int printed_something = 0;
@@ -893,7 +904,7 @@ void module_put(struct module *module)
893EXPORT_SYMBOL(module_put); 904EXPORT_SYMBOL(module_put);
894 905
895#else /* !CONFIG_MODULE_UNLOAD */ 906#else /* !CONFIG_MODULE_UNLOAD */
896static void print_unload_info(struct seq_file *m, struct module *mod) 907static inline void print_unload_info(struct seq_file *m, struct module *mod)
897{ 908{
898 /* We don't know the usage count, or what modules are using. */ 909 /* We don't know the usage count, or what modules are using. */
899 seq_printf(m, " - -"); 910 seq_printf(m, " - -");
@@ -1439,8 +1450,6 @@ static void free_module(struct module *mod)
1439 remove_sect_attrs(mod); 1450 remove_sect_attrs(mod);
1440 mod_kobject_remove(mod); 1451 mod_kobject_remove(mod);
1441 1452
1442 unwind_remove_table(mod->unwind_info, 0);
1443
1444 /* Arch-specific cleanup. */ 1453 /* Arch-specific cleanup. */
1445 module_arch_cleanup(mod); 1454 module_arch_cleanup(mod);
1446 1455
@@ -1578,11 +1587,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1578 return ret; 1587 return ret;
1579} 1588}
1580 1589
1590/* Additional bytes needed by arch in front of individual sections */
1591unsigned int __weak arch_mod_section_prepend(struct module *mod,
1592 unsigned int section)
1593{
1594 /* default implementation just returns zero */
1595 return 0;
1596}
1597
1581/* Update size with this section: return offset. */ 1598/* Update size with this section: return offset. */
1582static long get_offset(unsigned int *size, Elf_Shdr *sechdr) 1599static long get_offset(struct module *mod, unsigned int *size,
1600 Elf_Shdr *sechdr, unsigned int section)
1583{ 1601{
1584 long ret; 1602 long ret;
1585 1603
1604 *size += arch_mod_section_prepend(mod, section);
1586 ret = ALIGN(*size, sechdr->sh_addralign ?: 1); 1605 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1587 *size = ret + sechdr->sh_size; 1606 *size = ret + sechdr->sh_size;
1588 return ret; 1607 return ret;
@@ -1622,7 +1641,7 @@ static void layout_sections(struct module *mod,
1622 || strncmp(secstrings + s->sh_name, 1641 || strncmp(secstrings + s->sh_name,
1623 ".init", 5) == 0) 1642 ".init", 5) == 0)
1624 continue; 1643 continue;
1625 s->sh_entsize = get_offset(&mod->core_size, s); 1644 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1626 DEBUGP("\t%s\n", secstrings + s->sh_name); 1645 DEBUGP("\t%s\n", secstrings + s->sh_name);
1627 } 1646 }
1628 if (m == 0) 1647 if (m == 0)
@@ -1640,7 +1659,7 @@ static void layout_sections(struct module *mod,
1640 || strncmp(secstrings + s->sh_name, 1659 || strncmp(secstrings + s->sh_name,
1641 ".init", 5) != 0) 1660 ".init", 5) != 0)
1642 continue; 1661 continue;
1643 s->sh_entsize = (get_offset(&mod->init_size, s) 1662 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1644 | INIT_OFFSET_MASK); 1663 | INIT_OFFSET_MASK);
1645 DEBUGP("\t%s\n", secstrings + s->sh_name); 1664 DEBUGP("\t%s\n", secstrings + s->sh_name);
1646 } 1665 }
@@ -1725,15 +1744,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
1725 return NULL; 1744 return NULL;
1726} 1745}
1727 1746
1728static int is_exported(const char *name, const struct module *mod) 1747static int is_exported(const char *name, unsigned long value,
1748 const struct module *mod)
1729{ 1749{
1730 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1750 const struct kernel_symbol *ks;
1731 return 1; 1751 if (!mod)
1752 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
1732 else 1753 else
1733 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1754 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
1734 return 1; 1755 return ks != NULL && ks->value == value;
1735 else
1736 return 0;
1737} 1756}
1738 1757
1739/* As per nm */ 1758/* As per nm */
@@ -1847,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod,
1847 unsigned int symindex = 0; 1866 unsigned int symindex = 0;
1848 unsigned int strindex = 0; 1867 unsigned int strindex = 0;
1849 unsigned int modindex, versindex, infoindex, pcpuindex; 1868 unsigned int modindex, versindex, infoindex, pcpuindex;
1850 unsigned int unwindex = 0;
1851 unsigned int num_kp, num_mcount; 1869 unsigned int num_kp, num_mcount;
1852 struct kernel_param *kp; 1870 struct kernel_param *kp;
1853 struct module *mod; 1871 struct module *mod;
@@ -1865,6 +1883,13 @@ static noinline struct module *load_module(void __user *umod,
1865 /* vmalloc barfs on "unusual" numbers. Check here */ 1883 /* vmalloc barfs on "unusual" numbers. Check here */
1866 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1884 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1867 return ERR_PTR(-ENOMEM); 1885 return ERR_PTR(-ENOMEM);
1886
1887 /* Create stop_machine threads since the error path relies on
1888 * a non-failing stop_machine call. */
1889 err = stop_machine_create();
1890 if (err)
1891 goto free_hdr;
1892
1868 if (copy_from_user(hdr, umod, len) != 0) { 1893 if (copy_from_user(hdr, umod, len) != 0) {
1869 err = -EFAULT; 1894 err = -EFAULT;
1870 goto free_hdr; 1895 goto free_hdr;
@@ -1930,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod,
1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1955 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1956 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1957 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1933#ifdef ARCH_UNWIND_SECTION_NAME
1934 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1935#endif
1936 1958
1937 /* Don't keep modinfo and version sections. */ 1959 /* Don't keep modinfo and version sections. */
1938 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1960 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1942,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod,
1942 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1964 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1943 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1965 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1944#endif 1966#endif
1945 if (unwindex)
1946 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1947 1967
1948 /* Check module struct version now, before we try to use module. */ 1968 /* Check module struct version now, before we try to use module. */
1949 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1969 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2184,24 +2204,15 @@ static noinline struct module *load_module(void __user *umod,
2184 struct mod_debug *debug; 2204 struct mod_debug *debug;
2185 unsigned int num_debug; 2205 unsigned int num_debug;
2186 2206
2187#ifdef CONFIG_MARKERS
2188 marker_update_probe_range(mod->markers,
2189 mod->markers + mod->num_markers);
2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2207 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug); 2208 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug); 2209 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 } 2210 }
2200 2211
2201 /* sechdrs[0].sh_size is always zero */ 2212 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", 2213 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount); 2214 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount); 2215 ftrace_init_module(mod, mseg, mseg + num_mcount);
2205 2216
2206 err = module_finalize(hdr, sechdrs, mod); 2217 err = module_finalize(hdr, sechdrs, mod);
2207 if (err < 0) 2218 if (err < 0)
@@ -2249,14 +2260,10 @@ static noinline struct module *load_module(void __user *umod,
2249 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2260 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2250 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2261 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2251 2262
2252 /* Size of section 0 is 0, so this works well if no unwind info. */
2253 mod->unwind_info = unwind_add_table(mod,
2254 (void *)sechdrs[unwindex].sh_addr,
2255 sechdrs[unwindex].sh_size);
2256
2257 /* Get rid of temporary copy */ 2263 /* Get rid of temporary copy */
2258 vfree(hdr); 2264 vfree(hdr);
2259 2265
2266 stop_machine_destroy();
2260 /* Done! */ 2267 /* Done! */
2261 return mod; 2268 return mod;
2262 2269
@@ -2279,6 +2286,7 @@ static noinline struct module *load_module(void __user *umod,
2279 kfree(args); 2286 kfree(args);
2280 free_hdr: 2287 free_hdr:
2281 vfree(hdr); 2288 vfree(hdr);
2289 stop_machine_destroy();
2282 return ERR_PTR(err); 2290 return ERR_PTR(err);
2283 2291
2284 truncated: 2292 truncated:
@@ -2346,11 +2354,12 @@ sys_init_module(void __user *umod,
2346 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 2354 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2347 mod->state = MODULE_STATE_LIVE; 2355 mod->state = MODULE_STATE_LIVE;
2348 wake_up(&module_wq); 2356 wake_up(&module_wq);
2357 blocking_notifier_call_chain(&module_notify_list,
2358 MODULE_STATE_LIVE, mod);
2349 2359
2350 mutex_lock(&module_mutex); 2360 mutex_lock(&module_mutex);
2351 /* Drop initial reference. */ 2361 /* Drop initial reference. */
2352 module_put(mod); 2362 module_put(mod);
2353 unwind_remove_table(mod->unwind_info, 1);
2354 module_free(mod, mod->module_init); 2363 module_free(mod, mod->module_init);
2355 mod->module_init = NULL; 2364 mod->module_init = NULL;
2356 mod->init_size = 0; 2365 mod->init_size = 0;
@@ -2385,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod,
2385 unsigned long nextval; 2394 unsigned long nextval;
2386 2395
2387 /* At worse, next value is at end of module */ 2396 /* At worse, next value is at end of module */
2388 if (within(addr, mod->module_init, mod->init_size)) 2397 if (within_module_init(addr, mod))
2389 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2398 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2390 else 2399 else
2391 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2400 nextval = (unsigned long)mod->module_core+mod->core_text_size;
@@ -2433,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr,
2433 2442
2434 preempt_disable(); 2443 preempt_disable();
2435 list_for_each_entry_rcu(mod, &modules, list) { 2444 list_for_each_entry_rcu(mod, &modules, list) {
2436 if (within(addr, mod->module_init, mod->init_size) 2445 if (within_module_init(addr, mod) ||
2437 || within(addr, mod->module_core, mod->core_size)) { 2446 within_module_core(addr, mod)) {
2438 if (modname) 2447 if (modname)
2439 *modname = mod->name; 2448 *modname = mod->name;
2440 ret = get_ksymbol(mod, addr, size, offset); 2449 ret = get_ksymbol(mod, addr, size, offset);
@@ -2456,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2456 2465
2457 preempt_disable(); 2466 preempt_disable();
2458 list_for_each_entry_rcu(mod, &modules, list) { 2467 list_for_each_entry_rcu(mod, &modules, list) {
2459 if (within(addr, mod->module_init, mod->init_size) || 2468 if (within_module_init(addr, mod) ||
2460 within(addr, mod->module_core, mod->core_size)) { 2469 within_module_core(addr, mod)) {
2461 const char *sym; 2470 const char *sym;
2462 2471
2463 sym = get_ksymbol(mod, addr, NULL, NULL); 2472 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -2480,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2480 2489
2481 preempt_disable(); 2490 preempt_disable();
2482 list_for_each_entry_rcu(mod, &modules, list) { 2491 list_for_each_entry_rcu(mod, &modules, list) {
2483 if (within(addr, mod->module_init, mod->init_size) || 2492 if (within_module_init(addr, mod) ||
2484 within(addr, mod->module_core, mod->core_size)) { 2493 within_module_core(addr, mod)) {
2485 const char *sym; 2494 const char *sym;
2486 2495
2487 sym = get_ksymbol(mod, addr, size, offset); 2496 sym = get_ksymbol(mod, addr, size, offset);
@@ -2513,7 +2522,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2513 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2522 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2514 KSYM_NAME_LEN); 2523 KSYM_NAME_LEN);
2515 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 2524 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2516 *exported = is_exported(name, mod); 2525 *exported = is_exported(name, *value, mod);
2517 preempt_enable(); 2526 preempt_enable();
2518 return 0; 2527 return 0;
2519 } 2528 }
@@ -2700,7 +2709,7 @@ int is_module_address(unsigned long addr)
2700 preempt_disable(); 2709 preempt_disable();
2701 2710
2702 list_for_each_entry_rcu(mod, &modules, list) { 2711 list_for_each_entry_rcu(mod, &modules, list) {
2703 if (within(addr, mod->module_core, mod->core_size)) { 2712 if (within_module_core(addr, mod)) {
2704 preempt_enable(); 2713 preempt_enable();
2705 return 1; 2714 return 1;
2706 } 2715 }
@@ -2713,7 +2722,7 @@ int is_module_address(unsigned long addr)
2713 2722
2714 2723
2715/* Is this a valid kernel address? */ 2724/* Is this a valid kernel address? */
2716struct module *__module_text_address(unsigned long addr) 2725__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2717{ 2726{
2718 struct module *mod; 2727 struct module *mod;
2719 2728
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54d..78bc3fdac0d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
13 13
14struct ns_cgroup { 14struct ns_cgroup {
15 struct cgroup_subsys_state css; 15 struct cgroup_subsys_state css;
16 spinlock_t lock;
17}; 16};
18 17
19struct cgroup_subsys ns_subsys; 18struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
84 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
85 if (!ns_cgroup) 84 if (!ns_cgroup)
86 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
87 spin_lock_init(&ns_cgroup->lock);
88 return &ns_cgroup->css; 86 return &ns_cgroup->css;
89} 87}
90 88
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 1d3ef29a2583..63598dca2d0c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -80,12 +80,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
80 goto out_pid; 80 goto out_pid;
81 } 81 }
82 82
83 new_nsp->user_ns = copy_user_ns(flags, tsk->nsproxy->user_ns);
84 if (IS_ERR(new_nsp->user_ns)) {
85 err = PTR_ERR(new_nsp->user_ns);
86 goto out_user;
87 }
88
89 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); 83 new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns);
90 if (IS_ERR(new_nsp->net_ns)) { 84 if (IS_ERR(new_nsp->net_ns)) {
91 err = PTR_ERR(new_nsp->net_ns); 85 err = PTR_ERR(new_nsp->net_ns);
@@ -95,9 +89,6 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
95 return new_nsp; 89 return new_nsp;
96 90
97out_net: 91out_net:
98 if (new_nsp->user_ns)
99 put_user_ns(new_nsp->user_ns);
100out_user:
101 if (new_nsp->pid_ns) 92 if (new_nsp->pid_ns)
102 put_pid_ns(new_nsp->pid_ns); 93 put_pid_ns(new_nsp->pid_ns);
103out_pid: 94out_pid:
@@ -130,7 +121,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
130 get_nsproxy(old_ns); 121 get_nsproxy(old_ns);
131 122
132 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 123 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
133 CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNET))) 124 CLONE_NEWPID | CLONE_NEWNET)))
134 return 0; 125 return 0;
135 126
136 if (!capable(CAP_SYS_ADMIN)) { 127 if (!capable(CAP_SYS_ADMIN)) {
@@ -173,8 +164,6 @@ void free_nsproxy(struct nsproxy *ns)
173 put_ipc_ns(ns->ipc_ns); 164 put_ipc_ns(ns->ipc_ns);
174 if (ns->pid_ns) 165 if (ns->pid_ns)
175 put_pid_ns(ns->pid_ns); 166 put_pid_ns(ns->pid_ns);
176 if (ns->user_ns)
177 put_user_ns(ns->user_ns);
178 put_net(ns->net_ns); 167 put_net(ns->net_ns);
179 kmem_cache_free(nsproxy_cachep, ns); 168 kmem_cache_free(nsproxy_cachep, ns);
180} 169}
@@ -189,7 +178,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
189 int err = 0; 178 int err = 0;
190 179
191 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | 180 if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
192 CLONE_NEWUSER | CLONE_NEWNET))) 181 CLONE_NEWNET)))
193 return 0; 182 return 0;
194 183
195 if (!capable(CAP_SYS_ADMIN)) 184 if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d5088355bfe..2a2ff36ff44d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26static unsigned long tainted_mask; 27static unsigned long tainted_mask;
@@ -298,6 +299,8 @@ static int init_oops_id(void)
298{ 299{
299 if (!oops_id) 300 if (!oops_id)
300 get_random_bytes(&oops_id, sizeof(oops_id)); 301 get_random_bytes(&oops_id, sizeof(oops_id));
302 else
303 oops_id++;
301 304
302 return 0; 305 return 0;
303} 306}
@@ -321,36 +324,27 @@ void oops_exit(void)
321} 324}
322 325
323#ifdef WANT_WARN_ON_SLOWPATH 326#ifdef WANT_WARN_ON_SLOWPATH
324void warn_on_slowpath(const char *file, int line)
325{
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long) __builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 print_modules();
334 dump_stack();
335 print_oops_end_marker();
336 add_taint(TAINT_WARN);
337}
338EXPORT_SYMBOL(warn_on_slowpath);
339
340
341void warn_slowpath(const char *file, int line, const char *fmt, ...) 327void warn_slowpath(const char *file, int line, const char *fmt, ...)
342{ 328{
343 va_list args; 329 va_list args;
344 char function[KSYM_SYMBOL_LEN]; 330 char function[KSYM_SYMBOL_LEN];
345 unsigned long caller = (unsigned long)__builtin_return_address(0); 331 unsigned long caller = (unsigned long)__builtin_return_address(0);
332 const char *board;
333
346 sprint_symbol(function, caller); 334 sprint_symbol(function, caller);
347 335
348 printk(KERN_WARNING "------------[ cut here ]------------\n"); 336 printk(KERN_WARNING "------------[ cut here ]------------\n");
349 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 337 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
350 line, function); 338 line, function);
351 va_start(args, fmt); 339 board = dmi_get_system_info(DMI_PRODUCT_NAME);
352 vprintk(fmt, args); 340 if (board)
353 va_end(args); 341 printk(KERN_WARNING "Hardware name: %s\n", board);
342
343 if (fmt) {
344 va_start(args, fmt);
345 vprintk(fmt, args);
346 va_end(args);
347 }
354 348
355 print_modules(); 349 print_modules();
356 dump_stack(); 350 dump_stack();
diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76afa507..1b3586fe753a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
474} 474}
475EXPORT_SYMBOL(task_session_nr_ns); 475EXPORT_SYMBOL(task_session_nr_ns);
476 476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{
479 return ns_of_pid(task_pid(tsk));
480}
481EXPORT_SYMBOL_GPL(task_active_pid_ns);
482
477/* 483/*
478 * Used by proc to find the first pid that is greater then or equal to nr. 484 * Used by proc to find the first pid that is greater than or equal to nr.
479 * 485 *
480 * If there is a pid at nr this function is exactly the same as find_pid_ns. 486 * If there is a pid at nr this function is exactly the same as find_pid_ns.
481 */ 487 */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a831de..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a140e44eebba..887c63787de6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
116 * must supply functions here, even if the function just returns 116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the 117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the 118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_process 119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code. 120 * fields are not modified by timer code.
121 * 121 *
122 * At this time all functions EXCEPT clock_nanosleep can be 122 * At this time all functions EXCEPT clock_nanosleep can be
@@ -319,7 +319,8 @@ void do_schedule_next_timer(struct siginfo *info)
319 319
320int posix_timer_event(struct k_itimer *timr, int si_private) 320int posix_timer_event(struct k_itimer *timr, int si_private)
321{ 321{
322 int shared, ret; 322 struct task_struct *task;
323 int shared, ret = -1;
323 /* 324 /*
324 * FIXME: if ->sigq is queued we can race with 325 * FIXME: if ->sigq is queued we can race with
325 * dequeue_signal()->do_schedule_next_timer(). 326 * dequeue_signal()->do_schedule_next_timer().
@@ -333,8 +334,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
333 */ 334 */
334 timr->sigq->info.si_sys_private = si_private; 335 timr->sigq->info.si_sys_private = si_private;
335 336
336 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); 337 rcu_read_lock();
337 ret = send_sigqueue(timr->sigq, timr->it_process, shared); 338 task = pid_task(timr->it_pid, PIDTYPE_PID);
339 if (task) {
340 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
341 ret = send_sigqueue(timr->sigq, task, shared);
342 }
343 rcu_read_unlock();
338 /* If we failed to send the signal the timer stops. */ 344 /* If we failed to send the signal the timer stops. */
339 return ret > 0; 345 return ret > 0;
340} 346}
@@ -411,7 +417,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
411 return ret; 417 return ret;
412} 418}
413 419
414static struct task_struct * good_sigevent(sigevent_t * event) 420static struct pid *good_sigevent(sigevent_t * event)
415{ 421{
416 struct task_struct *rtn = current->group_leader; 422 struct task_struct *rtn = current->group_leader;
417 423
@@ -425,7 +431,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
425 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) 431 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
426 return NULL; 432 return NULL;
427 433
428 return rtn; 434 return task_pid(rtn);
429} 435}
430 436
431void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 437void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -464,6 +470,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
464 idr_remove(&posix_timers_id, tmr->it_id); 470 idr_remove(&posix_timers_id, tmr->it_id);
465 spin_unlock_irqrestore(&idr_lock, flags); 471 spin_unlock_irqrestore(&idr_lock, flags);
466 } 472 }
473 put_pid(tmr->it_pid);
467 sigqueue_free(tmr->sigq); 474 sigqueue_free(tmr->sigq);
468 kmem_cache_free(posix_timers_cache, tmr); 475 kmem_cache_free(posix_timers_cache, tmr);
469} 476}
@@ -477,7 +484,6 @@ sys_timer_create(const clockid_t which_clock,
477{ 484{
478 struct k_itimer *new_timer; 485 struct k_itimer *new_timer;
479 int error, new_timer_id; 486 int error, new_timer_id;
480 struct task_struct *process;
481 sigevent_t event; 487 sigevent_t event;
482 int it_id_set = IT_ID_NOT_SET; 488 int it_id_set = IT_ID_NOT_SET;
483 489
@@ -531,11 +537,9 @@ sys_timer_create(const clockid_t which_clock,
531 goto out; 537 goto out;
532 } 538 }
533 rcu_read_lock(); 539 rcu_read_lock();
534 process = good_sigevent(&event); 540 new_timer->it_pid = get_pid(good_sigevent(&event));
535 if (process)
536 get_task_struct(process);
537 rcu_read_unlock(); 541 rcu_read_unlock();
538 if (!process) { 542 if (!new_timer->it_pid) {
539 error = -EINVAL; 543 error = -EINVAL;
540 goto out; 544 goto out;
541 } 545 }
@@ -543,8 +547,7 @@ sys_timer_create(const clockid_t which_clock,
543 event.sigev_notify = SIGEV_SIGNAL; 547 event.sigev_notify = SIGEV_SIGNAL;
544 event.sigev_signo = SIGALRM; 548 event.sigev_signo = SIGALRM;
545 event.sigev_value.sival_int = new_timer->it_id; 549 event.sigev_value.sival_int = new_timer->it_id;
546 process = current->group_leader; 550 new_timer->it_pid = get_pid(task_tgid(current));
547 get_task_struct(process);
548 } 551 }
549 552
550 new_timer->it_sigev_notify = event.sigev_notify; 553 new_timer->it_sigev_notify = event.sigev_notify;
@@ -554,7 +557,7 @@ sys_timer_create(const clockid_t which_clock,
554 new_timer->sigq->info.si_code = SI_TIMER; 557 new_timer->sigq->info.si_code = SI_TIMER;
555 558
556 spin_lock_irq(&current->sighand->siglock); 559 spin_lock_irq(&current->sighand->siglock);
557 new_timer->it_process = process; 560 new_timer->it_signal = current->signal;
558 list_add(&new_timer->list, &current->signal->posix_timers); 561 list_add(&new_timer->list, &current->signal->posix_timers);
559 spin_unlock_irq(&current->sighand->siglock); 562 spin_unlock_irq(&current->sighand->siglock);
560 563
@@ -589,8 +592,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
589 timr = idr_find(&posix_timers_id, (int)timer_id); 592 timr = idr_find(&posix_timers_id, (int)timer_id);
590 if (timr) { 593 if (timr) {
591 spin_lock(&timr->it_lock); 594 spin_lock(&timr->it_lock);
592 if (timr->it_process && 595 if (timr->it_signal == current->signal) {
593 same_thread_group(timr->it_process, current)) {
594 spin_unlock(&idr_lock); 596 spin_unlock(&idr_lock);
595 return timr; 597 return timr;
596 } 598 }
@@ -837,8 +839,7 @@ retry_delete:
837 * This keeps any tasks waiting on the spin lock from thinking 839 * This keeps any tasks waiting on the spin lock from thinking
838 * they got something (see the lock code above). 840 * they got something (see the lock code above).
839 */ 841 */
840 put_task_struct(timer->it_process); 842 timer->it_signal = NULL;
841 timer->it_process = NULL;
842 843
843 unlock_timer(timer, flags); 844 unlock_timer(timer, flags);
844 release_posix_timer(timer, IT_ID_SET); 845 release_posix_timer(timer, IT_ID_SET);
@@ -864,8 +865,7 @@ retry_delete:
864 * This keeps any tasks waiting on the spin lock from thinking 865 * This keeps any tasks waiting on the spin lock from thinking
865 * they got something (see the lock code above). 866 * they got something (see the lock code above).
866 */ 867 */
867 put_task_struct(timer->it_process); 868 timer->it_signal = NULL;
868 timer->it_process = NULL;
869 869
870 unlock_timer(timer, flags); 870 unlock_timer(timer, flags);
871 release_posix_timer(timer, IT_ID_SET); 871 release_posix_timer(timer, IT_ID_SET);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746f..f77d3819ef57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
257 256
258int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
259{ 258{
260 int error, ftrace_save; 259 int error;
261 260
262 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
263 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
269 goto Close; 268 goto Close;
270 269
271 suspend_console(); 270 suspend_console();
272 ftrace_save = __ftrace_enabled_save();
273 error = device_suspend(PMSG_FREEZE); 271 error = device_suspend(PMSG_FREEZE);
274 if (error) 272 if (error)
275 goto Recover_platform; 273 goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
299 Resume_devices: 297 Resume_devices:
300 device_resume(in_suspend ? 298 device_resume(in_suspend ?
301 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
302 __ftrace_enabled_restore(ftrace_save);
303 resume_console(); 300 resume_console();
304 Close: 301 Close:
305 platform_end(platform_mode); 302 platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
370 367
371int hibernation_restore(int platform_mode) 368int hibernation_restore(int platform_mode)
372{ 369{
373 int error, ftrace_save; 370 int error;
374 371
375 pm_prepare_console(); 372 pm_prepare_console();
376 suspend_console(); 373 suspend_console();
377 ftrace_save = __ftrace_enabled_save();
378 error = device_suspend(PMSG_QUIESCE); 374 error = device_suspend(PMSG_QUIESCE);
379 if (error) 375 if (error)
380 goto Finish; 376 goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
389 platform_restore_cleanup(platform_mode); 385 platform_restore_cleanup(platform_mode);
390 device_resume(PMSG_RECOVER); 386 device_resume(PMSG_RECOVER);
391 Finish: 387 Finish:
392 __ftrace_enabled_restore(ftrace_save);
393 resume_console(); 388 resume_console();
394 pm_restore_console(); 389 pm_restore_console();
395 return error; 390 return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
402 397
403int hibernation_platform_enter(void) 398int hibernation_platform_enter(void)
404{ 399{
405 int error, ftrace_save; 400 int error;
406 401
407 if (!hibernation_ops) 402 if (!hibernation_ops)
408 return -ENOSYS; 403 return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
417 goto Close; 412 goto Close;
418 413
419 suspend_console(); 414 suspend_console();
420 ftrace_save = __ftrace_enabled_save();
421 error = device_suspend(PMSG_HIBERNATE); 415 error = device_suspend(PMSG_HIBERNATE);
422 if (error) { 416 if (error) {
423 if (hibernation_ops->recover) 417 if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
452 hibernation_ops->finish(); 446 hibernation_ops->finish();
453 Resume_devices: 447 Resume_devices:
454 device_resume(PMSG_RESTORE); 448 device_resume(PMSG_RESTORE);
455 __ftrace_enabled_restore(ftrace_save);
456 resume_console(); 449 resume_console();
457 Close: 450 Close:
458 hibernation_ops->end(); 451 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b8f7ce9473e8..239988873971 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/vmstat.h> 23#include <linux/vmstat.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
317 */ 316 */
318int suspend_devices_and_enter(suspend_state_t state) 317int suspend_devices_and_enter(suspend_state_t state)
319{ 318{
320 int error, ftrace_save; 319 int error;
321 320
322 if (!suspend_ops) 321 if (!suspend_ops)
323 return -ENOSYS; 322 return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
328 goto Close; 327 goto Close;
329 } 328 }
330 suspend_console(); 329 suspend_console();
331 ftrace_save = __ftrace_enabled_save();
332 suspend_test_start(); 330 suspend_test_start();
333 error = device_suspend(PMSG_SUSPEND); 331 error = device_suspend(PMSG_SUSPEND);
334 if (error) { 332 if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
360 suspend_test_start(); 358 suspend_test_start();
361 device_resume(PMSG_RESUME); 359 device_resume(PMSG_RESUME);
362 suspend_test_finish("resume devices"); 360 suspend_test_finish("resume devices");
363 __ftrace_enabled_restore(ftrace_save);
364 resume_console(); 361 resume_console();
365 Close: 362 Close:
366 if (suspend_ops->end) 363 if (suspend_ops->end)
@@ -618,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
618 /* this may fail if the RTC hasn't been initialized */ 615 /* this may fail if the RTC hasn't been initialized */
619 status = rtc_read_time(rtc, &alm.time); 616 status = rtc_read_time(rtc, &alm.time);
620 if (status < 0) { 617 if (status < 0) {
621 printk(err_readtime, rtc->dev.bus_id, status); 618 printk(err_readtime, dev_name(&rtc->dev), status);
622 return; 619 return;
623 } 620 }
624 rtc_tm_to_time(&alm.time, &now); 621 rtc_tm_to_time(&alm.time, &now);
@@ -629,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
629 626
630 status = rtc_set_alarm(rtc, &alm); 627 status = rtc_set_alarm(rtc, &alm);
631 if (status < 0) { 628 if (status < 0) {
632 printk(err_wakealarm, rtc->dev.bus_id, status); 629 printk(err_wakealarm, dev_name(&rtc->dev), status);
633 return; 630 return;
634 } 631 }
635 632
@@ -663,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
663 if (!device_may_wakeup(candidate->dev.parent)) 660 if (!device_may_wakeup(candidate->dev.parent))
664 return 0; 661 return 0;
665 662
666 *(char **)name_ptr = dev->bus_id; 663 *(const char **)name_ptr = dev_name(dev);
667 return 1; 664 return 1;
668} 665}
669 666
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f051477..97890831e1b5 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
31} 31}
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d77..7015733793e8 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
619static const char recursion_bug_msg [] = 619static const char recursion_bug_msg [] =
620 KERN_CRIT "BUG: recent printk recursion!\n"; 620 KERN_CRIT "BUG: recent printk recursion!\n";
621static int recursion_bug; 621static int recursion_bug;
622 static int new_text_line = 1; 622static int new_text_line = 1;
623static char printk_buf[1024]; 623static char printk_buf[1024];
624 624
625asmlinkage int vprintk(const char *fmt, va_list args) 625asmlinkage int vprintk(const char *fmt, va_list args)
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
662 if (recursion_bug) { 662 if (recursion_bug) {
663 recursion_bug = 0; 663 recursion_bug = 0;
664 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
665 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
diff --git a/kernel/profile.c b/kernel/profile.c
index dc41827fbfee..784933acf5b8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift;
45int prof_on __read_mostly; 45int prof_on __read_mostly;
46EXPORT_SYMBOL_GPL(prof_on); 46EXPORT_SYMBOL_GPL(prof_on);
47 47
48static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 48static cpumask_var_t prof_cpu_mask;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
51static DEFINE_PER_CPU(int, cpu_profile_flip); 51static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -113,9 +113,13 @@ int __ref profile_init(void)
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
116 return 0; 117 return 0;
117 } 118 }
118 119
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM;
122
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer) 124 if (prof_buffer)
121 return 0; 125 return 0;
@@ -128,6 +132,7 @@ int __ref profile_init(void)
128 if (prof_buffer) 132 if (prof_buffer)
129 return 0; 133 return 0;
130 134
135 free_cpumask_var(prof_cpu_mask);
131 return -ENOMEM; 136 return -ENOMEM;
132} 137}
133 138
@@ -386,13 +391,15 @@ out_free:
386 return NOTIFY_BAD; 391 return NOTIFY_BAD;
387 case CPU_ONLINE: 392 case CPU_ONLINE:
388 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
389 cpu_set(cpu, prof_cpu_mask); 394 if (prof_cpu_mask != NULL)
395 cpumask_set_cpu(cpu, prof_cpu_mask);
390 break; 396 break;
391 case CPU_UP_CANCELED: 397 case CPU_UP_CANCELED:
392 case CPU_UP_CANCELED_FROZEN: 398 case CPU_UP_CANCELED_FROZEN:
393 case CPU_DEAD: 399 case CPU_DEAD:
394 case CPU_DEAD_FROZEN: 400 case CPU_DEAD_FROZEN:
395 cpu_clear(cpu, prof_cpu_mask); 401 if (prof_cpu_mask != NULL)
402 cpumask_clear_cpu(cpu, prof_cpu_mask);
396 if (per_cpu(cpu_profile_hits, cpu)[0]) { 403 if (per_cpu(cpu_profile_hits, cpu)[0]) {
397 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 404 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
398 per_cpu(cpu_profile_hits, cpu)[0] = NULL; 405 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
@@ -430,19 +437,19 @@ void profile_tick(int type)
430 437
431 if (type == CPU_PROFILING && timer_hook) 438 if (type == CPU_PROFILING && timer_hook)
432 timer_hook(regs); 439 timer_hook(regs);
433 if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) 440 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
434 profile_hit(type, (void *)profile_pc(regs)); 442 profile_hit(type, (void *)profile_pc(regs));
435} 443}
436 444
437#ifdef CONFIG_PROC_FS 445#ifdef CONFIG_PROC_FS
438#include <linux/proc_fs.h> 446#include <linux/proc_fs.h>
439#include <asm/uaccess.h> 447#include <asm/uaccess.h>
440#include <asm/ptrace.h>
441 448
442static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 449static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
443 int count, int *eof, void *data) 450 int count, int *eof, void *data)
444{ 451{
445 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 452 int len = cpumask_scnprintf(page, count, data);
446 if (count - len < 2) 453 if (count - len < 2)
447 return -EINVAL; 454 return -EINVAL;
448 len += sprintf(page + len, "\n"); 455 len += sprintf(page + len, "\n");
@@ -452,16 +459,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
452static int prof_cpu_mask_write_proc(struct file *file, 459static int prof_cpu_mask_write_proc(struct file *file,
453 const char __user *buffer, unsigned long count, void *data) 460 const char __user *buffer, unsigned long count, void *data)
454{ 461{
455 cpumask_t *mask = (cpumask_t *)data; 462 struct cpumask *mask = data;
456 unsigned long full_count = count, err; 463 unsigned long full_count = count, err;
457 cpumask_t new_value; 464 cpumask_var_t new_value;
458 465
459 err = cpumask_parse_user(buffer, count, new_value); 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
460 if (err) 467 return -ENOMEM;
461 return err;
462 468
463 *mask = new_value; 469 err = cpumask_parse_user(buffer, count, new_value);
464 return full_count; 470 if (!err) {
471 cpumask_copy(mask, new_value);
472 err = full_count;
473 }
474 free_cpumask_var(new_value);
475 return err;
465} 476}
466 477
467void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 478void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
@@ -472,7 +483,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
472 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 483 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
473 if (!entry) 484 if (!entry)
474 return; 485 return;
475 entry->data = (void *)&prof_cpu_mask; 486 entry->data = prof_cpu_mask;
476 entry->read_proc = prof_cpu_mask_read_proc; 487 entry->read_proc = prof_cpu_mask_read_proc;
477 entry->write_proc = prof_cpu_mask_write_proc; 488 entry->write_proc = prof_cpu_mask_write_proc;
478} 489}
@@ -544,7 +555,7 @@ static const struct file_operations proc_profile_operations = {
544}; 555};
545 556
546#ifdef CONFIG_SMP 557#ifdef CONFIG_SMP
547static inline void profile_nop(void *unused) 558static void profile_nop(void *unused)
548{ 559{
549} 560}
550 561
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 4c8bcd7dd8e0..29dc700e198c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,6 +25,17 @@
25#include <asm/pgtable.h> 25#include <asm/pgtable.h>
26#include <asm/uaccess.h> 26#include <asm/uaccess.h>
27 27
28
29/*
30 * Initialize a new task whose father had been ptraced.
31 *
32 * Called from copy_process().
33 */
34void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
35{
36 arch_ptrace_fork(child, clone_flags);
37}
38
28/* 39/*
29 * ptrace a task: make the debugger its new parent and 40 * ptrace a task: make the debugger its new parent and
30 * move it to the ptrace list. 41 * move it to the ptrace list.
@@ -72,6 +83,7 @@ void __ptrace_unlink(struct task_struct *child)
72 child->parent = child->real_parent; 83 child->parent = child->real_parent;
73 list_del_init(&child->ptrace_entry); 84 list_del_init(&child->ptrace_entry);
74 85
86 arch_ptrace_untrace(child);
75 if (task_is_traced(child)) 87 if (task_is_traced(child))
76 ptrace_untrace(child); 88 ptrace_untrace(child);
77} 89}
@@ -115,6 +127,8 @@ int ptrace_check_attach(struct task_struct *child, int kill)
115 127
116int __ptrace_may_access(struct task_struct *task, unsigned int mode) 128int __ptrace_may_access(struct task_struct *task, unsigned int mode)
117{ 129{
130 const struct cred *cred = current_cred(), *tcred;
131
118 /* May we inspect the given task? 132 /* May we inspect the given task?
119 * This check is used both for attaching with ptrace 133 * This check is used both for attaching with ptrace
120 * and for allowing access to sensitive information in /proc. 134 * and for allowing access to sensitive information in /proc.
@@ -127,13 +141,19 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
127 /* Don't let security modules deny introspection */ 141 /* Don't let security modules deny introspection */
128 if (task == current) 142 if (task == current)
129 return 0; 143 return 0;
130 if (((current->uid != task->euid) || 144 rcu_read_lock();
131 (current->uid != task->suid) || 145 tcred = __task_cred(task);
132 (current->uid != task->uid) || 146 if ((cred->uid != tcred->euid ||
133 (current->gid != task->egid) || 147 cred->uid != tcred->suid ||
134 (current->gid != task->sgid) || 148 cred->uid != tcred->uid ||
135 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 149 cred->gid != tcred->egid ||
150 cred->gid != tcred->sgid ||
151 cred->gid != tcred->gid) &&
152 !capable(CAP_SYS_PTRACE)) {
153 rcu_read_unlock();
136 return -EPERM; 154 return -EPERM;
155 }
156 rcu_read_unlock();
137 smp_rmb(); 157 smp_rmb();
138 if (task->mm) 158 if (task->mm)
139 dumpable = get_dumpable(task->mm); 159 dumpable = get_dumpable(task->mm);
@@ -163,6 +183,14 @@ int ptrace_attach(struct task_struct *task)
163 if (same_thread_group(task, current)) 183 if (same_thread_group(task, current))
164 goto out; 184 goto out;
165 185
186 /* Protect exec's credential calculations against our interference;
187 * SUID, SGID and LSM creds get determined differently under ptrace.
188 */
189 retval = mutex_lock_interruptible(&current->cred_exec_mutex);
190 if (retval < 0)
191 goto out;
192
193 retval = -EPERM;
166repeat: 194repeat:
167 /* 195 /*
168 * Nasty, nasty. 196 * Nasty, nasty.
@@ -202,6 +230,7 @@ repeat:
202bad: 230bad:
203 write_unlock_irqrestore(&tasklist_lock, flags); 231 write_unlock_irqrestore(&tasklist_lock, flags);
204 task_unlock(task); 232 task_unlock(task);
233 mutex_unlock(&current->cred_exec_mutex);
205out: 234out:
206 return retval; 235 return retval;
207} 236}
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..490934fc7ac3 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
63 .completed = -300, 63 .completed = -300,
64 .pending = -300, 64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 69 .cur = -300,
70 .completed = -300, 70 .completed = -300,
71 .pending = -300, 71 .pending = -300,
72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
73 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_BITS_NONE,
74}; 74};
75 75
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
85 struct rcu_ctrlblk *rcp) 85 struct rcu_ctrlblk *rcp)
86{ 86{
87 int cpu; 87 int cpu;
88 cpumask_t cpumask;
89 unsigned long flags; 88 unsigned long flags;
90 89
91 set_need_resched(); 90 set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
96 * Don't send IPI to itself. With irqs disabled, 95 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu. 96 * rdp->cpu is the current cpu.
98 * 97 *
99 * cpu_online_map is updated by the _cpu_down() 98 * cpu_online_mask is updated by the _cpu_down()
100 * using __stop_machine(). Since we're in irqs disabled 99 * using __stop_machine(). Since we're in irqs disabled
101 * section, __stop_machine() is not exectuting, hence 100 * section, __stop_machine() is not exectuting, hence
102 * the cpu_online_map is stable. 101 * the cpu_online_mask is stable.
103 * 102 *
104 * However, a cpu might have been offlined _just_ before 103 * However, a cpu might have been offlined _just_ before
105 * we disabled irqs while entering here. 104 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
107 * notification, leading to the offlined cpu's bit 106 * notification, leading to the offlined cpu's bit
108 * being set in the rcp->cpumask. 107 * being set in the rcp->cpumask.
109 * 108 *
110 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent 109 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
111 * sending smp_reschedule() to an offlined CPU. 110 * sending smp_reschedule() to an offlined CPU.
112 */ 111 */
113 cpus_and(cpumask, rcp->cpumask, cpu_online_map); 112 for_each_cpu_and(cpu,
114 cpu_clear(rdp->cpu, cpumask); 113 to_cpumask(rcp->cpumask), cpu_online_mask) {
115 for_each_cpu_mask_nr(cpu, cpumask) 114 if (cpu != rdp->cpu)
116 smp_send_reschedule(cpu); 115 smp_send_reschedule(cpu);
116 }
117 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags); 118 spin_unlock_irqrestore(&rcp->lock, flags);
119} 119}
@@ -191,9 +191,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
198 } 198 }
199 printk(" (detected by %d, t=%ld jiffies)\n", 199 printk(" (detected by %d, t=%ld jiffies)\n",
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
221 long delta; 221 long delta;
222 222
223 delta = jiffies - rcp->jiffies_stall; 223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { 224 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
225 delta >= 0) {
225 226
226 /* We haven't checked in, so go dump stack. */ 227 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp); 228 print_cpu_stall(rcp);
@@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 394 * unnecessarily.
394 */ 395 */
395 smp_mb(); 396 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 397 cpumask_andnot(to_cpumask(rcp->cpumask),
398 cpu_online_mask, nohz_cpu_mask);
397 399
398 rcp->signaled = 0; 400 rcp->signaled = 0;
399 } 401 }
@@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
406 */ 408 */
407static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) 409static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
408{ 410{
409 cpu_clear(cpu, rcp->cpumask); 411 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
410 if (cpus_empty(rcp->cpumask)) { 412 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
411 /* batch completed ! */ 413 /* batch completed ! */
412 rcp->completed = rcp->cur; 414 rcp->completed = rcp->cur;
413 rcu_start_batch(rcp); 415 rcu_start_batch(rcp);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ad63af8b2521..d92a76a881aa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */ 80void synchronize_rcu(void)
81synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81{
82 struct rcu_synchronize rcu;
83 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu);
86 /* Wait for it. */
87 wait_for_completion(&rcu.completion);
88}
82EXPORT_SYMBOL_GPL(synchronize_rcu); 89EXPORT_SYMBOL_GPL(synchronize_rcu);
83 90
84static void rcu_barrier_callback(struct rcu_head *notused) 91static void rcu_barrier_callback(struct rcu_head *notused)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..33cfc50781f9 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
164 { "idle", "waitack", "waitzero", "waitmb" }; 164 { "idle", "waitack", "waitzero", "waitmb" };
165#endif /* #ifdef CONFIG_RCU_TRACE */ 165#endif /* #ifdef CONFIG_RCU_TRACE */
166 166
167static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; 167static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
168 = CPU_BITS_NONE;
168 169
169/* 170/*
170 * Enum and per-CPU flag to determine when each CPU has seen 171 * Enum and per-CPU flag to determine when each CPU has seen
@@ -551,6 +552,16 @@ void rcu_irq_exit(void)
551 } 552 }
552} 553}
553 554
555void rcu_nmi_enter(void)
556{
557 rcu_irq_enter();
558}
559
560void rcu_nmi_exit(void)
561{
562 rcu_irq_exit();
563}
564
554static void dyntick_save_progress_counter(int cpu) 565static void dyntick_save_progress_counter(int cpu)
555{ 566{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 567 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
@@ -748,7 +759,7 @@ rcu_try_flip_idle(void)
748 759
749 /* Now ask each CPU for acknowledgement of the flip. */ 760 /* Now ask each CPU for acknowledgement of the flip. */
750 761
751 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 762 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
752 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 763 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
753 dyntick_save_progress_counter(cpu); 764 dyntick_save_progress_counter(cpu);
754 } 765 }
@@ -766,7 +777,7 @@ rcu_try_flip_waitack(void)
766 int cpu; 777 int cpu;
767 778
768 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 779 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
769 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 780 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
770 if (rcu_try_flip_waitack_needed(cpu) && 781 if (rcu_try_flip_waitack_needed(cpu) &&
771 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 782 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
772 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 783 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -798,7 +809,7 @@ rcu_try_flip_waitzero(void)
798 /* Check to see if the sum of the "last" counters is zero. */ 809 /* Check to see if the sum of the "last" counters is zero. */
799 810
800 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 811 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
801 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 812 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
802 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 813 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
803 if (sum != 0) { 814 if (sum != 0) {
804 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 815 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -813,7 +824,7 @@ rcu_try_flip_waitzero(void)
813 smp_mb(); /* ^^^^^^^^^^^^ */ 824 smp_mb(); /* ^^^^^^^^^^^^ */
814 825
815 /* Call for a memory barrier from each CPU. */ 826 /* Call for a memory barrier from each CPU. */
816 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 827 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
817 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 828 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
818 dyntick_save_progress_counter(cpu); 829 dyntick_save_progress_counter(cpu);
819 } 830 }
@@ -833,7 +844,7 @@ rcu_try_flip_waitmb(void)
833 int cpu; 844 int cpu;
834 845
835 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 846 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
836 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 847 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
837 if (rcu_try_flip_waitmb_needed(cpu) && 848 if (rcu_try_flip_waitmb_needed(cpu) &&
838 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 849 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
839 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 850 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1022,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
1022 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; 1033 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1023 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; 1034 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1024 1035
1025 cpu_clear(cpu, rcu_cpu_online_map); 1036 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1026 1037
1027 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1038 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1028 1039
@@ -1062,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
1062 struct rcu_data *rdp; 1073 struct rcu_data *rdp;
1063 1074
1064 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1075 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1065 cpu_set(cpu, rcu_cpu_online_map); 1076 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1066 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1077 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1067 1078
1068 /* 1079 /*
@@ -1166,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
1166 * in -rt this does -not- necessarily result in all currently executing 1177 * in -rt this does -not- necessarily result in all currently executing
1167 * interrupt -handlers- having completed. 1178 * interrupt -handlers- having completed.
1168 */ 1179 */
1169synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) 1180void __synchronize_sched(void)
1181{
1182 struct rcu_synchronize rcu;
1183
1184 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1187 /* Wait for it. */
1188 wait_for_completion(&rcu.completion);
1189}
1170EXPORT_SYMBOL_GPL(__synchronize_sched); 1190EXPORT_SYMBOL_GPL(__synchronize_sched);
1171 1191
1172/* 1192/*
@@ -1420,7 +1440,7 @@ void __init __rcu_init(void)
1420 * We don't need protection against CPU-Hotplug here 1440 * We don't need protection against CPU-Hotplug here
1421 * since 1441 * since
1422 * a) If a CPU comes online while we are iterating over the 1442 * a) If a CPU comes online while we are iterating over the
1423 * cpu_online_map below, we would only end up making a 1443 * cpu_online_mask below, we would only end up making a
1424 * duplicate call to rcu_online_cpu() which sets the corresponding 1444 * duplicate call to rcu_online_cpu() which sets the corresponding
1425 * CPU's mask in the rcu_cpu_online_map. 1445 * CPU's mask in the rcu_cpu_online_map.
1426 * 1446 *
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a55..1cff28db56b6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,28 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 mutex_lock(&fullstop_mutex);
155 if (!fullstop)
156 fullstop = FULLSTOP_SHUTDOWN;
157 mutex_unlock(&fullstop_mutex);
158 return NOTIFY_DONE;
159}
160
139/* 161/*
140 * Allocate an element from the rcu_tortures pool. 162 * Allocate an element from the rcu_tortures pool.
141 */ 163 */
@@ -199,11 +221,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 221static void
200rcu_stutter_wait(void) 222rcu_stutter_wait(void)
201{ 223{
202 while (stutter_pause_test || !rcutorture_runnable) 224 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 225 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 226 schedule_timeout_interruptible(1);
205 else 227 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 228 schedule_timeout_interruptible(round_jiffies_relative(HZ));
229 }
207} 230}
208 231
209/* 232/*
@@ -599,7 +622,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 622 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 623 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 624 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 625 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
603 schedule_timeout_uninterruptible(1); 626 schedule_timeout_uninterruptible(1);
604 return 0; 627 return 0;
605} 628}
@@ -624,7 +647,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 647 } while (!kthread_should_stop() && !fullstop);
625 648
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 649 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 650 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
628 schedule_timeout_uninterruptible(1); 651 schedule_timeout_uninterruptible(1);
629 return 0; 652 return 0;
630} 653}
@@ -734,7 +757,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 757 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 759 del_timer_sync(&t);
737 while (!kthread_should_stop()) 760 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
738 schedule_timeout_uninterruptible(1); 761 schedule_timeout_uninterruptible(1);
739 return 0; 762 return 0;
740} 763}
@@ -831,7 +854,7 @@ rcu_torture_stats(void *arg)
831 do { 854 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 855 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 856 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 857 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 858 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 859 return 0;
837} 860}
@@ -843,49 +866,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
843 */ 866 */
844static void rcu_torture_shuffle_tasks(void) 867static void rcu_torture_shuffle_tasks(void)
845{ 868{
846 cpumask_t tmp_mask; 869 cpumask_var_t tmp_mask;
847 int i; 870 int i;
848 871
849 cpus_setall(tmp_mask); 872 if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
873 BUG();
874
875 cpumask_setall(tmp_mask);
850 get_online_cpus(); 876 get_online_cpus();
851 877
852 /* No point in shuffling if there is only one online CPU (ex: UP) */ 878 /* No point in shuffling if there is only one online CPU (ex: UP) */
853 if (num_online_cpus() == 1) { 879 if (num_online_cpus() == 1)
854 put_online_cpus(); 880 goto out;
855 return;
856 }
857 881
858 if (rcu_idle_cpu != -1) 882 if (rcu_idle_cpu != -1)
859 cpu_clear(rcu_idle_cpu, tmp_mask); 883 cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
860 884
861 set_cpus_allowed_ptr(current, &tmp_mask); 885 set_cpus_allowed_ptr(current, tmp_mask);
862 886
863 if (reader_tasks) { 887 if (reader_tasks) {
864 for (i = 0; i < nrealreaders; i++) 888 for (i = 0; i < nrealreaders; i++)
865 if (reader_tasks[i]) 889 if (reader_tasks[i])
866 set_cpus_allowed_ptr(reader_tasks[i], 890 set_cpus_allowed_ptr(reader_tasks[i],
867 &tmp_mask); 891 tmp_mask);
868 } 892 }
869 893
870 if (fakewriter_tasks) { 894 if (fakewriter_tasks) {
871 for (i = 0; i < nfakewriters; i++) 895 for (i = 0; i < nfakewriters; i++)
872 if (fakewriter_tasks[i]) 896 if (fakewriter_tasks[i])
873 set_cpus_allowed_ptr(fakewriter_tasks[i], 897 set_cpus_allowed_ptr(fakewriter_tasks[i],
874 &tmp_mask); 898 tmp_mask);
875 } 899 }
876 900
877 if (writer_task) 901 if (writer_task)
878 set_cpus_allowed_ptr(writer_task, &tmp_mask); 902 set_cpus_allowed_ptr(writer_task, tmp_mask);
879 903
880 if (stats_task) 904 if (stats_task)
881 set_cpus_allowed_ptr(stats_task, &tmp_mask); 905 set_cpus_allowed_ptr(stats_task, tmp_mask);
882 906
883 if (rcu_idle_cpu == -1) 907 if (rcu_idle_cpu == -1)
884 rcu_idle_cpu = num_online_cpus() - 1; 908 rcu_idle_cpu = num_online_cpus() - 1;
885 else 909 else
886 rcu_idle_cpu--; 910 rcu_idle_cpu--;
887 911
912out:
888 put_online_cpus(); 913 put_online_cpus();
914 free_cpumask_var(tmp_mask);
889} 915}
890 916
891/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 917/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
@@ -899,7 +925,7 @@ rcu_torture_shuffle(void *arg)
899 do { 925 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 926 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 927 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 928 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 929 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 930 return 0;
905} 931}
@@ -914,10 +940,10 @@ rcu_torture_stutter(void *arg)
914 do { 940 do {
915 schedule_timeout_interruptible(stutter * HZ); 941 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 942 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 943 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 944 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 945 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 946 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 947 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 948 return 0;
923} 949}
@@ -934,12 +960,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 960 stutter, irqreader);
935} 961}
936 962
963static struct notifier_block rcutorture_nb = {
964 .notifier_call = rcutorture_shutdown_notify,
965};
966
937static void 967static void
938rcu_torture_cleanup(void) 968rcu_torture_cleanup(void)
939{ 969{
940 int i; 970 int i;
941 971
942 fullstop = 1; 972 mutex_lock(&fullstop_mutex);
973 if (!fullstop) {
974 /* If being signaled, let it happen, then exit. */
975 mutex_unlock(&fullstop_mutex);
976 schedule_timeout_interruptible(10 * HZ);
977 if (cur_ops->cb_barrier != NULL)
978 cur_ops->cb_barrier();
979 return;
980 }
981 fullstop = FULLSTOP_CLEANUP;
982 mutex_unlock(&fullstop_mutex);
983 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 984 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 985 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 986 kthread_stop(stutter_task);
@@ -1015,6 +1056,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1056 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1057 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1058
1059 mutex_lock(&fullstop_mutex);
1060
1018 /* Process args and tell the world that the torturer is on the job. */ 1061 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1062 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1063 cur_ops = torture_ops[i];
@@ -1024,6 +1067,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1067 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1068 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1069 torture_type);
1070 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1071 return (-EINVAL);
1028 } 1072 }
1029 if (cur_ops->init) 1073 if (cur_ops->init)
@@ -1146,9 +1190,12 @@ rcu_torture_init(void)
1146 goto unwind; 1190 goto unwind;
1147 } 1191 }
1148 } 1192 }
1193 register_reboot_notifier(&rcutorture_nb);
1194 mutex_unlock(&fullstop_mutex);
1149 return 0; 1195 return 0;
1150 1196
1151unwind: 1197unwind:
1198 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1199 rcu_torture_cleanup();
1153 return firsterr; 1200 return firsterr;
1154} 1201}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..f2d8638e6c60
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1,
84 .dynticks = 1,
85};
86#endif /* #ifdef CONFIG_NO_HZ */
87
88static int blimit = 10; /* Maximum callbacks per softirq. */
89static int qhimark = 10000; /* If this many pending, ignore blimit. */
90static int qlowmark = 100; /* Once only this many pending, use blimit. */
91
92static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
93
94/*
95 * Return the number of RCU batches processed thus far for debug & stats.
96 */
97long rcu_batches_completed(void)
98{
99 return rcu_state.completed;
100}
101EXPORT_SYMBOL_GPL(rcu_batches_completed);
102
103/*
104 * Return the number of RCU BH batches processed thus far for debug & stats.
105 */
106long rcu_batches_completed_bh(void)
107{
108 return rcu_bh_state.completed;
109}
110EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
111
112/*
113 * Does the CPU have callbacks ready to be invoked?
114 */
115static int
116cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
117{
118 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
119}
120
121/*
122 * Does the current CPU require a yet-as-unscheduled grace period?
123 */
124static int
125cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
126{
127 /* ACCESS_ONCE() because we are accessing outside of lock. */
128 return *rdp->nxttail[RCU_DONE_TAIL] &&
129 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
130}
131
132/*
133 * Return the root node of the specified rcu_state structure.
134 */
135static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
136{
137 return &rsp->node[0];
138}
139
140#ifdef CONFIG_SMP
141
142/*
143 * If the specified CPU is offline, tell the caller that it is in
144 * a quiescent state. Otherwise, whack it with a reschedule IPI.
145 * Grace periods can end up waiting on an offline CPU when that
146 * CPU is in the process of coming online -- it will be added to the
147 * rcu_node bitmasks before it actually makes it online. The same thing
148 * can happen while a CPU is in the process of coming online. Because this
149 * race is quite rare, we check for it after detecting that the grace
150 * period has been delayed rather than checking each and every CPU
151 * each and every time we start a new grace period.
152 */
153static int rcu_implicit_offline_qs(struct rcu_data *rdp)
154{
155 /*
156 * If the CPU is offline, it is in a quiescent state. We can
157 * trust its state not to change because interrupts are disabled.
158 */
159 if (cpu_is_offline(rdp->cpu)) {
160 rdp->offline_fqs++;
161 return 1;
162 }
163
164 /* The CPU is online, so send it a reschedule IPI. */
165 if (rdp->cpu != smp_processor_id())
166 smp_send_reschedule(rdp->cpu);
167 else
168 set_need_resched();
169 rdp->resched_ipi++;
170 return 0;
171}
172
173#endif /* #ifdef CONFIG_SMP */
174
175#ifdef CONFIG_NO_HZ
176static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
177
178/**
179 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
180 *
181 * Enter nohz mode, in other words, -leave- the mode in which RCU
182 * read-side critical sections can occur. (Though RCU read-side
183 * critical sections can occur in irq handlers in nohz mode, a possibility
184 * handled by rcu_irq_enter() and rcu_irq_exit()).
185 */
186void rcu_enter_nohz(void)
187{
188 unsigned long flags;
189 struct rcu_dynticks *rdtp;
190
191 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
192 local_irq_save(flags);
193 rdtp = &__get_cpu_var(rcu_dynticks);
194 rdtp->dynticks++;
195 rdtp->dynticks_nesting--;
196 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
197 local_irq_restore(flags);
198}
199
200/*
201 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
202 *
203 * Exit nohz mode, in other words, -enter- the mode in which RCU
204 * read-side critical sections normally occur.
205 */
206void rcu_exit_nohz(void)
207{
208 unsigned long flags;
209 struct rcu_dynticks *rdtp;
210
211 local_irq_save(flags);
212 rdtp = &__get_cpu_var(rcu_dynticks);
213 rdtp->dynticks++;
214 rdtp->dynticks_nesting++;
215 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
216 local_irq_restore(flags);
217 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
218}
219
220/**
221 * rcu_nmi_enter - inform RCU of entry to NMI context
222 *
223 * If the CPU was idle with dynamic ticks active, and there is no
224 * irq handler running, this updates rdtp->dynticks_nmi to let the
225 * RCU grace-period handling know that the CPU is active.
226 */
227void rcu_nmi_enter(void)
228{
229 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
230
231 if (rdtp->dynticks & 0x1)
232 return;
233 rdtp->dynticks_nmi++;
234 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
235 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
236}
237
238/**
239 * rcu_nmi_exit - inform RCU of exit from NMI context
240 *
241 * If the CPU was idle with dynamic ticks active, and there is no
242 * irq handler running, this updates rdtp->dynticks_nmi to let the
243 * RCU grace-period handling know that the CPU is no longer active.
244 */
245void rcu_nmi_exit(void)
246{
247 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
248
249 if (rdtp->dynticks & 0x1)
250 return;
251 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
252 rdtp->dynticks_nmi++;
253 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
254}
255
256/**
257 * rcu_irq_enter - inform RCU of entry to hard irq context
258 *
259 * If the CPU was idle with dynamic ticks active, this updates the
260 * rdtp->dynticks to let the RCU handling know that the CPU is active.
261 */
262void rcu_irq_enter(void)
263{
264 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
265
266 if (rdtp->dynticks_nesting++)
267 return;
268 rdtp->dynticks++;
269 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
270 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
271}
272
273/**
274 * rcu_irq_exit - inform RCU of exit from hard irq context
275 *
276 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
277 * to put let the RCU handling be aware that the CPU is going back to idle
278 * with no ticks.
279 */
280void rcu_irq_exit(void)
281{
282 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
283
284 if (--rdtp->dynticks_nesting)
285 return;
286 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
287 rdtp->dynticks++;
288 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
289
290 /* If the interrupt queued a callback, get out of dyntick mode. */
291 if (__get_cpu_var(rcu_data).nxtlist ||
292 __get_cpu_var(rcu_bh_data).nxtlist)
293 set_need_resched();
294}
295
296/*
297 * Record the specified "completed" value, which is later used to validate
298 * dynticks counter manipulations. Specify "rsp->completed - 1" to
299 * unconditionally invalidate any future dynticks manipulations (which is
300 * useful at the beginning of a grace period).
301 */
302static void dyntick_record_completed(struct rcu_state *rsp, long comp)
303{
304 rsp->dynticks_completed = comp;
305}
306
307#ifdef CONFIG_SMP
308
309/*
310 * Recall the previously recorded value of the completion for dynticks.
311 */
312static long dyntick_recall_completed(struct rcu_state *rsp)
313{
314 return rsp->dynticks_completed;
315}
316
317/*
318 * Snapshot the specified CPU's dynticks counter so that we can later
319 * credit them with an implicit quiescent state. Return 1 if this CPU
320 * is already in a quiescent state courtesy of dynticks idle mode.
321 */
322static int dyntick_save_progress_counter(struct rcu_data *rdp)
323{
324 int ret;
325 int snap;
326 int snap_nmi;
327
328 snap = rdp->dynticks->dynticks;
329 snap_nmi = rdp->dynticks->dynticks_nmi;
330 smp_mb(); /* Order sampling of snap with end of grace period. */
331 rdp->dynticks_snap = snap;
332 rdp->dynticks_nmi_snap = snap_nmi;
333 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
334 if (ret)
335 rdp->dynticks_fqs++;
336 return ret;
337}
338
339/*
340 * Return true if the specified CPU has passed through a quiescent
341 * state by virtue of being in or having passed through an dynticks
342 * idle state since the last call to dyntick_save_progress_counter()
343 * for this same CPU.
344 */
345static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
346{
347 long curr;
348 long curr_nmi;
349 long snap;
350 long snap_nmi;
351
352 curr = rdp->dynticks->dynticks;
353 snap = rdp->dynticks_snap;
354 curr_nmi = rdp->dynticks->dynticks_nmi;
355 snap_nmi = rdp->dynticks_nmi_snap;
356 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
357
358 /*
359 * If the CPU passed through or entered a dynticks idle phase with
360 * no active irq/NMI handlers, then we can safely pretend that the CPU
361 * already acknowledged the request to pass through a quiescent
362 * state. Either way, that CPU cannot possibly be in an RCU
363 * read-side critical section that started before the beginning
364 * of the current RCU grace period.
365 */
366 if ((curr != snap || (curr & 0x1) == 0) &&
367 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
368 rdp->dynticks_fqs++;
369 return 1;
370 }
371
372 /* Go check for the CPU being offline. */
373 return rcu_implicit_offline_qs(rdp);
374}
375
376#endif /* #ifdef CONFIG_SMP */
377
378#else /* #ifdef CONFIG_NO_HZ */
379
380static void dyntick_record_completed(struct rcu_state *rsp, long comp)
381{
382}
383
384#ifdef CONFIG_SMP
385
386/*
387 * If there are no dynticks, then the only way that a CPU can passively
388 * be in a quiescent state is to be offline. Unlike dynticks idle, which
389 * is a point in time during the prior (already finished) grace period,
390 * an offline CPU is always in a quiescent state, and thus can be
391 * unconditionally applied. So just return the current value of completed.
392 */
393static long dyntick_recall_completed(struct rcu_state *rsp)
394{
395 return rsp->completed;
396}
397
398static int dyntick_save_progress_counter(struct rcu_data *rdp)
399{
400 return 0;
401}
402
403static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
404{
405 return rcu_implicit_offline_qs(rdp);
406}
407
408#endif /* #ifdef CONFIG_SMP */
409
410#endif /* #else #ifdef CONFIG_NO_HZ */
411
412#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
413
414static void record_gp_stall_check_time(struct rcu_state *rsp)
415{
416 rsp->gp_start = jiffies;
417 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
418}
419
420static void print_other_cpu_stall(struct rcu_state *rsp)
421{
422 int cpu;
423 long delta;
424 unsigned long flags;
425 struct rcu_node *rnp = rcu_get_root(rsp);
426 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
427 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
428
429 /* Only let one CPU complain about others per time interval. */
430
431 spin_lock_irqsave(&rnp->lock, flags);
432 delta = jiffies - rsp->jiffies_stall;
433 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
434 spin_unlock_irqrestore(&rnp->lock, flags);
435 return;
436 }
437 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
438 spin_unlock_irqrestore(&rnp->lock, flags);
439
440 /* OK, time to rat on our buddy... */
441
442 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
443 for (; rnp_cur < rnp_end; rnp_cur++) {
444 if (rnp_cur->qsmask == 0)
445 continue;
446 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
447 if (rnp_cur->qsmask & (1UL << cpu))
448 printk(" %d", rnp_cur->grplo + cpu);
449 }
450 printk(" (detected by %d, t=%ld jiffies)\n",
451 smp_processor_id(), (long)(jiffies - rsp->gp_start));
452 force_quiescent_state(rsp, 0); /* Kick them all. */
453}
454
455static void print_cpu_stall(struct rcu_state *rsp)
456{
457 unsigned long flags;
458 struct rcu_node *rnp = rcu_get_root(rsp);
459
460 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
461 smp_processor_id(), jiffies - rsp->gp_start);
462 dump_stack();
463 spin_lock_irqsave(&rnp->lock, flags);
464 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
465 rsp->jiffies_stall =
466 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
467 spin_unlock_irqrestore(&rnp->lock, flags);
468 set_need_resched(); /* kick ourselves to get things going. */
469}
470
471static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
472{
473 long delta;
474 struct rcu_node *rnp;
475
476 delta = jiffies - rsp->jiffies_stall;
477 rnp = rdp->mynode;
478 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
479
480 /* We haven't checked in, so go dump stack. */
481 print_cpu_stall(rsp);
482
483 } else if (rsp->gpnum != rsp->completed &&
484 delta >= RCU_STALL_RAT_DELAY) {
485
486 /* They had two time units to dump stack, so complain. */
487 print_other_cpu_stall(rsp);
488 }
489}
490
491#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
492
493static void record_gp_stall_check_time(struct rcu_state *rsp)
494{
495}
496
497static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
498{
499}
500
501#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
502
503/*
504 * Update CPU-local rcu_data state to record the newly noticed grace period.
505 * This is used both when we started the grace period and when we notice
506 * that someone else started the grace period.
507 */
508static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
509{
510 rdp->qs_pending = 1;
511 rdp->passed_quiesc = 0;
512 rdp->gpnum = rsp->gpnum;
513 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
514 RCU_JIFFIES_TILL_FORCE_QS;
515}
516
517/*
518 * Did someone else start a new RCU grace period start since we last
519 * checked? Update local state appropriately if so. Must be called
520 * on the CPU corresponding to rdp.
521 */
522static int
523check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
524{
525 unsigned long flags;
526 int ret = 0;
527
528 local_irq_save(flags);
529 if (rdp->gpnum != rsp->gpnum) {
530 note_new_gpnum(rsp, rdp);
531 ret = 1;
532 }
533 local_irq_restore(flags);
534 return ret;
535}
536
537/*
538 * Start a new RCU grace period if warranted, re-initializing the hierarchy
539 * in preparation for detecting the next grace period. The caller must hold
540 * the root node's ->lock, which is released before return. Hard irqs must
541 * be disabled.
542 */
543static void
544rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
545 __releases(rcu_get_root(rsp)->lock)
546{
547 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
548 struct rcu_node *rnp = rcu_get_root(rsp);
549 struct rcu_node *rnp_cur;
550 struct rcu_node *rnp_end;
551
552 if (!cpu_needs_another_gp(rsp, rdp)) {
553 spin_unlock_irqrestore(&rnp->lock, flags);
554 return;
555 }
556
557 /* Advance to a new grace period and initialize state. */
558 rsp->gpnum++;
559 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
560 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
561 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
562 RCU_JIFFIES_TILL_FORCE_QS;
563 record_gp_stall_check_time(rsp);
564 dyntick_record_completed(rsp, rsp->completed - 1);
565 note_new_gpnum(rsp, rdp);
566
567 /*
568 * Because we are first, we know that all our callbacks will
569 * be covered by this upcoming grace period, even the ones
570 * that were registered arbitrarily recently.
571 */
572 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
573 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
574
575 /* Special-case the common single-level case. */
576 if (NUM_RCU_NODES == 1) {
577 rnp->qsmask = rnp->qsmaskinit;
578 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
579 spin_unlock_irqrestore(&rnp->lock, flags);
580 return;
581 }
582
583 spin_unlock(&rnp->lock); /* leave irqs disabled. */
584
585
586 /* Exclude any concurrent CPU-hotplug operations. */
587 spin_lock(&rsp->onofflock); /* irqs already disabled. */
588
589 /*
590 * Set the quiescent-state-needed bits in all the non-leaf RCU
591 * nodes for all currently online CPUs. This operation relies
592 * on the layout of the hierarchy within the rsp->node[] array.
593 * Note that other CPUs will access only the leaves of the
594 * hierarchy, which still indicate that no grace period is in
595 * progress. In addition, we have excluded CPU-hotplug operations.
596 *
597 * We therefore do not need to hold any locks. Any required
598 * memory barriers will be supplied by the locks guarding the
599 * leaf rcu_nodes in the hierarchy.
600 */
601
602 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
603 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
604 rnp_cur->qsmask = rnp_cur->qsmaskinit;
605
606 /*
607 * Now set up the leaf nodes. Here we must be careful. First,
608 * we need to hold the lock in order to exclude other CPUs, which
609 * might be contending for the leaf nodes' locks. Second, as
610 * soon as we initialize a given leaf node, its CPUs might run
611 * up the rest of the hierarchy. We must therefore acquire locks
612 * for each node that we touch during this stage. (But we still
613 * are excluding CPU-hotplug operations.)
614 *
615 * Note that the grace period cannot complete until we finish
616 * the initialization process, as there will be at least one
617 * qsmask bit set in the root node until that time, namely the
618 * one corresponding to this CPU.
619 */
620 rnp_end = &rsp->node[NUM_RCU_NODES];
621 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
622 for (; rnp_cur < rnp_end; rnp_cur++) {
623 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
624 rnp_cur->qsmask = rnp_cur->qsmaskinit;
625 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
626 }
627
628 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
629 spin_unlock_irqrestore(&rsp->onofflock, flags);
630}
631
632/*
633 * Advance this CPU's callbacks, but only if the current grace period
634 * has ended. This may be called only from the CPU to whom the rdp
635 * belongs.
636 */
637static void
638rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
639{
640 long completed_snap;
641 unsigned long flags;
642
643 local_irq_save(flags);
644 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
645
646 /* Did another grace period end? */
647 if (rdp->completed != completed_snap) {
648
649 /* Advance callbacks. No harm if list empty. */
650 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
651 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
652 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
653
654 /* Remember that we saw this grace-period completion. */
655 rdp->completed = completed_snap;
656 }
657 local_irq_restore(flags);
658}
659
660/*
661 * Similar to cpu_quiet(), for which it is a helper function. Allows
662 * a group of CPUs to be quieted at one go, though all the CPUs in the
663 * group must be represented by the same leaf rcu_node structure.
664 * That structure's lock must be held upon entry, and it is released
665 * before return.
666 */
667static void
668cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
669 unsigned long flags)
670 __releases(rnp->lock)
671{
672 /* Walk up the rcu_node hierarchy. */
673 for (;;) {
674 if (!(rnp->qsmask & mask)) {
675
676 /* Our bit has already been cleared, so done. */
677 spin_unlock_irqrestore(&rnp->lock, flags);
678 return;
679 }
680 rnp->qsmask &= ~mask;
681 if (rnp->qsmask != 0) {
682
683 /* Other bits still set at this level, so done. */
684 spin_unlock_irqrestore(&rnp->lock, flags);
685 return;
686 }
687 mask = rnp->grpmask;
688 if (rnp->parent == NULL) {
689
690 /* No more levels. Exit loop holding root lock. */
691
692 break;
693 }
694 spin_unlock_irqrestore(&rnp->lock, flags);
695 rnp = rnp->parent;
696 spin_lock_irqsave(&rnp->lock, flags);
697 }
698
699 /*
700 * Get here if we are the last CPU to pass through a quiescent
701 * state for this grace period. Clean up and let rcu_start_gp()
702 * start up the next grace period if one is needed. Note that
703 * we still hold rnp->lock, as required by rcu_start_gp(), which
704 * will release it.
705 */
706 rsp->completed = rsp->gpnum;
707 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
708 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
709}
710
711/*
712 * Record a quiescent state for the specified CPU, which must either be
713 * the current CPU or an offline CPU. The lastcomp argument is used to
714 * make sure we are still in the grace period of interest. We don't want
715 * to end the current grace period based on quiescent states detected in
716 * an earlier grace period!
717 */
718static void
719cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
720{
721 unsigned long flags;
722 unsigned long mask;
723 struct rcu_node *rnp;
724
725 rnp = rdp->mynode;
726 spin_lock_irqsave(&rnp->lock, flags);
727 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
728
729 /*
730 * Someone beat us to it for this grace period, so leave.
731 * The race with GP start is resolved by the fact that we
732 * hold the leaf rcu_node lock, so that the per-CPU bits
733 * cannot yet be initialized -- so we would simply find our
734 * CPU's bit already cleared in cpu_quiet_msk() if this race
735 * occurred.
736 */
737 rdp->passed_quiesc = 0; /* try again later! */
738 spin_unlock_irqrestore(&rnp->lock, flags);
739 return;
740 }
741 mask = rdp->grpmask;
742 if ((rnp->qsmask & mask) == 0) {
743 spin_unlock_irqrestore(&rnp->lock, flags);
744 } else {
745 rdp->qs_pending = 0;
746
747 /*
748 * This GP can't end until cpu checks in, so all of our
749 * callbacks can be processed during the next GP.
750 */
751 rdp = rsp->rda[smp_processor_id()];
752 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
753
754 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
755 }
756}
757
758/*
759 * Check to see if there is a new grace period of which this CPU
760 * is not yet aware, and if so, set up local rcu_data state for it.
761 * Otherwise, see if this CPU has just passed through its first
762 * quiescent state for this grace period, and record that fact if so.
763 */
764static void
765rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
766{
767 /* If there is now a new grace period, record and return. */
768 if (check_for_new_grace_period(rsp, rdp))
769 return;
770
771 /*
772 * Does this CPU still need to do its part for current grace period?
773 * If no, return and let the other CPUs do their part as well.
774 */
775 if (!rdp->qs_pending)
776 return;
777
778 /*
779 * Was there a quiescent state since the beginning of the grace
780 * period? If no, then exit and wait for the next call.
781 */
782 if (!rdp->passed_quiesc)
783 return;
784
785 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
786 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
787}
788
789#ifdef CONFIG_HOTPLUG_CPU
790
791/*
792 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
793 * and move all callbacks from the outgoing CPU to the current one.
794 */
795static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
796{
797 int i;
798 unsigned long flags;
799 long lastcomp;
800 unsigned long mask;
801 struct rcu_data *rdp = rsp->rda[cpu];
802 struct rcu_data *rdp_me;
803 struct rcu_node *rnp;
804
805 /* Exclude any attempts to start a new grace period. */
806 spin_lock_irqsave(&rsp->onofflock, flags);
807
808 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
809 rnp = rdp->mynode;
810 mask = rdp->grpmask; /* rnp->grplo is constant. */
811 do {
812 spin_lock(&rnp->lock); /* irqs already disabled. */
813 rnp->qsmaskinit &= ~mask;
814 if (rnp->qsmaskinit != 0) {
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 break;
817 }
818 mask = rnp->grpmask;
819 spin_unlock(&rnp->lock); /* irqs already disabled. */
820 rnp = rnp->parent;
821 } while (rnp != NULL);
822 lastcomp = rsp->completed;
823
824 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
825
826 /* Being offline is a quiescent state, so go record it. */
827 cpu_quiet(cpu, rsp, rdp, lastcomp);
828
829 /*
830 * Move callbacks from the outgoing CPU to the running CPU.
831 * Note that the outgoing CPU is now quiscent, so it is now
832 * (uncharacteristically) safe to access it rcu_data structure.
833 * Note also that we must carefully retain the order of the
834 * outgoing CPU's callbacks in order for rcu_barrier() to work
835 * correctly. Finally, note that we start all the callbacks
836 * afresh, even those that have passed through a grace period
837 * and are therefore ready to invoke. The theory is that hotplug
838 * events are rare, and that if they are frequent enough to
839 * indefinitely delay callbacks, you have far worse things to
840 * be worrying about.
841 */
842 rdp_me = rsp->rda[smp_processor_id()];
843 if (rdp->nxtlist != NULL) {
844 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
845 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
846 rdp->nxtlist = NULL;
847 for (i = 0; i < RCU_NEXT_SIZE; i++)
848 rdp->nxttail[i] = &rdp->nxtlist;
849 rdp_me->qlen += rdp->qlen;
850 rdp->qlen = 0;
851 }
852 local_irq_restore(flags);
853}
854
855/*
856 * Remove the specified CPU from the RCU hierarchy and move any pending
857 * callbacks that it might have to the current CPU. This code assumes
858 * that at least one CPU in the system will remain running at all times.
859 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
860 */
861static void rcu_offline_cpu(int cpu)
862{
863 __rcu_offline_cpu(cpu, &rcu_state);
864 __rcu_offline_cpu(cpu, &rcu_bh_state);
865}
866
867#else /* #ifdef CONFIG_HOTPLUG_CPU */
868
869static void rcu_offline_cpu(int cpu)
870{
871}
872
873#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
874
875/*
876 * Invoke any RCU callbacks that have made it to the end of their grace
877 * period. Thottle as specified by rdp->blimit.
878 */
879static void rcu_do_batch(struct rcu_data *rdp)
880{
881 unsigned long flags;
882 struct rcu_head *next, *list, **tail;
883 int count;
884
885 /* If no callbacks are ready, just return.*/
886 if (!cpu_has_callbacks_ready_to_invoke(rdp))
887 return;
888
889 /*
890 * Extract the list of ready callbacks, disabling to prevent
891 * races with call_rcu() from interrupt handlers.
892 */
893 local_irq_save(flags);
894 list = rdp->nxtlist;
895 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
896 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
897 tail = rdp->nxttail[RCU_DONE_TAIL];
898 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
899 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
900 rdp->nxttail[count] = &rdp->nxtlist;
901 local_irq_restore(flags);
902
903 /* Invoke callbacks. */
904 count = 0;
905 while (list) {
906 next = list->next;
907 prefetch(next);
908 list->func(list);
909 list = next;
910 if (++count >= rdp->blimit)
911 break;
912 }
913
914 local_irq_save(flags);
915
916 /* Update count, and requeue any remaining callbacks. */
917 rdp->qlen -= count;
918 if (list != NULL) {
919 *tail = rdp->nxtlist;
920 rdp->nxtlist = list;
921 for (count = 0; count < RCU_NEXT_SIZE; count++)
922 if (&rdp->nxtlist == rdp->nxttail[count])
923 rdp->nxttail[count] = tail;
924 else
925 break;
926 }
927
928 /* Reinstate batch limit if we have worked down the excess. */
929 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
930 rdp->blimit = blimit;
931
932 local_irq_restore(flags);
933
934 /* Re-raise the RCU softirq if there are callbacks remaining. */
935 if (cpu_has_callbacks_ready_to_invoke(rdp))
936 raise_softirq(RCU_SOFTIRQ);
937}
938
939/*
940 * Check to see if this CPU is in a non-context-switch quiescent state
941 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
942 * Also schedule the RCU softirq handler.
943 *
944 * This function must be called with hardirqs disabled. It is normally
945 * invoked from the scheduling-clock interrupt. If rcu_pending returns
946 * false, there is no point in invoking rcu_check_callbacks().
947 */
948void rcu_check_callbacks(int cpu, int user)
949{
950 if (user ||
951 (idle_cpu(cpu) && !in_softirq() &&
952 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
953
954 /*
955 * Get here if this CPU took its interrupt from user
956 * mode or from the idle loop, and if this is not a
957 * nested interrupt. In this case, the CPU is in
958 * a quiescent state, so count it.
959 *
960 * No memory barrier is required here because both
961 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
962 * only CPU-local variables that other CPUs neither
963 * access nor modify, at least not while the corresponding
964 * CPU is online.
965 */
966
967 rcu_qsctr_inc(cpu);
968 rcu_bh_qsctr_inc(cpu);
969
970 } else if (!in_softirq()) {
971
972 /*
973 * Get here if this CPU did not take its interrupt from
974 * softirq, in other words, if it is not interrupting
975 * a rcu_bh read-side critical section. This is an _bh
976 * critical section, so count it.
977 */
978
979 rcu_bh_qsctr_inc(cpu);
980 }
981 raise_softirq(RCU_SOFTIRQ);
982}
983
984#ifdef CONFIG_SMP
985
986/*
987 * Scan the leaf rcu_node structures, processing dyntick state for any that
988 * have not yet encountered a quiescent state, using the function specified.
989 * Returns 1 if the current grace period ends while scanning (possibly
990 * because we made it end).
991 */
992static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
993 int (*f)(struct rcu_data *))
994{
995 unsigned long bit;
996 int cpu;
997 unsigned long flags;
998 unsigned long mask;
999 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
1000 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1001
1002 for (; rnp_cur < rnp_end; rnp_cur++) {
1003 mask = 0;
1004 spin_lock_irqsave(&rnp_cur->lock, flags);
1005 if (rsp->completed != lastcomp) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 return 1;
1008 }
1009 if (rnp_cur->qsmask == 0) {
1010 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1011 continue;
1012 }
1013 cpu = rnp_cur->grplo;
1014 bit = 1;
1015 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1016 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1017 mask |= bit;
1018 }
1019 if (mask != 0 && rsp->completed == lastcomp) {
1020
1021 /* cpu_quiet_msk() releases rnp_cur->lock. */
1022 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1023 continue;
1024 }
1025 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1026 }
1027 return 0;
1028}
1029
1030/*
1031 * Force quiescent states on reluctant CPUs, and also detect which
1032 * CPUs are in dyntick-idle mode.
1033 */
1034static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1035{
1036 unsigned long flags;
1037 long lastcomp;
1038 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1039 struct rcu_node *rnp = rcu_get_root(rsp);
1040 u8 signaled;
1041
1042 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1043 return; /* No grace period in progress, nothing to force. */
1044 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1045 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1046 return; /* Someone else is already on the job. */
1047 }
1048 if (relaxed &&
1049 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1050 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1051 goto unlock_ret; /* no emergency and done recently. */
1052 rsp->n_force_qs++;
1053 spin_lock(&rnp->lock);
1054 lastcomp = rsp->completed;
1055 signaled = rsp->signaled;
1056 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1057 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1058 RCU_JIFFIES_TILL_FORCE_QS;
1059 if (lastcomp == rsp->gpnum) {
1060 rsp->n_force_qs_ngp++;
1061 spin_unlock(&rnp->lock);
1062 goto unlock_ret; /* no GP in progress, time updated. */
1063 }
1064 spin_unlock(&rnp->lock);
1065 switch (signaled) {
1066 case RCU_GP_INIT:
1067
1068 break; /* grace period still initializing, ignore. */
1069
1070 case RCU_SAVE_DYNTICK:
1071
1072 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1073 break; /* So gcc recognizes the dead code. */
1074
1075 /* Record dyntick-idle state. */
1076 if (rcu_process_dyntick(rsp, lastcomp,
1077 dyntick_save_progress_counter))
1078 goto unlock_ret;
1079
1080 /* Update state, record completion counter. */
1081 spin_lock(&rnp->lock);
1082 if (lastcomp == rsp->completed) {
1083 rsp->signaled = RCU_FORCE_QS;
1084 dyntick_record_completed(rsp, lastcomp);
1085 }
1086 spin_unlock(&rnp->lock);
1087 break;
1088
1089 case RCU_FORCE_QS:
1090
1091 /* Check dyntick-idle state, send IPI to laggarts. */
1092 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1093 rcu_implicit_dynticks_qs))
1094 goto unlock_ret;
1095
1096 /* Leave state in case more forcing is required. */
1097
1098 break;
1099 }
1100unlock_ret:
1101 spin_unlock_irqrestore(&rsp->fqslock, flags);
1102}
1103
1104#else /* #ifdef CONFIG_SMP */
1105
1106static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1107{
1108 set_need_resched();
1109}
1110
1111#endif /* #else #ifdef CONFIG_SMP */
1112
1113/*
1114 * This does the RCU processing work from softirq context for the
1115 * specified rcu_state and rcu_data structures. This may be called
1116 * only from the CPU to whom the rdp belongs.
1117 */
1118static void
1119__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1120{
1121 unsigned long flags;
1122
1123 /*
1124 * If an RCU GP has gone long enough, go check for dyntick
1125 * idle CPUs and, if needed, send resched IPIs.
1126 */
1127 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1128 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1129 force_quiescent_state(rsp, 1);
1130
1131 /*
1132 * Advance callbacks in response to end of earlier grace
1133 * period that some other CPU ended.
1134 */
1135 rcu_process_gp_end(rsp, rdp);
1136
1137 /* Update RCU state based on any recent quiescent states. */
1138 rcu_check_quiescent_state(rsp, rdp);
1139
1140 /* Does this CPU require a not-yet-started grace period? */
1141 if (cpu_needs_another_gp(rsp, rdp)) {
1142 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1143 rcu_start_gp(rsp, flags); /* releases above lock */
1144 }
1145
1146 /* If there are callbacks ready, invoke them. */
1147 rcu_do_batch(rdp);
1148}
1149
1150/*
1151 * Do softirq processing for the current CPU.
1152 */
1153static void rcu_process_callbacks(struct softirq_action *unused)
1154{
1155 /*
1156 * Memory references from any prior RCU read-side critical sections
1157 * executed by the interrupted code must be seen before any RCU
1158 * grace-period manipulations below.
1159 */
1160 smp_mb(); /* See above block comment. */
1161
1162 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1163 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1164
1165 /*
1166 * Memory references from any later RCU read-side critical sections
1167 * executed by the interrupted code must be seen after any RCU
1168 * grace-period manipulations above.
1169 */
1170 smp_mb(); /* See above block comment. */
1171}
1172
1173static void
1174__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1175 struct rcu_state *rsp)
1176{
1177 unsigned long flags;
1178 struct rcu_data *rdp;
1179
1180 head->func = func;
1181 head->next = NULL;
1182
1183 smp_mb(); /* Ensure RCU update seen before callback registry. */
1184
1185 /*
1186 * Opportunistically note grace-period endings and beginnings.
1187 * Note that we might see a beginning right after we see an
1188 * end, but never vice versa, since this CPU has to pass through
1189 * a quiescent state betweentimes.
1190 */
1191 local_irq_save(flags);
1192 rdp = rsp->rda[smp_processor_id()];
1193 rcu_process_gp_end(rsp, rdp);
1194 check_for_new_grace_period(rsp, rdp);
1195
1196 /* Add the callback to our list. */
1197 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1198 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1199
1200 /* Start a new grace period if one not already started. */
1201 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1202 unsigned long nestflag;
1203 struct rcu_node *rnp_root = rcu_get_root(rsp);
1204
1205 spin_lock_irqsave(&rnp_root->lock, nestflag);
1206 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1207 }
1208
1209 /* Force the grace period if too many callbacks or too long waiting. */
1210 if (unlikely(++rdp->qlen > qhimark)) {
1211 rdp->blimit = LONG_MAX;
1212 force_quiescent_state(rsp, 0);
1213 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1214 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1215 force_quiescent_state(rsp, 1);
1216 local_irq_restore(flags);
1217}
1218
1219/*
1220 * Queue an RCU callback for invocation after a grace period.
1221 */
1222void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1223{
1224 __call_rcu(head, func, &rcu_state);
1225}
1226EXPORT_SYMBOL_GPL(call_rcu);
1227
1228/*
1229 * Queue an RCU for invocation after a quicker grace period.
1230 */
1231void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1232{
1233 __call_rcu(head, func, &rcu_bh_state);
1234}
1235EXPORT_SYMBOL_GPL(call_rcu_bh);
1236
1237/*
1238 * Check to see if there is any immediate RCU-related work to be done
1239 * by the current CPU, for the specified type of RCU, returning 1 if so.
1240 * The checks are in order of increasing expense: checks that can be
1241 * carried out against CPU-local state are performed first. However,
1242 * we must check for CPU stalls first, else we might not get a chance.
1243 */
1244static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1245{
1246 rdp->n_rcu_pending++;
1247
1248 /* Check for CPU stalls, if enabled. */
1249 check_cpu_stall(rsp, rdp);
1250
1251 /* Is the RCU core waiting for a quiescent state from this CPU? */
1252 if (rdp->qs_pending)
1253 return 1;
1254
1255 /* Does this CPU have callbacks ready to invoke? */
1256 if (cpu_has_callbacks_ready_to_invoke(rdp))
1257 return 1;
1258
1259 /* Has RCU gone idle with this CPU needing another grace period? */
1260 if (cpu_needs_another_gp(rsp, rdp))
1261 return 1;
1262
1263 /* Has another RCU grace period completed? */
1264 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1265 return 1;
1266
1267 /* Has a new RCU grace period started? */
1268 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1269 return 1;
1270
1271 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1272 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1273 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1274 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1275 return 1;
1276
1277 /* nothing to do */
1278 return 0;
1279}
1280
1281/*
1282 * Check to see if there is any immediate RCU-related work to be done
1283 * by the current CPU, returning 1 if so. This function is part of the
1284 * RCU implementation; it is -not- an exported member of the RCU API.
1285 */
1286int rcu_pending(int cpu)
1287{
1288 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1289 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1290}
1291
1292/*
1293 * Check to see if any future RCU-related work will need to be done
1294 * by the current CPU, even if none need be done immediately, returning
1295 * 1 if so. This function is part of the RCU implementation; it is -not-
1296 * an exported member of the RCU API.
1297 */
1298int rcu_needs_cpu(int cpu)
1299{
1300 /* RCU callbacks either ready or pending? */
1301 return per_cpu(rcu_data, cpu).nxtlist ||
1302 per_cpu(rcu_bh_data, cpu).nxtlist;
1303}
1304
1305/*
1306 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1307 * approach so that we don't have to worry about how long the CPU has
1308 * been gone, or whether it ever was online previously. We do trust the
1309 * ->mynode field, as it is constant for a given struct rcu_data and
1310 * initialized during early boot.
1311 *
1312 * Note that only one online or offline event can be happening at a given
1313 * time. Note also that we can accept some slop in the rsp->completed
1314 * access due to the fact that this CPU cannot possibly have any RCU
1315 * callbacks in flight yet.
1316 */
1317static void
1318rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1319{
1320 unsigned long flags;
1321 int i;
1322 long lastcomp;
1323 unsigned long mask;
1324 struct rcu_data *rdp = rsp->rda[cpu];
1325 struct rcu_node *rnp = rcu_get_root(rsp);
1326
1327 /* Set up local state, ensuring consistent view of global state. */
1328 spin_lock_irqsave(&rnp->lock, flags);
1329 lastcomp = rsp->completed;
1330 rdp->completed = lastcomp;
1331 rdp->gpnum = lastcomp;
1332 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1333 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1334 rdp->beenonline = 1; /* We have now been online. */
1335 rdp->passed_quiesc_completed = lastcomp - 1;
1336 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1337 rdp->nxtlist = NULL;
1338 for (i = 0; i < RCU_NEXT_SIZE; i++)
1339 rdp->nxttail[i] = &rdp->nxtlist;
1340 rdp->qlen = 0;
1341 rdp->blimit = blimit;
1342#ifdef CONFIG_NO_HZ
1343 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1344#endif /* #ifdef CONFIG_NO_HZ */
1345 rdp->cpu = cpu;
1346 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1347
1348 /*
1349 * A new grace period might start here. If so, we won't be part
1350 * of it, but that is OK, as we are currently in a quiescent state.
1351 */
1352
1353 /* Exclude any attempts to start a new GP on large systems. */
1354 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1355
1356 /* Add CPU to rcu_node bitmasks. */
1357 rnp = rdp->mynode;
1358 mask = rdp->grpmask;
1359 do {
1360 /* Exclude any attempts to start a new GP on small systems. */
1361 spin_lock(&rnp->lock); /* irqs already disabled. */
1362 rnp->qsmaskinit |= mask;
1363 mask = rnp->grpmask;
1364 spin_unlock(&rnp->lock); /* irqs already disabled. */
1365 rnp = rnp->parent;
1366 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1367
1368 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1369
1370 /*
1371 * A new grace period might start here. If so, we will be part of
1372 * it, and its gpnum will be greater than ours, so we will
1373 * participate. It is also possible for the gpnum to have been
1374 * incremented before this function was called, and the bitmasks
1375 * to not be filled out until now, in which case we will also
1376 * participate due to our gpnum being behind.
1377 */
1378
1379 /* Since it is coming online, the CPU is in a quiescent state. */
1380 cpu_quiet(cpu, rsp, rdp, lastcomp);
1381 local_irq_restore(flags);
1382}
1383
1384static void __cpuinit rcu_online_cpu(int cpu)
1385{
1386 rcu_init_percpu_data(cpu, &rcu_state);
1387 rcu_init_percpu_data(cpu, &rcu_bh_state);
1388 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1389}
1390
1391/*
1392 * Handle CPU online/offline notifcation events.
1393 */
1394static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1395 unsigned long action, void *hcpu)
1396{
1397 long cpu = (long)hcpu;
1398
1399 switch (action) {
1400 case CPU_UP_PREPARE:
1401 case CPU_UP_PREPARE_FROZEN:
1402 rcu_online_cpu(cpu);
1403 break;
1404 case CPU_DEAD:
1405 case CPU_DEAD_FROZEN:
1406 case CPU_UP_CANCELED:
1407 case CPU_UP_CANCELED_FROZEN:
1408 rcu_offline_cpu(cpu);
1409 break;
1410 default:
1411 break;
1412 }
1413 return NOTIFY_OK;
1414}
1415
1416/*
1417 * Compute the per-level fanout, either using the exact fanout specified
1418 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1419 */
1420#ifdef CONFIG_RCU_FANOUT_EXACT
1421static void __init rcu_init_levelspread(struct rcu_state *rsp)
1422{
1423 int i;
1424
1425 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1426 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1427}
1428#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1429static void __init rcu_init_levelspread(struct rcu_state *rsp)
1430{
1431 int ccur;
1432 int cprv;
1433 int i;
1434
1435 cprv = NR_CPUS;
1436 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1437 ccur = rsp->levelcnt[i];
1438 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1439 cprv = ccur;
1440 }
1441}
1442#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1443
1444/*
1445 * Helper function for rcu_init() that initializes one rcu_state structure.
1446 */
1447static void __init rcu_init_one(struct rcu_state *rsp)
1448{
1449 int cpustride = 1;
1450 int i;
1451 int j;
1452 struct rcu_node *rnp;
1453
1454 /* Initialize the level-tracking arrays. */
1455
1456 for (i = 1; i < NUM_RCU_LVLS; i++)
1457 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1458 rcu_init_levelspread(rsp);
1459
1460 /* Initialize the elements themselves, starting from the leaves. */
1461
1462 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1463 cpustride *= rsp->levelspread[i];
1464 rnp = rsp->level[i];
1465 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1466 spin_lock_init(&rnp->lock);
1467 rnp->qsmask = 0;
1468 rnp->qsmaskinit = 0;
1469 rnp->grplo = j * cpustride;
1470 rnp->grphi = (j + 1) * cpustride - 1;
1471 if (rnp->grphi >= NR_CPUS)
1472 rnp->grphi = NR_CPUS - 1;
1473 if (i == 0) {
1474 rnp->grpnum = 0;
1475 rnp->grpmask = 0;
1476 rnp->parent = NULL;
1477 } else {
1478 rnp->grpnum = j % rsp->levelspread[i - 1];
1479 rnp->grpmask = 1UL << rnp->grpnum;
1480 rnp->parent = rsp->level[i - 1] +
1481 j / rsp->levelspread[i - 1];
1482 }
1483 rnp->level = i;
1484 }
1485 }
1486}
1487
1488/*
1489 * Helper macro for __rcu_init(). To be used nowhere else!
1490 * Assigns leaf node pointers into each CPU's rcu_data structure.
1491 */
1492#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1493do { \
1494 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1495 j = 0; \
1496 for_each_possible_cpu(i) { \
1497 if (i > rnp[j].grphi) \
1498 j++; \
1499 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1500 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1501 } \
1502} while (0)
1503
1504static struct notifier_block __cpuinitdata rcu_nb = {
1505 .notifier_call = rcu_cpu_notify,
1506};
1507
1508void __init __rcu_init(void)
1509{
1510 int i; /* All used by RCU_DATA_PTR_INIT(). */
1511 int j;
1512 struct rcu_node *rnp;
1513
1514 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1516 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1517#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1518 rcu_init_one(&rcu_state);
1519 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1520 rcu_init_one(&rcu_bh_state);
1521 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1522
1523 for_each_online_cpu(i)
1524 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1525 /* Register notifier for non-boot CPUs */
1526 register_cpu_notifier(&rcu_nb);
1527 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1528}
1529
1530module_param(blimit, int, 0);
1531module_param(qhimark, int, 0);
1532module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for classic implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca772..bf8e7534c803 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17 17
18void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = (unsigned long long)LLONG_MAX;
22 counter->parent = parent;
22} 23}
23 24
24int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
34 return 0; 35 return 0;
35} 36}
36 37
37int res_counter_charge(struct res_counter *counter, unsigned long val) 38int res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at)
38{ 40{
39 int ret; 41 int ret;
40 unsigned long flags; 42 unsigned long flags;
41 43 struct res_counter *c, *u;
42 spin_lock_irqsave(&counter->lock, flags); 44
43 ret = res_counter_charge_locked(counter, val); 45 *limit_fail_at = NULL;
44 spin_unlock_irqrestore(&counter->lock, flags); 46 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val);
50 spin_unlock(&c->lock);
51 if (ret < 0) {
52 *limit_fail_at = c;
53 goto undo;
54 }
55 }
56 ret = 0;
57 goto done;
58undo:
59 for (u = counter; u != c; u = u->parent) {
60 spin_lock(&u->lock);
61 res_counter_uncharge_locked(u, val);
62 spin_unlock(&u->lock);
63 }
64done:
65 local_irq_restore(flags);
45 return ret; 66 return ret;
46} 67}
47 68
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
56void res_counter_uncharge(struct res_counter *counter, unsigned long val) 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
57{ 78{
58 unsigned long flags; 79 unsigned long flags;
80 struct res_counter *c;
59 81
60 spin_lock_irqsave(&counter->lock, flags); 82 local_irq_save(flags);
61 res_counter_uncharge_locked(counter, val); 83 for (c = counter; c != NULL; c = c->parent) {
62 spin_unlock_irqrestore(&counter->lock, flags); 84 spin_lock(&c->lock);
85 res_counter_uncharge_locked(c, val);
86 spin_unlock(&c->lock);
87 }
88 local_irq_restore(flags);
63} 89}
64 90
65 91
diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663ef..ca6a1536b205 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
623 */ 623 */
624struct resource * __request_region(struct resource *parent, 624struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 625 resource_size_t start, resource_size_t n,
626 const char *name) 626 const char *name, int flags)
627{ 627{
628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
629 629
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
634 res->start = start; 634 res->start = start;
635 res->end = start + n - 1; 635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY; 636 res->flags = IORESOURCE_BUSY;
637 res->flags |= flags;
637 638
638 write_lock(&resource_lock); 639 write_lock(&resource_lock);
639 640
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
679{ 680{
680 struct resource * res; 681 struct resource * res;
681 682
682 res = __request_region(parent, start, n, "check-region"); 683 res = __request_region(parent, start, n, "check-region", 0);
683 if (!res) 684 if (!res)
684 return -EBUSY; 685 return -EBUSY;
685 686
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
776 dr->start = start; 777 dr->start = start;
777 dr->n = n; 778 dr->n = n;
778 779
779 res = __request_region(parent, start, n, name); 780 res = __request_region(parent, start, n, name, 0);
780 if (res) 781 if (res)
781 devres_add(dev, dr); 782 devres_add(dev, dr);
782 else 783 else
@@ -853,6 +854,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && 854 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) 855 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue; 856 continue;
857 /*
858 * if a resource is "BUSY", it's not a hardware resource
859 * but a driver mapping of such a resource; we don't want
860 * to warn for those; some drivers legitimately map only
861 * partial hardware resources. (example: vesafb)
862 */
863 if (p->flags & IORESOURCE_BUSY)
864 continue;
865
856 printk(KERN_WARNING "resource map sanity check conflict: " 866 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n", 867 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr, 868 (unsigned long long)addr,
@@ -867,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
867 877
868 return err; 878 return err;
869} 879}
880
881#ifdef CONFIG_STRICT_DEVMEM
882static int strict_iomem_checks = 1;
883#else
884static int strict_iomem_checks;
885#endif
886
887/*
888 * check if an address is reserved in the iomem resource tree
889 * returns 1 if reserved, 0 if not reserved.
890 */
891int iomem_is_exclusive(u64 addr)
892{
893 struct resource *p = &iomem_resource;
894 int err = 0;
895 loff_t l;
896 int size = PAGE_SIZE;
897
898 if (!strict_iomem_checks)
899 return 0;
900
901 addr = addr & PAGE_MASK;
902
903 read_lock(&resource_lock);
904 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
905 /*
906 * We can probably skip the resources without
907 * IORESOURCE_IO attribute?
908 */
909 if (p->start >= addr + size)
910 break;
911 if (p->end < addr)
912 continue;
913 if (p->flags & IORESOURCE_BUSY &&
914 p->flags & IORESOURCE_EXCLUSIVE) {
915 err = 1;
916 break;
917 }
918 }
919 read_unlock(&resource_lock);
920
921 return err;
922}
923
924static int __init strict_iomem(char *str)
925{
926 if (strstr(str, "relaxed"))
927 strict_iomem_checks = 0;
928 if (strstr(str, "strict"))
929 strict_iomem_checks = 1;
930 return 1;
931}
932
933__setup("iomem=", strict_iomem);
diff --git a/kernel/sched.c b/kernel/sched.c
index e4bb1dd7b308..deb5ac8c12f3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
118 */ 118 */
119#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
120 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
121#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
122/* 128/*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -203,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
203 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
204 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
205 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207} 212}
208 213
209static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -261,6 +266,10 @@ struct task_group {
261 struct cgroup_subsys_state css; 266 struct cgroup_subsys_state css;
262#endif 267#endif
263 268
269#ifdef CONFIG_USER_SCHED
270 uid_t uid;
271#endif
272
264#ifdef CONFIG_FAIR_GROUP_SCHED 273#ifdef CONFIG_FAIR_GROUP_SCHED
265 /* schedulable entities of this group on each cpu */ 274 /* schedulable entities of this group on each cpu */
266 struct sched_entity **se; 275 struct sched_entity **se;
@@ -286,6 +295,12 @@ struct task_group {
286 295
287#ifdef CONFIG_USER_SCHED 296#ifdef CONFIG_USER_SCHED
288 297
298/* Helper function to pass uid information to create_sched_user() */
299void set_tg_uid(struct user_struct *user)
300{
301 user->tg->uid = user->uid;
302}
303
289/* 304/*
290 * Root task group. 305 * Root task group.
291 * Every UID task group (including init_task_group aka UID-0) will 306 * Every UID task group (including init_task_group aka UID-0) will
@@ -345,7 +360,9 @@ static inline struct task_group *task_group(struct task_struct *p)
345 struct task_group *tg; 360 struct task_group *tg;
346 361
347#ifdef CONFIG_USER_SCHED 362#ifdef CONFIG_USER_SCHED
348 tg = p->user->tg; 363 rcu_read_lock();
364 tg = __task_cred(p)->user->tg;
365 rcu_read_unlock();
349#elif defined(CONFIG_CGROUP_SCHED) 366#elif defined(CONFIG_CGROUP_SCHED)
350 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 367 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
351 struct task_group, css); 368 struct task_group, css);
@@ -481,18 +498,26 @@ struct rt_rq {
481 */ 498 */
482struct root_domain { 499struct root_domain {
483 atomic_t refcount; 500 atomic_t refcount;
484 cpumask_t span; 501 cpumask_var_t span;
485 cpumask_t online; 502 cpumask_var_t online;
486 503
487 /* 504 /*
488 * The "RT overload" flag: it gets set if a CPU has more than 505 * The "RT overload" flag: it gets set if a CPU has more than
489 * one runnable RT task. 506 * one runnable RT task.
490 */ 507 */
491 cpumask_t rto_mask; 508 cpumask_var_t rto_mask;
492 atomic_t rto_count; 509 atomic_t rto_count;
493#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
494 struct cpupri cpupri; 511 struct cpupri cpupri;
495#endif 512#endif
513#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
514 /*
515 * Preferred wake up cpu nominated by sched_mc balance that will be
516 * used when most cpus are idle in the system indicating overall very
517 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
518 */
519 unsigned int sched_mc_preferred_wakeup_cpu;
520#endif
496}; 521};
497 522
498/* 523/*
@@ -586,6 +611,8 @@ struct rq {
586#ifdef CONFIG_SCHEDSTATS 611#ifdef CONFIG_SCHEDSTATS
587 /* latency stats */ 612 /* latency stats */
588 struct sched_info rq_sched_info; 613 struct sched_info rq_sched_info;
614 unsigned long long rq_cpu_time;
615 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
589 616
590 /* sys_sched_yield() stats */ 617 /* sys_sched_yield() stats */
591 unsigned int yld_exp_empty; 618 unsigned int yld_exp_empty;
@@ -703,45 +730,18 @@ static __read_mostly char *sched_feat_names[] = {
703 730
704#undef SCHED_FEAT 731#undef SCHED_FEAT
705 732
706static int sched_feat_open(struct inode *inode, struct file *filp) 733static int sched_feat_show(struct seq_file *m, void *v)
707{ 734{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 735 int i;
720 736
721 for (i = 0; sched_feat_names[i]; i++) { 737 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 738 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 739 seq_puts(m, "NO_");
724 } 740 seq_printf(m, "%s ", sched_feat_names[i]);
725
726 buf = kmalloc(len + 2, GFP_KERNEL);
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 } 741 }
742 seq_puts(m, "\n");
736 743
737 r += sprintf(buf + r, "\n"); 744 return 0;
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 745}
746 746
747static ssize_t 747static ssize_t
@@ -786,10 +786,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 786 return cnt;
787} 787}
788 788
789static int sched_feat_open(struct inode *inode, struct file *filp)
790{
791 return single_open(filp, sched_feat_show, NULL);
792}
793
789static struct file_operations sched_feat_fops = { 794static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 795 .open = sched_feat_open,
791 .read = sched_feat_read, 796 .write = sched_feat_write,
792 .write = sched_feat_write, 797 .read = seq_read,
798 .llseek = seq_lseek,
799 .release = single_release,
793}; 800};
794 801
795static __init int sched_init_debug(void) 802static __init int sched_init_debug(void)
@@ -1139,7 +1146,6 @@ static void init_rq_hrtick(struct rq *rq)
1139 1146
1140 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1147 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1141 rq->hrtick_timer.function = hrtick; 1148 rq->hrtick_timer.function = hrtick;
1142 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1143} 1149}
1144#else /* CONFIG_SCHED_HRTICK */ 1150#else /* CONFIG_SCHED_HRTICK */
1145static inline void hrtick_clear(struct rq *rq) 1151static inline void hrtick_clear(struct rq *rq)
@@ -1474,27 +1480,13 @@ static void
1474update_group_shares_cpu(struct task_group *tg, int cpu, 1480update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight) 1481 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{ 1482{
1477 int boost = 0;
1478 unsigned long shares; 1483 unsigned long shares;
1479 unsigned long rq_weight; 1484 unsigned long rq_weight;
1480 1485
1481 if (!tg->se[cpu]) 1486 if (!tg->se[cpu])
1482 return; 1487 return;
1483 1488
1484 rq_weight = tg->cfs_rq[cpu]->load.weight; 1489 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1485
1486 /*
1487 * If there are currently no tasks on the cpu pretend there is one of
1488 * average load so that when a new task gets to run here it will not
1489 * get delayed by group starvation.
1490 */
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498 1490
1499 /* 1491 /*
1500 * \Sum shares * rq_weight 1492 * \Sum shares * rq_weight
@@ -1502,7 +1494,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1502 * \Sum rq_weight 1494 * \Sum rq_weight
1503 * 1495 *
1504 */ 1496 */
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1497 shares = (sd_shares * rq_weight) / sd_rq_weight;
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1498 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507 1499
1508 if (abs(shares - tg->se[cpu]->load.weight) > 1500 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1503,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1511 unsigned long flags; 1503 unsigned long flags;
1512 1504
1513 spin_lock_irqsave(&rq->lock, flags); 1505 spin_lock_irqsave(&rq->lock, flags);
1514 /* 1506 tg->cfs_rq[cpu]->shares = shares;
1515 * record the actual number of shares, not the boosted amount.
1516 */
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519 1507
1520 __set_se_shares(tg->se[cpu], shares); 1508 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags); 1509 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1517,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1529 */ 1517 */
1530static int tg_shares_up(struct task_group *tg, void *data) 1518static int tg_shares_up(struct task_group *tg, void *data)
1531{ 1519{
1532 unsigned long rq_weight = 0; 1520 unsigned long weight, rq_weight = 0;
1533 unsigned long shares = 0; 1521 unsigned long shares = 0;
1534 struct sched_domain *sd = data; 1522 struct sched_domain *sd = data;
1535 int i; 1523 int i;
1536 1524
1537 for_each_cpu_mask(i, sd->span) { 1525 for_each_cpu(i, sched_domain_span(sd)) {
1538 rq_weight += tg->cfs_rq[i]->load.weight; 1526 /*
1527 * If there are currently no tasks on the cpu pretend there
1528 * is one of average load so that when a new task gets to
1529 * run here it will not get delayed by group starvation.
1530 */
1531 weight = tg->cfs_rq[i]->load.weight;
1532 if (!weight)
1533 weight = NICE_0_LOAD;
1534
1535 tg->cfs_rq[i]->rq_weight = weight;
1536 rq_weight += weight;
1539 shares += tg->cfs_rq[i]->shares; 1537 shares += tg->cfs_rq[i]->shares;
1540 } 1538 }
1541 1539
@@ -1545,10 +1543,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1543 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares; 1544 shares = tg->shares;
1547 1545
1548 if (!rq_weight) 1546 for_each_cpu(i, sched_domain_span(sd))
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight); 1547 update_group_shares_cpu(tg, i, shares, rq_weight);
1553 1548
1554 return 0; 1549 return 0;
@@ -1612,6 +1607,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1612 1607
1613#endif 1608#endif
1614 1609
1610/*
1611 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1612 */
1613static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1614 __releases(this_rq->lock)
1615 __acquires(busiest->lock)
1616 __acquires(this_rq->lock)
1617{
1618 int ret = 0;
1619
1620 if (unlikely(!irqs_disabled())) {
1621 /* printk() doesn't work good under rq->lock */
1622 spin_unlock(&this_rq->lock);
1623 BUG_ON(1);
1624 }
1625 if (unlikely(!spin_trylock(&busiest->lock))) {
1626 if (busiest < this_rq) {
1627 spin_unlock(&this_rq->lock);
1628 spin_lock(&busiest->lock);
1629 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1630 ret = 1;
1631 } else
1632 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1633 }
1634 return ret;
1635}
1636
1637static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1638 __releases(busiest->lock)
1639{
1640 spin_unlock(&busiest->lock);
1641 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1642}
1615#endif 1643#endif
1616 1644
1617#ifdef CONFIG_FAIR_GROUP_SCHED 1645#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1845,6 +1873,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1845 1873
1846 clock_offset = old_rq->clock - new_rq->clock; 1874 clock_offset = old_rq->clock - new_rq->clock;
1847 1875
1876 trace_sched_migrate_task(p, task_cpu(p), new_cpu);
1877
1848#ifdef CONFIG_SCHEDSTATS 1878#ifdef CONFIG_SCHEDSTATS
1849 if (p->se.wait_start) 1879 if (p->se.wait_start)
1850 p->se.wait_start -= clock_offset; 1880 p->se.wait_start -= clock_offset;
@@ -2079,15 +2109,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2079 int i; 2109 int i;
2080 2110
2081 /* Skip over this group if it has no CPUs allowed */ 2111 /* Skip over this group if it has no CPUs allowed */
2082 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2112 if (!cpumask_intersects(sched_group_cpus(group),
2113 &p->cpus_allowed))
2083 continue; 2114 continue;
2084 2115
2085 local_group = cpu_isset(this_cpu, group->cpumask); 2116 local_group = cpumask_test_cpu(this_cpu,
2117 sched_group_cpus(group));
2086 2118
2087 /* Tally up the load of all CPUs in the group */ 2119 /* Tally up the load of all CPUs in the group */
2088 avg_load = 0; 2120 avg_load = 0;
2089 2121
2090 for_each_cpu_mask_nr(i, group->cpumask) { 2122 for_each_cpu(i, sched_group_cpus(group)) {
2091 /* Bias balancing toward cpus of our domain */ 2123 /* Bias balancing toward cpus of our domain */
2092 if (local_group) 2124 if (local_group)
2093 load = source_load(i, load_idx); 2125 load = source_load(i, load_idx);
@@ -2119,17 +2151,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2119 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2151 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2120 */ 2152 */
2121static int 2153static int
2122find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2154find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2123 cpumask_t *tmp)
2124{ 2155{
2125 unsigned long load, min_load = ULONG_MAX; 2156 unsigned long load, min_load = ULONG_MAX;
2126 int idlest = -1; 2157 int idlest = -1;
2127 int i; 2158 int i;
2128 2159
2129 /* Traverse only the allowed CPUs */ 2160 /* Traverse only the allowed CPUs */
2130 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2161 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2131
2132 for_each_cpu_mask_nr(i, *tmp) {
2133 load = weighted_cpuload(i); 2162 load = weighted_cpuload(i);
2134 2163
2135 if (load < min_load || (load == min_load && i == this_cpu)) { 2164 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2171,7 +2200,6 @@ static int sched_balance_self(int cpu, int flag)
2171 update_shares(sd); 2200 update_shares(sd);
2172 2201
2173 while (sd) { 2202 while (sd) {
2174 cpumask_t span, tmpmask;
2175 struct sched_group *group; 2203 struct sched_group *group;
2176 int new_cpu, weight; 2204 int new_cpu, weight;
2177 2205
@@ -2180,14 +2208,13 @@ static int sched_balance_self(int cpu, int flag)
2180 continue; 2208 continue;
2181 } 2209 }
2182 2210
2183 span = sd->span;
2184 group = find_idlest_group(sd, t, cpu); 2211 group = find_idlest_group(sd, t, cpu);
2185 if (!group) { 2212 if (!group) {
2186 sd = sd->child; 2213 sd = sd->child;
2187 continue; 2214 continue;
2188 } 2215 }
2189 2216
2190 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2217 new_cpu = find_idlest_cpu(group, t, cpu);
2191 if (new_cpu == -1 || new_cpu == cpu) { 2218 if (new_cpu == -1 || new_cpu == cpu) {
2192 /* Now try balancing at a lower domain level of cpu */ 2219 /* Now try balancing at a lower domain level of cpu */
2193 sd = sd->child; 2220 sd = sd->child;
@@ -2196,10 +2223,10 @@ static int sched_balance_self(int cpu, int flag)
2196 2223
2197 /* Now try balancing at a lower domain level of new_cpu */ 2224 /* Now try balancing at a lower domain level of new_cpu */
2198 cpu = new_cpu; 2225 cpu = new_cpu;
2226 weight = cpumask_weight(sched_domain_span(sd));
2199 sd = NULL; 2227 sd = NULL;
2200 weight = cpus_weight(span);
2201 for_each_domain(cpu, tmp) { 2228 for_each_domain(cpu, tmp) {
2202 if (weight <= cpus_weight(tmp->span)) 2229 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2203 break; 2230 break;
2204 if (tmp->flags & flag) 2231 if (tmp->flags & flag)
2205 sd = tmp; 2232 sd = tmp;
@@ -2244,7 +2271,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2244 cpu = task_cpu(p); 2271 cpu = task_cpu(p);
2245 2272
2246 for_each_domain(this_cpu, sd) { 2273 for_each_domain(this_cpu, sd) {
2247 if (cpu_isset(cpu, sd->span)) { 2274 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2248 update_shares(sd); 2275 update_shares(sd);
2249 break; 2276 break;
2250 } 2277 }
@@ -2254,6 +2281,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2254 2281
2255 smp_wmb(); 2282 smp_wmb();
2256 rq = task_rq_lock(p, &flags); 2283 rq = task_rq_lock(p, &flags);
2284 update_rq_clock(rq);
2257 old_state = p->state; 2285 old_state = p->state;
2258 if (!(old_state & state)) 2286 if (!(old_state & state))
2259 goto out; 2287 goto out;
@@ -2292,7 +2320,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2292 else { 2320 else {
2293 struct sched_domain *sd; 2321 struct sched_domain *sd;
2294 for_each_domain(this_cpu, sd) { 2322 for_each_domain(this_cpu, sd) {
2295 if (cpu_isset(cpu, sd->span)) { 2323 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2296 schedstat_inc(sd, ttwu_wake_remote); 2324 schedstat_inc(sd, ttwu_wake_remote);
2297 break; 2325 break;
2298 } 2326 }
@@ -2311,12 +2339,11 @@ out_activate:
2311 schedstat_inc(p, se.nr_wakeups_local); 2339 schedstat_inc(p, se.nr_wakeups_local);
2312 else 2340 else
2313 schedstat_inc(p, se.nr_wakeups_remote); 2341 schedstat_inc(p, se.nr_wakeups_remote);
2314 update_rq_clock(rq);
2315 activate_task(rq, p, 1); 2342 activate_task(rq, p, 1);
2316 success = 1; 2343 success = 1;
2317 2344
2318out_running: 2345out_running:
2319 trace_sched_wakeup(rq, p); 2346 trace_sched_wakeup(rq, p, success);
2320 check_preempt_curr(rq, p, sync); 2347 check_preempt_curr(rq, p, sync);
2321 2348
2322 p->state = TASK_RUNNING; 2349 p->state = TASK_RUNNING;
@@ -2449,7 +2476,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2449 p->sched_class->task_new(rq, p); 2476 p->sched_class->task_new(rq, p);
2450 inc_nr_running(rq); 2477 inc_nr_running(rq);
2451 } 2478 }
2452 trace_sched_wakeup_new(rq, p); 2479 trace_sched_wakeup_new(rq, p, 1);
2453 check_preempt_curr(rq, p, 0); 2480 check_preempt_curr(rq, p, 0);
2454#ifdef CONFIG_SMP 2481#ifdef CONFIG_SMP
2455 if (p->sched_class->task_wake_up) 2482 if (p->sched_class->task_wake_up)
@@ -2812,40 +2839,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2812} 2839}
2813 2840
2814/* 2841/*
2815 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2816 */
2817static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2818 __releases(this_rq->lock)
2819 __acquires(busiest->lock)
2820 __acquires(this_rq->lock)
2821{
2822 int ret = 0;
2823
2824 if (unlikely(!irqs_disabled())) {
2825 /* printk() doesn't work good under rq->lock */
2826 spin_unlock(&this_rq->lock);
2827 BUG_ON(1);
2828 }
2829 if (unlikely(!spin_trylock(&busiest->lock))) {
2830 if (busiest < this_rq) {
2831 spin_unlock(&this_rq->lock);
2832 spin_lock(&busiest->lock);
2833 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2834 ret = 1;
2835 } else
2836 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2837 }
2838 return ret;
2839}
2840
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock)
2843{
2844 spin_unlock(&busiest->lock);
2845 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2846}
2847
2848/*
2849 * If dest_cpu is allowed for this process, migrate the task to it. 2842 * If dest_cpu is allowed for this process, migrate the task to it.
2850 * This is accomplished by forcing the cpu_allowed mask to only 2843 * This is accomplished by forcing the cpu_allowed mask to only
2851 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2844 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2858,11 +2851,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2858 struct rq *rq; 2851 struct rq *rq;
2859 2852
2860 rq = task_rq_lock(p, &flags); 2853 rq = task_rq_lock(p, &flags);
2861 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2854 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2862 || unlikely(!cpu_active(dest_cpu))) 2855 || unlikely(!cpu_active(dest_cpu)))
2863 goto out; 2856 goto out;
2864 2857
2865 trace_sched_migrate_task(rq, p, dest_cpu);
2866 /* force the process onto the specified CPU */ 2858 /* force the process onto the specified CPU */
2867 if (migrate_task(p, dest_cpu, &req)) { 2859 if (migrate_task(p, dest_cpu, &req)) {
2868 /* Need to wait for migration thread (might exit: take ref). */ 2860 /* Need to wait for migration thread (might exit: take ref). */
@@ -2924,7 +2916,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2924 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2916 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2925 * 3) are cache-hot on their current CPU. 2917 * 3) are cache-hot on their current CPU.
2926 */ 2918 */
2927 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2919 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2928 schedstat_inc(p, se.nr_failed_migrations_affine); 2920 schedstat_inc(p, se.nr_failed_migrations_affine);
2929 return 0; 2921 return 0;
2930 } 2922 }
@@ -3099,7 +3091,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3099static struct sched_group * 3091static struct sched_group *
3100find_busiest_group(struct sched_domain *sd, int this_cpu, 3092find_busiest_group(struct sched_domain *sd, int this_cpu,
3101 unsigned long *imbalance, enum cpu_idle_type idle, 3093 unsigned long *imbalance, enum cpu_idle_type idle,
3102 int *sd_idle, const cpumask_t *cpus, int *balance) 3094 int *sd_idle, const struct cpumask *cpus, int *balance)
3103{ 3095{
3104 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3096 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3105 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3097 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3135,10 +3127,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3135 unsigned long sum_avg_load_per_task; 3127 unsigned long sum_avg_load_per_task;
3136 unsigned long avg_load_per_task; 3128 unsigned long avg_load_per_task;
3137 3129
3138 local_group = cpu_isset(this_cpu, group->cpumask); 3130 local_group = cpumask_test_cpu(this_cpu,
3131 sched_group_cpus(group));
3139 3132
3140 if (local_group) 3133 if (local_group)
3141 balance_cpu = first_cpu(group->cpumask); 3134 balance_cpu = cpumask_first(sched_group_cpus(group));
3142 3135
3143 /* Tally up the load of all CPUs in the group */ 3136 /* Tally up the load of all CPUs in the group */
3144 sum_weighted_load = sum_nr_running = avg_load = 0; 3137 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3147,13 +3140,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3147 max_cpu_load = 0; 3140 max_cpu_load = 0;
3148 min_cpu_load = ~0UL; 3141 min_cpu_load = ~0UL;
3149 3142
3150 for_each_cpu_mask_nr(i, group->cpumask) { 3143 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3151 struct rq *rq; 3144 struct rq *rq = cpu_rq(i);
3152
3153 if (!cpu_isset(i, *cpus))
3154 continue;
3155
3156 rq = cpu_rq(i);
3157 3145
3158 if (*sd_idle && rq->nr_running) 3146 if (*sd_idle && rq->nr_running)
3159 *sd_idle = 0; 3147 *sd_idle = 0;
@@ -3264,8 +3252,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3264 */ 3252 */
3265 if ((sum_nr_running < min_nr_running) || 3253 if ((sum_nr_running < min_nr_running) ||
3266 (sum_nr_running == min_nr_running && 3254 (sum_nr_running == min_nr_running &&
3267 first_cpu(group->cpumask) < 3255 cpumask_first(sched_group_cpus(group)) >
3268 first_cpu(group_min->cpumask))) { 3256 cpumask_first(sched_group_cpus(group_min)))) {
3269 group_min = group; 3257 group_min = group;
3270 min_nr_running = sum_nr_running; 3258 min_nr_running = sum_nr_running;
3271 min_load_per_task = sum_weighted_load / 3259 min_load_per_task = sum_weighted_load /
@@ -3280,8 +3268,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3280 if (sum_nr_running <= group_capacity - 1) { 3268 if (sum_nr_running <= group_capacity - 1) {
3281 if (sum_nr_running > leader_nr_running || 3269 if (sum_nr_running > leader_nr_running ||
3282 (sum_nr_running == leader_nr_running && 3270 (sum_nr_running == leader_nr_running &&
3283 first_cpu(group->cpumask) > 3271 cpumask_first(sched_group_cpus(group)) <
3284 first_cpu(group_leader->cpumask))) { 3272 cpumask_first(sched_group_cpus(group_leader)))) {
3285 group_leader = group; 3273 group_leader = group;
3286 leader_nr_running = sum_nr_running; 3274 leader_nr_running = sum_nr_running;
3287 } 3275 }
@@ -3407,6 +3395,10 @@ out_balanced:
3407 3395
3408 if (this == group_leader && group_leader != group_min) { 3396 if (this == group_leader && group_leader != group_min) {
3409 *imbalance = min_load_per_task; 3397 *imbalance = min_load_per_task;
3398 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3399 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3400 cpumask_first(sched_group_cpus(group_leader));
3401 }
3410 return group_min; 3402 return group_min;
3411 } 3403 }
3412#endif 3404#endif
@@ -3420,16 +3412,16 @@ ret:
3420 */ 3412 */
3421static struct rq * 3413static struct rq *
3422find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3414find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3423 unsigned long imbalance, const cpumask_t *cpus) 3415 unsigned long imbalance, const struct cpumask *cpus)
3424{ 3416{
3425 struct rq *busiest = NULL, *rq; 3417 struct rq *busiest = NULL, *rq;
3426 unsigned long max_load = 0; 3418 unsigned long max_load = 0;
3427 int i; 3419 int i;
3428 3420
3429 for_each_cpu_mask_nr(i, group->cpumask) { 3421 for_each_cpu(i, sched_group_cpus(group)) {
3430 unsigned long wl; 3422 unsigned long wl;
3431 3423
3432 if (!cpu_isset(i, *cpus)) 3424 if (!cpumask_test_cpu(i, cpus))
3433 continue; 3425 continue;
3434 3426
3435 rq = cpu_rq(i); 3427 rq = cpu_rq(i);
@@ -3459,7 +3451,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3459 */ 3451 */
3460static int load_balance(int this_cpu, struct rq *this_rq, 3452static int load_balance(int this_cpu, struct rq *this_rq,
3461 struct sched_domain *sd, enum cpu_idle_type idle, 3453 struct sched_domain *sd, enum cpu_idle_type idle,
3462 int *balance, cpumask_t *cpus) 3454 int *balance, struct cpumask *cpus)
3463{ 3455{
3464 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3456 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3465 struct sched_group *group; 3457 struct sched_group *group;
@@ -3467,7 +3459,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3467 struct rq *busiest; 3459 struct rq *busiest;
3468 unsigned long flags; 3460 unsigned long flags;
3469 3461
3470 cpus_setall(*cpus); 3462 cpumask_setall(cpus);
3471 3463
3472 /* 3464 /*
3473 * When power savings policy is enabled for the parent domain, idle 3465 * When power savings policy is enabled for the parent domain, idle
@@ -3527,8 +3519,8 @@ redo:
3527 3519
3528 /* All tasks on this runqueue were pinned by CPU affinity */ 3520 /* All tasks on this runqueue were pinned by CPU affinity */
3529 if (unlikely(all_pinned)) { 3521 if (unlikely(all_pinned)) {
3530 cpu_clear(cpu_of(busiest), *cpus); 3522 cpumask_clear_cpu(cpu_of(busiest), cpus);
3531 if (!cpus_empty(*cpus)) 3523 if (!cpumask_empty(cpus))
3532 goto redo; 3524 goto redo;
3533 goto out_balanced; 3525 goto out_balanced;
3534 } 3526 }
@@ -3545,7 +3537,8 @@ redo:
3545 /* don't kick the migration_thread, if the curr 3537 /* don't kick the migration_thread, if the curr
3546 * task on busiest cpu can't be moved to this_cpu 3538 * task on busiest cpu can't be moved to this_cpu
3547 */ 3539 */
3548 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3540 if (!cpumask_test_cpu(this_cpu,
3541 &busiest->curr->cpus_allowed)) {
3549 spin_unlock_irqrestore(&busiest->lock, flags); 3542 spin_unlock_irqrestore(&busiest->lock, flags);
3550 all_pinned = 1; 3543 all_pinned = 1;
3551 goto out_one_pinned; 3544 goto out_one_pinned;
@@ -3620,7 +3613,7 @@ out:
3620 */ 3613 */
3621static int 3614static int
3622load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3615load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3623 cpumask_t *cpus) 3616 struct cpumask *cpus)
3624{ 3617{
3625 struct sched_group *group; 3618 struct sched_group *group;
3626 struct rq *busiest = NULL; 3619 struct rq *busiest = NULL;
@@ -3629,7 +3622,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3629 int sd_idle = 0; 3622 int sd_idle = 0;
3630 int all_pinned = 0; 3623 int all_pinned = 0;
3631 3624
3632 cpus_setall(*cpus); 3625 cpumask_setall(cpus);
3633 3626
3634 /* 3627 /*
3635 * When power savings policy is enabled for the parent domain, idle 3628 * When power savings policy is enabled for the parent domain, idle
@@ -3673,17 +3666,76 @@ redo:
3673 double_unlock_balance(this_rq, busiest); 3666 double_unlock_balance(this_rq, busiest);
3674 3667
3675 if (unlikely(all_pinned)) { 3668 if (unlikely(all_pinned)) {
3676 cpu_clear(cpu_of(busiest), *cpus); 3669 cpumask_clear_cpu(cpu_of(busiest), cpus);
3677 if (!cpus_empty(*cpus)) 3670 if (!cpumask_empty(cpus))
3678 goto redo; 3671 goto redo;
3679 } 3672 }
3680 } 3673 }
3681 3674
3682 if (!ld_moved) { 3675 if (!ld_moved) {
3676 int active_balance = 0;
3677
3683 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3678 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3684 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3679 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3685 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3686 return -1; 3681 return -1;
3682
3683 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3684 return -1;
3685
3686 if (sd->nr_balance_failed++ < 2)
3687 return -1;
3688
3689 /*
3690 * The only task running in a non-idle cpu can be moved to this
3691 * cpu in an attempt to completely freeup the other CPU
3692 * package. The same method used to move task in load_balance()
3693 * have been extended for load_balance_newidle() to speedup
3694 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
3695 *
3696 * The package power saving logic comes from
3697 * find_busiest_group(). If there are no imbalance, then
3698 * f_b_g() will return NULL. However when sched_mc={1,2} then
3699 * f_b_g() will select a group from which a running task may be
3700 * pulled to this cpu in order to make the other package idle.
3701 * If there is no opportunity to make a package idle and if
3702 * there are no imbalance, then f_b_g() will return NULL and no
3703 * action will be taken in load_balance_newidle().
3704 *
3705 * Under normal task pull operation due to imbalance, there
3706 * will be more than one task in the source run queue and
3707 * move_tasks() will succeed. ld_moved will be true and this
3708 * active balance code will not be triggered.
3709 */
3710
3711 /* Lock busiest in correct order while this_rq is held */
3712 double_lock_balance(this_rq, busiest);
3713
3714 /*
3715 * don't kick the migration_thread, if the curr
3716 * task on busiest cpu can't be moved to this_cpu
3717 */
3718 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
3719 double_unlock_balance(this_rq, busiest);
3720 all_pinned = 1;
3721 return ld_moved;
3722 }
3723
3724 if (!busiest->active_balance) {
3725 busiest->active_balance = 1;
3726 busiest->push_cpu = this_cpu;
3727 active_balance = 1;
3728 }
3729
3730 double_unlock_balance(this_rq, busiest);
3731 /*
3732 * Should not call ttwu while holding a rq->lock
3733 */
3734 spin_unlock(&this_rq->lock);
3735 if (active_balance)
3736 wake_up_process(busiest->migration_thread);
3737 spin_lock(&this_rq->lock);
3738
3687 } else 3739 } else
3688 sd->nr_balance_failed = 0; 3740 sd->nr_balance_failed = 0;
3689 3741
@@ -3707,9 +3759,12 @@ out_balanced:
3707static void idle_balance(int this_cpu, struct rq *this_rq) 3759static void idle_balance(int this_cpu, struct rq *this_rq)
3708{ 3760{
3709 struct sched_domain *sd; 3761 struct sched_domain *sd;
3710 int pulled_task = -1; 3762 int pulled_task = 0;
3711 unsigned long next_balance = jiffies + HZ; 3763 unsigned long next_balance = jiffies + HZ;
3712 cpumask_t tmpmask; 3764 cpumask_var_t tmpmask;
3765
3766 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3767 return;
3713 3768
3714 for_each_domain(this_cpu, sd) { 3769 for_each_domain(this_cpu, sd) {
3715 unsigned long interval; 3770 unsigned long interval;
@@ -3720,7 +3775,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3720 if (sd->flags & SD_BALANCE_NEWIDLE) 3775 if (sd->flags & SD_BALANCE_NEWIDLE)
3721 /* If we've pulled tasks over stop searching: */ 3776 /* If we've pulled tasks over stop searching: */
3722 pulled_task = load_balance_newidle(this_cpu, this_rq, 3777 pulled_task = load_balance_newidle(this_cpu, this_rq,
3723 sd, &tmpmask); 3778 sd, tmpmask);
3724 3779
3725 interval = msecs_to_jiffies(sd->balance_interval); 3780 interval = msecs_to_jiffies(sd->balance_interval);
3726 if (time_after(next_balance, sd->last_balance + interval)) 3781 if (time_after(next_balance, sd->last_balance + interval))
@@ -3735,6 +3790,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3735 */ 3790 */
3736 this_rq->next_balance = next_balance; 3791 this_rq->next_balance = next_balance;
3737 } 3792 }
3793 free_cpumask_var(tmpmask);
3738} 3794}
3739 3795
3740/* 3796/*
@@ -3772,7 +3828,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3772 /* Search for an sd spanning us and the target CPU. */ 3828 /* Search for an sd spanning us and the target CPU. */
3773 for_each_domain(target_cpu, sd) { 3829 for_each_domain(target_cpu, sd) {
3774 if ((sd->flags & SD_LOAD_BALANCE) && 3830 if ((sd->flags & SD_LOAD_BALANCE) &&
3775 cpu_isset(busiest_cpu, sd->span)) 3831 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3776 break; 3832 break;
3777 } 3833 }
3778 3834
@@ -3791,10 +3847,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3791#ifdef CONFIG_NO_HZ 3847#ifdef CONFIG_NO_HZ
3792static struct { 3848static struct {
3793 atomic_t load_balancer; 3849 atomic_t load_balancer;
3794 cpumask_t cpu_mask; 3850 cpumask_var_t cpu_mask;
3795} nohz ____cacheline_aligned = { 3851} nohz ____cacheline_aligned = {
3796 .load_balancer = ATOMIC_INIT(-1), 3852 .load_balancer = ATOMIC_INIT(-1),
3797 .cpu_mask = CPU_MASK_NONE,
3798}; 3853};
3799 3854
3800/* 3855/*
@@ -3822,7 +3877,7 @@ int select_nohz_load_balancer(int stop_tick)
3822 int cpu = smp_processor_id(); 3877 int cpu = smp_processor_id();
3823 3878
3824 if (stop_tick) { 3879 if (stop_tick) {
3825 cpu_set(cpu, nohz.cpu_mask); 3880 cpumask_set_cpu(cpu, nohz.cpu_mask);
3826 cpu_rq(cpu)->in_nohz_recently = 1; 3881 cpu_rq(cpu)->in_nohz_recently = 1;
3827 3882
3828 /* 3883 /*
@@ -3836,7 +3891,7 @@ int select_nohz_load_balancer(int stop_tick)
3836 } 3891 }
3837 3892
3838 /* time for ilb owner also to sleep */ 3893 /* time for ilb owner also to sleep */
3839 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3894 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3840 if (atomic_read(&nohz.load_balancer) == cpu) 3895 if (atomic_read(&nohz.load_balancer) == cpu)
3841 atomic_set(&nohz.load_balancer, -1); 3896 atomic_set(&nohz.load_balancer, -1);
3842 return 0; 3897 return 0;
@@ -3849,10 +3904,10 @@ int select_nohz_load_balancer(int stop_tick)
3849 } else if (atomic_read(&nohz.load_balancer) == cpu) 3904 } else if (atomic_read(&nohz.load_balancer) == cpu)
3850 return 1; 3905 return 1;
3851 } else { 3906 } else {
3852 if (!cpu_isset(cpu, nohz.cpu_mask)) 3907 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3853 return 0; 3908 return 0;
3854 3909
3855 cpu_clear(cpu, nohz.cpu_mask); 3910 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3856 3911
3857 if (atomic_read(&nohz.load_balancer) == cpu) 3912 if (atomic_read(&nohz.load_balancer) == cpu)
3858 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3913 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3880,7 +3935,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3880 unsigned long next_balance = jiffies + 60*HZ; 3935 unsigned long next_balance = jiffies + 60*HZ;
3881 int update_next_balance = 0; 3936 int update_next_balance = 0;
3882 int need_serialize; 3937 int need_serialize;
3883 cpumask_t tmp; 3938 cpumask_var_t tmp;
3939
3940 /* Fails alloc? Rebalancing probably not a priority right now. */
3941 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3942 return;
3884 3943
3885 for_each_domain(cpu, sd) { 3944 for_each_domain(cpu, sd) {
3886 if (!(sd->flags & SD_LOAD_BALANCE)) 3945 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3905,7 +3964,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3905 } 3964 }
3906 3965
3907 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3966 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3908 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3967 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3909 /* 3968 /*
3910 * We've pulled tasks over so either we're no 3969 * We've pulled tasks over so either we're no
3911 * longer idle, or one of our SMT siblings is 3970 * longer idle, or one of our SMT siblings is
@@ -3939,6 +3998,8 @@ out:
3939 */ 3998 */
3940 if (likely(update_next_balance)) 3999 if (likely(update_next_balance))
3941 rq->next_balance = next_balance; 4000 rq->next_balance = next_balance;
4001
4002 free_cpumask_var(tmp);
3942} 4003}
3943 4004
3944/* 4005/*
@@ -3963,12 +4024,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3963 */ 4024 */
3964 if (this_rq->idle_at_tick && 4025 if (this_rq->idle_at_tick &&
3965 atomic_read(&nohz.load_balancer) == this_cpu) { 4026 atomic_read(&nohz.load_balancer) == this_cpu) {
3966 cpumask_t cpus = nohz.cpu_mask;
3967 struct rq *rq; 4027 struct rq *rq;
3968 int balance_cpu; 4028 int balance_cpu;
3969 4029
3970 cpu_clear(this_cpu, cpus); 4030 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3971 for_each_cpu_mask_nr(balance_cpu, cpus) { 4031 if (balance_cpu == this_cpu)
4032 continue;
4033
3972 /* 4034 /*
3973 * If this cpu gets work to do, stop the load balancing 4035 * If this cpu gets work to do, stop the load balancing
3974 * work being done for other cpus. Next load 4036 * work being done for other cpus. Next load
@@ -4006,7 +4068,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4006 rq->in_nohz_recently = 0; 4068 rq->in_nohz_recently = 0;
4007 4069
4008 if (atomic_read(&nohz.load_balancer) == cpu) { 4070 if (atomic_read(&nohz.load_balancer) == cpu) {
4009 cpu_clear(cpu, nohz.cpu_mask); 4071 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4010 atomic_set(&nohz.load_balancer, -1); 4072 atomic_set(&nohz.load_balancer, -1);
4011 } 4073 }
4012 4074
@@ -4019,7 +4081,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4019 * TBD: Traverse the sched domains and nominate 4081 * TBD: Traverse the sched domains and nominate
4020 * the nearest cpu in the nohz.cpu_mask. 4082 * the nearest cpu in the nohz.cpu_mask.
4021 */ 4083 */
4022 int ilb = first_cpu(nohz.cpu_mask); 4084 int ilb = cpumask_first(nohz.cpu_mask);
4023 4085
4024 if (ilb < nr_cpu_ids) 4086 if (ilb < nr_cpu_ids)
4025 resched_cpu(ilb); 4087 resched_cpu(ilb);
@@ -4031,7 +4093,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4031 * cpus with ticks stopped, is it time for that to stop? 4093 * cpus with ticks stopped, is it time for that to stop?
4032 */ 4094 */
4033 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4095 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4034 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4096 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4035 resched_cpu(cpu); 4097 resched_cpu(cpu);
4036 return; 4098 return;
4037 } 4099 }
@@ -4041,7 +4103,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4041 * someone else, then no need raise the SCHED_SOFTIRQ 4103 * someone else, then no need raise the SCHED_SOFTIRQ
4042 */ 4104 */
4043 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4105 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4044 cpu_isset(cpu, nohz.cpu_mask)) 4106 cpumask_test_cpu(cpu, nohz.cpu_mask))
4045 return; 4107 return;
4046#endif 4108#endif
4047 if (time_after_eq(jiffies, rq->next_balance)) 4109 if (time_after_eq(jiffies, rq->next_balance))
@@ -4093,13 +4155,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
4093 * Account user cpu time to a process. 4155 * Account user cpu time to a process.
4094 * @p: the process that the cpu time gets accounted to 4156 * @p: the process that the cpu time gets accounted to
4095 * @cputime: the cpu time spent in user space since the last update 4157 * @cputime: the cpu time spent in user space since the last update
4158 * @cputime_scaled: cputime scaled by cpu frequency
4096 */ 4159 */
4097void account_user_time(struct task_struct *p, cputime_t cputime) 4160void account_user_time(struct task_struct *p, cputime_t cputime,
4161 cputime_t cputime_scaled)
4098{ 4162{
4099 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4163 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4100 cputime64_t tmp; 4164 cputime64_t tmp;
4101 4165
4166 /* Add user time to process. */
4102 p->utime = cputime_add(p->utime, cputime); 4167 p->utime = cputime_add(p->utime, cputime);
4168 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4103 account_group_user_time(p, cputime); 4169 account_group_user_time(p, cputime);
4104 4170
4105 /* Add user time to cpustat. */ 4171 /* Add user time to cpustat. */
@@ -4116,51 +4182,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4116 * Account guest cpu time to a process. 4182 * Account guest cpu time to a process.
4117 * @p: the process that the cpu time gets accounted to 4183 * @p: the process that the cpu time gets accounted to
4118 * @cputime: the cpu time spent in virtual machine since the last update 4184 * @cputime: the cpu time spent in virtual machine since the last update
4185 * @cputime_scaled: cputime scaled by cpu frequency
4119 */ 4186 */
4120static void account_guest_time(struct task_struct *p, cputime_t cputime) 4187static void account_guest_time(struct task_struct *p, cputime_t cputime,
4188 cputime_t cputime_scaled)
4121{ 4189{
4122 cputime64_t tmp; 4190 cputime64_t tmp;
4123 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4191 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4124 4192
4125 tmp = cputime_to_cputime64(cputime); 4193 tmp = cputime_to_cputime64(cputime);
4126 4194
4195 /* Add guest time to process. */
4127 p->utime = cputime_add(p->utime, cputime); 4196 p->utime = cputime_add(p->utime, cputime);
4197 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4128 account_group_user_time(p, cputime); 4198 account_group_user_time(p, cputime);
4129 p->gtime = cputime_add(p->gtime, cputime); 4199 p->gtime = cputime_add(p->gtime, cputime);
4130 4200
4201 /* Add guest time to cpustat. */
4131 cpustat->user = cputime64_add(cpustat->user, tmp); 4202 cpustat->user = cputime64_add(cpustat->user, tmp);
4132 cpustat->guest = cputime64_add(cpustat->guest, tmp); 4203 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4133} 4204}
4134 4205
4135/* 4206/*
4136 * Account scaled user cpu time to a process.
4137 * @p: the process that the cpu time gets accounted to
4138 * @cputime: the cpu time spent in user space since the last update
4139 */
4140void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4141{
4142 p->utimescaled = cputime_add(p->utimescaled, cputime);
4143}
4144
4145/*
4146 * Account system cpu time to a process. 4207 * Account system cpu time to a process.
4147 * @p: the process that the cpu time gets accounted to 4208 * @p: the process that the cpu time gets accounted to
4148 * @hardirq_offset: the offset to subtract from hardirq_count() 4209 * @hardirq_offset: the offset to subtract from hardirq_count()
4149 * @cputime: the cpu time spent in kernel space since the last update 4210 * @cputime: the cpu time spent in kernel space since the last update
4211 * @cputime_scaled: cputime scaled by cpu frequency
4150 */ 4212 */
4151void account_system_time(struct task_struct *p, int hardirq_offset, 4213void account_system_time(struct task_struct *p, int hardirq_offset,
4152 cputime_t cputime) 4214 cputime_t cputime, cputime_t cputime_scaled)
4153{ 4215{
4154 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4216 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4155 struct rq *rq = this_rq();
4156 cputime64_t tmp; 4217 cputime64_t tmp;
4157 4218
4158 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 4219 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4159 account_guest_time(p, cputime); 4220 account_guest_time(p, cputime, cputime_scaled);
4160 return; 4221 return;
4161 } 4222 }
4162 4223
4224 /* Add system time to process. */
4163 p->stime = cputime_add(p->stime, cputime); 4225 p->stime = cputime_add(p->stime, cputime);
4226 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4164 account_group_system_time(p, cputime); 4227 account_group_system_time(p, cputime);
4165 4228
4166 /* Add system time to cpustat. */ 4229 /* Add system time to cpustat. */
@@ -4169,50 +4232,85 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4169 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4232 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4170 else if (softirq_count()) 4233 else if (softirq_count())
4171 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4234 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4172 else if (p != rq->idle)
4173 cpustat->system = cputime64_add(cpustat->system, tmp);
4174 else if (atomic_read(&rq->nr_iowait) > 0)
4175 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4176 else 4235 else
4177 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4236 cpustat->system = cputime64_add(cpustat->system, tmp);
4237
4178 /* Account for system time used */ 4238 /* Account for system time used */
4179 acct_update_integrals(p); 4239 acct_update_integrals(p);
4180} 4240}
4181 4241
4182/* 4242/*
4183 * Account scaled system cpu time to a process. 4243 * Account for involuntary wait time.
4184 * @p: the process that the cpu time gets accounted to 4244 * @steal: the cpu time spent in involuntary wait
4185 * @hardirq_offset: the offset to subtract from hardirq_count()
4186 * @cputime: the cpu time spent in kernel space since the last update
4187 */ 4245 */
4188void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 4246void account_steal_time(cputime_t cputime)
4189{ 4247{
4190 p->stimescaled = cputime_add(p->stimescaled, cputime); 4248 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4249 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4250
4251 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4191} 4252}
4192 4253
4193/* 4254/*
4194 * Account for involuntary wait time. 4255 * Account for idle time.
4195 * @p: the process from which the cpu time has been stolen 4256 * @cputime: the cpu time spent in idle wait
4196 * @steal: the cpu time spent in involuntary wait
4197 */ 4257 */
4198void account_steal_time(struct task_struct *p, cputime_t steal) 4258void account_idle_time(cputime_t cputime)
4199{ 4259{
4200 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4260 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4201 cputime64_t tmp = cputime_to_cputime64(steal); 4261 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4202 struct rq *rq = this_rq(); 4262 struct rq *rq = this_rq();
4203 4263
4204 if (p == rq->idle) { 4264 if (atomic_read(&rq->nr_iowait) > 0)
4205 p->stime = cputime_add(p->stime, steal); 4265 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4206 account_group_system_time(p, steal); 4266 else
4207 if (atomic_read(&rq->nr_iowait) > 0) 4267 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4208 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4268}
4209 else 4269
4210 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4270#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4211 } else 4271
4212 cpustat->steal = cputime64_add(cpustat->steal, tmp); 4272/*
4273 * Account a single tick of cpu time.
4274 * @p: the process that the cpu time gets accounted to
4275 * @user_tick: indicates if the tick is a user or a system tick
4276 */
4277void account_process_tick(struct task_struct *p, int user_tick)
4278{
4279 cputime_t one_jiffy = jiffies_to_cputime(1);
4280 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4281 struct rq *rq = this_rq();
4282
4283 if (user_tick)
4284 account_user_time(p, one_jiffy, one_jiffy_scaled);
4285 else if (p != rq->idle)
4286 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4287 one_jiffy_scaled);
4288 else
4289 account_idle_time(one_jiffy);
4290}
4291
4292/*
4293 * Account multiple ticks of steal time.
4294 * @p: the process from which the cpu time has been stolen
4295 * @ticks: number of stolen ticks
4296 */
4297void account_steal_ticks(unsigned long ticks)
4298{
4299 account_steal_time(jiffies_to_cputime(ticks));
4213} 4300}
4214 4301
4215/* 4302/*
4303 * Account multiple ticks of idle time.
4304 * @ticks: number of stolen ticks
4305 */
4306void account_idle_ticks(unsigned long ticks)
4307{
4308 account_idle_time(jiffies_to_cputime(ticks));
4309}
4310
4311#endif
4312
4313/*
4216 * Use precise platform statistics if available: 4314 * Use precise platform statistics if available:
4217 */ 4315 */
4218#ifdef CONFIG_VIRT_CPU_ACCOUNTING 4316#ifdef CONFIG_VIRT_CPU_ACCOUNTING
@@ -4339,7 +4437,7 @@ void __kprobes sub_preempt_count(int val)
4339 /* 4437 /*
4340 * Underflow? 4438 * Underflow?
4341 */ 4439 */
4342 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4440 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4343 return; 4441 return;
4344 /* 4442 /*
4345 * Is the spinlock portion underflowing? 4443 * Is the spinlock portion underflowing?
@@ -5134,6 +5232,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5134 set_load_weight(p); 5232 set_load_weight(p);
5135} 5233}
5136 5234
5235/*
5236 * check the target process has a UID that matches the current process's
5237 */
5238static bool check_same_owner(struct task_struct *p)
5239{
5240 const struct cred *cred = current_cred(), *pcred;
5241 bool match;
5242
5243 rcu_read_lock();
5244 pcred = __task_cred(p);
5245 match = (cred->euid == pcred->euid ||
5246 cred->euid == pcred->uid);
5247 rcu_read_unlock();
5248 return match;
5249}
5250
5137static int __sched_setscheduler(struct task_struct *p, int policy, 5251static int __sched_setscheduler(struct task_struct *p, int policy,
5138 struct sched_param *param, bool user) 5252 struct sched_param *param, bool user)
5139{ 5253{
@@ -5193,8 +5307,7 @@ recheck:
5193 return -EPERM; 5307 return -EPERM;
5194 5308
5195 /* can't change other user's priorities */ 5309 /* can't change other user's priorities */
5196 if ((current->euid != p->euid) && 5310 if (!check_same_owner(p))
5197 (current->euid != p->uid))
5198 return -EPERM; 5311 return -EPERM;
5199 } 5312 }
5200 5313
@@ -5400,10 +5513,9 @@ out_unlock:
5400 return retval; 5513 return retval;
5401} 5514}
5402 5515
5403long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5516long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5404{ 5517{
5405 cpumask_t cpus_allowed; 5518 cpumask_var_t cpus_allowed, new_mask;
5406 cpumask_t new_mask = *in_mask;
5407 struct task_struct *p; 5519 struct task_struct *p;
5408 int retval; 5520 int retval;
5409 5521
@@ -5425,46 +5537,57 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5425 get_task_struct(p); 5537 get_task_struct(p);
5426 read_unlock(&tasklist_lock); 5538 read_unlock(&tasklist_lock);
5427 5539
5540 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5541 retval = -ENOMEM;
5542 goto out_put_task;
5543 }
5544 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5545 retval = -ENOMEM;
5546 goto out_free_cpus_allowed;
5547 }
5428 retval = -EPERM; 5548 retval = -EPERM;
5429 if ((current->euid != p->euid) && (current->euid != p->uid) && 5549 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5430 !capable(CAP_SYS_NICE))
5431 goto out_unlock; 5550 goto out_unlock;
5432 5551
5433 retval = security_task_setscheduler(p, 0, NULL); 5552 retval = security_task_setscheduler(p, 0, NULL);
5434 if (retval) 5553 if (retval)
5435 goto out_unlock; 5554 goto out_unlock;
5436 5555
5437 cpuset_cpus_allowed(p, &cpus_allowed); 5556 cpuset_cpus_allowed(p, cpus_allowed);
5438 cpus_and(new_mask, new_mask, cpus_allowed); 5557 cpumask_and(new_mask, in_mask, cpus_allowed);
5439 again: 5558 again:
5440 retval = set_cpus_allowed_ptr(p, &new_mask); 5559 retval = set_cpus_allowed_ptr(p, new_mask);
5441 5560
5442 if (!retval) { 5561 if (!retval) {
5443 cpuset_cpus_allowed(p, &cpus_allowed); 5562 cpuset_cpus_allowed(p, cpus_allowed);
5444 if (!cpus_subset(new_mask, cpus_allowed)) { 5563 if (!cpumask_subset(new_mask, cpus_allowed)) {
5445 /* 5564 /*
5446 * We must have raced with a concurrent cpuset 5565 * We must have raced with a concurrent cpuset
5447 * update. Just reset the cpus_allowed to the 5566 * update. Just reset the cpus_allowed to the
5448 * cpuset's cpus_allowed 5567 * cpuset's cpus_allowed
5449 */ 5568 */
5450 new_mask = cpus_allowed; 5569 cpumask_copy(new_mask, cpus_allowed);
5451 goto again; 5570 goto again;
5452 } 5571 }
5453 } 5572 }
5454out_unlock: 5573out_unlock:
5574 free_cpumask_var(new_mask);
5575out_free_cpus_allowed:
5576 free_cpumask_var(cpus_allowed);
5577out_put_task:
5455 put_task_struct(p); 5578 put_task_struct(p);
5456 put_online_cpus(); 5579 put_online_cpus();
5457 return retval; 5580 return retval;
5458} 5581}
5459 5582
5460static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5583static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5461 cpumask_t *new_mask) 5584 struct cpumask *new_mask)
5462{ 5585{
5463 if (len < sizeof(cpumask_t)) { 5586 if (len < cpumask_size())
5464 memset(new_mask, 0, sizeof(cpumask_t)); 5587 cpumask_clear(new_mask);
5465 } else if (len > sizeof(cpumask_t)) { 5588 else if (len > cpumask_size())
5466 len = sizeof(cpumask_t); 5589 len = cpumask_size();
5467 } 5590
5468 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5591 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5469} 5592}
5470 5593
@@ -5477,17 +5600,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5477asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5600asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5478 unsigned long __user *user_mask_ptr) 5601 unsigned long __user *user_mask_ptr)
5479{ 5602{
5480 cpumask_t new_mask; 5603 cpumask_var_t new_mask;
5481 int retval; 5604 int retval;
5482 5605
5483 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5606 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5484 if (retval) 5607 return -ENOMEM;
5485 return retval;
5486 5608
5487 return sched_setaffinity(pid, &new_mask); 5609 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5610 if (retval == 0)
5611 retval = sched_setaffinity(pid, new_mask);
5612 free_cpumask_var(new_mask);
5613 return retval;
5488} 5614}
5489 5615
5490long sched_getaffinity(pid_t pid, cpumask_t *mask) 5616long sched_getaffinity(pid_t pid, struct cpumask *mask)
5491{ 5617{
5492 struct task_struct *p; 5618 struct task_struct *p;
5493 int retval; 5619 int retval;
@@ -5504,7 +5630,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5504 if (retval) 5630 if (retval)
5505 goto out_unlock; 5631 goto out_unlock;
5506 5632
5507 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5633 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5508 5634
5509out_unlock: 5635out_unlock:
5510 read_unlock(&tasklist_lock); 5636 read_unlock(&tasklist_lock);
@@ -5523,19 +5649,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5523 unsigned long __user *user_mask_ptr) 5649 unsigned long __user *user_mask_ptr)
5524{ 5650{
5525 int ret; 5651 int ret;
5526 cpumask_t mask; 5652 cpumask_var_t mask;
5527 5653
5528 if (len < sizeof(cpumask_t)) 5654 if (len < cpumask_size())
5529 return -EINVAL; 5655 return -EINVAL;
5530 5656
5531 ret = sched_getaffinity(pid, &mask); 5657 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5532 if (ret < 0) 5658 return -ENOMEM;
5533 return ret;
5534 5659
5535 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5660 ret = sched_getaffinity(pid, mask);
5536 return -EFAULT; 5661 if (ret == 0) {
5662 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5663 ret = -EFAULT;
5664 else
5665 ret = cpumask_size();
5666 }
5667 free_cpumask_var(mask);
5537 5668
5538 return sizeof(cpumask_t); 5669 return ret;
5539} 5670}
5540 5671
5541/** 5672/**
@@ -5877,7 +6008,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5877 idle->se.exec_start = sched_clock(); 6008 idle->se.exec_start = sched_clock();
5878 6009
5879 idle->prio = idle->normal_prio = MAX_PRIO; 6010 idle->prio = idle->normal_prio = MAX_PRIO;
5880 idle->cpus_allowed = cpumask_of_cpu(cpu); 6011 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5881 __set_task_cpu(idle, cpu); 6012 __set_task_cpu(idle, cpu);
5882 6013
5883 rq->curr = rq->idle = idle; 6014 rq->curr = rq->idle = idle;
@@ -5896,6 +6027,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5896 * The idle tasks have their own, simple scheduling class: 6027 * The idle tasks have their own, simple scheduling class:
5897 */ 6028 */
5898 idle->sched_class = &idle_sched_class; 6029 idle->sched_class = &idle_sched_class;
6030 ftrace_graph_init_task(idle);
5899} 6031}
5900 6032
5901/* 6033/*
@@ -5903,9 +6035,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5903 * indicates which cpus entered this state. This is used 6035 * indicates which cpus entered this state. This is used
5904 * in the rcu update to wait only for active cpus. For system 6036 * in the rcu update to wait only for active cpus. For system
5905 * which do not switch off the HZ timer nohz_cpu_mask should 6037 * which do not switch off the HZ timer nohz_cpu_mask should
5906 * always be CPU_MASK_NONE. 6038 * always be CPU_BITS_NONE.
5907 */ 6039 */
5908cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 6040cpumask_var_t nohz_cpu_mask;
5909 6041
5910/* 6042/*
5911 * Increase the granularity value when there are more CPUs, 6043 * Increase the granularity value when there are more CPUs,
@@ -5960,7 +6092,7 @@ static inline void sched_init_granularity(void)
5960 * task must not exit() & deallocate itself prematurely. The 6092 * task must not exit() & deallocate itself prematurely. The
5961 * call is not atomic; no spinlocks may be held. 6093 * call is not atomic; no spinlocks may be held.
5962 */ 6094 */
5963int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 6095int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5964{ 6096{
5965 struct migration_req req; 6097 struct migration_req req;
5966 unsigned long flags; 6098 unsigned long flags;
@@ -5968,13 +6100,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5968 int ret = 0; 6100 int ret = 0;
5969 6101
5970 rq = task_rq_lock(p, &flags); 6102 rq = task_rq_lock(p, &flags);
5971 if (!cpus_intersects(*new_mask, cpu_online_map)) { 6103 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
5972 ret = -EINVAL; 6104 ret = -EINVAL;
5973 goto out; 6105 goto out;
5974 } 6106 }
5975 6107
5976 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6108 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5977 !cpus_equal(p->cpus_allowed, *new_mask))) { 6109 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5978 ret = -EINVAL; 6110 ret = -EINVAL;
5979 goto out; 6111 goto out;
5980 } 6112 }
@@ -5982,15 +6114,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5982 if (p->sched_class->set_cpus_allowed) 6114 if (p->sched_class->set_cpus_allowed)
5983 p->sched_class->set_cpus_allowed(p, new_mask); 6115 p->sched_class->set_cpus_allowed(p, new_mask);
5984 else { 6116 else {
5985 p->cpus_allowed = *new_mask; 6117 cpumask_copy(&p->cpus_allowed, new_mask);
5986 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 6118 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5987 } 6119 }
5988 6120
5989 /* Can the task run on the task's current CPU? If so, we're done */ 6121 /* Can the task run on the task's current CPU? If so, we're done */
5990 if (cpu_isset(task_cpu(p), *new_mask)) 6122 if (cpumask_test_cpu(task_cpu(p), new_mask))
5991 goto out; 6123 goto out;
5992 6124
5993 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6125 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
5994 /* Need help from migration thread: drop lock and wait. */ 6126 /* Need help from migration thread: drop lock and wait. */
5995 task_rq_unlock(rq, &flags); 6127 task_rq_unlock(rq, &flags);
5996 wake_up_process(rq->migration_thread); 6128 wake_up_process(rq->migration_thread);
@@ -6032,7 +6164,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6032 if (task_cpu(p) != src_cpu) 6164 if (task_cpu(p) != src_cpu)
6033 goto done; 6165 goto done;
6034 /* Affinity changed (again). */ 6166 /* Affinity changed (again). */
6035 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6167 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6036 goto fail; 6168 goto fail;
6037 6169
6038 on_rq = p->se.on_rq; 6170 on_rq = p->se.on_rq;
@@ -6126,54 +6258,44 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6126 6258
6127/* 6259/*
6128 * Figure out where task on dead CPU should go, use force if necessary. 6260 * Figure out where task on dead CPU should go, use force if necessary.
6129 * NOTE: interrupts should be disabled by the caller
6130 */ 6261 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6262static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6263{
6133 unsigned long flags;
6134 cpumask_t mask;
6135 struct rq *rq;
6136 int dest_cpu; 6264 int dest_cpu;
6265 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
6137 6266
6138 do { 6267again:
6139 /* On same node? */ 6268 /* Look for allowed, online CPU in same node. */
6140 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6269 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6141 cpus_and(mask, mask, p->cpus_allowed); 6270 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6142 dest_cpu = any_online_cpu(mask); 6271 goto move;
6143 6272
6144 /* On any allowed CPU? */ 6273 /* Any allowed, online CPU? */
6145 if (dest_cpu >= nr_cpu_ids) 6274 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6146 dest_cpu = any_online_cpu(p->cpus_allowed); 6275 if (dest_cpu < nr_cpu_ids)
6276 goto move;
6147 6277
6148 /* No more Mr. Nice Guy. */ 6278 /* No more Mr. Nice Guy. */
6149 if (dest_cpu >= nr_cpu_ids) { 6279 if (dest_cpu >= nr_cpu_ids) {
6150 cpumask_t cpus_allowed; 6280 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6281 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6151 6282
6152 cpuset_cpus_allowed_locked(p, &cpus_allowed); 6283 /*
6153 /* 6284 * Don't tell them about moving exiting tasks or
6154 * Try to stay on the same cpuset, where the 6285 * kernel threads (both mm NULL), since they never
6155 * current cpuset may be a subset of all cpus. 6286 * leave kernel.
6156 * The cpuset_cpus_allowed_locked() variant of 6287 */
6157 * cpuset_cpus_allowed() will not block. It must be 6288 if (p->mm && printk_ratelimit()) {
6158 * called within calls to cpuset_lock/cpuset_unlock. 6289 printk(KERN_INFO "process %d (%s) no "
6159 */ 6290 "longer affine to cpu%d\n",
6160 rq = task_rq_lock(p, &flags); 6291 task_pid_nr(p), p->comm, dead_cpu);
6161 p->cpus_allowed = cpus_allowed;
6162 dest_cpu = any_online_cpu(p->cpus_allowed);
6163 task_rq_unlock(rq, &flags);
6164
6165 /*
6166 * Don't tell them about moving exiting tasks or
6167 * kernel threads (both mm NULL), since they never
6168 * leave kernel.
6169 */
6170 if (p->mm && printk_ratelimit()) {
6171 printk(KERN_INFO "process %d (%s) no "
6172 "longer affine to cpu%d\n",
6173 task_pid_nr(p), p->comm, dead_cpu);
6174 }
6175 } 6292 }
6176 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6293 }
6294
6295move:
6296 /* It can have affinity changed while we were choosing. */
6297 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6298 goto again;
6177} 6299}
6178 6300
6179/* 6301/*
@@ -6185,7 +6307,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6185 */ 6307 */
6186static void migrate_nr_uninterruptible(struct rq *rq_src) 6308static void migrate_nr_uninterruptible(struct rq *rq_src)
6187{ 6309{
6188 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6310 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6189 unsigned long flags; 6311 unsigned long flags;
6190 6312
6191 local_irq_save(flags); 6313 local_irq_save(flags);
@@ -6475,7 +6597,7 @@ static void set_rq_online(struct rq *rq)
6475 if (!rq->online) { 6597 if (!rq->online) {
6476 const struct sched_class *class; 6598 const struct sched_class *class;
6477 6599
6478 cpu_set(rq->cpu, rq->rd->online); 6600 cpumask_set_cpu(rq->cpu, rq->rd->online);
6479 rq->online = 1; 6601 rq->online = 1;
6480 6602
6481 for_each_class(class) { 6603 for_each_class(class) {
@@ -6495,7 +6617,7 @@ static void set_rq_offline(struct rq *rq)
6495 class->rq_offline(rq); 6617 class->rq_offline(rq);
6496 } 6618 }
6497 6619
6498 cpu_clear(rq->cpu, rq->rd->online); 6620 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6499 rq->online = 0; 6621 rq->online = 0;
6500 } 6622 }
6501} 6623}
@@ -6536,7 +6658,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6536 rq = cpu_rq(cpu); 6658 rq = cpu_rq(cpu);
6537 spin_lock_irqsave(&rq->lock, flags); 6659 spin_lock_irqsave(&rq->lock, flags);
6538 if (rq->rd) { 6660 if (rq->rd) {
6539 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6661 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6540 6662
6541 set_rq_online(rq); 6663 set_rq_online(rq);
6542 } 6664 }
@@ -6550,7 +6672,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6550 break; 6672 break;
6551 /* Unbind it from offline cpu so it can run. Fall thru. */ 6673 /* Unbind it from offline cpu so it can run. Fall thru. */
6552 kthread_bind(cpu_rq(cpu)->migration_thread, 6674 kthread_bind(cpu_rq(cpu)->migration_thread,
6553 any_online_cpu(cpu_online_map)); 6675 cpumask_any(cpu_online_mask));
6554 kthread_stop(cpu_rq(cpu)->migration_thread); 6676 kthread_stop(cpu_rq(cpu)->migration_thread);
6555 cpu_rq(cpu)->migration_thread = NULL; 6677 cpu_rq(cpu)->migration_thread = NULL;
6556 break; 6678 break;
@@ -6600,7 +6722,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6600 rq = cpu_rq(cpu); 6722 rq = cpu_rq(cpu);
6601 spin_lock_irqsave(&rq->lock, flags); 6723 spin_lock_irqsave(&rq->lock, flags);
6602 if (rq->rd) { 6724 if (rq->rd) {
6603 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6725 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6604 set_rq_offline(rq); 6726 set_rq_offline(rq);
6605 } 6727 }
6606 spin_unlock_irqrestore(&rq->lock, flags); 6728 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6638,36 +6760,14 @@ early_initcall(migration_init);
6638 6760
6639#ifdef CONFIG_SCHED_DEBUG 6761#ifdef CONFIG_SCHED_DEBUG
6640 6762
6641static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6642{
6643 switch (lvl) {
6644 case SD_LV_NONE:
6645 return "NONE";
6646 case SD_LV_SIBLING:
6647 return "SIBLING";
6648 case SD_LV_MC:
6649 return "MC";
6650 case SD_LV_CPU:
6651 return "CPU";
6652 case SD_LV_NODE:
6653 return "NODE";
6654 case SD_LV_ALLNODES:
6655 return "ALLNODES";
6656 case SD_LV_MAX:
6657 return "MAX";
6658
6659 }
6660 return "MAX";
6661}
6662
6663static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6763static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6664 cpumask_t *groupmask) 6764 struct cpumask *groupmask)
6665{ 6765{
6666 struct sched_group *group = sd->groups; 6766 struct sched_group *group = sd->groups;
6667 char str[256]; 6767 char str[256];
6668 6768
6669 cpulist_scnprintf(str, sizeof(str), sd->span); 6769 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6670 cpus_clear(*groupmask); 6770 cpumask_clear(groupmask);
6671 6771
6672 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6772 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6673 6773
@@ -6679,14 +6779,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6679 return -1; 6779 return -1;
6680 } 6780 }
6681 6781
6682 printk(KERN_CONT "span %s level %s\n", 6782 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6683 str, sd_level_to_string(sd->level));
6684 6783
6685 if (!cpu_isset(cpu, sd->span)) { 6784 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6686 printk(KERN_ERR "ERROR: domain->span does not contain " 6785 printk(KERN_ERR "ERROR: domain->span does not contain "
6687 "CPU%d\n", cpu); 6786 "CPU%d\n", cpu);
6688 } 6787 }
6689 if (!cpu_isset(cpu, group->cpumask)) { 6788 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6690 printk(KERN_ERR "ERROR: domain->groups does not contain" 6789 printk(KERN_ERR "ERROR: domain->groups does not contain"
6691 " CPU%d\n", cpu); 6790 " CPU%d\n", cpu);
6692 } 6791 }
@@ -6706,31 +6805,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6706 break; 6805 break;
6707 } 6806 }
6708 6807
6709 if (!cpus_weight(group->cpumask)) { 6808 if (!cpumask_weight(sched_group_cpus(group))) {
6710 printk(KERN_CONT "\n"); 6809 printk(KERN_CONT "\n");
6711 printk(KERN_ERR "ERROR: empty group\n"); 6810 printk(KERN_ERR "ERROR: empty group\n");
6712 break; 6811 break;
6713 } 6812 }
6714 6813
6715 if (cpus_intersects(*groupmask, group->cpumask)) { 6814 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6716 printk(KERN_CONT "\n"); 6815 printk(KERN_CONT "\n");
6717 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6816 printk(KERN_ERR "ERROR: repeated CPUs\n");
6718 break; 6817 break;
6719 } 6818 }
6720 6819
6721 cpus_or(*groupmask, *groupmask, group->cpumask); 6820 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6722 6821
6723 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6822 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6724 printk(KERN_CONT " %s", str); 6823 printk(KERN_CONT " %s", str);
6725 6824
6726 group = group->next; 6825 group = group->next;
6727 } while (group != sd->groups); 6826 } while (group != sd->groups);
6728 printk(KERN_CONT "\n"); 6827 printk(KERN_CONT "\n");
6729 6828
6730 if (!cpus_equal(sd->span, *groupmask)) 6829 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6731 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6830 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6732 6831
6733 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6832 if (sd->parent &&
6833 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6734 printk(KERN_ERR "ERROR: parent span is not a superset " 6834 printk(KERN_ERR "ERROR: parent span is not a superset "
6735 "of domain->span\n"); 6835 "of domain->span\n");
6736 return 0; 6836 return 0;
@@ -6738,7 +6838,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6738 6838
6739static void sched_domain_debug(struct sched_domain *sd, int cpu) 6839static void sched_domain_debug(struct sched_domain *sd, int cpu)
6740{ 6840{
6741 cpumask_t *groupmask; 6841 cpumask_var_t groupmask;
6742 int level = 0; 6842 int level = 0;
6743 6843
6744 if (!sd) { 6844 if (!sd) {
@@ -6748,8 +6848,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6748 6848
6749 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6849 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6750 6850
6751 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6851 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6752 if (!groupmask) {
6753 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6852 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6754 return; 6853 return;
6755 } 6854 }
@@ -6762,7 +6861,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6762 if (!sd) 6861 if (!sd)
6763 break; 6862 break;
6764 } 6863 }
6765 kfree(groupmask); 6864 free_cpumask_var(groupmask);
6766} 6865}
6767#else /* !CONFIG_SCHED_DEBUG */ 6866#else /* !CONFIG_SCHED_DEBUG */
6768# define sched_domain_debug(sd, cpu) do { } while (0) 6867# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6770,7 +6869,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6770 6869
6771static int sd_degenerate(struct sched_domain *sd) 6870static int sd_degenerate(struct sched_domain *sd)
6772{ 6871{
6773 if (cpus_weight(sd->span) == 1) 6872 if (cpumask_weight(sched_domain_span(sd)) == 1)
6774 return 1; 6873 return 1;
6775 6874
6776 /* Following flags need at least 2 groups */ 6875 /* Following flags need at least 2 groups */
@@ -6801,7 +6900,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6801 if (sd_degenerate(parent)) 6900 if (sd_degenerate(parent))
6802 return 1; 6901 return 1;
6803 6902
6804 if (!cpus_equal(sd->span, parent->span)) 6903 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6805 return 0; 6904 return 0;
6806 6905
6807 /* Does parent contain flags not in child? */ 6906 /* Does parent contain flags not in child? */
@@ -6816,6 +6915,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6816 SD_BALANCE_EXEC | 6915 SD_BALANCE_EXEC |
6817 SD_SHARE_CPUPOWER | 6916 SD_SHARE_CPUPOWER |
6818 SD_SHARE_PKG_RESOURCES); 6917 SD_SHARE_PKG_RESOURCES);
6918 if (nr_node_ids == 1)
6919 pflags &= ~SD_SERIALIZE;
6819 } 6920 }
6820 if (~cflags & pflags) 6921 if (~cflags & pflags)
6821 return 0; 6922 return 0;
@@ -6823,6 +6924,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6823 return 1; 6924 return 1;
6824} 6925}
6825 6926
6927static void free_rootdomain(struct root_domain *rd)
6928{
6929 cpupri_cleanup(&rd->cpupri);
6930
6931 free_cpumask_var(rd->rto_mask);
6932 free_cpumask_var(rd->online);
6933 free_cpumask_var(rd->span);
6934 kfree(rd);
6935}
6936
6826static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6937static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6827{ 6938{
6828 unsigned long flags; 6939 unsigned long flags;
@@ -6832,38 +6943,62 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6832 if (rq->rd) { 6943 if (rq->rd) {
6833 struct root_domain *old_rd = rq->rd; 6944 struct root_domain *old_rd = rq->rd;
6834 6945
6835 if (cpu_isset(rq->cpu, old_rd->online)) 6946 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6836 set_rq_offline(rq); 6947 set_rq_offline(rq);
6837 6948
6838 cpu_clear(rq->cpu, old_rd->span); 6949 cpumask_clear_cpu(rq->cpu, old_rd->span);
6839 6950
6840 if (atomic_dec_and_test(&old_rd->refcount)) 6951 if (atomic_dec_and_test(&old_rd->refcount))
6841 kfree(old_rd); 6952 free_rootdomain(old_rd);
6842 } 6953 }
6843 6954
6844 atomic_inc(&rd->refcount); 6955 atomic_inc(&rd->refcount);
6845 rq->rd = rd; 6956 rq->rd = rd;
6846 6957
6847 cpu_set(rq->cpu, rd->span); 6958 cpumask_set_cpu(rq->cpu, rd->span);
6848 if (cpu_isset(rq->cpu, cpu_online_map)) 6959 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6849 set_rq_online(rq); 6960 set_rq_online(rq);
6850 6961
6851 spin_unlock_irqrestore(&rq->lock, flags); 6962 spin_unlock_irqrestore(&rq->lock, flags);
6852} 6963}
6853 6964
6854static void init_rootdomain(struct root_domain *rd) 6965static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
6855{ 6966{
6856 memset(rd, 0, sizeof(*rd)); 6967 memset(rd, 0, sizeof(*rd));
6857 6968
6858 cpus_clear(rd->span); 6969 if (bootmem) {
6859 cpus_clear(rd->online); 6970 alloc_bootmem_cpumask_var(&def_root_domain.span);
6971 alloc_bootmem_cpumask_var(&def_root_domain.online);
6972 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
6973 cpupri_init(&rd->cpupri, true);
6974 return 0;
6975 }
6976
6977 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6978 goto out;
6979 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6980 goto free_span;
6981 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6982 goto free_online;
6860 6983
6861 cpupri_init(&rd->cpupri); 6984 if (cpupri_init(&rd->cpupri, false) != 0)
6985 goto free_rto_mask;
6986 return 0;
6987
6988free_rto_mask:
6989 free_cpumask_var(rd->rto_mask);
6990free_online:
6991 free_cpumask_var(rd->online);
6992free_span:
6993 free_cpumask_var(rd->span);
6994out:
6995 return -ENOMEM;
6862} 6996}
6863 6997
6864static void init_defrootdomain(void) 6998static void init_defrootdomain(void)
6865{ 6999{
6866 init_rootdomain(&def_root_domain); 7000 init_rootdomain(&def_root_domain, true);
7001
6867 atomic_set(&def_root_domain.refcount, 1); 7002 atomic_set(&def_root_domain.refcount, 1);
6868} 7003}
6869 7004
@@ -6875,7 +7010,10 @@ static struct root_domain *alloc_rootdomain(void)
6875 if (!rd) 7010 if (!rd)
6876 return NULL; 7011 return NULL;
6877 7012
6878 init_rootdomain(rd); 7013 if (init_rootdomain(rd, false) != 0) {
7014 kfree(rd);
7015 return NULL;
7016 }
6879 7017
6880 return rd; 7018 return rd;
6881} 7019}
@@ -6917,19 +7055,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6917} 7055}
6918 7056
6919/* cpus with isolated domains */ 7057/* cpus with isolated domains */
6920static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 7058static cpumask_var_t cpu_isolated_map;
6921 7059
6922/* Setup the mask of cpus configured for isolated domains */ 7060/* Setup the mask of cpus configured for isolated domains */
6923static int __init isolated_cpu_setup(char *str) 7061static int __init isolated_cpu_setup(char *str)
6924{ 7062{
6925 static int __initdata ints[NR_CPUS]; 7063 cpulist_parse(str, cpu_isolated_map);
6926 int i;
6927
6928 str = get_options(str, ARRAY_SIZE(ints), ints);
6929 cpus_clear(cpu_isolated_map);
6930 for (i = 1; i <= ints[0]; i++)
6931 if (ints[i] < NR_CPUS)
6932 cpu_set(ints[i], cpu_isolated_map);
6933 return 1; 7064 return 1;
6934} 7065}
6935 7066
@@ -6938,42 +7069,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6938/* 7069/*
6939 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 7070 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6940 * to a function which identifies what group(along with sched group) a CPU 7071 * to a function which identifies what group(along with sched group) a CPU
6941 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 7072 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6942 * (due to the fact that we keep track of groups covered with a cpumask_t). 7073 * (due to the fact that we keep track of groups covered with a struct cpumask).
6943 * 7074 *
6944 * init_sched_build_groups will build a circular linked list of the groups 7075 * init_sched_build_groups will build a circular linked list of the groups
6945 * covered by the given span, and will set each group's ->cpumask correctly, 7076 * covered by the given span, and will set each group's ->cpumask correctly,
6946 * and ->cpu_power to 0. 7077 * and ->cpu_power to 0.
6947 */ 7078 */
6948static void 7079static void
6949init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 7080init_sched_build_groups(const struct cpumask *span,
6950 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 7081 const struct cpumask *cpu_map,
7082 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6951 struct sched_group **sg, 7083 struct sched_group **sg,
6952 cpumask_t *tmpmask), 7084 struct cpumask *tmpmask),
6953 cpumask_t *covered, cpumask_t *tmpmask) 7085 struct cpumask *covered, struct cpumask *tmpmask)
6954{ 7086{
6955 struct sched_group *first = NULL, *last = NULL; 7087 struct sched_group *first = NULL, *last = NULL;
6956 int i; 7088 int i;
6957 7089
6958 cpus_clear(*covered); 7090 cpumask_clear(covered);
6959 7091
6960 for_each_cpu_mask_nr(i, *span) { 7092 for_each_cpu(i, span) {
6961 struct sched_group *sg; 7093 struct sched_group *sg;
6962 int group = group_fn(i, cpu_map, &sg, tmpmask); 7094 int group = group_fn(i, cpu_map, &sg, tmpmask);
6963 int j; 7095 int j;
6964 7096
6965 if (cpu_isset(i, *covered)) 7097 if (cpumask_test_cpu(i, covered))
6966 continue; 7098 continue;
6967 7099
6968 cpus_clear(sg->cpumask); 7100 cpumask_clear(sched_group_cpus(sg));
6969 sg->__cpu_power = 0; 7101 sg->__cpu_power = 0;
6970 7102
6971 for_each_cpu_mask_nr(j, *span) { 7103 for_each_cpu(j, span) {
6972 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 7104 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6973 continue; 7105 continue;
6974 7106
6975 cpu_set(j, *covered); 7107 cpumask_set_cpu(j, covered);
6976 cpu_set(j, sg->cpumask); 7108 cpumask_set_cpu(j, sched_group_cpus(sg));
6977 } 7109 }
6978 if (!first) 7110 if (!first)
6979 first = sg; 7111 first = sg;
@@ -7037,23 +7169,21 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7037 * should be one that prevents unnecessary balancing, but also spreads tasks 7169 * should be one that prevents unnecessary balancing, but also spreads tasks
7038 * out optimally. 7170 * out optimally.
7039 */ 7171 */
7040static void sched_domain_node_span(int node, cpumask_t *span) 7172static void sched_domain_node_span(int node, struct cpumask *span)
7041{ 7173{
7042 nodemask_t used_nodes; 7174 nodemask_t used_nodes;
7043 node_to_cpumask_ptr(nodemask, node);
7044 int i; 7175 int i;
7045 7176
7046 cpus_clear(*span); 7177 cpumask_clear(span);
7047 nodes_clear(used_nodes); 7178 nodes_clear(used_nodes);
7048 7179
7049 cpus_or(*span, *span, *nodemask); 7180 cpumask_or(span, span, cpumask_of_node(node));
7050 node_set(node, used_nodes); 7181 node_set(node, used_nodes);
7051 7182
7052 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7183 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7053 int next_node = find_next_best_node(node, &used_nodes); 7184 int next_node = find_next_best_node(node, &used_nodes);
7054 7185
7055 node_to_cpumask_ptr_next(nodemask, next_node); 7186 cpumask_or(span, span, cpumask_of_node(next_node));
7056 cpus_or(*span, *span, *nodemask);
7057 } 7187 }
7058} 7188}
7059#endif /* CONFIG_NUMA */ 7189#endif /* CONFIG_NUMA */
@@ -7061,18 +7191,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7061int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7191int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7062 7192
7063/* 7193/*
7194 * The cpus mask in sched_group and sched_domain hangs off the end.
7195 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7196 * for nr_cpu_ids < CONFIG_NR_CPUS.
7197 */
7198struct static_sched_group {
7199 struct sched_group sg;
7200 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7201};
7202
7203struct static_sched_domain {
7204 struct sched_domain sd;
7205 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7206};
7207
7208/*
7064 * SMT sched-domains: 7209 * SMT sched-domains:
7065 */ 7210 */
7066#ifdef CONFIG_SCHED_SMT 7211#ifdef CONFIG_SCHED_SMT
7067static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7212static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7068static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7213static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7069 7214
7070static int 7215static int
7071cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7216cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7072 cpumask_t *unused) 7217 struct sched_group **sg, struct cpumask *unused)
7073{ 7218{
7074 if (sg) 7219 if (sg)
7075 *sg = &per_cpu(sched_group_cpus, cpu); 7220 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7076 return cpu; 7221 return cpu;
7077} 7222}
7078#endif /* CONFIG_SCHED_SMT */ 7223#endif /* CONFIG_SCHED_SMT */
@@ -7081,56 +7226,53 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7081 * multi-core sched-domains: 7226 * multi-core sched-domains:
7082 */ 7227 */
7083#ifdef CONFIG_SCHED_MC 7228#ifdef CONFIG_SCHED_MC
7084static DEFINE_PER_CPU(struct sched_domain, core_domains); 7229static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7085static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7230static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7086#endif /* CONFIG_SCHED_MC */ 7231#endif /* CONFIG_SCHED_MC */
7087 7232
7088#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7233#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7089static int 7234static int
7090cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7235cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7091 cpumask_t *mask) 7236 struct sched_group **sg, struct cpumask *mask)
7092{ 7237{
7093 int group; 7238 int group;
7094 7239
7095 *mask = per_cpu(cpu_sibling_map, cpu); 7240 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7096 cpus_and(*mask, *mask, *cpu_map); 7241 group = cpumask_first(mask);
7097 group = first_cpu(*mask);
7098 if (sg) 7242 if (sg)
7099 *sg = &per_cpu(sched_group_core, group); 7243 *sg = &per_cpu(sched_group_core, group).sg;
7100 return group; 7244 return group;
7101} 7245}
7102#elif defined(CONFIG_SCHED_MC) 7246#elif defined(CONFIG_SCHED_MC)
7103static int 7247static int
7104cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7248cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7105 cpumask_t *unused) 7249 struct sched_group **sg, struct cpumask *unused)
7106{ 7250{
7107 if (sg) 7251 if (sg)
7108 *sg = &per_cpu(sched_group_core, cpu); 7252 *sg = &per_cpu(sched_group_core, cpu).sg;
7109 return cpu; 7253 return cpu;
7110} 7254}
7111#endif 7255#endif
7112 7256
7113static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7257static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7114static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7258static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7115 7259
7116static int 7260static int
7117cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7261cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7118 cpumask_t *mask) 7262 struct sched_group **sg, struct cpumask *mask)
7119{ 7263{
7120 int group; 7264 int group;
7121#ifdef CONFIG_SCHED_MC 7265#ifdef CONFIG_SCHED_MC
7122 *mask = cpu_coregroup_map(cpu); 7266 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7123 cpus_and(*mask, *mask, *cpu_map); 7267 group = cpumask_first(mask);
7124 group = first_cpu(*mask);
7125#elif defined(CONFIG_SCHED_SMT) 7268#elif defined(CONFIG_SCHED_SMT)
7126 *mask = per_cpu(cpu_sibling_map, cpu); 7269 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7127 cpus_and(*mask, *mask, *cpu_map); 7270 group = cpumask_first(mask);
7128 group = first_cpu(*mask);
7129#else 7271#else
7130 group = cpu; 7272 group = cpu;
7131#endif 7273#endif
7132 if (sg) 7274 if (sg)
7133 *sg = &per_cpu(sched_group_phys, group); 7275 *sg = &per_cpu(sched_group_phys, group).sg;
7134 return group; 7276 return group;
7135} 7277}
7136 7278
@@ -7144,19 +7286,19 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7144static struct sched_group ***sched_group_nodes_bycpu; 7286static struct sched_group ***sched_group_nodes_bycpu;
7145 7287
7146static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7288static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7147static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7289static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7148 7290
7149static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7291static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7150 struct sched_group **sg, cpumask_t *nodemask) 7292 struct sched_group **sg,
7293 struct cpumask *nodemask)
7151{ 7294{
7152 int group; 7295 int group;
7153 7296
7154 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7297 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7155 cpus_and(*nodemask, *nodemask, *cpu_map); 7298 group = cpumask_first(nodemask);
7156 group = first_cpu(*nodemask);
7157 7299
7158 if (sg) 7300 if (sg)
7159 *sg = &per_cpu(sched_group_allnodes, group); 7301 *sg = &per_cpu(sched_group_allnodes, group).sg;
7160 return group; 7302 return group;
7161} 7303}
7162 7304
@@ -7168,11 +7310,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7168 if (!sg) 7310 if (!sg)
7169 return; 7311 return;
7170 do { 7312 do {
7171 for_each_cpu_mask_nr(j, sg->cpumask) { 7313 for_each_cpu(j, sched_group_cpus(sg)) {
7172 struct sched_domain *sd; 7314 struct sched_domain *sd;
7173 7315
7174 sd = &per_cpu(phys_domains, j); 7316 sd = &per_cpu(phys_domains, j).sd;
7175 if (j != first_cpu(sd->groups->cpumask)) { 7317 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7176 /* 7318 /*
7177 * Only add "power" once for each 7319 * Only add "power" once for each
7178 * physical package. 7320 * physical package.
@@ -7189,11 +7331,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7189 7331
7190#ifdef CONFIG_NUMA 7332#ifdef CONFIG_NUMA
7191/* Free memory allocated for various sched_group structures */ 7333/* Free memory allocated for various sched_group structures */
7192static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7334static void free_sched_groups(const struct cpumask *cpu_map,
7335 struct cpumask *nodemask)
7193{ 7336{
7194 int cpu, i; 7337 int cpu, i;
7195 7338
7196 for_each_cpu_mask_nr(cpu, *cpu_map) { 7339 for_each_cpu(cpu, cpu_map) {
7197 struct sched_group **sched_group_nodes 7340 struct sched_group **sched_group_nodes
7198 = sched_group_nodes_bycpu[cpu]; 7341 = sched_group_nodes_bycpu[cpu];
7199 7342
@@ -7203,9 +7346,8 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7203 for (i = 0; i < nr_node_ids; i++) { 7346 for (i = 0; i < nr_node_ids; i++) {
7204 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7347 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7205 7348
7206 *nodemask = node_to_cpumask(i); 7349 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7207 cpus_and(*nodemask, *nodemask, *cpu_map); 7350 if (cpumask_empty(nodemask))
7208 if (cpus_empty(*nodemask))
7209 continue; 7351 continue;
7210 7352
7211 if (sg == NULL) 7353 if (sg == NULL)
@@ -7223,7 +7365,8 @@ next_sg:
7223 } 7365 }
7224} 7366}
7225#else /* !CONFIG_NUMA */ 7367#else /* !CONFIG_NUMA */
7226static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7368static void free_sched_groups(const struct cpumask *cpu_map,
7369 struct cpumask *nodemask)
7227{ 7370{
7228} 7371}
7229#endif /* CONFIG_NUMA */ 7372#endif /* CONFIG_NUMA */
@@ -7249,7 +7392,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7249 7392
7250 WARN_ON(!sd || !sd->groups); 7393 WARN_ON(!sd || !sd->groups);
7251 7394
7252 if (cpu != first_cpu(sd->groups->cpumask)) 7395 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7253 return; 7396 return;
7254 7397
7255 child = sd->child; 7398 child = sd->child;
@@ -7314,40 +7457,6 @@ SD_INIT_FUNC(CPU)
7314 SD_INIT_FUNC(MC) 7457 SD_INIT_FUNC(MC)
7315#endif 7458#endif
7316 7459
7317/*
7318 * To minimize stack usage kmalloc room for cpumasks and share the
7319 * space as the usage in build_sched_domains() dictates. Used only
7320 * if the amount of space is significant.
7321 */
7322struct allmasks {
7323 cpumask_t tmpmask; /* make this one first */
7324 union {
7325 cpumask_t nodemask;
7326 cpumask_t this_sibling_map;
7327 cpumask_t this_core_map;
7328 };
7329 cpumask_t send_covered;
7330
7331#ifdef CONFIG_NUMA
7332 cpumask_t domainspan;
7333 cpumask_t covered;
7334 cpumask_t notcovered;
7335#endif
7336};
7337
7338#if NR_CPUS > 128
7339#define SCHED_CPUMASK_ALLOC 1
7340#define SCHED_CPUMASK_FREE(v) kfree(v)
7341#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7342#else
7343#define SCHED_CPUMASK_ALLOC 0
7344#define SCHED_CPUMASK_FREE(v)
7345#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7346#endif
7347
7348#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7349 ((unsigned long)(a) + offsetof(struct allmasks, v))
7350
7351static int default_relax_domain_level = -1; 7460static int default_relax_domain_level = -1;
7352 7461
7353static int __init setup_relax_domain_level(char *str) 7462static int __init setup_relax_domain_level(char *str)
@@ -7387,17 +7496,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7387 * Build sched domains for a given set of cpus and attach the sched domains 7496 * Build sched domains for a given set of cpus and attach the sched domains
7388 * to the individual cpus 7497 * to the individual cpus
7389 */ 7498 */
7390static int __build_sched_domains(const cpumask_t *cpu_map, 7499static int __build_sched_domains(const struct cpumask *cpu_map,
7391 struct sched_domain_attr *attr) 7500 struct sched_domain_attr *attr)
7392{ 7501{
7393 int i; 7502 int i, err = -ENOMEM;
7394 struct root_domain *rd; 7503 struct root_domain *rd;
7395 SCHED_CPUMASK_DECLARE(allmasks); 7504 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7396 cpumask_t *tmpmask; 7505 tmpmask;
7397#ifdef CONFIG_NUMA 7506#ifdef CONFIG_NUMA
7507 cpumask_var_t domainspan, covered, notcovered;
7398 struct sched_group **sched_group_nodes = NULL; 7508 struct sched_group **sched_group_nodes = NULL;
7399 int sd_allnodes = 0; 7509 int sd_allnodes = 0;
7400 7510
7511 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7512 goto out;
7513 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7514 goto free_domainspan;
7515 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7516 goto free_covered;
7517#endif
7518
7519 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7520 goto free_notcovered;
7521 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7522 goto free_nodemask;
7523 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7524 goto free_this_sibling_map;
7525 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7526 goto free_this_core_map;
7527 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7528 goto free_send_covered;
7529
7530#ifdef CONFIG_NUMA
7401 /* 7531 /*
7402 * Allocate the per-node list of sched groups 7532 * Allocate the per-node list of sched groups
7403 */ 7533 */
@@ -7405,55 +7535,35 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7405 GFP_KERNEL); 7535 GFP_KERNEL);
7406 if (!sched_group_nodes) { 7536 if (!sched_group_nodes) {
7407 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7537 printk(KERN_WARNING "Can not alloc sched group node list\n");
7408 return -ENOMEM; 7538 goto free_tmpmask;
7409 } 7539 }
7410#endif 7540#endif
7411 7541
7412 rd = alloc_rootdomain(); 7542 rd = alloc_rootdomain();
7413 if (!rd) { 7543 if (!rd) {
7414 printk(KERN_WARNING "Cannot alloc root domain\n"); 7544 printk(KERN_WARNING "Cannot alloc root domain\n");
7415#ifdef CONFIG_NUMA 7545 goto free_sched_groups;
7416 kfree(sched_group_nodes);
7417#endif
7418 return -ENOMEM;
7419 } 7546 }
7420 7547
7421#if SCHED_CPUMASK_ALLOC
7422 /* get space for all scratch cpumask variables */
7423 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7424 if (!allmasks) {
7425 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7426 kfree(rd);
7427#ifdef CONFIG_NUMA 7548#ifdef CONFIG_NUMA
7428 kfree(sched_group_nodes); 7549 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7429#endif
7430 return -ENOMEM;
7431 }
7432#endif
7433 tmpmask = (cpumask_t *)allmasks;
7434
7435
7436#ifdef CONFIG_NUMA
7437 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7438#endif 7550#endif
7439 7551
7440 /* 7552 /*
7441 * Set up domains for cpus specified by the cpu_map. 7553 * Set up domains for cpus specified by the cpu_map.
7442 */ 7554 */
7443 for_each_cpu_mask_nr(i, *cpu_map) { 7555 for_each_cpu(i, cpu_map) {
7444 struct sched_domain *sd = NULL, *p; 7556 struct sched_domain *sd = NULL, *p;
7445 SCHED_CPUMASK_VAR(nodemask, allmasks);
7446 7557
7447 *nodemask = node_to_cpumask(cpu_to_node(i)); 7558 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
7448 cpus_and(*nodemask, *nodemask, *cpu_map);
7449 7559
7450#ifdef CONFIG_NUMA 7560#ifdef CONFIG_NUMA
7451 if (cpus_weight(*cpu_map) > 7561 if (cpumask_weight(cpu_map) >
7452 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7562 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7453 sd = &per_cpu(allnodes_domains, i); 7563 sd = &per_cpu(allnodes_domains, i);
7454 SD_INIT(sd, ALLNODES); 7564 SD_INIT(sd, ALLNODES);
7455 set_domain_attribute(sd, attr); 7565 set_domain_attribute(sd, attr);
7456 sd->span = *cpu_map; 7566 cpumask_copy(sched_domain_span(sd), cpu_map);
7457 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7567 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7458 p = sd; 7568 p = sd;
7459 sd_allnodes = 1; 7569 sd_allnodes = 1;
@@ -7463,18 +7573,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7463 sd = &per_cpu(node_domains, i); 7573 sd = &per_cpu(node_domains, i);
7464 SD_INIT(sd, NODE); 7574 SD_INIT(sd, NODE);
7465 set_domain_attribute(sd, attr); 7575 set_domain_attribute(sd, attr);
7466 sched_domain_node_span(cpu_to_node(i), &sd->span); 7576 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7467 sd->parent = p; 7577 sd->parent = p;
7468 if (p) 7578 if (p)
7469 p->child = sd; 7579 p->child = sd;
7470 cpus_and(sd->span, sd->span, *cpu_map); 7580 cpumask_and(sched_domain_span(sd),
7581 sched_domain_span(sd), cpu_map);
7471#endif 7582#endif
7472 7583
7473 p = sd; 7584 p = sd;
7474 sd = &per_cpu(phys_domains, i); 7585 sd = &per_cpu(phys_domains, i).sd;
7475 SD_INIT(sd, CPU); 7586 SD_INIT(sd, CPU);
7476 set_domain_attribute(sd, attr); 7587 set_domain_attribute(sd, attr);
7477 sd->span = *nodemask; 7588 cpumask_copy(sched_domain_span(sd), nodemask);
7478 sd->parent = p; 7589 sd->parent = p;
7479 if (p) 7590 if (p)
7480 p->child = sd; 7591 p->child = sd;
@@ -7482,11 +7593,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7482 7593
7483#ifdef CONFIG_SCHED_MC 7594#ifdef CONFIG_SCHED_MC
7484 p = sd; 7595 p = sd;
7485 sd = &per_cpu(core_domains, i); 7596 sd = &per_cpu(core_domains, i).sd;
7486 SD_INIT(sd, MC); 7597 SD_INIT(sd, MC);
7487 set_domain_attribute(sd, attr); 7598 set_domain_attribute(sd, attr);
7488 sd->span = cpu_coregroup_map(i); 7599 cpumask_and(sched_domain_span(sd), cpu_map,
7489 cpus_and(sd->span, sd->span, *cpu_map); 7600 cpu_coregroup_mask(i));
7490 sd->parent = p; 7601 sd->parent = p;
7491 p->child = sd; 7602 p->child = sd;
7492 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7603 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7494,11 +7605,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7494 7605
7495#ifdef CONFIG_SCHED_SMT 7606#ifdef CONFIG_SCHED_SMT
7496 p = sd; 7607 p = sd;
7497 sd = &per_cpu(cpu_domains, i); 7608 sd = &per_cpu(cpu_domains, i).sd;
7498 SD_INIT(sd, SIBLING); 7609 SD_INIT(sd, SIBLING);
7499 set_domain_attribute(sd, attr); 7610 set_domain_attribute(sd, attr);
7500 sd->span = per_cpu(cpu_sibling_map, i); 7611 cpumask_and(sched_domain_span(sd),
7501 cpus_and(sd->span, sd->span, *cpu_map); 7612 &per_cpu(cpu_sibling_map, i), cpu_map);
7502 sd->parent = p; 7613 sd->parent = p;
7503 p->child = sd; 7614 p->child = sd;
7504 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7615 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7507,13 +7618,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7507 7618
7508#ifdef CONFIG_SCHED_SMT 7619#ifdef CONFIG_SCHED_SMT
7509 /* Set up CPU (sibling) groups */ 7620 /* Set up CPU (sibling) groups */
7510 for_each_cpu_mask_nr(i, *cpu_map) { 7621 for_each_cpu(i, cpu_map) {
7511 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7622 cpumask_and(this_sibling_map,
7512 SCHED_CPUMASK_VAR(send_covered, allmasks); 7623 &per_cpu(cpu_sibling_map, i), cpu_map);
7513 7624 if (i != cpumask_first(this_sibling_map))
7514 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7515 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7516 if (i != first_cpu(*this_sibling_map))
7517 continue; 7625 continue;
7518 7626
7519 init_sched_build_groups(this_sibling_map, cpu_map, 7627 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7524,13 +7632,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7524 7632
7525#ifdef CONFIG_SCHED_MC 7633#ifdef CONFIG_SCHED_MC
7526 /* Set up multi-core groups */ 7634 /* Set up multi-core groups */
7527 for_each_cpu_mask_nr(i, *cpu_map) { 7635 for_each_cpu(i, cpu_map) {
7528 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7636 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
7529 SCHED_CPUMASK_VAR(send_covered, allmasks); 7637 if (i != cpumask_first(this_core_map))
7530
7531 *this_core_map = cpu_coregroup_map(i);
7532 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7533 if (i != first_cpu(*this_core_map))
7534 continue; 7638 continue;
7535 7639
7536 init_sched_build_groups(this_core_map, cpu_map, 7640 init_sched_build_groups(this_core_map, cpu_map,
@@ -7541,12 +7645,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7541 7645
7542 /* Set up physical groups */ 7646 /* Set up physical groups */
7543 for (i = 0; i < nr_node_ids; i++) { 7647 for (i = 0; i < nr_node_ids; i++) {
7544 SCHED_CPUMASK_VAR(nodemask, allmasks); 7648 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7545 SCHED_CPUMASK_VAR(send_covered, allmasks); 7649 if (cpumask_empty(nodemask))
7546
7547 *nodemask = node_to_cpumask(i);
7548 cpus_and(*nodemask, *nodemask, *cpu_map);
7549 if (cpus_empty(*nodemask))
7550 continue; 7650 continue;
7551 7651
7552 init_sched_build_groups(nodemask, cpu_map, 7652 init_sched_build_groups(nodemask, cpu_map,
@@ -7557,8 +7657,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7557#ifdef CONFIG_NUMA 7657#ifdef CONFIG_NUMA
7558 /* Set up node groups */ 7658 /* Set up node groups */
7559 if (sd_allnodes) { 7659 if (sd_allnodes) {
7560 SCHED_CPUMASK_VAR(send_covered, allmasks);
7561
7562 init_sched_build_groups(cpu_map, cpu_map, 7660 init_sched_build_groups(cpu_map, cpu_map,
7563 &cpu_to_allnodes_group, 7661 &cpu_to_allnodes_group,
7564 send_covered, tmpmask); 7662 send_covered, tmpmask);
@@ -7567,58 +7665,53 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7567 for (i = 0; i < nr_node_ids; i++) { 7665 for (i = 0; i < nr_node_ids; i++) {
7568 /* Set up node groups */ 7666 /* Set up node groups */
7569 struct sched_group *sg, *prev; 7667 struct sched_group *sg, *prev;
7570 SCHED_CPUMASK_VAR(nodemask, allmasks);
7571 SCHED_CPUMASK_VAR(domainspan, allmasks);
7572 SCHED_CPUMASK_VAR(covered, allmasks);
7573 int j; 7668 int j;
7574 7669
7575 *nodemask = node_to_cpumask(i); 7670 cpumask_clear(covered);
7576 cpus_clear(*covered); 7671 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7577 7672 if (cpumask_empty(nodemask)) {
7578 cpus_and(*nodemask, *nodemask, *cpu_map);
7579 if (cpus_empty(*nodemask)) {
7580 sched_group_nodes[i] = NULL; 7673 sched_group_nodes[i] = NULL;
7581 continue; 7674 continue;
7582 } 7675 }
7583 7676
7584 sched_domain_node_span(i, domainspan); 7677 sched_domain_node_span(i, domainspan);
7585 cpus_and(*domainspan, *domainspan, *cpu_map); 7678 cpumask_and(domainspan, domainspan, cpu_map);
7586 7679
7587 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7680 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7681 GFP_KERNEL, i);
7588 if (!sg) { 7682 if (!sg) {
7589 printk(KERN_WARNING "Can not alloc domain group for " 7683 printk(KERN_WARNING "Can not alloc domain group for "
7590 "node %d\n", i); 7684 "node %d\n", i);
7591 goto error; 7685 goto error;
7592 } 7686 }
7593 sched_group_nodes[i] = sg; 7687 sched_group_nodes[i] = sg;
7594 for_each_cpu_mask_nr(j, *nodemask) { 7688 for_each_cpu(j, nodemask) {
7595 struct sched_domain *sd; 7689 struct sched_domain *sd;
7596 7690
7597 sd = &per_cpu(node_domains, j); 7691 sd = &per_cpu(node_domains, j);
7598 sd->groups = sg; 7692 sd->groups = sg;
7599 } 7693 }
7600 sg->__cpu_power = 0; 7694 sg->__cpu_power = 0;
7601 sg->cpumask = *nodemask; 7695 cpumask_copy(sched_group_cpus(sg), nodemask);
7602 sg->next = sg; 7696 sg->next = sg;
7603 cpus_or(*covered, *covered, *nodemask); 7697 cpumask_or(covered, covered, nodemask);
7604 prev = sg; 7698 prev = sg;
7605 7699
7606 for (j = 0; j < nr_node_ids; j++) { 7700 for (j = 0; j < nr_node_ids; j++) {
7607 SCHED_CPUMASK_VAR(notcovered, allmasks);
7608 int n = (i + j) % nr_node_ids; 7701 int n = (i + j) % nr_node_ids;
7609 node_to_cpumask_ptr(pnodemask, n);
7610 7702
7611 cpus_complement(*notcovered, *covered); 7703 cpumask_complement(notcovered, covered);
7612 cpus_and(*tmpmask, *notcovered, *cpu_map); 7704 cpumask_and(tmpmask, notcovered, cpu_map);
7613 cpus_and(*tmpmask, *tmpmask, *domainspan); 7705 cpumask_and(tmpmask, tmpmask, domainspan);
7614 if (cpus_empty(*tmpmask)) 7706 if (cpumask_empty(tmpmask))
7615 break; 7707 break;
7616 7708
7617 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7709 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
7618 if (cpus_empty(*tmpmask)) 7710 if (cpumask_empty(tmpmask))
7619 continue; 7711 continue;
7620 7712
7621 sg = kmalloc_node(sizeof(struct sched_group), 7713 sg = kmalloc_node(sizeof(struct sched_group) +
7714 cpumask_size(),
7622 GFP_KERNEL, i); 7715 GFP_KERNEL, i);
7623 if (!sg) { 7716 if (!sg) {
7624 printk(KERN_WARNING 7717 printk(KERN_WARNING
@@ -7626,9 +7719,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7626 goto error; 7719 goto error;
7627 } 7720 }
7628 sg->__cpu_power = 0; 7721 sg->__cpu_power = 0;
7629 sg->cpumask = *tmpmask; 7722 cpumask_copy(sched_group_cpus(sg), tmpmask);
7630 sg->next = prev->next; 7723 sg->next = prev->next;
7631 cpus_or(*covered, *covered, *tmpmask); 7724 cpumask_or(covered, covered, tmpmask);
7632 prev->next = sg; 7725 prev->next = sg;
7633 prev = sg; 7726 prev = sg;
7634 } 7727 }
@@ -7637,22 +7730,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7637 7730
7638 /* Calculate CPU power for physical packages and nodes */ 7731 /* Calculate CPU power for physical packages and nodes */
7639#ifdef CONFIG_SCHED_SMT 7732#ifdef CONFIG_SCHED_SMT
7640 for_each_cpu_mask_nr(i, *cpu_map) { 7733 for_each_cpu(i, cpu_map) {
7641 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7734 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7642 7735
7643 init_sched_groups_power(i, sd); 7736 init_sched_groups_power(i, sd);
7644 } 7737 }
7645#endif 7738#endif
7646#ifdef CONFIG_SCHED_MC 7739#ifdef CONFIG_SCHED_MC
7647 for_each_cpu_mask_nr(i, *cpu_map) { 7740 for_each_cpu(i, cpu_map) {
7648 struct sched_domain *sd = &per_cpu(core_domains, i); 7741 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7649 7742
7650 init_sched_groups_power(i, sd); 7743 init_sched_groups_power(i, sd);
7651 } 7744 }
7652#endif 7745#endif
7653 7746
7654 for_each_cpu_mask_nr(i, *cpu_map) { 7747 for_each_cpu(i, cpu_map) {
7655 struct sched_domain *sd = &per_cpu(phys_domains, i); 7748 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7656 7749
7657 init_sched_groups_power(i, sd); 7750 init_sched_groups_power(i, sd);
7658 } 7751 }
@@ -7664,56 +7757,87 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7664 if (sd_allnodes) { 7757 if (sd_allnodes) {
7665 struct sched_group *sg; 7758 struct sched_group *sg;
7666 7759
7667 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7760 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7668 tmpmask); 7761 tmpmask);
7669 init_numa_sched_groups_power(sg); 7762 init_numa_sched_groups_power(sg);
7670 } 7763 }
7671#endif 7764#endif
7672 7765
7673 /* Attach the domains */ 7766 /* Attach the domains */
7674 for_each_cpu_mask_nr(i, *cpu_map) { 7767 for_each_cpu(i, cpu_map) {
7675 struct sched_domain *sd; 7768 struct sched_domain *sd;
7676#ifdef CONFIG_SCHED_SMT 7769#ifdef CONFIG_SCHED_SMT
7677 sd = &per_cpu(cpu_domains, i); 7770 sd = &per_cpu(cpu_domains, i).sd;
7678#elif defined(CONFIG_SCHED_MC) 7771#elif defined(CONFIG_SCHED_MC)
7679 sd = &per_cpu(core_domains, i); 7772 sd = &per_cpu(core_domains, i).sd;
7680#else 7773#else
7681 sd = &per_cpu(phys_domains, i); 7774 sd = &per_cpu(phys_domains, i).sd;
7682#endif 7775#endif
7683 cpu_attach_domain(sd, rd, i); 7776 cpu_attach_domain(sd, rd, i);
7684 } 7777 }
7685 7778
7686 SCHED_CPUMASK_FREE((void *)allmasks); 7779 err = 0;
7687 return 0; 7780
7781free_tmpmask:
7782 free_cpumask_var(tmpmask);
7783free_send_covered:
7784 free_cpumask_var(send_covered);
7785free_this_core_map:
7786 free_cpumask_var(this_core_map);
7787free_this_sibling_map:
7788 free_cpumask_var(this_sibling_map);
7789free_nodemask:
7790 free_cpumask_var(nodemask);
7791free_notcovered:
7792#ifdef CONFIG_NUMA
7793 free_cpumask_var(notcovered);
7794free_covered:
7795 free_cpumask_var(covered);
7796free_domainspan:
7797 free_cpumask_var(domainspan);
7798out:
7799#endif
7800 return err;
7801
7802free_sched_groups:
7803#ifdef CONFIG_NUMA
7804 kfree(sched_group_nodes);
7805#endif
7806 goto free_tmpmask;
7688 7807
7689#ifdef CONFIG_NUMA 7808#ifdef CONFIG_NUMA
7690error: 7809error:
7691 free_sched_groups(cpu_map, tmpmask); 7810 free_sched_groups(cpu_map, tmpmask);
7692 SCHED_CPUMASK_FREE((void *)allmasks); 7811 free_rootdomain(rd);
7693 kfree(rd); 7812 goto free_tmpmask;
7694 return -ENOMEM;
7695#endif 7813#endif
7696} 7814}
7697 7815
7698static int build_sched_domains(const cpumask_t *cpu_map) 7816static int build_sched_domains(const struct cpumask *cpu_map)
7699{ 7817{
7700 return __build_sched_domains(cpu_map, NULL); 7818 return __build_sched_domains(cpu_map, NULL);
7701} 7819}
7702 7820
7703static cpumask_t *doms_cur; /* current sched domains */ 7821static struct cpumask *doms_cur; /* current sched domains */
7704static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7822static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7705static struct sched_domain_attr *dattr_cur; 7823static struct sched_domain_attr *dattr_cur;
7706 /* attribues of custom domains in 'doms_cur' */ 7824 /* attribues of custom domains in 'doms_cur' */
7707 7825
7708/* 7826/*
7709 * Special case: If a kmalloc of a doms_cur partition (array of 7827 * Special case: If a kmalloc of a doms_cur partition (array of
7710 * cpumask_t) fails, then fallback to a single sched domain, 7828 * cpumask) fails, then fallback to a single sched domain,
7711 * as determined by the single cpumask_t fallback_doms. 7829 * as determined by the single cpumask fallback_doms.
7712 */ 7830 */
7713static cpumask_t fallback_doms; 7831static cpumask_var_t fallback_doms;
7714 7832
7715void __attribute__((weak)) arch_update_cpu_topology(void) 7833/*
7834 * arch_update_cpu_topology lets virtualized architectures update the
7835 * cpu core maps. It is supposed to return 1 if the topology changed
7836 * or 0 if it stayed the same.
7837 */
7838int __attribute__((weak)) arch_update_cpu_topology(void)
7716{ 7839{
7840 return 0;
7717} 7841}
7718 7842
7719/* 7843/*
@@ -7721,16 +7845,16 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7721 * For now this just excludes isolated cpus, but could be used to 7845 * For now this just excludes isolated cpus, but could be used to
7722 * exclude other special cases in the future. 7846 * exclude other special cases in the future.
7723 */ 7847 */
7724static int arch_init_sched_domains(const cpumask_t *cpu_map) 7848static int arch_init_sched_domains(const struct cpumask *cpu_map)
7725{ 7849{
7726 int err; 7850 int err;
7727 7851
7728 arch_update_cpu_topology(); 7852 arch_update_cpu_topology();
7729 ndoms_cur = 1; 7853 ndoms_cur = 1;
7730 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7854 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7731 if (!doms_cur) 7855 if (!doms_cur)
7732 doms_cur = &fallback_doms; 7856 doms_cur = fallback_doms;
7733 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7857 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7734 dattr_cur = NULL; 7858 dattr_cur = NULL;
7735 err = build_sched_domains(doms_cur); 7859 err = build_sched_domains(doms_cur);
7736 register_sched_domain_sysctl(); 7860 register_sched_domain_sysctl();
@@ -7738,8 +7862,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7738 return err; 7862 return err;
7739} 7863}
7740 7864
7741static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7865static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7742 cpumask_t *tmpmask) 7866 struct cpumask *tmpmask)
7743{ 7867{
7744 free_sched_groups(cpu_map, tmpmask); 7868 free_sched_groups(cpu_map, tmpmask);
7745} 7869}
@@ -7748,17 +7872,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7748 * Detach sched domains from a group of cpus specified in cpu_map 7872 * Detach sched domains from a group of cpus specified in cpu_map
7749 * These cpus will now be attached to the NULL domain 7873 * These cpus will now be attached to the NULL domain
7750 */ 7874 */
7751static void detach_destroy_domains(const cpumask_t *cpu_map) 7875static void detach_destroy_domains(const struct cpumask *cpu_map)
7752{ 7876{
7753 cpumask_t tmpmask; 7877 /* Save because hotplug lock held. */
7878 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7754 int i; 7879 int i;
7755 7880
7756 unregister_sched_domain_sysctl(); 7881 for_each_cpu(i, cpu_map)
7757
7758 for_each_cpu_mask_nr(i, *cpu_map)
7759 cpu_attach_domain(NULL, &def_root_domain, i); 7882 cpu_attach_domain(NULL, &def_root_domain, i);
7760 synchronize_sched(); 7883 synchronize_sched();
7761 arch_destroy_sched_domains(cpu_map, &tmpmask); 7884 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7762} 7885}
7763 7886
7764/* handle null as "default" */ 7887/* handle null as "default" */
@@ -7783,7 +7906,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7783 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7906 * doms_new[] to the current sched domain partitioning, doms_cur[].
7784 * It destroys each deleted domain and builds each new domain. 7907 * It destroys each deleted domain and builds each new domain.
7785 * 7908 *
7786 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7909 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7787 * The masks don't intersect (don't overlap.) We should setup one 7910 * The masks don't intersect (don't overlap.) We should setup one
7788 * sched domain for each mask. CPUs not in any of the cpumasks will 7911 * sched domain for each mask. CPUs not in any of the cpumasks will
7789 * not be load balanced. If the same cpumask appears both in the 7912 * not be load balanced. If the same cpumask appears both in the
@@ -7797,28 +7920,33 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7797 * the single partition 'fallback_doms', it also forces the domains 7920 * the single partition 'fallback_doms', it also forces the domains
7798 * to be rebuilt. 7921 * to be rebuilt.
7799 * 7922 *
7800 * If doms_new == NULL it will be replaced with cpu_online_map. 7923 * If doms_new == NULL it will be replaced with cpu_online_mask.
7801 * ndoms_new == 0 is a special case for destroying existing domains, 7924 * ndoms_new == 0 is a special case for destroying existing domains,
7802 * and it will not create the default domain. 7925 * and it will not create the default domain.
7803 * 7926 *
7804 * Call with hotplug lock held 7927 * Call with hotplug lock held
7805 */ 7928 */
7806void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7929/* FIXME: Change to struct cpumask *doms_new[] */
7930void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7807 struct sched_domain_attr *dattr_new) 7931 struct sched_domain_attr *dattr_new)
7808{ 7932{
7809 int i, j, n; 7933 int i, j, n;
7934 int new_topology;
7810 7935
7811 mutex_lock(&sched_domains_mutex); 7936 mutex_lock(&sched_domains_mutex);
7812 7937
7813 /* always unregister in case we don't destroy any domains */ 7938 /* always unregister in case we don't destroy any domains */
7814 unregister_sched_domain_sysctl(); 7939 unregister_sched_domain_sysctl();
7815 7940
7941 /* Let architecture update cpu core mappings. */
7942 new_topology = arch_update_cpu_topology();
7943
7816 n = doms_new ? ndoms_new : 0; 7944 n = doms_new ? ndoms_new : 0;
7817 7945
7818 /* Destroy deleted domains */ 7946 /* Destroy deleted domains */
7819 for (i = 0; i < ndoms_cur; i++) { 7947 for (i = 0; i < ndoms_cur; i++) {
7820 for (j = 0; j < n; j++) { 7948 for (j = 0; j < n && !new_topology; j++) {
7821 if (cpus_equal(doms_cur[i], doms_new[j]) 7949 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7822 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7950 && dattrs_equal(dattr_cur, i, dattr_new, j))
7823 goto match1; 7951 goto match1;
7824 } 7952 }
@@ -7830,15 +7958,15 @@ match1:
7830 7958
7831 if (doms_new == NULL) { 7959 if (doms_new == NULL) {
7832 ndoms_cur = 0; 7960 ndoms_cur = 0;
7833 doms_new = &fallback_doms; 7961 doms_new = fallback_doms;
7834 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7962 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7835 dattr_new = NULL; 7963 WARN_ON_ONCE(dattr_new);
7836 } 7964 }
7837 7965
7838 /* Build new domains */ 7966 /* Build new domains */
7839 for (i = 0; i < ndoms_new; i++) { 7967 for (i = 0; i < ndoms_new; i++) {
7840 for (j = 0; j < ndoms_cur; j++) { 7968 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7841 if (cpus_equal(doms_new[i], doms_cur[j]) 7969 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7842 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7970 && dattrs_equal(dattr_new, i, dattr_cur, j))
7843 goto match2; 7971 goto match2;
7844 } 7972 }
@@ -7850,7 +7978,7 @@ match2:
7850 } 7978 }
7851 7979
7852 /* Remember the new sched domains */ 7980 /* Remember the new sched domains */
7853 if (doms_cur != &fallback_doms) 7981 if (doms_cur != fallback_doms)
7854 kfree(doms_cur); 7982 kfree(doms_cur);
7855 kfree(dattr_cur); /* kfree(NULL) is safe */ 7983 kfree(dattr_cur); /* kfree(NULL) is safe */
7856 doms_cur = doms_new; 7984 doms_cur = doms_new;
@@ -7863,7 +7991,7 @@ match2:
7863} 7991}
7864 7992
7865#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7993#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7866int arch_reinit_sched_domains(void) 7994static void arch_reinit_sched_domains(void)
7867{ 7995{
7868 get_online_cpus(); 7996 get_online_cpus();
7869 7997
@@ -7872,25 +8000,33 @@ int arch_reinit_sched_domains(void)
7872 8000
7873 rebuild_sched_domains(); 8001 rebuild_sched_domains();
7874 put_online_cpus(); 8002 put_online_cpus();
7875
7876 return 0;
7877} 8003}
7878 8004
7879static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 8005static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7880{ 8006{
7881 int ret; 8007 unsigned int level = 0;
7882 8008
7883 if (buf[0] != '0' && buf[0] != '1') 8009 if (sscanf(buf, "%u", &level) != 1)
8010 return -EINVAL;
8011
8012 /*
8013 * level is always be positive so don't check for
8014 * level < POWERSAVINGS_BALANCE_NONE which is 0
8015 * What happens on 0 or 1 byte write,
8016 * need to check for count as well?
8017 */
8018
8019 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7884 return -EINVAL; 8020 return -EINVAL;
7885 8021
7886 if (smt) 8022 if (smt)
7887 sched_smt_power_savings = (buf[0] == '1'); 8023 sched_smt_power_savings = level;
7888 else 8024 else
7889 sched_mc_power_savings = (buf[0] == '1'); 8025 sched_mc_power_savings = level;
7890 8026
7891 ret = arch_reinit_sched_domains(); 8027 arch_reinit_sched_domains();
7892 8028
7893 return ret ? ret : count; 8029 return count;
7894} 8030}
7895 8031
7896#ifdef CONFIG_SCHED_MC 8032#ifdef CONFIG_SCHED_MC
@@ -7925,7 +8061,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7925 sched_smt_power_savings_store); 8061 sched_smt_power_savings_store);
7926#endif 8062#endif
7927 8063
7928int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 8064int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7929{ 8065{
7930 int err = 0; 8066 int err = 0;
7931 8067
@@ -7990,7 +8126,9 @@ static int update_runtime(struct notifier_block *nfb,
7990 8126
7991void __init sched_init_smp(void) 8127void __init sched_init_smp(void)
7992{ 8128{
7993 cpumask_t non_isolated_cpus; 8129 cpumask_var_t non_isolated_cpus;
8130
8131 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7994 8132
7995#if defined(CONFIG_NUMA) 8133#if defined(CONFIG_NUMA)
7996 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8134 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7999,10 +8137,10 @@ void __init sched_init_smp(void)
7999#endif 8137#endif
8000 get_online_cpus(); 8138 get_online_cpus();
8001 mutex_lock(&sched_domains_mutex); 8139 mutex_lock(&sched_domains_mutex);
8002 arch_init_sched_domains(&cpu_online_map); 8140 arch_init_sched_domains(cpu_online_mask);
8003 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8141 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8004 if (cpus_empty(non_isolated_cpus)) 8142 if (cpumask_empty(non_isolated_cpus))
8005 cpu_set(smp_processor_id(), non_isolated_cpus); 8143 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8006 mutex_unlock(&sched_domains_mutex); 8144 mutex_unlock(&sched_domains_mutex);
8007 put_online_cpus(); 8145 put_online_cpus();
8008 8146
@@ -8017,9 +8155,13 @@ void __init sched_init_smp(void)
8017 init_hrtick(); 8155 init_hrtick();
8018 8156
8019 /* Move init over to a non-isolated CPU */ 8157 /* Move init over to a non-isolated CPU */
8020 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8158 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8021 BUG(); 8159 BUG();
8022 sched_init_granularity(); 8160 sched_init_granularity();
8161 free_cpumask_var(non_isolated_cpus);
8162
8163 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8164 init_sched_rt_class();
8023} 8165}
8024#else 8166#else
8025void __init sched_init_smp(void) 8167void __init sched_init_smp(void)
@@ -8334,6 +8476,15 @@ void __init sched_init(void)
8334 */ 8476 */
8335 current->sched_class = &fair_sched_class; 8477 current->sched_class = &fair_sched_class;
8336 8478
8479 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8480 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8481#ifdef CONFIG_SMP
8482#ifdef CONFIG_NO_HZ
8483 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8484#endif
8485 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8486#endif /* SMP */
8487
8337 scheduler_running = 1; 8488 scheduler_running = 1;
8338} 8489}
8339 8490
@@ -8492,7 +8643,7 @@ static
8492int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8643int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8493{ 8644{
8494 struct cfs_rq *cfs_rq; 8645 struct cfs_rq *cfs_rq;
8495 struct sched_entity *se, *parent_se; 8646 struct sched_entity *se;
8496 struct rq *rq; 8647 struct rq *rq;
8497 int i; 8648 int i;
8498 8649
@@ -8508,18 +8659,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8508 for_each_possible_cpu(i) { 8659 for_each_possible_cpu(i) {
8509 rq = cpu_rq(i); 8660 rq = cpu_rq(i);
8510 8661
8511 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8662 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8512 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8663 GFP_KERNEL, cpu_to_node(i));
8513 if (!cfs_rq) 8664 if (!cfs_rq)
8514 goto err; 8665 goto err;
8515 8666
8516 se = kmalloc_node(sizeof(struct sched_entity), 8667 se = kzalloc_node(sizeof(struct sched_entity),
8517 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8668 GFP_KERNEL, cpu_to_node(i));
8518 if (!se) 8669 if (!se)
8519 goto err; 8670 goto err;
8520 8671
8521 parent_se = parent ? parent->se[i] : NULL; 8672 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8522 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8523 } 8673 }
8524 8674
8525 return 1; 8675 return 1;
@@ -8580,7 +8730,7 @@ static
8580int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8730int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8581{ 8731{
8582 struct rt_rq *rt_rq; 8732 struct rt_rq *rt_rq;
8583 struct sched_rt_entity *rt_se, *parent_se; 8733 struct sched_rt_entity *rt_se;
8584 struct rq *rq; 8734 struct rq *rq;
8585 int i; 8735 int i;
8586 8736
@@ -8597,18 +8747,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8597 for_each_possible_cpu(i) { 8747 for_each_possible_cpu(i) {
8598 rq = cpu_rq(i); 8748 rq = cpu_rq(i);
8599 8749
8600 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8750 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8751 GFP_KERNEL, cpu_to_node(i));
8602 if (!rt_rq) 8752 if (!rt_rq)
8603 goto err; 8753 goto err;
8604 8754
8605 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8755 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8756 GFP_KERNEL, cpu_to_node(i));
8607 if (!rt_se) 8757 if (!rt_se)
8608 goto err; 8758 goto err;
8609 8759
8610 parent_se = parent ? parent->rt_se[i] : NULL; 8760 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8611 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8612 } 8761 }
8613 8762
8614 return 1; 8763 return 1;
@@ -9251,11 +9400,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9251 * (balbir@in.ibm.com). 9400 * (balbir@in.ibm.com).
9252 */ 9401 */
9253 9402
9254/* track cpu usage of a group of tasks */ 9403/* track cpu usage of a group of tasks and its child groups */
9255struct cpuacct { 9404struct cpuacct {
9256 struct cgroup_subsys_state css; 9405 struct cgroup_subsys_state css;
9257 /* cpuusage holds pointer to a u64-type object on every cpu */ 9406 /* cpuusage holds pointer to a u64-type object on every cpu */
9258 u64 *cpuusage; 9407 u64 *cpuusage;
9408 struct cpuacct *parent;
9259}; 9409};
9260 9410
9261struct cgroup_subsys cpuacct_subsys; 9411struct cgroup_subsys cpuacct_subsys;
@@ -9289,6 +9439,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9289 return ERR_PTR(-ENOMEM); 9439 return ERR_PTR(-ENOMEM);
9290 } 9440 }
9291 9441
9442 if (cgrp->parent)
9443 ca->parent = cgroup_ca(cgrp->parent);
9444
9292 return &ca->css; 9445 return &ca->css;
9293} 9446}
9294 9447
@@ -9302,6 +9455,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9302 kfree(ca); 9455 kfree(ca);
9303} 9456}
9304 9457
9458static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9459{
9460 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9461 u64 data;
9462
9463#ifndef CONFIG_64BIT
9464 /*
9465 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9466 */
9467 spin_lock_irq(&cpu_rq(cpu)->lock);
9468 data = *cpuusage;
9469 spin_unlock_irq(&cpu_rq(cpu)->lock);
9470#else
9471 data = *cpuusage;
9472#endif
9473
9474 return data;
9475}
9476
9477static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9478{
9479 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9480
9481#ifndef CONFIG_64BIT
9482 /*
9483 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9484 */
9485 spin_lock_irq(&cpu_rq(cpu)->lock);
9486 *cpuusage = val;
9487 spin_unlock_irq(&cpu_rq(cpu)->lock);
9488#else
9489 *cpuusage = val;
9490#endif
9491}
9492
9305/* return total cpu usage (in nanoseconds) of a group */ 9493/* return total cpu usage (in nanoseconds) of a group */
9306static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 9494static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9307{ 9495{
@@ -9309,17 +9497,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9309 u64 totalcpuusage = 0; 9497 u64 totalcpuusage = 0;
9310 int i; 9498 int i;
9311 9499
9312 for_each_possible_cpu(i) { 9500 for_each_present_cpu(i)
9313 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9501 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9314
9315 /*
9316 * Take rq->lock to make 64-bit addition safe on 32-bit
9317 * platforms.
9318 */
9319 spin_lock_irq(&cpu_rq(i)->lock);
9320 totalcpuusage += *cpuusage;
9321 spin_unlock_irq(&cpu_rq(i)->lock);
9322 }
9323 9502
9324 return totalcpuusage; 9503 return totalcpuusage;
9325} 9504}
@@ -9336,23 +9515,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9336 goto out; 9515 goto out;
9337 } 9516 }
9338 9517
9339 for_each_possible_cpu(i) { 9518 for_each_present_cpu(i)
9340 u64 *cpuusage = percpu_ptr(ca->cpuusage, i); 9519 cpuacct_cpuusage_write(ca, i, 0);
9341 9520
9342 spin_lock_irq(&cpu_rq(i)->lock);
9343 *cpuusage = 0;
9344 spin_unlock_irq(&cpu_rq(i)->lock);
9345 }
9346out: 9521out:
9347 return err; 9522 return err;
9348} 9523}
9349 9524
9525static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9526 struct seq_file *m)
9527{
9528 struct cpuacct *ca = cgroup_ca(cgroup);
9529 u64 percpu;
9530 int i;
9531
9532 for_each_present_cpu(i) {
9533 percpu = cpuacct_cpuusage_read(ca, i);
9534 seq_printf(m, "%llu ", (unsigned long long) percpu);
9535 }
9536 seq_printf(m, "\n");
9537 return 0;
9538}
9539
9350static struct cftype files[] = { 9540static struct cftype files[] = {
9351 { 9541 {
9352 .name = "usage", 9542 .name = "usage",
9353 .read_u64 = cpuusage_read, 9543 .read_u64 = cpuusage_read,
9354 .write_u64 = cpuusage_write, 9544 .write_u64 = cpuusage_write,
9355 }, 9545 },
9546 {
9547 .name = "usage_percpu",
9548 .read_seq_string = cpuacct_percpu_seq_read,
9549 },
9550
9356}; 9551};
9357 9552
9358static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9553static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -9368,14 +9563,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9368static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9563static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9369{ 9564{
9370 struct cpuacct *ca; 9565 struct cpuacct *ca;
9566 int cpu;
9371 9567
9372 if (!cpuacct_subsys.active) 9568 if (!cpuacct_subsys.active)
9373 return; 9569 return;
9374 9570
9571 cpu = task_cpu(tsk);
9375 ca = task_ca(tsk); 9572 ca = task_ca(tsk);
9376 if (ca) {
9377 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9378 9573
9574 for (; ca; ca = ca->parent) {
9575 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9379 *cpuusage += cputime; 9576 *cpuusage += cputime;
9380 } 9577 }
9381} 9578}
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..a0b0852414cc 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
227 */ 227 */
228void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
229{ 229{
230 if (timekeeping_suspended)
231 return;
232
230 sched_clock_tick(); 233 sched_clock_tick();
231 touch_softlockup_watchdog(); 234 touch_softlockup_watchdog();
232} 235}
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..1e00bfacf9b8 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
154 * Returns: (void) 152 * Returns: -ENOMEM if memory fails.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 26ed8e3d1c15..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
53 53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg)
59{
60 struct sched_entity *se = tg->se[cpu];
61 if (!se)
62 return;
63
64#define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66#define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68
69 PN(se->exec_start);
70 PN(se->vruntime);
71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start);
74 PN(se->sleep_start);
75 PN(se->block_start);
76 PN(se->sleep_max);
77 PN(se->block_max);
78 PN(se->exec_max);
79 PN(se->slice_max);
80 PN(se->wait_max);
81 PN(se->wait_sum);
82 P(se->wait_count);
83#endif
84 P(se->load.weight);
85#undef PN
86#undef P
87}
88#endif
89
56static void 90static void
57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
58{ 92{
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
121 155
122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 char path[128] = ""; 157 char path[128] = "";
124 struct cgroup *cgroup = NULL;
125 struct task_group *tg = cfs_rq->tg; 158 struct task_group *tg = cfs_rq->tg;
126 159
127 if (tg) 160 cgroup_path(tg->css.cgroup, path, sizeof(path));
128 cgroup = tg->css.cgroup;
129
130 if (cgroup)
131 cgroup_path(cgroup, path, sizeof(path));
132 161
133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
163#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
164 {
165 uid_t uid = cfs_rq->tg->uid;
166 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
167 }
134#else 168#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 169 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 170#endif
137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 171 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
139 SPLIT_NS(cfs_rq->exec_clock)); 172 SPLIT_NS(cfs_rq->exec_clock));
140 173
@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168#ifdef CONFIG_SMP 201#ifdef CONFIG_SMP
169 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 202 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
170#endif 203#endif
204 print_cfs_group_stats(m, cpu, cfs_rq->tg);
171#endif 205#endif
172} 206}
173 207
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
175{ 209{
176#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 210#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
177 char path[128] = ""; 211 char path[128] = "";
178 struct cgroup *cgroup = NULL;
179 struct task_group *tg = rt_rq->tg; 212 struct task_group *tg = rt_rq->tg;
180 213
181 if (tg) 214 cgroup_path(tg->css.cgroup, path, sizeof(path));
182 cgroup = tg->css.cgroup;
183
184 if (cgroup)
185 cgroup_path(cgroup, path, sizeof(path));
186 215
187 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 216 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
188#else 217#else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
272 u64 now = ktime_to_ns(ktime_get()); 301 u64 now = ktime_to_ns(ktime_get());
273 int cpu; 302 int cpu;
274 303
275 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", 304 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
276 init_utsname()->release, 305 init_utsname()->release,
277 (int)strcspn(init_utsname()->version, " "), 306 (int)strcspn(init_utsname()->version, " "),
278 init_utsname()->version); 307 init_utsname()->version);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b059..8e1352c75557 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
386#endif 386#endif
387 387
388/* 388/*
389 * delta *= P[w / rw]
390 */
391static inline unsigned long
392calc_delta_weight(unsigned long delta, struct sched_entity *se)
393{
394 for_each_sched_entity(se) {
395 delta = calc_delta_mine(delta,
396 se->load.weight, &cfs_rq_of(se)->load);
397 }
398
399 return delta;
400}
401
402/*
403 * delta /= w 389 * delta /= w
404 */ 390 */
405static inline unsigned long 391static inline unsigned long
@@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running)
440 */ 426 */
441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 427static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
442{ 428{
443 unsigned long nr_running = cfs_rq->nr_running; 429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
444 430
445 if (unlikely(!se->on_rq)) 431 for_each_sched_entity(se) {
446 nr_running++; 432 struct load_weight *load = &cfs_rq->load;
447 433
448 return calc_delta_weight(__sched_period(nr_running), se); 434 if (unlikely(!se->on_rq)) {
435 struct load_weight lw = cfs_rq->load;
436
437 update_load_add(&lw, se->load.weight);
438 load = &lw;
439 }
440 slice = calc_delta_mine(slice, se->load.weight, load);
441 }
442 return slice;
449} 443}
450 444
451/* 445/*
@@ -492,6 +486,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
492 * overflow on 32 bits): 486 * overflow on 32 bits):
493 */ 487 */
494 delta_exec = (unsigned long)(now - curr->exec_start); 488 delta_exec = (unsigned long)(now - curr->exec_start);
489 if (!delta_exec)
490 return;
495 491
496 __update_curr(cfs_rq, curr, delta_exec); 492 __update_curr(cfs_rq, curr, delta_exec);
497 curr->exec_start = now; 493 curr->exec_start = now;
@@ -1017,16 +1013,33 @@ static void yield_task_fair(struct rq *rq)
1017 * search starts with cpus closest then further out as needed, 1013 * search starts with cpus closest then further out as needed,
1018 * so we always favor a closer, idle cpu. 1014 * so we always favor a closer, idle cpu.
1019 * Domains may include CPUs that are not usable for migration, 1015 * Domains may include CPUs that are not usable for migration,
1020 * hence we need to mask them out (cpu_active_map) 1016 * hence we need to mask them out (cpu_active_mask)
1021 * 1017 *
1022 * Returns the CPU we should wake onto. 1018 * Returns the CPU we should wake onto.
1023 */ 1019 */
1024#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1020#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1025static int wake_idle(int cpu, struct task_struct *p) 1021static int wake_idle(int cpu, struct task_struct *p)
1026{ 1022{
1027 cpumask_t tmp;
1028 struct sched_domain *sd; 1023 struct sched_domain *sd;
1029 int i; 1024 int i;
1025 unsigned int chosen_wakeup_cpu;
1026 int this_cpu;
1027
1028 /*
1029 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1030 * are idle and this is not a kernel thread and this task's affinity
1031 * allows it to be moved to preferred cpu, then just move!
1032 */
1033
1034 this_cpu = smp_processor_id();
1035 chosen_wakeup_cpu =
1036 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1037
1038 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1039 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1040 p->mm && !(p->flags & PF_KTHREAD) &&
1041 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1042 return chosen_wakeup_cpu;
1030 1043
1031 /* 1044 /*
1032 * If it is idle, then it is the best cpu to run this task. 1045 * If it is idle, then it is the best cpu to run this task.
@@ -1044,10 +1057,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1044 if ((sd->flags & SD_WAKE_IDLE) 1057 if ((sd->flags & SD_WAKE_IDLE)
1045 || ((sd->flags & SD_WAKE_IDLE_FAR) 1058 || ((sd->flags & SD_WAKE_IDLE_FAR)
1046 && !task_hot(p, task_rq(p)->clock, sd))) { 1059 && !task_hot(p, task_rq(p)->clock, sd))) {
1047 cpus_and(tmp, sd->span, p->cpus_allowed); 1060 for_each_cpu_and(i, sched_domain_span(sd),
1048 cpus_and(tmp, tmp, cpu_active_map); 1061 &p->cpus_allowed) {
1049 for_each_cpu_mask_nr(i, tmp) { 1062 if (cpu_active(i) && idle_cpu(i)) {
1050 if (idle_cpu(i)) {
1051 if (i != task_cpu(p)) { 1063 if (i != task_cpu(p)) {
1052 schedstat_inc(p, 1064 schedstat_inc(p,
1053 se.nr_wakeups_idle); 1065 se.nr_wakeups_idle);
@@ -1240,13 +1252,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1240 * this_cpu and prev_cpu are present in: 1252 * this_cpu and prev_cpu are present in:
1241 */ 1253 */
1242 for_each_domain(this_cpu, sd) { 1254 for_each_domain(this_cpu, sd) {
1243 if (cpu_isset(prev_cpu, sd->span)) { 1255 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1244 this_sd = sd; 1256 this_sd = sd;
1245 break; 1257 break;
1246 } 1258 }
1247 } 1259 }
1248 1260
1249 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1261 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1250 goto out; 1262 goto out;
1251 1263
1252 /* 1264 /*
@@ -1345,12 +1357,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1345{ 1357{
1346 struct task_struct *curr = rq->curr; 1358 struct task_struct *curr = rq->curr;
1347 struct sched_entity *se = &curr->se, *pse = &p->se; 1359 struct sched_entity *se = &curr->se, *pse = &p->se;
1360 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1348 1361
1349 if (unlikely(rt_prio(p->prio))) { 1362 update_curr(cfs_rq);
1350 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1351 1363
1352 update_rq_clock(rq); 1364 if (unlikely(rt_prio(p->prio))) {
1353 update_curr(cfs_rq);
1354 resched_task(curr); 1365 resched_task(curr);
1355 return; 1366 return;
1356 } 1367 }
@@ -1606,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1606 } 1617 }
1607} 1618}
1608 1619
1609#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1610
1611/* 1620/*
1612 * Share the fairness runtime between parent and child, thus the 1621 * Share the fairness runtime between parent and child, thus the
1613 * total amount of pressure for CPU stays equal - new tasks 1622 * total amount of pressure for CPU stays equal - new tasks
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..954e1a81b796 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
77} 77}
78 78
79#define for_each_leaf_rt_rq(rt_rq, rq) \ 79#define for_each_leaf_rt_rq(rt_rq, rq) \
80 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 80 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
81 81
82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 82static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
83{ 83{
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
537 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
538 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
539 539
540 spin_lock(&rt_rq->rt_runtime_lock);
541 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 540 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
541 spin_lock(&rt_rq->rt_runtime_lock);
542 rt_rq->rt_time += delta_exec; 542 rt_rq->rt_time += delta_exec;
543 if (sched_rt_runtime_exceeded(rt_rq)) 543 if (sched_rt_runtime_exceeded(rt_rq))
544 resched_task(curr); 544 resched_task(curr);
545 spin_unlock(&rt_rq->rt_runtime_lock);
545 } 546 }
546 spin_unlock(&rt_rq->rt_runtime_lock);
547 } 547 }
548} 548}
549 549
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
@@ -909,15 +914,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
909/* Only try algorithms three times */ 914/* Only try algorithms three times */
910#define RT_MAX_TRIES 3 915#define RT_MAX_TRIES 3
911 916
912static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
913static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
914
915static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 917static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
916 918
917static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 919static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
918{ 920{
919 if (!task_running(rq, p) && 921 if (!task_running(rq, p) &&
920 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 922 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
921 (p->rt.nr_cpus_allowed > 1)) 923 (p->rt.nr_cpus_allowed > 1))
922 return 1; 924 return 1;
923 return 0; 925 return 0;
@@ -956,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
956 return next; 958 return next;
957} 959}
958 960
959static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
960 962
961static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
962{ 964{
@@ -976,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
976static int find_lowest_rq(struct task_struct *task) 978static int find_lowest_rq(struct task_struct *task)
977{ 979{
978 struct sched_domain *sd; 980 struct sched_domain *sd;
979 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
980 int this_cpu = smp_processor_id(); 982 int this_cpu = smp_processor_id();
981 int cpu = task_cpu(task); 983 int cpu = task_cpu(task);
982 984
@@ -991,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
991 * I guess we might want to change cpupri_find() to ignore those 993 * I guess we might want to change cpupri_find() to ignore those
992 * in the first place. 994 * in the first place.
993 */ 995 */
994 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 996 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
995 997
996 /* 998 /*
997 * At this point we have built a mask of cpus representing the 999 * At this point we have built a mask of cpus representing the
@@ -1001,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
1001 * We prioritize the last cpu that the task executed on since 1003 * We prioritize the last cpu that the task executed on since
1002 * it is most likely cache-hot in that location. 1004 * it is most likely cache-hot in that location.
1003 */ 1005 */
1004 if (cpu_isset(cpu, *lowest_mask)) 1006 if (cpumask_test_cpu(cpu, lowest_mask))
1005 return cpu; 1007 return cpu;
1006 1008
1007 /* 1009 /*
@@ -1016,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
1016 cpumask_t domain_mask; 1018 cpumask_t domain_mask;
1017 int best_cpu; 1019 int best_cpu;
1018 1020
1019 cpus_and(domain_mask, sd->span, *lowest_mask); 1021 cpumask_and(&domain_mask, sched_domain_span(sd),
1022 lowest_mask);
1020 1023
1021 best_cpu = pick_optimal_cpu(this_cpu, 1024 best_cpu = pick_optimal_cpu(this_cpu,
1022 &domain_mask); 1025 &domain_mask);
@@ -1057,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1057 * Also make sure that it wasn't scheduled on its rq. 1060 * Also make sure that it wasn't scheduled on its rq.
1058 */ 1061 */
1059 if (unlikely(task_rq(task) != rq || 1062 if (unlikely(task_rq(task) != rq ||
1060 !cpu_isset(lowest_rq->cpu, 1063 !cpumask_test_cpu(lowest_rq->cpu,
1061 task->cpus_allowed) || 1064 &task->cpus_allowed) ||
1062 task_running(rq, task) || 1065 task_running(rq, task) ||
1063 !task->se.on_rq)) { 1066 !task->se.on_rq)) {
1064 1067
@@ -1179,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
1179 1182
1180 next = pick_next_task_rt(this_rq); 1183 next = pick_next_task_rt(this_rq);
1181 1184
1182 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1185 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1183 if (this_cpu == cpu) 1186 if (this_cpu == cpu)
1184 continue; 1187 continue;
1185 1188
@@ -1308,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1308} 1311}
1309 1312
1310static void set_cpus_allowed_rt(struct task_struct *p, 1313static void set_cpus_allowed_rt(struct task_struct *p,
1311 const cpumask_t *new_mask) 1314 const struct cpumask *new_mask)
1312{ 1315{
1313 int weight = cpus_weight(*new_mask); 1316 int weight = cpumask_weight(new_mask);
1314 1317
1315 BUG_ON(!rt_task(p)); 1318 BUG_ON(!rt_task(p));
1316 1319
@@ -1331,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1331 update_rt_migration(rq); 1334 update_rt_migration(rq);
1332 } 1335 }
1333 1336
1334 p->cpus_allowed = *new_mask; 1337 cpumask_copy(&p->cpus_allowed, new_mask);
1335 p->rt.nr_cpus_allowed = weight; 1338 p->rt.nr_cpus_allowed = weight;
1336} 1339}
1337 1340
@@ -1374,6 +1377,15 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1374 if (!rq->rt.rt_nr_running) 1377 if (!rq->rt.rt_nr_running)
1375 pull_rt_task(rq); 1378 pull_rt_task(rq);
1376} 1379}
1380
1381static inline void init_sched_rt_class(void)
1382{
1383 unsigned int i;
1384
1385 for_each_possible_cpu(i)
1386 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1387 GFP_KERNEL, cpu_to_node(i));
1388}
1377#endif /* CONFIG_SMP */ 1389#endif /* CONFIG_SMP */
1378 1390
1379/* 1391/*
@@ -1544,3 +1556,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1544 rcu_read_unlock(); 1556 rcu_read_unlock();
1545} 1557}
1546#endif /* CONFIG_SCHED_DEBUG */ 1558#endif /* CONFIG_SCHED_DEBUG */
1559
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02c..f2773b5d1226 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count, 31 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
32 rq->sched_switch, rq->sched_count, rq->sched_goidle, 32 rq->sched_switch, rq->sched_count, rq->sched_goidle,
33 rq->ttwu_count, rq->ttwu_local, 33 rq->ttwu_count, rq->ttwu_local,
34 rq->rq_sched_info.cpu_time, 34 rq->rq_cpu_time,
35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 35 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
36 36
37 seq_printf(seq, "\n"); 37 seq_printf(seq, "\n");
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
@@ -123,7 +124,7 @@ static inline void
123rq_sched_info_depart(struct rq *rq, unsigned long long delta) 124rq_sched_info_depart(struct rq *rq, unsigned long long delta)
124{ 125{
125 if (rq) 126 if (rq)
126 rq->rq_sched_info.cpu_time += delta; 127 rq->rq_cpu_time += delta;
127} 128}
128 129
129static inline void 130static inline void
@@ -236,7 +237,6 @@ static inline void sched_info_depart(struct task_struct *t)
236 unsigned long long delta = task_rq(t)->clock - 237 unsigned long long delta = task_rq(t)->clock -
237 t->sched_info.last_arrival; 238 t->sched_info.last_arrival;
238 239
239 t->sched_info.cpu_time += delta;
240 rq_sched_info_depart(task_rq(t), delta); 240 rq_sched_info_depart(task_rq(t), delta);
241 241
242 if (t->state == TASK_RUNNING) 242 if (t->state == TASK_RUNNING)
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc654455..3152ac3b62e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
44static void __user *sig_handler(struct task_struct *t, int sig) 46static void __user *sig_handler(struct task_struct *t, int sig)
45{ 47{
46 return t->sighand->action[sig - 1].sa.sa_handler; 48 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -177,6 +179,11 @@ int next_signal(struct sigpending *pending, sigset_t *mask)
177 return sig; 179 return sig;
178} 180}
179 181
182/*
183 * allocate a new signal queue record
184 * - this may be called without locks if and only if t == current, otherwise an
185 * appopriate lock must be held to stop the target task from exiting
186 */
180static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, 187static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
181 int override_rlimit) 188 int override_rlimit)
182{ 189{
@@ -184,11 +191,12 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
184 struct user_struct *user; 191 struct user_struct *user;
185 192
186 /* 193 /*
187 * In order to avoid problems with "switch_user()", we want to make 194 * We won't get problems with the target's UID changing under us
188 * sure that the compiler doesn't re-load "t->user" 195 * because changing it requires RCU be used, and if t != current, the
196 * caller must be holding the RCU readlock (by way of a spinlock) and
197 * we use RCU protection here
189 */ 198 */
190 user = t->user; 199 user = get_uid(__task_cred(t)->user);
191 barrier();
192 atomic_inc(&user->sigpending); 200 atomic_inc(&user->sigpending);
193 if (override_rlimit || 201 if (override_rlimit ||
194 atomic_read(&user->sigpending) <= 202 atomic_read(&user->sigpending) <=
@@ -196,12 +204,14 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
196 q = kmem_cache_alloc(sigqueue_cachep, flags); 204 q = kmem_cache_alloc(sigqueue_cachep, flags);
197 if (unlikely(q == NULL)) { 205 if (unlikely(q == NULL)) {
198 atomic_dec(&user->sigpending); 206 atomic_dec(&user->sigpending);
207 free_uid(user);
199 } else { 208 } else {
200 INIT_LIST_HEAD(&q->list); 209 INIT_LIST_HEAD(&q->list);
201 q->flags = 0; 210 q->flags = 0;
202 q->user = get_uid(user); 211 q->user = user;
203 } 212 }
204 return(q); 213
214 return q;
205} 215}
206 216
207static void __sigqueue_free(struct sigqueue *q) 217static void __sigqueue_free(struct sigqueue *q)
@@ -562,10 +572,12 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
562 572
563/* 573/*
564 * Bad permissions for sending the signal 574 * Bad permissions for sending the signal
575 * - the caller must hold at least the RCU read lock
565 */ 576 */
566static int check_kill_permission(int sig, struct siginfo *info, 577static int check_kill_permission(int sig, struct siginfo *info,
567 struct task_struct *t) 578 struct task_struct *t)
568{ 579{
580 const struct cred *cred = current_cred(), *tcred;
569 struct pid *sid; 581 struct pid *sid;
570 int error; 582 int error;
571 583
@@ -579,8 +591,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
579 if (error) 591 if (error)
580 return error; 592 return error;
581 593
582 if ((current->euid ^ t->suid) && (current->euid ^ t->uid) && 594 tcred = __task_cred(t);
583 (current->uid ^ t->suid) && (current->uid ^ t->uid) && 595 if ((cred->euid ^ tcred->suid) &&
596 (cred->euid ^ tcred->uid) &&
597 (cred->uid ^ tcred->suid) &&
598 (cred->uid ^ tcred->uid) &&
584 !capable(CAP_KILL)) { 599 !capable(CAP_KILL)) {
585 switch (sig) { 600 switch (sig) {
586 case SIGCONT: 601 case SIGCONT:
@@ -843,8 +858,9 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
843 q->info.si_signo = sig; 858 q->info.si_signo = sig;
844 q->info.si_errno = 0; 859 q->info.si_errno = 0;
845 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
846 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_tgid_nr_ns(current,
847 q->info.si_uid = current->uid; 862 task_active_pid_ns(t));
863 q->info.si_uid = current_uid();
848 break; 864 break;
849 case (unsigned long) SEND_SIG_PRIV: 865 case (unsigned long) SEND_SIG_PRIV:
850 q->info.si_signo = sig; 866 q->info.si_signo = sig;
@@ -1008,6 +1024,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1008 return sighand; 1024 return sighand;
1009} 1025}
1010 1026
1027/*
1028 * send signal info to all the members of a group
1029 * - the caller must hold the RCU read lock at least
1030 */
1011int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1031int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1012{ 1032{
1013 unsigned long flags; 1033 unsigned long flags;
@@ -1029,8 +1049,8 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1029/* 1049/*
1030 * __kill_pgrp_info() sends a signal to a process group: this is what the tty 1050 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
1031 * control characters do (^C, ^Z etc) 1051 * control characters do (^C, ^Z etc)
1052 * - the caller must hold at least a readlock on tasklist_lock
1032 */ 1053 */
1033
1034int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) 1054int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
1035{ 1055{
1036 struct task_struct *p = NULL; 1056 struct task_struct *p = NULL;
@@ -1086,6 +1106,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1086{ 1106{
1087 int ret = -EINVAL; 1107 int ret = -EINVAL;
1088 struct task_struct *p; 1108 struct task_struct *p;
1109 const struct cred *pcred;
1089 1110
1090 if (!valid_signal(sig)) 1111 if (!valid_signal(sig))
1091 return ret; 1112 return ret;
@@ -1096,9 +1117,11 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1096 ret = -ESRCH; 1117 ret = -ESRCH;
1097 goto out_unlock; 1118 goto out_unlock;
1098 } 1119 }
1099 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) 1120 pcred = __task_cred(p);
1100 && (euid != p->suid) && (euid != p->uid) 1121 if ((info == SEND_SIG_NOINFO ||
1101 && (uid != p->suid) && (uid != p->uid)) { 1122 (!is_si_special(info) && SI_FROMUSER(info))) &&
1123 euid != pcred->suid && euid != pcred->uid &&
1124 uid != pcred->suid && uid != pcred->uid) {
1102 ret = -EPERM; 1125 ret = -EPERM;
1103 goto out_unlock; 1126 goto out_unlock;
1104 } 1127 }
@@ -1369,10 +1392,9 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1369 */ 1392 */
1370 rcu_read_lock(); 1393 rcu_read_lock();
1371 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1394 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1395 info.si_uid = __task_cred(tsk)->uid;
1372 rcu_read_unlock(); 1396 rcu_read_unlock();
1373 1397
1374 info.si_uid = tsk->uid;
1375
1376 thread_group_cputime(tsk, &cputime); 1398 thread_group_cputime(tsk, &cputime);
1377 info.si_utime = cputime_to_jiffies(cputime.utime); 1399 info.si_utime = cputime_to_jiffies(cputime.utime);
1378 info.si_stime = cputime_to_jiffies(cputime.stime); 1400 info.si_stime = cputime_to_jiffies(cputime.stime);
@@ -1440,10 +1462,9 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1440 */ 1462 */
1441 rcu_read_lock(); 1463 rcu_read_lock();
1442 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1464 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
1465 info.si_uid = __task_cred(tsk)->uid;
1443 rcu_read_unlock(); 1466 rcu_read_unlock();
1444 1467
1445 info.si_uid = tsk->uid;
1446
1447 info.si_utime = cputime_to_clock_t(tsk->utime); 1468 info.si_utime = cputime_to_clock_t(tsk->utime);
1448 info.si_stime = cputime_to_clock_t(tsk->stime); 1469 info.si_stime = cputime_to_clock_t(tsk->stime);
1449 1470
@@ -1598,7 +1619,7 @@ void ptrace_notify(int exit_code)
1598 info.si_signo = SIGTRAP; 1619 info.si_signo = SIGTRAP;
1599 info.si_code = exit_code; 1620 info.si_code = exit_code;
1600 info.si_pid = task_pid_vnr(current); 1621 info.si_pid = task_pid_vnr(current);
1601 info.si_uid = current->uid; 1622 info.si_uid = current_uid();
1602 1623
1603 /* Let the debugger run. */ 1624 /* Let the debugger run. */
1604 spin_lock_irq(&current->sighand->siglock); 1625 spin_lock_irq(&current->sighand->siglock);
@@ -1710,7 +1731,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
1710 info->si_errno = 0; 1731 info->si_errno = 0;
1711 info->si_code = SI_USER; 1732 info->si_code = SI_USER;
1712 info->si_pid = task_pid_vnr(current->parent); 1733 info->si_pid = task_pid_vnr(current->parent);
1713 info->si_uid = current->parent->uid; 1734 info->si_uid = task_uid(current->parent);
1714 } 1735 }
1715 1736
1716 /* If the (new) signal is now blocked, requeue it. */ 1737 /* If the (new) signal is now blocked, requeue it. */
@@ -2211,7 +2232,7 @@ sys_kill(pid_t pid, int sig)
2211 info.si_errno = 0; 2232 info.si_errno = 0;
2212 info.si_code = SI_USER; 2233 info.si_code = SI_USER;
2213 info.si_pid = task_tgid_vnr(current); 2234 info.si_pid = task_tgid_vnr(current);
2214 info.si_uid = current->uid; 2235 info.si_uid = current_uid();
2215 2236
2216 return kill_something_info(sig, &info, pid); 2237 return kill_something_info(sig, &info, pid);
2217} 2238}
@@ -2228,7 +2249,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2228 info.si_errno = 0; 2249 info.si_errno = 0;
2229 info.si_code = SI_TKILL; 2250 info.si_code = SI_TKILL;
2230 info.si_pid = task_tgid_vnr(current); 2251 info.si_pid = task_tgid_vnr(current);
2231 info.si_uid = current->uid; 2252 info.si_uid = current_uid();
2232 2253
2233 rcu_read_lock(); 2254 rcu_read_lock();
2234 p = find_task_by_vpid(pid); 2255 p = find_task_by_vpid(pid);
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c55..5cfa0e5e3e88 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,8 +24,8 @@ struct call_function_data {
24 struct call_single_data csd; 24 struct call_single_data csd;
25 spinlock_t lock; 25 spinlock_t lock;
26 unsigned int refs; 26 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head; 27 struct rcu_head rcu_head;
28 unsigned long cpumask_bits[];
29}; 29};
30 30
31struct call_single_queue { 31struct call_single_queue {
@@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void)
110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) { 110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
111 int refs; 111 int refs;
112 112
113 if (!cpu_isset(cpu, data->cpumask)) 113 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
114 continue; 114 continue;
115 115
116 data->csd.func(data->csd.info); 116 data->csd.func(data->csd.info);
117 117
118 spin_lock(&data->lock); 118 spin_lock(&data->lock);
119 cpu_clear(cpu, data->cpumask); 119 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
120 WARN_ON(data->refs == 0); 120 WARN_ON(data->refs == 0);
121 data->refs--; 121 data->refs--;
122 refs = data->refs; 122 refs = data->refs;
@@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
223 local_irq_save(flags); 223 local_irq_save(flags);
224 func(info); 224 func(info);
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { 226 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
227 struct call_single_data *data = NULL; 227 struct call_single_data *data = NULL;
228 228
229 if (!wait) { 229 if (!wait) {
@@ -266,51 +266,19 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
266 generic_exec_single(cpu, data); 266 generic_exec_single(cpu, data);
267} 267}
268 268
269/* Dummy function */ 269/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
270static void quiesce_dummy(void *unused) 270#ifndef arch_send_call_function_ipi_mask
271{ 271#define arch_send_call_function_ipi_mask(maskp) \
272} 272 arch_send_call_function_ipi(*(maskp))
273 273#endif
274/*
275 * Ensure stack based data used in call function mask is safe to free.
276 *
277 * This is needed by smp_call_function_mask when using on-stack data, because
278 * a single call function queue is shared by all CPUs, and any CPU may pick up
279 * the data item on the queue at any time before it is deleted. So we need to
280 * ensure that all CPUs have transitioned through a quiescent state after
281 * this call.
282 *
283 * This is a very slow function, implemented by sending synchronous IPIs to
284 * all possible CPUs. For this reason, we have to alloc data rather than use
285 * stack based data even in the case of synchronous calls. The stack based
286 * data is then just used for deadlock/oom fallback which will be very rare.
287 *
288 * If a faster scheme can be made, we could go back to preferring stack based
289 * data -- the data allocation/free is non-zero cost.
290 */
291static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
292{
293 struct call_single_data data;
294 int cpu;
295
296 data.func = quiesce_dummy;
297 data.info = NULL;
298
299 for_each_cpu_mask(cpu, mask) {
300 data.flags = CSD_FLAG_WAIT;
301 generic_exec_single(cpu, &data);
302 }
303}
304 274
305/** 275/**
306 * smp_call_function_mask(): Run a function on a set of other CPUs. 276 * smp_call_function_many(): Run a function on a set of other CPUs.
307 * @mask: The set of cpus to run on. 277 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 278 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 279 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 280 * @wait: If true, wait (atomically) until function has completed on other CPUs.
311 * 281 *
312 * Returns 0 on success, else a negative status code.
313 *
314 * If @wait is true, then returns once @func has returned. Note that @wait 282 * If @wait is true, then returns once @func has returned. Note that @wait
315 * will be implicitly turned on in case of allocation failures, since 283 * will be implicitly turned on in case of allocation failures, since
316 * we fall back to on-stack allocation. 284 * we fall back to on-stack allocation.
@@ -319,53 +287,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
319 * hardware interrupt handler or from a bottom half handler. Preemption 287 * hardware interrupt handler or from a bottom half handler. Preemption
320 * must be disabled when calling this function. 288 * must be disabled when calling this function.
321 */ 289 */
322int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, 290void smp_call_function_many(const struct cpumask *mask,
323 int wait) 291 void (*func)(void *), void *info,
292 bool wait)
324{ 293{
325 struct call_function_data d; 294 struct call_function_data *data;
326 struct call_function_data *data = NULL;
327 cpumask_t allbutself;
328 unsigned long flags; 295 unsigned long flags;
329 int cpu, num_cpus; 296 int cpu, next_cpu;
330 int slowpath = 0;
331 297
332 /* Can deadlock when called with interrupts disabled */ 298 /* Can deadlock when called with interrupts disabled */
333 WARN_ON(irqs_disabled()); 299 WARN_ON(irqs_disabled());
334 300
335 cpu = smp_processor_id(); 301 /* So, what's a CPU they want? Ignoring this one. */
336 allbutself = cpu_online_map; 302 cpu = cpumask_first_and(mask, cpu_online_mask);
337 cpu_clear(cpu, allbutself); 303 if (cpu == smp_processor_id())
338 cpus_and(mask, mask, allbutself); 304 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
339 num_cpus = cpus_weight(mask); 305 /* No online cpus? We're done. */
340 306 if (cpu >= nr_cpu_ids)
341 /* 307 return;
342 * If zero CPUs, return. If just a single CPU, turn this request 308
343 * into a targetted single call instead since it's faster. 309 /* Do we have another CPU which isn't us? */
344 */ 310 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
345 if (!num_cpus) 311 if (next_cpu == smp_processor_id())
346 return 0; 312 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
347 else if (num_cpus == 1) { 313
348 cpu = first_cpu(mask); 314 /* Fastpath: do that cpu by itself. */
349 return smp_call_function_single(cpu, func, info, wait); 315 if (next_cpu >= nr_cpu_ids) {
316 smp_call_function_single(cpu, func, info, wait);
317 return;
350 } 318 }
351 319
352 data = kmalloc(sizeof(*data), GFP_ATOMIC); 320 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
353 if (data) { 321 if (unlikely(!data)) {
354 data->csd.flags = CSD_FLAG_ALLOC; 322 /* Slow path. */
355 if (wait) 323 for_each_online_cpu(cpu) {
356 data->csd.flags |= CSD_FLAG_WAIT; 324 if (cpu == smp_processor_id())
357 } else { 325 continue;
358 data = &d; 326 if (cpumask_test_cpu(cpu, mask))
359 data->csd.flags = CSD_FLAG_WAIT; 327 smp_call_function_single(cpu, func, info, wait);
360 wait = 1; 328 }
361 slowpath = 1; 329 return;
362 } 330 }
363 331
364 spin_lock_init(&data->lock); 332 spin_lock_init(&data->lock);
333 data->csd.flags = CSD_FLAG_ALLOC;
334 if (wait)
335 data->csd.flags |= CSD_FLAG_WAIT;
365 data->csd.func = func; 336 data->csd.func = func;
366 data->csd.info = info; 337 data->csd.info = info;
367 data->refs = num_cpus; 338 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
368 data->cpumask = mask; 339 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
340 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
369 341
370 spin_lock_irqsave(&call_function_lock, flags); 342 spin_lock_irqsave(&call_function_lock, flags);
371 list_add_tail_rcu(&data->csd.list, &call_function_queue); 343 list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +349,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
377 smp_mb(); 349 smp_mb();
378 350
379 /* Send a message to all CPUs in the map */ 351 /* Send a message to all CPUs in the map */
380 arch_send_call_function_ipi(mask); 352 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
381 353
382 /* optionally wait for the CPUs to complete */ 354 /* optionally wait for the CPUs to complete */
383 if (wait) { 355 if (wait)
384 csd_flag_wait(&data->csd); 356 csd_flag_wait(&data->csd);
385 if (unlikely(slowpath))
386 smp_call_function_mask_quiesce_stack(mask);
387 }
388
389 return 0;
390} 357}
391EXPORT_SYMBOL(smp_call_function_mask); 358EXPORT_SYMBOL(smp_call_function_many);
392 359
393/** 360/**
394 * smp_call_function(): Run a function on all other CPUs. 361 * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
396 * @info: An arbitrary pointer to pass to the function. 363 * @info: An arbitrary pointer to pass to the function.
397 * @wait: If true, wait (atomically) until function has completed on other CPUs. 364 * @wait: If true, wait (atomically) until function has completed on other CPUs.
398 * 365 *
399 * Returns 0 on success, else a negative status code. 366 * Returns 0.
400 * 367 *
401 * If @wait is true, then returns once @func has returned; otherwise 368 * If @wait is true, then returns once @func has returned; otherwise
402 * it returns just before the target cpu calls @func. In case of allocation 369 * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +374,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
407 */ 374 */
408int smp_call_function(void (*func)(void *), void *info, int wait) 375int smp_call_function(void (*func)(void *), void *info, int wait)
409{ 376{
410 int ret;
411
412 preempt_disable(); 377 preempt_disable();
413 ret = smp_call_function_mask(cpu_online_map, func, info, wait); 378 smp_call_function_many(cpu_online_mask, func, info, wait);
414 preempt_enable(); 379 preempt_enable();
415 return ret; 380 return 0;
416} 381}
417EXPORT_SYMBOL(smp_call_function); 382EXPORT_SYMBOL(smp_call_function);
418 383
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..bdbe9de9cd8d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
102 102
103EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
104 104
105void __local_bh_enable(void)
106{
107 WARN_ON_ONCE(in_irq());
108
109 /*
110 * softirqs should never be enabled by __local_bh_enable(),
111 * it always nests inside local_bh_enable() sections:
112 */
113 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
114
115 sub_preempt_count(SOFTIRQ_OFFSET);
116}
117EXPORT_SYMBOL_GPL(__local_bh_enable);
118
119/* 105/*
120 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
121 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -269,6 +255,7 @@ void irq_enter(void)
269{ 255{
270 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
271 257
258 rcu_irq_enter();
272 if (idle_cpu(cpu) && !in_interrupt()) { 259 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter(); 260 __irq_enter();
274 tick_check_idle(cpu); 261 tick_check_idle(cpu);
@@ -295,9 +282,9 @@ void irq_exit(void)
295 282
296#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
297 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
298 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
299 tick_nohz_stop_sched_tick(0);
300 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
301#endif 288#endif
302 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
303} 290}
@@ -746,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
746 break; 733 break;
747 /* Unbind so it can run. Fall thru. */ 734 /* Unbind so it can run. Fall thru. */
748 kthread_bind(per_cpu(ksoftirqd, hotcpu), 735 kthread_bind(per_cpu(ksoftirqd, hotcpu),
749 any_online_cpu(cpu_online_map)); 736 cpumask_any(cpu_online_mask));
750 case CPU_DEAD: 737 case CPU_DEAD:
751 case CPU_DEAD_FROZEN: { 738 case CPU_DEAD_FROZEN: {
752 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -797,3 +784,23 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
797} 784}
798EXPORT_SYMBOL(on_each_cpu); 785EXPORT_SYMBOL(on_each_cpu);
799#endif 786#endif
787
788/*
789 * [ These __weak aliases are kept in a separate compilation unit, so that
790 * GCC does not inline them incorrectly. ]
791 */
792
793int __init __weak early_irq_init(void)
794{
795 return 0;
796}
797
798int __init __weak arch_early_irq_init(void)
799{
800 return 0;
801}
802
803int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
804{
805 return 0;
806}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be6b7d5..d9188c66278a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
@@ -303,17 +303,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
303 break; 303 break;
304 case CPU_ONLINE: 304 case CPU_ONLINE:
305 case CPU_ONLINE_FROZEN: 305 case CPU_ONLINE_FROZEN:
306 check_cpu = any_online_cpu(cpu_online_map); 306 check_cpu = cpumask_any(cpu_online_mask);
307 wake_up_process(per_cpu(watchdog_task, hotcpu)); 307 wake_up_process(per_cpu(watchdog_task, hotcpu));
308 break; 308 break;
309#ifdef CONFIG_HOTPLUG_CPU 309#ifdef CONFIG_HOTPLUG_CPU
310 case CPU_DOWN_PREPARE: 310 case CPU_DOWN_PREPARE:
311 case CPU_DOWN_PREPARE_FROZEN: 311 case CPU_DOWN_PREPARE_FROZEN:
312 if (hotcpu == check_cpu) { 312 if (hotcpu == check_cpu) {
313 cpumask_t temp_cpu_online_map = cpu_online_map; 313 /* Pick any other online cpu. */
314 314 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
315 cpu_clear(hotcpu, temp_cpu_online_map);
316 check_cpu = any_online_cpu(temp_cpu_online_map);
317 } 315 }
318 break; 316 break;
319 317
@@ -323,7 +321,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
323 break; 321 break;
324 /* Unbind so it can run. Fall thru. */ 322 /* Unbind so it can run. Fall thru. */
325 kthread_bind(per_cpu(watchdog_task, hotcpu), 323 kthread_bind(per_cpu(watchdog_task, hotcpu),
326 any_online_cpu(cpu_online_map)); 324 cpumask_any(cpu_online_mask));
327 case CPU_DEAD: 325 case CPU_DEAD:
328 case CPU_DEAD_FROZEN: 326 case CPU_DEAD_FROZEN:
329 p = per_cpu(watchdog_task, hotcpu); 327 p = per_cpu(watchdog_task, hotcpu);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc388..0cd415ee62a2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -38,7 +38,10 @@ struct stop_machine_data {
38static unsigned int num_threads; 38static unsigned int num_threads;
39static atomic_t thread_ack; 39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock); 40static DEFINE_MUTEX(lock);
41 41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
42static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
43static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
44static const cpumask_t *active_cpus; 47static const cpumask_t *active_cpus;
@@ -69,10 +72,10 @@ static void stop_cpu(struct work_struct *unused)
69 int err; 72 int err;
70 73
71 if (!active_cpus) { 74 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map)) 75 if (cpu == cpumask_first(cpu_online_mask))
73 smdata = &active; 76 smdata = &active;
74 } else { 77 } else {
75 if (cpu_isset(cpu, *active_cpus)) 78 if (cpumask_test_cpu(cpu, active_cpus))
76 smdata = &active; 79 smdata = &active;
77 } 80 }
78 /* Simple state machine */ 81 /* Simple state machine */
@@ -109,7 +112,44 @@ static int chill(void *unused)
109 return 0; 112 return 0;
110} 113}
111 114
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
113{ 153{
114 struct work_struct *sm_work; 154 struct work_struct *sm_work;
115 int i, ret; 155 int i, ret;
@@ -142,23 +182,18 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
142 return ret; 182 return ret;
143} 183}
144 184
145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
146{ 186{
147 int ret; 187 int ret;
148 188
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
149 /* No CPUs can come up or down during this. */ 192 /* No CPUs can come up or down during this. */
150 get_online_cpus(); 193 get_online_cpus();
151 ret = __stop_machine(fn, data, cpus); 194 ret = __stop_machine(fn, data, cpus);
152 put_online_cpus(); 195 put_online_cpus();
153 196 stop_machine_destroy();
154 return ret; 197 return ret;
155} 198}
156EXPORT_SYMBOL_GPL(stop_machine); 199EXPORT_SYMBOL_GPL(stop_machine);
157
158static int __init stop_machine_init(void)
159{
160 stop_machine_wq = create_rt_workqueue("kstop");
161 stop_machine_work = alloc_percpu(struct work_struct);
162 return 0;
163}
164core_initcall(stop_machine_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d16..763c3c17ded3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h>
36 37
37#include <linux/compat.h> 38#include <linux/compat.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
@@ -112,12 +113,17 @@ EXPORT_SYMBOL(cad_pid);
112 113
113void (*pm_power_off_prepare)(void); 114void (*pm_power_off_prepare)(void);
114 115
116/*
117 * set the priority of a task
118 * - the caller must hold the RCU read lock
119 */
115static int set_one_prio(struct task_struct *p, int niceval, int error) 120static int set_one_prio(struct task_struct *p, int niceval, int error)
116{ 121{
122 const struct cred *cred = current_cred(), *pcred = __task_cred(p);
117 int no_nice; 123 int no_nice;
118 124
119 if (p->uid != current->euid && 125 if (pcred->uid != cred->euid &&
120 p->euid != current->euid && !capable(CAP_SYS_NICE)) { 126 pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
121 error = -EPERM; 127 error = -EPERM;
122 goto out; 128 goto out;
123 } 129 }
@@ -141,6 +147,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
141{ 147{
142 struct task_struct *g, *p; 148 struct task_struct *g, *p;
143 struct user_struct *user; 149 struct user_struct *user;
150 const struct cred *cred = current_cred();
144 int error = -EINVAL; 151 int error = -EINVAL;
145 struct pid *pgrp; 152 struct pid *pgrp;
146 153
@@ -174,18 +181,18 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 181 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 182 break;
176 case PRIO_USER: 183 case PRIO_USER:
177 user = current->user; 184 user = (struct user_struct *) cred->user;
178 if (!who) 185 if (!who)
179 who = current->uid; 186 who = cred->uid;
180 else 187 else if ((who != cred->uid) &&
181 if ((who != current->uid) && !(user = find_user(who))) 188 !(user = find_user(who)))
182 goto out_unlock; /* No processes for this user */ 189 goto out_unlock; /* No processes for this user */
183 190
184 do_each_thread(g, p) 191 do_each_thread(g, p)
185 if (p->uid == who) 192 if (__task_cred(p)->uid == who)
186 error = set_one_prio(p, niceval, error); 193 error = set_one_prio(p, niceval, error);
187 while_each_thread(g, p); 194 while_each_thread(g, p);
188 if (who != current->uid) 195 if (who != cred->uid)
189 free_uid(user); /* For find_user() */ 196 free_uid(user); /* For find_user() */
190 break; 197 break;
191 } 198 }
@@ -205,6 +212,7 @@ asmlinkage long sys_getpriority(int which, int who)
205{ 212{
206 struct task_struct *g, *p; 213 struct task_struct *g, *p;
207 struct user_struct *user; 214 struct user_struct *user;
215 const struct cred *cred = current_cred();
208 long niceval, retval = -ESRCH; 216 long niceval, retval = -ESRCH;
209 struct pid *pgrp; 217 struct pid *pgrp;
210 218
@@ -236,21 +244,21 @@ asmlinkage long sys_getpriority(int which, int who)
236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 244 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 245 break;
238 case PRIO_USER: 246 case PRIO_USER:
239 user = current->user; 247 user = (struct user_struct *) cred->user;
240 if (!who) 248 if (!who)
241 who = current->uid; 249 who = cred->uid;
242 else 250 else if ((who != cred->uid) &&
243 if ((who != current->uid) && !(user = find_user(who))) 251 !(user = find_user(who)))
244 goto out_unlock; /* No processes for this user */ 252 goto out_unlock; /* No processes for this user */
245 253
246 do_each_thread(g, p) 254 do_each_thread(g, p)
247 if (p->uid == who) { 255 if (__task_cred(p)->uid == who) {
248 niceval = 20 - task_nice(p); 256 niceval = 20 - task_nice(p);
249 if (niceval > retval) 257 if (niceval > retval)
250 retval = niceval; 258 retval = niceval;
251 } 259 }
252 while_each_thread(g, p); 260 while_each_thread(g, p);
253 if (who != current->uid) 261 if (who != cred->uid)
254 free_uid(user); /* for find_user() */ 262 free_uid(user); /* for find_user() */
255 break; 263 break;
256 } 264 }
@@ -472,46 +480,48 @@ void ctrl_alt_del(void)
472 */ 480 */
473asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 481asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
474{ 482{
475 int old_rgid = current->gid; 483 const struct cred *old;
476 int old_egid = current->egid; 484 struct cred *new;
477 int new_rgid = old_rgid;
478 int new_egid = old_egid;
479 int retval; 485 int retval;
480 486
487 new = prepare_creds();
488 if (!new)
489 return -ENOMEM;
490 old = current_cred();
491
481 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); 492 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
482 if (retval) 493 if (retval)
483 return retval; 494 goto error;
484 495
496 retval = -EPERM;
485 if (rgid != (gid_t) -1) { 497 if (rgid != (gid_t) -1) {
486 if ((old_rgid == rgid) || 498 if (old->gid == rgid ||
487 (current->egid==rgid) || 499 old->egid == rgid ||
488 capable(CAP_SETGID)) 500 capable(CAP_SETGID))
489 new_rgid = rgid; 501 new->gid = rgid;
490 else 502 else
491 return -EPERM; 503 goto error;
492 } 504 }
493 if (egid != (gid_t) -1) { 505 if (egid != (gid_t) -1) {
494 if ((old_rgid == egid) || 506 if (old->gid == egid ||
495 (current->egid == egid) || 507 old->egid == egid ||
496 (current->sgid == egid) || 508 old->sgid == egid ||
497 capable(CAP_SETGID)) 509 capable(CAP_SETGID))
498 new_egid = egid; 510 new->egid = egid;
499 else 511 else
500 return -EPERM; 512 goto error;
501 }
502 if (new_egid != old_egid) {
503 set_dumpable(current->mm, suid_dumpable);
504 smp_wmb();
505 } 513 }
514
506 if (rgid != (gid_t) -1 || 515 if (rgid != (gid_t) -1 ||
507 (egid != (gid_t) -1 && egid != old_rgid)) 516 (egid != (gid_t) -1 && egid != old->gid))
508 current->sgid = new_egid; 517 new->sgid = new->egid;
509 current->fsgid = new_egid; 518 new->fsgid = new->egid;
510 current->egid = new_egid; 519
511 current->gid = new_rgid; 520 return commit_creds(new);
512 key_fsgid_changed(current); 521
513 proc_id_connector(current, PROC_EVENT_GID); 522error:
514 return 0; 523 abort_creds(new);
524 return retval;
515} 525}
516 526
517/* 527/*
@@ -521,56 +531,54 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
521 */ 531 */
522asmlinkage long sys_setgid(gid_t gid) 532asmlinkage long sys_setgid(gid_t gid)
523{ 533{
524 int old_egid = current->egid; 534 const struct cred *old;
535 struct cred *new;
525 int retval; 536 int retval;
526 537
538 new = prepare_creds();
539 if (!new)
540 return -ENOMEM;
541 old = current_cred();
542
527 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); 543 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
528 if (retval) 544 if (retval)
529 return retval; 545 goto error;
530 546
531 if (capable(CAP_SETGID)) { 547 retval = -EPERM;
532 if (old_egid != gid) { 548 if (capable(CAP_SETGID))
533 set_dumpable(current->mm, suid_dumpable); 549 new->gid = new->egid = new->sgid = new->fsgid = gid;
534 smp_wmb(); 550 else if (gid == old->gid || gid == old->sgid)
535 } 551 new->egid = new->fsgid = gid;
536 current->gid = current->egid = current->sgid = current->fsgid = gid;
537 } else if ((gid == current->gid) || (gid == current->sgid)) {
538 if (old_egid != gid) {
539 set_dumpable(current->mm, suid_dumpable);
540 smp_wmb();
541 }
542 current->egid = current->fsgid = gid;
543 }
544 else 552 else
545 return -EPERM; 553 goto error;
546 554
547 key_fsgid_changed(current); 555 return commit_creds(new);
548 proc_id_connector(current, PROC_EVENT_GID); 556
549 return 0; 557error:
558 abort_creds(new);
559 return retval;
550} 560}
551 561
552static int set_user(uid_t new_ruid, int dumpclear) 562/*
563 * change the user struct in a credentials set to match the new UID
564 */
565static int set_user(struct cred *new)
553{ 566{
554 struct user_struct *new_user; 567 struct user_struct *new_user;
555 568
556 new_user = alloc_uid(current->nsproxy->user_ns, new_ruid); 569 new_user = alloc_uid(current_user_ns(), new->uid);
557 if (!new_user) 570 if (!new_user)
558 return -EAGAIN; 571 return -EAGAIN;
559 572
560 if (atomic_read(&new_user->processes) >= 573 if (atomic_read(&new_user->processes) >=
561 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 574 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
562 new_user != current->nsproxy->user_ns->root_user) { 575 new_user != INIT_USER) {
563 free_uid(new_user); 576 free_uid(new_user);
564 return -EAGAIN; 577 return -EAGAIN;
565 } 578 }
566 579
567 switch_uid(new_user); 580 free_uid(new->user);
568 581 new->user = new_user;
569 if (dumpclear) {
570 set_dumpable(current->mm, suid_dumpable);
571 smp_wmb();
572 }
573 current->uid = new_ruid;
574 return 0; 582 return 0;
575} 583}
576 584
@@ -591,54 +599,56 @@ static int set_user(uid_t new_ruid, int dumpclear)
591 */ 599 */
592asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 600asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
593{ 601{
594 int old_ruid, old_euid, old_suid, new_ruid, new_euid; 602 const struct cred *old;
603 struct cred *new;
595 int retval; 604 int retval;
596 605
606 new = prepare_creds();
607 if (!new)
608 return -ENOMEM;
609 old = current_cred();
610
597 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); 611 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
598 if (retval) 612 if (retval)
599 return retval; 613 goto error;
600
601 new_ruid = old_ruid = current->uid;
602 new_euid = old_euid = current->euid;
603 old_suid = current->suid;
604 614
615 retval = -EPERM;
605 if (ruid != (uid_t) -1) { 616 if (ruid != (uid_t) -1) {
606 new_ruid = ruid; 617 new->uid = ruid;
607 if ((old_ruid != ruid) && 618 if (old->uid != ruid &&
608 (current->euid != ruid) && 619 old->euid != ruid &&
609 !capable(CAP_SETUID)) 620 !capable(CAP_SETUID))
610 return -EPERM; 621 goto error;
611 } 622 }
612 623
613 if (euid != (uid_t) -1) { 624 if (euid != (uid_t) -1) {
614 new_euid = euid; 625 new->euid = euid;
615 if ((old_ruid != euid) && 626 if (old->uid != euid &&
616 (current->euid != euid) && 627 old->euid != euid &&
617 (current->suid != euid) && 628 old->suid != euid &&
618 !capable(CAP_SETUID)) 629 !capable(CAP_SETUID))
619 return -EPERM; 630 goto error;
620 } 631 }
621 632
622 if (new_ruid != old_ruid && set_user(new_ruid, new_euid != old_euid) < 0) 633 retval = -EAGAIN;
623 return -EAGAIN; 634 if (new->uid != old->uid && set_user(new) < 0)
635 goto error;
624 636
625 if (new_euid != old_euid) {
626 set_dumpable(current->mm, suid_dumpable);
627 smp_wmb();
628 }
629 current->fsuid = current->euid = new_euid;
630 if (ruid != (uid_t) -1 || 637 if (ruid != (uid_t) -1 ||
631 (euid != (uid_t) -1 && euid != old_ruid)) 638 (euid != (uid_t) -1 && euid != old->uid))
632 current->suid = current->euid; 639 new->suid = new->euid;
633 current->fsuid = current->euid; 640 new->fsuid = new->euid;
634 641
635 key_fsuid_changed(current); 642 retval = security_task_fix_setuid(new, old, LSM_SETID_RE);
636 proc_id_connector(current, PROC_EVENT_UID); 643 if (retval < 0)
637 644 goto error;
638 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
639}
640 645
646 return commit_creds(new);
641 647
648error:
649 abort_creds(new);
650 return retval;
651}
642 652
643/* 653/*
644 * setuid() is implemented like SysV with SAVED_IDS 654 * setuid() is implemented like SysV with SAVED_IDS
@@ -653,36 +663,41 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 */ 663 */
654asmlinkage long sys_setuid(uid_t uid) 664asmlinkage long sys_setuid(uid_t uid)
655{ 665{
656 int old_euid = current->euid; 666 const struct cred *old;
657 int old_ruid, old_suid, new_suid; 667 struct cred *new;
658 int retval; 668 int retval;
659 669
670 new = prepare_creds();
671 if (!new)
672 return -ENOMEM;
673 old = current_cred();
674
660 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); 675 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
661 if (retval) 676 if (retval)
662 return retval; 677 goto error;
663 678
664 old_ruid = current->uid; 679 retval = -EPERM;
665 old_suid = current->suid;
666 new_suid = old_suid;
667
668 if (capable(CAP_SETUID)) { 680 if (capable(CAP_SETUID)) {
669 if (uid != old_ruid && set_user(uid, old_euid != uid) < 0) 681 new->suid = new->uid = uid;
670 return -EAGAIN; 682 if (uid != old->uid && set_user(new) < 0) {
671 new_suid = uid; 683 retval = -EAGAIN;
672 } else if ((uid != current->uid) && (uid != new_suid)) 684 goto error;
673 return -EPERM; 685 }
674 686 } else if (uid != old->uid && uid != new->suid) {
675 if (old_euid != uid) { 687 goto error;
676 set_dumpable(current->mm, suid_dumpable);
677 smp_wmb();
678 } 688 }
679 current->fsuid = current->euid = uid;
680 current->suid = new_suid;
681 689
682 key_fsuid_changed(current); 690 new->fsuid = new->euid = uid;
683 proc_id_connector(current, PROC_EVENT_UID); 691
692 retval = security_task_fix_setuid(new, old, LSM_SETID_ID);
693 if (retval < 0)
694 goto error;
684 695
685 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); 696 return commit_creds(new);
697
698error:
699 abort_creds(new);
700 return retval;
686} 701}
687 702
688 703
@@ -692,54 +707,63 @@ asmlinkage long sys_setuid(uid_t uid)
692 */ 707 */
693asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 708asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
694{ 709{
695 int old_ruid = current->uid; 710 const struct cred *old;
696 int old_euid = current->euid; 711 struct cred *new;
697 int old_suid = current->suid;
698 int retval; 712 int retval;
699 713
714 new = prepare_creds();
715 if (!new)
716 return -ENOMEM;
717
700 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); 718 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
701 if (retval) 719 if (retval)
702 return retval; 720 goto error;
721 old = current_cred();
703 722
723 retval = -EPERM;
704 if (!capable(CAP_SETUID)) { 724 if (!capable(CAP_SETUID)) {
705 if ((ruid != (uid_t) -1) && (ruid != current->uid) && 725 if (ruid != (uid_t) -1 && ruid != old->uid &&
706 (ruid != current->euid) && (ruid != current->suid)) 726 ruid != old->euid && ruid != old->suid)
707 return -EPERM; 727 goto error;
708 if ((euid != (uid_t) -1) && (euid != current->uid) && 728 if (euid != (uid_t) -1 && euid != old->uid &&
709 (euid != current->euid) && (euid != current->suid)) 729 euid != old->euid && euid != old->suid)
710 return -EPERM; 730 goto error;
711 if ((suid != (uid_t) -1) && (suid != current->uid) && 731 if (suid != (uid_t) -1 && suid != old->uid &&
712 (suid != current->euid) && (suid != current->suid)) 732 suid != old->euid && suid != old->suid)
713 return -EPERM; 733 goto error;
714 } 734 }
735
736 retval = -EAGAIN;
715 if (ruid != (uid_t) -1) { 737 if (ruid != (uid_t) -1) {
716 if (ruid != current->uid && set_user(ruid, euid != current->euid) < 0) 738 new->uid = ruid;
717 return -EAGAIN; 739 if (ruid != old->uid && set_user(new) < 0)
740 goto error;
718 } 741 }
719 if (euid != (uid_t) -1) { 742 if (euid != (uid_t) -1)
720 if (euid != current->euid) { 743 new->euid = euid;
721 set_dumpable(current->mm, suid_dumpable);
722 smp_wmb();
723 }
724 current->euid = euid;
725 }
726 current->fsuid = current->euid;
727 if (suid != (uid_t) -1) 744 if (suid != (uid_t) -1)
728 current->suid = suid; 745 new->suid = suid;
746 new->fsuid = new->euid;
747
748 retval = security_task_fix_setuid(new, old, LSM_SETID_RES);
749 if (retval < 0)
750 goto error;
729 751
730 key_fsuid_changed(current); 752 return commit_creds(new);
731 proc_id_connector(current, PROC_EVENT_UID);
732 753
733 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); 754error:
755 abort_creds(new);
756 return retval;
734} 757}
735 758
736asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 759asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid)
737{ 760{
761 const struct cred *cred = current_cred();
738 int retval; 762 int retval;
739 763
740 if (!(retval = put_user(current->uid, ruid)) && 764 if (!(retval = put_user(cred->uid, ruid)) &&
741 !(retval = put_user(current->euid, euid))) 765 !(retval = put_user(cred->euid, euid)))
742 retval = put_user(current->suid, suid); 766 retval = put_user(cred->suid, suid);
743 767
744 return retval; 768 return retval;
745} 769}
@@ -749,48 +773,55 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
749 */ 773 */
750asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 774asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
751{ 775{
776 const struct cred *old;
777 struct cred *new;
752 int retval; 778 int retval;
753 779
780 new = prepare_creds();
781 if (!new)
782 return -ENOMEM;
783 old = current_cred();
784
754 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); 785 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
755 if (retval) 786 if (retval)
756 return retval; 787 goto error;
757 788
789 retval = -EPERM;
758 if (!capable(CAP_SETGID)) { 790 if (!capable(CAP_SETGID)) {
759 if ((rgid != (gid_t) -1) && (rgid != current->gid) && 791 if (rgid != (gid_t) -1 && rgid != old->gid &&
760 (rgid != current->egid) && (rgid != current->sgid)) 792 rgid != old->egid && rgid != old->sgid)
761 return -EPERM; 793 goto error;
762 if ((egid != (gid_t) -1) && (egid != current->gid) && 794 if (egid != (gid_t) -1 && egid != old->gid &&
763 (egid != current->egid) && (egid != current->sgid)) 795 egid != old->egid && egid != old->sgid)
764 return -EPERM; 796 goto error;
765 if ((sgid != (gid_t) -1) && (sgid != current->gid) && 797 if (sgid != (gid_t) -1 && sgid != old->gid &&
766 (sgid != current->egid) && (sgid != current->sgid)) 798 sgid != old->egid && sgid != old->sgid)
767 return -EPERM; 799 goto error;
768 }
769 if (egid != (gid_t) -1) {
770 if (egid != current->egid) {
771 set_dumpable(current->mm, suid_dumpable);
772 smp_wmb();
773 }
774 current->egid = egid;
775 } 800 }
776 current->fsgid = current->egid; 801
777 if (rgid != (gid_t) -1) 802 if (rgid != (gid_t) -1)
778 current->gid = rgid; 803 new->gid = rgid;
804 if (egid != (gid_t) -1)
805 new->egid = egid;
779 if (sgid != (gid_t) -1) 806 if (sgid != (gid_t) -1)
780 current->sgid = sgid; 807 new->sgid = sgid;
808 new->fsgid = new->egid;
781 809
782 key_fsgid_changed(current); 810 return commit_creds(new);
783 proc_id_connector(current, PROC_EVENT_GID); 811
784 return 0; 812error:
813 abort_creds(new);
814 return retval;
785} 815}
786 816
787asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 817asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid)
788{ 818{
819 const struct cred *cred = current_cred();
789 int retval; 820 int retval;
790 821
791 if (!(retval = put_user(current->gid, rgid)) && 822 if (!(retval = put_user(cred->gid, rgid)) &&
792 !(retval = put_user(current->egid, egid))) 823 !(retval = put_user(cred->egid, egid)))
793 retval = put_user(current->sgid, sgid); 824 retval = put_user(cred->sgid, sgid);
794 825
795 return retval; 826 return retval;
796} 827}
@@ -804,27 +835,35 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
804 */ 835 */
805asmlinkage long sys_setfsuid(uid_t uid) 836asmlinkage long sys_setfsuid(uid_t uid)
806{ 837{
807 int old_fsuid; 838 const struct cred *old;
839 struct cred *new;
840 uid_t old_fsuid;
841
842 new = prepare_creds();
843 if (!new)
844 return current_fsuid();
845 old = current_cred();
846 old_fsuid = old->fsuid;
808 847
809 old_fsuid = current->fsuid; 848 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
810 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS)) 849 goto error;
811 return old_fsuid;
812 850
813 if (uid == current->uid || uid == current->euid || 851 if (uid == old->uid || uid == old->euid ||
814 uid == current->suid || uid == current->fsuid || 852 uid == old->suid || uid == old->fsuid ||
815 capable(CAP_SETUID)) { 853 capable(CAP_SETUID)) {
816 if (uid != old_fsuid) { 854 if (uid != old_fsuid) {
817 set_dumpable(current->mm, suid_dumpable); 855 new->fsuid = uid;
818 smp_wmb(); 856 if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
857 goto change_okay;
819 } 858 }
820 current->fsuid = uid;
821 } 859 }
822 860
823 key_fsuid_changed(current); 861error:
824 proc_id_connector(current, PROC_EVENT_UID); 862 abort_creds(new);
825 863 return old_fsuid;
826 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
827 864
865change_okay:
866 commit_creds(new);
828 return old_fsuid; 867 return old_fsuid;
829} 868}
830 869
@@ -833,23 +872,34 @@ asmlinkage long sys_setfsuid(uid_t uid)
833 */ 872 */
834asmlinkage long sys_setfsgid(gid_t gid) 873asmlinkage long sys_setfsgid(gid_t gid)
835{ 874{
836 int old_fsgid; 875 const struct cred *old;
876 struct cred *new;
877 gid_t old_fsgid;
878
879 new = prepare_creds();
880 if (!new)
881 return current_fsgid();
882 old = current_cred();
883 old_fsgid = old->fsgid;
837 884
838 old_fsgid = current->fsgid;
839 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) 885 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
840 return old_fsgid; 886 goto error;
841 887
842 if (gid == current->gid || gid == current->egid || 888 if (gid == old->gid || gid == old->egid ||
843 gid == current->sgid || gid == current->fsgid || 889 gid == old->sgid || gid == old->fsgid ||
844 capable(CAP_SETGID)) { 890 capable(CAP_SETGID)) {
845 if (gid != old_fsgid) { 891 if (gid != old_fsgid) {
846 set_dumpable(current->mm, suid_dumpable); 892 new->fsgid = gid;
847 smp_wmb(); 893 goto change_okay;
848 } 894 }
849 current->fsgid = gid;
850 key_fsgid_changed(current);
851 proc_id_connector(current, PROC_EVENT_GID);
852 } 895 }
896
897error:
898 abort_creds(new);
899 return old_fsgid;
900
901change_okay:
902 commit_creds(new);
853 return old_fsgid; 903 return old_fsgid;
854} 904}
855 905
@@ -858,8 +908,8 @@ void do_sys_times(struct tms *tms)
858 struct task_cputime cputime; 908 struct task_cputime cputime;
859 cputime_t cutime, cstime; 909 cputime_t cutime, cstime;
860 910
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime); 911 thread_group_cputime(current, &cputime);
912 spin_lock_irq(&current->sighand->siglock);
863 cutime = current->signal->cutime; 913 cutime = current->signal->cutime;
864 cstime = current->signal->cstime; 914 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock); 915 spin_unlock_irq(&current->sighand->siglock);
@@ -878,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
878 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 928 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
879 return -EFAULT; 929 return -EFAULT;
880 } 930 }
931 force_successful_syscall_return();
881 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 932 return (long) jiffies_64_to_clock_t(get_jiffies_64());
882} 933}
883 934
@@ -1118,7 +1169,7 @@ EXPORT_SYMBOL(groups_free);
1118 1169
1119/* export the group_info to a user-space array */ 1170/* export the group_info to a user-space array */
1120static int groups_to_user(gid_t __user *grouplist, 1171static int groups_to_user(gid_t __user *grouplist,
1121 struct group_info *group_info) 1172 const struct group_info *group_info)
1122{ 1173{
1123 int i; 1174 int i;
1124 unsigned int count = group_info->ngroups; 1175 unsigned int count = group_info->ngroups;
@@ -1186,7 +1237,7 @@ static void groups_sort(struct group_info *group_info)
1186} 1237}
1187 1238
1188/* a simple bsearch */ 1239/* a simple bsearch */
1189int groups_search(struct group_info *group_info, gid_t grp) 1240int groups_search(const struct group_info *group_info, gid_t grp)
1190{ 1241{
1191 unsigned int left, right; 1242 unsigned int left, right;
1192 1243
@@ -1208,51 +1259,74 @@ int groups_search(struct group_info *group_info, gid_t grp)
1208 return 0; 1259 return 0;
1209} 1260}
1210 1261
1211/* validate and set current->group_info */ 1262/**
1212int set_current_groups(struct group_info *group_info) 1263 * set_groups - Change a group subscription in a set of credentials
1264 * @new: The newly prepared set of credentials to alter
1265 * @group_info: The group list to install
1266 *
1267 * Validate a group subscription and, if valid, insert it into a set
1268 * of credentials.
1269 */
1270int set_groups(struct cred *new, struct group_info *group_info)
1213{ 1271{
1214 int retval; 1272 int retval;
1215 struct group_info *old_info;
1216 1273
1217 retval = security_task_setgroups(group_info); 1274 retval = security_task_setgroups(group_info);
1218 if (retval) 1275 if (retval)
1219 return retval; 1276 return retval;
1220 1277
1278 put_group_info(new->group_info);
1221 groups_sort(group_info); 1279 groups_sort(group_info);
1222 get_group_info(group_info); 1280 get_group_info(group_info);
1281 new->group_info = group_info;
1282 return 0;
1283}
1223 1284
1224 task_lock(current); 1285EXPORT_SYMBOL(set_groups);
1225 old_info = current->group_info;
1226 current->group_info = group_info;
1227 task_unlock(current);
1228 1286
1229 put_group_info(old_info); 1287/**
1288 * set_current_groups - Change current's group subscription
1289 * @group_info: The group list to impose
1290 *
1291 * Validate a group subscription and, if valid, impose it upon current's task
1292 * security record.
1293 */
1294int set_current_groups(struct group_info *group_info)
1295{
1296 struct cred *new;
1297 int ret;
1230 1298
1231 return 0; 1299 new = prepare_creds();
1300 if (!new)
1301 return -ENOMEM;
1302
1303 ret = set_groups(new, group_info);
1304 if (ret < 0) {
1305 abort_creds(new);
1306 return ret;
1307 }
1308
1309 return commit_creds(new);
1232} 1310}
1233 1311
1234EXPORT_SYMBOL(set_current_groups); 1312EXPORT_SYMBOL(set_current_groups);
1235 1313
1236asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1314asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist)
1237{ 1315{
1238 int i = 0; 1316 const struct cred *cred = current_cred();
1239 1317 int i;
1240 /*
1241 * SMP: Nobody else can change our grouplist. Thus we are
1242 * safe.
1243 */
1244 1318
1245 if (gidsetsize < 0) 1319 if (gidsetsize < 0)
1246 return -EINVAL; 1320 return -EINVAL;
1247 1321
1248 /* no need to grab task_lock here; it cannot change */ 1322 /* no need to grab task_lock here; it cannot change */
1249 i = current->group_info->ngroups; 1323 i = cred->group_info->ngroups;
1250 if (gidsetsize) { 1324 if (gidsetsize) {
1251 if (i > gidsetsize) { 1325 if (i > gidsetsize) {
1252 i = -EINVAL; 1326 i = -EINVAL;
1253 goto out; 1327 goto out;
1254 } 1328 }
1255 if (groups_to_user(grouplist, current->group_info)) { 1329 if (groups_to_user(grouplist, cred->group_info)) {
1256 i = -EFAULT; 1330 i = -EFAULT;
1257 goto out; 1331 goto out;
1258 } 1332 }
@@ -1296,9 +1370,11 @@ asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist)
1296 */ 1370 */
1297int in_group_p(gid_t grp) 1371int in_group_p(gid_t grp)
1298{ 1372{
1373 const struct cred *cred = current_cred();
1299 int retval = 1; 1374 int retval = 1;
1300 if (grp != current->fsgid) 1375
1301 retval = groups_search(current->group_info, grp); 1376 if (grp != cred->fsgid)
1377 retval = groups_search(cred->group_info, grp);
1302 return retval; 1378 return retval;
1303} 1379}
1304 1380
@@ -1306,9 +1382,11 @@ EXPORT_SYMBOL(in_group_p);
1306 1382
1307int in_egroup_p(gid_t grp) 1383int in_egroup_p(gid_t grp)
1308{ 1384{
1385 const struct cred *cred = current_cred();
1309 int retval = 1; 1386 int retval = 1;
1310 if (grp != current->egid) 1387
1311 retval = groups_search(current->group_info, grp); 1388 if (grp != cred->egid)
1389 retval = groups_search(cred->group_info, grp);
1312 return retval; 1390 return retval;
1313} 1391}
1314 1392
@@ -1551,6 +1629,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1551 utime = stime = cputime_zero; 1629 utime = stime = cputime_zero;
1552 1630
1553 if (who == RUSAGE_THREAD) { 1631 if (who == RUSAGE_THREAD) {
1632 utime = task_utime(current);
1633 stime = task_stime(current);
1554 accumulate_thread_rusage(p, r); 1634 accumulate_thread_rusage(p, r);
1555 goto out; 1635 goto out;
1556 } 1636 }
@@ -1624,50 +1704,56 @@ asmlinkage long sys_umask(int mask)
1624asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1704asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1625 unsigned long arg4, unsigned long arg5) 1705 unsigned long arg4, unsigned long arg5)
1626{ 1706{
1627 long error = 0; 1707 struct task_struct *me = current;
1708 unsigned char comm[sizeof(me->comm)];
1709 long error;
1628 1710
1629 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) 1711 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1712 if (error != -ENOSYS)
1630 return error; 1713 return error;
1631 1714
1715 error = 0;
1632 switch (option) { 1716 switch (option) {
1633 case PR_SET_PDEATHSIG: 1717 case PR_SET_PDEATHSIG:
1634 if (!valid_signal(arg2)) { 1718 if (!valid_signal(arg2)) {
1635 error = -EINVAL; 1719 error = -EINVAL;
1636 break; 1720 break;
1637 } 1721 }
1638 current->pdeath_signal = arg2; 1722 me->pdeath_signal = arg2;
1723 error = 0;
1639 break; 1724 break;
1640 case PR_GET_PDEATHSIG: 1725 case PR_GET_PDEATHSIG:
1641 error = put_user(current->pdeath_signal, (int __user *)arg2); 1726 error = put_user(me->pdeath_signal, (int __user *)arg2);
1642 break; 1727 break;
1643 case PR_GET_DUMPABLE: 1728 case PR_GET_DUMPABLE:
1644 error = get_dumpable(current->mm); 1729 error = get_dumpable(me->mm);
1645 break; 1730 break;
1646 case PR_SET_DUMPABLE: 1731 case PR_SET_DUMPABLE:
1647 if (arg2 < 0 || arg2 > 1) { 1732 if (arg2 < 0 || arg2 > 1) {
1648 error = -EINVAL; 1733 error = -EINVAL;
1649 break; 1734 break;
1650 } 1735 }
1651 set_dumpable(current->mm, arg2); 1736 set_dumpable(me->mm, arg2);
1737 error = 0;
1652 break; 1738 break;
1653 1739
1654 case PR_SET_UNALIGN: 1740 case PR_SET_UNALIGN:
1655 error = SET_UNALIGN_CTL(current, arg2); 1741 error = SET_UNALIGN_CTL(me, arg2);
1656 break; 1742 break;
1657 case PR_GET_UNALIGN: 1743 case PR_GET_UNALIGN:
1658 error = GET_UNALIGN_CTL(current, arg2); 1744 error = GET_UNALIGN_CTL(me, arg2);
1659 break; 1745 break;
1660 case PR_SET_FPEMU: 1746 case PR_SET_FPEMU:
1661 error = SET_FPEMU_CTL(current, arg2); 1747 error = SET_FPEMU_CTL(me, arg2);
1662 break; 1748 break;
1663 case PR_GET_FPEMU: 1749 case PR_GET_FPEMU:
1664 error = GET_FPEMU_CTL(current, arg2); 1750 error = GET_FPEMU_CTL(me, arg2);
1665 break; 1751 break;
1666 case PR_SET_FPEXC: 1752 case PR_SET_FPEXC:
1667 error = SET_FPEXC_CTL(current, arg2); 1753 error = SET_FPEXC_CTL(me, arg2);
1668 break; 1754 break;
1669 case PR_GET_FPEXC: 1755 case PR_GET_FPEXC:
1670 error = GET_FPEXC_CTL(current, arg2); 1756 error = GET_FPEXC_CTL(me, arg2);
1671 break; 1757 break;
1672 case PR_GET_TIMING: 1758 case PR_GET_TIMING:
1673 error = PR_TIMING_STATISTICAL; 1759 error = PR_TIMING_STATISTICAL;
@@ -1675,33 +1761,28 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1675 case PR_SET_TIMING: 1761 case PR_SET_TIMING:
1676 if (arg2 != PR_TIMING_STATISTICAL) 1762 if (arg2 != PR_TIMING_STATISTICAL)
1677 error = -EINVAL; 1763 error = -EINVAL;
1764 else
1765 error = 0;
1678 break; 1766 break;
1679 1767
1680 case PR_SET_NAME: { 1768 case PR_SET_NAME:
1681 struct task_struct *me = current; 1769 comm[sizeof(me->comm)-1] = 0;
1682 unsigned char ncomm[sizeof(me->comm)]; 1770 if (strncpy_from_user(comm, (char __user *)arg2,
1683 1771 sizeof(me->comm) - 1) < 0)
1684 ncomm[sizeof(me->comm)-1] = 0;
1685 if (strncpy_from_user(ncomm, (char __user *)arg2,
1686 sizeof(me->comm)-1) < 0)
1687 return -EFAULT; 1772 return -EFAULT;
1688 set_task_comm(me, ncomm); 1773 set_task_comm(me, comm);
1689 return 0; 1774 return 0;
1690 } 1775 case PR_GET_NAME:
1691 case PR_GET_NAME: { 1776 get_task_comm(comm, me);
1692 struct task_struct *me = current; 1777 if (copy_to_user((char __user *)arg2, comm,
1693 unsigned char tcomm[sizeof(me->comm)]; 1778 sizeof(comm)))
1694
1695 get_task_comm(tcomm, me);
1696 if (copy_to_user((char __user *)arg2, tcomm, sizeof(tcomm)))
1697 return -EFAULT; 1779 return -EFAULT;
1698 return 0; 1780 return 0;
1699 }
1700 case PR_GET_ENDIAN: 1781 case PR_GET_ENDIAN:
1701 error = GET_ENDIAN(current, arg2); 1782 error = GET_ENDIAN(me, arg2);
1702 break; 1783 break;
1703 case PR_SET_ENDIAN: 1784 case PR_SET_ENDIAN:
1704 error = SET_ENDIAN(current, arg2); 1785 error = SET_ENDIAN(me, arg2);
1705 break; 1786 break;
1706 1787
1707 case PR_GET_SECCOMP: 1788 case PR_GET_SECCOMP:
@@ -1725,6 +1806,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1725 current->default_timer_slack_ns; 1806 current->default_timer_slack_ns;
1726 else 1807 else
1727 current->timer_slack_ns = arg2; 1808 current->timer_slack_ns = arg2;
1809 error = 0;
1728 break; 1810 break;
1729 default: 1811 default:
1730 error = -EINVAL; 1812 error = -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7570da..92f6e5bc3c24 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -87,10 +87,6 @@ extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
88 88
89/* Constants used for minimum and maximum */ 89/* Constants used for minimum and maximum */
90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
91static int one = 1;
92#endif
93
94#ifdef CONFIG_DETECT_SOFTLOCKUP 90#ifdef CONFIG_DETECT_SOFTLOCKUP
95static int sixty = 60; 91static int sixty = 60;
96static int neg_one = -1; 92static int neg_one = -1;
@@ -101,6 +97,7 @@ static int two = 2;
101#endif 97#endif
102 98
103static int zero; 99static int zero;
100static int one = 1;
104static int one_hundred = 100; 101static int one_hundred = 100;
105 102
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 103/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -121,6 +118,10 @@ extern int sg_big_buff;
121#include <asm/system.h> 118#include <asm/system.h>
122#endif 119#endif
123 120
121#ifdef CONFIG_SPARC64
122extern int sysctl_tsb_ratio;
123#endif
124
124#ifdef __hppa__ 125#ifdef __hppa__
125extern int pwrsw_enabled; 126extern int pwrsw_enabled;
126extern int unaligned_enabled; 127extern int unaligned_enabled;
@@ -451,6 +452,16 @@ static struct ctl_table kern_table[] = {
451 .proc_handler = &proc_dointvec, 452 .proc_handler = &proc_dointvec,
452 }, 453 },
453#endif 454#endif
455#ifdef CONFIG_SPARC64
456 {
457 .ctl_name = CTL_UNNUMBERED,
458 .procname = "tsb-ratio",
459 .data = &sysctl_tsb_ratio,
460 .maxlen = sizeof (int),
461 .mode = 0644,
462 .proc_handler = &proc_dointvec,
463 },
464#endif
454#ifdef __hppa__ 465#ifdef __hppa__
455 { 466 {
456 .ctl_name = KERN_HPPA_PWRSW, 467 .ctl_name = KERN_HPPA_PWRSW,
@@ -487,6 +498,26 @@ static struct ctl_table kern_table[] = {
487 .proc_handler = &ftrace_enable_sysctl, 498 .proc_handler = &ftrace_enable_sysctl,
488 }, 499 },
489#endif 500#endif
501#ifdef CONFIG_STACK_TRACER
502 {
503 .ctl_name = CTL_UNNUMBERED,
504 .procname = "stack_tracer_enabled",
505 .data = &stack_tracer_enabled,
506 .maxlen = sizeof(int),
507 .mode = 0644,
508 .proc_handler = &stack_trace_sysctl,
509 },
510#endif
511#ifdef CONFIG_TRACING
512 {
513 .ctl_name = CTL_UNNUMBERED,
514 .procname = "ftrace_dump_on_oops",
515 .data = &ftrace_dump_on_oops,
516 .maxlen = sizeof(int),
517 .mode = 0644,
518 .proc_handler = &proc_dointvec,
519 },
520#endif
490#ifdef CONFIG_MODULES 521#ifdef CONFIG_MODULES
491 { 522 {
492 .ctl_name = KERN_MODPROBE, 523 .ctl_name = KERN_MODPROBE,
@@ -918,12 +949,22 @@ static struct ctl_table vm_table[] = {
918 .data = &dirty_background_ratio, 949 .data = &dirty_background_ratio,
919 .maxlen = sizeof(dirty_background_ratio), 950 .maxlen = sizeof(dirty_background_ratio),
920 .mode = 0644, 951 .mode = 0644,
921 .proc_handler = &proc_dointvec_minmax, 952 .proc_handler = &dirty_background_ratio_handler,
922 .strategy = &sysctl_intvec, 953 .strategy = &sysctl_intvec,
923 .extra1 = &zero, 954 .extra1 = &zero,
924 .extra2 = &one_hundred, 955 .extra2 = &one_hundred,
925 }, 956 },
926 { 957 {
958 .ctl_name = CTL_UNNUMBERED,
959 .procname = "dirty_background_bytes",
960 .data = &dirty_background_bytes,
961 .maxlen = sizeof(dirty_background_bytes),
962 .mode = 0644,
963 .proc_handler = &dirty_background_bytes_handler,
964 .strategy = &sysctl_intvec,
965 .extra1 = &one,
966 },
967 {
927 .ctl_name = VM_DIRTY_RATIO, 968 .ctl_name = VM_DIRTY_RATIO,
928 .procname = "dirty_ratio", 969 .procname = "dirty_ratio",
929 .data = &vm_dirty_ratio, 970 .data = &vm_dirty_ratio,
@@ -935,6 +976,16 @@ static struct ctl_table vm_table[] = {
935 .extra2 = &one_hundred, 976 .extra2 = &one_hundred,
936 }, 977 },
937 { 978 {
979 .ctl_name = CTL_UNNUMBERED,
980 .procname = "dirty_bytes",
981 .data = &vm_dirty_bytes,
982 .maxlen = sizeof(vm_dirty_bytes),
983 .mode = 0644,
984 .proc_handler = &dirty_bytes_handler,
985 .strategy = &sysctl_intvec,
986 .extra1 = &one,
987 },
988 {
938 .procname = "dirty_writeback_centisecs", 989 .procname = "dirty_writeback_centisecs",
939 .data = &dirty_writeback_interval, 990 .data = &dirty_writeback_interval,
940 .maxlen = sizeof(dirty_writeback_interval), 991 .maxlen = sizeof(dirty_writeback_interval),
@@ -1651,7 +1702,7 @@ out:
1651 1702
1652static int test_perm(int mode, int op) 1703static int test_perm(int mode, int op)
1653{ 1704{
1654 if (!current->euid) 1705 if (!current_euid())
1655 mode >>= 6; 1706 mode >>= 6;
1656 else if (in_egroup_p(0)) 1707 else if (in_egroup_p(0))
1657 mode >>= 3; 1708 mode >>= 3;
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23ab8fb..fafeb48f27c0 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
730}; 730};
731 731
732static const struct trans_ctl_table trans_fs_xfs_table[] = { 732static const struct trans_ctl_table trans_fs_xfs_table[] = {
733 { XFS_RESTRICT_CHOWN, "restrict_chown" },
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" }, 733 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" }, 734 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" }, 735 { XFS_PANIC_MASK, "panic_mask" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bd6be76303cf..888adbcca30c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
290 return; 290 return;
291} 291}
292 292
293static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) 293static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294{ 294{
295 struct listener_list *listeners; 295 struct listener_list *listeners;
296 struct listener *s, *tmp; 296 struct listener *s, *tmp;
297 unsigned int cpu; 297 unsigned int cpu;
298 cpumask_t mask = *maskp;
299 298
300 if (!cpus_subset(mask, cpu_possible_map)) 299 if (!cpumask_subset(mask, cpu_possible_mask))
301 return -EINVAL; 300 return -EINVAL;
302 301
303 if (isadd == REGISTER) { 302 if (isadd == REGISTER) {
304 for_each_cpu_mask_nr(cpu, mask) { 303 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 304 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 305 cpu_to_node(cpu));
307 if (!s) 306 if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 319
321 /* Deregister or cleanup */ 320 /* Deregister or cleanup */
322cleanup: 321cleanup:
323 for_each_cpu_mask_nr(cpu, mask) { 322 for_each_cpu(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 323 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 324 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
335 return 0; 334 return 0;
336} 335}
337 336
338static int parse(struct nlattr *na, cpumask_t *mask) 337static int parse(struct nlattr *na, struct cpumask *mask)
339{ 338{
340 char *data; 339 char *data;
341 int len; 340 int len;
@@ -352,7 +351,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
352 if (!data) 351 if (!data)
353 return -ENOMEM; 352 return -ENOMEM;
354 nla_strlcpy(data, na, len); 353 nla_strlcpy(data, na, len);
355 ret = cpulist_parse(data, *mask); 354 ret = cpulist_parse(data, mask);
356 kfree(data); 355 kfree(data);
357 return ret; 356 return ret;
358} 357}
@@ -428,23 +427,33 @@ err:
428 427
429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 428static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
430{ 429{
431 int rc = 0; 430 int rc;
432 struct sk_buff *rep_skb; 431 struct sk_buff *rep_skb;
433 struct taskstats *stats; 432 struct taskstats *stats;
434 size_t size; 433 size_t size;
435 cpumask_t mask; 434 cpumask_var_t mask;
435
436 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
437 return -ENOMEM;
436 438
437 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 439 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
438 if (rc < 0) 440 if (rc < 0)
439 return rc; 441 goto free_return_rc;
440 if (rc == 0) 442 if (rc == 0) {
441 return add_del_listener(info->snd_pid, &mask, REGISTER); 443 rc = add_del_listener(info->snd_pid, mask, REGISTER);
444 goto free_return_rc;
445 }
442 446
443 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); 447 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
444 if (rc < 0) 448 if (rc < 0)
449 goto free_return_rc;
450 if (rc == 0) {
451 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
452free_return_rc:
453 free_cpumask_var(mask);
445 return rc; 454 return rc;
446 if (rc == 0) 455 }
447 return add_del_listener(info->snd_pid, &mask, DEREGISTER); 456 free_cpumask_var(mask);
448 457
449 /* 458 /*
450 * Size includes space for nested attributes 459 * Size includes space for nested attributes
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 06b6395b45b2..4f104515a19b 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,21 +22,11 @@
22 22
23static u32 rand1, preh_val, posth_val, jph_val; 23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests; 24static int errors, handler_errors, num_tests;
25static u32 (*target)(u32 value);
26static u32 (*target2)(u32 value);
25 27
26static noinline u32 kprobe_target(u32 value) 28static noinline u32 kprobe_target(u32 value)
27{ 29{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor); 30 return (value / div_factor);
41} 31}
42 32
@@ -74,7 +64,7 @@ static int test_kprobe(void)
74 return ret; 64 return ret;
75 } 65 }
76 66
77 ret = kprobe_target(rand1); 67 ret = target(rand1);
78 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
79 69
80 if (preh_val == 0) { 70 if (preh_val == 0) {
@@ -92,6 +82,84 @@ static int test_kprobe(void)
92 return 0; 82 return 0;
93} 83}
94 84
85static noinline u32 kprobe_target2(u32 value)
86{
87 return (value / div_factor) + 1;
88}
89
90static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
91{
92 preh_val = (rand1 / div_factor) + 1;
93 return 0;
94}
95
96static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
97 unsigned long flags)
98{
99 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: "
102 "incorrect value in post_handler2\n");
103 }
104 posth_val = preh_val + div_factor;
105}
106
107static struct kprobe kp2 = {
108 .symbol_name = "kprobe_target2",
109 .pre_handler = kp_pre_handler2,
110 .post_handler = kp_post_handler2
111};
112
113static int test_kprobes(void)
114{
115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2};
117
118 kp.addr = 0; /* addr should be cleard for reusing kprobe. */
119 ret = register_kprobes(kps, 2);
120 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: "
122 "register_kprobes returned %d\n", ret);
123 return ret;
124 }
125
126 preh_val = 0;
127 posth_val = 0;
128 ret = target(rand1);
129
130 if (preh_val == 0) {
131 printk(KERN_ERR "Kprobe smoke test failed: "
132 "kprobe pre_handler not called\n");
133 handler_errors++;
134 }
135
136 if (posth_val == 0) {
137 printk(KERN_ERR "Kprobe smoke test failed: "
138 "kprobe post_handler not called\n");
139 handler_errors++;
140 }
141
142 preh_val = 0;
143 posth_val = 0;
144 ret = target2(rand1);
145
146 if (preh_val == 0) {
147 printk(KERN_ERR "Kprobe smoke test failed: "
148 "kprobe pre_handler2 not called\n");
149 handler_errors++;
150 }
151
152 if (posth_val == 0) {
153 printk(KERN_ERR "Kprobe smoke test failed: "
154 "kprobe post_handler2 not called\n");
155 handler_errors++;
156 }
157
158 unregister_kprobes(kps, 2);
159 return 0;
160
161}
162
95static u32 j_kprobe_target(u32 value) 163static u32 j_kprobe_target(u32 value)
96{ 164{
97 if (value != rand1) { 165 if (value != rand1) {
@@ -121,7 +189,7 @@ static int test_jprobe(void)
121 return ret; 189 return ret;
122 } 190 }
123 191
124 ret = kprobe_target(rand1); 192 ret = target(rand1);
125 unregister_jprobe(&jp); 193 unregister_jprobe(&jp);
126 if (jph_val == 0) { 194 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: " 195 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -132,6 +200,43 @@ static int test_jprobe(void)
132 return 0; 200 return 0;
133} 201}
134 202
203static struct jprobe jp2 = {
204 .entry = j_kprobe_target,
205 .kp.symbol_name = "kprobe_target2"
206};
207
208static int test_jprobes(void)
209{
210 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2};
212
213 jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
214 ret = register_jprobes(jps, 2);
215 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: "
217 "register_jprobes returned %d\n", ret);
218 return ret;
219 }
220
221 jph_val = 0;
222 ret = target(rand1);
223 if (jph_val == 0) {
224 printk(KERN_ERR "Kprobe smoke test failed: "
225 "jprobe handler not called\n");
226 handler_errors++;
227 }
228
229 jph_val = 0;
230 ret = target2(rand1);
231 if (jph_val == 0) {
232 printk(KERN_ERR "Kprobe smoke test failed: "
233 "jprobe handler2 not called\n");
234 handler_errors++;
235 }
236 unregister_jprobes(jps, 2);
237
238 return 0;
239}
135#ifdef CONFIG_KRETPROBES 240#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 241static u32 krph_val;
137 242
@@ -177,7 +282,7 @@ static int test_kretprobe(void)
177 return ret; 282 return ret;
178 } 283 }
179 284
180 ret = kprobe_target(rand1); 285 ret = target(rand1);
181 unregister_kretprobe(&rp); 286 unregister_kretprobe(&rp);
182 if (krph_val != rand1) { 287 if (krph_val != rand1) {
183 printk(KERN_ERR "Kprobe smoke test failed: " 288 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -187,12 +292,72 @@ static int test_kretprobe(void)
187 292
188 return 0; 293 return 0;
189} 294}
295
296static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
297{
298 unsigned long ret = regs_return_value(regs);
299
300 if (ret != (rand1 / div_factor) + 1) {
301 handler_errors++;
302 printk(KERN_ERR "Kprobe smoke test failed: "
303 "incorrect value in kretprobe handler2\n");
304 }
305 if (krph_val == 0) {
306 handler_errors++;
307 printk(KERN_ERR "Kprobe smoke test failed: "
308 "call to kretprobe entry handler failed\n");
309 }
310
311 krph_val = rand1;
312 return 0;
313}
314
315static struct kretprobe rp2 = {
316 .handler = return_handler2,
317 .entry_handler = entry_handler,
318 .kp.symbol_name = "kprobe_target2"
319};
320
321static int test_kretprobes(void)
322{
323 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2};
325
326 rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
327 ret = register_kretprobes(rps, 2);
328 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: "
330 "register_kretprobe returned %d\n", ret);
331 return ret;
332 }
333
334 krph_val = 0;
335 ret = target(rand1);
336 if (krph_val != rand1) {
337 printk(KERN_ERR "Kprobe smoke test failed: "
338 "kretprobe handler not called\n");
339 handler_errors++;
340 }
341
342 krph_val = 0;
343 ret = target2(rand1);
344 if (krph_val != rand1) {
345 printk(KERN_ERR "Kprobe smoke test failed: "
346 "kretprobe handler2 not called\n");
347 handler_errors++;
348 }
349 unregister_kretprobes(rps, 2);
350 return 0;
351}
190#endif /* CONFIG_KRETPROBES */ 352#endif /* CONFIG_KRETPROBES */
191 353
192int init_test_probes(void) 354int init_test_probes(void)
193{ 355{
194 int ret; 356 int ret;
195 357
358 target = kprobe_target;
359 target2 = kprobe_target2;
360
196 do { 361 do {
197 rand1 = random32(); 362 rand1 = random32();
198 } while (rand1 <= div_factor); 363 } while (rand1 <= div_factor);
@@ -204,15 +369,30 @@ int init_test_probes(void)
204 errors++; 369 errors++;
205 370
206 num_tests++; 371 num_tests++;
372 ret = test_kprobes();
373 if (ret < 0)
374 errors++;
375
376 num_tests++;
207 ret = test_jprobe(); 377 ret = test_jprobe();
208 if (ret < 0) 378 if (ret < 0)
209 errors++; 379 errors++;
210 380
381 num_tests++;
382 ret = test_jprobes();
383 if (ret < 0)
384 errors++;
385
211#ifdef CONFIG_KRETPROBES 386#ifdef CONFIG_KRETPROBES
212 num_tests++; 387 num_tests++;
213 ret = test_kretprobe(); 388 ret = test_kretprobe();
214 if (ret < 0) 389 if (ret < 0)
215 errors++; 390 errors++;
391
392 num_tests++;
393 ret = test_kretprobes();
394 if (ret < 0)
395 errors++;
216#endif /* CONFIG_KRETPROBES */ 396#endif /* CONFIG_KRETPROBES */
217 397
218 if (errors) 398 if (errors)
diff --git a/kernel/time.c b/kernel/time.c
index d63a4336fad6..4886e3ce83a4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -37,6 +37,7 @@
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/math64.h> 39#include <linux/math64.h>
40#include <linux/ptrace.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc)
65 66
66 if (tloc) { 67 if (tloc) {
67 if (put_user(i,tloc)) 68 if (put_user(i,tloc))
68 i = -EFAULT; 69 return -EFAULT;
69 } 70 }
71 force_successful_syscall_return();
70 return i; 72 return i;
71} 73}
72 74
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f8d968063cea..ea2f48af83cf 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
166void clockevents_register_device(struct clock_event_device *dev) 166void clockevents_register_device(struct clock_event_device *dev)
167{ 167{
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask);
170
169 /* 171 /*
170 * A nsec2cyc multiplicator of 0 is invalid and we'd crash 172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
171 * on it, so fix it up and emit a warning: 173 * on it, so fix it up and emit a warning:
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec97526..ca89e1593f08 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,10 +145,11 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = cpumask_next(raw_smp_processor_id(),
149 cpu_online_mask);
149 150
150 if (next_cpu >= nr_cpu_ids) 151 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 152 next_cpu = cpumask_first(cpu_online_mask);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 153 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 154 add_timer_on(&watchdog_timer, next_cpu);
154 } 155 }
@@ -173,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
173 watchdog_last = watchdog->read(); 174 watchdog_last = watchdog->read();
174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 175 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
175 add_timer_on(&watchdog_timer, 176 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map)); 177 cpumask_first(cpu_online_mask));
177 } 178 }
178 } else { 179 } else {
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 180 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +196,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
195 watchdog_timer.expires = 196 watchdog_timer.expires =
196 jiffies + WATCHDOG_INTERVAL; 197 jiffies + WATCHDOG_INTERVAL;
197 add_timer_on(&watchdog_timer, 198 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map)); 199 cpumask_first(cpu_online_mask));
199 } 200 }
200 } 201 }
201 } 202 }
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1ca99557e929..06f197560f3b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -45,7 +45,7 @@
45 * 45 *
46 * The value 8 is somewhat carefully chosen, as anything 46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as 47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater then 8 overflow 32bits when 48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100. 49 * HZ=100.
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486b..f5f793d92415 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
131{ 131{
132 enum hrtimer_restart res = HRTIMER_NORESTART; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
133 133
134 write_seqlock_irq(&xtime_lock); 134 write_seqlock(&xtime_lock);
135 135
136 switch (time_state) { 136 switch (time_state) {
137 case TIME_OK: 137 case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
164 } 164 }
165 update_vsyscall(&xtime, clock); 165 update_vsyscall(&xtime, clock);
166 166
167 write_sequnlock_irq(&xtime_lock); 167 write_sequnlock(&xtime_lock);
168 168
169 return res; 169 return res;
170} 170}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f98a1b7b16e9..118a3b3b3f9a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
32static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force; 35static int tick_broadcast_force;
34 36
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
46 return &tick_broadcast_device; 48 return &tick_broadcast_device;
47} 49}
48 50
49cpumask_t *tick_get_broadcast_mask(void) 51struct cpumask *tick_get_broadcast_mask(void)
50{ 52{
51 return &tick_broadcast_mask; 53 return to_cpumask(tick_broadcast_mask);
52} 54}
53 55
54/* 56/*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
72 74
73 clockevents_exchange_device(NULL, dev); 75 clockevents_exchange_device(NULL, dev);
74 tick_broadcast_device.evtdev = dev; 76 tick_broadcast_device.evtdev = dev;
75 if (!cpus_empty(tick_broadcast_mask)) 77 if (!cpumask_empty(tick_get_broadcast_mask()))
76 tick_broadcast_start_periodic(dev); 78 tick_broadcast_start_periodic(dev);
77 return 1; 79 return 1;
78} 80}
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
104 */ 106 */
105 if (!tick_device_is_functional(dev)) { 107 if (!tick_device_is_functional(dev)) {
106 dev->event_handler = tick_handle_periodic; 108 dev->event_handler = tick_handle_periodic;
107 cpu_set(cpu, tick_broadcast_mask); 109 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
108 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 110 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
109 ret = 1; 111 ret = 1;
110 } else { 112 } else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 118 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
117 int cpu = smp_processor_id(); 119 int cpu = smp_processor_id();
118 120
119 cpu_clear(cpu, tick_broadcast_mask); 121 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
120 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
121 } 123 }
122 } 124 }
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125} 127}
126 128
127/* 129/*
128 * Broadcast the event to the cpus, which are set in the mask 130 * Broadcast the event to the cpus, which are set in the mask (mangled).
129 */ 131 */
130static void tick_do_broadcast(cpumask_t mask) 132static void tick_do_broadcast(struct cpumask *mask)
131{ 133{
132 int cpu = smp_processor_id(); 134 int cpu = smp_processor_id();
133 struct tick_device *td; 135 struct tick_device *td;
@@ -135,21 +137,20 @@ static void tick_do_broadcast(cpumask_t mask)
135 /* 137 /*
136 * Check, if the current cpu is in the mask 138 * Check, if the current cpu is in the mask
137 */ 139 */
138 if (cpu_isset(cpu, mask)) { 140 if (cpumask_test_cpu(cpu, mask)) {
139 cpu_clear(cpu, mask); 141 cpumask_clear_cpu(cpu, mask);
140 td = &per_cpu(tick_cpu_device, cpu); 142 td = &per_cpu(tick_cpu_device, cpu);
141 td->evtdev->event_handler(td->evtdev); 143 td->evtdev->event_handler(td->evtdev);
142 } 144 }
143 145
144 if (!cpus_empty(mask)) { 146 if (!cpumask_empty(mask)) {
145 /* 147 /*
146 * It might be necessary to actually check whether the devices 148 * It might be necessary to actually check whether the devices
147 * have different broadcast functions. For now, just use the 149 * have different broadcast functions. For now, just use the
148 * one of the first device. This works as long as we have this 150 * one of the first device. This works as long as we have this
149 * misfeature only on x86 (lapic) 151 * misfeature only on x86 (lapic)
150 */ 152 */
151 cpu = first_cpu(mask); 153 td = &per_cpu(tick_cpu_device, cpumask_first(mask));
152 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 154 td->evtdev->broadcast(mask);
154 } 155 }
155} 156}
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
160 */ 161 */
161static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
162{ 163{
163 cpumask_t mask;
164
165 spin_lock(&tick_broadcast_lock); 164 spin_lock(&tick_broadcast_lock);
166 165
167 cpus_and(mask, cpu_online_map, tick_broadcast_mask); 166 cpumask_and(to_cpumask(tmpmask),
168 tick_do_broadcast(mask); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 spin_unlock(&tick_broadcast_lock);
171} 171}
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
228 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
229 goto out; 229 goto out;
230 230
231 bc_stopped = cpus_empty(tick_broadcast_mask); 231 bc_stopped = cpumask_empty(tick_get_broadcast_mask());
232 232
233 switch (*reason) { 233 switch (*reason) {
234 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
238 if (tick_broadcast_device.mode == 238 if (tick_broadcast_device.mode ==
239 TICKDEV_MODE_PERIODIC) 239 TICKDEV_MODE_PERIODIC)
240 clockevents_shutdown(dev); 240 clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
244 break; 244 break;
245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
249 if (tick_broadcast_device.mode == 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC) 250 TICKDEV_MODE_PERIODIC)
251 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
253 break; 253 break;
254 } 254 }
255 255
256 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpumask_empty(tick_get_broadcast_mask())) {
257 if (!bc_stopped) 257 if (!bc_stopped)
258 clockevents_shutdown(bc); 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) { 259 } else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
272 */ 272 */
273void tick_broadcast_on_off(unsigned long reason, int *oncpu) 273void tick_broadcast_on_off(unsigned long reason, int *oncpu)
274{ 274{
275 if (!cpu_isset(*oncpu, cpu_online_map)) 275 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
303 spin_lock_irqsave(&tick_broadcast_lock, flags); 303 spin_lock_irqsave(&tick_broadcast_lock, flags);
304 304
305 bc = tick_broadcast_device.evtdev; 305 bc = tick_broadcast_device.evtdev;
306 cpu_clear(cpu, tick_broadcast_mask); 306 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
307 307
308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
309 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpumask_empty(tick_get_broadcast_mask()))
310 clockevents_shutdown(bc); 310 clockevents_shutdown(bc);
311 } 311 }
312 312
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
342 342
343 switch (tick_broadcast_device.mode) { 343 switch (tick_broadcast_device.mode) {
344 case TICKDEV_MODE_PERIODIC: 344 case TICKDEV_MODE_PERIODIC:
345 if(!cpus_empty(tick_broadcast_mask)) 345 if (!cpumask_empty(tick_get_broadcast_mask()))
346 tick_broadcast_start_periodic(bc); 346 tick_broadcast_start_periodic(bc);
347 broadcast = cpu_isset(smp_processor_id(), 347 broadcast = cpumask_test_cpu(smp_processor_id(),
348 tick_broadcast_mask); 348 tick_get_broadcast_mask());
349 break; 349 break;
350 case TICKDEV_MODE_ONESHOT: 350 case TICKDEV_MODE_ONESHOT:
351 broadcast = tick_resume_broadcast_oneshot(bc); 351 broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
360 360
361#ifdef CONFIG_TICK_ONESHOT 361#ifdef CONFIG_TICK_ONESHOT
362 362
363static cpumask_t tick_broadcast_oneshot_mask; 363/* FIXME: use cpumask_var_t. */
364static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
364 365
365/* 366/*
366 * Debugging: see timer_list.c 367 * Exposed for debugging: see timer_list.c
367 */ 368 */
368cpumask_t *tick_get_broadcast_oneshot_mask(void) 369struct cpumask *tick_get_broadcast_oneshot_mask(void)
369{ 370{
370 return &tick_broadcast_oneshot_mask; 371 return to_cpumask(tick_broadcast_oneshot_mask);
371} 372}
372 373
373static int tick_broadcast_set_event(ktime_t expires, int force) 374static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
389 */ 390 */
390void tick_check_oneshot_broadcast(int cpu) 391void tick_check_oneshot_broadcast(int cpu)
391{ 392{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 393 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 394 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394 395
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 396 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 403static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
403{ 404{
404 struct tick_device *td; 405 struct tick_device *td;
405 cpumask_t mask;
406 ktime_t now, next_event; 406 ktime_t now, next_event;
407 int cpu; 407 int cpu;
408 408
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
410again: 410again:
411 dev->next_event.tv64 = KTIME_MAX; 411 dev->next_event.tv64 = KTIME_MAX;
412 next_event.tv64 = KTIME_MAX; 412 next_event.tv64 = KTIME_MAX;
413 mask = CPU_MASK_NONE; 413 cpumask_clear(to_cpumask(tmpmask));
414 now = ktime_get(); 414 now = ktime_get();
415 /* Find all expired events */ 415 /* Find all expired events */
416 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { 416 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
417 td = &per_cpu(tick_cpu_device, cpu); 417 td = &per_cpu(tick_cpu_device, cpu);
418 if (td->evtdev->next_event.tv64 <= now.tv64) 418 if (td->evtdev->next_event.tv64 <= now.tv64)
419 cpu_set(cpu, mask); 419 cpumask_set_cpu(cpu, to_cpumask(tmpmask));
420 else if (td->evtdev->next_event.tv64 < next_event.tv64) 420 else if (td->evtdev->next_event.tv64 < next_event.tv64)
421 next_event.tv64 = td->evtdev->next_event.tv64; 421 next_event.tv64 = td->evtdev->next_event.tv64;
422 } 422 }
@@ -424,7 +424,7 @@ again:
424 /* 424 /*
425 * Wakeup the cpus which have an expired event. 425 * Wakeup the cpus which have an expired event.
426 */ 426 */
427 tick_do_broadcast(mask); 427 tick_do_broadcast(to_cpumask(tmpmask));
428 428
429 /* 429 /*
430 * Two reasons for reprogram: 430 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
476 goto out; 476 goto out;
477 477
478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
479 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 479 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
480 cpu_set(cpu, tick_broadcast_oneshot_mask); 480 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
482 if (dev->next_event.tv64 < bc->next_event.tv64) 482 if (dev->next_event.tv64 < bc->next_event.tv64)
483 tick_broadcast_set_event(dev->next_event, 1); 483 tick_broadcast_set_event(dev->next_event, 1);
484 } 484 }
485 } else { 485 } else {
486 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 486 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
487 cpu_clear(cpu, tick_broadcast_oneshot_mask); 487 cpumask_clear_cpu(cpu,
488 tick_get_broadcast_oneshot_mask());
488 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 489 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
489 if (dev->next_event.tv64 != KTIME_MAX) 490 if (dev->next_event.tv64 != KTIME_MAX)
490 tick_program_event(dev->next_event, 1); 491 tick_program_event(dev->next_event, 1);
@@ -502,15 +503,16 @@ out:
502 */ 503 */
503static void tick_broadcast_clear_oneshot(int cpu) 504static void tick_broadcast_clear_oneshot(int cpu)
504{ 505{
505 cpu_clear(cpu, tick_broadcast_oneshot_mask); 506 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
506} 507}
507 508
508static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) 509static void tick_broadcast_init_next_event(struct cpumask *mask,
510 ktime_t expires)
509{ 511{
510 struct tick_device *td; 512 struct tick_device *td;
511 int cpu; 513 int cpu;
512 514
513 for_each_cpu_mask_nr(cpu, *mask) { 515 for_each_cpu(cpu, mask) {
514 td = &per_cpu(tick_cpu_device, cpu); 516 td = &per_cpu(tick_cpu_device, cpu);
515 if (td->evtdev) 517 if (td->evtdev)
516 td->evtdev->next_event = expires; 518 td->evtdev->next_event = expires;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 528 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id(); 530 int cpu = smp_processor_id();
529 cpumask_t mask;
530 531
531 bc->event_handler = tick_handle_oneshot_broadcast; 532 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 533 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
540 * oneshot_mask bits for those and program the 541 * oneshot_mask bits for those and program the
541 * broadcast device to fire. 542 * broadcast device to fire.
542 */ 543 */
543 mask = tick_broadcast_mask; 544 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
544 cpu_clear(cpu, mask); 545 cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
545 cpus_or(tick_broadcast_oneshot_mask, 546 cpumask_or(tick_get_broadcast_oneshot_mask(),
546 tick_broadcast_oneshot_mask, mask); 547 tick_get_broadcast_oneshot_mask(),
547 548 to_cpumask(tmpmask));
548 if (was_periodic && !cpus_empty(mask)) { 549
549 tick_broadcast_init_next_event(&mask, tick_next_period); 550 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
551 tick_broadcast_init_next_event(to_cpumask(tmpmask),
552 tick_next_period);
550 tick_broadcast_set_event(tick_next_period, 1); 553 tick_broadcast_set_event(tick_next_period, 1);
551 } else 554 } else
552 bc->next_event.tv64 = KTIME_MAX; 555 bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
585 * Clear the broadcast mask flag for the dead cpu, but do not 588 * Clear the broadcast mask flag for the dead cpu, but do not
586 * stop the broadcast device! 589 * stop the broadcast device!
587 */ 590 */
588 cpu_clear(cpu, tick_broadcast_oneshot_mask); 591 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
589 592
590 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 593 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
591} 594}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43ca..63e05d423a09 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
136 */ 136 */
137static void tick_setup_device(struct tick_device *td, 137static void tick_setup_device(struct tick_device *td,
138 struct clock_event_device *newdev, int cpu, 138 struct clock_event_device *newdev, int cpu,
139 const cpumask_t *cpumask) 139 const struct cpumask *cpumask)
140{ 140{
141 ktime_t next_event; 141 ktime_t next_event;
142 void (*handler)(struct clock_event_device *) = NULL; 142 void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
171 * When the device is not per cpu, pin the interrupt to the 171 * When the device is not per cpu, pin the interrupt to the
172 * current cpu: 172 * current cpu:
173 */ 173 */
174 if (!cpus_equal(newdev->cpumask, *cpumask)) 174 if (!cpumask_equal(newdev->cpumask, cpumask))
175 irq_set_affinity(newdev->irq, *cpumask); 175 irq_set_affinity(newdev->irq, cpumask);
176 176
177 /* 177 /*
178 * When global broadcasting is active, check if the current 178 * When global broadcasting is active, check if the current
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
202 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
203 203
204 cpu = smp_processor_id(); 204 cpu = smp_processor_id();
205 if (!cpu_isset(cpu, newdev->cpumask)) 205 if (!cpumask_test_cpu(cpu, newdev->cpumask))
206 goto out_bc; 206 goto out_bc;
207 207
208 td = &per_cpu(tick_cpu_device, cpu); 208 td = &per_cpu(tick_cpu_device, cpu);
209 curdev = td->evtdev; 209 curdev = td->evtdev;
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { 212 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) 225 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); 257 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup)
299 } 299 }
300 /* Transfer the do_timer job away from this cpu */ 300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = cpumask_first(cpu_online_mask);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : 304 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
305 TICK_DO_TIMER_NONE; 305 TICK_DO_TIMER_NONE;
306 } 306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab46..1b6c05bd0d0a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
247 if (need_resched()) 247 if (need_resched())
248 goto end; 248 goto end;
249 249
250 if (unlikely(local_softirq_pending())) { 250 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
251 static int ratelimit; 251 static int ratelimit;
252 252
253 if (ratelimit < 10) { 253 if (ratelimit < 10) {
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
282 /* Schedule the tick, if we are at least one jiffie off */ 282 /* Schedule the tick, if we are at least one jiffie off */
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 /*
286 * calculate the expiry time for the next timer wheel
287 * timer
288 */
289 expires = ktime_add_ns(last_update, tick_period.tv64 *
290 delta_jiffies);
291
292 /*
293 * If this cpu is the one which updates jiffies, then
294 * give up the assignment and let it be taken by the
295 * cpu which runs the tick timer next, which might be
296 * this cpu as well. If we don't drop this here the
297 * jiffies might be stale and do_timer() never
298 * invoked.
299 */
300 if (cpu == tick_do_timer_cpu)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302
285 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 304 cpumask_set_cpu(cpu, nohz_cpu_mask);
305
306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
308 goto out;
309
287 /* 310 /*
288 * nohz_stop_sched_tick can be called several times before 311 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 312 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +319,7 @@ void tick_nohz_stop_sched_tick(int inidle)
296 /* 319 /*
297 * sched tick not stopped! 320 * sched tick not stopped!
298 */ 321 */
299 cpu_clear(cpu, nohz_cpu_mask); 322 cpumask_clear_cpu(cpu, nohz_cpu_mask);
300 goto out; 323 goto out;
301 } 324 }
302 325
@@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle)
306 rcu_enter_nohz(); 329 rcu_enter_nohz();
307 } 330 }
308 331
309 /*
310 * If this cpu is the one which updates jiffies, then
311 * give up the assignment and let it be taken by the
312 * cpu which runs the tick timer next, which might be
313 * this cpu as well. If we don't drop this here the
314 * jiffies might be stale and do_timer() never
315 * invoked.
316 */
317 if (cpu == tick_do_timer_cpu)
318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
319
320 ts->idle_sleeps++; 332 ts->idle_sleeps++;
321 333
322 /* 334 /*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
332 goto out; 344 goto out;
333 } 345 }
334 346
335 /* 347 /* Mark expiries */
336 * calculate the expiry time for the next timer wheel
337 * timer
338 */
339 expires = ktime_add_ns(last_update, tick_period.tv64 *
340 delta_jiffies);
341 ts->idle_expires = expires; 348 ts->idle_expires = expires;
342 349
343 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@ -354,7 +361,7 @@ void tick_nohz_stop_sched_tick(int inidle)
354 * softirq. 361 * softirq.
355 */ 362 */
356 tick_do_update_jiffies64(ktime_get()); 363 tick_do_update_jiffies64(ktime_get());
357 cpu_clear(cpu, nohz_cpu_mask); 364 cpumask_clear_cpu(cpu, nohz_cpu_mask);
358 } 365 }
359 raise_softirq_irqoff(TIMER_SOFTIRQ); 366 raise_softirq_irqoff(TIMER_SOFTIRQ);
360out: 367out:
@@ -412,7 +419,9 @@ void tick_nohz_restart_sched_tick(void)
412{ 419{
413 int cpu = smp_processor_id(); 420 int cpu = smp_processor_id();
414 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
422#ifndef CONFIG_VIRT_CPU_ACCOUNTING
415 unsigned long ticks; 423 unsigned long ticks;
424#endif
416 ktime_t now; 425 ktime_t now;
417 426
418 local_irq_disable(); 427 local_irq_disable();
@@ -432,8 +441,9 @@ void tick_nohz_restart_sched_tick(void)
432 select_nohz_load_balancer(0); 441 select_nohz_load_balancer(0);
433 now = ktime_get(); 442 now = ktime_get();
434 tick_do_update_jiffies64(now); 443 tick_do_update_jiffies64(now);
435 cpu_clear(cpu, nohz_cpu_mask); 444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
436 445
446#ifndef CONFIG_VIRT_CPU_ACCOUNTING
437 /* 447 /*
438 * We stopped the tick in idle. Update process times would miss the 448 * We stopped the tick in idle. Update process times would miss the
439 * time we slept as update_process_times does only a 1 tick 449 * time we slept as update_process_times does only a 1 tick
@@ -443,12 +453,9 @@ void tick_nohz_restart_sched_tick(void)
443 /* 453 /*
444 * We might be one off. Do not randomly account a huge number of ticks! 454 * We might be one off. Do not randomly account a huge number of ticks!
445 */ 455 */
446 if (ticks && ticks < LONG_MAX) { 456 if (ticks && ticks < LONG_MAX)
447 add_preempt_count(HARDIRQ_OFFSET); 457 account_idle_ticks(ticks);
448 account_system_time(current, HARDIRQ_OFFSET, 458#endif
449 jiffies_to_cputime(ticks));
450 sub_preempt_count(HARDIRQ_OFFSET);
451 }
452 459
453 touch_softlockup_watchdog(); 460 touch_softlockup_watchdog();
454 /* 461 /*
@@ -681,7 +688,6 @@ void tick_setup_sched_timer(void)
681 */ 688 */
682 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 689 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
683 ts->sched_timer.function = tick_sched_timer; 690 ts->sched_timer.function = tick_sched_timer;
684 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
685 691
686 /* Get the next period (per cpu) */ 692 /* Get the next period (per cpu) */
687 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 693 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88aa76f..900f1b6598d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 46struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
48 48
49/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended;
51
49static struct timespec xtime_cache __attribute__ ((aligned (16))); 52static struct timespec xtime_cache __attribute__ ((aligned (16)));
50void update_xtime_cache(u64 nsec) 53void update_xtime_cache(u64 nsec)
51{ 54{
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts)
92 unsigned long seq; 95 unsigned long seq;
93 s64 nsecs; 96 s64 nsecs;
94 97
98 WARN_ON(timekeeping_suspended);
99
95 do { 100 do {
96 seq = read_seqbegin(&xtime_lock); 101 seq = read_seqbegin(&xtime_lock);
97 102
@@ -299,8 +304,6 @@ void __init timekeeping_init(void)
299 write_sequnlock_irqrestore(&xtime_lock, flags); 304 write_sequnlock_irqrestore(&xtime_lock, flags);
300} 305}
301 306
302/* flag for if timekeeping is suspended */
303static int timekeeping_suspended;
304/* time in seconds when suspend began */ 307/* time in seconds when suspend began */
305static unsigned long timekeeping_suspend_time; 308static unsigned long timekeeping_suspend_time;
306 309
diff --git a/kernel/timer.c b/kernel/timer.c
index dbd50fabe4c7..dee3f641a7a7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1018,21 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1018} 1018}
1019#endif 1019#endif
1020 1020
1021#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1022void account_process_tick(struct task_struct *p, int user_tick)
1023{
1024 cputime_t one_jiffy = jiffies_to_cputime(1);
1025
1026 if (user_tick) {
1027 account_user_time(p, one_jiffy);
1028 account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
1029 } else {
1030 account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
1031 account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
1032 }
1033}
1034#endif
1035
1036/* 1021/*
1037 * Called from the timer interrupt handler to charge one tick to the current 1022 * Called from the timer interrupt handler to charge one tick to the current
1038 * process. user_tick is 1 if the tick is user time, 0 for system. 1023 * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1192,25 +1177,25 @@ asmlinkage long sys_getppid(void)
1192asmlinkage long sys_getuid(void) 1177asmlinkage long sys_getuid(void)
1193{ 1178{
1194 /* Only we change this so SMP safe */ 1179 /* Only we change this so SMP safe */
1195 return current->uid; 1180 return current_uid();
1196} 1181}
1197 1182
1198asmlinkage long sys_geteuid(void) 1183asmlinkage long sys_geteuid(void)
1199{ 1184{
1200 /* Only we change this so SMP safe */ 1185 /* Only we change this so SMP safe */
1201 return current->euid; 1186 return current_euid();
1202} 1187}
1203 1188
1204asmlinkage long sys_getgid(void) 1189asmlinkage long sys_getgid(void)
1205{ 1190{
1206 /* Only we change this so SMP safe */ 1191 /* Only we change this so SMP safe */
1207 return current->gid; 1192 return current_gid();
1208} 1193}
1209 1194
1210asmlinkage long sys_getegid(void) 1195asmlinkage long sys_getegid(void)
1211{ 1196{
1212 /* Only we change this so SMP safe */ 1197 /* Only we change this so SMP safe */
1213 return current->egid; 1198 return current_egid();
1214} 1199}
1215 1200
1216#endif 1201#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e8..e2a4ff6fc3a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,18 +3,34 @@
3# select HAVE_FUNCTION_TRACER: 3# select HAVE_FUNCTION_TRACER:
4# 4#
5 5
6config USER_STACKTRACE_SUPPORT
7 bool
8
6config NOP_TRACER 9config NOP_TRACER
7 bool 10 bool
8 11
9config HAVE_FUNCTION_TRACER 12config HAVE_FUNCTION_TRACER
10 bool 13 bool
11 14
15config HAVE_FUNCTION_GRAPH_TRACER
16 bool
17
18config HAVE_FUNCTION_TRACE_MCOUNT_TEST
19 bool
20 help
21 This gets selected when the arch tests the function_trace_stop
22 variable at the mcount call site. Otherwise, this variable
23 is tested by the called function.
24
12config HAVE_DYNAMIC_FTRACE 25config HAVE_DYNAMIC_FTRACE
13 bool 26 bool
14 27
15config HAVE_FTRACE_MCOUNT_RECORD 28config HAVE_FTRACE_MCOUNT_RECORD
16 bool 29 bool
17 30
31config HAVE_HW_BRANCH_TRACER
32 bool
33
18config TRACER_MAX_TRACE 34config TRACER_MAX_TRACE
19 bool 35 bool
20 36
@@ -47,6 +63,20 @@ config FUNCTION_TRACER
47 (the bootup default), then the overhead of the instructions is very 63 (the bootup default), then the overhead of the instructions is very
48 small and not measurable even in micro-benchmarks. 64 small and not measurable even in micro-benchmarks.
49 65
66config FUNCTION_GRAPH_TRACER
67 bool "Kernel Function Graph Tracer"
68 depends on HAVE_FUNCTION_GRAPH_TRACER
69 depends on FUNCTION_TRACER
70 default y
71 help
72 Enable the kernel to trace a function at both its return
73 and its entry.
74 It's first purpose is to trace the duration of functions and
75 draw a call graph for each thread with some informations like
76 the return value.
77 This is done by setting the current return address on the current
78 task structure into a stack of calls.
79
50config IRQSOFF_TRACER 80config IRQSOFF_TRACER
51 bool "Interrupts-off Latency Tracer" 81 bool "Interrupts-off Latency Tracer"
52 default n 82 default n
@@ -138,6 +168,70 @@ config BOOT_TRACER
138 selected, because the self-tests are an initcall as well and that 168 selected, because the self-tests are an initcall as well and that
139 would invalidate the boot trace. ) 169 would invalidate the boot trace. )
140 170
171config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING
175 help
176 This tracer profiles all the the likely and unlikely macros
177 in the kernel. It will display the results in:
178
179 /debugfs/tracing/profile_annotated_branch
180
181 Note: this will add a significant overhead, only turn this
182 on if you need to profile the system's use of these macros.
183
184 Say N if unsure.
185
186config PROFILE_ALL_BRANCHES
187 bool "Profile all if conditionals"
188 depends on TRACE_BRANCH_PROFILING
189 help
190 This tracer profiles all branch conditions. Every if ()
191 taken in the kernel is recorded whether it hit or miss.
192 The results will be displayed in:
193
194 /debugfs/tracing/profile_branch
195
196 This configuration, when enabled, will impose a great overhead
197 on the system. This should only be enabled when the system
198 is to be analyzed
199
200 Say N if unsure.
201
202config TRACING_BRANCHES
203 bool
204 help
205 Selected by tracers that will trace the likely and unlikely
206 conditions. This prevents the tracers themselves from being
207 profiled. Profiling the tracing infrastructure can only happen
208 when the likelys and unlikelys are not being traced.
209
210config BRANCH_TRACER
211 bool "Trace likely/unlikely instances"
212 depends on TRACE_BRANCH_PROFILING
213 select TRACING_BRANCHES
214 help
215 This traces the events of likely and unlikely condition
216 calls in the kernel. The difference between this and the
217 "Trace likely/unlikely profiler" is that this is not a
218 histogram of the callers, but actually places the calling
219 events into a running trace buffer to see when and where the
220 events happened, as well as their results.
221
222 Say N if unsure.
223
224config POWER_TRACER
225 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86
228 select TRACING
229 help
230 This tracer helps developers to analyze and optimize the kernels
231 power management decisions, specifically the C-state and P-state
232 behavior.
233
234
141config STACK_TRACER 235config STACK_TRACER
142 bool "Trace max stack" 236 bool "Trace max stack"
143 depends on HAVE_FUNCTION_TRACER 237 depends on HAVE_FUNCTION_TRACER
@@ -150,13 +244,26 @@ config STACK_TRACER
150 244
151 This tracer works by hooking into every function call that the 245 This tracer works by hooking into every function call that the
152 kernel executes, and keeping a maximum stack depth value and 246 kernel executes, and keeping a maximum stack depth value and
153 stack-trace saved. Because this logic has to execute in every 247 stack-trace saved. If this is configured with DYNAMIC_FTRACE
154 kernel function, all the time, this option can slow down the 248 then it will not have any overhead while the stack tracer
155 kernel measurably and is generally intended for kernel 249 is disabled.
156 developers only. 250
251 To enable the stack tracer on bootup, pass in 'stacktrace'
252 on the kernel command line.
253
254 The stack tracer can also be enabled or disabled via the
255 sysctl kernel.stack_tracer_enabled
157 256
158 Say N if unsure. 257 Say N if unsure.
159 258
259config HW_BRANCH_TRACER
260 depends on HAVE_HW_BRANCH_TRACER
261 bool "Trace hw branches"
262 select TRACING
263 help
264 This tracer records all branches on the system in a circular
265 buffer giving access to the last N branches for each cpu.
266
160config DYNAMIC_FTRACE 267config DYNAMIC_FTRACE
161 bool "enable/disable ftrace tracepoints dynamically" 268 bool "enable/disable ftrace tracepoints dynamically"
162 depends on FUNCTION_TRACER 269 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1a49e9..349d5a93653f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif
17
13obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
15 20
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o 29obj-$(CONFIG_STACK_TRACER) += trace_stack.o
25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 30obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o
27 36
28libftrace-y := ftrace.o 37libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 78db083390f0..2f32969c09df 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,13 @@
47int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
48static int last_ftrace_enabled; 48static int last_ftrace_enabled;
49 49
50/* set when tracing only a pid */
51struct pid *ftrace_pid_trace;
52static struct pid * const ftrace_swapper_pid = &init_struct_pid;
53
54/* Quick disabling of function tracer. */
55int function_trace_stop;
56
50/* 57/*
51 * ftrace_disabled is set when an anomaly is discovered. 58 * ftrace_disabled is set when an anomaly is discovered.
52 * ftrace_disabled is much stronger than ftrace_enabled. 59 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly;
55 62
56static DEFINE_SPINLOCK(ftrace_lock); 63static DEFINE_SPINLOCK(ftrace_lock);
57static DEFINE_MUTEX(ftrace_sysctl_lock); 64static DEFINE_MUTEX(ftrace_sysctl_lock);
65static DEFINE_MUTEX(ftrace_start_lock);
58 66
59static struct ftrace_ops ftrace_list_end __read_mostly = 67static struct ftrace_ops ftrace_list_end __read_mostly =
60{ 68{
@@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
63 71
64static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 72static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
65ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 73ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
74ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
75ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
66 76
67static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 77static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
68{ 78{
@@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
79 }; 89 };
80} 90}
81 91
92static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
93{
94 if (!test_tsk_trace_trace(current))
95 return;
96
97 ftrace_pid_function(ip, parent_ip);
98}
99
100static void set_ftrace_pid_function(ftrace_func_t func)
101{
102 /* do not set ftrace_pid_function to itself! */
103 if (func != ftrace_pid_func)
104 ftrace_pid_function = func;
105}
106
82/** 107/**
83 * clear_ftrace_function - reset the ftrace function 108 * clear_ftrace_function - reset the ftrace function
84 * 109 *
@@ -88,7 +113,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
88void clear_ftrace_function(void) 113void clear_ftrace_function(void)
89{ 114{
90 ftrace_trace_function = ftrace_stub; 115 ftrace_trace_function = ftrace_stub;
116 __ftrace_trace_function = ftrace_stub;
117 ftrace_pid_function = ftrace_stub;
118}
119
120#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
121/*
122 * For those archs that do not test ftrace_trace_stop in their
123 * mcount call site, we need to do it from C.
124 */
125static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
126{
127 if (function_trace_stop)
128 return;
129
130 __ftrace_trace_function(ip, parent_ip);
91} 131}
132#endif
92 133
93static int __register_ftrace_function(struct ftrace_ops *ops) 134static int __register_ftrace_function(struct ftrace_ops *ops)
94{ 135{
@@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
106 ftrace_list = ops; 147 ftrace_list = ops;
107 148
108 if (ftrace_enabled) { 149 if (ftrace_enabled) {
150 ftrace_func_t func;
151
152 if (ops->next == &ftrace_list_end)
153 func = ops->func;
154 else
155 func = ftrace_list_func;
156
157 if (ftrace_pid_trace) {
158 set_ftrace_pid_function(func);
159 func = ftrace_pid_func;
160 }
161
109 /* 162 /*
110 * For one func, simply call it directly. 163 * For one func, simply call it directly.
111 * For more than one func, call the chain. 164 * For more than one func, call the chain.
112 */ 165 */
113 if (ops->next == &ftrace_list_end) 166#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
114 ftrace_trace_function = ops->func; 167 ftrace_trace_function = func;
115 else 168#else
116 ftrace_trace_function = ftrace_list_func; 169 __ftrace_trace_function = func;
170 ftrace_trace_function = ftrace_test_stop_func;
171#endif
117 } 172 }
118 173
119 spin_unlock(&ftrace_lock); 174 spin_unlock(&ftrace_lock);
@@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
152 207
153 if (ftrace_enabled) { 208 if (ftrace_enabled) {
154 /* If we only have one func left, then call that directly */ 209 /* If we only have one func left, then call that directly */
155 if (ftrace_list == &ftrace_list_end || 210 if (ftrace_list->next == &ftrace_list_end) {
156 ftrace_list->next == &ftrace_list_end) 211 ftrace_func_t func = ftrace_list->func;
157 ftrace_trace_function = ftrace_list->func; 212
213 if (ftrace_pid_trace) {
214 set_ftrace_pid_function(func);
215 func = ftrace_pid_func;
216 }
217#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
218 ftrace_trace_function = func;
219#else
220 __ftrace_trace_function = func;
221#endif
222 }
158 } 223 }
159 224
160 out: 225 out:
@@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
163 return ret; 228 return ret;
164} 229}
165 230
231static void ftrace_update_pid_func(void)
232{
233 ftrace_func_t func;
234
235 /* should not be called from interrupt context */
236 spin_lock(&ftrace_lock);
237
238 if (ftrace_trace_function == ftrace_stub)
239 goto out;
240
241 func = ftrace_trace_function;
242
243 if (ftrace_pid_trace) {
244 set_ftrace_pid_function(func);
245 func = ftrace_pid_func;
246 } else {
247 if (func == ftrace_pid_func)
248 func = ftrace_pid_function;
249 }
250
251#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
252 ftrace_trace_function = func;
253#else
254 __ftrace_trace_function = func;
255#endif
256
257 out:
258 spin_unlock(&ftrace_lock);
259}
260
166#ifdef CONFIG_DYNAMIC_FTRACE 261#ifdef CONFIG_DYNAMIC_FTRACE
167#ifndef CONFIG_FTRACE_MCOUNT_RECORD 262#ifndef CONFIG_FTRACE_MCOUNT_RECORD
168# error Dynamic ftrace depends on MCOUNT_RECORD 263# error Dynamic ftrace depends on MCOUNT_RECORD
@@ -182,6 +277,8 @@ enum {
182 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 277 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
183 FTRACE_ENABLE_MCOUNT = (1 << 3), 278 FTRACE_ENABLE_MCOUNT = (1 << 3),
184 FTRACE_DISABLE_MCOUNT = (1 << 4), 279 FTRACE_DISABLE_MCOUNT = (1 << 4),
280 FTRACE_START_FUNC_RET = (1 << 5),
281 FTRACE_STOP_FUNC_RET = (1 << 6),
185}; 282};
186 283
187static int ftrace_filtered; 284static int ftrace_filtered;
@@ -308,7 +405,7 @@ ftrace_record_ip(unsigned long ip)
308{ 405{
309 struct dyn_ftrace *rec; 406 struct dyn_ftrace *rec;
310 407
311 if (!ftrace_enabled || ftrace_disabled) 408 if (ftrace_disabled)
312 return NULL; 409 return NULL;
313 410
314 rec = ftrace_alloc_dyn_node(ip); 411 rec = ftrace_alloc_dyn_node(ip);
@@ -322,14 +419,51 @@ ftrace_record_ip(unsigned long ip)
322 return rec; 419 return rec;
323} 420}
324 421
325#define FTRACE_ADDR ((long)(ftrace_caller)) 422static void print_ip_ins(const char *fmt, unsigned char *p)
423{
424 int i;
425
426 printk(KERN_CONT "%s", fmt);
427
428 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
429 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
430}
431
432static void ftrace_bug(int failed, unsigned long ip)
433{
434 switch (failed) {
435 case -EFAULT:
436 FTRACE_WARN_ON_ONCE(1);
437 pr_info("ftrace faulted on modifying ");
438 print_ip_sym(ip);
439 break;
440 case -EINVAL:
441 FTRACE_WARN_ON_ONCE(1);
442 pr_info("ftrace failed to modify ");
443 print_ip_sym(ip);
444 print_ip_ins(" actual: ", (unsigned char *)ip);
445 printk(KERN_CONT "\n");
446 break;
447 case -EPERM:
448 FTRACE_WARN_ON_ONCE(1);
449 pr_info("ftrace faulted on writing ");
450 print_ip_sym(ip);
451 break;
452 default:
453 FTRACE_WARN_ON_ONCE(1);
454 pr_info("ftrace faulted on unknown error ");
455 print_ip_sym(ip);
456 }
457}
458
326 459
327static int 460static int
328__ftrace_replace_code(struct dyn_ftrace *rec, 461__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
329 unsigned char *nop, int enable)
330{ 462{
331 unsigned long ip, fl; 463 unsigned long ip, fl;
332 unsigned char *call, *old, *new; 464 unsigned long ftrace_addr;
465
466 ftrace_addr = (unsigned long)ftrace_caller;
333 467
334 ip = rec->ip; 468 ip = rec->ip;
335 469
@@ -388,34 +522,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
388 } 522 }
389 } 523 }
390 524
391 call = ftrace_call_replace(ip, FTRACE_ADDR); 525 if (rec->flags & FTRACE_FL_ENABLED)
392 526 return ftrace_make_call(rec, ftrace_addr);
393 if (rec->flags & FTRACE_FL_ENABLED) { 527 else
394 old = nop; 528 return ftrace_make_nop(NULL, rec, ftrace_addr);
395 new = call;
396 } else {
397 old = call;
398 new = nop;
399 }
400
401 return ftrace_modify_code(ip, old, new);
402} 529}
403 530
404static void ftrace_replace_code(int enable) 531static void ftrace_replace_code(int enable)
405{ 532{
406 int i, failed; 533 int i, failed;
407 unsigned char *nop = NULL;
408 struct dyn_ftrace *rec; 534 struct dyn_ftrace *rec;
409 struct ftrace_page *pg; 535 struct ftrace_page *pg;
410 536
411 nop = ftrace_nop_replace();
412
413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 537 for (pg = ftrace_pages_start; pg; pg = pg->next) {
414 for (i = 0; i < pg->index; i++) { 538 for (i = 0; i < pg->index; i++) {
415 rec = &pg->records[i]; 539 rec = &pg->records[i];
416 540
417 /* don't modify code that has already faulted */ 541 /*
418 if (rec->flags & FTRACE_FL_FAILED) 542 * Skip over free records and records that have
543 * failed.
544 */
545 if (rec->flags & FTRACE_FL_FREE ||
546 rec->flags & FTRACE_FL_FAILED)
419 continue; 547 continue;
420 548
421 /* ignore updates to this record's mcount site */ 549 /* ignore updates to this record's mcount site */
@@ -426,68 +554,30 @@ static void ftrace_replace_code(int enable)
426 unfreeze_record(rec); 554 unfreeze_record(rec);
427 } 555 }
428 556
429 failed = __ftrace_replace_code(rec, nop, enable); 557 failed = __ftrace_replace_code(rec, enable);
430 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 558 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
431 rec->flags |= FTRACE_FL_FAILED; 559 rec->flags |= FTRACE_FL_FAILED;
432 if ((system_state == SYSTEM_BOOTING) || 560 if ((system_state == SYSTEM_BOOTING) ||
433 !core_kernel_text(rec->ip)) { 561 !core_kernel_text(rec->ip)) {
434 ftrace_free_rec(rec); 562 ftrace_free_rec(rec);
435 } 563 } else
564 ftrace_bug(failed, rec->ip);
436 } 565 }
437 } 566 }
438 } 567 }
439} 568}
440 569
441static void print_ip_ins(const char *fmt, unsigned char *p)
442{
443 int i;
444
445 printk(KERN_CONT "%s", fmt);
446
447 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
448 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
449}
450
451static int 570static int
452ftrace_code_disable(struct dyn_ftrace *rec) 571ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
453{ 572{
454 unsigned long ip; 573 unsigned long ip;
455 unsigned char *nop, *call;
456 int ret; 574 int ret;
457 575
458 ip = rec->ip; 576 ip = rec->ip;
459 577
460 nop = ftrace_nop_replace(); 578 ret = ftrace_make_nop(mod, rec, mcount_addr);
461 call = ftrace_call_replace(ip, mcount_addr);
462
463 ret = ftrace_modify_code(ip, call, nop);
464 if (ret) { 579 if (ret) {
465 switch (ret) { 580 ftrace_bug(ret, ip);
466 case -EFAULT:
467 FTRACE_WARN_ON_ONCE(1);
468 pr_info("ftrace faulted on modifying ");
469 print_ip_sym(ip);
470 break;
471 case -EINVAL:
472 FTRACE_WARN_ON_ONCE(1);
473 pr_info("ftrace failed to modify ");
474 print_ip_sym(ip);
475 print_ip_ins(" expected: ", call);
476 print_ip_ins(" actual: ", (unsigned char *)ip);
477 print_ip_ins(" replace: ", nop);
478 printk(KERN_CONT "\n");
479 break;
480 case -EPERM:
481 FTRACE_WARN_ON_ONCE(1);
482 pr_info("ftrace faulted on writing ");
483 print_ip_sym(ip);
484 break;
485 default:
486 FTRACE_WARN_ON_ONCE(1);
487 pr_info("ftrace faulted on unknown error ");
488 print_ip_sym(ip);
489 }
490
491 rec->flags |= FTRACE_FL_FAILED; 581 rec->flags |= FTRACE_FL_FAILED;
492 return 0; 582 return 0;
493 } 583 }
@@ -506,6 +596,11 @@ static int __ftrace_modify_code(void *data)
506 if (*command & FTRACE_UPDATE_TRACE_FUNC) 596 if (*command & FTRACE_UPDATE_TRACE_FUNC)
507 ftrace_update_ftrace_func(ftrace_trace_function); 597 ftrace_update_ftrace_func(ftrace_trace_function);
508 598
599 if (*command & FTRACE_START_FUNC_RET)
600 ftrace_enable_ftrace_graph_caller();
601 else if (*command & FTRACE_STOP_FUNC_RET)
602 ftrace_disable_ftrace_graph_caller();
603
509 return 0; 604 return 0;
510} 605}
511 606
@@ -515,43 +610,43 @@ static void ftrace_run_update_code(int command)
515} 610}
516 611
517static ftrace_func_t saved_ftrace_func; 612static ftrace_func_t saved_ftrace_func;
518static int ftrace_start; 613static int ftrace_start_up;
519static DEFINE_MUTEX(ftrace_start_lock);
520 614
521static void ftrace_startup(void) 615static void ftrace_startup_enable(int command)
522{ 616{
523 int command = 0;
524
525 if (unlikely(ftrace_disabled))
526 return;
527
528 mutex_lock(&ftrace_start_lock);
529 ftrace_start++;
530 command |= FTRACE_ENABLE_CALLS;
531
532 if (saved_ftrace_func != ftrace_trace_function) { 617 if (saved_ftrace_func != ftrace_trace_function) {
533 saved_ftrace_func = ftrace_trace_function; 618 saved_ftrace_func = ftrace_trace_function;
534 command |= FTRACE_UPDATE_TRACE_FUNC; 619 command |= FTRACE_UPDATE_TRACE_FUNC;
535 } 620 }
536 621
537 if (!command || !ftrace_enabled) 622 if (!command || !ftrace_enabled)
538 goto out; 623 return;
539 624
540 ftrace_run_update_code(command); 625 ftrace_run_update_code(command);
541 out:
542 mutex_unlock(&ftrace_start_lock);
543} 626}
544 627
545static void ftrace_shutdown(void) 628static void ftrace_startup(int command)
546{ 629{
547 int command = 0; 630 if (unlikely(ftrace_disabled))
631 return;
632
633 mutex_lock(&ftrace_start_lock);
634 ftrace_start_up++;
635 command |= FTRACE_ENABLE_CALLS;
548 636
637 ftrace_startup_enable(command);
638
639 mutex_unlock(&ftrace_start_lock);
640}
641
642static void ftrace_shutdown(int command)
643{
549 if (unlikely(ftrace_disabled)) 644 if (unlikely(ftrace_disabled))
550 return; 645 return;
551 646
552 mutex_lock(&ftrace_start_lock); 647 mutex_lock(&ftrace_start_lock);
553 ftrace_start--; 648 ftrace_start_up--;
554 if (!ftrace_start) 649 if (!ftrace_start_up)
555 command |= FTRACE_DISABLE_CALLS; 650 command |= FTRACE_DISABLE_CALLS;
556 651
557 if (saved_ftrace_func != ftrace_trace_function) { 652 if (saved_ftrace_func != ftrace_trace_function) {
@@ -577,8 +672,8 @@ static void ftrace_startup_sysctl(void)
577 mutex_lock(&ftrace_start_lock); 672 mutex_lock(&ftrace_start_lock);
578 /* Force update next time */ 673 /* Force update next time */
579 saved_ftrace_func = NULL; 674 saved_ftrace_func = NULL;
580 /* ftrace_start is true if we want ftrace running */ 675 /* ftrace_start_up is true if we want ftrace running */
581 if (ftrace_start) 676 if (ftrace_start_up)
582 command |= FTRACE_ENABLE_CALLS; 677 command |= FTRACE_ENABLE_CALLS;
583 678
584 ftrace_run_update_code(command); 679 ftrace_run_update_code(command);
@@ -593,8 +688,8 @@ static void ftrace_shutdown_sysctl(void)
593 return; 688 return;
594 689
595 mutex_lock(&ftrace_start_lock); 690 mutex_lock(&ftrace_start_lock);
596 /* ftrace_start is true if ftrace is running */ 691 /* ftrace_start_up is true if ftrace is running */
597 if (ftrace_start) 692 if (ftrace_start_up)
598 command |= FTRACE_DISABLE_CALLS; 693 command |= FTRACE_DISABLE_CALLS;
599 694
600 ftrace_run_update_code(command); 695 ftrace_run_update_code(command);
@@ -605,7 +700,7 @@ static cycle_t ftrace_update_time;
605static unsigned long ftrace_update_cnt; 700static unsigned long ftrace_update_cnt;
606unsigned long ftrace_update_tot_cnt; 701unsigned long ftrace_update_tot_cnt;
607 702
608static int ftrace_update_code(void) 703static int ftrace_update_code(struct module *mod)
609{ 704{
610 struct dyn_ftrace *p, *t; 705 struct dyn_ftrace *p, *t;
611 cycle_t start, stop; 706 cycle_t start, stop;
@@ -622,7 +717,7 @@ static int ftrace_update_code(void)
622 list_del_init(&p->list); 717 list_del_init(&p->list);
623 718
624 /* convert record (i.e, patch mcount-call with NOP) */ 719 /* convert record (i.e, patch mcount-call with NOP) */
625 if (ftrace_code_disable(p)) { 720 if (ftrace_code_disable(mod, p)) {
626 p->flags |= FTRACE_FL_CONVERTED; 721 p->flags |= FTRACE_FL_CONVERTED;
627 ftrace_update_cnt++; 722 ftrace_update_cnt++;
628 } else 723 } else
@@ -690,7 +785,6 @@ enum {
690#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 785#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
691 786
692struct ftrace_iterator { 787struct ftrace_iterator {
693 loff_t pos;
694 struct ftrace_page *pg; 788 struct ftrace_page *pg;
695 unsigned idx; 789 unsigned idx;
696 unsigned flags; 790 unsigned flags;
@@ -715,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
715 iter->pg = iter->pg->next; 809 iter->pg = iter->pg->next;
716 iter->idx = 0; 810 iter->idx = 0;
717 goto retry; 811 goto retry;
812 } else {
813 iter->idx = -1;
718 } 814 }
719 } else { 815 } else {
720 rec = &iter->pg->records[iter->idx++]; 816 rec = &iter->pg->records[iter->idx++];
@@ -737,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
737 } 833 }
738 spin_unlock(&ftrace_lock); 834 spin_unlock(&ftrace_lock);
739 835
740 iter->pos = *pos;
741
742 return rec; 836 return rec;
743} 837}
744 838
@@ -746,13 +840,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
746{ 840{
747 struct ftrace_iterator *iter = m->private; 841 struct ftrace_iterator *iter = m->private;
748 void *p = NULL; 842 void *p = NULL;
749 loff_t l = -1;
750 843
751 if (*pos > iter->pos) 844 if (*pos > 0) {
752 *pos = iter->pos; 845 if (iter->idx < 0)
846 return p;
847 (*pos)--;
848 iter->idx--;
849 }
753 850
754 l = *pos; 851 p = t_next(m, p, pos);
755 p = t_next(m, p, &l);
756 852
757 return p; 853 return p;
758} 854}
@@ -763,21 +859,15 @@ static void t_stop(struct seq_file *m, void *p)
763 859
764static int t_show(struct seq_file *m, void *v) 860static int t_show(struct seq_file *m, void *v)
765{ 861{
766 struct ftrace_iterator *iter = m->private;
767 struct dyn_ftrace *rec = v; 862 struct dyn_ftrace *rec = v;
768 char str[KSYM_SYMBOL_LEN]; 863 char str[KSYM_SYMBOL_LEN];
769 int ret = 0;
770 864
771 if (!rec) 865 if (!rec)
772 return 0; 866 return 0;
773 867
774 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 868 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
775 869
776 ret = seq_printf(m, "%s\n", str); 870 seq_printf(m, "%s\n", str);
777 if (ret < 0) {
778 iter->pos--;
779 iter->idx--;
780 }
781 871
782 return 0; 872 return 0;
783} 873}
@@ -803,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
803 return -ENOMEM; 893 return -ENOMEM;
804 894
805 iter->pg = ftrace_pages_start; 895 iter->pg = ftrace_pages_start;
806 iter->pos = 0;
807 896
808 ret = seq_open(file, &show_ftrace_seq_ops); 897 ret = seq_open(file, &show_ftrace_seq_ops);
809 if (!ret) { 898 if (!ret) {
@@ -890,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
890 979
891 if (file->f_mode & FMODE_READ) { 980 if (file->f_mode & FMODE_READ) {
892 iter->pg = ftrace_pages_start; 981 iter->pg = ftrace_pages_start;
893 iter->pos = 0;
894 iter->flags = enable ? FTRACE_ITER_FILTER : 982 iter->flags = enable ? FTRACE_ITER_FILTER :
895 FTRACE_ITER_NOTRACE; 983 FTRACE_ITER_NOTRACE;
896 984
@@ -959,6 +1047,13 @@ ftrace_match(unsigned char *buff, int len, int enable)
959 int type = MATCH_FULL; 1047 int type = MATCH_FULL;
960 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1048 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
961 unsigned i, match = 0, search_len = 0; 1049 unsigned i, match = 0, search_len = 0;
1050 int not = 0;
1051
1052 if (buff[0] == '!') {
1053 not = 1;
1054 buff++;
1055 len--;
1056 }
962 1057
963 for (i = 0; i < len; i++) { 1058 for (i = 0; i < len; i++) {
964 if (buff[i] == '*') { 1059 if (buff[i] == '*') {
@@ -1012,8 +1107,12 @@ ftrace_match(unsigned char *buff, int len, int enable)
1012 matched = 1; 1107 matched = 1;
1013 break; 1108 break;
1014 } 1109 }
1015 if (matched) 1110 if (matched) {
1016 rec->flags |= flag; 1111 if (not)
1112 rec->flags &= ~flag;
1113 else
1114 rec->flags |= flag;
1115 }
1017 } 1116 }
1018 pg = pg->next; 1117 pg = pg->next;
1019 } 1118 }
@@ -1181,7 +1280,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1181 1280
1182 mutex_lock(&ftrace_sysctl_lock); 1281 mutex_lock(&ftrace_sysctl_lock);
1183 mutex_lock(&ftrace_start_lock); 1282 mutex_lock(&ftrace_start_lock);
1184 if (ftrace_start && ftrace_enabled) 1283 if (ftrace_start_up && ftrace_enabled)
1185 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1284 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1186 mutex_unlock(&ftrace_start_lock); 1285 mutex_unlock(&ftrace_start_lock);
1187 mutex_unlock(&ftrace_sysctl_lock); 1286 mutex_unlock(&ftrace_sysctl_lock);
@@ -1233,12 +1332,233 @@ static struct file_operations ftrace_notrace_fops = {
1233 .release = ftrace_notrace_release, 1332 .release = ftrace_notrace_release,
1234}; 1333};
1235 1334
1236static __init int ftrace_init_debugfs(void) 1335#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1336
1337static DEFINE_MUTEX(graph_lock);
1338
1339int ftrace_graph_count;
1340unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1341
1342static void *
1343g_next(struct seq_file *m, void *v, loff_t *pos)
1237{ 1344{
1238 struct dentry *d_tracer; 1345 unsigned long *array = m->private;
1239 struct dentry *entry; 1346 int index = *pos;
1240 1347
1241 d_tracer = tracing_init_dentry(); 1348 (*pos)++;
1349
1350 if (index >= ftrace_graph_count)
1351 return NULL;
1352
1353 return &array[index];
1354}
1355
1356static void *g_start(struct seq_file *m, loff_t *pos)
1357{
1358 void *p = NULL;
1359
1360 mutex_lock(&graph_lock);
1361
1362 p = g_next(m, p, pos);
1363
1364 return p;
1365}
1366
1367static void g_stop(struct seq_file *m, void *p)
1368{
1369 mutex_unlock(&graph_lock);
1370}
1371
1372static int g_show(struct seq_file *m, void *v)
1373{
1374 unsigned long *ptr = v;
1375 char str[KSYM_SYMBOL_LEN];
1376
1377 if (!ptr)
1378 return 0;
1379
1380 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1381
1382 seq_printf(m, "%s\n", str);
1383
1384 return 0;
1385}
1386
1387static struct seq_operations ftrace_graph_seq_ops = {
1388 .start = g_start,
1389 .next = g_next,
1390 .stop = g_stop,
1391 .show = g_show,
1392};
1393
1394static int
1395ftrace_graph_open(struct inode *inode, struct file *file)
1396{
1397 int ret = 0;
1398
1399 if (unlikely(ftrace_disabled))
1400 return -ENODEV;
1401
1402 mutex_lock(&graph_lock);
1403 if ((file->f_mode & FMODE_WRITE) &&
1404 !(file->f_flags & O_APPEND)) {
1405 ftrace_graph_count = 0;
1406 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1407 }
1408
1409 if (file->f_mode & FMODE_READ) {
1410 ret = seq_open(file, &ftrace_graph_seq_ops);
1411 if (!ret) {
1412 struct seq_file *m = file->private_data;
1413 m->private = ftrace_graph_funcs;
1414 }
1415 } else
1416 file->private_data = ftrace_graph_funcs;
1417 mutex_unlock(&graph_lock);
1418
1419 return ret;
1420}
1421
1422static ssize_t
1423ftrace_graph_read(struct file *file, char __user *ubuf,
1424 size_t cnt, loff_t *ppos)
1425{
1426 if (file->f_mode & FMODE_READ)
1427 return seq_read(file, ubuf, cnt, ppos);
1428 else
1429 return -EPERM;
1430}
1431
1432static int
1433ftrace_set_func(unsigned long *array, int idx, char *buffer)
1434{
1435 char str[KSYM_SYMBOL_LEN];
1436 struct dyn_ftrace *rec;
1437 struct ftrace_page *pg;
1438 int found = 0;
1439 int i, j;
1440
1441 if (ftrace_disabled)
1442 return -ENODEV;
1443
1444 /* should not be called from interrupt context */
1445 spin_lock(&ftrace_lock);
1446
1447 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1448 for (i = 0; i < pg->index; i++) {
1449 rec = &pg->records[i];
1450
1451 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1452 continue;
1453
1454 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1455 if (strcmp(str, buffer) == 0) {
1456 found = 1;
1457 for (j = 0; j < idx; j++)
1458 if (array[j] == rec->ip) {
1459 found = 0;
1460 break;
1461 }
1462 if (found)
1463 array[idx] = rec->ip;
1464 break;
1465 }
1466 }
1467 }
1468 spin_unlock(&ftrace_lock);
1469
1470 return found ? 0 : -EINVAL;
1471}
1472
1473static ssize_t
1474ftrace_graph_write(struct file *file, const char __user *ubuf,
1475 size_t cnt, loff_t *ppos)
1476{
1477 unsigned char buffer[FTRACE_BUFF_MAX+1];
1478 unsigned long *array;
1479 size_t read = 0;
1480 ssize_t ret;
1481 int index = 0;
1482 char ch;
1483
1484 if (!cnt || cnt < 0)
1485 return 0;
1486
1487 mutex_lock(&graph_lock);
1488
1489 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
1490 ret = -EBUSY;
1491 goto out;
1492 }
1493
1494 if (file->f_mode & FMODE_READ) {
1495 struct seq_file *m = file->private_data;
1496 array = m->private;
1497 } else
1498 array = file->private_data;
1499
1500 ret = get_user(ch, ubuf++);
1501 if (ret)
1502 goto out;
1503 read++;
1504 cnt--;
1505
1506 /* skip white space */
1507 while (cnt && isspace(ch)) {
1508 ret = get_user(ch, ubuf++);
1509 if (ret)
1510 goto out;
1511 read++;
1512 cnt--;
1513 }
1514
1515 if (isspace(ch)) {
1516 *ppos += read;
1517 ret = read;
1518 goto out;
1519 }
1520
1521 while (cnt && !isspace(ch)) {
1522 if (index < FTRACE_BUFF_MAX)
1523 buffer[index++] = ch;
1524 else {
1525 ret = -EINVAL;
1526 goto out;
1527 }
1528 ret = get_user(ch, ubuf++);
1529 if (ret)
1530 goto out;
1531 read++;
1532 cnt--;
1533 }
1534 buffer[index] = 0;
1535
1536 /* we allow only one at a time */
1537 ret = ftrace_set_func(array, ftrace_graph_count, buffer);
1538 if (ret)
1539 goto out;
1540
1541 ftrace_graph_count++;
1542
1543 file->f_pos += read;
1544
1545 ret = read;
1546 out:
1547 mutex_unlock(&graph_lock);
1548
1549 return ret;
1550}
1551
1552static const struct file_operations ftrace_graph_fops = {
1553 .open = ftrace_graph_open,
1554 .read = ftrace_graph_read,
1555 .write = ftrace_graph_write,
1556};
1557#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1558
1559static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
1560{
1561 struct dentry *entry;
1242 1562
1243 entry = debugfs_create_file("available_filter_functions", 0444, 1563 entry = debugfs_create_file("available_filter_functions", 0444,
1244 d_tracer, NULL, &ftrace_avail_fops); 1564 d_tracer, NULL, &ftrace_avail_fops);
@@ -1263,12 +1583,20 @@ static __init int ftrace_init_debugfs(void)
1263 pr_warning("Could not create debugfs " 1583 pr_warning("Could not create debugfs "
1264 "'set_ftrace_notrace' entry\n"); 1584 "'set_ftrace_notrace' entry\n");
1265 1585
1586#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1587 entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
1588 NULL,
1589 &ftrace_graph_fops);
1590 if (!entry)
1591 pr_warning("Could not create debugfs "
1592 "'set_graph_function' entry\n");
1593#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1594
1266 return 0; 1595 return 0;
1267} 1596}
1268 1597
1269fs_initcall(ftrace_init_debugfs); 1598static int ftrace_convert_nops(struct module *mod,
1270 1599 unsigned long *start,
1271static int ftrace_convert_nops(unsigned long *start,
1272 unsigned long *end) 1600 unsigned long *end)
1273{ 1601{
1274 unsigned long *p; 1602 unsigned long *p;
@@ -1279,23 +1607,32 @@ static int ftrace_convert_nops(unsigned long *start,
1279 p = start; 1607 p = start;
1280 while (p < end) { 1608 while (p < end) {
1281 addr = ftrace_call_adjust(*p++); 1609 addr = ftrace_call_adjust(*p++);
1610 /*
1611 * Some architecture linkers will pad between
1612 * the different mcount_loc sections of different
1613 * object files to satisfy alignments.
1614 * Skip any NULL pointers.
1615 */
1616 if (!addr)
1617 continue;
1282 ftrace_record_ip(addr); 1618 ftrace_record_ip(addr);
1283 } 1619 }
1284 1620
1285 /* disable interrupts to prevent kstop machine */ 1621 /* disable interrupts to prevent kstop machine */
1286 local_irq_save(flags); 1622 local_irq_save(flags);
1287 ftrace_update_code(); 1623 ftrace_update_code(mod);
1288 local_irq_restore(flags); 1624 local_irq_restore(flags);
1289 mutex_unlock(&ftrace_start_lock); 1625 mutex_unlock(&ftrace_start_lock);
1290 1626
1291 return 0; 1627 return 0;
1292} 1628}
1293 1629
1294void ftrace_init_module(unsigned long *start, unsigned long *end) 1630void ftrace_init_module(struct module *mod,
1631 unsigned long *start, unsigned long *end)
1295{ 1632{
1296 if (ftrace_disabled || start == end) 1633 if (ftrace_disabled || start == end)
1297 return; 1634 return;
1298 ftrace_convert_nops(start, end); 1635 ftrace_convert_nops(mod, start, end);
1299} 1636}
1300 1637
1301extern unsigned long __start_mcount_loc[]; 1638extern unsigned long __start_mcount_loc[];
@@ -1325,7 +1662,8 @@ void __init ftrace_init(void)
1325 1662
1326 last_ftrace_enabled = ftrace_enabled = 1; 1663 last_ftrace_enabled = ftrace_enabled = 1;
1327 1664
1328 ret = ftrace_convert_nops(__start_mcount_loc, 1665 ret = ftrace_convert_nops(NULL,
1666 __start_mcount_loc,
1329 __stop_mcount_loc); 1667 __stop_mcount_loc);
1330 1668
1331 return; 1669 return;
@@ -1342,12 +1680,186 @@ static int __init ftrace_nodyn_init(void)
1342} 1680}
1343device_initcall(ftrace_nodyn_init); 1681device_initcall(ftrace_nodyn_init);
1344 1682
1345# define ftrace_startup() do { } while (0) 1683static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
1346# define ftrace_shutdown() do { } while (0) 1684static inline void ftrace_startup_enable(int command) { }
1685/* Keep as macros so we do not need to define the commands */
1686# define ftrace_startup(command) do { } while (0)
1687# define ftrace_shutdown(command) do { } while (0)
1347# define ftrace_startup_sysctl() do { } while (0) 1688# define ftrace_startup_sysctl() do { } while (0)
1348# define ftrace_shutdown_sysctl() do { } while (0) 1689# define ftrace_shutdown_sysctl() do { } while (0)
1349#endif /* CONFIG_DYNAMIC_FTRACE */ 1690#endif /* CONFIG_DYNAMIC_FTRACE */
1350 1691
1692static ssize_t
1693ftrace_pid_read(struct file *file, char __user *ubuf,
1694 size_t cnt, loff_t *ppos)
1695{
1696 char buf[64];
1697 int r;
1698
1699 if (ftrace_pid_trace == ftrace_swapper_pid)
1700 r = sprintf(buf, "swapper tasks\n");
1701 else if (ftrace_pid_trace)
1702 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
1703 else
1704 r = sprintf(buf, "no pid\n");
1705
1706 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1707}
1708
1709static void clear_ftrace_swapper(void)
1710{
1711 struct task_struct *p;
1712 int cpu;
1713
1714 get_online_cpus();
1715 for_each_online_cpu(cpu) {
1716 p = idle_task(cpu);
1717 clear_tsk_trace_trace(p);
1718 }
1719 put_online_cpus();
1720}
1721
1722static void set_ftrace_swapper(void)
1723{
1724 struct task_struct *p;
1725 int cpu;
1726
1727 get_online_cpus();
1728 for_each_online_cpu(cpu) {
1729 p = idle_task(cpu);
1730 set_tsk_trace_trace(p);
1731 }
1732 put_online_cpus();
1733}
1734
1735static void clear_ftrace_pid(struct pid *pid)
1736{
1737 struct task_struct *p;
1738
1739 do_each_pid_task(pid, PIDTYPE_PID, p) {
1740 clear_tsk_trace_trace(p);
1741 } while_each_pid_task(pid, PIDTYPE_PID, p);
1742 put_pid(pid);
1743}
1744
1745static void set_ftrace_pid(struct pid *pid)
1746{
1747 struct task_struct *p;
1748
1749 do_each_pid_task(pid, PIDTYPE_PID, p) {
1750 set_tsk_trace_trace(p);
1751 } while_each_pid_task(pid, PIDTYPE_PID, p);
1752}
1753
1754static void clear_ftrace_pid_task(struct pid **pid)
1755{
1756 if (*pid == ftrace_swapper_pid)
1757 clear_ftrace_swapper();
1758 else
1759 clear_ftrace_pid(*pid);
1760
1761 *pid = NULL;
1762}
1763
1764static void set_ftrace_pid_task(struct pid *pid)
1765{
1766 if (pid == ftrace_swapper_pid)
1767 set_ftrace_swapper();
1768 else
1769 set_ftrace_pid(pid);
1770}
1771
1772static ssize_t
1773ftrace_pid_write(struct file *filp, const char __user *ubuf,
1774 size_t cnt, loff_t *ppos)
1775{
1776 struct pid *pid;
1777 char buf[64];
1778 long val;
1779 int ret;
1780
1781 if (cnt >= sizeof(buf))
1782 return -EINVAL;
1783
1784 if (copy_from_user(&buf, ubuf, cnt))
1785 return -EFAULT;
1786
1787 buf[cnt] = 0;
1788
1789 ret = strict_strtol(buf, 10, &val);
1790 if (ret < 0)
1791 return ret;
1792
1793 mutex_lock(&ftrace_start_lock);
1794 if (val < 0) {
1795 /* disable pid tracing */
1796 if (!ftrace_pid_trace)
1797 goto out;
1798
1799 clear_ftrace_pid_task(&ftrace_pid_trace);
1800
1801 } else {
1802 /* swapper task is special */
1803 if (!val) {
1804 pid = ftrace_swapper_pid;
1805 if (pid == ftrace_pid_trace)
1806 goto out;
1807 } else {
1808 pid = find_get_pid(val);
1809
1810 if (pid == ftrace_pid_trace) {
1811 put_pid(pid);
1812 goto out;
1813 }
1814 }
1815
1816 if (ftrace_pid_trace)
1817 clear_ftrace_pid_task(&ftrace_pid_trace);
1818
1819 if (!pid)
1820 goto out;
1821
1822 ftrace_pid_trace = pid;
1823
1824 set_ftrace_pid_task(ftrace_pid_trace);
1825 }
1826
1827 /* update the function call */
1828 ftrace_update_pid_func();
1829 ftrace_startup_enable(0);
1830
1831 out:
1832 mutex_unlock(&ftrace_start_lock);
1833
1834 return cnt;
1835}
1836
1837static struct file_operations ftrace_pid_fops = {
1838 .read = ftrace_pid_read,
1839 .write = ftrace_pid_write,
1840};
1841
1842static __init int ftrace_init_debugfs(void)
1843{
1844 struct dentry *d_tracer;
1845 struct dentry *entry;
1846
1847 d_tracer = tracing_init_dentry();
1848 if (!d_tracer)
1849 return 0;
1850
1851 ftrace_init_dyn_debugfs(d_tracer);
1852
1853 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
1854 NULL, &ftrace_pid_fops);
1855 if (!entry)
1856 pr_warning("Could not create debugfs "
1857 "'set_ftrace_pid' entry\n");
1858 return 0;
1859}
1860
1861fs_initcall(ftrace_init_debugfs);
1862
1351/** 1863/**
1352 * ftrace_kill - kill ftrace 1864 * ftrace_kill - kill ftrace
1353 * 1865 *
@@ -1381,10 +1893,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
1381 return -1; 1893 return -1;
1382 1894
1383 mutex_lock(&ftrace_sysctl_lock); 1895 mutex_lock(&ftrace_sysctl_lock);
1896
1384 ret = __register_ftrace_function(ops); 1897 ret = __register_ftrace_function(ops);
1385 ftrace_startup(); 1898 ftrace_startup(0);
1386 mutex_unlock(&ftrace_sysctl_lock);
1387 1899
1900 mutex_unlock(&ftrace_sysctl_lock);
1388 return ret; 1901 return ret;
1389} 1902}
1390 1903
@@ -1400,7 +1913,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1400 1913
1401 mutex_lock(&ftrace_sysctl_lock); 1914 mutex_lock(&ftrace_sysctl_lock);
1402 ret = __unregister_ftrace_function(ops); 1915 ret = __unregister_ftrace_function(ops);
1403 ftrace_shutdown(); 1916 ftrace_shutdown(0);
1404 mutex_unlock(&ftrace_sysctl_lock); 1917 mutex_unlock(&ftrace_sysctl_lock);
1405 1918
1406 return ret; 1919 return ret;
@@ -1449,3 +1962,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1449 return ret; 1962 return ret;
1450} 1963}
1451 1964
1965#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1966
1967static atomic_t ftrace_graph_active;
1968
1969int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1970{
1971 return 0;
1972}
1973
1974/* The callbacks that hook a function */
1975trace_func_graph_ret_t ftrace_graph_return =
1976 (trace_func_graph_ret_t)ftrace_stub;
1977trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
1978
1979/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
1980static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
1981{
1982 int i;
1983 int ret = 0;
1984 unsigned long flags;
1985 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
1986 struct task_struct *g, *t;
1987
1988 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
1989 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
1990 * sizeof(struct ftrace_ret_stack),
1991 GFP_KERNEL);
1992 if (!ret_stack_list[i]) {
1993 start = 0;
1994 end = i;
1995 ret = -ENOMEM;
1996 goto free;
1997 }
1998 }
1999
2000 read_lock_irqsave(&tasklist_lock, flags);
2001 do_each_thread(g, t) {
2002 if (start == end) {
2003 ret = -EAGAIN;
2004 goto unlock;
2005 }
2006
2007 if (t->ret_stack == NULL) {
2008 t->curr_ret_stack = -1;
2009 /* Make sure IRQs see the -1 first: */
2010 barrier();
2011 t->ret_stack = ret_stack_list[start++];
2012 atomic_set(&t->tracing_graph_pause, 0);
2013 atomic_set(&t->trace_overrun, 0);
2014 }
2015 } while_each_thread(g, t);
2016
2017unlock:
2018 read_unlock_irqrestore(&tasklist_lock, flags);
2019free:
2020 for (i = start; i < end; i++)
2021 kfree(ret_stack_list[i]);
2022 return ret;
2023}
2024
2025/* Allocate a return stack for each task */
2026static int start_graph_tracing(void)
2027{
2028 struct ftrace_ret_stack **ret_stack_list;
2029 int ret;
2030
2031 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2032 sizeof(struct ftrace_ret_stack *),
2033 GFP_KERNEL);
2034
2035 if (!ret_stack_list)
2036 return -ENOMEM;
2037
2038 do {
2039 ret = alloc_retstack_tasklist(ret_stack_list);
2040 } while (ret == -EAGAIN);
2041
2042 kfree(ret_stack_list);
2043 return ret;
2044}
2045
2046int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2047 trace_func_graph_ent_t entryfunc)
2048{
2049 int ret = 0;
2050
2051 mutex_lock(&ftrace_sysctl_lock);
2052
2053 atomic_inc(&ftrace_graph_active);
2054 ret = start_graph_tracing();
2055 if (ret) {
2056 atomic_dec(&ftrace_graph_active);
2057 goto out;
2058 }
2059
2060 ftrace_graph_return = retfunc;
2061 ftrace_graph_entry = entryfunc;
2062
2063 ftrace_startup(FTRACE_START_FUNC_RET);
2064
2065out:
2066 mutex_unlock(&ftrace_sysctl_lock);
2067 return ret;
2068}
2069
2070void unregister_ftrace_graph(void)
2071{
2072 mutex_lock(&ftrace_sysctl_lock);
2073
2074 atomic_dec(&ftrace_graph_active);
2075 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2076 ftrace_graph_entry = ftrace_graph_entry_stub;
2077 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2078
2079 mutex_unlock(&ftrace_sysctl_lock);
2080}
2081
2082/* Allocate a return stack for newly created task */
2083void ftrace_graph_init_task(struct task_struct *t)
2084{
2085 if (atomic_read(&ftrace_graph_active)) {
2086 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2087 * sizeof(struct ftrace_ret_stack),
2088 GFP_KERNEL);
2089 if (!t->ret_stack)
2090 return;
2091 t->curr_ret_stack = -1;
2092 atomic_set(&t->tracing_graph_pause, 0);
2093 atomic_set(&t->trace_overrun, 0);
2094 } else
2095 t->ret_stack = NULL;
2096}
2097
2098void ftrace_graph_exit_task(struct task_struct *t)
2099{
2100 struct ftrace_ret_stack *ret_stack = t->ret_stack;
2101
2102 t->ret_stack = NULL;
2103 /* NULL must become visible to IRQs before we free it: */
2104 barrier();
2105
2106 kfree(ret_stack);
2107}
2108
2109void ftrace_graph_stop(void)
2110{
2111 ftrace_stop();
2112}
2113#endif
2114
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 668bbb5ef2bd..a9d9760dc7b6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -18,8 +18,46 @@
18 18
19#include "trace.h" 19#include "trace.h"
20 20
21/* Global flag to disable all recording to ring buffers */ 21/*
22static int ring_buffers_off __read_mostly; 22 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to.
25 * Turning this switch on, makes it OK to write to the
26 * ring buffer, if the ring buffer is enabled itself.
27 *
28 * There's three layers that must be on in order to write
29 * to the ring buffer.
30 *
31 * 1) This global flag must be set.
32 * 2) The ring buffer must be enabled for recording.
33 * 3) The per cpu buffer must be enabled for recording.
34 *
35 * In case of an anomaly, this global flag has a bit set that
36 * will permantly disable all ring buffers.
37 */
38
39/*
40 * Global flag to disable all recording to ring buffers
41 * This has two bits: ON, DISABLED
42 *
43 * ON DISABLED
44 * ---- ----------
45 * 0 0 : ring buffers are off
46 * 1 0 : ring buffers are on
47 * X 1 : ring buffers are permanently disabled
48 */
49
50enum {
51 RB_BUFFERS_ON_BIT = 0,
52 RB_BUFFERS_DISABLED_BIT = 1,
53};
54
55enum {
56 RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58};
59
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
23 61
24/** 62/**
25 * tracing_on - enable all tracing buffers 63 * tracing_on - enable all tracing buffers
@@ -29,8 +67,9 @@ static int ring_buffers_off __read_mostly;
29 */ 67 */
30void tracing_on(void) 68void tracing_on(void)
31{ 69{
32 ring_buffers_off = 0; 70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
33} 71}
72EXPORT_SYMBOL_GPL(tracing_on);
34 73
35/** 74/**
36 * tracing_off - turn off all tracing buffers 75 * tracing_off - turn off all tracing buffers
@@ -42,8 +81,22 @@ void tracing_on(void)
42 */ 81 */
43void tracing_off(void) 82void tracing_off(void)
44{ 83{
45 ring_buffers_off = 1; 84 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
46} 85}
86EXPORT_SYMBOL_GPL(tracing_off);
87
88/**
89 * tracing_off_permanent - permanently disable ring buffers
90 *
91 * This function, once called, will disable all ring buffers
92 * permanenty.
93 */
94void tracing_off_permanent(void)
95{
96 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
97}
98
99#include "trace.h"
47 100
48/* Up this if you want to test the TIME_EXTENTS and normalization */ 101/* Up this if you want to test the TIME_EXTENTS and normalization */
49#define DEBUG_SHIFT 0 102#define DEBUG_SHIFT 0
@@ -56,16 +109,18 @@ u64 ring_buffer_time_stamp(int cpu)
56 preempt_disable_notrace(); 109 preempt_disable_notrace();
57 /* shift to debug/test normalization and TIME_EXTENTS */ 110 /* shift to debug/test normalization and TIME_EXTENTS */
58 time = sched_clock() << DEBUG_SHIFT; 111 time = sched_clock() << DEBUG_SHIFT;
59 preempt_enable_notrace(); 112 preempt_enable_no_resched_notrace();
60 113
61 return time; 114 return time;
62} 115}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
63 117
64void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
65{ 119{
66 /* Just stupid testing the normalize function and deltas */ 120 /* Just stupid testing the normalize function and deltas */
67 *ts >>= DEBUG_SHIFT; 121 *ts >>= DEBUG_SHIFT;
68} 122}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
69 124
70#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
71#define RB_ALIGNMENT_SHIFT 2 126#define RB_ALIGNMENT_SHIFT 2
@@ -115,6 +170,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
115{ 170{
116 return rb_event_length(event); 171 return rb_event_length(event);
117} 172}
173EXPORT_SYMBOL_GPL(ring_buffer_event_length);
118 174
119/* inline for ring buffer fast paths */ 175/* inline for ring buffer fast paths */
120static inline void * 176static inline void *
@@ -136,28 +192,33 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
136{ 192{
137 return rb_event_data(event); 193 return rb_event_data(event);
138} 194}
195EXPORT_SYMBOL_GPL(ring_buffer_event_data);
139 196
140#define for_each_buffer_cpu(buffer, cpu) \ 197#define for_each_buffer_cpu(buffer, cpu) \
141 for_each_cpu_mask(cpu, buffer->cpumask) 198 for_each_cpu(cpu, buffer->cpumask)
142 199
143#define TS_SHIFT 27 200#define TS_SHIFT 27
144#define TS_MASK ((1ULL << TS_SHIFT) - 1) 201#define TS_MASK ((1ULL << TS_SHIFT) - 1)
145#define TS_DELTA_TEST (~TS_MASK) 202#define TS_DELTA_TEST (~TS_MASK)
146 203
147/* 204struct buffer_data_page {
148 * This hack stolen from mm/slob.c.
149 * We can store per page timing information in the page frame of the page.
150 * Thanks to Peter Zijlstra for suggesting this idea.
151 */
152struct buffer_page {
153 u64 time_stamp; /* page time stamp */ 205 u64 time_stamp; /* page time stamp */
154 local_t write; /* index for next write */
155 local_t commit; /* write commited index */ 206 local_t commit; /* write commited index */
207 unsigned char data[]; /* data of buffer page */
208};
209
210struct buffer_page {
211 local_t write; /* index for next write */
156 unsigned read; /* index for next read */ 212 unsigned read; /* index for next read */
157 struct list_head list; /* list of free pages */ 213 struct list_head list; /* list of free pages */
158 void *page; /* Actual data page */ 214 struct buffer_data_page *page; /* Actual data page */
159}; 215};
160 216
217static void rb_init_page(struct buffer_data_page *bpage)
218{
219 local_set(&bpage->commit, 0);
220}
221
161/* 222/*
162 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 223 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
163 * this issue out. 224 * this issue out.
@@ -179,7 +240,7 @@ static inline int test_time_stamp(u64 delta)
179 return 0; 240 return 0;
180} 241}
181 242
182#define BUF_PAGE_SIZE PAGE_SIZE 243#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
183 244
184/* 245/*
185 * head_page == tail_page && head == tail then buffer is empty. 246 * head_page == tail_page && head == tail then buffer is empty.
@@ -187,7 +248,8 @@ static inline int test_time_stamp(u64 delta)
187struct ring_buffer_per_cpu { 248struct ring_buffer_per_cpu {
188 int cpu; 249 int cpu;
189 struct ring_buffer *buffer; 250 struct ring_buffer *buffer;
190 spinlock_t lock; 251 spinlock_t reader_lock; /* serialize readers */
252 raw_spinlock_t lock;
191 struct lock_class_key lock_key; 253 struct lock_class_key lock_key;
192 struct list_head pages; 254 struct list_head pages;
193 struct buffer_page *head_page; /* read from head */ 255 struct buffer_page *head_page; /* read from head */
@@ -202,11 +264,10 @@ struct ring_buffer_per_cpu {
202}; 264};
203 265
204struct ring_buffer { 266struct ring_buffer {
205 unsigned long size;
206 unsigned pages; 267 unsigned pages;
207 unsigned flags; 268 unsigned flags;
208 int cpus; 269 int cpus;
209 cpumask_t cpumask; 270 cpumask_var_t cpumask;
210 atomic_t record_disabled; 271 atomic_t record_disabled;
211 272
212 struct mutex mutex; 273 struct mutex mutex;
@@ -221,32 +282,16 @@ struct ring_buffer_iter {
221 u64 read_stamp; 282 u64 read_stamp;
222}; 283};
223 284
285/* buffer may be either ring_buffer or ring_buffer_per_cpu */
224#define RB_WARN_ON(buffer, cond) \ 286#define RB_WARN_ON(buffer, cond) \
225 do { \ 287 ({ \
226 if (unlikely(cond)) { \ 288 int _____ret = unlikely(cond); \
227 atomic_inc(&buffer->record_disabled); \ 289 if (_____ret) { \
228 WARN_ON(1); \
229 } \
230 } while (0)
231
232#define RB_WARN_ON_RET(buffer, cond) \
233 do { \
234 if (unlikely(cond)) { \
235 atomic_inc(&buffer->record_disabled); \
236 WARN_ON(1); \
237 return -1; \
238 } \
239 } while (0)
240
241#define RB_WARN_ON_ONCE(buffer, cond) \
242 do { \
243 static int once; \
244 if (unlikely(cond) && !once) { \
245 once++; \
246 atomic_inc(&buffer->record_disabled); \ 290 atomic_inc(&buffer->record_disabled); \
247 WARN_ON(1); \ 291 WARN_ON(1); \
248 } \ 292 } \
249 } while (0) 293 _____ret; \
294 })
250 295
251/** 296/**
252 * check_pages - integrity check of buffer pages 297 * check_pages - integrity check of buffer pages
@@ -258,16 +303,20 @@ struct ring_buffer_iter {
258static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 303static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
259{ 304{
260 struct list_head *head = &cpu_buffer->pages; 305 struct list_head *head = &cpu_buffer->pages;
261 struct buffer_page *page, *tmp; 306 struct buffer_page *bpage, *tmp;
262 307
263 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); 308 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
264 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); 309 return -1;
310 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
311 return -1;
265 312
266 list_for_each_entry_safe(page, tmp, head, list) { 313 list_for_each_entry_safe(bpage, tmp, head, list) {
267 RB_WARN_ON_RET(cpu_buffer, 314 if (RB_WARN_ON(cpu_buffer,
268 page->list.next->prev != &page->list); 315 bpage->list.next->prev != &bpage->list))
269 RB_WARN_ON_RET(cpu_buffer, 316 return -1;
270 page->list.prev->next != &page->list); 317 if (RB_WARN_ON(cpu_buffer,
318 bpage->list.prev->next != &bpage->list))
319 return -1;
271 } 320 }
272 321
273 return 0; 322 return 0;
@@ -277,22 +326,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
277 unsigned nr_pages) 326 unsigned nr_pages)
278{ 327{
279 struct list_head *head = &cpu_buffer->pages; 328 struct list_head *head = &cpu_buffer->pages;
280 struct buffer_page *page, *tmp; 329 struct buffer_page *bpage, *tmp;
281 unsigned long addr; 330 unsigned long addr;
282 LIST_HEAD(pages); 331 LIST_HEAD(pages);
283 unsigned i; 332 unsigned i;
284 333
285 for (i = 0; i < nr_pages; i++) { 334 for (i = 0; i < nr_pages; i++) {
286 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 335 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
287 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 336 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
288 if (!page) 337 if (!bpage)
289 goto free_pages; 338 goto free_pages;
290 list_add(&page->list, &pages); 339 list_add(&bpage->list, &pages);
291 340
292 addr = __get_free_page(GFP_KERNEL); 341 addr = __get_free_page(GFP_KERNEL);
293 if (!addr) 342 if (!addr)
294 goto free_pages; 343 goto free_pages;
295 page->page = (void *)addr; 344 bpage->page = (void *)addr;
345 rb_init_page(bpage->page);
296 } 346 }
297 347
298 list_splice(&pages, head); 348 list_splice(&pages, head);
@@ -302,9 +352,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
302 return 0; 352 return 0;
303 353
304 free_pages: 354 free_pages:
305 list_for_each_entry_safe(page, tmp, &pages, list) { 355 list_for_each_entry_safe(bpage, tmp, &pages, list) {
306 list_del_init(&page->list); 356 list_del_init(&bpage->list);
307 free_buffer_page(page); 357 free_buffer_page(bpage);
308 } 358 }
309 return -ENOMEM; 359 return -ENOMEM;
310} 360}
@@ -313,7 +363,7 @@ static struct ring_buffer_per_cpu *
313rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 363rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
314{ 364{
315 struct ring_buffer_per_cpu *cpu_buffer; 365 struct ring_buffer_per_cpu *cpu_buffer;
316 struct buffer_page *page; 366 struct buffer_page *bpage;
317 unsigned long addr; 367 unsigned long addr;
318 int ret; 368 int ret;
319 369
@@ -324,19 +374,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
324 374
325 cpu_buffer->cpu = cpu; 375 cpu_buffer->cpu = cpu;
326 cpu_buffer->buffer = buffer; 376 cpu_buffer->buffer = buffer;
327 spin_lock_init(&cpu_buffer->lock); 377 spin_lock_init(&cpu_buffer->reader_lock);
378 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
328 INIT_LIST_HEAD(&cpu_buffer->pages); 379 INIT_LIST_HEAD(&cpu_buffer->pages);
329 380
330 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 381 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
331 GFP_KERNEL, cpu_to_node(cpu)); 382 GFP_KERNEL, cpu_to_node(cpu));
332 if (!page) 383 if (!bpage)
333 goto fail_free_buffer; 384 goto fail_free_buffer;
334 385
335 cpu_buffer->reader_page = page; 386 cpu_buffer->reader_page = bpage;
336 addr = __get_free_page(GFP_KERNEL); 387 addr = __get_free_page(GFP_KERNEL);
337 if (!addr) 388 if (!addr)
338 goto fail_free_reader; 389 goto fail_free_reader;
339 page->page = (void *)addr; 390 bpage->page = (void *)addr;
391 rb_init_page(bpage->page);
340 392
341 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 393 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
342 394
@@ -361,14 +413,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
361static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 413static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
362{ 414{
363 struct list_head *head = &cpu_buffer->pages; 415 struct list_head *head = &cpu_buffer->pages;
364 struct buffer_page *page, *tmp; 416 struct buffer_page *bpage, *tmp;
365 417
366 list_del_init(&cpu_buffer->reader_page->list); 418 list_del_init(&cpu_buffer->reader_page->list);
367 free_buffer_page(cpu_buffer->reader_page); 419 free_buffer_page(cpu_buffer->reader_page);
368 420
369 list_for_each_entry_safe(page, tmp, head, list) { 421 list_for_each_entry_safe(bpage, tmp, head, list) {
370 list_del_init(&page->list); 422 list_del_init(&bpage->list);
371 free_buffer_page(page); 423 free_buffer_page(bpage);
372 } 424 }
373 kfree(cpu_buffer); 425 kfree(cpu_buffer);
374} 426}
@@ -381,7 +433,7 @@ extern int ring_buffer_page_too_big(void);
381 433
382/** 434/**
383 * ring_buffer_alloc - allocate a new ring_buffer 435 * ring_buffer_alloc - allocate a new ring_buffer
384 * @size: the size in bytes that is needed. 436 * @size: the size in bytes per cpu that is needed.
385 * @flags: attributes to set for the ring buffer. 437 * @flags: attributes to set for the ring buffer.
386 * 438 *
387 * Currently the only flag that is available is the RB_FL_OVERWRITE 439 * Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -406,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
406 if (!buffer) 458 if (!buffer)
407 return NULL; 459 return NULL;
408 460
461 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
462 goto fail_free_buffer;
463
409 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 464 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
410 buffer->flags = flags; 465 buffer->flags = flags;
411 466
@@ -413,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
413 if (buffer->pages == 1) 468 if (buffer->pages == 1)
414 buffer->pages++; 469 buffer->pages++;
415 470
416 buffer->cpumask = cpu_possible_map; 471 cpumask_copy(buffer->cpumask, cpu_possible_mask);
417 buffer->cpus = nr_cpu_ids; 472 buffer->cpus = nr_cpu_ids;
418 473
419 bsize = sizeof(void *) * nr_cpu_ids; 474 bsize = sizeof(void *) * nr_cpu_ids;
420 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), 475 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
421 GFP_KERNEL); 476 GFP_KERNEL);
422 if (!buffer->buffers) 477 if (!buffer->buffers)
423 goto fail_free_buffer; 478 goto fail_free_cpumask;
424 479
425 for_each_buffer_cpu(buffer, cpu) { 480 for_each_buffer_cpu(buffer, cpu) {
426 buffer->buffers[cpu] = 481 buffer->buffers[cpu] =
@@ -440,10 +495,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
440 } 495 }
441 kfree(buffer->buffers); 496 kfree(buffer->buffers);
442 497
498 fail_free_cpumask:
499 free_cpumask_var(buffer->cpumask);
500
443 fail_free_buffer: 501 fail_free_buffer:
444 kfree(buffer); 502 kfree(buffer);
445 return NULL; 503 return NULL;
446} 504}
505EXPORT_SYMBOL_GPL(ring_buffer_alloc);
447 506
448/** 507/**
449 * ring_buffer_free - free a ring buffer. 508 * ring_buffer_free - free a ring buffer.
@@ -457,15 +516,18 @@ ring_buffer_free(struct ring_buffer *buffer)
457 for_each_buffer_cpu(buffer, cpu) 516 for_each_buffer_cpu(buffer, cpu)
458 rb_free_cpu_buffer(buffer->buffers[cpu]); 517 rb_free_cpu_buffer(buffer->buffers[cpu]);
459 518
519 free_cpumask_var(buffer->cpumask);
520
460 kfree(buffer); 521 kfree(buffer);
461} 522}
523EXPORT_SYMBOL_GPL(ring_buffer_free);
462 524
463static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 525static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
464 526
465static void 527static void
466rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) 528rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
467{ 529{
468 struct buffer_page *page; 530 struct buffer_page *bpage;
469 struct list_head *p; 531 struct list_head *p;
470 unsigned i; 532 unsigned i;
471 533
@@ -473,13 +535,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
473 synchronize_sched(); 535 synchronize_sched();
474 536
475 for (i = 0; i < nr_pages; i++) { 537 for (i = 0; i < nr_pages; i++) {
476 BUG_ON(list_empty(&cpu_buffer->pages)); 538 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
539 return;
477 p = cpu_buffer->pages.next; 540 p = cpu_buffer->pages.next;
478 page = list_entry(p, struct buffer_page, list); 541 bpage = list_entry(p, struct buffer_page, list);
479 list_del_init(&page->list); 542 list_del_init(&bpage->list);
480 free_buffer_page(page); 543 free_buffer_page(bpage);
481 } 544 }
482 BUG_ON(list_empty(&cpu_buffer->pages)); 545 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
546 return;
483 547
484 rb_reset_cpu(cpu_buffer); 548 rb_reset_cpu(cpu_buffer);
485 549
@@ -493,7 +557,7 @@ static void
493rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 557rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
494 struct list_head *pages, unsigned nr_pages) 558 struct list_head *pages, unsigned nr_pages)
495{ 559{
496 struct buffer_page *page; 560 struct buffer_page *bpage;
497 struct list_head *p; 561 struct list_head *p;
498 unsigned i; 562 unsigned i;
499 563
@@ -501,11 +565,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
501 synchronize_sched(); 565 synchronize_sched();
502 566
503 for (i = 0; i < nr_pages; i++) { 567 for (i = 0; i < nr_pages; i++) {
504 BUG_ON(list_empty(pages)); 568 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
569 return;
505 p = pages->next; 570 p = pages->next;
506 page = list_entry(p, struct buffer_page, list); 571 bpage = list_entry(p, struct buffer_page, list);
507 list_del_init(&page->list); 572 list_del_init(&bpage->list);
508 list_add_tail(&page->list, &cpu_buffer->pages); 573 list_add_tail(&bpage->list, &cpu_buffer->pages);
509 } 574 }
510 rb_reset_cpu(cpu_buffer); 575 rb_reset_cpu(cpu_buffer);
511 576
@@ -532,7 +597,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
532{ 597{
533 struct ring_buffer_per_cpu *cpu_buffer; 598 struct ring_buffer_per_cpu *cpu_buffer;
534 unsigned nr_pages, rm_pages, new_pages; 599 unsigned nr_pages, rm_pages, new_pages;
535 struct buffer_page *page, *tmp; 600 struct buffer_page *bpage, *tmp;
536 unsigned long buffer_size; 601 unsigned long buffer_size;
537 unsigned long addr; 602 unsigned long addr;
538 LIST_HEAD(pages); 603 LIST_HEAD(pages);
@@ -562,7 +627,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
562 if (size < buffer_size) { 627 if (size < buffer_size) {
563 628
564 /* easy case, just free pages */ 629 /* easy case, just free pages */
565 BUG_ON(nr_pages >= buffer->pages); 630 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
631 mutex_unlock(&buffer->mutex);
632 return -1;
633 }
566 634
567 rm_pages = buffer->pages - nr_pages; 635 rm_pages = buffer->pages - nr_pages;
568 636
@@ -581,21 +649,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
581 * add these pages to the cpu_buffers. Otherwise we just free 649 * add these pages to the cpu_buffers. Otherwise we just free
582 * them all and return -ENOMEM; 650 * them all and return -ENOMEM;
583 */ 651 */
584 BUG_ON(nr_pages <= buffer->pages); 652 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
653 mutex_unlock(&buffer->mutex);
654 return -1;
655 }
656
585 new_pages = nr_pages - buffer->pages; 657 new_pages = nr_pages - buffer->pages;
586 658
587 for_each_buffer_cpu(buffer, cpu) { 659 for_each_buffer_cpu(buffer, cpu) {
588 for (i = 0; i < new_pages; i++) { 660 for (i = 0; i < new_pages; i++) {
589 page = kzalloc_node(ALIGN(sizeof(*page), 661 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
590 cache_line_size()), 662 cache_line_size()),
591 GFP_KERNEL, cpu_to_node(cpu)); 663 GFP_KERNEL, cpu_to_node(cpu));
592 if (!page) 664 if (!bpage)
593 goto free_pages; 665 goto free_pages;
594 list_add(&page->list, &pages); 666 list_add(&bpage->list, &pages);
595 addr = __get_free_page(GFP_KERNEL); 667 addr = __get_free_page(GFP_KERNEL);
596 if (!addr) 668 if (!addr)
597 goto free_pages; 669 goto free_pages;
598 page->page = (void *)addr; 670 bpage->page = (void *)addr;
671 rb_init_page(bpage->page);
599 } 672 }
600 } 673 }
601 674
@@ -604,7 +677,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
604 rb_insert_pages(cpu_buffer, &pages, new_pages); 677 rb_insert_pages(cpu_buffer, &pages, new_pages);
605 } 678 }
606 679
607 BUG_ON(!list_empty(&pages)); 680 if (RB_WARN_ON(buffer, !list_empty(&pages))) {
681 mutex_unlock(&buffer->mutex);
682 return -1;
683 }
608 684
609 out: 685 out:
610 buffer->pages = nr_pages; 686 buffer->pages = nr_pages;
@@ -613,22 +689,29 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
613 return size; 689 return size;
614 690
615 free_pages: 691 free_pages:
616 list_for_each_entry_safe(page, tmp, &pages, list) { 692 list_for_each_entry_safe(bpage, tmp, &pages, list) {
617 list_del_init(&page->list); 693 list_del_init(&bpage->list);
618 free_buffer_page(page); 694 free_buffer_page(bpage);
619 } 695 }
620 mutex_unlock(&buffer->mutex); 696 mutex_unlock(&buffer->mutex);
621 return -ENOMEM; 697 return -ENOMEM;
622} 698}
699EXPORT_SYMBOL_GPL(ring_buffer_resize);
623 700
624static inline int rb_null_event(struct ring_buffer_event *event) 701static inline int rb_null_event(struct ring_buffer_event *event)
625{ 702{
626 return event->type == RINGBUF_TYPE_PADDING; 703 return event->type == RINGBUF_TYPE_PADDING;
627} 704}
628 705
629static inline void *__rb_page_index(struct buffer_page *page, unsigned index) 706static inline void *
707__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
708{
709 return bpage->data + index;
710}
711
712static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
630{ 713{
631 return page->page + index; 714 return bpage->page->data + index;
632} 715}
633 716
634static inline struct ring_buffer_event * 717static inline struct ring_buffer_event *
@@ -658,7 +741,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage)
658 741
659static inline unsigned rb_page_commit(struct buffer_page *bpage) 742static inline unsigned rb_page_commit(struct buffer_page *bpage)
660{ 743{
661 return local_read(&bpage->commit); 744 return local_read(&bpage->page->commit);
662} 745}
663 746
664/* Size is determined by what has been commited */ 747/* Size is determined by what has been commited */
@@ -693,7 +776,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
693 head += rb_event_length(event)) { 776 head += rb_event_length(event)) {
694 777
695 event = __rb_page_index(cpu_buffer->head_page, head); 778 event = __rb_page_index(cpu_buffer->head_page, head);
696 BUG_ON(rb_null_event(event)); 779 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
780 return;
697 /* Only count data entries */ 781 /* Only count data entries */
698 if (event->type != RINGBUF_TYPE_DATA) 782 if (event->type != RINGBUF_TYPE_DATA)
699 continue; 783 continue;
@@ -703,14 +787,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
703} 787}
704 788
705static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 789static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
706 struct buffer_page **page) 790 struct buffer_page **bpage)
707{ 791{
708 struct list_head *p = (*page)->list.next; 792 struct list_head *p = (*bpage)->list.next;
709 793
710 if (p == &cpu_buffer->pages) 794 if (p == &cpu_buffer->pages)
711 p = p->next; 795 p = p->next;
712 796
713 *page = list_entry(p, struct buffer_page, list); 797 *bpage = list_entry(p, struct buffer_page, list);
714} 798}
715 799
716static inline unsigned 800static inline unsigned
@@ -746,16 +830,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
746 addr &= PAGE_MASK; 830 addr &= PAGE_MASK;
747 831
748 while (cpu_buffer->commit_page->page != (void *)addr) { 832 while (cpu_buffer->commit_page->page != (void *)addr) {
749 RB_WARN_ON(cpu_buffer, 833 if (RB_WARN_ON(cpu_buffer,
750 cpu_buffer->commit_page == cpu_buffer->tail_page); 834 cpu_buffer->commit_page == cpu_buffer->tail_page))
751 cpu_buffer->commit_page->commit = 835 return;
836 cpu_buffer->commit_page->page->commit =
752 cpu_buffer->commit_page->write; 837 cpu_buffer->commit_page->write;
753 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 838 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
754 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 839 cpu_buffer->write_stamp =
840 cpu_buffer->commit_page->page->time_stamp;
755 } 841 }
756 842
757 /* Now set the commit to the event's index */ 843 /* Now set the commit to the event's index */
758 local_set(&cpu_buffer->commit_page->commit, index); 844 local_set(&cpu_buffer->commit_page->page->commit, index);
759} 845}
760 846
761static inline void 847static inline void
@@ -769,25 +855,38 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
769 * back to us). This allows us to do a simple loop to 855 * back to us). This allows us to do a simple loop to
770 * assign the commit to the tail. 856 * assign the commit to the tail.
771 */ 857 */
858 again:
772 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 859 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
773 cpu_buffer->commit_page->commit = 860 cpu_buffer->commit_page->page->commit =
774 cpu_buffer->commit_page->write; 861 cpu_buffer->commit_page->write;
775 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 862 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
776 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 863 cpu_buffer->write_stamp =
864 cpu_buffer->commit_page->page->time_stamp;
777 /* add barrier to keep gcc from optimizing too much */ 865 /* add barrier to keep gcc from optimizing too much */
778 barrier(); 866 barrier();
779 } 867 }
780 while (rb_commit_index(cpu_buffer) != 868 while (rb_commit_index(cpu_buffer) !=
781 rb_page_write(cpu_buffer->commit_page)) { 869 rb_page_write(cpu_buffer->commit_page)) {
782 cpu_buffer->commit_page->commit = 870 cpu_buffer->commit_page->page->commit =
783 cpu_buffer->commit_page->write; 871 cpu_buffer->commit_page->write;
784 barrier(); 872 barrier();
785 } 873 }
874
875 /* again, keep gcc from optimizing */
876 barrier();
877
878 /*
879 * If an interrupt came in just after the first while loop
880 * and pushed the tail page forward, we will be left with
881 * a dangling commit that will never go forward.
882 */
883 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
884 goto again;
786} 885}
787 886
788static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 887static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
789{ 888{
790 cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; 889 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
791 cpu_buffer->reader_page->read = 0; 890 cpu_buffer->reader_page->read = 0;
792} 891}
793 892
@@ -806,7 +905,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
806 else 905 else
807 rb_inc_page(cpu_buffer, &iter->head_page); 906 rb_inc_page(cpu_buffer, &iter->head_page);
808 907
809 iter->read_stamp = iter->head_page->time_stamp; 908 iter->read_stamp = iter->head_page->page->time_stamp;
810 iter->head = 0; 909 iter->head = 0;
811} 910}
812 911
@@ -880,12 +979,15 @@ static struct ring_buffer_event *
880__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 979__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
881 unsigned type, unsigned long length, u64 *ts) 980 unsigned type, unsigned long length, u64 *ts)
882{ 981{
883 struct buffer_page *tail_page, *head_page, *reader_page; 982 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
884 unsigned long tail, write; 983 unsigned long tail, write;
885 struct ring_buffer *buffer = cpu_buffer->buffer; 984 struct ring_buffer *buffer = cpu_buffer->buffer;
886 struct ring_buffer_event *event; 985 struct ring_buffer_event *event;
887 unsigned long flags; 986 unsigned long flags;
888 987
988 commit_page = cpu_buffer->commit_page;
989 /* we just need to protect against interrupts */
990 barrier();
889 tail_page = cpu_buffer->tail_page; 991 tail_page = cpu_buffer->tail_page;
890 write = local_add_return(length, &tail_page->write); 992 write = local_add_return(length, &tail_page->write);
891 tail = write - length; 993 tail = write - length;
@@ -894,7 +996,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
894 if (write > BUF_PAGE_SIZE) { 996 if (write > BUF_PAGE_SIZE) {
895 struct buffer_page *next_page = tail_page; 997 struct buffer_page *next_page = tail_page;
896 998
897 spin_lock_irqsave(&cpu_buffer->lock, flags); 999 local_irq_save(flags);
1000 __raw_spin_lock(&cpu_buffer->lock);
898 1001
899 rb_inc_page(cpu_buffer, &next_page); 1002 rb_inc_page(cpu_buffer, &next_page);
900 1003
@@ -902,14 +1005,15 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
902 reader_page = cpu_buffer->reader_page; 1005 reader_page = cpu_buffer->reader_page;
903 1006
904 /* we grabbed the lock before incrementing */ 1007 /* we grabbed the lock before incrementing */
905 RB_WARN_ON(cpu_buffer, next_page == reader_page); 1008 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1009 goto out_unlock;
906 1010
907 /* 1011 /*
908 * If for some reason, we had an interrupt storm that made 1012 * If for some reason, we had an interrupt storm that made
909 * it all the way around the buffer, bail, and warn 1013 * it all the way around the buffer, bail, and warn
910 * about it. 1014 * about it.
911 */ 1015 */
912 if (unlikely(next_page == cpu_buffer->commit_page)) { 1016 if (unlikely(next_page == commit_page)) {
913 WARN_ON_ONCE(1); 1017 WARN_ON_ONCE(1);
914 goto out_unlock; 1018 goto out_unlock;
915 } 1019 }
@@ -940,12 +1044,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
940 */ 1044 */
941 if (tail_page == cpu_buffer->tail_page) { 1045 if (tail_page == cpu_buffer->tail_page) {
942 local_set(&next_page->write, 0); 1046 local_set(&next_page->write, 0);
943 local_set(&next_page->commit, 0); 1047 local_set(&next_page->page->commit, 0);
944 cpu_buffer->tail_page = next_page; 1048 cpu_buffer->tail_page = next_page;
945 1049
946 /* reread the time stamp */ 1050 /* reread the time stamp */
947 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1051 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
948 cpu_buffer->tail_page->time_stamp = *ts; 1052 cpu_buffer->tail_page->page->time_stamp = *ts;
949 } 1053 }
950 1054
951 /* 1055 /*
@@ -970,7 +1074,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
970 rb_set_commit_to_write(cpu_buffer); 1074 rb_set_commit_to_write(cpu_buffer);
971 } 1075 }
972 1076
973 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1077 __raw_spin_unlock(&cpu_buffer->lock);
1078 local_irq_restore(flags);
974 1079
975 /* fail and let the caller try again */ 1080 /* fail and let the caller try again */
976 return ERR_PTR(-EAGAIN); 1081 return ERR_PTR(-EAGAIN);
@@ -978,7 +1083,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
978 1083
979 /* We reserved something on the buffer */ 1084 /* We reserved something on the buffer */
980 1085
981 BUG_ON(write > BUF_PAGE_SIZE); 1086 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1087 return NULL;
982 1088
983 event = __rb_page_index(tail_page, tail); 1089 event = __rb_page_index(tail_page, tail);
984 rb_update_event(event, type, length); 1090 rb_update_event(event, type, length);
@@ -988,12 +1094,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
988 * this page's time stamp. 1094 * this page's time stamp.
989 */ 1095 */
990 if (!tail && rb_is_commit(cpu_buffer, event)) 1096 if (!tail && rb_is_commit(cpu_buffer, event))
991 cpu_buffer->commit_page->time_stamp = *ts; 1097 cpu_buffer->commit_page->page->time_stamp = *ts;
992 1098
993 return event; 1099 return event;
994 1100
995 out_unlock: 1101 out_unlock:
996 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1102 __raw_spin_unlock(&cpu_buffer->lock);
1103 local_irq_restore(flags);
997 return NULL; 1104 return NULL;
998} 1105}
999 1106
@@ -1038,7 +1145,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1038 event->time_delta = *delta & TS_MASK; 1145 event->time_delta = *delta & TS_MASK;
1039 event->array[0] = *delta >> TS_SHIFT; 1146 event->array[0] = *delta >> TS_SHIFT;
1040 } else { 1147 } else {
1041 cpu_buffer->commit_page->time_stamp = *ts; 1148 cpu_buffer->commit_page->page->time_stamp = *ts;
1042 event->time_delta = 0; 1149 event->time_delta = 0;
1043 event->array[0] = 0; 1150 event->array[0] = 0;
1044 } 1151 }
@@ -1076,10 +1183,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1076 * storm or we have something buggy. 1183 * storm or we have something buggy.
1077 * Bail! 1184 * Bail!
1078 */ 1185 */
1079 if (unlikely(++nr_loops > 1000)) { 1186 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1080 RB_WARN_ON(cpu_buffer, 1);
1081 return NULL; 1187 return NULL;
1082 }
1083 1188
1084 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1189 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1085 1190
@@ -1175,19 +1280,18 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1175 struct ring_buffer_event *event; 1280 struct ring_buffer_event *event;
1176 int cpu, resched; 1281 int cpu, resched;
1177 1282
1178 if (ring_buffers_off) 1283 if (ring_buffer_flags != RB_BUFFERS_ON)
1179 return NULL; 1284 return NULL;
1180 1285
1181 if (atomic_read(&buffer->record_disabled)) 1286 if (atomic_read(&buffer->record_disabled))
1182 return NULL; 1287 return NULL;
1183 1288
1184 /* If we are tracing schedule, we don't want to recurse */ 1289 /* If we are tracing schedule, we don't want to recurse */
1185 resched = need_resched(); 1290 resched = ftrace_preempt_disable();
1186 preempt_disable_notrace();
1187 1291
1188 cpu = raw_smp_processor_id(); 1292 cpu = raw_smp_processor_id();
1189 1293
1190 if (!cpu_isset(cpu, buffer->cpumask)) 1294 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1191 goto out; 1295 goto out;
1192 1296
1193 cpu_buffer = buffer->buffers[cpu]; 1297 cpu_buffer = buffer->buffers[cpu];
@@ -1214,12 +1318,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1214 return event; 1318 return event;
1215 1319
1216 out: 1320 out:
1217 if (resched) 1321 ftrace_preempt_enable(resched);
1218 preempt_enable_no_resched_notrace();
1219 else
1220 preempt_enable_notrace();
1221 return NULL; 1322 return NULL;
1222} 1323}
1324EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1223 1325
1224static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1326static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1225 struct ring_buffer_event *event) 1327 struct ring_buffer_event *event)
@@ -1259,16 +1361,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1259 /* 1361 /*
1260 * Only the last preempt count needs to restore preemption. 1362 * Only the last preempt count needs to restore preemption.
1261 */ 1363 */
1262 if (preempt_count() == 1) { 1364 if (preempt_count() == 1)
1263 if (per_cpu(rb_need_resched, cpu)) 1365 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1264 preempt_enable_no_resched_notrace(); 1366 else
1265 else
1266 preempt_enable_notrace();
1267 } else
1268 preempt_enable_no_resched_notrace(); 1367 preempt_enable_no_resched_notrace();
1269 1368
1270 return 0; 1369 return 0;
1271} 1370}
1371EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
1272 1372
1273/** 1373/**
1274 * ring_buffer_write - write data to the buffer without reserving 1374 * ring_buffer_write - write data to the buffer without reserving
@@ -1294,18 +1394,17 @@ int ring_buffer_write(struct ring_buffer *buffer,
1294 int ret = -EBUSY; 1394 int ret = -EBUSY;
1295 int cpu, resched; 1395 int cpu, resched;
1296 1396
1297 if (ring_buffers_off) 1397 if (ring_buffer_flags != RB_BUFFERS_ON)
1298 return -EBUSY; 1398 return -EBUSY;
1299 1399
1300 if (atomic_read(&buffer->record_disabled)) 1400 if (atomic_read(&buffer->record_disabled))
1301 return -EBUSY; 1401 return -EBUSY;
1302 1402
1303 resched = need_resched(); 1403 resched = ftrace_preempt_disable();
1304 preempt_disable_notrace();
1305 1404
1306 cpu = raw_smp_processor_id(); 1405 cpu = raw_smp_processor_id();
1307 1406
1308 if (!cpu_isset(cpu, buffer->cpumask)) 1407 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1309 goto out; 1408 goto out;
1310 1409
1311 cpu_buffer = buffer->buffers[cpu]; 1410 cpu_buffer = buffer->buffers[cpu];
@@ -1327,13 +1426,11 @@ int ring_buffer_write(struct ring_buffer *buffer,
1327 1426
1328 ret = 0; 1427 ret = 0;
1329 out: 1428 out:
1330 if (resched) 1429 ftrace_preempt_enable(resched);
1331 preempt_enable_no_resched_notrace();
1332 else
1333 preempt_enable_notrace();
1334 1430
1335 return ret; 1431 return ret;
1336} 1432}
1433EXPORT_SYMBOL_GPL(ring_buffer_write);
1337 1434
1338static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1435static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1339{ 1436{
@@ -1360,6 +1457,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer)
1360{ 1457{
1361 atomic_inc(&buffer->record_disabled); 1458 atomic_inc(&buffer->record_disabled);
1362} 1459}
1460EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
1363 1461
1364/** 1462/**
1365 * ring_buffer_record_enable - enable writes to the buffer 1463 * ring_buffer_record_enable - enable writes to the buffer
@@ -1372,6 +1470,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
1372{ 1470{
1373 atomic_dec(&buffer->record_disabled); 1471 atomic_dec(&buffer->record_disabled);
1374} 1472}
1473EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
1375 1474
1376/** 1475/**
1377 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 1476 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
@@ -1387,12 +1486,13 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1387{ 1486{
1388 struct ring_buffer_per_cpu *cpu_buffer; 1487 struct ring_buffer_per_cpu *cpu_buffer;
1389 1488
1390 if (!cpu_isset(cpu, buffer->cpumask)) 1489 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1391 return; 1490 return;
1392 1491
1393 cpu_buffer = buffer->buffers[cpu]; 1492 cpu_buffer = buffer->buffers[cpu];
1394 atomic_inc(&cpu_buffer->record_disabled); 1493 atomic_inc(&cpu_buffer->record_disabled);
1395} 1494}
1495EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
1396 1496
1397/** 1497/**
1398 * ring_buffer_record_enable_cpu - enable writes to the buffer 1498 * ring_buffer_record_enable_cpu - enable writes to the buffer
@@ -1406,12 +1506,13 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1406{ 1506{
1407 struct ring_buffer_per_cpu *cpu_buffer; 1507 struct ring_buffer_per_cpu *cpu_buffer;
1408 1508
1409 if (!cpu_isset(cpu, buffer->cpumask)) 1509 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1410 return; 1510 return;
1411 1511
1412 cpu_buffer = buffer->buffers[cpu]; 1512 cpu_buffer = buffer->buffers[cpu];
1413 atomic_dec(&cpu_buffer->record_disabled); 1513 atomic_dec(&cpu_buffer->record_disabled);
1414} 1514}
1515EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1415 1516
1416/** 1517/**
1417 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 1518 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
@@ -1422,12 +1523,13 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1422{ 1523{
1423 struct ring_buffer_per_cpu *cpu_buffer; 1524 struct ring_buffer_per_cpu *cpu_buffer;
1424 1525
1425 if (!cpu_isset(cpu, buffer->cpumask)) 1526 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1426 return 0; 1527 return 0;
1427 1528
1428 cpu_buffer = buffer->buffers[cpu]; 1529 cpu_buffer = buffer->buffers[cpu];
1429 return cpu_buffer->entries; 1530 return cpu_buffer->entries;
1430} 1531}
1532EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1431 1533
1432/** 1534/**
1433 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 1535 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
@@ -1438,12 +1540,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1438{ 1540{
1439 struct ring_buffer_per_cpu *cpu_buffer; 1541 struct ring_buffer_per_cpu *cpu_buffer;
1440 1542
1441 if (!cpu_isset(cpu, buffer->cpumask)) 1543 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1442 return 0; 1544 return 0;
1443 1545
1444 cpu_buffer = buffer->buffers[cpu]; 1546 cpu_buffer = buffer->buffers[cpu];
1445 return cpu_buffer->overrun; 1547 return cpu_buffer->overrun;
1446} 1548}
1549EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1447 1550
1448/** 1551/**
1449 * ring_buffer_entries - get the number of entries in a buffer 1552 * ring_buffer_entries - get the number of entries in a buffer
@@ -1466,6 +1569,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1466 1569
1467 return entries; 1570 return entries;
1468} 1571}
1572EXPORT_SYMBOL_GPL(ring_buffer_entries);
1469 1573
1470/** 1574/**
1471 * ring_buffer_overrun_cpu - get the number of overruns in buffer 1575 * ring_buffer_overrun_cpu - get the number of overruns in buffer
@@ -1488,15 +1592,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1488 1592
1489 return overruns; 1593 return overruns;
1490} 1594}
1595EXPORT_SYMBOL_GPL(ring_buffer_overruns);
1491 1596
1492/** 1597static void rb_iter_reset(struct ring_buffer_iter *iter)
1493 * ring_buffer_iter_reset - reset an iterator
1494 * @iter: The iterator to reset
1495 *
1496 * Resets the iterator, so that it will start from the beginning
1497 * again.
1498 */
1499void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1500{ 1598{
1501 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1599 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1502 1600
@@ -1511,10 +1609,28 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1511 if (iter->head) 1609 if (iter->head)
1512 iter->read_stamp = cpu_buffer->read_stamp; 1610 iter->read_stamp = cpu_buffer->read_stamp;
1513 else 1611 else
1514 iter->read_stamp = iter->head_page->time_stamp; 1612 iter->read_stamp = iter->head_page->page->time_stamp;
1515} 1613}
1516 1614
1517/** 1615/**
1616 * ring_buffer_iter_reset - reset an iterator
1617 * @iter: The iterator to reset
1618 *
1619 * Resets the iterator, so that it will start from the beginning
1620 * again.
1621 */
1622void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1623{
1624 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1625 unsigned long flags;
1626
1627 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1628 rb_iter_reset(iter);
1629 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1630}
1631EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
1632
1633/**
1518 * ring_buffer_iter_empty - check if an iterator has no more to read 1634 * ring_buffer_iter_empty - check if an iterator has no more to read
1519 * @iter: The iterator to check 1635 * @iter: The iterator to check
1520 */ 1636 */
@@ -1527,6 +1643,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1527 return iter->head_page == cpu_buffer->commit_page && 1643 return iter->head_page == cpu_buffer->commit_page &&
1528 iter->head == rb_commit_index(cpu_buffer); 1644 iter->head == rb_commit_index(cpu_buffer);
1529} 1645}
1646EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
1530 1647
1531static void 1648static void
1532rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, 1649rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1597,7 +1714,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1597 unsigned long flags; 1714 unsigned long flags;
1598 int nr_loops = 0; 1715 int nr_loops = 0;
1599 1716
1600 spin_lock_irqsave(&cpu_buffer->lock, flags); 1717 local_irq_save(flags);
1718 __raw_spin_lock(&cpu_buffer->lock);
1601 1719
1602 again: 1720 again:
1603 /* 1721 /*
@@ -1606,8 +1724,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1606 * a case where we will loop three times. There should be no 1724 * a case where we will loop three times. There should be no
1607 * reason to loop four times (that I know of). 1725 * reason to loop four times (that I know of).
1608 */ 1726 */
1609 if (unlikely(++nr_loops > 3)) { 1727 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
1610 RB_WARN_ON(cpu_buffer, 1);
1611 reader = NULL; 1728 reader = NULL;
1612 goto out; 1729 goto out;
1613 } 1730 }
@@ -1619,8 +1736,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1619 goto out; 1736 goto out;
1620 1737
1621 /* Never should we have an index greater than the size */ 1738 /* Never should we have an index greater than the size */
1622 RB_WARN_ON(cpu_buffer, 1739 if (RB_WARN_ON(cpu_buffer,
1623 cpu_buffer->reader_page->read > rb_page_size(reader)); 1740 cpu_buffer->reader_page->read > rb_page_size(reader)))
1741 goto out;
1624 1742
1625 /* check if we caught up to the tail */ 1743 /* check if we caught up to the tail */
1626 reader = NULL; 1744 reader = NULL;
@@ -1637,7 +1755,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1637 cpu_buffer->reader_page->list.prev = reader->list.prev; 1755 cpu_buffer->reader_page->list.prev = reader->list.prev;
1638 1756
1639 local_set(&cpu_buffer->reader_page->write, 0); 1757 local_set(&cpu_buffer->reader_page->write, 0);
1640 local_set(&cpu_buffer->reader_page->commit, 0); 1758 local_set(&cpu_buffer->reader_page->page->commit, 0);
1641 1759
1642 /* Make the reader page now replace the head */ 1760 /* Make the reader page now replace the head */
1643 reader->list.prev->next = &cpu_buffer->reader_page->list; 1761 reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1659,7 +1777,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1659 goto again; 1777 goto again;
1660 1778
1661 out: 1779 out:
1662 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1780 __raw_spin_unlock(&cpu_buffer->lock);
1781 local_irq_restore(flags);
1663 1782
1664 return reader; 1783 return reader;
1665} 1784}
@@ -1673,7 +1792,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1673 reader = rb_get_reader_page(cpu_buffer); 1792 reader = rb_get_reader_page(cpu_buffer);
1674 1793
1675 /* This function should not be called when buffer is empty */ 1794 /* This function should not be called when buffer is empty */
1676 BUG_ON(!reader); 1795 if (RB_WARN_ON(cpu_buffer, !reader))
1796 return;
1677 1797
1678 event = rb_reader_event(cpu_buffer); 1798 event = rb_reader_event(cpu_buffer);
1679 1799
@@ -1700,7 +1820,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1700 * Check if we are at the end of the buffer. 1820 * Check if we are at the end of the buffer.
1701 */ 1821 */
1702 if (iter->head >= rb_page_size(iter->head_page)) { 1822 if (iter->head >= rb_page_size(iter->head_page)) {
1703 BUG_ON(iter->head_page == cpu_buffer->commit_page); 1823 if (RB_WARN_ON(buffer,
1824 iter->head_page == cpu_buffer->commit_page))
1825 return;
1704 rb_inc_iter(iter); 1826 rb_inc_iter(iter);
1705 return; 1827 return;
1706 } 1828 }
@@ -1713,8 +1835,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1713 * This should not be called to advance the header if we are 1835 * This should not be called to advance the header if we are
1714 * at the tail of the buffer. 1836 * at the tail of the buffer.
1715 */ 1837 */
1716 BUG_ON((iter->head_page == cpu_buffer->commit_page) && 1838 if (RB_WARN_ON(cpu_buffer,
1717 (iter->head + length > rb_commit_index(cpu_buffer))); 1839 (iter->head_page == cpu_buffer->commit_page) &&
1840 (iter->head + length > rb_commit_index(cpu_buffer))))
1841 return;
1718 1842
1719 rb_update_iter_read_stamp(iter, event); 1843 rb_update_iter_read_stamp(iter, event);
1720 1844
@@ -1726,24 +1850,15 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1726 rb_advance_iter(iter); 1850 rb_advance_iter(iter);
1727} 1851}
1728 1852
1729/** 1853static struct ring_buffer_event *
1730 * ring_buffer_peek - peek at the next event to be read 1854rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1731 * @buffer: The ring buffer to read
1732 * @cpu: The cpu to peak at
1733 * @ts: The timestamp counter of this event.
1734 *
1735 * This will return the event that will be read next, but does
1736 * not consume the data.
1737 */
1738struct ring_buffer_event *
1739ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1740{ 1855{
1741 struct ring_buffer_per_cpu *cpu_buffer; 1856 struct ring_buffer_per_cpu *cpu_buffer;
1742 struct ring_buffer_event *event; 1857 struct ring_buffer_event *event;
1743 struct buffer_page *reader; 1858 struct buffer_page *reader;
1744 int nr_loops = 0; 1859 int nr_loops = 0;
1745 1860
1746 if (!cpu_isset(cpu, buffer->cpumask)) 1861 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1747 return NULL; 1862 return NULL;
1748 1863
1749 cpu_buffer = buffer->buffers[cpu]; 1864 cpu_buffer = buffer->buffers[cpu];
@@ -1757,10 +1872,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1757 * can have. Nesting 10 deep of interrupts is clearly 1872 * can have. Nesting 10 deep of interrupts is clearly
1758 * an anomaly. 1873 * an anomaly.
1759 */ 1874 */
1760 if (unlikely(++nr_loops > 10)) { 1875 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1761 RB_WARN_ON(cpu_buffer, 1);
1762 return NULL; 1876 return NULL;
1763 }
1764 1877
1765 reader = rb_get_reader_page(cpu_buffer); 1878 reader = rb_get_reader_page(cpu_buffer);
1766 if (!reader) 1879 if (!reader)
@@ -1797,17 +1910,10 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1797 1910
1798 return NULL; 1911 return NULL;
1799} 1912}
1913EXPORT_SYMBOL_GPL(ring_buffer_peek);
1800 1914
1801/** 1915static struct ring_buffer_event *
1802 * ring_buffer_iter_peek - peek at the next event to be read 1916rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1803 * @iter: The ring buffer iterator
1804 * @ts: The timestamp counter of this event.
1805 *
1806 * This will return the event that will be read next, but does
1807 * not increment the iterator.
1808 */
1809struct ring_buffer_event *
1810ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1811{ 1917{
1812 struct ring_buffer *buffer; 1918 struct ring_buffer *buffer;
1813 struct ring_buffer_per_cpu *cpu_buffer; 1919 struct ring_buffer_per_cpu *cpu_buffer;
@@ -1829,10 +1935,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1829 * can have. Nesting 10 deep of interrupts is clearly 1935 * can have. Nesting 10 deep of interrupts is clearly
1830 * an anomaly. 1936 * an anomaly.
1831 */ 1937 */
1832 if (unlikely(++nr_loops > 10)) { 1938 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1833 RB_WARN_ON(cpu_buffer, 1);
1834 return NULL; 1939 return NULL;
1835 }
1836 1940
1837 if (rb_per_cpu_empty(cpu_buffer)) 1941 if (rb_per_cpu_empty(cpu_buffer))
1838 return NULL; 1942 return NULL;
@@ -1867,6 +1971,52 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1867 1971
1868 return NULL; 1972 return NULL;
1869} 1973}
1974EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
1975
1976/**
1977 * ring_buffer_peek - peek at the next event to be read
1978 * @buffer: The ring buffer to read
1979 * @cpu: The cpu to peak at
1980 * @ts: The timestamp counter of this event.
1981 *
1982 * This will return the event that will be read next, but does
1983 * not consume the data.
1984 */
1985struct ring_buffer_event *
1986ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1987{
1988 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1989 struct ring_buffer_event *event;
1990 unsigned long flags;
1991
1992 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1993 event = rb_buffer_peek(buffer, cpu, ts);
1994 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1995
1996 return event;
1997}
1998
1999/**
2000 * ring_buffer_iter_peek - peek at the next event to be read
2001 * @iter: The ring buffer iterator
2002 * @ts: The timestamp counter of this event.
2003 *
2004 * This will return the event that will be read next, but does
2005 * not increment the iterator.
2006 */
2007struct ring_buffer_event *
2008ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2009{
2010 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2011 struct ring_buffer_event *event;
2012 unsigned long flags;
2013
2014 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2015 event = rb_iter_peek(iter, ts);
2016 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2017
2018 return event;
2019}
1870 2020
1871/** 2021/**
1872 * ring_buffer_consume - return an event and consume it 2022 * ring_buffer_consume - return an event and consume it
@@ -1879,21 +2029,27 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1879struct ring_buffer_event * 2029struct ring_buffer_event *
1880ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 2030ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1881{ 2031{
1882 struct ring_buffer_per_cpu *cpu_buffer; 2032 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1883 struct ring_buffer_event *event; 2033 struct ring_buffer_event *event;
2034 unsigned long flags;
1884 2035
1885 if (!cpu_isset(cpu, buffer->cpumask)) 2036 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1886 return NULL; 2037 return NULL;
1887 2038
1888 event = ring_buffer_peek(buffer, cpu, ts); 2039 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2040
2041 event = rb_buffer_peek(buffer, cpu, ts);
1889 if (!event) 2042 if (!event)
1890 return NULL; 2043 goto out;
1891 2044
1892 cpu_buffer = buffer->buffers[cpu];
1893 rb_advance_reader(cpu_buffer); 2045 rb_advance_reader(cpu_buffer);
1894 2046
2047 out:
2048 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2049
1895 return event; 2050 return event;
1896} 2051}
2052EXPORT_SYMBOL_GPL(ring_buffer_consume);
1897 2053
1898/** 2054/**
1899 * ring_buffer_read_start - start a non consuming read of the buffer 2055 * ring_buffer_read_start - start a non consuming read of the buffer
@@ -1914,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1914 struct ring_buffer_iter *iter; 2070 struct ring_buffer_iter *iter;
1915 unsigned long flags; 2071 unsigned long flags;
1916 2072
1917 if (!cpu_isset(cpu, buffer->cpumask)) 2073 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1918 return NULL; 2074 return NULL;
1919 2075
1920 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 2076 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
@@ -1928,12 +2084,15 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1928 atomic_inc(&cpu_buffer->record_disabled); 2084 atomic_inc(&cpu_buffer->record_disabled);
1929 synchronize_sched(); 2085 synchronize_sched();
1930 2086
1931 spin_lock_irqsave(&cpu_buffer->lock, flags); 2087 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1932 ring_buffer_iter_reset(iter); 2088 __raw_spin_lock(&cpu_buffer->lock);
1933 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2089 rb_iter_reset(iter);
2090 __raw_spin_unlock(&cpu_buffer->lock);
2091 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1934 2092
1935 return iter; 2093 return iter;
1936} 2094}
2095EXPORT_SYMBOL_GPL(ring_buffer_read_start);
1937 2096
1938/** 2097/**
1939 * ring_buffer_finish - finish reading the iterator of the buffer 2098 * ring_buffer_finish - finish reading the iterator of the buffer
@@ -1950,6 +2109,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
1950 atomic_dec(&cpu_buffer->record_disabled); 2109 atomic_dec(&cpu_buffer->record_disabled);
1951 kfree(iter); 2110 kfree(iter);
1952} 2111}
2112EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
1953 2113
1954/** 2114/**
1955 * ring_buffer_read - read the next item in the ring buffer by the iterator 2115 * ring_buffer_read - read the next item in the ring buffer by the iterator
@@ -1962,15 +2122,21 @@ struct ring_buffer_event *
1962ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) 2122ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1963{ 2123{
1964 struct ring_buffer_event *event; 2124 struct ring_buffer_event *event;
2125 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2126 unsigned long flags;
1965 2127
1966 event = ring_buffer_iter_peek(iter, ts); 2128 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2129 event = rb_iter_peek(iter, ts);
1967 if (!event) 2130 if (!event)
1968 return NULL; 2131 goto out;
1969 2132
1970 rb_advance_iter(iter); 2133 rb_advance_iter(iter);
2134 out:
2135 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1971 2136
1972 return event; 2137 return event;
1973} 2138}
2139EXPORT_SYMBOL_GPL(ring_buffer_read);
1974 2140
1975/** 2141/**
1976 * ring_buffer_size - return the size of the ring buffer (in bytes) 2142 * ring_buffer_size - return the size of the ring buffer (in bytes)
@@ -1980,6 +2146,7 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer)
1980{ 2146{
1981 return BUF_PAGE_SIZE * buffer->pages; 2147 return BUF_PAGE_SIZE * buffer->pages;
1982} 2148}
2149EXPORT_SYMBOL_GPL(ring_buffer_size);
1983 2150
1984static void 2151static void
1985rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 2152rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
@@ -1987,7 +2154,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1987 cpu_buffer->head_page 2154 cpu_buffer->head_page
1988 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2155 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
1989 local_set(&cpu_buffer->head_page->write, 0); 2156 local_set(&cpu_buffer->head_page->write, 0);
1990 local_set(&cpu_buffer->head_page->commit, 0); 2157 local_set(&cpu_buffer->head_page->page->commit, 0);
1991 2158
1992 cpu_buffer->head_page->read = 0; 2159 cpu_buffer->head_page->read = 0;
1993 2160
@@ -1996,7 +2163,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1996 2163
1997 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2164 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1998 local_set(&cpu_buffer->reader_page->write, 0); 2165 local_set(&cpu_buffer->reader_page->write, 0);
1999 local_set(&cpu_buffer->reader_page->commit, 0); 2166 local_set(&cpu_buffer->reader_page->page->commit, 0);
2000 cpu_buffer->reader_page->read = 0; 2167 cpu_buffer->reader_page->read = 0;
2001 2168
2002 cpu_buffer->overrun = 0; 2169 cpu_buffer->overrun = 0;
@@ -2013,15 +2180,20 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2013 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2180 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2014 unsigned long flags; 2181 unsigned long flags;
2015 2182
2016 if (!cpu_isset(cpu, buffer->cpumask)) 2183 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2017 return; 2184 return;
2018 2185
2019 spin_lock_irqsave(&cpu_buffer->lock, flags); 2186 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2187
2188 __raw_spin_lock(&cpu_buffer->lock);
2020 2189
2021 rb_reset_cpu(cpu_buffer); 2190 rb_reset_cpu(cpu_buffer);
2022 2191
2023 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2192 __raw_spin_unlock(&cpu_buffer->lock);
2193
2194 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2024} 2195}
2196EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2025 2197
2026/** 2198/**
2027 * ring_buffer_reset - reset a ring buffer 2199 * ring_buffer_reset - reset a ring buffer
@@ -2034,6 +2206,7 @@ void ring_buffer_reset(struct ring_buffer *buffer)
2034 for_each_buffer_cpu(buffer, cpu) 2206 for_each_buffer_cpu(buffer, cpu)
2035 ring_buffer_reset_cpu(buffer, cpu); 2207 ring_buffer_reset_cpu(buffer, cpu);
2036} 2208}
2209EXPORT_SYMBOL_GPL(ring_buffer_reset);
2037 2210
2038/** 2211/**
2039 * rind_buffer_empty - is the ring buffer empty? 2212 * rind_buffer_empty - is the ring buffer empty?
@@ -2052,6 +2225,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
2052 } 2225 }
2053 return 1; 2226 return 1;
2054} 2227}
2228EXPORT_SYMBOL_GPL(ring_buffer_empty);
2055 2229
2056/** 2230/**
2057 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? 2231 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
@@ -2062,12 +2236,13 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2062{ 2236{
2063 struct ring_buffer_per_cpu *cpu_buffer; 2237 struct ring_buffer_per_cpu *cpu_buffer;
2064 2238
2065 if (!cpu_isset(cpu, buffer->cpumask)) 2239 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2066 return 1; 2240 return 1;
2067 2241
2068 cpu_buffer = buffer->buffers[cpu]; 2242 cpu_buffer = buffer->buffers[cpu];
2069 return rb_per_cpu_empty(cpu_buffer); 2243 return rb_per_cpu_empty(cpu_buffer);
2070} 2244}
2245EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2071 2246
2072/** 2247/**
2073 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 2248 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -2085,13 +2260,12 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2085 struct ring_buffer_per_cpu *cpu_buffer_a; 2260 struct ring_buffer_per_cpu *cpu_buffer_a;
2086 struct ring_buffer_per_cpu *cpu_buffer_b; 2261 struct ring_buffer_per_cpu *cpu_buffer_b;
2087 2262
2088 if (!cpu_isset(cpu, buffer_a->cpumask) || 2263 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2089 !cpu_isset(cpu, buffer_b->cpumask)) 2264 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2090 return -EINVAL; 2265 return -EINVAL;
2091 2266
2092 /* At least make sure the two buffers are somewhat the same */ 2267 /* At least make sure the two buffers are somewhat the same */
2093 if (buffer_a->size != buffer_b->size || 2268 if (buffer_a->pages != buffer_b->pages)
2094 buffer_a->pages != buffer_b->pages)
2095 return -EINVAL; 2269 return -EINVAL;
2096 2270
2097 cpu_buffer_a = buffer_a->buffers[cpu]; 2271 cpu_buffer_a = buffer_a->buffers[cpu];
@@ -2117,17 +2291,180 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2117 2291
2118 return 0; 2292 return 0;
2119} 2293}
2294EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2295
2296static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2297 struct buffer_data_page *bpage)
2298{
2299 struct ring_buffer_event *event;
2300 unsigned long head;
2301
2302 __raw_spin_lock(&cpu_buffer->lock);
2303 for (head = 0; head < local_read(&bpage->commit);
2304 head += rb_event_length(event)) {
2305
2306 event = __rb_data_page_index(bpage, head);
2307 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2308 return;
2309 /* Only count data entries */
2310 if (event->type != RINGBUF_TYPE_DATA)
2311 continue;
2312 cpu_buffer->entries--;
2313 }
2314 __raw_spin_unlock(&cpu_buffer->lock);
2315}
2316
2317/**
2318 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2319 * @buffer: the buffer to allocate for.
2320 *
2321 * This function is used in conjunction with ring_buffer_read_page.
2322 * When reading a full page from the ring buffer, these functions
2323 * can be used to speed up the process. The calling function should
2324 * allocate a few pages first with this function. Then when it
2325 * needs to get pages from the ring buffer, it passes the result
2326 * of this function into ring_buffer_read_page, which will swap
2327 * the page that was allocated, with the read page of the buffer.
2328 *
2329 * Returns:
2330 * The page allocated, or NULL on error.
2331 */
2332void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2333{
2334 unsigned long addr;
2335 struct buffer_data_page *bpage;
2336
2337 addr = __get_free_page(GFP_KERNEL);
2338 if (!addr)
2339 return NULL;
2340
2341 bpage = (void *)addr;
2342
2343 return bpage;
2344}
2345
2346/**
2347 * ring_buffer_free_read_page - free an allocated read page
2348 * @buffer: the buffer the page was allocate for
2349 * @data: the page to free
2350 *
2351 * Free a page allocated from ring_buffer_alloc_read_page.
2352 */
2353void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2354{
2355 free_page((unsigned long)data);
2356}
2357
2358/**
2359 * ring_buffer_read_page - extract a page from the ring buffer
2360 * @buffer: buffer to extract from
2361 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2362 * @cpu: the cpu of the buffer to extract
2363 * @full: should the extraction only happen when the page is full.
2364 *
2365 * This function will pull out a page from the ring buffer and consume it.
2366 * @data_page must be the address of the variable that was returned
2367 * from ring_buffer_alloc_read_page. This is because the page might be used
2368 * to swap with a page in the ring buffer.
2369 *
2370 * for example:
2371 * rpage = ring_buffer_alloc_page(buffer);
2372 * if (!rpage)
2373 * return error;
2374 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
2375 * if (ret)
2376 * process_page(rpage);
2377 *
2378 * When @full is set, the function will not return true unless
2379 * the writer is off the reader page.
2380 *
2381 * Note: it is up to the calling functions to handle sleeps and wakeups.
2382 * The ring buffer can be used anywhere in the kernel and can not
2383 * blindly call wake_up. The layer that uses the ring buffer must be
2384 * responsible for that.
2385 *
2386 * Returns:
2387 * 1 if data has been transferred
2388 * 0 if no data has been transferred.
2389 */
2390int ring_buffer_read_page(struct ring_buffer *buffer,
2391 void **data_page, int cpu, int full)
2392{
2393 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2394 struct ring_buffer_event *event;
2395 struct buffer_data_page *bpage;
2396 unsigned long flags;
2397 int ret = 0;
2398
2399 if (!data_page)
2400 return 0;
2401
2402 bpage = *data_page;
2403 if (!bpage)
2404 return 0;
2405
2406 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2407
2408 /*
2409 * rb_buffer_peek will get the next ring buffer if
2410 * the current reader page is empty.
2411 */
2412 event = rb_buffer_peek(buffer, cpu, NULL);
2413 if (!event)
2414 goto out;
2415
2416 /* check for data */
2417 if (!local_read(&cpu_buffer->reader_page->page->commit))
2418 goto out;
2419 /*
2420 * If the writer is already off of the read page, then simply
2421 * switch the read page with the given page. Otherwise
2422 * we need to copy the data from the reader to the writer.
2423 */
2424 if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
2425 unsigned int read = cpu_buffer->reader_page->read;
2426
2427 if (full)
2428 goto out;
2429 /* The writer is still on the reader page, we must copy */
2430 bpage = cpu_buffer->reader_page->page;
2431 memcpy(bpage->data,
2432 cpu_buffer->reader_page->page->data + read,
2433 local_read(&bpage->commit) - read);
2434
2435 /* consume what was read */
2436 cpu_buffer->reader_page += read;
2437
2438 } else {
2439 /* swap the pages */
2440 rb_init_page(bpage);
2441 bpage = cpu_buffer->reader_page->page;
2442 cpu_buffer->reader_page->page = *data_page;
2443 cpu_buffer->reader_page->read = 0;
2444 *data_page = bpage;
2445 }
2446 ret = 1;
2447
2448 /* update the entry counter */
2449 rb_remove_entries(cpu_buffer, bpage);
2450 out:
2451 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2452
2453 return ret;
2454}
2120 2455
2121static ssize_t 2456static ssize_t
2122rb_simple_read(struct file *filp, char __user *ubuf, 2457rb_simple_read(struct file *filp, char __user *ubuf,
2123 size_t cnt, loff_t *ppos) 2458 size_t cnt, loff_t *ppos)
2124{ 2459{
2125 int *p = filp->private_data; 2460 long *p = filp->private_data;
2126 char buf[64]; 2461 char buf[64];
2127 int r; 2462 int r;
2128 2463
2129 /* !ring_buffers_off == tracing_on */ 2464 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
2130 r = sprintf(buf, "%d\n", !*p); 2465 r = sprintf(buf, "permanently disabled\n");
2466 else
2467 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
2131 2468
2132 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2469 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2133} 2470}
@@ -2136,7 +2473,7 @@ static ssize_t
2136rb_simple_write(struct file *filp, const char __user *ubuf, 2473rb_simple_write(struct file *filp, const char __user *ubuf,
2137 size_t cnt, loff_t *ppos) 2474 size_t cnt, loff_t *ppos)
2138{ 2475{
2139 int *p = filp->private_data; 2476 long *p = filp->private_data;
2140 char buf[64]; 2477 char buf[64];
2141 long val; 2478 long val;
2142 int ret; 2479 int ret;
@@ -2153,8 +2490,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2153 if (ret < 0) 2490 if (ret < 0)
2154 return ret; 2491 return ret;
2155 2492
2156 /* !ring_buffers_off == tracing_on */ 2493 if (val)
2157 *p = !val; 2494 set_bit(RB_BUFFERS_ON_BIT, p);
2495 else
2496 clear_bit(RB_BUFFERS_ON_BIT, p);
2158 2497
2159 (*ppos)++; 2498 (*ppos)++;
2160 2499
@@ -2176,7 +2515,7 @@ static __init int rb_init_debugfs(void)
2176 d_tracer = tracing_init_dentry(); 2515 d_tracer = tracing_init_dentry();
2177 2516
2178 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 2517 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2179 &ring_buffers_off, &rb_simple_fops); 2518 &ring_buffer_flags, &rb_simple_fops);
2180 if (!entry) 2519 if (!entry)
2181 pr_warning("Could not create debugfs 'tracing_on' entry\n"); 2520 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2182 2521
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d86e3252f300..c580233add95 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,6 +43,38 @@
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
44unsigned long __read_mostly tracing_thresh; 44unsigned long __read_mostly tracing_thresh;
45 45
46/*
47 * We need to change this state when a selftest is running.
48 * A selftest will lurk into the ring-buffer to count the
49 * entries inserted during the selftest although some concurrent
50 * insertions into the ring-buffer such as ftrace_printk could occurred
51 * at the same time, giving false positive or negative results.
52 */
53static bool __read_mostly tracing_selftest_running;
54
55/* For tracers that don't implement custom flags */
56static struct tracer_opt dummy_tracer_opt[] = {
57 { }
58};
59
60static struct tracer_flags dummy_tracer_flags = {
61 .val = 0,
62 .opts = dummy_tracer_opt
63};
64
65static int dummy_set_flag(u32 old_flags, u32 bit, int set)
66{
67 return 0;
68}
69
70/*
71 * Kill all tracing for good (never come back).
72 * It is initialized to 1 but will turn to zero if the initialization
73 * of the tracer is successful. But that is the only place that sets
74 * this back to zero.
75 */
76int tracing_disabled = 1;
77
46static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 78static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
47 79
48static inline void ftrace_disable_cpu(void) 80static inline void ftrace_disable_cpu(void)
@@ -57,12 +89,41 @@ static inline void ftrace_enable_cpu(void)
57 preempt_enable(); 89 preempt_enable();
58} 90}
59 91
60static cpumask_t __read_mostly tracing_buffer_mask; 92static cpumask_var_t __read_mostly tracing_buffer_mask;
61 93
62#define for_each_tracing_cpu(cpu) \ 94#define for_each_tracing_cpu(cpu) \
63 for_each_cpu_mask(cpu, tracing_buffer_mask) 95 for_each_cpu(cpu, tracing_buffer_mask)
96
97/*
98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
99 *
100 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
101 * is set, then ftrace_dump is called. This will output the contents
102 * of the ftrace buffers to the console. This is very useful for
103 * capturing traces that lead to crashes and outputing it to a
104 * serial console.
105 *
106 * It is default off, but you can enable it with either specifying
107 * "ftrace_dump_on_oops" in the kernel command line, or setting
108 * /proc/sys/kernel/ftrace_dump_on_oops to true.
109 */
110int ftrace_dump_on_oops;
64 111
65static int tracing_disabled = 1; 112static int tracing_set_tracer(char *buf);
113
114static int __init set_ftrace(char *str)
115{
116 tracing_set_tracer(str);
117 return 1;
118}
119__setup("ftrace", set_ftrace);
120
121static int __init set_ftrace_dump_on_oops(char *str)
122{
123 ftrace_dump_on_oops = 1;
124 return 1;
125}
126__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
66 127
67long 128long
68ns2usecs(cycle_t nsec) 129ns2usecs(cycle_t nsec)
@@ -112,6 +173,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
112/* tracer_enabled is used to toggle activation of a tracer */ 173/* tracer_enabled is used to toggle activation of a tracer */
113static int tracer_enabled = 1; 174static int tracer_enabled = 1;
114 175
176/**
177 * tracing_is_enabled - return tracer_enabled status
178 *
179 * This function is used by other tracers to know the status
180 * of the tracer_enabled flag. Tracers may use this function
181 * to know if it should enable their features when starting
182 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
183 */
184int tracing_is_enabled(void)
185{
186 return tracer_enabled;
187}
188
115/* function tracing enabled */ 189/* function tracing enabled */
116int ftrace_function_enabled; 190int ftrace_function_enabled;
117 191
@@ -153,8 +227,9 @@ static DEFINE_MUTEX(trace_types_lock);
153/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 227/* trace_wait is a waitqueue for tasks blocked on trace_poll */
154static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 228static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
155 229
156/* trace_flags holds iter_ctrl options */ 230/* trace_flags holds trace_options default values */
157unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 231unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
232 TRACE_ITER_ANNOTATE;
158 233
159/** 234/**
160 * trace_wake_up - wake up tasks waiting for trace input 235 * trace_wake_up - wake up tasks waiting for trace input
@@ -193,13 +268,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
193 return nsecs / 1000; 268 return nsecs / 1000;
194} 269}
195 270
196/*
197 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
198 * control the output of kernel symbols.
199 */
200#define TRACE_ITER_SYM_MASK \
201 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
202
203/* These must match the bit postions in trace_iterator_flags */ 271/* These must match the bit postions in trace_iterator_flags */
204static const char *trace_options[] = { 272static const char *trace_options[] = {
205 "print-parent", 273 "print-parent",
@@ -213,6 +281,12 @@ static const char *trace_options[] = {
213 "stacktrace", 281 "stacktrace",
214 "sched-tree", 282 "sched-tree",
215 "ftrace_printk", 283 "ftrace_printk",
284 "ftrace_preempt",
285 "branch",
286 "annotate",
287 "userstacktrace",
288 "sym-userobj",
289 "printk-msg-only",
216 NULL 290 NULL
217}; 291};
218 292
@@ -246,7 +320,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
246 320
247 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 321 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
248 data->pid = tsk->pid; 322 data->pid = tsk->pid;
249 data->uid = tsk->uid; 323 data->uid = task_uid(tsk);
250 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; 324 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
251 data->policy = tsk->policy; 325 data->policy = tsk->policy;
252 data->rt_priority = tsk->rt_priority; 326 data->rt_priority = tsk->rt_priority;
@@ -359,6 +433,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
359 return trace_seq_putmem(s, hex, j); 433 return trace_seq_putmem(s, hex, j);
360} 434}
361 435
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440
441 if (s->len >= (PAGE_SIZE - 1))
442 return 0;
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
444 if (!IS_ERR(p)) {
445 p = mangle_path(s->buffer + s->len, p, "\n");
446 if (p) {
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454
455 return 0;
456}
457
362static void 458static void
363trace_seq_reset(struct trace_seq *s) 459trace_seq_reset(struct trace_seq *s)
364{ 460{
@@ -470,7 +566,17 @@ int register_tracer(struct tracer *type)
470 return -1; 566 return -1;
471 } 567 }
472 568
569 /*
570 * When this gets called we hold the BKL which means that
571 * preemption is disabled. Various trace selftests however
572 * need to disable and enable preemption for successful tests.
573 * So we drop the BKL here and grab it after the tests again.
574 */
575 unlock_kernel();
473 mutex_lock(&trace_types_lock); 576 mutex_lock(&trace_types_lock);
577
578 tracing_selftest_running = true;
579
474 for (t = trace_types; t; t = t->next) { 580 for (t = trace_types; t; t = t->next) {
475 if (strcmp(type->name, t->name) == 0) { 581 if (strcmp(type->name, t->name) == 0) {
476 /* already found */ 582 /* already found */
@@ -481,12 +587,20 @@ int register_tracer(struct tracer *type)
481 } 587 }
482 } 588 }
483 589
590 if (!type->set_flag)
591 type->set_flag = &dummy_set_flag;
592 if (!type->flags)
593 type->flags = &dummy_tracer_flags;
594 else
595 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt;
597
484#ifdef CONFIG_FTRACE_STARTUP_TEST 598#ifdef CONFIG_FTRACE_STARTUP_TEST
485 if (type->selftest) { 599 if (type->selftest) {
486 struct tracer *saved_tracer = current_trace; 600 struct tracer *saved_tracer = current_trace;
487 struct trace_array *tr = &global_trace; 601 struct trace_array *tr = &global_trace;
488 int saved_ctrl = tr->ctrl;
489 int i; 602 int i;
603
490 /* 604 /*
491 * Run a selftest on this tracer. 605 * Run a selftest on this tracer.
492 * Here we reset the trace buffer, and set the current 606 * Here we reset the trace buffer, and set the current
@@ -494,25 +608,23 @@ int register_tracer(struct tracer *type)
494 * internal tracing to verify that everything is in order. 608 * internal tracing to verify that everything is in order.
495 * If we fail, we do not register this tracer. 609 * If we fail, we do not register this tracer.
496 */ 610 */
497 for_each_tracing_cpu(i) { 611 for_each_tracing_cpu(i)
498 tracing_reset(tr, i); 612 tracing_reset(tr, i);
499 } 613
500 current_trace = type; 614 current_trace = type;
501 tr->ctrl = 0;
502 /* the test is responsible for initializing and enabling */ 615 /* the test is responsible for initializing and enabling */
503 pr_info("Testing tracer %s: ", type->name); 616 pr_info("Testing tracer %s: ", type->name);
504 ret = type->selftest(type, tr); 617 ret = type->selftest(type, tr);
505 /* the test is responsible for resetting too */ 618 /* the test is responsible for resetting too */
506 current_trace = saved_tracer; 619 current_trace = saved_tracer;
507 tr->ctrl = saved_ctrl;
508 if (ret) { 620 if (ret) {
509 printk(KERN_CONT "FAILED!\n"); 621 printk(KERN_CONT "FAILED!\n");
510 goto out; 622 goto out;
511 } 623 }
512 /* Only reset on passing, to avoid touching corrupted buffers */ 624 /* Only reset on passing, to avoid touching corrupted buffers */
513 for_each_tracing_cpu(i) { 625 for_each_tracing_cpu(i)
514 tracing_reset(tr, i); 626 tracing_reset(tr, i);
515 } 627
516 printk(KERN_CONT "PASSED\n"); 628 printk(KERN_CONT "PASSED\n");
517 } 629 }
518#endif 630#endif
@@ -524,7 +636,9 @@ int register_tracer(struct tracer *type)
524 max_tracer_type_len = len; 636 max_tracer_type_len = len;
525 637
526 out: 638 out:
639 tracing_selftest_running = false;
527 mutex_unlock(&trace_types_lock); 640 mutex_unlock(&trace_types_lock);
641 lock_kernel();
528 642
529 return ret; 643 return ret;
530} 644}
@@ -564,6 +678,16 @@ void tracing_reset(struct trace_array *tr, int cpu)
564 ftrace_enable_cpu(); 678 ftrace_enable_cpu();
565} 679}
566 680
681void tracing_reset_online_cpus(struct trace_array *tr)
682{
683 int cpu;
684
685 tr->time_start = ftrace_now(tr->cpu);
686
687 for_each_online_cpu(cpu)
688 tracing_reset(tr, cpu);
689}
690
567#define SAVED_CMDLINES 128 691#define SAVED_CMDLINES 128
568static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 692static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
569static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 693static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
@@ -581,6 +705,91 @@ static void trace_init_cmdlines(void)
581 cmdline_idx = 0; 705 cmdline_idx = 0;
582} 706}
583 707
708static int trace_stop_count;
709static DEFINE_SPINLOCK(tracing_start_lock);
710
711/**
712 * ftrace_off_permanent - disable all ftrace code permanently
713 *
714 * This should only be called when a serious anomally has
715 * been detected. This will turn off the function tracing,
716 * ring buffers, and other tracing utilites. It takes no
717 * locks and can be called from any context.
718 */
719void ftrace_off_permanent(void)
720{
721 tracing_disabled = 1;
722 ftrace_stop();
723 tracing_off_permanent();
724}
725
726/**
727 * tracing_start - quick start of the tracer
728 *
729 * If tracing is enabled but was stopped by tracing_stop,
730 * this will start the tracer back up.
731 */
732void tracing_start(void)
733{
734 struct ring_buffer *buffer;
735 unsigned long flags;
736
737 if (tracing_disabled)
738 return;
739
740 spin_lock_irqsave(&tracing_start_lock, flags);
741 if (--trace_stop_count)
742 goto out;
743
744 if (trace_stop_count < 0) {
745 /* Someone screwed up their debugging */
746 WARN_ON_ONCE(1);
747 trace_stop_count = 0;
748 goto out;
749 }
750
751
752 buffer = global_trace.buffer;
753 if (buffer)
754 ring_buffer_record_enable(buffer);
755
756 buffer = max_tr.buffer;
757 if (buffer)
758 ring_buffer_record_enable(buffer);
759
760 ftrace_start();
761 out:
762 spin_unlock_irqrestore(&tracing_start_lock, flags);
763}
764
765/**
766 * tracing_stop - quick stop of the tracer
767 *
768 * Light weight way to stop tracing. Use in conjunction with
769 * tracing_start.
770 */
771void tracing_stop(void)
772{
773 struct ring_buffer *buffer;
774 unsigned long flags;
775
776 ftrace_stop();
777 spin_lock_irqsave(&tracing_start_lock, flags);
778 if (trace_stop_count++)
779 goto out;
780
781 buffer = global_trace.buffer;
782 if (buffer)
783 ring_buffer_record_disable(buffer);
784
785 buffer = max_tr.buffer;
786 if (buffer)
787 ring_buffer_record_disable(buffer);
788
789 out:
790 spin_unlock_irqrestore(&tracing_start_lock, flags);
791}
792
584void trace_stop_cmdline_recording(void); 793void trace_stop_cmdline_recording(void);
585 794
586static void trace_save_cmdline(struct task_struct *tsk) 795static void trace_save_cmdline(struct task_struct *tsk)
@@ -618,7 +827,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
618 spin_unlock(&trace_cmdline_lock); 827 spin_unlock(&trace_cmdline_lock);
619} 828}
620 829
621static char *trace_find_cmdline(int pid) 830char *trace_find_cmdline(int pid)
622{ 831{
623 char *cmdline = "<...>"; 832 char *cmdline = "<...>";
624 unsigned map; 833 unsigned map;
@@ -655,6 +864,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
655 864
656 entry->preempt_count = pc & 0xff; 865 entry->preempt_count = pc & 0xff;
657 entry->pid = (tsk) ? tsk->pid : 0; 866 entry->pid = (tsk) ? tsk->pid : 0;
867 entry->tgid = (tsk) ? tsk->tgid : 0;
658 entry->flags = 868 entry->flags =
659#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 869#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
660 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 870 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -691,6 +901,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
691 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 901 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
692} 902}
693 903
904#ifdef CONFIG_FUNCTION_GRAPH_TRACER
905static void __trace_graph_entry(struct trace_array *tr,
906 struct trace_array_cpu *data,
907 struct ftrace_graph_ent *trace,
908 unsigned long flags,
909 int pc)
910{
911 struct ring_buffer_event *event;
912 struct ftrace_graph_ent_entry *entry;
913 unsigned long irq_flags;
914
915 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
916 return;
917
918 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
919 &irq_flags);
920 if (!event)
921 return;
922 entry = ring_buffer_event_data(event);
923 tracing_generic_entry_update(&entry->ent, flags, pc);
924 entry->ent.type = TRACE_GRAPH_ENT;
925 entry->graph_ent = *trace;
926 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
927}
928
929static void __trace_graph_return(struct trace_array *tr,
930 struct trace_array_cpu *data,
931 struct ftrace_graph_ret *trace,
932 unsigned long flags,
933 int pc)
934{
935 struct ring_buffer_event *event;
936 struct ftrace_graph_ret_entry *entry;
937 unsigned long irq_flags;
938
939 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
940 return;
941
942 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
943 &irq_flags);
944 if (!event)
945 return;
946 entry = ring_buffer_event_data(event);
947 tracing_generic_entry_update(&entry->ent, flags, pc);
948 entry->ent.type = TRACE_GRAPH_RET;
949 entry->ret = *trace;
950 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
951}
952#endif
953
694void 954void
695ftrace(struct trace_array *tr, struct trace_array_cpu *data, 955ftrace(struct trace_array *tr, struct trace_array_cpu *data,
696 unsigned long ip, unsigned long parent_ip, unsigned long flags, 956 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -742,6 +1002,46 @@ void __trace_stack(struct trace_array *tr,
742 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 1002 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
743} 1003}
744 1004
1005static void ftrace_trace_userstack(struct trace_array *tr,
1006 struct trace_array_cpu *data,
1007 unsigned long flags, int pc)
1008{
1009#ifdef CONFIG_STACKTRACE
1010 struct ring_buffer_event *event;
1011 struct userstack_entry *entry;
1012 struct stack_trace trace;
1013 unsigned long irq_flags;
1014
1015 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1016 return;
1017
1018 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
1019 &irq_flags);
1020 if (!event)
1021 return;
1022 entry = ring_buffer_event_data(event);
1023 tracing_generic_entry_update(&entry->ent, flags, pc);
1024 entry->ent.type = TRACE_USER_STACK;
1025
1026 memset(&entry->caller, 0, sizeof(entry->caller));
1027
1028 trace.nr_entries = 0;
1029 trace.max_entries = FTRACE_STACK_ENTRIES;
1030 trace.skip = 0;
1031 trace.entries = entry->caller;
1032
1033 save_stack_trace_user(&trace);
1034 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1035#endif
1036}
1037
1038void __trace_userstack(struct trace_array *tr,
1039 struct trace_array_cpu *data,
1040 unsigned long flags)
1041{
1042 ftrace_trace_userstack(tr, data, flags, preempt_count());
1043}
1044
745static void 1045static void
746ftrace_trace_special(void *__tr, void *__data, 1046ftrace_trace_special(void *__tr, void *__data,
747 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1047 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -765,6 +1065,7 @@ ftrace_trace_special(void *__tr, void *__data,
765 entry->arg3 = arg3; 1065 entry->arg3 = arg3;
766 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1066 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
767 ftrace_trace_stack(tr, data, irq_flags, 4, pc); 1067 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1068 ftrace_trace_userstack(tr, data, irq_flags, pc);
768 1069
769 trace_wake_up(); 1070 trace_wake_up();
770} 1071}
@@ -803,6 +1104,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
803 entry->next_cpu = task_cpu(next); 1104 entry->next_cpu = task_cpu(next);
804 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1105 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
805 ftrace_trace_stack(tr, data, flags, 5, pc); 1106 ftrace_trace_stack(tr, data, flags, 5, pc);
1107 ftrace_trace_userstack(tr, data, flags, pc);
806} 1108}
807 1109
808void 1110void
@@ -832,6 +1134,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
832 entry->next_cpu = task_cpu(wakee); 1134 entry->next_cpu = task_cpu(wakee);
833 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
834 ftrace_trace_stack(tr, data, flags, 6, pc); 1136 ftrace_trace_stack(tr, data, flags, 6, pc);
1137 ftrace_trace_userstack(tr, data, flags, pc);
835 1138
836 trace_wake_up(); 1139 trace_wake_up();
837} 1140}
@@ -841,26 +1144,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
841{ 1144{
842 struct trace_array *tr = &global_trace; 1145 struct trace_array *tr = &global_trace;
843 struct trace_array_cpu *data; 1146 struct trace_array_cpu *data;
1147 unsigned long flags;
844 int cpu; 1148 int cpu;
845 int pc; 1149 int pc;
846 1150
847 if (tracing_disabled || !tr->ctrl) 1151 if (tracing_disabled)
848 return; 1152 return;
849 1153
850 pc = preempt_count(); 1154 pc = preempt_count();
851 preempt_disable_notrace(); 1155 local_irq_save(flags);
852 cpu = raw_smp_processor_id(); 1156 cpu = raw_smp_processor_id();
853 data = tr->data[cpu]; 1157 data = tr->data[cpu];
854 1158
855 if (likely(!atomic_read(&data->disabled))) 1159 if (likely(atomic_inc_return(&data->disabled) == 1))
856 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1160 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
857 1161
858 preempt_enable_notrace(); 1162 atomic_dec(&data->disabled);
1163 local_irq_restore(flags);
859} 1164}
860 1165
861#ifdef CONFIG_FUNCTION_TRACER 1166#ifdef CONFIG_FUNCTION_TRACER
862static void 1167static void
863function_trace_call(unsigned long ip, unsigned long parent_ip) 1168function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
864{ 1169{
865 struct trace_array *tr = &global_trace; 1170 struct trace_array *tr = &global_trace;
866 struct trace_array_cpu *data; 1171 struct trace_array_cpu *data;
@@ -873,8 +1178,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
873 return; 1178 return;
874 1179
875 pc = preempt_count(); 1180 pc = preempt_count();
876 resched = need_resched(); 1181 resched = ftrace_preempt_disable();
877 preempt_disable_notrace();
878 local_save_flags(flags); 1182 local_save_flags(flags);
879 cpu = raw_smp_processor_id(); 1183 cpu = raw_smp_processor_id();
880 data = tr->data[cpu]; 1184 data = tr->data[cpu];
@@ -884,11 +1188,96 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
884 trace_function(tr, data, ip, parent_ip, flags, pc); 1188 trace_function(tr, data, ip, parent_ip, flags, pc);
885 1189
886 atomic_dec(&data->disabled); 1190 atomic_dec(&data->disabled);
887 if (resched) 1191 ftrace_preempt_enable(resched);
888 preempt_enable_no_resched_notrace(); 1192}
889 else 1193
890 preempt_enable_notrace(); 1194static void
1195function_trace_call(unsigned long ip, unsigned long parent_ip)
1196{
1197 struct trace_array *tr = &global_trace;
1198 struct trace_array_cpu *data;
1199 unsigned long flags;
1200 long disabled;
1201 int cpu;
1202 int pc;
1203
1204 if (unlikely(!ftrace_function_enabled))
1205 return;
1206
1207 /*
1208 * Need to use raw, since this must be called before the
1209 * recursive protection is performed.
1210 */
1211 local_irq_save(flags);
1212 cpu = raw_smp_processor_id();
1213 data = tr->data[cpu];
1214 disabled = atomic_inc_return(&data->disabled);
1215
1216 if (likely(disabled == 1)) {
1217 pc = preempt_count();
1218 trace_function(tr, data, ip, parent_ip, flags, pc);
1219 }
1220
1221 atomic_dec(&data->disabled);
1222 local_irq_restore(flags);
1223}
1224
1225#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1226int trace_graph_entry(struct ftrace_graph_ent *trace)
1227{
1228 struct trace_array *tr = &global_trace;
1229 struct trace_array_cpu *data;
1230 unsigned long flags;
1231 long disabled;
1232 int cpu;
1233 int pc;
1234
1235 if (!ftrace_trace_task(current))
1236 return 0;
1237
1238 if (!ftrace_graph_addr(trace->func))
1239 return 0;
1240
1241 local_irq_save(flags);
1242 cpu = raw_smp_processor_id();
1243 data = tr->data[cpu];
1244 disabled = atomic_inc_return(&data->disabled);
1245 if (likely(disabled == 1)) {
1246 pc = preempt_count();
1247 __trace_graph_entry(tr, data, trace, flags, pc);
1248 }
1249 /* Only do the atomic if it is not already set */
1250 if (!test_tsk_trace_graph(current))
1251 set_tsk_trace_graph(current);
1252 atomic_dec(&data->disabled);
1253 local_irq_restore(flags);
1254
1255 return 1;
1256}
1257
1258void trace_graph_return(struct ftrace_graph_ret *trace)
1259{
1260 struct trace_array *tr = &global_trace;
1261 struct trace_array_cpu *data;
1262 unsigned long flags;
1263 long disabled;
1264 int cpu;
1265 int pc;
1266
1267 local_irq_save(flags);
1268 cpu = raw_smp_processor_id();
1269 data = tr->data[cpu];
1270 disabled = atomic_inc_return(&data->disabled);
1271 if (likely(disabled == 1)) {
1272 pc = preempt_count();
1273 __trace_graph_return(tr, data, trace, flags, pc);
1274 }
1275 if (!trace->depth)
1276 clear_tsk_trace_graph(current);
1277 atomic_dec(&data->disabled);
1278 local_irq_restore(flags);
891} 1279}
1280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
892 1281
893static struct ftrace_ops trace_ops __read_mostly = 1282static struct ftrace_ops trace_ops __read_mostly =
894{ 1283{
@@ -898,9 +1287,14 @@ static struct ftrace_ops trace_ops __read_mostly =
898void tracing_start_function_trace(void) 1287void tracing_start_function_trace(void)
899{ 1288{
900 ftrace_function_enabled = 0; 1289 ftrace_function_enabled = 0;
1290
1291 if (trace_flags & TRACE_ITER_PREEMPTONLY)
1292 trace_ops.func = function_trace_call_preempt_only;
1293 else
1294 trace_ops.func = function_trace_call;
1295
901 register_ftrace_function(&trace_ops); 1296 register_ftrace_function(&trace_ops);
902 if (tracer_enabled) 1297 ftrace_function_enabled = 1;
903 ftrace_function_enabled = 1;
904} 1298}
905 1299
906void tracing_stop_function_trace(void) 1300void tracing_stop_function_trace(void)
@@ -912,9 +1306,10 @@ void tracing_stop_function_trace(void)
912 1306
913enum trace_file_type { 1307enum trace_file_type {
914 TRACE_FILE_LAT_FMT = 1, 1308 TRACE_FILE_LAT_FMT = 1,
1309 TRACE_FILE_ANNOTATE = 2,
915}; 1310};
916 1311
917static void trace_iterator_increment(struct trace_iterator *iter, int cpu) 1312static void trace_iterator_increment(struct trace_iterator *iter)
918{ 1313{
919 /* Don't allow ftrace to trace into the ring buffers */ 1314 /* Don't allow ftrace to trace into the ring buffers */
920 ftrace_disable_cpu(); 1315 ftrace_disable_cpu();
@@ -993,7 +1388,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
993 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1388 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
994 1389
995 if (iter->ent) 1390 if (iter->ent)
996 trace_iterator_increment(iter, iter->cpu); 1391 trace_iterator_increment(iter);
997 1392
998 return iter->ent ? iter : NULL; 1393 return iter->ent ? iter : NULL;
999} 1394}
@@ -1047,10 +1442,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1047 1442
1048 atomic_inc(&trace_record_cmdline_disabled); 1443 atomic_inc(&trace_record_cmdline_disabled);
1049 1444
1050 /* let the tracer grab locks here if needed */
1051 if (current_trace->start)
1052 current_trace->start(iter);
1053
1054 if (*pos != iter->pos) { 1445 if (*pos != iter->pos) {
1055 iter->ent = NULL; 1446 iter->ent = NULL;
1056 iter->cpu = 0; 1447 iter->cpu = 0;
@@ -1077,14 +1468,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1077 1468
1078static void s_stop(struct seq_file *m, void *p) 1469static void s_stop(struct seq_file *m, void *p)
1079{ 1470{
1080 struct trace_iterator *iter = m->private;
1081
1082 atomic_dec(&trace_record_cmdline_disabled); 1471 atomic_dec(&trace_record_cmdline_disabled);
1083
1084 /* let the tracer release locks here if needed */
1085 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1086 iter->trace->stop(iter);
1087
1088 mutex_unlock(&trace_types_lock); 1472 mutex_unlock(&trace_types_lock);
1089} 1473}
1090 1474
@@ -1143,7 +1527,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1143# define IP_FMT "%016lx" 1527# define IP_FMT "%016lx"
1144#endif 1528#endif
1145 1529
1146static int 1530int
1147seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 1531seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1148{ 1532{
1149 int ret; 1533 int ret;
@@ -1164,6 +1548,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1164 return ret; 1548 return ret;
1165} 1549}
1166 1550
1551static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1552 unsigned long ip, unsigned long sym_flags)
1553{
1554 struct file *file = NULL;
1555 unsigned long vmstart = 0;
1556 int ret = 1;
1557
1558 if (mm) {
1559 const struct vm_area_struct *vma;
1560
1561 down_read(&mm->mmap_sem);
1562 vma = find_vma(mm, ip);
1563 if (vma) {
1564 file = vma->vm_file;
1565 vmstart = vma->vm_start;
1566 }
1567 if (file) {
1568 ret = trace_seq_path(s, &file->f_path);
1569 if (ret)
1570 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1571 }
1572 up_read(&mm->mmap_sem);
1573 }
1574 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1575 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1576 return ret;
1577}
1578
1579static int
1580seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1581 unsigned long sym_flags)
1582{
1583 struct mm_struct *mm = NULL;
1584 int ret = 1;
1585 unsigned int i;
1586
1587 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1588 struct task_struct *task;
1589 /*
1590 * we do the lookup on the thread group leader,
1591 * since individual threads might have already quit!
1592 */
1593 rcu_read_lock();
1594 task = find_task_by_vpid(entry->ent.tgid);
1595 if (task)
1596 mm = get_task_mm(task);
1597 rcu_read_unlock();
1598 }
1599
1600 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1601 unsigned long ip = entry->caller[i];
1602
1603 if (ip == ULONG_MAX || !ret)
1604 break;
1605 if (i && ret)
1606 ret = trace_seq_puts(s, " <- ");
1607 if (!ip) {
1608 if (ret)
1609 ret = trace_seq_puts(s, "??");
1610 continue;
1611 }
1612 if (!ret)
1613 break;
1614 if (ret)
1615 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1616 }
1617
1618 if (mm)
1619 mmput(mm);
1620 return ret;
1621}
1622
1167static void print_lat_help_header(struct seq_file *m) 1623static void print_lat_help_header(struct seq_file *m)
1168{ 1624{
1169 seq_puts(m, "# _------=> CPU# \n"); 1625 seq_puts(m, "# _------=> CPU# \n");
@@ -1301,6 +1757,13 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1301 1757
1302static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1758static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1303 1759
1760static int task_state_char(unsigned long state)
1761{
1762 int bit = state ? __ffs(state) + 1 : 0;
1763
1764 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1765}
1766
1304/* 1767/*
1305 * The message is supposed to contain an ending newline. 1768 * The message is supposed to contain an ending newline.
1306 * If the printing stops prematurely, try to add a newline of our own. 1769 * If the printing stops prematurely, try to add a newline of our own.
@@ -1338,6 +1801,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1338 trace_seq_putc(s, '\n'); 1801 trace_seq_putc(s, '\n');
1339} 1802}
1340 1803
1804static void test_cpu_buff_start(struct trace_iterator *iter)
1805{
1806 struct trace_seq *s = &iter->seq;
1807
1808 if (!(trace_flags & TRACE_ITER_ANNOTATE))
1809 return;
1810
1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1812 return;
1813
1814 if (cpumask_test_cpu(iter->cpu, iter->started))
1815 return;
1816
1817 cpumask_set_cpu(iter->cpu, iter->started);
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819}
1820
1341static enum print_line_t 1821static enum print_line_t
1342print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1822print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1343{ 1823{
@@ -1352,11 +1832,12 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1352 char *comm; 1832 char *comm;
1353 int S, T; 1833 int S, T;
1354 int i; 1834 int i;
1355 unsigned state;
1356 1835
1357 if (entry->type == TRACE_CONT) 1836 if (entry->type == TRACE_CONT)
1358 return TRACE_TYPE_HANDLED; 1837 return TRACE_TYPE_HANDLED;
1359 1838
1839 test_cpu_buff_start(iter);
1840
1360 next_entry = find_next_entry(iter, NULL, &next_ts); 1841 next_entry = find_next_entry(iter, NULL, &next_ts);
1361 if (!next_entry) 1842 if (!next_entry)
1362 next_ts = iter->ts; 1843 next_ts = iter->ts;
@@ -1396,12 +1877,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1396 1877
1397 trace_assign_type(field, entry); 1878 trace_assign_type(field, entry);
1398 1879
1399 T = field->next_state < sizeof(state_to_char) ? 1880 T = task_state_char(field->next_state);
1400 state_to_char[field->next_state] : 'X'; 1881 S = task_state_char(field->prev_state);
1401
1402 state = field->prev_state ?
1403 __ffs(field->prev_state) + 1 : 0;
1404 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1405 comm = trace_find_cmdline(field->next_pid); 1882 comm = trace_find_cmdline(field->next_pid);
1406 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", 1883 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1407 field->prev_pid, 1884 field->prev_pid,
@@ -1448,6 +1925,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1448 trace_seq_print_cont(s, iter); 1925 trace_seq_print_cont(s, iter);
1449 break; 1926 break;
1450 } 1927 }
1928 case TRACE_BRANCH: {
1929 struct trace_branch *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1934 field->correct ? " ok " : " MISS ",
1935 field->func,
1936 field->file,
1937 field->line);
1938 break;
1939 }
1940 case TRACE_USER_STACK: {
1941 struct userstack_entry *field;
1942
1943 trace_assign_type(field, entry);
1944
1945 seq_print_userip_objs(field, s, sym_flags);
1946 trace_seq_putc(s, '\n');
1947 break;
1948 }
1451 default: 1949 default:
1452 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1950 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1453 } 1951 }
@@ -1472,6 +1970,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1472 if (entry->type == TRACE_CONT) 1970 if (entry->type == TRACE_CONT)
1473 return TRACE_TYPE_HANDLED; 1971 return TRACE_TYPE_HANDLED;
1474 1972
1973 test_cpu_buff_start(iter);
1974
1475 comm = trace_find_cmdline(iter->ent->pid); 1975 comm = trace_find_cmdline(iter->ent->pid);
1476 1976
1477 t = ns2usecs(iter->ts); 1977 t = ns2usecs(iter->ts);
@@ -1519,10 +2019,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1519 2019
1520 trace_assign_type(field, entry); 2020 trace_assign_type(field, entry);
1521 2021
1522 S = field->prev_state < sizeof(state_to_char) ? 2022 T = task_state_char(field->next_state);
1523 state_to_char[field->prev_state] : 'X'; 2023 S = task_state_char(field->prev_state);
1524 T = field->next_state < sizeof(state_to_char) ?
1525 state_to_char[field->next_state] : 'X';
1526 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", 2024 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
1527 field->prev_pid, 2025 field->prev_pid,
1528 field->prev_prio, 2026 field->prev_prio,
@@ -1581,6 +2079,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1581 trace_seq_print_cont(s, iter); 2079 trace_seq_print_cont(s, iter);
1582 break; 2080 break;
1583 } 2081 }
2082 case TRACE_GRAPH_RET: {
2083 return print_graph_function(iter);
2084 }
2085 case TRACE_GRAPH_ENT: {
2086 return print_graph_function(iter);
2087 }
2088 case TRACE_BRANCH: {
2089 struct trace_branch *field;
2090
2091 trace_assign_type(field, entry);
2092
2093 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2094 field->correct ? " ok " : " MISS ",
2095 field->func,
2096 field->file,
2097 field->line);
2098 break;
2099 }
2100 case TRACE_USER_STACK: {
2101 struct userstack_entry *field;
2102
2103 trace_assign_type(field, entry);
2104
2105 ret = seq_print_userip_objs(field, s, sym_flags);
2106 if (!ret)
2107 return TRACE_TYPE_PARTIAL_LINE;
2108 ret = trace_seq_putc(s, '\n');
2109 if (!ret)
2110 return TRACE_TYPE_PARTIAL_LINE;
2111 break;
2112 }
1584 } 2113 }
1585 return TRACE_TYPE_HANDLED; 2114 return TRACE_TYPE_HANDLED;
1586} 2115}
@@ -1621,12 +2150,9 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1621 2150
1622 trace_assign_type(field, entry); 2151 trace_assign_type(field, entry);
1623 2152
1624 S = field->prev_state < sizeof(state_to_char) ? 2153 T = task_state_char(field->next_state);
1625 state_to_char[field->prev_state] : 'X'; 2154 S = entry->type == TRACE_WAKE ? '+' :
1626 T = field->next_state < sizeof(state_to_char) ? 2155 task_state_char(field->prev_state);
1627 state_to_char[field->next_state] : 'X';
1628 if (entry->type == TRACE_WAKE)
1629 S = '+';
1630 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", 2156 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
1631 field->prev_pid, 2157 field->prev_pid,
1632 field->prev_prio, 2158 field->prev_prio,
@@ -1640,6 +2166,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1640 break; 2166 break;
1641 } 2167 }
1642 case TRACE_SPECIAL: 2168 case TRACE_SPECIAL:
2169 case TRACE_USER_STACK:
1643 case TRACE_STACK: { 2170 case TRACE_STACK: {
1644 struct special_entry *field; 2171 struct special_entry *field;
1645 2172
@@ -1712,12 +2239,9 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1712 2239
1713 trace_assign_type(field, entry); 2240 trace_assign_type(field, entry);
1714 2241
1715 S = field->prev_state < sizeof(state_to_char) ? 2242 T = task_state_char(field->next_state);
1716 state_to_char[field->prev_state] : 'X'; 2243 S = entry->type == TRACE_WAKE ? '+' :
1717 T = field->next_state < sizeof(state_to_char) ? 2244 task_state_char(field->prev_state);
1718 state_to_char[field->next_state] : 'X';
1719 if (entry->type == TRACE_WAKE)
1720 S = '+';
1721 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 2245 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
1722 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); 2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
1723 SEQ_PUT_HEX_FIELD_RET(s, S); 2247 SEQ_PUT_HEX_FIELD_RET(s, S);
@@ -1728,6 +2252,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1728 break; 2252 break;
1729 } 2253 }
1730 case TRACE_SPECIAL: 2254 case TRACE_SPECIAL:
2255 case TRACE_USER_STACK:
1731 case TRACE_STACK: { 2256 case TRACE_STACK: {
1732 struct special_entry *field; 2257 struct special_entry *field;
1733 2258
@@ -1744,6 +2269,25 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1744 return TRACE_TYPE_HANDLED; 2269 return TRACE_TYPE_HANDLED;
1745} 2270}
1746 2271
2272static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2273{
2274 struct trace_seq *s = &iter->seq;
2275 struct trace_entry *entry = iter->ent;
2276 struct print_entry *field;
2277 int ret;
2278
2279 trace_assign_type(field, entry);
2280
2281 ret = trace_seq_printf(s, field->buf);
2282 if (!ret)
2283 return TRACE_TYPE_PARTIAL_LINE;
2284
2285 if (entry->flags & TRACE_FLAG_CONT)
2286 trace_seq_print_cont(s, iter);
2287
2288 return TRACE_TYPE_HANDLED;
2289}
2290
1747static enum print_line_t print_bin_fmt(struct trace_iterator *iter) 2291static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1748{ 2292{
1749 struct trace_seq *s = &iter->seq; 2293 struct trace_seq *s = &iter->seq;
@@ -1782,6 +2326,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1782 break; 2326 break;
1783 } 2327 }
1784 case TRACE_SPECIAL: 2328 case TRACE_SPECIAL:
2329 case TRACE_USER_STACK:
1785 case TRACE_STACK: { 2330 case TRACE_STACK: {
1786 struct special_entry *field; 2331 struct special_entry *field;
1787 2332
@@ -1823,6 +2368,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1823 return ret; 2368 return ret;
1824 } 2369 }
1825 2370
2371 if (iter->ent->type == TRACE_PRINT &&
2372 trace_flags & TRACE_ITER_PRINTK &&
2373 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2374 return print_printk_msg_only(iter);
2375
1826 if (trace_flags & TRACE_ITER_BIN) 2376 if (trace_flags & TRACE_ITER_BIN)
1827 return print_bin_fmt(iter); 2377 return print_bin_fmt(iter);
1828 2378
@@ -1847,7 +2397,9 @@ static int s_show(struct seq_file *m, void *v)
1847 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2397 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1848 seq_puts(m, "#\n"); 2398 seq_puts(m, "#\n");
1849 } 2399 }
1850 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2400 if (iter->trace && iter->trace->print_header)
2401 iter->trace->print_header(m);
2402 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1851 /* print nothing if the buffers are empty */ 2403 /* print nothing if the buffers are empty */
1852 if (trace_empty(iter)) 2404 if (trace_empty(iter))
1853 return 0; 2405 return 0;
@@ -1899,6 +2451,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1899 iter->trace = current_trace; 2451 iter->trace = current_trace;
1900 iter->pos = -1; 2452 iter->pos = -1;
1901 2453
2454 /* Notify the tracer early; before we stop tracing. */
2455 if (iter->trace && iter->trace->open)
2456 iter->trace->open(iter);
2457
2458 /* Annotate start of buffers if we had overruns */
2459 if (ring_buffer_overruns(iter->tr->buffer))
2460 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2461
2462
1902 for_each_tracing_cpu(cpu) { 2463 for_each_tracing_cpu(cpu) {
1903 2464
1904 iter->buffer_iter[cpu] = 2465 iter->buffer_iter[cpu] =
@@ -1917,13 +2478,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1917 m->private = iter; 2478 m->private = iter;
1918 2479
1919 /* stop the trace while dumping */ 2480 /* stop the trace while dumping */
1920 if (iter->tr->ctrl) { 2481 tracing_stop();
1921 tracer_enabled = 0;
1922 ftrace_function_enabled = 0;
1923 }
1924
1925 if (iter->trace && iter->trace->open)
1926 iter->trace->open(iter);
1927 2482
1928 mutex_unlock(&trace_types_lock); 2483 mutex_unlock(&trace_types_lock);
1929 2484
@@ -1966,14 +2521,7 @@ int tracing_release(struct inode *inode, struct file *file)
1966 iter->trace->close(iter); 2521 iter->trace->close(iter);
1967 2522
1968 /* reenable tracing if it was previously enabled */ 2523 /* reenable tracing if it was previously enabled */
1969 if (iter->tr->ctrl) { 2524 tracing_start();
1970 tracer_enabled = 1;
1971 /*
1972 * It is safe to enable function tracing even if it
1973 * isn't used
1974 */
1975 ftrace_function_enabled = 1;
1976 }
1977 mutex_unlock(&trace_types_lock); 2525 mutex_unlock(&trace_types_lock);
1978 2526
1979 seq_release(inode, file); 2527 seq_release(inode, file);
@@ -2098,13 +2646,7 @@ static struct file_operations show_traces_fops = {
2098/* 2646/*
2099 * Only trace on a CPU if the bitmask is set: 2647 * Only trace on a CPU if the bitmask is set:
2100 */ 2648 */
2101static cpumask_t tracing_cpumask = CPU_MASK_ALL; 2649static cpumask_var_t tracing_cpumask;
2102
2103/*
2104 * When tracing/tracing_cpu_mask is modified then this holds
2105 * the new bitmask we are about to install:
2106 */
2107static cpumask_t tracing_cpumask_new;
2108 2650
2109/* 2651/*
2110 * The tracer itself will not take this lock, but still we want 2652 * The tracer itself will not take this lock, but still we want
@@ -2145,39 +2687,45 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2145 size_t count, loff_t *ppos) 2687 size_t count, loff_t *ppos)
2146{ 2688{
2147 int err, cpu; 2689 int err, cpu;
2690 cpumask_var_t tracing_cpumask_new;
2691
2692 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2693 return -ENOMEM;
2148 2694
2149 mutex_lock(&tracing_cpumask_update_lock); 2695 mutex_lock(&tracing_cpumask_update_lock);
2150 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2696 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2151 if (err) 2697 if (err)
2152 goto err_unlock; 2698 goto err_unlock;
2153 2699
2154 raw_local_irq_disable(); 2700 local_irq_disable();
2155 __raw_spin_lock(&ftrace_max_lock); 2701 __raw_spin_lock(&ftrace_max_lock);
2156 for_each_tracing_cpu(cpu) { 2702 for_each_tracing_cpu(cpu) {
2157 /* 2703 /*
2158 * Increase/decrease the disabled counter if we are 2704 * Increase/decrease the disabled counter if we are
2159 * about to flip a bit in the cpumask: 2705 * about to flip a bit in the cpumask:
2160 */ 2706 */
2161 if (cpu_isset(cpu, tracing_cpumask) && 2707 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2162 !cpu_isset(cpu, tracing_cpumask_new)) { 2708 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2163 atomic_inc(&global_trace.data[cpu]->disabled); 2709 atomic_inc(&global_trace.data[cpu]->disabled);
2164 } 2710 }
2165 if (!cpu_isset(cpu, tracing_cpumask) && 2711 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2166 cpu_isset(cpu, tracing_cpumask_new)) { 2712 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2167 atomic_dec(&global_trace.data[cpu]->disabled); 2713 atomic_dec(&global_trace.data[cpu]->disabled);
2168 } 2714 }
2169 } 2715 }
2170 __raw_spin_unlock(&ftrace_max_lock); 2716 __raw_spin_unlock(&ftrace_max_lock);
2171 raw_local_irq_enable(); 2717 local_irq_enable();
2172 2718
2173 tracing_cpumask = tracing_cpumask_new; 2719 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
2174 2720
2175 mutex_unlock(&tracing_cpumask_update_lock); 2721 mutex_unlock(&tracing_cpumask_update_lock);
2722 free_cpumask_var(tracing_cpumask_new);
2176 2723
2177 return count; 2724 return count;
2178 2725
2179err_unlock: 2726err_unlock:
2180 mutex_unlock(&tracing_cpumask_update_lock); 2727 mutex_unlock(&tracing_cpumask_update_lock);
2728 free_cpumask_var(tracing_cpumask);
2181 2729
2182 return err; 2730 return err;
2183} 2731}
@@ -2189,13 +2737,16 @@ static struct file_operations tracing_cpumask_fops = {
2189}; 2737};
2190 2738
2191static ssize_t 2739static ssize_t
2192tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, 2740tracing_trace_options_read(struct file *filp, char __user *ubuf,
2193 size_t cnt, loff_t *ppos) 2741 size_t cnt, loff_t *ppos)
2194{ 2742{
2743 int i;
2195 char *buf; 2744 char *buf;
2196 int r = 0; 2745 int r = 0;
2197 int len = 0; 2746 int len = 0;
2198 int i; 2747 u32 tracer_flags = current_trace->flags->val;
2748 struct tracer_opt *trace_opts = current_trace->flags->opts;
2749
2199 2750
2200 /* calulate max size */ 2751 /* calulate max size */
2201 for (i = 0; trace_options[i]; i++) { 2752 for (i = 0; trace_options[i]; i++) {
@@ -2203,6 +2754,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2203 len += 3; /* "no" and space */ 2754 len += 3; /* "no" and space */
2204 } 2755 }
2205 2756
2757 /*
2758 * Increase the size with names of options specific
2759 * of the current tracer.
2760 */
2761 for (i = 0; trace_opts[i].name; i++) {
2762 len += strlen(trace_opts[i].name);
2763 len += 3; /* "no" and space */
2764 }
2765
2206 /* +2 for \n and \0 */ 2766 /* +2 for \n and \0 */
2207 buf = kmalloc(len + 2, GFP_KERNEL); 2767 buf = kmalloc(len + 2, GFP_KERNEL);
2208 if (!buf) 2768 if (!buf)
@@ -2215,6 +2775,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2215 r += sprintf(buf + r, "no%s ", trace_options[i]); 2775 r += sprintf(buf + r, "no%s ", trace_options[i]);
2216 } 2776 }
2217 2777
2778 for (i = 0; trace_opts[i].name; i++) {
2779 if (tracer_flags & trace_opts[i].bit)
2780 r += sprintf(buf + r, "%s ",
2781 trace_opts[i].name);
2782 else
2783 r += sprintf(buf + r, "no%s ",
2784 trace_opts[i].name);
2785 }
2786
2218 r += sprintf(buf + r, "\n"); 2787 r += sprintf(buf + r, "\n");
2219 WARN_ON(r >= len + 2); 2788 WARN_ON(r >= len + 2);
2220 2789
@@ -2225,13 +2794,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2225 return r; 2794 return r;
2226} 2795}
2227 2796
2797/* Try to assign a tracer specific option */
2798static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2799{
2800 struct tracer_flags *trace_flags = trace->flags;
2801 struct tracer_opt *opts = NULL;
2802 int ret = 0, i = 0;
2803 int len;
2804
2805 for (i = 0; trace_flags->opts[i].name; i++) {
2806 opts = &trace_flags->opts[i];
2807 len = strlen(opts->name);
2808
2809 if (strncmp(cmp, opts->name, len) == 0) {
2810 ret = trace->set_flag(trace_flags->val,
2811 opts->bit, !neg);
2812 break;
2813 }
2814 }
2815 /* Not found */
2816 if (!trace_flags->opts[i].name)
2817 return -EINVAL;
2818
2819 /* Refused to handle */
2820 if (ret)
2821 return ret;
2822
2823 if (neg)
2824 trace_flags->val &= ~opts->bit;
2825 else
2826 trace_flags->val |= opts->bit;
2827
2828 return 0;
2829}
2830
2228static ssize_t 2831static ssize_t
2229tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, 2832tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2230 size_t cnt, loff_t *ppos) 2833 size_t cnt, loff_t *ppos)
2231{ 2834{
2232 char buf[64]; 2835 char buf[64];
2233 char *cmp = buf; 2836 char *cmp = buf;
2234 int neg = 0; 2837 int neg = 0;
2838 int ret;
2235 int i; 2839 int i;
2236 2840
2237 if (cnt >= sizeof(buf)) 2841 if (cnt >= sizeof(buf))
@@ -2258,11 +2862,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2258 break; 2862 break;
2259 } 2863 }
2260 } 2864 }
2261 /* 2865
2262 * If no option could be set, return an error: 2866 /* If no option could be set, test the specific tracer options */
2263 */ 2867 if (!trace_options[i]) {
2264 if (!trace_options[i]) 2868 ret = set_tracer_option(current_trace, cmp, neg);
2265 return -EINVAL; 2869 if (ret)
2870 return ret;
2871 }
2266 2872
2267 filp->f_pos += cnt; 2873 filp->f_pos += cnt;
2268 2874
@@ -2271,8 +2877,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2271 2877
2272static struct file_operations tracing_iter_fops = { 2878static struct file_operations tracing_iter_fops = {
2273 .open = tracing_open_generic, 2879 .open = tracing_open_generic,
2274 .read = tracing_iter_ctrl_read, 2880 .read = tracing_trace_options_read,
2275 .write = tracing_iter_ctrl_write, 2881 .write = tracing_trace_options_write,
2276}; 2882};
2277 2883
2278static const char readme_msg[] = 2884static const char readme_msg[] =
@@ -2286,9 +2892,9 @@ static const char readme_msg[] =
2286 "# echo sched_switch > /debug/tracing/current_tracer\n" 2892 "# echo sched_switch > /debug/tracing/current_tracer\n"
2287 "# cat /debug/tracing/current_tracer\n" 2893 "# cat /debug/tracing/current_tracer\n"
2288 "sched_switch\n" 2894 "sched_switch\n"
2289 "# cat /debug/tracing/iter_ctrl\n" 2895 "# cat /debug/tracing/trace_options\n"
2290 "noprint-parent nosym-offset nosym-addr noverbose\n" 2896 "noprint-parent nosym-offset nosym-addr noverbose\n"
2291 "# echo print-parent > /debug/tracing/iter_ctrl\n" 2897 "# echo print-parent > /debug/tracing/trace_options\n"
2292 "# echo 1 > /debug/tracing/tracing_enabled\n" 2898 "# echo 1 > /debug/tracing/tracing_enabled\n"
2293 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2899 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2294 "echo 0 > /debug/tracing/tracing_enabled\n" 2900 "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2311,11 +2917,10 @@ static ssize_t
2311tracing_ctrl_read(struct file *filp, char __user *ubuf, 2917tracing_ctrl_read(struct file *filp, char __user *ubuf,
2312 size_t cnt, loff_t *ppos) 2918 size_t cnt, loff_t *ppos)
2313{ 2919{
2314 struct trace_array *tr = filp->private_data;
2315 char buf[64]; 2920 char buf[64];
2316 int r; 2921 int r;
2317 2922
2318 r = sprintf(buf, "%ld\n", tr->ctrl); 2923 r = sprintf(buf, "%u\n", tracer_enabled);
2319 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2924 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2320} 2925}
2321 2926
@@ -2343,16 +2948,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2343 val = !!val; 2948 val = !!val;
2344 2949
2345 mutex_lock(&trace_types_lock); 2950 mutex_lock(&trace_types_lock);
2346 if (tr->ctrl ^ val) { 2951 if (tracer_enabled ^ val) {
2347 if (val) 2952 if (val) {
2348 tracer_enabled = 1; 2953 tracer_enabled = 1;
2349 else 2954 if (current_trace->start)
2955 current_trace->start(tr);
2956 tracing_start();
2957 } else {
2350 tracer_enabled = 0; 2958 tracer_enabled = 0;
2351 2959 tracing_stop();
2352 tr->ctrl = val; 2960 if (current_trace->stop)
2353 2961 current_trace->stop(tr);
2354 if (current_trace && current_trace->ctrl_update) 2962 }
2355 current_trace->ctrl_update(tr);
2356 } 2963 }
2357 mutex_unlock(&trace_types_lock); 2964 mutex_unlock(&trace_types_lock);
2358 2965
@@ -2378,29 +2985,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2378 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2379} 2986}
2380 2987
2381static ssize_t 2988static int tracing_set_tracer(char *buf)
2382tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2383 size_t cnt, loff_t *ppos)
2384{ 2989{
2385 struct trace_array *tr = &global_trace; 2990 struct trace_array *tr = &global_trace;
2386 struct tracer *t; 2991 struct tracer *t;
2387 char buf[max_tracer_type_len+1]; 2992 int ret = 0;
2388 int i;
2389 size_t ret;
2390
2391 ret = cnt;
2392
2393 if (cnt > max_tracer_type_len)
2394 cnt = max_tracer_type_len;
2395
2396 if (copy_from_user(&buf, ubuf, cnt))
2397 return -EFAULT;
2398
2399 buf[cnt] = 0;
2400
2401 /* strip ending whitespace. */
2402 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2403 buf[i] = 0;
2404 2993
2405 mutex_lock(&trace_types_lock); 2994 mutex_lock(&trace_types_lock);
2406 for (t = trace_types; t; t = t->next) { 2995 for (t = trace_types; t; t = t->next) {
@@ -2414,18 +3003,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2414 if (t == current_trace) 3003 if (t == current_trace)
2415 goto out; 3004 goto out;
2416 3005
3006 trace_branch_disable();
2417 if (current_trace && current_trace->reset) 3007 if (current_trace && current_trace->reset)
2418 current_trace->reset(tr); 3008 current_trace->reset(tr);
2419 3009
2420 current_trace = t; 3010 current_trace = t;
2421 if (t->init) 3011 if (t->init) {
2422 t->init(tr); 3012 ret = t->init(tr);
3013 if (ret)
3014 goto out;
3015 }
2423 3016
3017 trace_branch_enable(tr);
2424 out: 3018 out:
2425 mutex_unlock(&trace_types_lock); 3019 mutex_unlock(&trace_types_lock);
2426 3020
2427 if (ret > 0) 3021 return ret;
2428 filp->f_pos += ret; 3022}
3023
3024static ssize_t
3025tracing_set_trace_write(struct file *filp, const char __user *ubuf,
3026 size_t cnt, loff_t *ppos)
3027{
3028 char buf[max_tracer_type_len+1];
3029 int i;
3030 size_t ret;
3031 int err;
3032
3033 ret = cnt;
3034
3035 if (cnt > max_tracer_type_len)
3036 cnt = max_tracer_type_len;
3037
3038 if (copy_from_user(&buf, ubuf, cnt))
3039 return -EFAULT;
3040
3041 buf[cnt] = 0;
3042
3043 /* strip ending whitespace. */
3044 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3045 buf[i] = 0;
3046
3047 err = tracing_set_tracer(buf);
3048 if (err)
3049 return err;
3050
3051 filp->f_pos += ret;
2429 3052
2430 return ret; 3053 return ret;
2431} 3054}
@@ -2491,7 +3114,16 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2491 if (!iter) 3114 if (!iter)
2492 return -ENOMEM; 3115 return -ENOMEM;
2493 3116
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3118 kfree(iter);
3119 return -ENOMEM;
3120 }
3121
2494 mutex_lock(&trace_types_lock); 3122 mutex_lock(&trace_types_lock);
3123
3124 /* trace pipe does not show start of buffer */
3125 cpumask_setall(iter->started);
3126
2495 iter->tr = &global_trace; 3127 iter->tr = &global_trace;
2496 iter->trace = current_trace; 3128 iter->trace = current_trace;
2497 filp->private_data = iter; 3129 filp->private_data = iter;
@@ -2507,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2507{ 3139{
2508 struct trace_iterator *iter = file->private_data; 3140 struct trace_iterator *iter = file->private_data;
2509 3141
3142 free_cpumask_var(iter->started);
2510 kfree(iter); 3143 kfree(iter);
2511 atomic_dec(&tracing_reader); 3144 atomic_dec(&tracing_reader);
2512 3145
@@ -2667,7 +3300,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
2667 char buf[64]; 3300 char buf[64];
2668 int r; 3301 int r;
2669 3302
2670 r = sprintf(buf, "%lu\n", tr->entries); 3303 r = sprintf(buf, "%lu\n", tr->entries >> 10);
2671 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3304 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2672} 3305}
2673 3306
@@ -2678,7 +3311,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2678 unsigned long val; 3311 unsigned long val;
2679 char buf[64]; 3312 char buf[64];
2680 int ret, cpu; 3313 int ret, cpu;
2681 struct trace_array *tr = filp->private_data;
2682 3314
2683 if (cnt >= sizeof(buf)) 3315 if (cnt >= sizeof(buf))
2684 return -EINVAL; 3316 return -EINVAL;
@@ -2698,12 +3330,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2698 3330
2699 mutex_lock(&trace_types_lock); 3331 mutex_lock(&trace_types_lock);
2700 3332
2701 if (tr->ctrl) { 3333 tracing_stop();
2702 cnt = -EBUSY;
2703 pr_info("ftrace: please disable tracing"
2704 " before modifying buffer size\n");
2705 goto out;
2706 }
2707 3334
2708 /* disable all cpu buffers */ 3335 /* disable all cpu buffers */
2709 for_each_tracing_cpu(cpu) { 3336 for_each_tracing_cpu(cpu) {
@@ -2713,6 +3340,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2713 atomic_inc(&max_tr.data[cpu]->disabled); 3340 atomic_inc(&max_tr.data[cpu]->disabled);
2714 } 3341 }
2715 3342
3343 /* value is in KB */
3344 val <<= 10;
3345
2716 if (val != global_trace.entries) { 3346 if (val != global_trace.entries) {
2717 ret = ring_buffer_resize(global_trace.buffer, val); 3347 ret = ring_buffer_resize(global_trace.buffer, val);
2718 if (ret < 0) { 3348 if (ret < 0) {
@@ -2751,6 +3381,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2751 atomic_dec(&max_tr.data[cpu]->disabled); 3381 atomic_dec(&max_tr.data[cpu]->disabled);
2752 } 3382 }
2753 3383
3384 tracing_start();
2754 max_tr.entries = global_trace.entries; 3385 max_tr.entries = global_trace.entries;
2755 mutex_unlock(&trace_types_lock); 3386 mutex_unlock(&trace_types_lock);
2756 3387
@@ -2762,7 +3393,7 @@ static int mark_printk(const char *fmt, ...)
2762 int ret; 3393 int ret;
2763 va_list args; 3394 va_list args;
2764 va_start(args, fmt); 3395 va_start(args, fmt);
2765 ret = trace_vprintk(0, fmt, args); 3396 ret = trace_vprintk(0, -1, fmt, args);
2766 va_end(args); 3397 va_end(args);
2767 return ret; 3398 return ret;
2768} 3399}
@@ -2773,9 +3404,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
2773{ 3404{
2774 char *buf; 3405 char *buf;
2775 char *end; 3406 char *end;
2776 struct trace_array *tr = &global_trace;
2777 3407
2778 if (!tr->ctrl || tracing_disabled) 3408 if (tracing_disabled)
2779 return -EINVAL; 3409 return -EINVAL;
2780 3410
2781 if (cnt > TRACE_BUF_SIZE) 3411 if (cnt > TRACE_BUF_SIZE)
@@ -2841,22 +3471,38 @@ static struct file_operations tracing_mark_fops = {
2841 3471
2842#ifdef CONFIG_DYNAMIC_FTRACE 3472#ifdef CONFIG_DYNAMIC_FTRACE
2843 3473
3474int __weak ftrace_arch_read_dyn_info(char *buf, int size)
3475{
3476 return 0;
3477}
3478
2844static ssize_t 3479static ssize_t
2845tracing_read_long(struct file *filp, char __user *ubuf, 3480tracing_read_dyn_info(struct file *filp, char __user *ubuf,
2846 size_t cnt, loff_t *ppos) 3481 size_t cnt, loff_t *ppos)
2847{ 3482{
3483 static char ftrace_dyn_info_buffer[1024];
3484 static DEFINE_MUTEX(dyn_info_mutex);
2848 unsigned long *p = filp->private_data; 3485 unsigned long *p = filp->private_data;
2849 char buf[64]; 3486 char *buf = ftrace_dyn_info_buffer;
3487 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
2850 int r; 3488 int r;
2851 3489
2852 r = sprintf(buf, "%ld\n", *p); 3490 mutex_lock(&dyn_info_mutex);
3491 r = sprintf(buf, "%ld ", *p);
2853 3492
2854 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3493 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
3494 buf[r++] = '\n';
3495
3496 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3497
3498 mutex_unlock(&dyn_info_mutex);
3499
3500 return r;
2855} 3501}
2856 3502
2857static struct file_operations tracing_read_long_fops = { 3503static struct file_operations tracing_dyn_info_fops = {
2858 .open = tracing_open_generic, 3504 .open = tracing_open_generic,
2859 .read = tracing_read_long, 3505 .read = tracing_read_dyn_info,
2860}; 3506};
2861#endif 3507#endif
2862 3508
@@ -2897,10 +3543,10 @@ static __init int tracer_init_debugfs(void)
2897 if (!entry) 3543 if (!entry)
2898 pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); 3544 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2899 3545
2900 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, 3546 entry = debugfs_create_file("trace_options", 0644, d_tracer,
2901 NULL, &tracing_iter_fops); 3547 NULL, &tracing_iter_fops);
2902 if (!entry) 3548 if (!entry)
2903 pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); 3549 pr_warning("Could not create debugfs 'trace_options' entry\n");
2904 3550
2905 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3551 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2906 NULL, &tracing_cpumask_fops); 3552 NULL, &tracing_cpumask_fops);
@@ -2950,11 +3596,11 @@ static __init int tracer_init_debugfs(void)
2950 pr_warning("Could not create debugfs " 3596 pr_warning("Could not create debugfs "
2951 "'trace_pipe' entry\n"); 3597 "'trace_pipe' entry\n");
2952 3598
2953 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 3599 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
2954 &global_trace, &tracing_entries_fops); 3600 &global_trace, &tracing_entries_fops);
2955 if (!entry) 3601 if (!entry)
2956 pr_warning("Could not create debugfs " 3602 pr_warning("Could not create debugfs "
2957 "'trace_entries' entry\n"); 3603 "'buffer_size_kb' entry\n");
2958 3604
2959 entry = debugfs_create_file("trace_marker", 0220, d_tracer, 3605 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2960 NULL, &tracing_mark_fops); 3606 NULL, &tracing_mark_fops);
@@ -2965,7 +3611,7 @@ static __init int tracer_init_debugfs(void)
2965#ifdef CONFIG_DYNAMIC_FTRACE 3611#ifdef CONFIG_DYNAMIC_FTRACE
2966 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 3612 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2967 &ftrace_update_tot_cnt, 3613 &ftrace_update_tot_cnt,
2968 &tracing_read_long_fops); 3614 &tracing_dyn_info_fops);
2969 if (!entry) 3615 if (!entry)
2970 pr_warning("Could not create debugfs " 3616 pr_warning("Could not create debugfs "
2971 "'dyn_ftrace_total_info' entry\n"); 3617 "'dyn_ftrace_total_info' entry\n");
@@ -2976,7 +3622,7 @@ static __init int tracer_init_debugfs(void)
2976 return 0; 3622 return 0;
2977} 3623}
2978 3624
2979int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 3625int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
2980{ 3626{
2981 static DEFINE_SPINLOCK(trace_buf_lock); 3627 static DEFINE_SPINLOCK(trace_buf_lock);
2982 static char trace_buf[TRACE_BUF_SIZE]; 3628 static char trace_buf[TRACE_BUF_SIZE];
@@ -2984,11 +3630,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2984 struct ring_buffer_event *event; 3630 struct ring_buffer_event *event;
2985 struct trace_array *tr = &global_trace; 3631 struct trace_array *tr = &global_trace;
2986 struct trace_array_cpu *data; 3632 struct trace_array_cpu *data;
2987 struct print_entry *entry;
2988 unsigned long flags, irq_flags;
2989 int cpu, len = 0, size, pc; 3633 int cpu, len = 0, size, pc;
3634 struct print_entry *entry;
3635 unsigned long irq_flags;
2990 3636
2991 if (!tr->ctrl || tracing_disabled) 3637 if (tracing_disabled || tracing_selftest_running)
2992 return 0; 3638 return 0;
2993 3639
2994 pc = preempt_count(); 3640 pc = preempt_count();
@@ -2999,7 +3645,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2999 if (unlikely(atomic_read(&data->disabled))) 3645 if (unlikely(atomic_read(&data->disabled)))
3000 goto out; 3646 goto out;
3001 3647
3002 spin_lock_irqsave(&trace_buf_lock, flags); 3648 pause_graph_tracing();
3649 spin_lock_irqsave(&trace_buf_lock, irq_flags);
3003 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 3650 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
3004 3651
3005 len = min(len, TRACE_BUF_SIZE-1); 3652 len = min(len, TRACE_BUF_SIZE-1);
@@ -3010,17 +3657,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
3010 if (!event) 3657 if (!event)
3011 goto out_unlock; 3658 goto out_unlock;
3012 entry = ring_buffer_event_data(event); 3659 entry = ring_buffer_event_data(event);
3013 tracing_generic_entry_update(&entry->ent, flags, pc); 3660 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3014 entry->ent.type = TRACE_PRINT; 3661 entry->ent.type = TRACE_PRINT;
3015 entry->ip = ip; 3662 entry->ip = ip;
3663 entry->depth = depth;
3016 3664
3017 memcpy(&entry->buf, trace_buf, len); 3665 memcpy(&entry->buf, trace_buf, len);
3018 entry->buf[len] = 0; 3666 entry->buf[len] = 0;
3019 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 3667 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3020 3668
3021 out_unlock: 3669 out_unlock:
3022 spin_unlock_irqrestore(&trace_buf_lock, flags); 3670 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3023 3671 unpause_graph_tracing();
3024 out: 3672 out:
3025 preempt_enable_notrace(); 3673 preempt_enable_notrace();
3026 3674
@@ -3037,7 +3685,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3037 return 0; 3685 return 0;
3038 3686
3039 va_start(ap, fmt); 3687 va_start(ap, fmt);
3040 ret = trace_vprintk(ip, fmt, ap); 3688 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3041 va_end(ap); 3689 va_end(ap);
3042 return ret; 3690 return ret;
3043} 3691}
@@ -3046,7 +3694,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
3046static int trace_panic_handler(struct notifier_block *this, 3694static int trace_panic_handler(struct notifier_block *this,
3047 unsigned long event, void *unused) 3695 unsigned long event, void *unused)
3048{ 3696{
3049 ftrace_dump(); 3697 if (ftrace_dump_on_oops)
3698 ftrace_dump();
3050 return NOTIFY_OK; 3699 return NOTIFY_OK;
3051} 3700}
3052 3701
@@ -3062,7 +3711,8 @@ static int trace_die_handler(struct notifier_block *self,
3062{ 3711{
3063 switch (val) { 3712 switch (val) {
3064 case DIE_OOPS: 3713 case DIE_OOPS:
3065 ftrace_dump(); 3714 if (ftrace_dump_on_oops)
3715 ftrace_dump();
3066 break; 3716 break;
3067 default: 3717 default:
3068 break; 3718 break;
@@ -3103,13 +3753,11 @@ trace_printk_seq(struct trace_seq *s)
3103 trace_seq_reset(s); 3753 trace_seq_reset(s);
3104} 3754}
3105 3755
3106
3107void ftrace_dump(void) 3756void ftrace_dump(void)
3108{ 3757{
3109 static DEFINE_SPINLOCK(ftrace_dump_lock); 3758 static DEFINE_SPINLOCK(ftrace_dump_lock);
3110 /* use static because iter can be a bit big for the stack */ 3759 /* use static because iter can be a bit big for the stack */
3111 static struct trace_iterator iter; 3760 static struct trace_iterator iter;
3112 static cpumask_t mask;
3113 static int dump_ran; 3761 static int dump_ran;
3114 unsigned long flags; 3762 unsigned long flags;
3115 int cnt = 0, cpu; 3763 int cnt = 0, cpu;
@@ -3128,6 +3776,9 @@ void ftrace_dump(void)
3128 atomic_inc(&global_trace.data[cpu]->disabled); 3776 atomic_inc(&global_trace.data[cpu]->disabled);
3129 } 3777 }
3130 3778
3779 /* don't look at user memory in panic mode */
3780 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3781
3131 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 3782 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3132 3783
3133 iter.tr = &global_trace; 3784 iter.tr = &global_trace;
@@ -3140,8 +3791,6 @@ void ftrace_dump(void)
3140 * and then release the locks again. 3791 * and then release the locks again.
3141 */ 3792 */
3142 3793
3143 cpus_clear(mask);
3144
3145 while (!trace_empty(&iter)) { 3794 while (!trace_empty(&iter)) {
3146 3795
3147 if (!cnt) 3796 if (!cnt)
@@ -3177,19 +3826,28 @@ __init static int tracer_alloc_buffers(void)
3177{ 3826{
3178 struct trace_array_cpu *data; 3827 struct trace_array_cpu *data;
3179 int i; 3828 int i;
3829 int ret = -ENOMEM;
3180 3830
3181 /* TODO: make the number of buffers hot pluggable with CPUS */ 3831 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
3182 tracing_buffer_mask = cpu_possible_map; 3832 goto out;
3833
3834 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3835 goto out_free_buffer_mask;
3836
3837 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3838 cpumask_copy(tracing_cpumask, cpu_all_mask);
3183 3839
3840 /* TODO: make the number of buffers hot pluggable with CPUS */
3184 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 3841 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3185 TRACE_BUFFER_FLAGS); 3842 TRACE_BUFFER_FLAGS);
3186 if (!global_trace.buffer) { 3843 if (!global_trace.buffer) {
3187 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 3844 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3188 WARN_ON(1); 3845 WARN_ON(1);
3189 return 0; 3846 goto out_free_cpumask;
3190 } 3847 }
3191 global_trace.entries = ring_buffer_size(global_trace.buffer); 3848 global_trace.entries = ring_buffer_size(global_trace.buffer);
3192 3849
3850
3193#ifdef CONFIG_TRACER_MAX_TRACE 3851#ifdef CONFIG_TRACER_MAX_TRACE
3194 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 3852 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3195 TRACE_BUFFER_FLAGS); 3853 TRACE_BUFFER_FLAGS);
@@ -3197,7 +3855,7 @@ __init static int tracer_alloc_buffers(void)
3197 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 3855 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3198 WARN_ON(1); 3856 WARN_ON(1);
3199 ring_buffer_free(global_trace.buffer); 3857 ring_buffer_free(global_trace.buffer);
3200 return 0; 3858 goto out_free_cpumask;
3201 } 3859 }
3202 max_tr.entries = ring_buffer_size(max_tr.buffer); 3860 max_tr.entries = ring_buffer_size(max_tr.buffer);
3203 WARN_ON(max_tr.entries != global_trace.entries); 3861 WARN_ON(max_tr.entries != global_trace.entries);
@@ -3221,15 +3879,20 @@ __init static int tracer_alloc_buffers(void)
3221#endif 3879#endif
3222 3880
3223 /* All seems OK, enable tracing */ 3881 /* All seems OK, enable tracing */
3224 global_trace.ctrl = tracer_enabled;
3225 tracing_disabled = 0; 3882 tracing_disabled = 0;
3226 3883
3227 atomic_notifier_chain_register(&panic_notifier_list, 3884 atomic_notifier_chain_register(&panic_notifier_list,
3228 &trace_panic_notifier); 3885 &trace_panic_notifier);
3229 3886
3230 register_die_notifier(&trace_die_notifier); 3887 register_die_notifier(&trace_die_notifier);
3888 ret = 0;
3231 3889
3232 return 0; 3890out_free_cpumask:
3891 free_cpumask_var(tracing_cpumask);
3892out_free_buffer_mask:
3893 free_cpumask_var(tracing_buffer_mask);
3894out:
3895 return ret;
3233} 3896}
3234early_initcall(tracer_alloc_buffers); 3897early_initcall(tracer_alloc_buffers);
3235fs_initcall(tracer_init_debugfs); 3898fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad052707..4d3d381bfd95 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h>
11 12
12enum trace_type { 13enum trace_type {
13 __TRACE_FIRST_TYPE = 0, 14 __TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
21 TRACE_SPECIAL, 22 TRACE_SPECIAL,
22 TRACE_MMIO_RW, 23 TRACE_MMIO_RW,
23 TRACE_MMIO_MAP, 24 TRACE_MMIO_MAP,
24 TRACE_BOOT, 25 TRACE_BRANCH,
26 TRACE_BOOT_CALL,
27 TRACE_BOOT_RET,
28 TRACE_GRAPH_RET,
29 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK,
31 TRACE_HW_BRANCHES,
32 TRACE_POWER,
25 33
26 __TRACE_LAST_TYPE 34 __TRACE_LAST_TYPE
27}; 35};
@@ -38,6 +46,7 @@ struct trace_entry {
38 unsigned char flags; 46 unsigned char flags;
39 unsigned char preempt_count; 47 unsigned char preempt_count;
40 int pid; 48 int pid;
49 int tgid;
41}; 50};
42 51
43/* 52/*
@@ -48,6 +57,18 @@ struct ftrace_entry {
48 unsigned long ip; 57 unsigned long ip;
49 unsigned long parent_ip; 58 unsigned long parent_ip;
50}; 59};
60
61/* Function call entry */
62struct ftrace_graph_ent_entry {
63 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent;
65};
66
67/* Function return entry */
68struct ftrace_graph_ret_entry {
69 struct trace_entry ent;
70 struct ftrace_graph_ret ret;
71};
51extern struct tracer boot_tracer; 72extern struct tracer boot_tracer;
52 73
53/* 74/*
@@ -85,12 +106,18 @@ struct stack_entry {
85 unsigned long caller[FTRACE_STACK_ENTRIES]; 106 unsigned long caller[FTRACE_STACK_ENTRIES];
86}; 107};
87 108
109struct userstack_entry {
110 struct trace_entry ent;
111 unsigned long caller[FTRACE_STACK_ENTRIES];
112};
113
88/* 114/*
89 * ftrace_printk entry: 115 * ftrace_printk entry:
90 */ 116 */
91struct print_entry { 117struct print_entry {
92 struct trace_entry ent; 118 struct trace_entry ent;
93 unsigned long ip; 119 unsigned long ip;
120 int depth;
94 char buf[]; 121 char buf[];
95}; 122};
96 123
@@ -112,9 +139,35 @@ struct trace_mmiotrace_map {
112 struct mmiotrace_map map; 139 struct mmiotrace_map map;
113}; 140};
114 141
115struct trace_boot { 142struct trace_boot_call {
116 struct trace_entry ent; 143 struct trace_entry ent;
117 struct boot_trace initcall; 144 struct boot_trace_call boot_call;
145};
146
147struct trace_boot_ret {
148 struct trace_entry ent;
149 struct boot_trace_ret boot_ret;
150};
151
152#define TRACE_FUNC_SIZE 30
153#define TRACE_FILE_SIZE 20
154struct trace_branch {
155 struct trace_entry ent;
156 unsigned line;
157 char func[TRACE_FUNC_SIZE+1];
158 char file[TRACE_FILE_SIZE+1];
159 char correct;
160};
161
162struct hw_branch_entry {
163 struct trace_entry ent;
164 u64 from;
165 u64 to;
166};
167
168struct trace_power {
169 struct trace_entry ent;
170 struct power_trace state_data;
118}; 171};
119 172
120/* 173/*
@@ -172,7 +225,6 @@ struct trace_iterator;
172struct trace_array { 225struct trace_array {
173 struct ring_buffer *buffer; 226 struct ring_buffer *buffer;
174 unsigned long entries; 227 unsigned long entries;
175 long ctrl;
176 int cpu; 228 int cpu;
177 cycle_t time_start; 229 cycle_t time_start;
178 struct task_struct *waiter; 230 struct task_struct *waiter;
@@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void);
212 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
213 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ 265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
214 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
215 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
216 IF_ASSIGN(var, ent, struct special_entry, 0); \ 269 IF_ASSIGN(var, ent, struct special_entry, 0); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
218 TRACE_MMIO_RW); \ 271 TRACE_MMIO_RW); \
219 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 272 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
220 TRACE_MMIO_MAP); \ 273 TRACE_MMIO_MAP); \
221 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ 274 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
275 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
276 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
277 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
278 TRACE_GRAPH_ENT); \
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
222 __ftrace_bad_type(); \ 283 __ftrace_bad_type(); \
223 } while (0) 284 } while (0)
224 285
@@ -229,29 +290,56 @@ enum print_line_t {
229 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
230}; 291};
231 292
293
294/*
295 * An option specific to a tracer. This is a boolean value.
296 * The bit is the bit index that sets its value on the
297 * flags value in struct tracer_flags.
298 */
299struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */
302};
303
304/*
305 * The set of specific options for a tracer. Your tracer
306 * have to set the initial value of the flags val.
307 */
308struct tracer_flags {
309 u32 val;
310 struct tracer_opt *opts;
311};
312
313/* Makes more easy to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b
315
232/* 316/*
233 * A specific tracer, represented by methods that operate on a trace array: 317 * A specific tracer, represented by methods that operate on a trace array:
234 */ 318 */
235struct tracer { 319struct tracer {
236 const char *name; 320 const char *name;
237 void (*init)(struct trace_array *tr); 321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr);
238 void (*reset)(struct trace_array *tr); 323 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr);
239 void (*open)(struct trace_iterator *iter); 326 void (*open)(struct trace_iterator *iter);
240 void (*pipe_open)(struct trace_iterator *iter); 327 void (*pipe_open)(struct trace_iterator *iter);
241 void (*close)(struct trace_iterator *iter); 328 void (*close)(struct trace_iterator *iter);
242 void (*start)(struct trace_iterator *iter);
243 void (*stop)(struct trace_iterator *iter);
244 ssize_t (*read)(struct trace_iterator *iter, 329 ssize_t (*read)(struct trace_iterator *iter,
245 struct file *filp, char __user *ubuf, 330 struct file *filp, char __user *ubuf,
246 size_t cnt, loff_t *ppos); 331 size_t cnt, loff_t *ppos);
247 void (*ctrl_update)(struct trace_array *tr);
248#ifdef CONFIG_FTRACE_STARTUP_TEST 332#ifdef CONFIG_FTRACE_STARTUP_TEST
249 int (*selftest)(struct tracer *trace, 333 int (*selftest)(struct tracer *trace,
250 struct trace_array *tr); 334 struct trace_array *tr);
251#endif 335#endif
336 void (*print_header)(struct seq_file *m);
252 enum print_line_t (*print_line)(struct trace_iterator *iter); 337 enum print_line_t (*print_line)(struct trace_iterator *iter);
338 /* If you handled the flag setting, return 0 */
339 int (*set_flag)(u32 old_flags, u32 bit, int set);
253 struct tracer *next; 340 struct tracer *next;
254 int print_max; 341 int print_max;
342 struct tracer_flags *flags;
255}; 343};
256 344
257struct trace_seq { 345struct trace_seq {
@@ -279,10 +367,14 @@ struct trace_iterator {
279 unsigned long iter_flags; 367 unsigned long iter_flags;
280 loff_t pos; 368 loff_t pos;
281 long idx; 369 long idx;
370
371 cpumask_var_t started;
282}; 372};
283 373
374int tracing_is_enabled(void);
284void trace_wake_up(void); 375void trace_wake_up(void);
285void tracing_reset(struct trace_array *tr, int cpu); 376void tracing_reset(struct trace_array *tr, int cpu);
377void tracing_reset_online_cpus(struct trace_array *tr);
286int tracing_open_generic(struct inode *inode, struct file *filp); 378int tracing_open_generic(struct inode *inode, struct file *filp);
287struct dentry *tracing_init_dentry(void); 379struct dentry *tracing_init_dentry(void);
288void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 380void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
@@ -321,8 +413,15 @@ void trace_function(struct trace_array *tr,
321 unsigned long parent_ip, 413 unsigned long parent_ip,
322 unsigned long flags, int pc); 414 unsigned long flags, int pc);
323 415
416void trace_graph_return(struct ftrace_graph_ret *trace);
417int trace_graph_entry(struct ftrace_graph_ent *trace);
418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
419
324void tracing_start_cmdline_record(void); 420void tracing_start_cmdline_record(void);
325void tracing_stop_cmdline_record(void); 421void tracing_stop_cmdline_record(void);
422void tracing_sched_switch_assign_trace(struct trace_array *tr);
423void tracing_stop_sched_switch_record(void);
424void tracing_start_sched_switch_record(void);
326int register_tracer(struct tracer *type); 425int register_tracer(struct tracer *type);
327void unregister_tracer(struct tracer *type); 426void unregister_tracer(struct tracer *type);
328 427
@@ -358,6 +457,7 @@ struct tracer_switch_ops {
358 struct tracer_switch_ops *next; 457 struct tracer_switch_ops *next;
359}; 458};
360 459
460char *trace_find_cmdline(int pid);
361#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 461#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
362 462
363#ifdef CONFIG_DYNAMIC_FTRACE 463#ifdef CONFIG_DYNAMIC_FTRACE
@@ -383,19 +483,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
383 struct trace_array *tr); 483 struct trace_array *tr);
384extern int trace_selftest_startup_sysprof(struct tracer *trace, 484extern int trace_selftest_startup_sysprof(struct tracer *trace,
385 struct trace_array *tr); 485 struct trace_array *tr);
486extern int trace_selftest_startup_branch(struct tracer *trace,
487 struct trace_array *tr);
386#endif /* CONFIG_FTRACE_STARTUP_TEST */ 488#endif /* CONFIG_FTRACE_STARTUP_TEST */
387 489
388extern void *head_page(struct trace_array_cpu *data); 490extern void *head_page(struct trace_array_cpu *data);
389extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 491extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
390extern void trace_seq_print_cont(struct trace_seq *s, 492extern void trace_seq_print_cont(struct trace_seq *s,
391 struct trace_iterator *iter); 493 struct trace_iterator *iter);
494
495extern int
496seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
497 unsigned long sym_flags);
392extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 498extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
393 size_t cnt); 499 size_t cnt);
394extern long ns2usecs(cycle_t nsec); 500extern long ns2usecs(cycle_t nsec);
395extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); 501extern int
502trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
396 503
397extern unsigned long trace_flags; 504extern unsigned long trace_flags;
398 505
506/* Standard output formatting function used for function return traces */
507#ifdef CONFIG_FUNCTION_GRAPH_TRACER
508extern enum print_line_t print_graph_function(struct trace_iterator *iter);
509
510#ifdef CONFIG_DYNAMIC_FTRACE
511/* TODO: make this variable */
512#define FTRACE_GRAPH_MAX_FUNCS 32
513extern int ftrace_graph_count;
514extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
515
516static inline int ftrace_graph_addr(unsigned long addr)
517{
518 int i;
519
520 if (!ftrace_graph_count || test_tsk_trace_graph(current))
521 return 1;
522
523 for (i = 0; i < ftrace_graph_count; i++) {
524 if (addr == ftrace_graph_funcs[i])
525 return 1;
526 }
527
528 return 0;
529}
530#else
531static inline int ftrace_trace_addr(unsigned long addr)
532{
533 return 1;
534}
535static inline int ftrace_graph_addr(unsigned long addr)
536{
537 return 1;
538}
539#endif /* CONFIG_DYNAMIC_FTRACE */
540
541#else /* CONFIG_FUNCTION_GRAPH_TRACER */
542static inline enum print_line_t
543print_graph_function(struct trace_iterator *iter)
544{
545 return TRACE_TYPE_UNHANDLED;
546}
547#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
548
549extern struct pid *ftrace_pid_trace;
550
551static inline int ftrace_trace_task(struct task_struct *task)
552{
553 if (!ftrace_pid_trace)
554 return 1;
555
556 return test_tsk_trace_trace(task);
557}
558
399/* 559/*
400 * trace_iterator_flags is an enumeration that defines bit 560 * trace_iterator_flags is an enumeration that defines bit
401 * positions into trace_flags that controls the output. 561 * positions into trace_flags that controls the output.
@@ -415,8 +575,93 @@ enum trace_iterator_flags {
415 TRACE_ITER_STACKTRACE = 0x100, 575 TRACE_ITER_STACKTRACE = 0x100,
416 TRACE_ITER_SCHED_TREE = 0x200, 576 TRACE_ITER_SCHED_TREE = 0x200,
417 TRACE_ITER_PRINTK = 0x400, 577 TRACE_ITER_PRINTK = 0x400,
578 TRACE_ITER_PREEMPTONLY = 0x800,
579 TRACE_ITER_BRANCH = 0x1000,
580 TRACE_ITER_ANNOTATE = 0x2000,
581 TRACE_ITER_USERSTACKTRACE = 0x4000,
582 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000
418}; 584};
419 585
586/*
587 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
588 * control the output of kernel symbols.
589 */
590#define TRACE_ITER_SYM_MASK \
591 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
592
420extern struct tracer nop_trace; 593extern struct tracer nop_trace;
421 594
595/**
596 * ftrace_preempt_disable - disable preemption scheduler safe
597 *
598 * When tracing can happen inside the scheduler, there exists
599 * cases that the tracing might happen before the need_resched
600 * flag is checked. If this happens and the tracer calls
601 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion.
603 *
604 * To prevent this, we read the need_recshed flag before
605 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable.
608 *
609 * The rational for doing the above is that if need resched is set
610 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched.
613 */
614static inline int ftrace_preempt_disable(void)
615{
616 int resched;
617
618 resched = need_resched();
619 preempt_disable_notrace();
620
621 return resched;
622}
623
624/**
625 * ftrace_preempt_enable - enable preemption scheduler safe
626 * @resched: the return value from ftrace_preempt_disable
627 *
628 * This is a scheduler safe way to enable preemption and not miss
629 * any preemption checks. The disabled saved the state of preemption.
630 * If resched is set, then we were either inside an atomic or
631 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead.
634 */
635static inline void ftrace_preempt_enable(int resched)
636{
637 if (resched)
638 preempt_enable_no_resched_notrace();
639 else
640 preempt_enable_notrace();
641}
642
643#ifdef CONFIG_BRANCH_TRACER
644extern int enable_branch_tracing(struct trace_array *tr);
645extern void disable_branch_tracing(void);
646static inline int trace_branch_enable(struct trace_array *tr)
647{
648 if (trace_flags & TRACE_ITER_BRANCH)
649 return enable_branch_tracing(tr);
650 return 0;
651}
652static inline void trace_branch_disable(void)
653{
654 /* due to races, always disable */
655 disable_branch_tracing();
656}
657#else
658static inline int trace_branch_enable(struct trace_array *tr)
659{
660 return 0;
661}
662static inline void trace_branch_disable(void)
663{
664}
665#endif /* CONFIG_BRANCH_TRACER */
666
422#endif /* _LINUX_KERNEL_TRACE_H */ 667#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff2..366c8c333e13 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,101 +13,161 @@
13#include "trace.h" 13#include "trace.h"
14 14
15static struct trace_array *boot_trace; 15static struct trace_array *boot_trace;
16static int trace_boot_enabled; 16static bool pre_initcalls_finished;
17 17
18 18/* Tells the boot tracer that the pre_smp_initcalls are finished.
19/* Should be started after do_pre_smp_initcalls() in init/main.c */ 19 * So we are ready .
20 * It doesn't enable sched events tracing however.
21 * You have to call enable_boot_trace to do so.
22 */
20void start_boot_trace(void) 23void start_boot_trace(void)
21{ 24{
22 trace_boot_enabled = 1; 25 pre_initcalls_finished = true;
23} 26}
24 27
25void stop_boot_trace(void) 28void enable_boot_trace(void)
26{ 29{
27 trace_boot_enabled = 0; 30 if (pre_initcalls_finished)
31 tracing_start_sched_switch_record();
28} 32}
29 33
30void reset_boot_trace(struct trace_array *tr) 34void disable_boot_trace(void)
31{ 35{
32 stop_boot_trace(); 36 if (pre_initcalls_finished)
37 tracing_stop_sched_switch_record();
33} 38}
34 39
35static void boot_trace_init(struct trace_array *tr) 40static int boot_trace_init(struct trace_array *tr)
36{ 41{
37 int cpu; 42 int cpu;
38 boot_trace = tr; 43 boot_trace = tr;
39 44
40 trace_boot_enabled = 0; 45 for_each_cpu(cpu, cpu_possible_mask)
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu); 46 tracing_reset(tr, cpu);
47
48 tracing_sched_switch_assign_trace(tr);
49 return 0;
44} 50}
45 51
46static void boot_trace_ctrl_update(struct trace_array *tr) 52static enum print_line_t
53initcall_call_print_line(struct trace_iterator *iter)
47{ 54{
48 if (tr->ctrl) 55 struct trace_entry *entry = iter->ent;
49 start_boot_trace(); 56 struct trace_seq *s = &iter->seq;
57 struct trace_boot_call *field;
58 struct boot_trace_call *call;
59 u64 ts;
60 unsigned long nsec_rem;
61 int ret;
62
63 trace_assign_type(field, entry);
64 call = &field->boot_call;
65 ts = iter->ts;
66 nsec_rem = do_div(ts, 1000000000);
67
68 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
69 (unsigned long)ts, nsec_rem, call->func, call->caller);
70
71 if (!ret)
72 return TRACE_TYPE_PARTIAL_LINE;
50 else 73 else
51 stop_boot_trace(); 74 return TRACE_TYPE_HANDLED;
52} 75}
53 76
54static enum print_line_t initcall_print_line(struct trace_iterator *iter) 77static enum print_line_t
78initcall_ret_print_line(struct trace_iterator *iter)
55{ 79{
56 int ret;
57 struct trace_entry *entry = iter->ent; 80 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq; 81 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime); 82 struct trace_boot_ret *field;
62 struct timespec rettime = ktime_to_timespec(it->rettime); 83 struct boot_trace_ret *init_ret;
63 84 u64 ts;
64 if (entry->type == TRACE_BOOT) { 85 unsigned long nsec_rem;
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 86 int ret;
66 calltime.tv_sec, 87
67 calltime.tv_nsec, 88 trace_assign_type(field, entry);
68 it->func, it->caller); 89 init_ret = &field->boot_ret;
69 if (!ret) 90 ts = iter->ts;
70 return TRACE_TYPE_PARTIAL_LINE; 91 nsec_rem = do_div(ts, 1000000000);
71 92
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 93 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n", 94 "returned %d after %llu msecs\n",
74 rettime.tv_sec, 95 (unsigned long) ts,
75 rettime.tv_nsec, 96 nsec_rem,
76 it->func, it->result, it->duration); 97 init_ret->func, init_ret->result, init_ret->duration);
77 98
78 if (!ret) 99 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE; 100 return TRACE_TYPE_PARTIAL_LINE;
101 else
80 return TRACE_TYPE_HANDLED; 102 return TRACE_TYPE_HANDLED;
103}
104
105static enum print_line_t initcall_print_line(struct trace_iterator *iter)
106{
107 struct trace_entry *entry = iter->ent;
108
109 switch (entry->type) {
110 case TRACE_BOOT_CALL:
111 return initcall_call_print_line(iter);
112 case TRACE_BOOT_RET:
113 return initcall_ret_print_line(iter);
114 default:
115 return TRACE_TYPE_UNHANDLED;
81 } 116 }
82 return TRACE_TYPE_UNHANDLED;
83} 117}
84 118
85struct tracer boot_tracer __read_mostly = 119struct tracer boot_tracer __read_mostly =
86{ 120{
87 .name = "initcall", 121 .name = "initcall",
88 .init = boot_trace_init, 122 .init = boot_trace_init,
89 .reset = reset_boot_trace, 123 .reset = tracing_reset_online_cpus,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line, 124 .print_line = initcall_print_line,
92}; 125};
93 126
94void trace_boot(struct boot_trace *it, initcall_t fn) 127void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
95{ 128{
96 struct ring_buffer_event *event; 129 struct ring_buffer_event *event;
97 struct trace_boot *entry; 130 struct trace_boot_call *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags; 131 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace; 132 struct trace_array *tr = boot_trace;
101 133
102 if (!trace_boot_enabled) 134 if (!pre_initcalls_finished)
103 return; 135 return;
104 136
105 /* Get its name now since this function could 137 /* Get its name now since this function could
106 * disappear because it is in the .init section. 138 * disappear because it is in the .init section.
107 */ 139 */
108 sprint_symbol(it->func, (unsigned long)fn); 140 sprint_symbol(bt->func, (unsigned long)fn);
141 preempt_disable();
142
143 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
144 &irq_flags);
145 if (!event)
146 goto out;
147 entry = ring_buffer_event_data(event);
148 tracing_generic_entry_update(&entry->ent, 0, 0);
149 entry->ent.type = TRACE_BOOT_CALL;
150 entry->boot_call = *bt;
151 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
152
153 trace_wake_up();
154
155 out:
156 preempt_enable();
157}
158
159void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
160{
161 struct ring_buffer_event *event;
162 struct trace_boot_ret *entry;
163 unsigned long irq_flags;
164 struct trace_array *tr = boot_trace;
165
166 if (!pre_initcalls_finished)
167 return;
168
169 sprint_symbol(bt->func, (unsigned long)fn);
109 preempt_disable(); 170 preempt_disable();
110 data = tr->data[smp_processor_id()];
111 171
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 172 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags); 173 &irq_flags);
@@ -115,8 +175,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
115 goto out; 175 goto out;
116 entry = ring_buffer_event_data(event); 176 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0); 177 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT; 178 entry->ent.type = TRACE_BOOT_RET;
119 entry->initcall = *it; 179 entry->boot_ret = *bt;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 180 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121 181
122 trace_wake_up(); 182 trace_wake_up();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000000..6c00feb3bac7
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,342 @@
1/*
2 * unlikely profiler
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h>
12#include <linux/module.h>
13#include <linux/ftrace.h>
14#include <linux/hash.h>
15#include <linux/fs.h>
16#include <asm/local.h>
17#include "trace.h"
18
19#ifdef CONFIG_BRANCH_TRACER
20
21static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex);
23static struct trace_array *branch_tracer;
24
25static void
26probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
27{
28 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event;
30 struct trace_branch *entry;
31 unsigned long flags, irq_flags;
32 int cpu, pc;
33 const char *p;
34
35 /*
36 * I would love to save just the ftrace_likely_data pointer, but
37 * this code can also be used by modules. Ugly things can happen
38 * if the module is unloaded, and then we go and read the
39 * pointer. This is slower, but much safer.
40 */
41
42 if (unlikely(!tr))
43 return;
44
45 local_irq_save(flags);
46 cpu = raw_smp_processor_id();
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
51 &irq_flags);
52 if (!event)
53 goto out;
54
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59
60 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file);
62 while (p >= f->file && *p != '/')
63 p--;
64 p++;
65
66 strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
67 strncpy(entry->file, p, TRACE_FILE_SIZE);
68 entry->func[TRACE_FUNC_SIZE] = 0;
69 entry->file[TRACE_FILE_SIZE] = 0;
70 entry->line = f->line;
71 entry->correct = val == expect;
72
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
74
75 out:
76 atomic_dec(&tr->data[cpu]->disabled);
77 local_irq_restore(flags);
78}
79
80static inline
81void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
82{
83 if (!branch_tracing_enabled)
84 return;
85
86 probe_likely_condition(f, val, expect);
87}
88
89int enable_branch_tracing(struct trace_array *tr)
90{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr;
95 /*
96 * Must be seen before enabling. The reader is a condition
97 * where we do not need a matching rmb()
98 */
99 smp_wmb();
100 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex);
102
103 return ret;
104}
105
106void disable_branch_tracing(void)
107{
108 mutex_lock(&branch_tracing_mutex);
109
110 if (!branch_tracing_enabled)
111 goto out_unlock;
112
113 branch_tracing_enabled--;
114
115 out_unlock:
116 mutex_unlock(&branch_tracing_mutex);
117}
118
119static void start_branch_trace(struct trace_array *tr)
120{
121 enable_branch_tracing(tr);
122}
123
124static void stop_branch_trace(struct trace_array *tr)
125{
126 disable_branch_tracing();
127}
128
129static int branch_trace_init(struct trace_array *tr)
130{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr);
137 return 0;
138}
139
140static void branch_trace_reset(struct trace_array *tr)
141{
142 stop_branch_trace(tr);
143}
144
145struct tracer branch_trace __read_mostly =
146{
147 .name = "branch",
148 .init = branch_trace_init,
149 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch,
152#endif
153};
154
155__init static int init_branch_trace(void)
156{
157 return register_tracer(&branch_trace);
158}
159
160device_initcall(init_branch_trace);
161#else
162static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
164{
165}
166#endif /* CONFIG_BRANCH_TRACER */
167
168void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
169{
170 /*
171 * I would love to have a trace point here instead, but the
172 * trace point code is so inundated with unlikely and likely
173 * conditions that the recursive nightmare that exists is too
174 * much to try to get working. At least for now.
175 */
176 trace_likely_condition(f, val, expect);
177
178 /* FIXME: Make this atomic! */
179 if (val == expect)
180 f->correct++;
181 else
182 f->incorrect++;
183}
184EXPORT_SYMBOL(ftrace_likely_update);
185
186struct ftrace_pointer {
187 void *start;
188 void *stop;
189 int hit;
190};
191
192static void *
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{
195 const struct ftrace_pointer *f = m->private;
196 struct ftrace_branch_data *p = v;
197
198 (*pos)++;
199
200 if (v == (void *)1)
201 return f->start;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209}
210
211static void *t_start(struct seq_file *m, loff_t *pos)
212{
213 void *t = (void *)1;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218
219 return t;
220}
221
222static void t_stop(struct seq_file *m, void *p)
223{
224}
225
226static int t_show(struct seq_file *m, void *v)
227{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v;
230 const char *f;
231 long percent;
232
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/')
249 f--;
250 f++;
251
252 /*
253 * The miss is overlayed on correct, and hit on incorrect.
254 */
255 if (p->correct) {
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0)
263 seq_printf(m, " X ");
264 else
265 seq_printf(m, "%3ld ", percent);
266 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
267 return 0;
268}
269
270static struct seq_operations tracing_likely_seq_ops = {
271 .start = t_start,
272 .next = t_next,
273 .stop = t_stop,
274 .show = t_show,
275};
276
277static int tracing_branch_open(struct inode *inode, struct file *file)
278{
279 int ret;
280
281 ret = seq_open(file, &tracing_likely_seq_ops);
282 if (!ret) {
283 struct seq_file *m = file->private_data;
284 m->private = (void *)inode->i_private;
285 }
286
287 return ret;
288}
289
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295
296#ifdef CONFIG_PROFILE_ALL_BRANCHES
297extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[];
299
300static const struct ftrace_pointer ftrace_branch_pos = {
301 .start = __start_branch_profile,
302 .stop = __stop_branch_profile,
303 .hit = 1,
304};
305
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */
307
308extern unsigned long __start_annotated_branch_profile[];
309extern unsigned long __stop_annotated_branch_profile[];
310
311static const struct ftrace_pointer ftrace_annotated_branch_pos = {
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315
316static __init int ftrace_branch_init(void)
317{
318 struct dentry *d_tracer;
319 struct dentry *entry;
320
321 d_tracer = tracing_init_dentry();
322
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
324 (void *)&ftrace_annotated_branch_pos,
325 &tracing_branch_fops);
326 if (!entry)
327 pr_warning("Could not create debugfs "
328 "'profile_annotatet_branch' entry\n");
329
330#ifdef CONFIG_PROFILE_ALL_BRANCHES
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer,
332 (void *)&ftrace_branch_pos,
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338
339 return 0;
340}
341
342device_initcall(ftrace_branch_init);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0f85a64003d3..9236d7e25a16 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,20 +16,10 @@
16 16
17#include "trace.h" 17#include "trace.h"
18 18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr, cpu);
27}
28
29static void start_function_trace(struct trace_array *tr) 19static void start_function_trace(struct trace_array *tr)
30{ 20{
31 tr->cpu = get_cpu(); 21 tr->cpu = get_cpu();
32 function_reset(tr); 22 tracing_reset_online_cpus(tr);
33 put_cpu(); 23 put_cpu();
34 24
35 tracing_start_cmdline_record(); 25 tracing_start_cmdline_record();
@@ -42,24 +32,20 @@ static void stop_function_trace(struct trace_array *tr)
42 tracing_stop_cmdline_record(); 32 tracing_stop_cmdline_record();
43} 33}
44 34
45static void function_trace_init(struct trace_array *tr) 35static int function_trace_init(struct trace_array *tr)
46{ 36{
47 if (tr->ctrl) 37 start_function_trace(tr);
48 start_function_trace(tr); 38 return 0;
49} 39}
50 40
51static void function_trace_reset(struct trace_array *tr) 41static void function_trace_reset(struct trace_array *tr)
52{ 42{
53 if (tr->ctrl) 43 stop_function_trace(tr);
54 stop_function_trace(tr);
55} 44}
56 45
57static void function_trace_ctrl_update(struct trace_array *tr) 46static void function_trace_start(struct trace_array *tr)
58{ 47{
59 if (tr->ctrl) 48 tracing_reset_online_cpus(tr);
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63} 49}
64 50
65static struct tracer function_trace __read_mostly = 51static struct tracer function_trace __read_mostly =
@@ -67,7 +53,7 @@ static struct tracer function_trace __read_mostly =
67 .name = "function", 53 .name = "function",
68 .init = function_trace_init, 54 .init = function_trace_init,
69 .reset = function_trace_reset, 55 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 56 .start = function_trace_start,
71#ifdef CONFIG_FTRACE_SELFTEST 57#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function, 58 .selftest = trace_selftest_startup_function,
73#endif 59#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 000000000000..930c08e5b38e
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,669 @@
1/*
2 *
3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 *
8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/fs.h>
13
14#include "trace.h"
15
16#define TRACE_GRAPH_INDENT 2
17
18/* Flag options */
19#define TRACE_GRAPH_PRINT_OVERRUN 0x1
20#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8
23
24static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
29 /* Display Overhead ? */
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
33 { } /* Empty entry */
34};
35
36static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
39 .opts = trace_opts
40};
41
42/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44
45static int graph_trace_init(struct trace_array *tr)
46{
47 int cpu, ret;
48
49 for_each_online_cpu(cpu)
50 tracing_reset(tr, cpu);
51
52 ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry);
54 if (ret)
55 return ret;
56 tracing_start_cmdline_record();
57
58 return 0;
59}
60
61static void graph_trace_reset(struct trace_array *tr)
62{
63 tracing_stop_cmdline_record();
64 unregister_ftrace_graph();
65}
66
67static inline int log10_cpu(int nb)
68{
69 if (nb / 100)
70 return 3;
71 if (nb / 10)
72 return 2;
73 return 1;
74}
75
76static enum print_line_t
77print_graph_cpu(struct trace_seq *s, int cpu)
78{
79 int i;
80 int ret;
81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
83
84
85 /*
86 * Start with a space character - to make it stand out
87 * to the right a bit when trace output is pasted into
88 * email:
89 */
90 ret = trace_seq_printf(s, " ");
91
92 /*
93 * Tricky - we space the CPU field according to the max
94 * number of online CPUs. On a 2-cpu system it would take
95 * a maximum of 1 digit - on a 128 cpu system it would
96 * take up to 3 digits:
97 */
98 for (i = 0; i < log10_all - log10_this; i++) {
99 ret = trace_seq_printf(s, " ");
100 if (!ret)
101 return TRACE_TYPE_PARTIAL_LINE;
102 }
103 ret = trace_seq_printf(s, "%d) ", cpu);
104 if (!ret)
105 return TRACE_TYPE_PARTIAL_LINE;
106
107 return TRACE_TYPE_HANDLED;
108}
109
110#define TRACE_GRAPH_PROCINFO_LENGTH 14
111
112static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid)
114{
115 int i;
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11];
122
123 strncpy(comm, trace_find_cmdline(pid), 7);
124 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid);
126
127 /* 1 stands for the "-" character */
128 len = strlen(comm) + strlen(pid_str) + 1;
129
130 if (len < TRACE_GRAPH_PROCINFO_LENGTH)
131 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
132
133 /* First spaces to align center */
134 for (i = 0; i < spaces / 2; i++) {
135 ret = trace_seq_printf(s, " ");
136 if (!ret)
137 return TRACE_TYPE_PARTIAL_LINE;
138 }
139
140 ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
141 if (!ret)
142 return TRACE_TYPE_PARTIAL_LINE;
143
144 /* Last spaces to align center */
145 for (i = 0; i < spaces - (spaces / 2); i++) {
146 ret = trace_seq_printf(s, " ");
147 if (!ret)
148 return TRACE_TYPE_PARTIAL_LINE;
149 }
150 return TRACE_TYPE_HANDLED;
151}
152
153
154/* If the pid changed since the last trace, output this event */
155static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu)
157{
158 pid_t prev_pid;
159 int ret;
160
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
162 return TRACE_TYPE_HANDLED;
163
164 prev_pid = last_pid[cpu];
165 last_pid[cpu] = pid;
166
167/*
168 * Context-switch trace line:
169
170 ------------------------------------------
171 | 1) migration/0--1 => sshd-1755
172 ------------------------------------------
173
174 */
175 ret = trace_seq_printf(s,
176 " ------------------------------------------\n");
177 if (!ret)
178 TRACE_TYPE_PARTIAL_LINE;
179
180 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE)
182 TRACE_TYPE_PARTIAL_LINE;
183
184 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE)
186 TRACE_TYPE_PARTIAL_LINE;
187
188 ret = trace_seq_printf(s, " => ");
189 if (!ret)
190 TRACE_TYPE_PARTIAL_LINE;
191
192 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE)
194 TRACE_TYPE_PARTIAL_LINE;
195
196 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n");
198 if (!ret)
199 TRACE_TYPE_PARTIAL_LINE;
200
201 return ret;
202}
203
204static bool
205trace_branch_is_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr)
207{
208 struct ring_buffer_iter *ring_iter;
209 struct ring_buffer_event *event;
210 struct ftrace_graph_ret_entry *next;
211
212 ring_iter = iter->buffer_iter[iter->cpu];
213
214 if (!ring_iter)
215 return false;
216
217 event = ring_buffer_iter_peek(ring_iter, NULL);
218
219 if (!event)
220 return false;
221
222 next = ring_buffer_event_data(event);
223
224 if (next->ent.type != TRACE_GRAPH_RET)
225 return false;
226
227 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func)
229 return false;
230
231 return true;
232}
233
234static enum print_line_t
235print_graph_irq(struct trace_seq *s, unsigned long addr,
236 enum trace_type type, int cpu, pid_t pid)
237{
238 int ret;
239
240 if (addr < (unsigned long)__irqentry_text_start ||
241 addr >= (unsigned long)__irqentry_text_end)
242 return TRACE_TYPE_UNHANDLED;
243
244 if (type == TRACE_GRAPH_ENT) {
245 ret = trace_seq_printf(s, "==========> | ");
246 } else {
247 /* Cpu */
248 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
249 ret = print_graph_cpu(s, cpu);
250 if (ret == TRACE_TYPE_PARTIAL_LINE)
251 return TRACE_TYPE_PARTIAL_LINE;
252 }
253 /* Proc */
254 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
255 ret = print_graph_proc(s, pid);
256 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 return TRACE_TYPE_PARTIAL_LINE;
258
259 ret = trace_seq_printf(s, " | ");
260 if (!ret)
261 return TRACE_TYPE_PARTIAL_LINE;
262 }
263
264 /* No overhead */
265 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
266 ret = trace_seq_printf(s, " ");
267 if (!ret)
268 return TRACE_TYPE_PARTIAL_LINE;
269 }
270
271 ret = trace_seq_printf(s, "<========== |\n");
272 }
273 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE;
275 return TRACE_TYPE_HANDLED;
276}
277
278static enum print_line_t
279print_graph_duration(unsigned long long duration, struct trace_seq *s)
280{
281 unsigned long nsecs_rem = do_div(duration, 1000);
282 /* log10(ULONG_MAX) + '\0' */
283 char msecs_str[21];
284 char nsecs_str[5];
285 int ret, len;
286 int i;
287
288 sprintf(msecs_str, "%lu", (unsigned long) duration);
289
290 /* Print msecs */
291 ret = trace_seq_printf(s, msecs_str);
292 if (!ret)
293 return TRACE_TYPE_PARTIAL_LINE;
294
295 len = strlen(msecs_str);
296
297 /* Print nsecs (we don't want to exceed 7 numbers) */
298 if (len < 7) {
299 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
300 ret = trace_seq_printf(s, ".%s", nsecs_str);
301 if (!ret)
302 return TRACE_TYPE_PARTIAL_LINE;
303 len += strlen(nsecs_str);
304 }
305
306 ret = trace_seq_printf(s, " us ");
307 if (!ret)
308 return TRACE_TYPE_PARTIAL_LINE;
309
310 /* Print remaining spaces to fit the row's width */
311 for (i = len; i < 7; i++) {
312 ret = trace_seq_printf(s, " ");
313 if (!ret)
314 return TRACE_TYPE_PARTIAL_LINE;
315 }
316
317 ret = trace_seq_printf(s, "| ");
318 if (!ret)
319 return TRACE_TYPE_PARTIAL_LINE;
320 return TRACE_TYPE_HANDLED;
321
322}
323
324/* Signal a overhead of time execution to the output */
325static int
326print_graph_overhead(unsigned long long duration, struct trace_seq *s)
327{
328 /* Duration exceeded 100 msecs */
329 if (duration > 100000ULL)
330 return trace_seq_printf(s, "! ");
331
332 /* Duration exceeded 10 msecs */
333 if (duration > 10000ULL)
334 return trace_seq_printf(s, "+ ");
335
336 return trace_seq_printf(s, " ");
337}
338
339/* Case of a leaf function on its call entry */
340static enum print_line_t
341print_graph_entry_leaf(struct trace_iterator *iter,
342 struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
343{
344 struct ftrace_graph_ret_entry *ret_entry;
345 struct ftrace_graph_ret *graph_ret;
346 struct ring_buffer_event *event;
347 struct ftrace_graph_ent *call;
348 unsigned long long duration;
349 int ret;
350 int i;
351
352 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
353 ret_entry = ring_buffer_event_data(event);
354 graph_ret = &ret_entry->ret;
355 call = &entry->graph_ent;
356 duration = graph_ret->rettime - graph_ret->calltime;
357
358 /* Overhead */
359 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
360 ret = print_graph_overhead(duration, s);
361 if (!ret)
362 return TRACE_TYPE_PARTIAL_LINE;
363 }
364
365 /* Duration */
366 ret = print_graph_duration(duration, s);
367 if (ret == TRACE_TYPE_PARTIAL_LINE)
368 return TRACE_TYPE_PARTIAL_LINE;
369
370 /* Function */
371 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
372 ret = trace_seq_printf(s, " ");
373 if (!ret)
374 return TRACE_TYPE_PARTIAL_LINE;
375 }
376
377 ret = seq_print_ip_sym(s, call->func, 0);
378 if (!ret)
379 return TRACE_TYPE_PARTIAL_LINE;
380
381 ret = trace_seq_printf(s, "();\n");
382 if (!ret)
383 return TRACE_TYPE_PARTIAL_LINE;
384
385 return TRACE_TYPE_HANDLED;
386}
387
388static enum print_line_t
389print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
390 struct trace_seq *s, pid_t pid, int cpu)
391{
392 int i;
393 int ret;
394 struct ftrace_graph_ent *call = &entry->graph_ent;
395
396 /* No overhead */
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
398 ret = trace_seq_printf(s, " ");
399 if (!ret)
400 return TRACE_TYPE_PARTIAL_LINE;
401 }
402
403 /* Interrupt */
404 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
405 if (ret == TRACE_TYPE_UNHANDLED) {
406 /* No time */
407 ret = trace_seq_printf(s, " | ");
408 if (!ret)
409 return TRACE_TYPE_PARTIAL_LINE;
410 } else {
411 if (ret == TRACE_TYPE_PARTIAL_LINE)
412 return TRACE_TYPE_PARTIAL_LINE;
413 }
414
415
416 /* Function */
417 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
418 ret = trace_seq_printf(s, " ");
419 if (!ret)
420 return TRACE_TYPE_PARTIAL_LINE;
421 }
422
423 ret = seq_print_ip_sym(s, call->func, 0);
424 if (!ret)
425 return TRACE_TYPE_PARTIAL_LINE;
426
427 ret = trace_seq_printf(s, "() {\n");
428 if (!ret)
429 return TRACE_TYPE_PARTIAL_LINE;
430
431 return TRACE_TYPE_HANDLED;
432}
433
434static enum print_line_t
435print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
436 struct trace_iterator *iter, int cpu)
437{
438 int ret;
439 struct trace_entry *ent = iter->ent;
440
441 /* Pid */
442 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
443 return TRACE_TYPE_PARTIAL_LINE;
444
445 /* Cpu */
446 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
447 ret = print_graph_cpu(s, cpu);
448 if (ret == TRACE_TYPE_PARTIAL_LINE)
449 return TRACE_TYPE_PARTIAL_LINE;
450 }
451
452 /* Proc */
453 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
454 ret = print_graph_proc(s, ent->pid);
455 if (ret == TRACE_TYPE_PARTIAL_LINE)
456 return TRACE_TYPE_PARTIAL_LINE;
457
458 ret = trace_seq_printf(s, " | ");
459 if (!ret)
460 return TRACE_TYPE_PARTIAL_LINE;
461 }
462
463 if (trace_branch_is_leaf(iter, field))
464 return print_graph_entry_leaf(iter, field, s);
465 else
466 return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
467
468}
469
470static enum print_line_t
471print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
472 struct trace_entry *ent, int cpu)
473{
474 int i;
475 int ret;
476 unsigned long long duration = trace->rettime - trace->calltime;
477
478 /* Pid */
479 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
480 return TRACE_TYPE_PARTIAL_LINE;
481
482 /* Cpu */
483 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
484 ret = print_graph_cpu(s, cpu);
485 if (ret == TRACE_TYPE_PARTIAL_LINE)
486 return TRACE_TYPE_PARTIAL_LINE;
487 }
488
489 /* Proc */
490 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
491 ret = print_graph_proc(s, ent->pid);
492 if (ret == TRACE_TYPE_PARTIAL_LINE)
493 return TRACE_TYPE_PARTIAL_LINE;
494
495 ret = trace_seq_printf(s, " | ");
496 if (!ret)
497 return TRACE_TYPE_PARTIAL_LINE;
498 }
499
500 /* Overhead */
501 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
502 ret = print_graph_overhead(duration, s);
503 if (!ret)
504 return TRACE_TYPE_PARTIAL_LINE;
505 }
506
507 /* Duration */
508 ret = print_graph_duration(duration, s);
509 if (ret == TRACE_TYPE_PARTIAL_LINE)
510 return TRACE_TYPE_PARTIAL_LINE;
511
512 /* Closing brace */
513 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
514 ret = trace_seq_printf(s, " ");
515 if (!ret)
516 return TRACE_TYPE_PARTIAL_LINE;
517 }
518
519 ret = trace_seq_printf(s, "}\n");
520 if (!ret)
521 return TRACE_TYPE_PARTIAL_LINE;
522
523 /* Overrun */
524 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
525 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
526 trace->overrun);
527 if (!ret)
528 return TRACE_TYPE_PARTIAL_LINE;
529 }
530
531 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
532 if (ret == TRACE_TYPE_PARTIAL_LINE)
533 return TRACE_TYPE_PARTIAL_LINE;
534
535 return TRACE_TYPE_HANDLED;
536}
537
538static enum print_line_t
539print_graph_comment(struct print_entry *trace, struct trace_seq *s,
540 struct trace_entry *ent, struct trace_iterator *iter)
541{
542 int i;
543 int ret;
544
545 /* Pid */
546 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
547 return TRACE_TYPE_PARTIAL_LINE;
548
549 /* Cpu */
550 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
551 ret = print_graph_cpu(s, iter->cpu);
552 if (ret == TRACE_TYPE_PARTIAL_LINE)
553 return TRACE_TYPE_PARTIAL_LINE;
554 }
555
556 /* Proc */
557 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
558 ret = print_graph_proc(s, ent->pid);
559 if (ret == TRACE_TYPE_PARTIAL_LINE)
560 return TRACE_TYPE_PARTIAL_LINE;
561
562 ret = trace_seq_printf(s, " | ");
563 if (!ret)
564 return TRACE_TYPE_PARTIAL_LINE;
565 }
566
567 /* No overhead */
568 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
569 ret = trace_seq_printf(s, " ");
570 if (!ret)
571 return TRACE_TYPE_PARTIAL_LINE;
572 }
573
574 /* No time */
575 ret = trace_seq_printf(s, " | ");
576 if (!ret)
577 return TRACE_TYPE_PARTIAL_LINE;
578
579 /* Indentation */
580 if (trace->depth > 0)
581 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
582 ret = trace_seq_printf(s, " ");
583 if (!ret)
584 return TRACE_TYPE_PARTIAL_LINE;
585 }
586
587 /* The comment */
588 ret = trace_seq_printf(s, "/* %s", trace->buf);
589 if (!ret)
590 return TRACE_TYPE_PARTIAL_LINE;
591
592 if (ent->flags & TRACE_FLAG_CONT)
593 trace_seq_print_cont(s, iter);
594
595 ret = trace_seq_printf(s, " */\n");
596 if (!ret)
597 return TRACE_TYPE_PARTIAL_LINE;
598
599 return TRACE_TYPE_HANDLED;
600}
601
602
603enum print_line_t
604print_graph_function(struct trace_iterator *iter)
605{
606 struct trace_seq *s = &iter->seq;
607 struct trace_entry *entry = iter->ent;
608
609 switch (entry->type) {
610 case TRACE_GRAPH_ENT: {
611 struct ftrace_graph_ent_entry *field;
612 trace_assign_type(field, entry);
613 return print_graph_entry(field, s, iter,
614 iter->cpu);
615 }
616 case TRACE_GRAPH_RET: {
617 struct ftrace_graph_ret_entry *field;
618 trace_assign_type(field, entry);
619 return print_graph_return(&field->ret, s, entry, iter->cpu);
620 }
621 case TRACE_PRINT: {
622 struct print_entry *field;
623 trace_assign_type(field, entry);
624 return print_graph_comment(field, s, entry, iter);
625 }
626 default:
627 return TRACE_TYPE_UNHANDLED;
628 }
629}
630
631static void print_graph_headers(struct seq_file *s)
632{
633 /* 1st line */
634 seq_printf(s, "# ");
635 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
636 seq_printf(s, "CPU ");
637 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
638 seq_printf(s, "TASK/PID ");
639 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
640 seq_printf(s, "OVERHEAD/");
641 seq_printf(s, "DURATION FUNCTION CALLS\n");
642
643 /* 2nd line */
644 seq_printf(s, "# ");
645 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
646 seq_printf(s, "| ");
647 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
648 seq_printf(s, "| | ");
649 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
650 seq_printf(s, "| ");
651 seq_printf(s, "| | | | |\n");
652 } else
653 seq_printf(s, " | | | | |\n");
654}
655static struct tracer graph_trace __read_mostly = {
656 .name = "function_graph",
657 .init = graph_trace_init,
658 .reset = graph_trace_reset,
659 .print_line = print_graph_function,
660 .print_header = print_graph_headers,
661 .flags = &tracer_flags,
662};
663
664static __init int init_graph_trace(void)
665{
666 return register_tracer(&graph_trace);
667}
668
669device_initcall(init_graph_trace);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
new file mode 100644
index 000000000000..649df22d435f
--- /dev/null
+++ b/kernel/trace/trace_hw_branches.c
@@ -0,0 +1,195 @@
1/*
2 * h/w branch tracer for x86 based on bts
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28static void bts_trace_start_cpu(void *arg)
29{
30 if (this_tracer)
31 ds_release_bts(this_tracer);
32
33 this_tracer =
34 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
35 /* ovfl = */ NULL, /* th = */ (size_t)-1,
36 BTS_KERNEL);
37 if (IS_ERR(this_tracer)) {
38 this_tracer = NULL;
39 return;
40 }
41}
42
43static void bts_trace_start(struct trace_array *tr)
44{
45 int cpu;
46
47 tracing_reset_online_cpus(tr);
48
49 for_each_cpu(cpu, cpu_possible_mask)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51}
52
53static void bts_trace_stop_cpu(void *arg)
54{
55 if (this_tracer) {
56 ds_release_bts(this_tracer);
57 this_tracer = NULL;
58 }
59}
60
61static void bts_trace_stop(struct trace_array *tr)
62{
63 int cpu;
64
65 for_each_cpu(cpu, cpu_possible_mask)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67}
68
69static int bts_trace_init(struct trace_array *tr)
70{
71 tracing_reset_online_cpus(tr);
72 bts_trace_start(tr);
73
74 return 0;
75}
76
77static void bts_trace_print_header(struct seq_file *m)
78{
79 seq_puts(m,
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83}
84
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
86{
87 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it;
90
91 trace_assign_type(it, entry);
92
93 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
96 it->from, it->to) &&
97 (!it->from ||
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;;
102 }
103 return TRACE_TYPE_UNHANDLED;
104}
105
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
107{
108 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry;
110 unsigned long irq;
111
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
113 if (!event)
114 return;
115 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from;
120 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq);
122}
123
124static void trace_bts_at(struct trace_array *tr,
125 const struct bts_trace *trace, void *at)
126{
127 struct bts_struct bts;
128 int err = 0;
129
130 WARN_ON_ONCE(!trace->read);
131 if (!trace->read)
132 return;
133
134 err = trace->read(this_tracer, at, &bts);
135 if (err < 0)
136 return;
137
138 switch (bts.qualifier) {
139 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
141 break;
142 }
143}
144
145static void trace_bts_cpu(void *arg)
146{
147 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace;
149 unsigned char *at;
150
151 if (!this_tracer)
152 return;
153
154 ds_suspend_bts(this_tracer);
155 trace = ds_read_bts(this_tracer);
156 if (!trace)
157 goto out;
158
159 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size)
161 trace_bts_at(tr, trace, at);
162
163 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size)
165 trace_bts_at(tr, trace, at);
166
167out:
168 ds_resume_bts(this_tracer);
169}
170
171static void trace_bts_prepare(struct trace_iterator *iter)
172{
173 int cpu;
174
175 for_each_cpu(cpu, cpu_possible_mask)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177}
178
179struct tracer bts_tracer __read_mostly =
180{
181 .name = "hw-branch-tracer",
182 .init = bts_trace_init,
183 .reset = bts_trace_stop,
184 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line,
186 .start = bts_trace_start,
187 .stop = bts_trace_stop,
188 .open = trace_bts_prepare
189};
190
191__init static int init_bts_trace(void)
192{
193 return register_tracer(&bts_tracer);
194}
195device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9c74071c10e0..7c2e326bbc8b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 353}
354#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
355 355
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
356static void start_irqsoff_tracer(struct trace_array *tr) 362static void start_irqsoff_tracer(struct trace_array *tr)
357{ 363{
358 register_ftrace_function(&trace_ops); 364 register_ftrace_function(&trace_ops);
359 tracer_enabled = 1; 365 if (tracing_is_enabled()) {
366 tracer_enabled = 1;
367 save_tracer_enabled = 1;
368 } else {
369 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
360} 372}
361 373
362static void stop_irqsoff_tracer(struct trace_array *tr) 374static void stop_irqsoff_tracer(struct trace_array *tr)
363{ 375{
364 tracer_enabled = 0; 376 tracer_enabled = 0;
377 save_tracer_enabled = 0;
365 unregister_ftrace_function(&trace_ops); 378 unregister_ftrace_function(&trace_ops);
366} 379}
367 380
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
370 irqsoff_trace = tr; 383 irqsoff_trace = tr;
371 /* make sure that the tracer is visible */ 384 /* make sure that the tracer is visible */
372 smp_wmb(); 385 smp_wmb();
373 386 start_irqsoff_tracer(tr);
374 if (tr->ctrl)
375 start_irqsoff_tracer(tr);
376} 387}
377 388
378static void irqsoff_tracer_reset(struct trace_array *tr) 389static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 390{
380 if (tr->ctrl) 391 stop_irqsoff_tracer(tr);
381 stop_irqsoff_tracer(tr);
382} 392}
383 393
384static void irqsoff_tracer_ctrl_update(struct trace_array *tr) 394static void irqsoff_tracer_start(struct trace_array *tr)
385{ 395{
386 if (tr->ctrl) 396 tracer_enabled = 1;
387 start_irqsoff_tracer(tr); 397 save_tracer_enabled = 1;
388 else 398}
389 stop_irqsoff_tracer(tr); 399
400static void irqsoff_tracer_stop(struct trace_array *tr)
401{
402 tracer_enabled = 0;
403 save_tracer_enabled = 0;
390} 404}
391 405
392static void irqsoff_tracer_open(struct trace_iterator *iter) 406static void irqsoff_tracer_open(struct trace_iterator *iter)
393{ 407{
394 /* stop the trace while dumping */ 408 /* stop the trace while dumping */
395 if (iter->tr->ctrl) 409 tracer_enabled = 0;
396 stop_irqsoff_tracer(iter->tr);
397} 410}
398 411
399static void irqsoff_tracer_close(struct trace_iterator *iter) 412static void irqsoff_tracer_close(struct trace_iterator *iter)
400{ 413{
401 if (iter->tr->ctrl) 414 /* restart tracing */
402 start_irqsoff_tracer(iter->tr); 415 tracer_enabled = save_tracer_enabled;
403} 416}
404 417
405#ifdef CONFIG_IRQSOFF_TRACER 418#ifdef CONFIG_IRQSOFF_TRACER
406static void irqsoff_tracer_init(struct trace_array *tr) 419static int irqsoff_tracer_init(struct trace_array *tr)
407{ 420{
408 trace_type = TRACER_IRQS_OFF; 421 trace_type = TRACER_IRQS_OFF;
409 422
410 __irqsoff_tracer_init(tr); 423 __irqsoff_tracer_init(tr);
424 return 0;
411} 425}
412static struct tracer irqsoff_tracer __read_mostly = 426static struct tracer irqsoff_tracer __read_mostly =
413{ 427{
414 .name = "irqsoff", 428 .name = "irqsoff",
415 .init = irqsoff_tracer_init, 429 .init = irqsoff_tracer_init,
416 .reset = irqsoff_tracer_reset, 430 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop,
417 .open = irqsoff_tracer_open, 433 .open = irqsoff_tracer_open,
418 .close = irqsoff_tracer_close, 434 .close = irqsoff_tracer_close,
419 .ctrl_update = irqsoff_tracer_ctrl_update,
420 .print_max = 1, 435 .print_max = 1,
421#ifdef CONFIG_FTRACE_SELFTEST 436#ifdef CONFIG_FTRACE_SELFTEST
422 .selftest = trace_selftest_startup_irqsoff, 437 .selftest = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
428#endif 443#endif
429 444
430#ifdef CONFIG_PREEMPT_TRACER 445#ifdef CONFIG_PREEMPT_TRACER
431static void preemptoff_tracer_init(struct trace_array *tr) 446static int preemptoff_tracer_init(struct trace_array *tr)
432{ 447{
433 trace_type = TRACER_PREEMPT_OFF; 448 trace_type = TRACER_PREEMPT_OFF;
434 449
435 __irqsoff_tracer_init(tr); 450 __irqsoff_tracer_init(tr);
451 return 0;
436} 452}
437 453
438static struct tracer preemptoff_tracer __read_mostly = 454static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
440 .name = "preemptoff", 456 .name = "preemptoff",
441 .init = preemptoff_tracer_init, 457 .init = preemptoff_tracer_init,
442 .reset = irqsoff_tracer_reset, 458 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop,
443 .open = irqsoff_tracer_open, 461 .open = irqsoff_tracer_open,
444 .close = irqsoff_tracer_close, 462 .close = irqsoff_tracer_close,
445 .ctrl_update = irqsoff_tracer_ctrl_update,
446 .print_max = 1, 463 .print_max = 1,
447#ifdef CONFIG_FTRACE_SELFTEST 464#ifdef CONFIG_FTRACE_SELFTEST
448 .selftest = trace_selftest_startup_preemptoff, 465 .selftest = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
456#if defined(CONFIG_IRQSOFF_TRACER) && \ 473#if defined(CONFIG_IRQSOFF_TRACER) && \
457 defined(CONFIG_PREEMPT_TRACER) 474 defined(CONFIG_PREEMPT_TRACER)
458 475
459static void preemptirqsoff_tracer_init(struct trace_array *tr) 476static int preemptirqsoff_tracer_init(struct trace_array *tr)
460{ 477{
461 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 478 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
462 479
463 __irqsoff_tracer_init(tr); 480 __irqsoff_tracer_init(tr);
481 return 0;
464} 482}
465 483
466static struct tracer preemptirqsoff_tracer __read_mostly = 484static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
468 .name = "preemptirqsoff", 486 .name = "preemptirqsoff",
469 .init = preemptirqsoff_tracer_init, 487 .init = preemptirqsoff_tracer_init,
470 .reset = irqsoff_tracer_reset, 488 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop,
471 .open = irqsoff_tracer_open, 491 .open = irqsoff_tracer_open,
472 .close = irqsoff_tracer_close, 492 .close = irqsoff_tracer_close,
473 .ctrl_update = irqsoff_tracer_ctrl_update,
474 .print_max = 1, 493 .print_max = 1,
475#ifdef CONFIG_FTRACE_SELFTEST 494#ifdef CONFIG_FTRACE_SELFTEST
476 .selftest = trace_selftest_startup_preemptirqsoff, 495 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index e62cbf78eab6..fffcb069f1dc 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -22,44 +22,35 @@ static unsigned long prev_overruns;
22 22
23static void mmio_reset_data(struct trace_array *tr) 23static void mmio_reset_data(struct trace_array *tr)
24{ 24{
25 int cpu;
26
27 overrun_detected = false; 25 overrun_detected = false;
28 prev_overruns = 0; 26 prev_overruns = 0;
29 tr->time_start = ftrace_now(tr->cpu);
30 27
31 for_each_online_cpu(cpu) 28 tracing_reset_online_cpus(tr);
32 tracing_reset(tr, cpu);
33} 29}
34 30
35static void mmio_trace_init(struct trace_array *tr) 31static int mmio_trace_init(struct trace_array *tr)
36{ 32{
37 pr_debug("in %s\n", __func__); 33 pr_debug("in %s\n", __func__);
38 mmio_trace_array = tr; 34 mmio_trace_array = tr;
39 if (tr->ctrl) { 35
40 mmio_reset_data(tr); 36 mmio_reset_data(tr);
41 enable_mmiotrace(); 37 enable_mmiotrace();
42 } 38 return 0;
43} 39}
44 40
45static void mmio_trace_reset(struct trace_array *tr) 41static void mmio_trace_reset(struct trace_array *tr)
46{ 42{
47 pr_debug("in %s\n", __func__); 43 pr_debug("in %s\n", __func__);
48 if (tr->ctrl) 44
49 disable_mmiotrace(); 45 disable_mmiotrace();
50 mmio_reset_data(tr); 46 mmio_reset_data(tr);
51 mmio_trace_array = NULL; 47 mmio_trace_array = NULL;
52} 48}
53 49
54static void mmio_trace_ctrl_update(struct trace_array *tr) 50static void mmio_trace_start(struct trace_array *tr)
55{ 51{
56 pr_debug("in %s\n", __func__); 52 pr_debug("in %s\n", __func__);
57 if (tr->ctrl) { 53 mmio_reset_data(tr);
58 mmio_reset_data(tr);
59 enable_mmiotrace();
60 } else {
61 disable_mmiotrace();
62 }
63} 54}
64 55
65static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 56static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -296,10 +287,10 @@ static struct tracer mmio_tracer __read_mostly =
296 .name = "mmiotrace", 287 .name = "mmiotrace",
297 .init = mmio_trace_init, 288 .init = mmio_trace_init,
298 .reset = mmio_trace_reset, 289 .reset = mmio_trace_reset,
290 .start = mmio_trace_start,
299 .pipe_open = mmio_pipe_open, 291 .pipe_open = mmio_pipe_open,
300 .close = mmio_close, 292 .close = mmio_close,
301 .read = mmio_read, 293 .read = mmio_read,
302 .ctrl_update = mmio_trace_ctrl_update,
303 .print_line = mmio_print_line, 294 .print_line = mmio_print_line,
304}; 295};
305 296
@@ -371,5 +362,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
371 362
372int mmio_trace_printk(const char *fmt, va_list args) 363int mmio_trace_printk(const char *fmt, va_list args)
373{ 364{
374 return trace_vprintk(0, fmt, args); 365 return trace_vprintk(0, -1, fmt, args);
375} 366}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b4862515..b9767acd30ac 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15/* Our two options */
16enum {
17 TRACE_NOP_OPT_ACCEPT = 0x1,
18 TRACE_NOP_OPT_REFUSE = 0x2
19};
20
21/* Options for the tracer (see trace_options file) */
22static struct tracer_opt nop_opts[] = {
23 /* Option that will be accepted by set_flag callback */
24 { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
25 /* Option that will be refused by set_flag callback */
26 { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
27 { } /* Always set a last empty entry */
28};
29
30static struct tracer_flags nop_flags = {
31 /* You can check your flags value here when you want. */
32 .val = 0, /* By default: all flags disabled */
33 .opts = nop_opts
34};
35
15static struct trace_array *ctx_trace; 36static struct trace_array *ctx_trace;
16 37
17static void start_nop_trace(struct trace_array *tr) 38static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
24 /* Nothing to do! */ 45 /* Nothing to do! */
25} 46}
26 47
27static void nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
28{ 49{
29 int cpu; 50 int cpu;
30 ctx_trace = tr; 51 ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
32 for_each_online_cpu(cpu) 53 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu); 54 tracing_reset(tr, cpu);
34 55
35 if (tr->ctrl) 56 start_nop_trace(tr);
36 start_nop_trace(tr); 57 return 0;
37} 58}
38 59
39static void nop_trace_reset(struct trace_array *tr) 60static void nop_trace_reset(struct trace_array *tr)
40{ 61{
41 if (tr->ctrl) 62 stop_nop_trace(tr);
42 stop_nop_trace(tr);
43} 63}
44 64
45static void nop_trace_ctrl_update(struct trace_array *tr) 65/* It only serves as a signal handler and a callback to
66 * accept or refuse tthe setting of a flag.
67 * If you don't implement it, then the flag setting will be
68 * automatically accepted.
69 */
70static int nop_set_flag(u32 old_flags, u32 bit, int set)
46{ 71{
47 /* When starting a new trace, reset the buffers */ 72 /*
48 if (tr->ctrl) 73 * Note that you don't need to update nop_flags.val yourself.
49 start_nop_trace(tr); 74 * The tracing Api will do it automatically if you return 0
50 else 75 */
51 stop_nop_trace(tr); 76 if (bit == TRACE_NOP_OPT_ACCEPT) {
77 printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
78 " Now cat trace_options to see the result\n",
79 set);
80 return 0;
81 }
82
83 if (bit == TRACE_NOP_OPT_REFUSE) {
84 printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
85 "Now cat trace_options to see the result\n",
86 set);
87 return -EINVAL;
88 }
89
90 return 0;
52} 91}
53 92
93
54struct tracer nop_trace __read_mostly = 94struct tracer nop_trace __read_mostly =
55{ 95{
56 .name = "nop", 96 .name = "nop",
57 .init = nop_trace_init, 97 .init = nop_trace_init,
58 .reset = nop_trace_reset, 98 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST 99#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop, 100 .selftest = trace_selftest_startup_nop,
62#endif 101#endif
102 .flags = &nop_flags,
103 .set_flag = nop_set_flag
63}; 104};
64 105
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 000000000000..7bda248daf55
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <linux/ftrace.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19
20static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled;
22
23
24static void start_power_trace(struct trace_array *tr)
25{
26 trace_power_enabled = 1;
27}
28
29static void stop_power_trace(struct trace_array *tr)
30{
31 trace_power_enabled = 0;
32}
33
34
35static int power_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 power_trace = tr;
39
40 trace_power_enabled = 1;
41
42 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu);
44 return 0;
45}
46
47static enum print_line_t power_print_line(struct trace_iterator *iter)
48{
49 int ret = 0;
50 struct trace_entry *entry = iter->ent;
51 struct trace_power *field ;
52 struct power_trace *it;
53 struct trace_seq *s = &iter->seq;
54 struct timespec stamp;
55 struct timespec duration;
56
57 trace_assign_type(field, entry);
58 it = &field->state_data;
59 stamp = ktime_to_timespec(it->stamp);
60 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
61
62 if (entry->type == TRACE_POWER) {
63 if (it->type == POWER_CSTATE)
64 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
65 stamp.tv_sec,
66 stamp.tv_nsec,
67 it->state, iter->cpu,
68 duration.tv_sec,
69 duration.tv_nsec);
70 if (it->type == POWER_PSTATE)
71 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
72 stamp.tv_sec,
73 stamp.tv_nsec,
74 it->state, iter->cpu);
75 if (!ret)
76 return TRACE_TYPE_PARTIAL_LINE;
77 return TRACE_TYPE_HANDLED;
78 }
79 return TRACE_TYPE_UNHANDLED;
80}
81
82static struct tracer power_tracer __read_mostly =
83{
84 .name = "power",
85 .init = power_trace_init,
86 .start = start_power_trace,
87 .stop = stop_power_trace,
88 .reset = stop_power_trace,
89 .print_line = power_print_line,
90};
91
92static int init_power_trace(void)
93{
94 return register_tracer(&power_tracer);
95}
96device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a62..df175cb4564f 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@
16 16
17static struct trace_array *ctx_trace; 17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex);
20 21
21static void 22static void
22probe_sched_switch(struct rq *__rq, struct task_struct *prev, 23probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
27 int cpu; 28 int cpu;
28 int pc; 29 int pc;
29 30
30 if (!atomic_read(&sched_ref)) 31 if (!sched_ref)
31 return; 32 return;
32 33
33 tracing_record_cmdline(prev); 34 tracing_record_cmdline(prev);
@@ -48,7 +49,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
48} 49}
49 50
50static void 51static void
51probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) 52probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
52{ 53{
53 struct trace_array_cpu *data; 54 struct trace_array_cpu *data;
54 unsigned long flags; 55 unsigned long flags;
@@ -71,16 +72,6 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
71 local_irq_restore(flags); 72 local_irq_restore(flags);
72} 73}
73 74
74static void sched_switch_reset(struct trace_array *tr)
75{
76 int cpu;
77
78 tr->time_start = ftrace_now(tr->cpu);
79
80 for_each_online_cpu(cpu)
81 tracing_reset(tr, cpu);
82}
83
84static int tracing_sched_register(void) 75static int tracing_sched_register(void)
85{ 76{
86 int ret; 77 int ret;
@@ -123,20 +114,18 @@ static void tracing_sched_unregister(void)
123 114
124static void tracing_start_sched_switch(void) 115static void tracing_start_sched_switch(void)
125{ 116{
126 long ref; 117 mutex_lock(&sched_register_mutex);
127 118 if (!(sched_ref++))
128 ref = atomic_inc_return(&sched_ref);
129 if (ref == 1)
130 tracing_sched_register(); 119 tracing_sched_register();
120 mutex_unlock(&sched_register_mutex);
131} 121}
132 122
133static void tracing_stop_sched_switch(void) 123static void tracing_stop_sched_switch(void)
134{ 124{
135 long ref; 125 mutex_lock(&sched_register_mutex);
136 126 if (!(--sched_ref))
137 ref = atomic_dec_and_test(&sched_ref);
138 if (ref)
139 tracing_sched_unregister(); 127 tracing_sched_unregister();
128 mutex_unlock(&sched_register_mutex);
140} 129}
141 130
142void tracing_start_cmdline_record(void) 131void tracing_start_cmdline_record(void)
@@ -149,40 +138,86 @@ void tracing_stop_cmdline_record(void)
149 tracing_stop_sched_switch(); 138 tracing_stop_sched_switch();
150} 139}
151 140
141/**
142 * tracing_start_sched_switch_record - start tracing context switches
143 *
144 * Turns on context switch tracing for a tracer.
145 */
146void tracing_start_sched_switch_record(void)
147{
148 if (unlikely(!ctx_trace)) {
149 WARN_ON(1);
150 return;
151 }
152
153 tracing_start_sched_switch();
154
155 mutex_lock(&sched_register_mutex);
156 tracer_enabled++;
157 mutex_unlock(&sched_register_mutex);
158}
159
160/**
161 * tracing_stop_sched_switch_record - start tracing context switches
162 *
163 * Turns off context switch tracing for a tracer.
164 */
165void tracing_stop_sched_switch_record(void)
166{
167 mutex_lock(&sched_register_mutex);
168 tracer_enabled--;
169 WARN_ON(tracer_enabled < 0);
170 mutex_unlock(&sched_register_mutex);
171
172 tracing_stop_sched_switch();
173}
174
175/**
176 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
177 * @tr: trace array pointer to assign
178 *
179 * Some tracers might want to record the context switches in their
180 * trace. This function lets those tracers assign the trace array
181 * to use.
182 */
183void tracing_sched_switch_assign_trace(struct trace_array *tr)
184{
185 ctx_trace = tr;
186}
187
152static void start_sched_trace(struct trace_array *tr) 188static void start_sched_trace(struct trace_array *tr)
153{ 189{
154 sched_switch_reset(tr); 190 tracing_reset_online_cpus(tr);
155 tracing_start_cmdline_record(); 191 tracing_start_sched_switch_record();
156 tracer_enabled = 1;
157} 192}
158 193
159static void stop_sched_trace(struct trace_array *tr) 194static void stop_sched_trace(struct trace_array *tr)
160{ 195{
161 tracer_enabled = 0; 196 tracing_stop_sched_switch_record();
162 tracing_stop_cmdline_record();
163} 197}
164 198
165static void sched_switch_trace_init(struct trace_array *tr) 199static int sched_switch_trace_init(struct trace_array *tr)
166{ 200{
167 ctx_trace = tr; 201 ctx_trace = tr;
168 202 start_sched_trace(tr);
169 if (tr->ctrl) 203 return 0;
170 start_sched_trace(tr);
171} 204}
172 205
173static void sched_switch_trace_reset(struct trace_array *tr) 206static void sched_switch_trace_reset(struct trace_array *tr)
174{ 207{
175 if (tr->ctrl) 208 if (sched_ref)
176 stop_sched_trace(tr); 209 stop_sched_trace(tr);
177} 210}
178 211
179static void sched_switch_trace_ctrl_update(struct trace_array *tr) 212static void sched_switch_trace_start(struct trace_array *tr)
180{ 213{
181 /* When starting a new trace, reset the buffers */ 214 tracing_reset_online_cpus(tr);
182 if (tr->ctrl) 215 tracing_start_sched_switch();
183 start_sched_trace(tr); 216}
184 else 217
185 stop_sched_trace(tr); 218static void sched_switch_trace_stop(struct trace_array *tr)
219{
220 tracing_stop_sched_switch();
186} 221}
187 222
188static struct tracer sched_switch_trace __read_mostly = 223static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +225,8 @@ static struct tracer sched_switch_trace __read_mostly =
190 .name = "sched_switch", 225 .name = "sched_switch",
191 .init = sched_switch_trace_init, 226 .init = sched_switch_trace_init,
192 .reset = sched_switch_trace_reset, 227 .reset = sched_switch_trace_reset,
193 .ctrl_update = sched_switch_trace_ctrl_update, 228 .start = sched_switch_trace_start,
229 .stop = sched_switch_trace_stop,
194#ifdef CONFIG_FTRACE_SELFTEST 230#ifdef CONFIG_FTRACE_SELFTEST
195 .selftest = trace_selftest_startup_sched_switch, 231 .selftest = trace_selftest_startup_sched_switch,
196#endif 232#endif
@@ -198,14 +234,7 @@ static struct tracer sched_switch_trace __read_mostly =
198 234
199__init static int init_sched_switch_trace(void) 235__init static int init_sched_switch_trace(void)
200{ 236{
201 int ret = 0;
202
203 if (atomic_read(&sched_ref))
204 ret = tracing_sched_register();
205 if (ret) {
206 pr_info("error registering scheduler trace\n");
207 return ret;
208 }
209 return register_tracer(&sched_switch_trace); 237 return register_tracer(&sched_switch_trace);
210} 238}
211device_initcall(init_sched_switch_trace); 239device_initcall(init_sched_switch_trace);
240
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3ae93f16b565..43586b689e31 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
50 return; 50 return;
51 51
52 pc = preempt_count(); 52 pc = preempt_count();
53 resched = need_resched(); 53 resched = ftrace_preempt_disable();
54 preempt_disable_notrace();
55 54
56 cpu = raw_smp_processor_id(); 55 cpu = raw_smp_processor_id();
57 data = tr->data[cpu]; 56 data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
81 out: 80 out:
82 atomic_dec(&data->disabled); 81 atomic_dec(&data->disabled);
83 82
84 /* 83 ftrace_preempt_enable(resched);
85 * To prevent recursion from the scheduler, if the
86 * resched flag was set before we entered, then
87 * don't reschedule.
88 */
89 if (resched)
90 preempt_enable_no_resched_notrace();
91 else
92 preempt_enable_notrace();
93} 84}
94 85
95static struct ftrace_ops trace_ops __read_mostly = 86static struct ftrace_ops trace_ops __read_mostly =
@@ -220,7 +211,7 @@ static void wakeup_reset(struct trace_array *tr)
220} 211}
221 212
222static void 213static void
223probe_wakeup(struct rq *rq, struct task_struct *p) 214probe_wakeup(struct rq *rq, struct task_struct *p, int success)
224{ 215{
225 int cpu = smp_processor_id(); 216 int cpu = smp_processor_id();
226 unsigned long flags; 217 unsigned long flags;
@@ -271,6 +262,12 @@ out:
271 atomic_dec(&wakeup_trace->data[cpu]->disabled); 262 atomic_dec(&wakeup_trace->data[cpu]->disabled);
272} 263}
273 264
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
274static void start_wakeup_tracer(struct trace_array *tr) 271static void start_wakeup_tracer(struct trace_array *tr)
275{ 272{
276 int ret; 273 int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
309 306
310 register_ftrace_function(&trace_ops); 307 register_ftrace_function(&trace_ops);
311 308
312 tracer_enabled = 1; 309 if (tracing_is_enabled()) {
310 tracer_enabled = 1;
311 save_tracer_enabled = 1;
312 } else {
313 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
313 316
314 return; 317 return;
315fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
322{ 325{
323 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 328 unregister_ftrace_function(&trace_ops);
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 329 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 330 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 331 unregister_trace_sched_wakeup(probe_wakeup);
328} 332}
329 333
330static void wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
331{ 335{
332 wakeup_trace = tr; 336 wakeup_trace = tr;
333 337 start_wakeup_tracer(tr);
334 if (tr->ctrl) 338 return 0;
335 start_wakeup_tracer(tr);
336} 339}
337 340
338static void wakeup_tracer_reset(struct trace_array *tr) 341static void wakeup_tracer_reset(struct trace_array *tr)
339{ 342{
340 if (tr->ctrl) { 343 stop_wakeup_tracer(tr);
341 stop_wakeup_tracer(tr); 344 /* make sure we put back any tasks we are tracing */
342 /* make sure we put back any tasks we are tracing */ 345 wakeup_reset(tr);
343 wakeup_reset(tr); 346}
344 } 347
348static void wakeup_tracer_start(struct trace_array *tr)
349{
350 wakeup_reset(tr);
351 tracer_enabled = 1;
352 save_tracer_enabled = 1;
345} 353}
346 354
347static void wakeup_tracer_ctrl_update(struct trace_array *tr) 355static void wakeup_tracer_stop(struct trace_array *tr)
348{ 356{
349 if (tr->ctrl) 357 tracer_enabled = 0;
350 start_wakeup_tracer(tr); 358 save_tracer_enabled = 0;
351 else
352 stop_wakeup_tracer(tr);
353} 359}
354 360
355static void wakeup_tracer_open(struct trace_iterator *iter) 361static void wakeup_tracer_open(struct trace_iterator *iter)
356{ 362{
357 /* stop the trace while dumping */ 363 /* stop the trace while dumping */
358 if (iter->tr->ctrl) 364 tracer_enabled = 0;
359 stop_wakeup_tracer(iter->tr);
360} 365}
361 366
362static void wakeup_tracer_close(struct trace_iterator *iter) 367static void wakeup_tracer_close(struct trace_iterator *iter)
363{ 368{
364 /* forget about any processes we were recording */ 369 /* forget about any processes we were recording */
365 if (iter->tr->ctrl) 370 if (save_tracer_enabled) {
366 start_wakeup_tracer(iter->tr); 371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
367} 374}
368 375
369static struct tracer wakeup_tracer __read_mostly = 376static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
371 .name = "wakeup", 378 .name = "wakeup",
372 .init = wakeup_tracer_init, 379 .init = wakeup_tracer_init,
373 .reset = wakeup_tracer_reset, 380 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop,
374 .open = wakeup_tracer_open, 383 .open = wakeup_tracer_open,
375 .close = wakeup_tracer_close, 384 .close = wakeup_tracer_close,
376 .ctrl_update = wakeup_tracer_ctrl_update,
377 .print_max = 1, 385 .print_max = 1,
378#ifdef CONFIG_FTRACE_SELFTEST 386#ifdef CONFIG_FTRACE_SELFTEST
379 .selftest = trace_selftest_startup_wakeup, 387 .selftest = trace_selftest_startup_wakeup,
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 90bc752a7580..88c8eb70f54a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH:
16 return 1; 17 return 1;
17 } 18 }
18 return 0; 19 return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
51 int cpu, ret = 0; 52 int cpu, ret = 0;
52 53
53 /* Don't allow flipping of max traces now */ 54 /* Don't allow flipping of max traces now */
54 raw_local_irq_save(flags); 55 local_irq_save(flags);
55 __raw_spin_lock(&ftrace_max_lock); 56 __raw_spin_lock(&ftrace_max_lock);
56 57
57 cnt = ring_buffer_entries(tr->buffer); 58 cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
62 break; 63 break;
63 } 64 }
64 __raw_spin_unlock(&ftrace_max_lock); 65 __raw_spin_unlock(&ftrace_max_lock);
65 raw_local_irq_restore(flags); 66 local_irq_restore(flags);
66 67
67 if (count) 68 if (count)
68 *count = cnt; 69 *count = cnt;
@@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
70 return ret; 71 return ret;
71} 72}
72 73
74static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
75{
76 printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
77 trace->name, init_ret);
78}
73#ifdef CONFIG_FUNCTION_TRACER 79#ifdef CONFIG_FUNCTION_TRACER
74 80
75#ifdef CONFIG_DYNAMIC_FTRACE 81#ifdef CONFIG_DYNAMIC_FTRACE
@@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
110 ftrace_set_filter(func_name, strlen(func_name), 1); 116 ftrace_set_filter(func_name, strlen(func_name), 1);
111 117
112 /* enable tracing */ 118 /* enable tracing */
113 tr->ctrl = 1; 119 ret = trace->init(tr);
114 trace->init(tr); 120 if (ret) {
121 warn_failed_init_tracer(trace, ret);
122 goto out;
123 }
115 124
116 /* Sleep for a 1/10 of a second */ 125 /* Sleep for a 1/10 of a second */
117 msleep(100); 126 msleep(100);
@@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
134 msleep(100); 143 msleep(100);
135 144
136 /* stop the tracing. */ 145 /* stop the tracing. */
137 tr->ctrl = 0; 146 tracing_stop();
138 trace->ctrl_update(tr);
139 ftrace_enabled = 0; 147 ftrace_enabled = 0;
140 148
141 /* check the trace buffer */ 149 /* check the trace buffer */
142 ret = trace_test_buffer(tr, &count); 150 ret = trace_test_buffer(tr, &count);
143 trace->reset(tr); 151 trace->reset(tr);
152 tracing_start();
144 153
145 /* we should only have one item */ 154 /* we should only have one item */
146 if (!ret && count != 1) { 155 if (!ret && count != 1) {
@@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
148 ret = -1; 157 ret = -1;
149 goto out; 158 goto out;
150 } 159 }
160
151 out: 161 out:
152 ftrace_enabled = save_ftrace_enabled; 162 ftrace_enabled = save_ftrace_enabled;
153 tracer_enabled = save_tracer_enabled; 163 tracer_enabled = save_tracer_enabled;
@@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
180 ftrace_enabled = 1; 190 ftrace_enabled = 1;
181 tracer_enabled = 1; 191 tracer_enabled = 1;
182 192
183 tr->ctrl = 1; 193 ret = trace->init(tr);
184 trace->init(tr); 194 if (ret) {
195 warn_failed_init_tracer(trace, ret);
196 goto out;
197 }
198
185 /* Sleep for a 1/10 of a second */ 199 /* Sleep for a 1/10 of a second */
186 msleep(100); 200 msleep(100);
187 /* stop the tracing. */ 201 /* stop the tracing. */
188 tr->ctrl = 0; 202 tracing_stop();
189 trace->ctrl_update(tr);
190 ftrace_enabled = 0; 203 ftrace_enabled = 0;
191 204
192 /* check the trace buffer */ 205 /* check the trace buffer */
193 ret = trace_test_buffer(tr, &count); 206 ret = trace_test_buffer(tr, &count);
194 trace->reset(tr); 207 trace->reset(tr);
208 tracing_start();
195 209
196 if (!ret && !count) { 210 if (!ret && !count) {
197 printk(KERN_CONT ".. no entries found .."); 211 printk(KERN_CONT ".. no entries found ..");
@@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
223 int ret; 237 int ret;
224 238
225 /* start the tracing */ 239 /* start the tracing */
226 tr->ctrl = 1; 240 ret = trace->init(tr);
227 trace->init(tr); 241 if (ret) {
242 warn_failed_init_tracer(trace, ret);
243 return ret;
244 }
245
228 /* reset the max latency */ 246 /* reset the max latency */
229 tracing_max_latency = 0; 247 tracing_max_latency = 0;
230 /* disable interrupts for a bit */ 248 /* disable interrupts for a bit */
@@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
232 udelay(100); 250 udelay(100);
233 local_irq_enable(); 251 local_irq_enable();
234 /* stop the tracing. */ 252 /* stop the tracing. */
235 tr->ctrl = 0; 253 tracing_stop();
236 trace->ctrl_update(tr);
237 /* check both trace buffers */ 254 /* check both trace buffers */
238 ret = trace_test_buffer(tr, NULL); 255 ret = trace_test_buffer(tr, NULL);
239 if (!ret) 256 if (!ret)
240 ret = trace_test_buffer(&max_tr, &count); 257 ret = trace_test_buffer(&max_tr, &count);
241 trace->reset(tr); 258 trace->reset(tr);
259 tracing_start();
242 260
243 if (!ret && !count) { 261 if (!ret && !count) {
244 printk(KERN_CONT ".. no entries found .."); 262 printk(KERN_CONT ".. no entries found ..");
@@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
259 unsigned long count; 277 unsigned long count;
260 int ret; 278 int ret;
261 279
280 /*
281 * Now that the big kernel lock is no longer preemptable,
282 * and this is called with the BKL held, it will always
283 * fail. If preemption is already disabled, simply
284 * pass the test. When the BKL is removed, or becomes
285 * preemptible again, we will once again test this,
286 * so keep it in.
287 */
288 if (preempt_count()) {
289 printk(KERN_CONT "can not test ... force ");
290 return 0;
291 }
292
262 /* start the tracing */ 293 /* start the tracing */
263 tr->ctrl = 1; 294 ret = trace->init(tr);
264 trace->init(tr); 295 if (ret) {
296 warn_failed_init_tracer(trace, ret);
297 return ret;
298 }
299
265 /* reset the max latency */ 300 /* reset the max latency */
266 tracing_max_latency = 0; 301 tracing_max_latency = 0;
267 /* disable preemption for a bit */ 302 /* disable preemption for a bit */
@@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
269 udelay(100); 304 udelay(100);
270 preempt_enable(); 305 preempt_enable();
271 /* stop the tracing. */ 306 /* stop the tracing. */
272 tr->ctrl = 0; 307 tracing_stop();
273 trace->ctrl_update(tr);
274 /* check both trace buffers */ 308 /* check both trace buffers */
275 ret = trace_test_buffer(tr, NULL); 309 ret = trace_test_buffer(tr, NULL);
276 if (!ret) 310 if (!ret)
277 ret = trace_test_buffer(&max_tr, &count); 311 ret = trace_test_buffer(&max_tr, &count);
278 trace->reset(tr); 312 trace->reset(tr);
313 tracing_start();
279 314
280 if (!ret && !count) { 315 if (!ret && !count) {
281 printk(KERN_CONT ".. no entries found .."); 316 printk(KERN_CONT ".. no entries found ..");
@@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
296 unsigned long count; 331 unsigned long count;
297 int ret; 332 int ret;
298 333
334 /*
335 * Now that the big kernel lock is no longer preemptable,
336 * and this is called with the BKL held, it will always
337 * fail. If preemption is already disabled, simply
338 * pass the test. When the BKL is removed, or becomes
339 * preemptible again, we will once again test this,
340 * so keep it in.
341 */
342 if (preempt_count()) {
343 printk(KERN_CONT "can not test ... force ");
344 return 0;
345 }
346
299 /* start the tracing */ 347 /* start the tracing */
300 tr->ctrl = 1; 348 ret = trace->init(tr);
301 trace->init(tr); 349 if (ret) {
350 warn_failed_init_tracer(trace, ret);
351 goto out;
352 }
302 353
303 /* reset the max latency */ 354 /* reset the max latency */
304 tracing_max_latency = 0; 355 tracing_max_latency = 0;
@@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
312 local_irq_enable(); 363 local_irq_enable();
313 364
314 /* stop the tracing. */ 365 /* stop the tracing. */
315 tr->ctrl = 0; 366 tracing_stop();
316 trace->ctrl_update(tr);
317 /* check both trace buffers */ 367 /* check both trace buffers */
318 ret = trace_test_buffer(tr, NULL); 368 ret = trace_test_buffer(tr, NULL);
319 if (ret) 369 if (ret) {
370 tracing_start();
320 goto out; 371 goto out;
372 }
321 373
322 ret = trace_test_buffer(&max_tr, &count); 374 ret = trace_test_buffer(&max_tr, &count);
323 if (ret) 375 if (ret) {
376 tracing_start();
324 goto out; 377 goto out;
378 }
325 379
326 if (!ret && !count) { 380 if (!ret && !count) {
327 printk(KERN_CONT ".. no entries found .."); 381 printk(KERN_CONT ".. no entries found ..");
328 ret = -1; 382 ret = -1;
383 tracing_start();
329 goto out; 384 goto out;
330 } 385 }
331 386
332 /* do the test by disabling interrupts first this time */ 387 /* do the test by disabling interrupts first this time */
333 tracing_max_latency = 0; 388 tracing_max_latency = 0;
334 tr->ctrl = 1; 389 tracing_start();
335 trace->ctrl_update(tr);
336 preempt_disable(); 390 preempt_disable();
337 local_irq_disable(); 391 local_irq_disable();
338 udelay(100); 392 udelay(100);
@@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
341 local_irq_enable(); 395 local_irq_enable();
342 396
343 /* stop the tracing. */ 397 /* stop the tracing. */
344 tr->ctrl = 0; 398 tracing_stop();
345 trace->ctrl_update(tr);
346 /* check both trace buffers */ 399 /* check both trace buffers */
347 ret = trace_test_buffer(tr, NULL); 400 ret = trace_test_buffer(tr, NULL);
348 if (ret) 401 if (ret)
@@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
358 411
359 out: 412 out:
360 trace->reset(tr); 413 trace->reset(tr);
414 tracing_start();
361 tracing_max_latency = save_max; 415 tracing_max_latency = save_max;
362 416
363 return ret; 417 return ret;
@@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
423 wait_for_completion(&isrt); 477 wait_for_completion(&isrt);
424 478
425 /* start the tracing */ 479 /* start the tracing */
426 tr->ctrl = 1; 480 ret = trace->init(tr);
427 trace->init(tr); 481 if (ret) {
482 warn_failed_init_tracer(trace, ret);
483 return ret;
484 }
485
428 /* reset the max latency */ 486 /* reset the max latency */
429 tracing_max_latency = 0; 487 tracing_max_latency = 0;
430 488
@@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448 msleep(100); 506 msleep(100);
449 507
450 /* stop the tracing. */ 508 /* stop the tracing. */
451 tr->ctrl = 0; 509 tracing_stop();
452 trace->ctrl_update(tr);
453 /* check both trace buffers */ 510 /* check both trace buffers */
454 ret = trace_test_buffer(tr, NULL); 511 ret = trace_test_buffer(tr, NULL);
455 if (!ret) 512 if (!ret)
@@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
457 514
458 515
459 trace->reset(tr); 516 trace->reset(tr);
517 tracing_start();
460 518
461 tracing_max_latency = save_max; 519 tracing_max_latency = save_max;
462 520
@@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
480 int ret; 538 int ret;
481 539
482 /* start the tracing */ 540 /* start the tracing */
483 tr->ctrl = 1; 541 ret = trace->init(tr);
484 trace->init(tr); 542 if (ret) {
543 warn_failed_init_tracer(trace, ret);
544 return ret;
545 }
546
485 /* Sleep for a 1/10 of a second */ 547 /* Sleep for a 1/10 of a second */
486 msleep(100); 548 msleep(100);
487 /* stop the tracing. */ 549 /* stop the tracing. */
488 tr->ctrl = 0; 550 tracing_stop();
489 trace->ctrl_update(tr);
490 /* check the trace buffer */ 551 /* check the trace buffer */
491 ret = trace_test_buffer(tr, &count); 552 ret = trace_test_buffer(tr, &count);
492 trace->reset(tr); 553 trace->reset(tr);
554 tracing_start();
493 555
494 if (!ret && !count) { 556 if (!ret && !count) {
495 printk(KERN_CONT ".. no entries found .."); 557 printk(KERN_CONT ".. no entries found ..");
@@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
508 int ret; 570 int ret;
509 571
510 /* start the tracing */ 572 /* start the tracing */
511 tr->ctrl = 1; 573 ret = trace->init(tr);
512 trace->init(tr); 574 if (ret) {
575 warn_failed_init_tracer(trace, ret);
576 return 0;
577 }
578
513 /* Sleep for a 1/10 of a second */ 579 /* Sleep for a 1/10 of a second */
514 msleep(100); 580 msleep(100);
515 /* stop the tracing. */ 581 /* stop the tracing. */
516 tr->ctrl = 0; 582 tracing_stop();
517 trace->ctrl_update(tr);
518 /* check the trace buffer */ 583 /* check the trace buffer */
519 ret = trace_test_buffer(tr, &count); 584 ret = trace_test_buffer(tr, &count);
520 trace->reset(tr); 585 trace->reset(tr);
586 tracing_start();
521 587
522 return ret; 588 return ret;
523} 589}
524#endif /* CONFIG_SYSPROF_TRACER */ 590#endif /* CONFIG_SYSPROF_TRACER */
591
592#ifdef CONFIG_BRANCH_TRACER
593int
594trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
595{
596 unsigned long count;
597 int ret;
598
599 /* start the tracing */
600 ret = trace->init(tr);
601 if (ret) {
602 warn_failed_init_tracer(trace, ret);
603 return ret;
604 }
605
606 /* Sleep for a 1/10 of a second */
607 msleep(100);
608 /* stop the tracing. */
609 tracing_stop();
610 /* check the trace buffer */
611 ret = trace_test_buffer(tr, &count);
612 trace->reset(tr);
613 tracing_start();
614
615 return ret;
616}
617#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3bdb44bde4b7..d0871bc0aca5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/sysctl.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/fs.h> 15#include <linux/fs.h>
15#include "trace.h" 16#include "trace.h"
@@ -31,6 +32,10 @@ static raw_spinlock_t max_stack_lock =
31 32
32static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
33static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
35static DEFINE_MUTEX(stack_sysctl_mutex);
36
37int stack_tracer_enabled;
38static int last_stack_tracer_enabled;
34 39
35static inline void check_stack(void) 40static inline void check_stack(void)
36{ 41{
@@ -48,7 +53,7 @@ static inline void check_stack(void)
48 if (!object_is_on_stack(&this_size)) 53 if (!object_is_on_stack(&this_size))
49 return; 54 return;
50 55
51 raw_local_irq_save(flags); 56 local_irq_save(flags);
52 __raw_spin_lock(&max_stack_lock); 57 __raw_spin_lock(&max_stack_lock);
53 58
54 /* a race could have already updated it */ 59 /* a race could have already updated it */
@@ -78,6 +83,7 @@ static inline void check_stack(void)
78 * on a new max, so it is far from a fast path. 83 * on a new max, so it is far from a fast path.
79 */ 84 */
80 while (i < max_stack_trace.nr_entries) { 85 while (i < max_stack_trace.nr_entries) {
86 int found = 0;
81 87
82 stack_dump_index[i] = this_size; 88 stack_dump_index[i] = this_size;
83 p = start; 89 p = start;
@@ -86,17 +92,19 @@ static inline void check_stack(void)
86 if (*p == stack_dump_trace[i]) { 92 if (*p == stack_dump_trace[i]) {
87 this_size = stack_dump_index[i++] = 93 this_size = stack_dump_index[i++] =
88 (top - p) * sizeof(unsigned long); 94 (top - p) * sizeof(unsigned long);
95 found = 1;
89 /* Start the search from here */ 96 /* Start the search from here */
90 start = p + 1; 97 start = p + 1;
91 } 98 }
92 } 99 }
93 100
94 i++; 101 if (!found)
102 i++;
95 } 103 }
96 104
97 out: 105 out:
98 __raw_spin_unlock(&max_stack_lock); 106 __raw_spin_unlock(&max_stack_lock);
99 raw_local_irq_restore(flags); 107 local_irq_restore(flags);
100} 108}
101 109
102static void 110static void
@@ -107,8 +115,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
107 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 115 if (unlikely(!ftrace_enabled || stack_trace_disabled))
108 return; 116 return;
109 117
110 resched = need_resched(); 118 resched = ftrace_preempt_disable();
111 preempt_disable_notrace();
112 119
113 cpu = raw_smp_processor_id(); 120 cpu = raw_smp_processor_id();
114 /* no atomic needed, we only modify this variable by this cpu */ 121 /* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 out: 127 out:
121 per_cpu(trace_active, cpu)--; 128 per_cpu(trace_active, cpu)--;
122 /* prevent recursion in schedule */ 129 /* prevent recursion in schedule */
123 if (resched) 130 ftrace_preempt_enable(resched);
124 preempt_enable_no_resched_notrace();
125 else
126 preempt_enable_notrace();
127} 131}
128 132
129static struct ftrace_ops trace_ops __read_mostly = 133static struct ftrace_ops trace_ops __read_mostly =
@@ -166,16 +170,16 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
166 if (ret < 0) 170 if (ret < 0)
167 return ret; 171 return ret;
168 172
169 raw_local_irq_save(flags); 173 local_irq_save(flags);
170 __raw_spin_lock(&max_stack_lock); 174 __raw_spin_lock(&max_stack_lock);
171 *ptr = val; 175 *ptr = val;
172 __raw_spin_unlock(&max_stack_lock); 176 __raw_spin_unlock(&max_stack_lock);
173 raw_local_irq_restore(flags); 177 local_irq_restore(flags);
174 178
175 return count; 179 return count;
176} 180}
177 181
178static struct file_operations stack_max_size_fops = { 182static const struct file_operations stack_max_size_fops = {
179 .open = tracing_open_generic, 183 .open = tracing_open_generic,
180 .read = stack_max_size_read, 184 .read = stack_max_size_read,
181 .write = stack_max_size_write, 185 .write = stack_max_size_write,
@@ -273,7 +277,7 @@ static int t_show(struct seq_file *m, void *v)
273 return 0; 277 return 0;
274} 278}
275 279
276static struct seq_operations stack_trace_seq_ops = { 280static const struct seq_operations stack_trace_seq_ops = {
277 .start = t_start, 281 .start = t_start,
278 .next = t_next, 282 .next = t_next,
279 .stop = t_stop, 283 .stop = t_stop,
@@ -289,12 +293,47 @@ static int stack_trace_open(struct inode *inode, struct file *file)
289 return ret; 293 return ret;
290} 294}
291 295
292static struct file_operations stack_trace_fops = { 296static const struct file_operations stack_trace_fops = {
293 .open = stack_trace_open, 297 .open = stack_trace_open,
294 .read = seq_read, 298 .read = seq_read,
295 .llseek = seq_lseek, 299 .llseek = seq_lseek,
296}; 300};
297 301
302int
303stack_trace_sysctl(struct ctl_table *table, int write,
304 struct file *file, void __user *buffer, size_t *lenp,
305 loff_t *ppos)
306{
307 int ret;
308
309 mutex_lock(&stack_sysctl_mutex);
310
311 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
312
313 if (ret || !write ||
314 (last_stack_tracer_enabled == stack_tracer_enabled))
315 goto out;
316
317 last_stack_tracer_enabled = stack_tracer_enabled;
318
319 if (stack_tracer_enabled)
320 register_ftrace_function(&trace_ops);
321 else
322 unregister_ftrace_function(&trace_ops);
323
324 out:
325 mutex_unlock(&stack_sysctl_mutex);
326 return ret;
327}
328
329static __init int enable_stacktrace(char *str)
330{
331 stack_tracer_enabled = 1;
332 last_stack_tracer_enabled = 1;
333 return 1;
334}
335__setup("stacktrace", enable_stacktrace);
336
298static __init int stack_trace_init(void) 337static __init int stack_trace_init(void)
299{ 338{
300 struct dentry *d_tracer; 339 struct dentry *d_tracer;
@@ -312,7 +351,8 @@ static __init int stack_trace_init(void)
312 if (!entry) 351 if (!entry)
313 pr_warning("Could not create debugfs 'stack_trace' entry\n"); 352 pr_warning("Could not create debugfs 'stack_trace' entry\n");
314 353
315 register_ftrace_function(&trace_ops); 354 if (stack_tracer_enabled)
355 register_ftrace_function(&trace_ops);
316 356
317 return 0; 357 return 0;
318} 358}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba55..eaca5ad803ff 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -196,27 +196,19 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
196 return HRTIMER_RESTART; 196 return HRTIMER_RESTART;
197} 197}
198 198
199static void start_stack_timer(int cpu) 199static void start_stack_timer(void *unused)
200{ 200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); 201 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 205
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 207}
209 208
210static void start_stack_timers(void) 209static void start_stack_timers(void)
211{ 210{
212 cpumask_t saved_mask = current->cpus_allowed; 211 on_each_cpu(start_stack_timer, NULL, 1);
213 int cpu;
214
215 for_each_online_cpu(cpu) {
216 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
217 start_stack_timer(cpu);
218 }
219 set_cpus_allowed_ptr(current, &saved_mask);
220} 212}
221 213
222static void stop_stack_timer(int cpu) 214static void stop_stack_timer(int cpu)
@@ -234,20 +226,10 @@ static void stop_stack_timers(void)
234 stop_stack_timer(cpu); 226 stop_stack_timer(cpu);
235} 227}
236 228
237static void stack_reset(struct trace_array *tr)
238{
239 int cpu;
240
241 tr->time_start = ftrace_now(tr->cpu);
242
243 for_each_online_cpu(cpu)
244 tracing_reset(tr, cpu);
245}
246
247static void start_stack_trace(struct trace_array *tr) 229static void start_stack_trace(struct trace_array *tr)
248{ 230{
249 mutex_lock(&sample_timer_lock); 231 mutex_lock(&sample_timer_lock);
250 stack_reset(tr); 232 tracing_reset_online_cpus(tr);
251 start_stack_timers(); 233 start_stack_timers();
252 tracer_enabled = 1; 234 tracer_enabled = 1;
253 mutex_unlock(&sample_timer_lock); 235 mutex_unlock(&sample_timer_lock);
@@ -261,27 +243,17 @@ static void stop_stack_trace(struct trace_array *tr)
261 mutex_unlock(&sample_timer_lock); 243 mutex_unlock(&sample_timer_lock);
262} 244}
263 245
264static void stack_trace_init(struct trace_array *tr) 246static int stack_trace_init(struct trace_array *tr)
265{ 247{
266 sysprof_trace = tr; 248 sysprof_trace = tr;
267 249
268 if (tr->ctrl) 250 start_stack_trace(tr);
269 start_stack_trace(tr); 251 return 0;
270} 252}
271 253
272static void stack_trace_reset(struct trace_array *tr) 254static void stack_trace_reset(struct trace_array *tr)
273{ 255{
274 if (tr->ctrl) 256 stop_stack_trace(tr);
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285} 257}
286 258
287static struct tracer stack_trace __read_mostly = 259static struct tracer stack_trace __read_mostly =
@@ -289,7 +261,6 @@ static struct tracer stack_trace __read_mostly =
289 .name = "sysprof", 261 .name = "sysprof",
290 .init = stack_trace_init, 262 .init = stack_trace_init,
291 .reset = stack_trace_reset, 263 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST 264#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof, 265 .selftest = trace_selftest_startup_sysprof,
295#endif 266#endif
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index af8c85664882..79602740bbb5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
43 */ 43 */
44#define TRACEPOINT_HASH_BITS 6 44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) 45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 void **funcs; 56 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */ 57 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0]; 58 char name[0];
61}; 59};
62 60
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; 61struct tp_probes {
62 union {
63 struct rcu_head rcu;
64 struct list_head list;
65 } u;
66 void *probes[0];
67};
64 68
65static void free_old_closure(struct rcu_head *head) 69static inline void *allocate_probes(int count)
66{ 70{
67 struct tracepoint_entry *entry = container_of(head, 71 struct tp_probes *p = kmalloc(count * sizeof(void *)
68 struct tracepoint_entry, rcu); 72 + sizeof(struct tp_probes), GFP_KERNEL);
69 kfree(entry->oldptr); 73 return p == NULL ? NULL : p->probes;
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73} 74}
74 75
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) 76static void rcu_free_old_probes(struct rcu_head *head)
76{ 77{
77 if (!old) 78 kfree(container_of(head, struct tp_probes, u.rcu));
78 return; 79}
79 entry->oldptr = old; 80
80 entry->rcu_pending = 1; 81static inline void release_probes(void *old)
81 /* write rcu_pending before calling the RCU callback */ 82{
82 smp_wmb(); 83 if (old) {
83 call_rcu_sched(&entry->rcu, free_old_closure); 84 struct tp_probes *tp_probes = container_of(old,
85 struct tp_probes, probes[0]);
86 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
87 }
84} 88}
85 89
86static void debug_print_probes(struct tracepoint_entry *entry) 90static void debug_print_probes(struct tracepoint_entry *entry)
87{ 91{
88 int i; 92 int i;
89 93
90 if (!tracepoint_debug) 94 if (!tracepoint_debug || !entry->funcs)
91 return; 95 return;
92 96
93 for (i = 0; entry->funcs[i]; i++) 97 for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 return ERR_PTR(-EEXIST); 115 return ERR_PTR(-EEXIST);
112 } 116 }
113 /* + 2 : one for new probe, one for NULL func */ 117 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); 118 new = allocate_probes(nr_probes + 2);
115 if (new == NULL) 119 if (new == NULL)
116 return ERR_PTR(-ENOMEM); 120 return ERR_PTR(-ENOMEM);
117 if (old) 121 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *)); 122 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe; 123 new[nr_probes] = probe;
124 new[nr_probes + 1] = NULL;
120 entry->refcount = nr_probes + 1; 125 entry->refcount = nr_probes + 1;
121 entry->funcs = new; 126 entry->funcs = new;
122 debug_print_probes(entry); 127 debug_print_probes(entry);
@@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
132 old = entry->funcs; 137 old = entry->funcs;
133 138
134 if (!old) 139 if (!old)
135 return NULL; 140 return ERR_PTR(-ENOENT);
136 141
137 debug_print_probes(entry); 142 debug_print_probes(entry);
138 /* (N -> M), (N > 1, M >= 0) probes */ 143 /* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
151 int j = 0; 156 int j = 0;
152 /* N -> M, (N > 1, M > 0) */ 157 /* N -> M, (N > 1, M > 0) */
153 /* + 1 for NULL */ 158 /* + 1 for NULL */
154 new = kzalloc((nr_probes - nr_del + 1) 159 new = allocate_probes(nr_probes - nr_del + 1);
155 * sizeof(void *), GFP_KERNEL);
156 if (new == NULL) 160 if (new == NULL)
157 return ERR_PTR(-ENOMEM); 161 return ERR_PTR(-ENOMEM);
158 for (i = 0; old[i]; i++) 162 for (i = 0; old[i]; i++)
159 if ((probe && old[i] != probe)) 163 if ((probe && old[i] != probe))
160 new[j++] = old[i]; 164 new[j++] = old[i];
165 new[nr_probes - nr_del] = NULL;
161 entry->refcount = nr_probes - nr_del; 166 entry->refcount = nr_probes - nr_del;
162 entry->funcs = new; 167 entry->funcs = new;
163 } 168 }
@@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
215 memcpy(&e->name[0], name, name_len); 220 memcpy(&e->name[0], name, name_len);
216 e->funcs = NULL; 221 e->funcs = NULL;
217 e->refcount = 0; 222 e->refcount = 0;
218 e->rcu_pending = 0;
219 hlist_add_head(&e->hlist, head); 223 hlist_add_head(&e->hlist, head);
220 return e; 224 return e;
221} 225}
@@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
224 * Remove the tracepoint from the tracepoint hash table. Must be called with 228 * Remove the tracepoint from the tracepoint hash table. Must be called with
225 * mutex_lock held. 229 * mutex_lock held.
226 */ 230 */
227static int remove_tracepoint(const char *name) 231static inline void remove_tracepoint(struct tracepoint_entry *e)
228{ 232{
229 struct hlist_head *head;
230 struct hlist_node *node;
231 struct tracepoint_entry *e;
232 int found = 0;
233 size_t len = strlen(name) + 1;
234 u32 hash = jhash(name, len-1, 0);
235
236 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
237 hlist_for_each_entry(e, node, head, hlist) {
238 if (!strcmp(name, e->name)) {
239 found = 1;
240 break;
241 }
242 }
243 if (!found)
244 return -ENOENT;
245 if (e->refcount)
246 return -EBUSY;
247 hlist_del(&e->hlist); 233 hlist_del(&e->hlist);
248 /* Make sure the call_rcu_sched has been executed */
249 if (e->rcu_pending)
250 rcu_barrier_sched();
251 kfree(e); 234 kfree(e);
252 return 0;
253} 235}
254 236
255/* 237/*
@@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
280static void disable_tracepoint(struct tracepoint *elem) 262static void disable_tracepoint(struct tracepoint *elem)
281{ 263{
282 elem->state = 0; 264 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL);
283} 266}
284 267
285/** 268/**
@@ -320,6 +303,23 @@ static void tracepoint_update_probes(void)
320 module_update_tracepoints(); 303 module_update_tracepoints();
321} 304}
322 305
306static void *tracepoint_add_probe(const char *name, void *probe)
307{
308 struct tracepoint_entry *entry;
309 void *old;
310
311 entry = get_tracepoint(name);
312 if (!entry) {
313 entry = add_tracepoint(name);
314 if (IS_ERR(entry))
315 return entry;
316 }
317 old = tracepoint_entry_add_probe(entry, probe);
318 if (IS_ERR(old) && !entry->refcount)
319 remove_tracepoint(entry);
320 return old;
321}
322
323/** 323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint 324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name 325 * @name: tracepoint name
@@ -330,44 +330,36 @@ static void tracepoint_update_probes(void)
330 */ 330 */
331int tracepoint_probe_register(const char *name, void *probe) 331int tracepoint_probe_register(const char *name, void *probe)
332{ 332{
333 struct tracepoint_entry *entry;
334 int ret = 0;
335 void *old; 333 void *old;
336 334
337 mutex_lock(&tracepoints_mutex); 335 mutex_lock(&tracepoints_mutex);
338 entry = get_tracepoint(name); 336 old = tracepoint_add_probe(name, probe);
339 if (!entry) {
340 entry = add_tracepoint(name);
341 if (IS_ERR(entry)) {
342 ret = PTR_ERR(entry);
343 goto end;
344 }
345 }
346 /*
347 * If we detect that a call_rcu_sched is pending for this tracepoint,
348 * make sure it's executed now.
349 */
350 if (entry->rcu_pending)
351 rcu_barrier_sched();
352 old = tracepoint_entry_add_probe(entry, probe);
353 if (IS_ERR(old)) {
354 ret = PTR_ERR(old);
355 goto end;
356 }
357 mutex_unlock(&tracepoints_mutex); 337 mutex_unlock(&tracepoints_mutex);
338 if (IS_ERR(old))
339 return PTR_ERR(old);
340
358 tracepoint_update_probes(); /* may update entry */ 341 tracepoint_update_probes(); /* may update entry */
359 mutex_lock(&tracepoints_mutex); 342 release_probes(old);
360 entry = get_tracepoint(name); 343 return 0;
361 WARN_ON(!entry);
362 if (entry->rcu_pending)
363 rcu_barrier_sched();
364 tracepoint_entry_free_old(entry, old);
365end:
366 mutex_unlock(&tracepoints_mutex);
367 return ret;
368} 344}
369EXPORT_SYMBOL_GPL(tracepoint_probe_register); 345EXPORT_SYMBOL_GPL(tracepoint_probe_register);
370 346
347static void *tracepoint_remove_probe(const char *name, void *probe)
348{
349 struct tracepoint_entry *entry;
350 void *old;
351
352 entry = get_tracepoint(name);
353 if (!entry)
354 return ERR_PTR(-ENOENT);
355 old = tracepoint_entry_remove_probe(entry, probe);
356 if (IS_ERR(old))
357 return old;
358 if (!entry->refcount)
359 remove_tracepoint(entry);
360 return old;
361}
362
371/** 363/**
372 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 364 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
373 * @name: tracepoint name 365 * @name: tracepoint name
@@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
380 */ 372 */
381int tracepoint_probe_unregister(const char *name, void *probe) 373int tracepoint_probe_unregister(const char *name, void *probe)
382{ 374{
383 struct tracepoint_entry *entry;
384 void *old; 375 void *old;
385 int ret = -ENOENT;
386 376
387 mutex_lock(&tracepoints_mutex); 377 mutex_lock(&tracepoints_mutex);
388 entry = get_tracepoint(name); 378 old = tracepoint_remove_probe(name, probe);
389 if (!entry)
390 goto end;
391 if (entry->rcu_pending)
392 rcu_barrier_sched();
393 old = tracepoint_entry_remove_probe(entry, probe);
394 if (!old) {
395 printk(KERN_WARNING "Warning: Trying to unregister a probe"
396 "that doesn't exist\n");
397 goto end;
398 }
399 mutex_unlock(&tracepoints_mutex); 379 mutex_unlock(&tracepoints_mutex);
380 if (IS_ERR(old))
381 return PTR_ERR(old);
382
400 tracepoint_update_probes(); /* may update entry */ 383 tracepoint_update_probes(); /* may update entry */
384 release_probes(old);
385 return 0;
386}
387EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
388
389static LIST_HEAD(old_probes);
390static int need_update;
391
392static void tracepoint_add_old_probes(void *old)
393{
394 need_update = 1;
395 if (old) {
396 struct tp_probes *tp_probes = container_of(old,
397 struct tp_probes, probes[0]);
398 list_add(&tp_probes->u.list, &old_probes);
399 }
400}
401
402/**
403 * tracepoint_probe_register_noupdate - register a probe but not connect
404 * @name: tracepoint name
405 * @probe: probe handler
406 *
407 * caller must call tracepoint_probe_update_all()
408 */
409int tracepoint_probe_register_noupdate(const char *name, void *probe)
410{
411 void *old;
412
401 mutex_lock(&tracepoints_mutex); 413 mutex_lock(&tracepoints_mutex);
402 entry = get_tracepoint(name); 414 old = tracepoint_add_probe(name, probe);
403 if (!entry) 415 if (IS_ERR(old)) {
404 goto end; 416 mutex_unlock(&tracepoints_mutex);
405 if (entry->rcu_pending) 417 return PTR_ERR(old);
406 rcu_barrier_sched(); 418 }
407 tracepoint_entry_free_old(entry, old); 419 tracepoint_add_old_probes(old);
408 remove_tracepoint(name); /* Ignore busy error message */
409 ret = 0;
410end:
411 mutex_unlock(&tracepoints_mutex); 420 mutex_unlock(&tracepoints_mutex);
412 return ret; 421 return 0;
413} 422}
414EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 423EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
424
425/**
426 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
427 * @name: tracepoint name
428 * @probe: probe function pointer
429 *
430 * caller must call tracepoint_probe_update_all()
431 */
432int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
433{
434 void *old;
435
436 mutex_lock(&tracepoints_mutex);
437 old = tracepoint_remove_probe(name, probe);
438 if (IS_ERR(old)) {
439 mutex_unlock(&tracepoints_mutex);
440 return PTR_ERR(old);
441 }
442 tracepoint_add_old_probes(old);
443 mutex_unlock(&tracepoints_mutex);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
447
448/**
449 * tracepoint_probe_update_all - update tracepoints
450 */
451void tracepoint_probe_update_all(void)
452{
453 LIST_HEAD(release_probes);
454 struct tp_probes *pos, *next;
455
456 mutex_lock(&tracepoints_mutex);
457 if (!need_update) {
458 mutex_unlock(&tracepoints_mutex);
459 return;
460 }
461 if (!list_empty(&old_probes))
462 list_replace_init(&old_probes, &release_probes);
463 need_update = 0;
464 mutex_unlock(&tracepoints_mutex);
465
466 tracepoint_update_probes();
467 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
468 list_del(&pos->u.list);
469 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
470 }
471}
472EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
415 473
416/** 474/**
417 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. 475 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
@@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
483 iter->tracepoint = NULL; 541 iter->tracepoint = NULL;
484} 542}
485EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 543EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
544
545#ifdef CONFIG_MODULES
546
547int tracepoint_module_notify(struct notifier_block *self,
548 unsigned long val, void *data)
549{
550 struct module *mod = data;
551
552 switch (val) {
553 case MODULE_STATE_COMING:
554 tracepoint_update_probe_range(mod->tracepoints,
555 mod->tracepoints + mod->num_tracepoints);
556 break;
557 case MODULE_STATE_GOING:
558 tracepoint_update_probe_range(mod->tracepoints,
559 mod->tracepoints + mod->num_tracepoints);
560 break;
561 }
562 return 0;
563}
564
565struct notifier_block tracepoint_module_nb = {
566 .notifier_call = tracepoint_module_notify,
567 .priority = 0,
568};
569
570static int init_tracepoints(void)
571{
572 return register_module_notifier(&tracepoint_module_nb);
573}
574__initcall(init_tracepoints);
575
576#endif /* CONFIG_MODULES */
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 8ebcd8532dfb..43f891b05a4b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -27,6 +27,7 @@
27 */ 27 */
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 const struct cred *tcred;
30 struct timespec uptime, ts; 31 struct timespec uptime, ts;
31 u64 ac_etime; 32 u64 ac_etime;
32 33
@@ -53,10 +54,11 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
53 stats->ac_flag |= AXSIG; 54 stats->ac_flag |= AXSIG;
54 stats->ac_nice = task_nice(tsk); 55 stats->ac_nice = task_nice(tsk);
55 stats->ac_sched = tsk->policy; 56 stats->ac_sched = tsk->policy;
56 stats->ac_uid = tsk->uid;
57 stats->ac_gid = tsk->gid;
58 stats->ac_pid = tsk->pid; 57 stats->ac_pid = tsk->pid;
59 rcu_read_lock(); 58 rcu_read_lock();
59 tcred = __task_cred(tsk);
60 stats->ac_uid = tcred->uid;
61 stats->ac_gid = tcred->gid;
60 stats->ac_ppid = pid_alive(tsk) ? 62 stats->ac_ppid = pid_alive(tsk) ?
61 rcu_dereference(tsk->real_parent)->tgid : 0; 63 rcu_dereference(tsk->real_parent)->tgid : 0;
62 rcu_read_unlock(); 64 rcu_read_unlock();
@@ -90,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
90 mm = get_task_mm(p); 92 mm = get_task_mm(p);
91 if (mm) { 93 if (mm) {
92 /* adjust to KB unit */ 94 /* adjust to KB unit */
93 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; 95 stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
94 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
95 mmput(mm); 97 mmput(mm);
96 } 98 }
97 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 3e41c1673e2f..2460c3199b5a 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -84,11 +84,12 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
84 84
85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid)
86{ 86{
87 const struct cred *cred = current_cred();
87 int retval; 88 int retval;
88 89
89 if (!(retval = put_user(high2lowuid(current->uid), ruid)) && 90 if (!(retval = put_user(high2lowuid(cred->uid), ruid)) &&
90 !(retval = put_user(high2lowuid(current->euid), euid))) 91 !(retval = put_user(high2lowuid(cred->euid), euid)))
91 retval = put_user(high2lowuid(current->suid), suid); 92 retval = put_user(high2lowuid(cred->suid), suid);
92 93
93 return retval; 94 return retval;
94} 95}
@@ -104,11 +105,12 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
104 105
105asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid)
106{ 107{
108 const struct cred *cred = current_cred();
107 int retval; 109 int retval;
108 110
109 if (!(retval = put_user(high2lowgid(current->gid), rgid)) && 111 if (!(retval = put_user(high2lowgid(cred->gid), rgid)) &&
110 !(retval = put_user(high2lowgid(current->egid), egid))) 112 !(retval = put_user(high2lowgid(cred->egid), egid)))
111 retval = put_user(high2lowgid(current->sgid), sgid); 113 retval = put_user(high2lowgid(cred->sgid), sgid);
112 114
113 return retval; 115 return retval;
114} 116}
@@ -161,25 +163,24 @@ static int groups16_from_user(struct group_info *group_info,
161 163
162asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) 164asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist)
163{ 165{
164 int i = 0; 166 const struct cred *cred = current_cred();
167 int i;
165 168
166 if (gidsetsize < 0) 169 if (gidsetsize < 0)
167 return -EINVAL; 170 return -EINVAL;
168 171
169 get_group_info(current->group_info); 172 i = cred->group_info->ngroups;
170 i = current->group_info->ngroups;
171 if (gidsetsize) { 173 if (gidsetsize) {
172 if (i > gidsetsize) { 174 if (i > gidsetsize) {
173 i = -EINVAL; 175 i = -EINVAL;
174 goto out; 176 goto out;
175 } 177 }
176 if (groups16_to_user(grouplist, current->group_info)) { 178 if (groups16_to_user(grouplist, cred->group_info)) {
177 i = -EFAULT; 179 i = -EFAULT;
178 goto out; 180 goto out;
179 } 181 }
180 } 182 }
181out: 183out:
182 put_group_info(current->group_info);
183 return i; 184 return i;
184} 185}
185 186
@@ -210,20 +211,20 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
210 211
211asmlinkage long sys_getuid16(void) 212asmlinkage long sys_getuid16(void)
212{ 213{
213 return high2lowuid(current->uid); 214 return high2lowuid(current_uid());
214} 215}
215 216
216asmlinkage long sys_geteuid16(void) 217asmlinkage long sys_geteuid16(void)
217{ 218{
218 return high2lowuid(current->euid); 219 return high2lowuid(current_euid());
219} 220}
220 221
221asmlinkage long sys_getgid16(void) 222asmlinkage long sys_getgid16(void)
222{ 223{
223 return high2lowgid(current->gid); 224 return high2lowgid(current_gid());
224} 225}
225 226
226asmlinkage long sys_getegid16(void) 227asmlinkage long sys_getegid16(void)
227{ 228{
228 return high2lowgid(current->egid); 229 return high2lowgid(current_egid());
229} 230}
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae43..477b6660f447 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,12 +16,13 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
19 20
20struct user_namespace init_user_ns = { 21struct user_namespace init_user_ns = {
21 .kref = { 22 .kref = {
22 .refcount = ATOMIC_INIT(2), 23 .refcount = ATOMIC_INIT(1),
23 }, 24 },
24 .root_user = &root_user, 25 .creator = &root_user,
25}; 26};
26EXPORT_SYMBOL_GPL(init_user_ns); 27EXPORT_SYMBOL_GPL(init_user_ns);
27 28
@@ -47,12 +48,14 @@ static struct kmem_cache *uid_cachep;
47 */ 48 */
48static DEFINE_SPINLOCK(uidhash_lock); 49static DEFINE_SPINLOCK(uidhash_lock);
49 50
51/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
50struct user_struct root_user = { 52struct user_struct root_user = {
51 .__count = ATOMIC_INIT(1), 53 .__count = ATOMIC_INIT(2),
52 .processes = ATOMIC_INIT(1), 54 .processes = ATOMIC_INIT(1),
53 .files = ATOMIC_INIT(0), 55 .files = ATOMIC_INIT(0),
54 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
55 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns,
56#ifdef CONFIG_USER_SCHED 59#ifdef CONFIG_USER_SCHED
57 .tg = &init_task_group, 60 .tg = &init_task_group,
58#endif 61#endif
@@ -101,19 +104,15 @@ static int sched_create_user(struct user_struct *up)
101 if (IS_ERR(up->tg)) 104 if (IS_ERR(up->tg))
102 rc = -ENOMEM; 105 rc = -ENOMEM;
103 106
104 return rc; 107 set_tg_uid(up);
105}
106 108
107static void sched_switch_user(struct task_struct *p) 109 return rc;
108{
109 sched_move_task(p);
110} 110}
111 111
112#else /* CONFIG_USER_SCHED */ 112#else /* CONFIG_USER_SCHED */
113 113
114static void sched_destroy_user(struct user_struct *up) { } 114static void sched_destroy_user(struct user_struct *up) { }
115static int sched_create_user(struct user_struct *up) { return 0; } 115static int sched_create_user(struct user_struct *up) { return 0; }
116static void sched_switch_user(struct task_struct *p) { }
117 116
118#endif /* CONFIG_USER_SCHED */ 117#endif /* CONFIG_USER_SCHED */
119 118
@@ -242,13 +241,21 @@ static struct kobj_type uids_ktype = {
242 .release = uids_release, 241 .release = uids_release,
243}; 242};
244 243
245/* create /sys/kernel/uids/<uid>/cpu_share file for this user */ 244/*
245 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
246 * We do not create this file for users in a user namespace (until
247 * sysfs tagging is implemented).
248 *
249 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
250 */
246static int uids_user_create(struct user_struct *up) 251static int uids_user_create(struct user_struct *up)
247{ 252{
248 struct kobject *kobj = &up->kobj; 253 struct kobject *kobj = &up->kobj;
249 int error; 254 int error;
250 255
251 memset(kobj, 0, sizeof(struct kobject)); 256 memset(kobj, 0, sizeof(struct kobject));
257 if (up->user_ns != &init_user_ns)
258 return 0;
252 kobj->kset = uids_kset; 259 kobj->kset = uids_kset;
253 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); 260 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
254 if (error) { 261 if (error) {
@@ -284,6 +291,8 @@ static void remove_user_sysfs_dir(struct work_struct *w)
284 unsigned long flags; 291 unsigned long flags;
285 int remove_user = 0; 292 int remove_user = 0;
286 293
294 if (up->user_ns != &init_user_ns)
295 return;
287 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del() 296 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
288 * atomic. 297 * atomic.
289 */ 298 */
@@ -319,12 +328,13 @@ done:
319 * IRQ state (as stored in flags) is restored and uidhash_lock released 328 * IRQ state (as stored in flags) is restored and uidhash_lock released
320 * upon function exit. 329 * upon function exit.
321 */ 330 */
322static inline void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
323{ 332{
324 /* restore back the count */ 333 /* restore back the count */
325 atomic_inc(&up->__count); 334 atomic_inc(&up->__count);
326 spin_unlock_irqrestore(&uidhash_lock, flags); 335 spin_unlock_irqrestore(&uidhash_lock, flags);
327 336
337 put_user_ns(up->user_ns);
328 INIT_WORK(&up->work, remove_user_sysfs_dir); 338 INIT_WORK(&up->work, remove_user_sysfs_dir);
329 schedule_work(&up->work); 339 schedule_work(&up->work);
330} 340}
@@ -340,13 +350,14 @@ static inline void uids_mutex_unlock(void) { }
340 * IRQ state (as stored in flags) is restored and uidhash_lock released 350 * IRQ state (as stored in flags) is restored and uidhash_lock released
341 * upon function exit. 351 * upon function exit.
342 */ 352 */
343static inline void free_user(struct user_struct *up, unsigned long flags) 353static void free_user(struct user_struct *up, unsigned long flags)
344{ 354{
345 uid_hash_remove(up); 355 uid_hash_remove(up);
346 spin_unlock_irqrestore(&uidhash_lock, flags); 356 spin_unlock_irqrestore(&uidhash_lock, flags);
347 sched_destroy_user(up); 357 sched_destroy_user(up);
348 key_put(up->uid_keyring); 358 key_put(up->uid_keyring);
349 key_put(up->session_keyring); 359 key_put(up->session_keyring);
360 put_user_ns(up->user_ns);
350 kmem_cache_free(uid_cachep, up); 361 kmem_cache_free(uid_cachep, up);
351} 362}
352 363
@@ -362,7 +373,7 @@ struct user_struct *find_user(uid_t uid)
362{ 373{
363 struct user_struct *ret; 374 struct user_struct *ret;
364 unsigned long flags; 375 unsigned long flags;
365 struct user_namespace *ns = current->nsproxy->user_ns; 376 struct user_namespace *ns = current_user_ns();
366 377
367 spin_lock_irqsave(&uidhash_lock, flags); 378 spin_lock_irqsave(&uidhash_lock, flags);
368 ret = uid_hash_find(uid, uidhashentry(ns, uid)); 379 ret = uid_hash_find(uid, uidhashentry(ns, uid));
@@ -409,6 +420,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
409 if (sched_create_user(new) < 0) 420 if (sched_create_user(new) < 0)
410 goto out_free_user; 421 goto out_free_user;
411 422
423 new->user_ns = get_user_ns(ns);
424
412 if (uids_user_create(new)) 425 if (uids_user_create(new))
413 goto out_destoy_sched; 426 goto out_destoy_sched;
414 427
@@ -432,7 +445,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
432 up = new; 445 up = new;
433 } 446 }
434 spin_unlock_irq(&uidhash_lock); 447 spin_unlock_irq(&uidhash_lock);
435
436 } 448 }
437 449
438 uids_mutex_unlock(); 450 uids_mutex_unlock();
@@ -441,6 +453,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
441 453
442out_destoy_sched: 454out_destoy_sched:
443 sched_destroy_user(new); 455 sched_destroy_user(new);
456 put_user_ns(new->user_ns);
444out_free_user: 457out_free_user:
445 kmem_cache_free(uid_cachep, new); 458 kmem_cache_free(uid_cachep, new);
446out_unlock: 459out_unlock:
@@ -448,63 +461,6 @@ out_unlock:
448 return NULL; 461 return NULL;
449} 462}
450 463
451void switch_uid(struct user_struct *new_user)
452{
453 struct user_struct *old_user;
454
455 /* What if a process setreuid()'s and this brings the
456 * new uid over his NPROC rlimit? We can check this now
457 * cheaply with the new uid cache, so if it matters
458 * we should be checking for it. -DaveM
459 */
460 old_user = current->user;
461 atomic_inc(&new_user->processes);
462 atomic_dec(&old_user->processes);
463 switch_uid_keyring(new_user);
464 current->user = new_user;
465 sched_switch_user(current);
466
467 /*
468 * We need to synchronize with __sigqueue_alloc()
469 * doing a get_uid(p->user).. If that saw the old
470 * user value, we need to wait until it has exited
471 * its critical region before we can free the old
472 * structure.
473 */
474 smp_mb();
475 spin_unlock_wait(&current->sighand->siglock);
476
477 free_uid(old_user);
478 suid_keys(current);
479}
480
481#ifdef CONFIG_USER_NS
482void release_uids(struct user_namespace *ns)
483{
484 int i;
485 unsigned long flags;
486 struct hlist_head *head;
487 struct hlist_node *nd;
488
489 spin_lock_irqsave(&uidhash_lock, flags);
490 /*
491 * collapse the chains so that the user_struct-s will
492 * be still alive, but not in hashes. subsequent free_uid()
493 * will free them.
494 */
495 for (i = 0; i < UIDHASH_SZ; i++) {
496 head = ns->uidhash_table + i;
497 while (!hlist_empty(head)) {
498 nd = head->first;
499 hlist_del_init(nd);
500 }
501 }
502 spin_unlock_irqrestore(&uidhash_lock, flags);
503
504 free_uid(ns->root_user);
505}
506#endif
507
508static int __init uid_cache_init(void) 464static int __init uid_cache_init(void)
509{ 465{
510 int n; 466 int n;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 532858fa5b88..79084311ee57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,60 +9,55 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/cred.h>
12 13
13/* 14/*
14 * Clone a new ns copying an original user ns, setting refcount to 1 15 * Create a new user namespace, deriving the creator from the user in the
15 * @old_ns: namespace to clone 16 * passed credentials, and replacing that user with the new root user for the
16 * Return NULL on error (failure to kmalloc), new ns otherwise 17 * new namespace.
18 *
19 * This is called by copy_creds(), which will finish setting the target task's
20 * credentials.
17 */ 21 */
18static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) 22int create_user_ns(struct cred *new)
19{ 23{
20 struct user_namespace *ns; 24 struct user_namespace *ns;
21 struct user_struct *new_user; 25 struct user_struct *root_user;
22 int n; 26 int n;
23 27
24 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); 28 ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
25 if (!ns) 29 if (!ns)
26 return ERR_PTR(-ENOMEM); 30 return -ENOMEM;
27 31
28 kref_init(&ns->kref); 32 kref_init(&ns->kref);
29 33
30 for (n = 0; n < UIDHASH_SZ; ++n) 34 for (n = 0; n < UIDHASH_SZ; ++n)
31 INIT_HLIST_HEAD(ns->uidhash_table + n); 35 INIT_HLIST_HEAD(ns->uidhash_table + n);
32 36
33 /* Insert new root user. */ 37 /* Alloc new root user. */
34 ns->root_user = alloc_uid(ns, 0); 38 root_user = alloc_uid(ns, 0);
35 if (!ns->root_user) { 39 if (!root_user) {
36 kfree(ns); 40 kfree(ns);
37 return ERR_PTR(-ENOMEM); 41 return -ENOMEM;
38 } 42 }
39 43
40 /* Reset current->user with a new one */ 44 /* set the new root user in the credentials under preparation */
41 new_user = alloc_uid(ns, current->uid); 45 ns->creator = new->user;
42 if (!new_user) { 46 new->user = root_user;
43 free_uid(ns->root_user); 47 new->uid = new->euid = new->suid = new->fsuid = 0;
44 kfree(ns); 48 new->gid = new->egid = new->sgid = new->fsgid = 0;
45 return ERR_PTR(-ENOMEM); 49 put_group_info(new->group_info);
46 } 50 new->group_info = get_group_info(&init_groups);
47 51#ifdef CONFIG_KEYS
48 switch_uid(new_user); 52 key_put(new->request_key_auth);
49 return ns; 53 new->request_key_auth = NULL;
50} 54#endif
51 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
52struct user_namespace * copy_user_ns(int flags, struct user_namespace *old_ns)
53{
54 struct user_namespace *new_ns;
55
56 BUG_ON(!old_ns);
57 get_user_ns(old_ns);
58
59 if (!(flags & CLONE_NEWUSER))
60 return old_ns;
61 56
62 new_ns = clone_user_ns(old_ns); 57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */
58 kref_set(&ns->kref, 1);
63 59
64 put_user_ns(old_ns); 60 return 0;
65 return new_ns;
66} 61}
67 62
68void free_user_ns(struct kref *kref) 63void free_user_ns(struct kref *kref)
@@ -70,7 +65,7 @@ void free_user_ns(struct kref *kref)
70 struct user_namespace *ns; 65 struct user_namespace *ns;
71 66
72 ns = container_of(kref, struct user_namespace, kref); 67 ns = container_of(kref, struct user_namespace, kref);
73 release_uids(ns); 68 free_uid(ns->creator);
74 kfree(ns); 69 kfree(ns);
75} 70}
76EXPORT_SYMBOL(free_user_ns); 71EXPORT_SYMBOL(free_user_ns);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d4dc69ddebd7..2f445833ae37 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 73static LIST_HEAD(workqueues);
74 74
75static int singlethread_cpu __read_mostly; 75static int singlethread_cpu __read_mostly;
76static cpumask_t cpu_singlethread_map __read_mostly; 76static const struct cpumask *cpu_singlethread_map __read_mostly;
77/* 77/*
78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
@@ -81,24 +81,24 @@ static cpumask_t cpu_singlethread_map __read_mostly;
81 * use cpu_possible_map, the cpumask below is more a documentation 81 * use cpu_possible_map, the cpumask below is more a documentation
82 * than optimization. 82 * than optimization.
83 */ 83 */
84static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_var_t cpu_populated_map __read_mostly;
85 85
86/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
87static inline int is_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
88{ 88{
89 return wq->singlethread; 89 return wq->singlethread;
90} 90}
91 91
92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
93{ 93{
94 return is_single_threaded(wq) 94 return is_wq_single_threaded(wq)
95 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? cpu_singlethread_map : cpu_populated_map;
96} 96}
97 97
98static 98static
99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) 99struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
100{ 100{
101 if (unlikely(is_single_threaded(wq))) 101 if (unlikely(is_wq_single_threaded(wq)))
102 cpu = singlethread_cpu; 102 cpu = singlethread_cpu;
103 return per_cpu_ptr(wq->cpu_wq, cpu); 103 return per_cpu_ptr(wq->cpu_wq, cpu);
104} 104}
@@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
410 */ 410 */
411void flush_workqueue(struct workqueue_struct *wq) 411void flush_workqueue(struct workqueue_struct *wq)
412{ 412{
413 const cpumask_t *cpu_map = wq_cpu_map(wq); 413 const struct cpumask *cpu_map = wq_cpu_map(wq);
414 int cpu; 414 int cpu;
415 415
416 might_sleep(); 416 might_sleep();
@@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work)
532{ 532{
533 struct cpu_workqueue_struct *cwq; 533 struct cpu_workqueue_struct *cwq;
534 struct workqueue_struct *wq; 534 struct workqueue_struct *wq;
535 const cpumask_t *cpu_map; 535 const struct cpumask *cpu_map;
536 int cpu; 536 int cpu;
537 537
538 might_sleep(); 538 might_sleep();
@@ -769,7 +769,7 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
769{ 769{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
771 struct workqueue_struct *wq = cwq->wq; 771 struct workqueue_struct *wq = cwq->wq;
772 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; 772 const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d";
773 struct task_struct *p; 773 struct task_struct *p;
774 774
775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); 775 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
@@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
903 */ 903 */
904void destroy_workqueue(struct workqueue_struct *wq) 904void destroy_workqueue(struct workqueue_struct *wq)
905{ 905{
906 const cpumask_t *cpu_map = wq_cpu_map(wq); 906 const struct cpumask *cpu_map = wq_cpu_map(wq);
907 int cpu; 907 int cpu;
908 908
909 cpu_maps_update_begin(); 909 cpu_maps_update_begin();
@@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
933 933
934 switch (action) { 934 switch (action) {
935 case CPU_UP_PREPARE: 935 case CPU_UP_PREPARE:
936 cpu_set(cpu, cpu_populated_map); 936 cpumask_set_cpu(cpu, cpu_populated_map);
937 } 937 }
938undo: 938undo:
939 list_for_each_entry(wq, &workqueues, list) { 939 list_for_each_entry(wq, &workqueues, list) {
@@ -964,7 +964,7 @@ undo:
964 switch (action) { 964 switch (action) {
965 case CPU_UP_CANCELED: 965 case CPU_UP_CANCELED:
966 case CPU_POST_DEAD: 966 case CPU_POST_DEAD:
967 cpu_clear(cpu, cpu_populated_map); 967 cpumask_clear_cpu(cpu, cpu_populated_map);
968 } 968 }
969 969
970 return ret; 970 return ret;
@@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
1017 1017
1018void __init init_workqueues(void) 1018void __init init_workqueues(void)
1019{ 1019{
1020 cpu_populated_map = cpu_online_map; 1020 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1021 singlethread_cpu = first_cpu(cpu_possible_map); 1021
1022 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); 1022 cpumask_copy(cpu_populated_map, cpu_online_mask);
1023 singlethread_cpu = cpumask_first(cpu_possible_mask);
1024 cpu_singlethread_map = cpumask_of(singlethread_cpu);
1023 hotcpu_notifier(workqueue_cpu_callback, 0); 1025 hotcpu_notifier(workqueue_cpu_callback, 0);
1024 keventd_wq = create_workqueue("events"); 1026 keventd_wq = create_workqueue("events");
1025 BUG_ON(!keventd_wq); 1027 BUG_ON(!keventd_wq);