aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-01-10 20:42:53 -0500
committerIngo Molnar <mingo@elte.hu>2009-01-10 20:42:53 -0500
commit506c10f26c481b7f8ef27c1c79290f68989b2e9e (patch)
tree03de82e812f00957aa6276dac2fe51c3358e88d7 /kernel
parente1df957670aef74ffd9a4ad93e6d2c90bf6b4845 (diff)
parentc59765042f53a79a7a65585042ff463b69cb248c (diff)
Merge commit 'v2.6.29-rc1' into perfcounters/core
Conflicts: include/linux/kernel_stat.h
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt25
-rw-r--r--kernel/Makefile9
-rw-r--r--kernel/async.c335
-rw-r--r--kernel/audit.h5
-rw-r--r--kernel/audit_tree.c3
-rw-r--r--kernel/auditfilter.c325
-rw-r--r--kernel/auditsc.c739
-rw-r--r--kernel/capability.c6
-rw-r--r--kernel/cgroup.c316
-rw-r--r--kernel/compat.c54
-rw-r--r--kernel/cpu.c157
-rw-r--r--kernel/cpuset.c285
-rw-r--r--kernel/cred.c5
-rw-r--r--kernel/dma-coherent.c42
-rw-r--r--kernel/exit.c25
-rw-r--r--kernel/extable.c16
-rw-r--r--kernel/fork.c29
-rw-r--r--kernel/futex.c413
-rw-r--r--kernel/hrtimer.c390
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/autoprobe.c5
-rw-r--r--kernel/irq/chip.c18
-rw-r--r--kernel/irq/handle.c205
-rw-r--r--kernel/irq/internals.h5
-rw-r--r--kernel/irq/manage.c58
-rw-r--r--kernel/irq/migration.c14
-rw-r--r--kernel/irq/numa_migrate.c119
-rw-r--r--kernel/irq/proc.c63
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c281
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/lockdep.c60
-rw-r--r--kernel/lockdep_proc.c28
-rw-r--r--kernel/module.c94
-rw-r--r--kernel/mutex.c10
-rw-r--r--kernel/notifier.c8
-rw-r--r--kernel/ns_cgroup.c2
-rw-r--r--kernel/panic.c34
-rw-r--r--kernel/pid.c8
-rw-r--r--kernel/posix-cpu-timers.c10
-rw-r--r--kernel/posix-timers.c40
-rw-r--r--kernel/power/disk.c6
-rw-r--r--kernel/power/main.c6
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/snapshot.c370
-rw-r--r--kernel/power/swsusp.c122
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/profile.c39
-rw-r--r--kernel/rcuclassic.c36
-rw-r--r--kernel/rcupdate.c11
-rw-r--r--kernel/rcupreempt.c40
-rw-r--r--kernel/rcupreempt_trace.c10
-rw-r--r--kernel/rcutorture.c91
-rw-r--r--kernel/rcutree.c1532
-rw-r--r--kernel/rcutree_trace.c271
-rw-r--r--kernel/res_counter.c44
-rw-r--r--kernel/resource.c70
-rw-r--r--kernel/sched.c1110
-rw-r--r--kernel/sched_clock.c5
-rw-r--r--kernel/sched_cpupri.c39
-rw-r--r--kernel/sched_cpupri.h5
-rw-r--r--kernel/sched_fair.c64
-rw-r--r--kernel/sched_rt.c74
-rw-r--r--kernel/sched_stats.h3
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/smp.c145
-rw-r--r--kernel/softirq.c41
-rw-r--r--kernel/softlockup.c12
-rw-r--r--kernel/stacktrace.c11
-rw-r--r--kernel/stop_machine.c63
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/sysctl.c55
-rw-r--r--kernel/sysctl_check.c1
-rw-r--r--kernel/taskstats.c41
-rw-r--r--kernel/test_kprobes.c210
-rw-r--r--kernel/time.c4
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c9
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/tick-broadcast.c113
-rw-r--r--kernel/time/tick-common.c18
-rw-r--r--kernel/time/tick-sched.c66
-rw-r--r--kernel/time/timekeeping.c7
-rw-r--r--kernel/timer.c15
-rw-r--r--kernel/trace/ring_buffer.c86
-rw-r--r--kernel/trace/trace.c73
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_hw_branches.c6
-rw-r--r--kernel/trace/trace_power.c2
-rw-r--r--kernel/trace/trace_sysprof.c14
-rw-r--r--kernel/tsacct.c4
-rw-r--r--kernel/workqueue.c26
96 files changed, 6247 insertions, 3004 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
69config RCU_TRACE
70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
72 select DEBUG_FS
73 default y
74 help
75 This option provides tracing in RCU which presents stats
76 in debugfs for debugging RCU implementation.
77
78 Say Y here if you want to enable RCU tracing
79 Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 4476da868f86..8b2628c7914b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o
13 14
14ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -73,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
73obj-$(CONFIG_SECCOMP) += seccomp.o 74obj-$(CONFIG_SECCOMP) += seccomp.o
74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 75obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o 76obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
77obj-$(CONFIG_TREE_RCU) += rcutree.o
76obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 78obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
77ifeq ($(CONFIG_PREEMPT_RCU),y) 79obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
78obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o 80obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
79endif
80obj-$(CONFIG_RELAY) += relay.o 81obj-$(CONFIG_RELAY) += relay.o
81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 82obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 83obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..f286e9f2b736
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,335 @@
1/*
2 * async.c: Asynchronous function calls for boot performance
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13
14/*
15
16Goals and Theory of Operation
17
18The primary goal of this feature is to reduce the kernel boot time,
19by doing various independent hardware delays and discovery operations
20decoupled and not strictly serialized.
21
22More specifically, the asynchronous function call concept allows
23certain operations (primarily during system boot) to happen
24asynchronously, out of order, while these operations still
25have their externally visible parts happen sequentially and in-order.
26(not unlike how out-of-order CPUs retire their instructions in order)
27
28Key to the asynchronous function call implementation is the concept of
29a "sequence cookie" (which, although it has an abstracted type, can be
30thought of as a monotonically incrementing number).
31
32The async core will assign each scheduled event such a sequence cookie and
33pass this to the called functions.
34
35The asynchronously called function should before doing a globally visible
36operation, such as registering device numbers, call the
37async_synchronize_cookie() function and pass in its own cookie. The
38async_synchronize_cookie() function will make sure that all asynchronous
39operations that were scheduled prior to the operation corresponding with the
40cookie have completed.
41
42Subsystem/driver initialization code that scheduled asynchronous probe
43functions, but which shares global resources with other drivers/subsystems
44that do not use the asynchronous call feature, need to do a full
45synchronization with the async_synchronize_full() function, before returning
46from their init function. This is to maintain strict ordering between the
47asynchronous and synchronous parts of the kernel.
48
49*/
50
51#include <linux/async.h>
52#include <linux/module.h>
53#include <linux/wait.h>
54#include <linux/sched.h>
55#include <linux/init.h>
56#include <linux/kthread.h>
57#include <asm/atomic.h>
58
59static async_cookie_t next_cookie = 1;
60
61#define MAX_THREADS 256
62#define MAX_WORK 32768
63
64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running);
66static DEFINE_SPINLOCK(async_lock);
67
68static int async_enabled = 0;
69
70struct async_entry {
71 struct list_head list;
72 async_cookie_t cookie;
73 async_func_ptr *func;
74 void *data;
75 struct list_head *running;
76};
77
78static DECLARE_WAIT_QUEUE_HEAD(async_done);
79static DECLARE_WAIT_QUEUE_HEAD(async_new);
80
81static atomic_t entry_count;
82static atomic_t thread_count;
83
84extern int initcall_debug;
85
86
87/*
88 * MUST be called with the lock held!
89 */
90static async_cookie_t __lowest_in_progress(struct list_head *running)
91{
92 struct async_entry *entry;
93 if (!list_empty(&async_pending)) {
94 entry = list_first_entry(&async_pending,
95 struct async_entry, list);
96 return entry->cookie;
97 } else if (!list_empty(running)) {
98 entry = list_first_entry(running,
99 struct async_entry, list);
100 return entry->cookie;
101 } else {
102 /* nothing in progress... next_cookie is "infinity" */
103 return next_cookie;
104 }
105
106}
107/*
108 * pick the first pending entry and run it
109 */
110static void run_one_entry(void)
111{
112 unsigned long flags;
113 struct async_entry *entry;
114 ktime_t calltime, delta, rettime;
115
116 /* 1) pick one task from the pending queue */
117
118 spin_lock_irqsave(&async_lock, flags);
119 if (list_empty(&async_pending))
120 goto out;
121 entry = list_first_entry(&async_pending, struct async_entry, list);
122
123 /* 2) move it to the running queue */
124 list_del(&entry->list);
125 list_add_tail(&entry->list, &async_running);
126 spin_unlock_irqrestore(&async_lock, flags);
127
128 /* 3) run it (and print duration)*/
129 if (initcall_debug && system_state == SYSTEM_BOOTING) {
130 printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
131 calltime = ktime_get();
132 }
133 entry->func(entry->data, entry->cookie);
134 if (initcall_debug && system_state == SYSTEM_BOOTING) {
135 rettime = ktime_get();
136 delta = ktime_sub(rettime, calltime);
137 printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
138 entry->func, ktime_to_ns(delta) >> 10);
139 }
140
141 /* 4) remove it from the running queue */
142 spin_lock_irqsave(&async_lock, flags);
143 list_del(&entry->list);
144
145 /* 5) free the entry */
146 kfree(entry);
147 atomic_dec(&entry_count);
148
149 spin_unlock_irqrestore(&async_lock, flags);
150
151 /* 6) wake up any waiters. */
152 wake_up(&async_done);
153 return;
154
155out:
156 spin_unlock_irqrestore(&async_lock, flags);
157}
158
159
160static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
161{
162 struct async_entry *entry;
163 unsigned long flags;
164 async_cookie_t newcookie;
165
166
167 /* allow irq-off callers */
168 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
169
170 /*
171 * If we're out of memory or if there's too much work
172 * pending already, we execute synchronously.
173 */
174 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
175 kfree(entry);
176 spin_lock_irqsave(&async_lock, flags);
177 newcookie = next_cookie++;
178 spin_unlock_irqrestore(&async_lock, flags);
179
180 /* low on memory.. run synchronously */
181 ptr(data, newcookie);
182 return newcookie;
183 }
184 entry->func = ptr;
185 entry->data = data;
186 entry->running = running;
187
188 spin_lock_irqsave(&async_lock, flags);
189 newcookie = entry->cookie = next_cookie++;
190 list_add_tail(&entry->list, &async_pending);
191 atomic_inc(&entry_count);
192 spin_unlock_irqrestore(&async_lock, flags);
193 wake_up(&async_new);
194 return newcookie;
195}
196
197async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
198{
199 return __async_schedule(ptr, data, &async_pending);
200}
201EXPORT_SYMBOL_GPL(async_schedule);
202
203async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
204{
205 return __async_schedule(ptr, data, running);
206}
207EXPORT_SYMBOL_GPL(async_schedule_special);
208
209void async_synchronize_full(void)
210{
211 do {
212 async_synchronize_cookie(next_cookie);
213 } while (!list_empty(&async_running) || !list_empty(&async_pending));
214}
215EXPORT_SYMBOL_GPL(async_synchronize_full);
216
217void async_synchronize_full_special(struct list_head *list)
218{
219 async_synchronize_cookie_special(next_cookie, list);
220}
221EXPORT_SYMBOL_GPL(async_synchronize_full_special);
222
223void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
224{
225 ktime_t starttime, delta, endtime;
226
227 if (initcall_debug && system_state == SYSTEM_BOOTING) {
228 printk("async_waiting @ %i\n", task_pid_nr(current));
229 starttime = ktime_get();
230 }
231
232 wait_event(async_done, __lowest_in_progress(running) >= cookie);
233
234 if (initcall_debug && system_state == SYSTEM_BOOTING) {
235 endtime = ktime_get();
236 delta = ktime_sub(endtime, starttime);
237
238 printk("async_continuing @ %i after %lli usec\n",
239 task_pid_nr(current), ktime_to_ns(delta) >> 10);
240 }
241}
242EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
243
244void async_synchronize_cookie(async_cookie_t cookie)
245{
246 async_synchronize_cookie_special(cookie, &async_running);
247}
248EXPORT_SYMBOL_GPL(async_synchronize_cookie);
249
250
251static int async_thread(void *unused)
252{
253 DECLARE_WAITQUEUE(wq, current);
254 add_wait_queue(&async_new, &wq);
255
256 while (!kthread_should_stop()) {
257 int ret = HZ;
258 set_current_state(TASK_INTERRUPTIBLE);
259 /*
260 * check the list head without lock.. false positives
261 * are dealt with inside run_one_entry() while holding
262 * the lock.
263 */
264 rmb();
265 if (!list_empty(&async_pending))
266 run_one_entry();
267 else
268 ret = schedule_timeout(HZ);
269
270 if (ret == 0) {
271 /*
272 * we timed out, this means we as thread are redundant.
273 * we sign off and die, but we to avoid any races there
274 * is a last-straw check to see if work snuck in.
275 */
276 atomic_dec(&thread_count);
277 wmb(); /* manager must see our departure first */
278 if (list_empty(&async_pending))
279 break;
280 /*
281 * woops work came in between us timing out and us
282 * signing off; we need to stay alive and keep working.
283 */
284 atomic_inc(&thread_count);
285 }
286 }
287 remove_wait_queue(&async_new, &wq);
288
289 return 0;
290}
291
292static int async_manager_thread(void *unused)
293{
294 DECLARE_WAITQUEUE(wq, current);
295 add_wait_queue(&async_new, &wq);
296
297 while (!kthread_should_stop()) {
298 int tc, ec;
299
300 set_current_state(TASK_INTERRUPTIBLE);
301
302 tc = atomic_read(&thread_count);
303 rmb();
304 ec = atomic_read(&entry_count);
305
306 while (tc < ec && tc < MAX_THREADS) {
307 kthread_run(async_thread, NULL, "async/%i", tc);
308 atomic_inc(&thread_count);
309 tc++;
310 }
311
312 schedule();
313 }
314 remove_wait_queue(&async_new, &wq);
315
316 return 0;
317}
318
319static int __init async_init(void)
320{
321 if (async_enabled)
322 kthread_run(async_manager_thread, NULL, "async/mgr");
323 return 0;
324}
325
326static int __init setup_async(char *str)
327{
328 async_enabled = 1;
329 return 1;
330}
331
332__setup("fastboot", setup_async);
333
334
335core_initcall(async_init);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9d6717412fec..16f18cac661b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
159 return __audit_signal_info(sig, t); 159 return __audit_signal_info(sig, t);
160 return 0; 160 return 0;
161} 161}
162extern enum audit_state audit_filter_inodes(struct task_struct *, 162extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
163 struct audit_context *);
164extern void audit_set_auditable(struct audit_context *);
165#else 163#else
166#define audit_signal_info(s,t) AUDIT_DISABLED 164#define audit_signal_info(s,t) AUDIT_DISABLED
167#define audit_filter_inodes(t,c) AUDIT_DISABLED 165#define audit_filter_inodes(t,c) AUDIT_DISABLED
168#define audit_set_auditable(c)
169#endif 166#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8b509441f49a..8ad9545b8db9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree)
450 audit_log_end(ab); 450 audit_log_end(ab);
451 rule->tree = NULL; 451 rule->tree = NULL;
452 list_del_rcu(&entry->list); 452 list_del_rcu(&entry->list);
453 list_del(&entry->rule.list);
453 call_rcu(&entry->rcu, audit_free_rule_rcu); 454 call_rcu(&entry->rcu, audit_free_rule_rcu);
454 } 455 }
455 } 456 }
@@ -617,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
617 618
618 if (pathname[0] != '/' || 619 if (pathname[0] != '/' ||
619 rule->listnr != AUDIT_FILTER_EXIT || 620 rule->listnr != AUDIT_FILTER_EXIT ||
620 op & ~AUDIT_EQUAL || 621 op != Audit_equal ||
621 rule->inode_f || rule->watch || rule->tree) 622 rule->inode_f || rule->watch || rule->tree)
622 return -EINVAL; 623 return -EINVAL;
623 rule->tree = alloc_tree(pathname); 624 rule->tree = alloc_tree(pathname);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9fd85a4640a0..fbf24d121d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
86#error Fix audit_filter_list initialiser 86#error Fix audit_filter_list initialiser
87#endif 87#endif
88}; 88};
89static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
90 LIST_HEAD_INIT(audit_rules_list[0]),
91 LIST_HEAD_INIT(audit_rules_list[1]),
92 LIST_HEAD_INIT(audit_rules_list[2]),
93 LIST_HEAD_INIT(audit_rules_list[3]),
94 LIST_HEAD_INIT(audit_rules_list[4]),
95 LIST_HEAD_INIT(audit_rules_list[5]),
96};
89 97
90DEFINE_MUTEX(audit_filter_mutex); 98DEFINE_MUTEX(audit_filter_mutex);
91 99
@@ -244,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule,
244 struct audit_field *f) 252 struct audit_field *f)
245{ 253{
246 if (krule->listnr != AUDIT_FILTER_EXIT || 254 if (krule->listnr != AUDIT_FILTER_EXIT ||
247 krule->watch || krule->inode_f || krule->tree) 255 krule->watch || krule->inode_f || krule->tree ||
256 (f->op != Audit_equal && f->op != Audit_not_equal))
248 return -EINVAL; 257 return -EINVAL;
249 258
250 krule->inode_f = f; 259 krule->inode_f = f;
@@ -262,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
262 271
263 if (path[0] != '/' || path[len-1] == '/' || 272 if (path[0] != '/' || path[len-1] == '/' ||
264 krule->listnr != AUDIT_FILTER_EXIT || 273 krule->listnr != AUDIT_FILTER_EXIT ||
265 op & ~AUDIT_EQUAL || 274 op != Audit_equal ||
266 krule->inode_f || krule->watch || krule->tree) 275 krule->inode_f || krule->watch || krule->tree)
267 return -EINVAL; 276 return -EINVAL;
268 277
@@ -412,12 +421,32 @@ exit_err:
412 return ERR_PTR(err); 421 return ERR_PTR(err);
413} 422}
414 423
424static u32 audit_ops[] =
425{
426 [Audit_equal] = AUDIT_EQUAL,
427 [Audit_not_equal] = AUDIT_NOT_EQUAL,
428 [Audit_bitmask] = AUDIT_BIT_MASK,
429 [Audit_bittest] = AUDIT_BIT_TEST,
430 [Audit_lt] = AUDIT_LESS_THAN,
431 [Audit_gt] = AUDIT_GREATER_THAN,
432 [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
433 [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
434};
435
436static u32 audit_to_op(u32 op)
437{
438 u32 n;
439 for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
440 ;
441 return n;
442}
443
444
415/* Translate struct audit_rule to kernel's rule respresentation. 445/* Translate struct audit_rule to kernel's rule respresentation.
416 * Exists for backward compatibility with userspace. */ 446 * Exists for backward compatibility with userspace. */
417static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 447static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
418{ 448{
419 struct audit_entry *entry; 449 struct audit_entry *entry;
420 struct audit_field *ino_f;
421 int err = 0; 450 int err = 0;
422 int i; 451 int i;
423 452
@@ -427,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
427 456
428 for (i = 0; i < rule->field_count; i++) { 457 for (i = 0; i < rule->field_count; i++) {
429 struct audit_field *f = &entry->rule.fields[i]; 458 struct audit_field *f = &entry->rule.fields[i];
459 u32 n;
460
461 n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
462
463 /* Support for legacy operators where
464 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
465 if (n & AUDIT_NEGATE)
466 f->op = Audit_not_equal;
467 else if (!n)
468 f->op = Audit_equal;
469 else
470 f->op = audit_to_op(n);
471
472 entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
430 473
431 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
432 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 474 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
433 f->val = rule->values[i]; 475 f->val = rule->values[i];
434 476
435 err = -EINVAL; 477 err = -EINVAL;
478 if (f->op == Audit_bad)
479 goto exit_free;
480
436 switch(f->type) { 481 switch(f->type) {
437 default: 482 default:
438 goto exit_free; 483 goto exit_free;
@@ -454,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
454 case AUDIT_EXIT: 499 case AUDIT_EXIT:
455 case AUDIT_SUCCESS: 500 case AUDIT_SUCCESS:
456 /* bit ops are only useful on syscall args */ 501 /* bit ops are only useful on syscall args */
457 if (f->op == AUDIT_BIT_MASK || 502 if (f->op == Audit_bitmask || f->op == Audit_bittest)
458 f->op == AUDIT_BIT_TEST) {
459 err = -EINVAL;
460 goto exit_free; 503 goto exit_free;
461 }
462 break; 504 break;
463 case AUDIT_ARG0: 505 case AUDIT_ARG0:
464 case AUDIT_ARG1: 506 case AUDIT_ARG1:
@@ -467,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
467 break; 509 break;
468 /* arch is only allowed to be = or != */ 510 /* arch is only allowed to be = or != */
469 case AUDIT_ARCH: 511 case AUDIT_ARCH:
470 if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) 512 if (f->op != Audit_not_equal && f->op != Audit_equal)
471 && (f->op != AUDIT_NEGATE) && (f->op)) {
472 err = -EINVAL;
473 goto exit_free; 513 goto exit_free;
474 }
475 entry->rule.arch_f = f; 514 entry->rule.arch_f = f;
476 break; 515 break;
477 case AUDIT_PERM: 516 case AUDIT_PERM:
@@ -488,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
488 goto exit_free; 527 goto exit_free;
489 break; 528 break;
490 } 529 }
491
492 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
493
494 /* Support for legacy operators where
495 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
496 if (f->op & AUDIT_NEGATE)
497 f->op = AUDIT_NOT_EQUAL;
498 else if (!f->op)
499 f->op = AUDIT_EQUAL;
500 else if (f->op == AUDIT_OPERATORS) {
501 err = -EINVAL;
502 goto exit_free;
503 }
504 } 530 }
505 531
506 ino_f = entry->rule.inode_f; 532 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
507 if (ino_f) { 533 entry->rule.inode_f = NULL;
508 switch(ino_f->op) {
509 case AUDIT_NOT_EQUAL:
510 entry->rule.inode_f = NULL;
511 case AUDIT_EQUAL:
512 break;
513 default:
514 err = -EINVAL;
515 goto exit_free;
516 }
517 }
518 534
519exit_nofree: 535exit_nofree:
520 return entry; 536 return entry;
@@ -530,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
530{ 546{
531 int err = 0; 547 int err = 0;
532 struct audit_entry *entry; 548 struct audit_entry *entry;
533 struct audit_field *ino_f;
534 void *bufp; 549 void *bufp;
535 size_t remain = datasz - sizeof(struct audit_rule_data); 550 size_t remain = datasz - sizeof(struct audit_rule_data);
536 int i; 551 int i;
@@ -546,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
546 struct audit_field *f = &entry->rule.fields[i]; 561 struct audit_field *f = &entry->rule.fields[i];
547 562
548 err = -EINVAL; 563 err = -EINVAL;
549 if (!(data->fieldflags[i] & AUDIT_OPERATORS) || 564
550 data->fieldflags[i] & ~AUDIT_OPERATORS) 565 f->op = audit_to_op(data->fieldflags[i]);
566 if (f->op == Audit_bad)
551 goto exit_free; 567 goto exit_free;
552 568
553 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
554 f->type = data->fields[i]; 569 f->type = data->fields[i];
555 f->val = data->values[i]; 570 f->val = data->values[i];
556 f->lsm_str = NULL; 571 f->lsm_str = NULL;
@@ -662,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
662 } 677 }
663 } 678 }
664 679
665 ino_f = entry->rule.inode_f; 680 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
666 if (ino_f) { 681 entry->rule.inode_f = NULL;
667 switch(ino_f->op) {
668 case AUDIT_NOT_EQUAL:
669 entry->rule.inode_f = NULL;
670 case AUDIT_EQUAL:
671 break;
672 default:
673 err = -EINVAL;
674 goto exit_free;
675 }
676 }
677 682
678exit_nofree: 683exit_nofree:
679 return entry; 684 return entry;
@@ -713,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
713 rule->fields[i] = krule->fields[i].type; 718 rule->fields[i] = krule->fields[i].type;
714 719
715 if (krule->vers_ops == 1) { 720 if (krule->vers_ops == 1) {
716 if (krule->fields[i].op & AUDIT_NOT_EQUAL) 721 if (krule->fields[i].op == Audit_not_equal)
717 rule->fields[i] |= AUDIT_NEGATE; 722 rule->fields[i] |= AUDIT_NEGATE;
718 } else { 723 } else {
719 rule->fields[i] |= krule->fields[i].op; 724 rule->fields[i] |= audit_ops[krule->fields[i].op];
720 } 725 }
721 } 726 }
722 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; 727 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
@@ -744,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
744 struct audit_field *f = &krule->fields[i]; 749 struct audit_field *f = &krule->fields[i];
745 750
746 data->fields[i] = f->type; 751 data->fields[i] = f->type;
747 data->fieldflags[i] = f->op; 752 data->fieldflags[i] = audit_ops[f->op];
748 switch(f->type) { 753 switch(f->type) {
749 case AUDIT_SUBJ_USER: 754 case AUDIT_SUBJ_USER:
750 case AUDIT_SUBJ_ROLE: 755 case AUDIT_SUBJ_ROLE:
@@ -919,6 +924,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
919 new->action = old->action; 924 new->action = old->action;
920 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 925 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
921 new->mask[i] = old->mask[i]; 926 new->mask[i] = old->mask[i];
927 new->prio = old->prio;
922 new->buflen = old->buflen; 928 new->buflen = old->buflen;
923 new->inode_f = old->inode_f; 929 new->inode_f = old->inode_f;
924 new->watch = NULL; 930 new->watch = NULL;
@@ -987,9 +993,8 @@ static void audit_update_watch(struct audit_parent *parent,
987 993
988 /* If the update involves invalidating rules, do the inode-based 994 /* If the update involves invalidating rules, do the inode-based
989 * filtering now, so we don't omit records. */ 995 * filtering now, so we don't omit records. */
990 if (invalidating && current->audit_context && 996 if (invalidating && current->audit_context)
991 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) 997 audit_filter_inodes(current, current->audit_context);
992 audit_set_auditable(current->audit_context);
993 998
994 nwatch = audit_dupe_watch(owatch); 999 nwatch = audit_dupe_watch(owatch);
995 if (IS_ERR(nwatch)) { 1000 if (IS_ERR(nwatch)) {
@@ -1007,12 +1012,15 @@ static void audit_update_watch(struct audit_parent *parent,
1007 list_del_rcu(&oentry->list); 1012 list_del_rcu(&oentry->list);
1008 1013
1009 nentry = audit_dupe_rule(&oentry->rule, nwatch); 1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1010 if (IS_ERR(nentry)) 1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1011 audit_panic("error updating watch, removing"); 1017 audit_panic("error updating watch, removing");
1012 else { 1018 } else {
1013 int h = audit_hash_ino((u32)ino); 1019 int h = audit_hash_ino((u32)ino);
1014 list_add(&nentry->rule.rlist, &nwatch->rules); 1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1015 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1016 } 1024 }
1017 1025
1018 call_rcu(&oentry->rcu, audit_free_rule_rcu); 1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
@@ -1077,6 +1085,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1077 audit_log_end(ab); 1085 audit_log_end(ab);
1078 } 1086 }
1079 list_del(&r->rlist); 1087 list_del(&r->rlist);
1088 list_del(&r->list);
1080 list_del_rcu(&e->list); 1089 list_del_rcu(&e->list);
1081 call_rcu(&e->rcu, audit_free_rule_rcu); 1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1082 } 1091 }
@@ -1102,12 +1111,16 @@ static void audit_inotify_unregister(struct list_head *in_list)
1102/* Find an existing audit rule. 1111/* Find an existing audit rule.
1103 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1104static struct audit_entry *audit_find_rule(struct audit_entry *entry, 1113static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1105 struct list_head *list) 1114 struct list_head **p)
1106{ 1115{
1107 struct audit_entry *e, *found = NULL; 1116 struct audit_entry *e, *found = NULL;
1117 struct list_head *list;
1108 int h; 1118 int h;
1109 1119
1110 if (entry->rule.watch) { 1120 if (entry->rule.inode_f) {
1121 h = audit_hash_ino(entry->rule.inode_f->val);
1122 *p = list = &audit_inode_hash[h];
1123 } else if (entry->rule.watch) {
1111 /* we don't know the inode number, so must walk entire hash */ 1124 /* we don't know the inode number, so must walk entire hash */
1112 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { 1125 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
1113 list = &audit_inode_hash[h]; 1126 list = &audit_inode_hash[h];
@@ -1118,6 +1131,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1118 } 1131 }
1119 } 1132 }
1120 goto out; 1133 goto out;
1134 } else {
1135 *p = list = &audit_filter_list[entry->rule.listnr];
1121 } 1136 }
1122 1137
1123 list_for_each_entry(e, list, list) 1138 list_for_each_entry(e, list, list)
@@ -1258,15 +1273,17 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1258 return ret; 1273 return ret;
1259} 1274}
1260 1275
1276static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1;
1278
1261/* Add rule to given filterlist if not a duplicate. */ 1279/* Add rule to given filterlist if not a duplicate. */
1262static inline int audit_add_rule(struct audit_entry *entry, 1280static inline int audit_add_rule(struct audit_entry *entry)
1263 struct list_head *list)
1264{ 1281{
1265 struct audit_entry *e; 1282 struct audit_entry *e;
1266 struct audit_field *inode_f = entry->rule.inode_f;
1267 struct audit_watch *watch = entry->rule.watch; 1283 struct audit_watch *watch = entry->rule.watch;
1268 struct audit_tree *tree = entry->rule.tree; 1284 struct audit_tree *tree = entry->rule.tree;
1269 struct nameidata *ndp = NULL, *ndw = NULL; 1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list;
1270 int h, err; 1287 int h, err;
1271#ifdef CONFIG_AUDITSYSCALL 1288#ifdef CONFIG_AUDITSYSCALL
1272 int dont_count = 0; 1289 int dont_count = 0;
@@ -1277,13 +1294,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
1277 dont_count = 1; 1294 dont_count = 1;
1278#endif 1295#endif
1279 1296
1280 if (inode_f) {
1281 h = audit_hash_ino(inode_f->val);
1282 list = &audit_inode_hash[h];
1283 }
1284
1285 mutex_lock(&audit_filter_mutex); 1297 mutex_lock(&audit_filter_mutex);
1286 e = audit_find_rule(entry, list); 1298 e = audit_find_rule(entry, &list);
1287 mutex_unlock(&audit_filter_mutex); 1299 mutex_unlock(&audit_filter_mutex);
1288 if (e) { 1300 if (e) {
1289 err = -EEXIST; 1301 err = -EEXIST;
@@ -1319,10 +1331,22 @@ static inline int audit_add_rule(struct audit_entry *entry,
1319 } 1331 }
1320 } 1332 }
1321 1333
1334 entry->rule.prio = ~0ULL;
1335 if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
1336 if (entry->rule.flags & AUDIT_FILTER_PREPEND)
1337 entry->rule.prio = ++prio_high;
1338 else
1339 entry->rule.prio = --prio_low;
1340 }
1341
1322 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1342 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
1343 list_add(&entry->rule.list,
1344 &audit_rules_list[entry->rule.listnr]);
1323 list_add_rcu(&entry->list, list); 1345 list_add_rcu(&entry->list, list);
1324 entry->rule.flags &= ~AUDIT_FILTER_PREPEND; 1346 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
1325 } else { 1347 } else {
1348 list_add_tail(&entry->rule.list,
1349 &audit_rules_list[entry->rule.listnr]);
1326 list_add_tail_rcu(&entry->list, list); 1350 list_add_tail_rcu(&entry->list, list);
1327 } 1351 }
1328#ifdef CONFIG_AUDITSYSCALL 1352#ifdef CONFIG_AUDITSYSCALL
@@ -1345,15 +1369,14 @@ error:
1345} 1369}
1346 1370
1347/* Remove an existing rule from filterlist. */ 1371/* Remove an existing rule from filterlist. */
1348static inline int audit_del_rule(struct audit_entry *entry, 1372static inline int audit_del_rule(struct audit_entry *entry)
1349 struct list_head *list)
1350{ 1373{
1351 struct audit_entry *e; 1374 struct audit_entry *e;
1352 struct audit_field *inode_f = entry->rule.inode_f;
1353 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1354 struct audit_tree *tree = entry->rule.tree; 1376 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list;
1355 LIST_HEAD(inotify_list); 1378 LIST_HEAD(inotify_list);
1356 int h, ret = 0; 1379 int ret = 0;
1357#ifdef CONFIG_AUDITSYSCALL 1380#ifdef CONFIG_AUDITSYSCALL
1358 int dont_count = 0; 1381 int dont_count = 0;
1359 1382
@@ -1363,13 +1386,8 @@ static inline int audit_del_rule(struct audit_entry *entry,
1363 dont_count = 1; 1386 dont_count = 1;
1364#endif 1387#endif
1365 1388
1366 if (inode_f) {
1367 h = audit_hash_ino(inode_f->val);
1368 list = &audit_inode_hash[h];
1369 }
1370
1371 mutex_lock(&audit_filter_mutex); 1389 mutex_lock(&audit_filter_mutex);
1372 e = audit_find_rule(entry, list); 1390 e = audit_find_rule(entry, &list);
1373 if (!e) { 1391 if (!e) {
1374 mutex_unlock(&audit_filter_mutex); 1392 mutex_unlock(&audit_filter_mutex);
1375 ret = -ENOENT; 1393 ret = -ENOENT;
@@ -1404,6 +1422,7 @@ static inline int audit_del_rule(struct audit_entry *entry,
1404 audit_remove_tree_rule(&e->rule); 1422 audit_remove_tree_rule(&e->rule);
1405 1423
1406 list_del_rcu(&e->list); 1424 list_del_rcu(&e->list);
1425 list_del(&e->rule.list);
1407 call_rcu(&e->rcu, audit_free_rule_rcu); 1426 call_rcu(&e->rcu, audit_free_rule_rcu);
1408 1427
1409#ifdef CONFIG_AUDITSYSCALL 1428#ifdef CONFIG_AUDITSYSCALL
@@ -1432,30 +1451,16 @@ out:
1432static void audit_list(int pid, int seq, struct sk_buff_head *q) 1451static void audit_list(int pid, int seq, struct sk_buff_head *q)
1433{ 1452{
1434 struct sk_buff *skb; 1453 struct sk_buff *skb;
1435 struct audit_entry *entry; 1454 struct audit_krule *r;
1436 int i; 1455 int i;
1437 1456
1438 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1457 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1439 * iterator to sync with list writers. */ 1458 * iterator to sync with list writers. */
1440 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1459 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1441 list_for_each_entry(entry, &audit_filter_list[i], list) { 1460 list_for_each_entry(r, &audit_rules_list[i], list) {
1442 struct audit_rule *rule;
1443
1444 rule = audit_krule_to_rule(&entry->rule);
1445 if (unlikely(!rule))
1446 break;
1447 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1448 rule, sizeof(*rule));
1449 if (skb)
1450 skb_queue_tail(q, skb);
1451 kfree(rule);
1452 }
1453 }
1454 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
1455 list_for_each_entry(entry, &audit_inode_hash[i], list) {
1456 struct audit_rule *rule; 1461 struct audit_rule *rule;
1457 1462
1458 rule = audit_krule_to_rule(&entry->rule); 1463 rule = audit_krule_to_rule(r);
1459 if (unlikely(!rule)) 1464 if (unlikely(!rule))
1460 break; 1465 break;
1461 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, 1466 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
@@ -1474,30 +1479,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
1474static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 1479static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1475{ 1480{
1476 struct sk_buff *skb; 1481 struct sk_buff *skb;
1477 struct audit_entry *e; 1482 struct audit_krule *r;
1478 int i; 1483 int i;
1479 1484
1480 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1485 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1481 * iterator to sync with list writers. */ 1486 * iterator to sync with list writers. */
1482 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1487 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1483 list_for_each_entry(e, &audit_filter_list[i], list) { 1488 list_for_each_entry(r, &audit_rules_list[i], list) {
1484 struct audit_rule_data *data;
1485
1486 data = audit_krule_to_data(&e->rule);
1487 if (unlikely(!data))
1488 break;
1489 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1490 data, sizeof(*data) + data->buflen);
1491 if (skb)
1492 skb_queue_tail(q, skb);
1493 kfree(data);
1494 }
1495 }
1496 for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
1497 list_for_each_entry(e, &audit_inode_hash[i], list) {
1498 struct audit_rule_data *data; 1489 struct audit_rule_data *data;
1499 1490
1500 data = audit_krule_to_data(&e->rule); 1491 data = audit_krule_to_data(r);
1501 if (unlikely(!data)) 1492 if (unlikely(!data))
1502 break; 1493 break;
1503 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1494 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
@@ -1603,8 +1594,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1603 if (IS_ERR(entry)) 1594 if (IS_ERR(entry))
1604 return PTR_ERR(entry); 1595 return PTR_ERR(entry);
1605 1596
1606 err = audit_add_rule(entry, 1597 err = audit_add_rule(entry);
1607 &audit_filter_list[entry->rule.listnr]);
1608 audit_log_rule_change(loginuid, sessionid, sid, "add", 1598 audit_log_rule_change(loginuid, sessionid, sid, "add",
1609 &entry->rule, !err); 1599 &entry->rule, !err);
1610 1600
@@ -1620,8 +1610,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1620 if (IS_ERR(entry)) 1610 if (IS_ERR(entry))
1621 return PTR_ERR(entry); 1611 return PTR_ERR(entry);
1622 1612
1623 err = audit_del_rule(entry, 1613 err = audit_del_rule(entry);
1624 &audit_filter_list[entry->rule.listnr]);
1625 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1614 audit_log_rule_change(loginuid, sessionid, sid, "remove",
1626 &entry->rule, !err); 1615 &entry->rule, !err);
1627 1616
@@ -1634,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1634 return err; 1623 return err;
1635} 1624}
1636 1625
1637int audit_comparator(const u32 left, const u32 op, const u32 right) 1626int audit_comparator(u32 left, u32 op, u32 right)
1638{ 1627{
1639 switch (op) { 1628 switch (op) {
1640 case AUDIT_EQUAL: 1629 case Audit_equal:
1641 return (left == right); 1630 return (left == right);
1642 case AUDIT_NOT_EQUAL: 1631 case Audit_not_equal:
1643 return (left != right); 1632 return (left != right);
1644 case AUDIT_LESS_THAN: 1633 case Audit_lt:
1645 return (left < right); 1634 return (left < right);
1646 case AUDIT_LESS_THAN_OR_EQUAL: 1635 case Audit_le:
1647 return (left <= right); 1636 return (left <= right);
1648 case AUDIT_GREATER_THAN: 1637 case Audit_gt:
1649 return (left > right); 1638 return (left > right);
1650 case AUDIT_GREATER_THAN_OR_EQUAL: 1639 case Audit_ge:
1651 return (left >= right); 1640 return (left >= right);
1652 case AUDIT_BIT_MASK: 1641 case Audit_bitmask:
1653 return (left & right); 1642 return (left & right);
1654 case AUDIT_BIT_TEST: 1643 case Audit_bittest:
1655 return ((left & right) == right); 1644 return ((left & right) == right);
1645 default:
1646 BUG();
1647 return 0;
1656 } 1648 }
1657 BUG();
1658 return 0;
1659} 1649}
1660 1650
1661/* Compare given dentry name with last component in given path, 1651/* Compare given dentry name with last component in given path,
@@ -1778,6 +1768,43 @@ unlock_and_return:
1778 return result; 1768 return result;
1779} 1769}
1780 1770
1771static int update_lsm_rule(struct audit_krule *r)
1772{
1773 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1774 struct audit_entry *nentry;
1775 struct audit_watch *watch;
1776 struct audit_tree *tree;
1777 int err = 0;
1778
1779 if (!security_audit_rule_known(r))
1780 return 0;
1781
1782 watch = r->watch;
1783 tree = r->tree;
1784 nentry = audit_dupe_rule(r, watch);
1785 if (IS_ERR(nentry)) {
1786 /* save the first error encountered for the
1787 * return value */
1788 err = PTR_ERR(nentry);
1789 audit_panic("error updating LSM filters");
1790 if (watch)
1791 list_del(&r->rlist);
1792 list_del_rcu(&entry->list);
1793 list_del(&r->list);
1794 } else {
1795 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules);
1797 list_del(&r->rlist);
1798 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist);
1800 list_replace_rcu(&entry->list, &nentry->list);
1801 list_replace(&r->list, &nentry->rule.list);
1802 }
1803 call_rcu(&entry->rcu, audit_free_rule_rcu);
1804
1805 return err;
1806}
1807
1781/* This function will re-initialize the lsm_rule field of all applicable rules. 1808/* This function will re-initialize the lsm_rule field of all applicable rules.
1782 * It will traverse the filter lists serarching for rules that contain LSM 1809 * It will traverse the filter lists serarching for rules that contain LSM
1783 * specific filter fields. When such a rule is found, it is copied, the 1810 * specific filter fields. When such a rule is found, it is copied, the
@@ -1785,45 +1812,19 @@ unlock_and_return:
1785 * updated rule. */ 1812 * updated rule. */
1786int audit_update_lsm_rules(void) 1813int audit_update_lsm_rules(void)
1787{ 1814{
1788 struct audit_entry *entry, *n, *nentry; 1815 struct audit_krule *r, *n;
1789 struct audit_watch *watch;
1790 struct audit_tree *tree;
1791 int i, err = 0; 1816 int i, err = 0;
1792 1817
1793 /* audit_filter_mutex synchronizes the writers */ 1818 /* audit_filter_mutex synchronizes the writers */
1794 mutex_lock(&audit_filter_mutex); 1819 mutex_lock(&audit_filter_mutex);
1795 1820
1796 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1821 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
1797 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1822 list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
1798 if (!security_audit_rule_known(&entry->rule)) 1823 int res = update_lsm_rule(r);
1799 continue; 1824 if (!err)
1800 1825 err = res;
1801 watch = entry->rule.watch;
1802 tree = entry->rule.tree;
1803 nentry = audit_dupe_rule(&entry->rule, watch);
1804 if (IS_ERR(nentry)) {
1805 /* save the first error encountered for the
1806 * return value */
1807 if (!err)
1808 err = PTR_ERR(nentry);
1809 audit_panic("error updating LSM filters");
1810 if (watch)
1811 list_del(&entry->rule.rlist);
1812 list_del_rcu(&entry->list);
1813 } else {
1814 if (watch) {
1815 list_add(&nentry->rule.rlist,
1816 &watch->rules);
1817 list_del(&entry->rule.rlist);
1818 } else if (tree)
1819 list_replace_init(&entry->rule.rlist,
1820 &nentry->rule.rlist);
1821 list_replace_rcu(&entry->list, &nentry->list);
1822 }
1823 call_rcu(&entry->rcu, audit_free_rule_rcu);
1824 } 1826 }
1825 } 1827 }
1826
1827 mutex_unlock(&audit_filter_mutex); 1828 mutex_unlock(&audit_filter_mutex);
1828 1829
1829 return err; 1830 return err;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4819f3711973..8cbddff6c283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -124,43 +124,6 @@ struct audit_aux_data {
124/* Number of target pids per aux struct. */ 124/* Number of target pids per aux struct. */
125#define AUDIT_AUX_PIDS 16 125#define AUDIT_AUX_PIDS 16
126 126
127struct audit_aux_data_mq_open {
128 struct audit_aux_data d;
129 int oflag;
130 mode_t mode;
131 struct mq_attr attr;
132};
133
134struct audit_aux_data_mq_sendrecv {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 size_t msg_len;
138 unsigned int msg_prio;
139 struct timespec abs_timeout;
140};
141
142struct audit_aux_data_mq_notify {
143 struct audit_aux_data d;
144 mqd_t mqdes;
145 struct sigevent notification;
146};
147
148struct audit_aux_data_mq_getsetattr {
149 struct audit_aux_data d;
150 mqd_t mqdes;
151 struct mq_attr mqstat;
152};
153
154struct audit_aux_data_ipcctl {
155 struct audit_aux_data d;
156 struct ipc_perm p;
157 unsigned long qbytes;
158 uid_t uid;
159 gid_t gid;
160 mode_t mode;
161 u32 osid;
162};
163
164struct audit_aux_data_execve { 127struct audit_aux_data_execve {
165 struct audit_aux_data d; 128 struct audit_aux_data d;
166 int argc; 129 int argc;
@@ -168,23 +131,6 @@ struct audit_aux_data_execve {
168 struct mm_struct *mm; 131 struct mm_struct *mm;
169}; 132};
170 133
171struct audit_aux_data_socketcall {
172 struct audit_aux_data d;
173 int nargs;
174 unsigned long args[0];
175};
176
177struct audit_aux_data_sockaddr {
178 struct audit_aux_data d;
179 int len;
180 char a[0];
181};
182
183struct audit_aux_data_fd_pair {
184 struct audit_aux_data d;
185 int fd[2];
186};
187
188struct audit_aux_data_pids { 134struct audit_aux_data_pids {
189 struct audit_aux_data d; 135 struct audit_aux_data d;
190 pid_t target_pid[AUDIT_AUX_PIDS]; 136 pid_t target_pid[AUDIT_AUX_PIDS];
@@ -219,14 +165,14 @@ struct audit_tree_refs {
219struct audit_context { 165struct audit_context {
220 int dummy; /* must be the first element */ 166 int dummy; /* must be the first element */
221 int in_syscall; /* 1 if task is in a syscall */ 167 int in_syscall; /* 1 if task is in a syscall */
222 enum audit_state state; 168 enum audit_state state, current_state;
223 unsigned int serial; /* serial number for record */ 169 unsigned int serial; /* serial number for record */
224 struct timespec ctime; /* time of syscall entry */ 170 struct timespec ctime; /* time of syscall entry */
225 int major; /* syscall number */ 171 int major; /* syscall number */
226 unsigned long argv[4]; /* syscall arguments */ 172 unsigned long argv[4]; /* syscall arguments */
227 int return_valid; /* return code is valid */ 173 int return_valid; /* return code is valid */
228 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
229 int auditable; /* 1 if record should be written */ 175 u64 prio;
230 int name_count; 176 int name_count;
231 struct audit_names names[AUDIT_NAMES]; 177 struct audit_names names[AUDIT_NAMES];
232 char * filterkey; /* key for rule that triggered record */ 178 char * filterkey; /* key for rule that triggered record */
@@ -234,7 +180,8 @@ struct audit_context {
234 struct audit_context *previous; /* For nested syscalls */ 180 struct audit_context *previous; /* For nested syscalls */
235 struct audit_aux_data *aux; 181 struct audit_aux_data *aux;
236 struct audit_aux_data *aux_pids; 182 struct audit_aux_data *aux_pids;
237 183 struct sockaddr_storage *sockaddr;
184 size_t sockaddr_len;
238 /* Save things to print about task_struct */ 185 /* Save things to print about task_struct */
239 pid_t pid, ppid; 186 pid_t pid, ppid;
240 uid_t uid, euid, suid, fsuid; 187 uid_t uid, euid, suid, fsuid;
@@ -252,6 +199,49 @@ struct audit_context {
252 struct audit_tree_refs *trees, *first_trees; 199 struct audit_tree_refs *trees, *first_trees;
253 int tree_count; 200 int tree_count;
254 201
202 int type;
203 union {
204 struct {
205 int nargs;
206 long args[6];
207 } socketcall;
208 struct {
209 uid_t uid;
210 gid_t gid;
211 mode_t mode;
212 u32 osid;
213 int has_perm;
214 uid_t perm_uid;
215 gid_t perm_gid;
216 mode_t perm_mode;
217 unsigned long qbytes;
218 } ipc;
219 struct {
220 mqd_t mqdes;
221 struct mq_attr mqstat;
222 } mq_getsetattr;
223 struct {
224 mqd_t mqdes;
225 int sigev_signo;
226 } mq_notify;
227 struct {
228 mqd_t mqdes;
229 size_t msg_len;
230 unsigned int msg_prio;
231 struct timespec abs_timeout;
232 } mq_sendrecv;
233 struct {
234 int oflag;
235 mode_t mode;
236 struct mq_attr attr;
237 } mq_open;
238 struct {
239 pid_t pid;
240 struct audit_cap_data cap;
241 } capset;
242 };
243 int fds[2];
244
255#if AUDIT_DEBUG 245#if AUDIT_DEBUG
256 int put_count; 246 int put_count;
257 int ino_count; 247 int ino_count;
@@ -608,19 +598,12 @@ static int audit_filter_rules(struct task_struct *tsk,
608 } 598 }
609 } 599 }
610 /* Find ipc objects that match */ 600 /* Find ipc objects that match */
611 if (ctx) { 601 if (!ctx || ctx->type != AUDIT_IPC)
612 struct audit_aux_data *aux; 602 break;
613 for (aux = ctx->aux; aux; 603 if (security_audit_rule_match(ctx->ipc.osid,
614 aux = aux->next) { 604 f->type, f->op,
615 if (aux->type == AUDIT_IPC) { 605 f->lsm_rule, ctx))
616 struct audit_aux_data_ipcctl *axi = (void *)aux; 606 ++result;
617 if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
618 ++result;
619 break;
620 }
621 }
622 }
623 }
624 } 607 }
625 break; 608 break;
626 case AUDIT_ARG0: 609 case AUDIT_ARG0:
@@ -647,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk,
647 return 0; 630 return 0;
648 } 631 }
649 } 632 }
650 if (rule->filterkey && ctx) 633
651 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 634 if (ctx) {
635 if (rule->prio <= ctx->prio)
636 return 0;
637 if (rule->filterkey) {
638 kfree(ctx->filterkey);
639 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
640 }
641 ctx->prio = rule->prio;
642 }
652 switch (rule->action) { 643 switch (rule->action) {
653 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 644 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
654 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 645 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
@@ -661,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk,
661 * completely disabled for this task. Since we only have the task 652 * completely disabled for this task. Since we only have the task
662 * structure at this point, we can only check uid and gid. 653 * structure at this point, we can only check uid and gid.
663 */ 654 */
664static enum audit_state audit_filter_task(struct task_struct *tsk) 655static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
665{ 656{
666 struct audit_entry *e; 657 struct audit_entry *e;
667 enum audit_state state; 658 enum audit_state state;
@@ -669,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
669 rcu_read_lock(); 660 rcu_read_lock();
670 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 661 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
671 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 662 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
663 if (state == AUDIT_RECORD_CONTEXT)
664 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
672 rcu_read_unlock(); 665 rcu_read_unlock();
673 return state; 666 return state;
674 } 667 }
@@ -702,6 +695,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
702 audit_filter_rules(tsk, &e->rule, ctx, NULL, 695 audit_filter_rules(tsk, &e->rule, ctx, NULL,
703 &state)) { 696 &state)) {
704 rcu_read_unlock(); 697 rcu_read_unlock();
698 ctx->current_state = state;
705 return state; 699 return state;
706 } 700 }
707 } 701 }
@@ -715,15 +709,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
715 * buckets applicable to the inode numbers in audit_names[]. 709 * buckets applicable to the inode numbers in audit_names[].
716 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 710 * Regarding audit_state, same rules apply as for audit_filter_syscall().
717 */ 711 */
718enum audit_state audit_filter_inodes(struct task_struct *tsk, 712void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
719 struct audit_context *ctx)
720{ 713{
721 int i; 714 int i;
722 struct audit_entry *e; 715 struct audit_entry *e;
723 enum audit_state state; 716 enum audit_state state;
724 717
725 if (audit_pid && tsk->tgid == audit_pid) 718 if (audit_pid && tsk->tgid == audit_pid)
726 return AUDIT_DISABLED; 719 return;
727 720
728 rcu_read_lock(); 721 rcu_read_lock();
729 for (i = 0; i < ctx->name_count; i++) { 722 for (i = 0; i < ctx->name_count; i++) {
@@ -740,17 +733,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk,
740 if ((e->rule.mask[word] & bit) == bit && 733 if ((e->rule.mask[word] & bit) == bit &&
741 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 734 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
742 rcu_read_unlock(); 735 rcu_read_unlock();
743 return state; 736 ctx->current_state = state;
737 return;
744 } 738 }
745 } 739 }
746 } 740 }
747 rcu_read_unlock(); 741 rcu_read_unlock();
748 return AUDIT_BUILD_CONTEXT;
749} 742}
750 743
751void audit_set_auditable(struct audit_context *ctx) 744static void audit_set_auditable(struct audit_context *ctx)
752{ 745{
753 ctx->auditable = 1; 746 if (!ctx->prio) {
747 ctx->prio = 1;
748 ctx->current_state = AUDIT_RECORD_CONTEXT;
749 }
754} 750}
755 751
756static inline struct audit_context *audit_get_context(struct task_struct *tsk, 752static inline struct audit_context *audit_get_context(struct task_struct *tsk,
@@ -781,23 +777,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
781 else 777 else
782 context->return_code = return_code; 778 context->return_code = return_code;
783 779
784 if (context->in_syscall && !context->dummy && !context->auditable) { 780 if (context->in_syscall && !context->dummy) {
785 enum audit_state state; 781 audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
786 782 audit_filter_inodes(tsk, context);
787 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
788 if (state == AUDIT_RECORD_CONTEXT) {
789 context->auditable = 1;
790 goto get_context;
791 }
792
793 state = audit_filter_inodes(tsk, context);
794 if (state == AUDIT_RECORD_CONTEXT)
795 context->auditable = 1;
796
797 } 783 }
798 784
799get_context:
800
801 tsk->audit_context = NULL; 785 tsk->audit_context = NULL;
802 return context; 786 return context;
803} 787}
@@ -807,8 +791,7 @@ static inline void audit_free_names(struct audit_context *context)
807 int i; 791 int i;
808 792
809#if AUDIT_DEBUG == 2 793#if AUDIT_DEBUG == 2
810 if (context->auditable 794 if (context->put_count + context->ino_count != context->name_count) {
811 ||context->put_count + context->ino_count != context->name_count) {
812 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" 795 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
813 " name_count=%d put_count=%d" 796 " name_count=%d put_count=%d"
814 " ino_count=%d [NOT freeing]\n", 797 " ino_count=%d [NOT freeing]\n",
@@ -859,6 +842,7 @@ static inline void audit_zero_context(struct audit_context *context,
859{ 842{
860 memset(context, 0, sizeof(*context)); 843 memset(context, 0, sizeof(*context));
861 context->state = state; 844 context->state = state;
845 context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
862} 846}
863 847
864static inline struct audit_context *audit_alloc_context(enum audit_state state) 848static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -884,18 +868,21 @@ int audit_alloc(struct task_struct *tsk)
884{ 868{
885 struct audit_context *context; 869 struct audit_context *context;
886 enum audit_state state; 870 enum audit_state state;
871 char *key = NULL;
887 872
888 if (likely(!audit_ever_enabled)) 873 if (likely(!audit_ever_enabled))
889 return 0; /* Return if not auditing. */ 874 return 0; /* Return if not auditing. */
890 875
891 state = audit_filter_task(tsk); 876 state = audit_filter_task(tsk, &key);
892 if (likely(state == AUDIT_DISABLED)) 877 if (likely(state == AUDIT_DISABLED))
893 return 0; 878 return 0;
894 879
895 if (!(context = audit_alloc_context(state))) { 880 if (!(context = audit_alloc_context(state))) {
881 kfree(key);
896 audit_log_lost("out of memory in audit_alloc"); 882 audit_log_lost("out of memory in audit_alloc");
897 return -ENOMEM; 883 return -ENOMEM;
898 } 884 }
885 context->filterkey = key;
899 886
900 tsk->audit_context = context; 887 tsk->audit_context = context;
901 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); 888 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
@@ -921,6 +908,7 @@ static inline void audit_free_context(struct audit_context *context)
921 free_tree_refs(context); 908 free_tree_refs(context);
922 audit_free_aux(context); 909 audit_free_aux(context);
923 kfree(context->filterkey); 910 kfree(context->filterkey);
911 kfree(context->sockaddr);
924 kfree(context); 912 kfree(context);
925 context = previous; 913 context = previous;
926 } while (context); 914 } while (context);
@@ -1230,6 +1218,97 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1230 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); 1218 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1231} 1219}
1232 1220
1221static void show_special(struct audit_context *context, int *call_panic)
1222{
1223 struct audit_buffer *ab;
1224 int i;
1225
1226 ab = audit_log_start(context, GFP_KERNEL, context->type);
1227 if (!ab)
1228 return;
1229
1230 switch (context->type) {
1231 case AUDIT_SOCKETCALL: {
1232 int nargs = context->socketcall.nargs;
1233 audit_log_format(ab, "nargs=%d", nargs);
1234 for (i = 0; i < nargs; i++)
1235 audit_log_format(ab, " a%d=%lx", i,
1236 context->socketcall.args[i]);
1237 break; }
1238 case AUDIT_IPC: {
1239 u32 osid = context->ipc.osid;
1240
1241 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
1242 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1243 if (osid) {
1244 char *ctx = NULL;
1245 u32 len;
1246 if (security_secid_to_secctx(osid, &ctx, &len)) {
1247 audit_log_format(ab, " osid=%u", osid);
1248 *call_panic = 1;
1249 } else {
1250 audit_log_format(ab, " obj=%s", ctx);
1251 security_release_secctx(ctx, len);
1252 }
1253 }
1254 if (context->ipc.has_perm) {
1255 audit_log_end(ab);
1256 ab = audit_log_start(context, GFP_KERNEL,
1257 AUDIT_IPC_SET_PERM);
1258 audit_log_format(ab,
1259 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1260 context->ipc.qbytes,
1261 context->ipc.perm_uid,
1262 context->ipc.perm_gid,
1263 context->ipc.perm_mode);
1264 if (!ab)
1265 return;
1266 }
1267 break; }
1268 case AUDIT_MQ_OPEN: {
1269 audit_log_format(ab,
1270 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1271 "mq_msgsize=%ld mq_curmsgs=%ld",
1272 context->mq_open.oflag, context->mq_open.mode,
1273 context->mq_open.attr.mq_flags,
1274 context->mq_open.attr.mq_maxmsg,
1275 context->mq_open.attr.mq_msgsize,
1276 context->mq_open.attr.mq_curmsgs);
1277 break; }
1278 case AUDIT_MQ_SENDRECV: {
1279 audit_log_format(ab,
1280 "mqdes=%d msg_len=%zd msg_prio=%u "
1281 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1282 context->mq_sendrecv.mqdes,
1283 context->mq_sendrecv.msg_len,
1284 context->mq_sendrecv.msg_prio,
1285 context->mq_sendrecv.abs_timeout.tv_sec,
1286 context->mq_sendrecv.abs_timeout.tv_nsec);
1287 break; }
1288 case AUDIT_MQ_NOTIFY: {
1289 audit_log_format(ab, "mqdes=%d sigev_signo=%d",
1290 context->mq_notify.mqdes,
1291 context->mq_notify.sigev_signo);
1292 break; }
1293 case AUDIT_MQ_GETSETATTR: {
1294 struct mq_attr *attr = &context->mq_getsetattr.mqstat;
1295 audit_log_format(ab,
1296 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1297 "mq_curmsgs=%ld ",
1298 context->mq_getsetattr.mqdes,
1299 attr->mq_flags, attr->mq_maxmsg,
1300 attr->mq_msgsize, attr->mq_curmsgs);
1301 break; }
1302 case AUDIT_CAPSET: {
1303 audit_log_format(ab, "pid=%d", context->capset.pid);
1304 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; }
1308 }
1309 audit_log_end(ab);
1310}
1311
1233static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1312static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1234{ 1313{
1235 const struct cred *cred; 1314 const struct cred *cred;
@@ -1307,94 +1386,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1307 continue; /* audit_panic has been called */ 1386 continue; /* audit_panic has been called */
1308 1387
1309 switch (aux->type) { 1388 switch (aux->type) {
1310 case AUDIT_MQ_OPEN: {
1311 struct audit_aux_data_mq_open *axi = (void *)aux;
1312 audit_log_format(ab,
1313 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1314 "mq_msgsize=%ld mq_curmsgs=%ld",
1315 axi->oflag, axi->mode, axi->attr.mq_flags,
1316 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
1317 axi->attr.mq_curmsgs);
1318 break; }
1319
1320 case AUDIT_MQ_SENDRECV: {
1321 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
1322 audit_log_format(ab,
1323 "mqdes=%d msg_len=%zd msg_prio=%u "
1324 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1325 axi->mqdes, axi->msg_len, axi->msg_prio,
1326 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
1327 break; }
1328
1329 case AUDIT_MQ_NOTIFY: {
1330 struct audit_aux_data_mq_notify *axi = (void *)aux;
1331 audit_log_format(ab,
1332 "mqdes=%d sigev_signo=%d",
1333 axi->mqdes,
1334 axi->notification.sigev_signo);
1335 break; }
1336
1337 case AUDIT_MQ_GETSETATTR: {
1338 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
1339 audit_log_format(ab,
1340 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1341 "mq_curmsgs=%ld ",
1342 axi->mqdes,
1343 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
1344 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
1345 break; }
1346
1347 case AUDIT_IPC: {
1348 struct audit_aux_data_ipcctl *axi = (void *)aux;
1349 audit_log_format(ab,
1350 "ouid=%u ogid=%u mode=%#o",
1351 axi->uid, axi->gid, axi->mode);
1352 if (axi->osid != 0) {
1353 char *ctx = NULL;
1354 u32 len;
1355 if (security_secid_to_secctx(
1356 axi->osid, &ctx, &len)) {
1357 audit_log_format(ab, " osid=%u",
1358 axi->osid);
1359 call_panic = 1;
1360 } else {
1361 audit_log_format(ab, " obj=%s", ctx);
1362 security_release_secctx(ctx, len);
1363 }
1364 }
1365 break; }
1366
1367 case AUDIT_IPC_SET_PERM: {
1368 struct audit_aux_data_ipcctl *axi = (void *)aux;
1369 audit_log_format(ab,
1370 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1371 axi->qbytes, axi->uid, axi->gid, axi->mode);
1372 break; }
1373 1389
1374 case AUDIT_EXECVE: { 1390 case AUDIT_EXECVE: {
1375 struct audit_aux_data_execve *axi = (void *)aux; 1391 struct audit_aux_data_execve *axi = (void *)aux;
1376 audit_log_execve_info(context, &ab, axi); 1392 audit_log_execve_info(context, &ab, axi);
1377 break; } 1393 break; }
1378 1394
1379 case AUDIT_SOCKETCALL: {
1380 struct audit_aux_data_socketcall *axs = (void *)aux;
1381 audit_log_format(ab, "nargs=%d", axs->nargs);
1382 for (i=0; i<axs->nargs; i++)
1383 audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
1384 break; }
1385
1386 case AUDIT_SOCKADDR: {
1387 struct audit_aux_data_sockaddr *axs = (void *)aux;
1388
1389 audit_log_format(ab, "saddr=");
1390 audit_log_n_hex(ab, axs->a, axs->len);
1391 break; }
1392
1393 case AUDIT_FD_PAIR: {
1394 struct audit_aux_data_fd_pair *axs = (void *)aux;
1395 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
1396 break; }
1397
1398 case AUDIT_BPRM_FCAPS: { 1395 case AUDIT_BPRM_FCAPS: {
1399 struct audit_aux_data_bprm_fcaps *axs = (void *)aux; 1396 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1400 audit_log_format(ab, "fver=%x", axs->fcap_ver); 1397 audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1409,18 +1406,32 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1409 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); 1406 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1410 break; } 1407 break; }
1411 1408
1412 case AUDIT_CAPSET: {
1413 struct audit_aux_data_capset *axs = (void *)aux;
1414 audit_log_format(ab, "pid=%d", axs->pid);
1415 audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
1416 audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
1417 audit_log_cap(ab, "cap_pe", &axs->cap.effective);
1418 break; }
1419
1420 } 1409 }
1421 audit_log_end(ab); 1410 audit_log_end(ab);
1422 } 1411 }
1423 1412
1413 if (context->type)
1414 show_special(context, &call_panic);
1415
1416 if (context->fds[0] >= 0) {
1417 ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
1418 if (ab) {
1419 audit_log_format(ab, "fd0=%d fd1=%d",
1420 context->fds[0], context->fds[1]);
1421 audit_log_end(ab);
1422 }
1423 }
1424
1425 if (context->sockaddr_len) {
1426 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
1427 if (ab) {
1428 audit_log_format(ab, "saddr=");
1429 audit_log_n_hex(ab, (void *)context->sockaddr,
1430 context->sockaddr_len);
1431 audit_log_end(ab);
1432 }
1433 }
1434
1424 for (aux = context->aux_pids; aux; aux = aux->next) { 1435 for (aux = context->aux_pids; aux; aux = aux->next) {
1425 struct audit_aux_data_pids *axs = (void *)aux; 1436 struct audit_aux_data_pids *axs = (void *)aux;
1426 1437
@@ -1536,7 +1547,7 @@ void audit_free(struct task_struct *tsk)
1536 * We use GFP_ATOMIC here because we might be doing this 1547 * We use GFP_ATOMIC here because we might be doing this
1537 * in the context of the idle thread */ 1548 * in the context of the idle thread */
1538 /* that can happen only if we are called from do_exit() */ 1549 /* that can happen only if we are called from do_exit() */
1539 if (context->in_syscall && context->auditable) 1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1540 audit_log_exit(context, tsk); 1551 audit_log_exit(context, tsk);
1541 1552
1542 audit_free_context(context); 1553 audit_free_context(context);
@@ -1620,15 +1631,17 @@ void audit_syscall_entry(int arch, int major,
1620 1631
1621 state = context->state; 1632 state = context->state;
1622 context->dummy = !audit_n_rules; 1633 context->dummy = !audit_n_rules;
1623 if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) 1634 if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
1635 context->prio = 0;
1624 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1636 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1637 }
1625 if (likely(state == AUDIT_DISABLED)) 1638 if (likely(state == AUDIT_DISABLED))
1626 return; 1639 return;
1627 1640
1628 context->serial = 0; 1641 context->serial = 0;
1629 context->ctime = CURRENT_TIME; 1642 context->ctime = CURRENT_TIME;
1630 context->in_syscall = 1; 1643 context->in_syscall = 1;
1631 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 1644 context->current_state = state;
1632 context->ppid = 0; 1645 context->ppid = 0;
1633} 1646}
1634 1647
@@ -1636,17 +1649,20 @@ void audit_finish_fork(struct task_struct *child)
1636{ 1649{
1637 struct audit_context *ctx = current->audit_context; 1650 struct audit_context *ctx = current->audit_context;
1638 struct audit_context *p = child->audit_context; 1651 struct audit_context *p = child->audit_context;
1639 if (!p || !ctx || !ctx->auditable) 1652 if (!p || !ctx)
1653 return;
1654 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1640 return; 1655 return;
1641 p->arch = ctx->arch; 1656 p->arch = ctx->arch;
1642 p->major = ctx->major; 1657 p->major = ctx->major;
1643 memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); 1658 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1644 p->ctime = ctx->ctime; 1659 p->ctime = ctx->ctime;
1645 p->dummy = ctx->dummy; 1660 p->dummy = ctx->dummy;
1646 p->auditable = ctx->auditable;
1647 p->in_syscall = ctx->in_syscall; 1661 p->in_syscall = ctx->in_syscall;
1648 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); 1662 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1649 p->ppid = current->pid; 1663 p->ppid = current->pid;
1664 p->prio = ctx->prio;
1665 p->current_state = ctx->current_state;
1650} 1666}
1651 1667
1652/** 1668/**
@@ -1670,11 +1686,11 @@ void audit_syscall_exit(int valid, long return_code)
1670 if (likely(!context)) 1686 if (likely(!context))
1671 return; 1687 return;
1672 1688
1673 if (context->in_syscall && context->auditable) 1689 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1674 audit_log_exit(context, tsk); 1690 audit_log_exit(context, tsk);
1675 1691
1676 context->in_syscall = 0; 1692 context->in_syscall = 0;
1677 context->auditable = 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1678 1694
1679 if (context->previous) { 1695 if (context->previous) {
1680 struct audit_context *new_context = context->previous; 1696 struct audit_context *new_context = context->previous;
@@ -1689,8 +1705,13 @@ void audit_syscall_exit(int valid, long return_code)
1689 context->aux_pids = NULL; 1705 context->aux_pids = NULL;
1690 context->target_pid = 0; 1706 context->target_pid = 0;
1691 context->target_sid = 0; 1707 context->target_sid = 0;
1692 kfree(context->filterkey); 1708 context->sockaddr_len = 0;
1693 context->filterkey = NULL; 1709 context->type = 0;
1710 context->fds[0] = -1;
1711 if (context->state != AUDIT_RECORD_CONTEXT) {
1712 kfree(context->filterkey);
1713 context->filterkey = NULL;
1714 }
1694 tsk->audit_context = context; 1715 tsk->audit_context = context;
1695 } 1716 }
1696} 1717}
@@ -2081,7 +2102,10 @@ int auditsc_get_stamp(struct audit_context *ctx,
2081 t->tv_sec = ctx->ctime.tv_sec; 2102 t->tv_sec = ctx->ctime.tv_sec;
2082 t->tv_nsec = ctx->ctime.tv_nsec; 2103 t->tv_nsec = ctx->ctime.tv_nsec;
2083 *serial = ctx->serial; 2104 *serial = ctx->serial;
2084 ctx->auditable = 1; 2105 if (!ctx->prio) {
2106 ctx->prio = 1;
2107 ctx->current_state = AUDIT_RECORD_CONTEXT;
2108 }
2085 return 1; 2109 return 1;
2086} 2110}
2087 2111
@@ -2127,132 +2151,46 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2127 * @mode: mode bits 2151 * @mode: mode bits
2128 * @u_attr: queue attributes 2152 * @u_attr: queue attributes
2129 * 2153 *
2130 * Returns 0 for success or NULL context or < 0 on error.
2131 */ 2154 */
2132int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) 2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
2133{ 2156{
2134 struct audit_aux_data_mq_open *ax;
2135 struct audit_context *context = current->audit_context; 2157 struct audit_context *context = current->audit_context;
2136 2158
2137 if (!audit_enabled) 2159 if (attr)
2138 return 0; 2160 memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
2139 2161 else
2140 if (likely(!context)) 2162 memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
2141 return 0;
2142
2143 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2144 if (!ax)
2145 return -ENOMEM;
2146
2147 if (u_attr != NULL) {
2148 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
2149 kfree(ax);
2150 return -EFAULT;
2151 }
2152 } else
2153 memset(&ax->attr, 0, sizeof(ax->attr));
2154 2163
2155 ax->oflag = oflag; 2164 context->mq_open.oflag = oflag;
2156 ax->mode = mode; 2165 context->mq_open.mode = mode;
2157 2166
2158 ax->d.type = AUDIT_MQ_OPEN; 2167 context->type = AUDIT_MQ_OPEN;
2159 ax->d.next = context->aux;
2160 context->aux = (void *)ax;
2161 return 0;
2162} 2168}
2163 2169
2164/** 2170/**
2165 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send 2171 * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
2166 * @mqdes: MQ descriptor 2172 * @mqdes: MQ descriptor
2167 * @msg_len: Message length 2173 * @msg_len: Message length
2168 * @msg_prio: Message priority 2174 * @msg_prio: Message priority
2169 * @u_abs_timeout: Message timeout in absolute time 2175 * @abs_timeout: Message timeout in absolute time
2170 * 2176 *
2171 * Returns 0 for success or NULL context or < 0 on error.
2172 */ 2177 */
2173int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, 2178void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
2174 const struct timespec __user *u_abs_timeout) 2179 const struct timespec *abs_timeout)
2175{ 2180{
2176 struct audit_aux_data_mq_sendrecv *ax;
2177 struct audit_context *context = current->audit_context; 2181 struct audit_context *context = current->audit_context;
2182 struct timespec *p = &context->mq_sendrecv.abs_timeout;
2178 2183
2179 if (!audit_enabled) 2184 if (abs_timeout)
2180 return 0; 2185 memcpy(p, abs_timeout, sizeof(struct timespec));
2181 2186 else
2182 if (likely(!context)) 2187 memset(p, 0, sizeof(struct timespec));
2183 return 0;
2184
2185 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2186 if (!ax)
2187 return -ENOMEM;
2188
2189 if (u_abs_timeout != NULL) {
2190 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2191 kfree(ax);
2192 return -EFAULT;
2193 }
2194 } else
2195 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2196
2197 ax->mqdes = mqdes;
2198 ax->msg_len = msg_len;
2199 ax->msg_prio = msg_prio;
2200
2201 ax->d.type = AUDIT_MQ_SENDRECV;
2202 ax->d.next = context->aux;
2203 context->aux = (void *)ax;
2204 return 0;
2205}
2206
2207/**
2208 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
2209 * @mqdes: MQ descriptor
2210 * @msg_len: Message length
2211 * @u_msg_prio: Message priority
2212 * @u_abs_timeout: Message timeout in absolute time
2213 *
2214 * Returns 0 for success or NULL context or < 0 on error.
2215 */
2216int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2217 unsigned int __user *u_msg_prio,
2218 const struct timespec __user *u_abs_timeout)
2219{
2220 struct audit_aux_data_mq_sendrecv *ax;
2221 struct audit_context *context = current->audit_context;
2222
2223 if (!audit_enabled)
2224 return 0;
2225
2226 if (likely(!context))
2227 return 0;
2228
2229 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2230 if (!ax)
2231 return -ENOMEM;
2232
2233 if (u_msg_prio != NULL) {
2234 if (get_user(ax->msg_prio, u_msg_prio)) {
2235 kfree(ax);
2236 return -EFAULT;
2237 }
2238 } else
2239 ax->msg_prio = 0;
2240
2241 if (u_abs_timeout != NULL) {
2242 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2243 kfree(ax);
2244 return -EFAULT;
2245 }
2246 } else
2247 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2248 2188
2249 ax->mqdes = mqdes; 2189 context->mq_sendrecv.mqdes = mqdes;
2250 ax->msg_len = msg_len; 2190 context->mq_sendrecv.msg_len = msg_len;
2191 context->mq_sendrecv.msg_prio = msg_prio;
2251 2192
2252 ax->d.type = AUDIT_MQ_SENDRECV; 2193 context->type = AUDIT_MQ_SENDRECV;
2253 ax->d.next = context->aux;
2254 context->aux = (void *)ax;
2255 return 0;
2256} 2194}
2257 2195
2258/** 2196/**
@@ -2260,38 +2198,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2260 * @mqdes: MQ descriptor 2198 * @mqdes: MQ descriptor
2261 * @u_notification: Notification event 2199 * @u_notification: Notification event
2262 * 2200 *
2263 * Returns 0 for success or NULL context or < 0 on error.
2264 */ 2201 */
2265 2202
2266int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) 2203void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
2267{ 2204{
2268 struct audit_aux_data_mq_notify *ax;
2269 struct audit_context *context = current->audit_context; 2205 struct audit_context *context = current->audit_context;
2270 2206
2271 if (!audit_enabled) 2207 if (notification)
2272 return 0; 2208 context->mq_notify.sigev_signo = notification->sigev_signo;
2273 2209 else
2274 if (likely(!context)) 2210 context->mq_notify.sigev_signo = 0;
2275 return 0;
2276
2277 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2278 if (!ax)
2279 return -ENOMEM;
2280
2281 if (u_notification != NULL) {
2282 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
2283 kfree(ax);
2284 return -EFAULT;
2285 }
2286 } else
2287 memset(&ax->notification, 0, sizeof(ax->notification));
2288
2289 ax->mqdes = mqdes;
2290 2211
2291 ax->d.type = AUDIT_MQ_NOTIFY; 2212 context->mq_notify.mqdes = mqdes;
2292 ax->d.next = context->aux; 2213 context->type = AUDIT_MQ_NOTIFY;
2293 context->aux = (void *)ax;
2294 return 0;
2295} 2214}
2296 2215
2297/** 2216/**
@@ -2299,55 +2218,29 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
2299 * @mqdes: MQ descriptor 2218 * @mqdes: MQ descriptor
2300 * @mqstat: MQ flags 2219 * @mqstat: MQ flags
2301 * 2220 *
2302 * Returns 0 for success or NULL context or < 0 on error.
2303 */ 2221 */
2304int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) 2222void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
2305{ 2223{
2306 struct audit_aux_data_mq_getsetattr *ax;
2307 struct audit_context *context = current->audit_context; 2224 struct audit_context *context = current->audit_context;
2308 2225 context->mq_getsetattr.mqdes = mqdes;
2309 if (!audit_enabled) 2226 context->mq_getsetattr.mqstat = *mqstat;
2310 return 0; 2227 context->type = AUDIT_MQ_GETSETATTR;
2311
2312 if (likely(!context))
2313 return 0;
2314
2315 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2316 if (!ax)
2317 return -ENOMEM;
2318
2319 ax->mqdes = mqdes;
2320 ax->mqstat = *mqstat;
2321
2322 ax->d.type = AUDIT_MQ_GETSETATTR;
2323 ax->d.next = context->aux;
2324 context->aux = (void *)ax;
2325 return 0;
2326} 2228}
2327 2229
2328/** 2230/**
2329 * audit_ipc_obj - record audit data for ipc object 2231 * audit_ipc_obj - record audit data for ipc object
2330 * @ipcp: ipc permissions 2232 * @ipcp: ipc permissions
2331 * 2233 *
2332 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2234 */
2334int __audit_ipc_obj(struct kern_ipc_perm *ipcp) 2235void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2335{ 2236{
2336 struct audit_aux_data_ipcctl *ax;
2337 struct audit_context *context = current->audit_context; 2237 struct audit_context *context = current->audit_context;
2338 2238 context->ipc.uid = ipcp->uid;
2339 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2239 context->ipc.gid = ipcp->gid;
2340 if (!ax) 2240 context->ipc.mode = ipcp->mode;
2341 return -ENOMEM; 2241 context->ipc.has_perm = 0;
2342 2242 security_ipc_getsecid(ipcp, &context->ipc.osid);
2343 ax->uid = ipcp->uid; 2243 context->type = AUDIT_IPC;
2344 ax->gid = ipcp->gid;
2345 ax->mode = ipcp->mode;
2346 security_ipc_getsecid(ipcp, &ax->osid);
2347 ax->d.type = AUDIT_IPC;
2348 ax->d.next = context->aux;
2349 context->aux = (void *)ax;
2350 return 0;
2351} 2244}
2352 2245
2353/** 2246/**
@@ -2357,26 +2250,17 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2357 * @gid: msgq group id 2250 * @gid: msgq group id
2358 * @mode: msgq mode (permissions) 2251 * @mode: msgq mode (permissions)
2359 * 2252 *
2360 * Returns 0 for success or NULL context or < 0 on error. 2253 * Called only after audit_ipc_obj().
2361 */ 2254 */
2362int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2255void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
2363{ 2256{
2364 struct audit_aux_data_ipcctl *ax;
2365 struct audit_context *context = current->audit_context; 2257 struct audit_context *context = current->audit_context;
2366 2258
2367 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2259 context->ipc.qbytes = qbytes;
2368 if (!ax) 2260 context->ipc.perm_uid = uid;
2369 return -ENOMEM; 2261 context->ipc.perm_gid = gid;
2370 2262 context->ipc.perm_mode = mode;
2371 ax->qbytes = qbytes; 2263 context->ipc.has_perm = 1;
2372 ax->uid = uid;
2373 ax->gid = gid;
2374 ax->mode = mode;
2375
2376 ax->d.type = AUDIT_IPC_SET_PERM;
2377 ax->d.next = context->aux;
2378 context->aux = (void *)ax;
2379 return 0;
2380} 2264}
2381 2265
2382int audit_bprm(struct linux_binprm *bprm) 2266int audit_bprm(struct linux_binprm *bprm)
@@ -2406,27 +2290,17 @@ int audit_bprm(struct linux_binprm *bprm)
2406 * @nargs: number of args 2290 * @nargs: number of args
2407 * @args: args array 2291 * @args: args array
2408 * 2292 *
2409 * Returns 0 for success or NULL context or < 0 on error.
2410 */ 2293 */
2411int audit_socketcall(int nargs, unsigned long *args) 2294void audit_socketcall(int nargs, unsigned long *args)
2412{ 2295{
2413 struct audit_aux_data_socketcall *ax;
2414 struct audit_context *context = current->audit_context; 2296 struct audit_context *context = current->audit_context;
2415 2297
2416 if (likely(!context || context->dummy)) 2298 if (likely(!context || context->dummy))
2417 return 0; 2299 return;
2418
2419 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
2420 if (!ax)
2421 return -ENOMEM;
2422
2423 ax->nargs = nargs;
2424 memcpy(ax->args, args, nargs * sizeof(unsigned long));
2425 2300
2426 ax->d.type = AUDIT_SOCKETCALL; 2301 context->type = AUDIT_SOCKETCALL;
2427 ax->d.next = context->aux; 2302 context->socketcall.nargs = nargs;
2428 context->aux = (void *)ax; 2303 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
2429 return 0;
2430} 2304}
2431 2305
2432/** 2306/**
@@ -2434,29 +2308,12 @@ int audit_socketcall(int nargs, unsigned long *args)
2434 * @fd1: the first file descriptor 2308 * @fd1: the first file descriptor
2435 * @fd2: the second file descriptor 2309 * @fd2: the second file descriptor
2436 * 2310 *
2437 * Returns 0 for success or NULL context or < 0 on error.
2438 */ 2311 */
2439int __audit_fd_pair(int fd1, int fd2) 2312void __audit_fd_pair(int fd1, int fd2)
2440{ 2313{
2441 struct audit_context *context = current->audit_context; 2314 struct audit_context *context = current->audit_context;
2442 struct audit_aux_data_fd_pair *ax; 2315 context->fds[0] = fd1;
2443 2316 context->fds[1] = fd2;
2444 if (likely(!context)) {
2445 return 0;
2446 }
2447
2448 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2449 if (!ax) {
2450 return -ENOMEM;
2451 }
2452
2453 ax->fd[0] = fd1;
2454 ax->fd[1] = fd2;
2455
2456 ax->d.type = AUDIT_FD_PAIR;
2457 ax->d.next = context->aux;
2458 context->aux = (void *)ax;
2459 return 0;
2460} 2317}
2461 2318
2462/** 2319/**
@@ -2468,22 +2325,20 @@ int __audit_fd_pair(int fd1, int fd2)
2468 */ 2325 */
2469int audit_sockaddr(int len, void *a) 2326int audit_sockaddr(int len, void *a)
2470{ 2327{
2471 struct audit_aux_data_sockaddr *ax;
2472 struct audit_context *context = current->audit_context; 2328 struct audit_context *context = current->audit_context;
2473 2329
2474 if (likely(!context || context->dummy)) 2330 if (likely(!context || context->dummy))
2475 return 0; 2331 return 0;
2476 2332
2477 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); 2333 if (!context->sockaddr) {
2478 if (!ax) 2334 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2479 return -ENOMEM; 2335 if (!p)
2480 2336 return -ENOMEM;
2481 ax->len = len; 2337 context->sockaddr = p;
2482 memcpy(ax->a, a, len); 2338 }
2483 2339
2484 ax->d.type = AUDIT_SOCKADDR; 2340 context->sockaddr_len = len;
2485 ax->d.next = context->aux; 2341 memcpy(context->sockaddr, a, len);
2486 context->aux = (void *)ax;
2487 return 0; 2342 return 0;
2488} 2343}
2489 2344
@@ -2617,29 +2472,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2617 * Record the aguments userspace sent to sys_capset for later printing by the 2472 * Record the aguments userspace sent to sys_capset for later printing by the
2618 * audit system if applicable 2473 * audit system if applicable
2619 */ 2474 */
2620int __audit_log_capset(pid_t pid, 2475void __audit_log_capset(pid_t pid,
2621 const struct cred *new, const struct cred *old) 2476 const struct cred *new, const struct cred *old)
2622{ 2477{
2623 struct audit_aux_data_capset *ax;
2624 struct audit_context *context = current->audit_context; 2478 struct audit_context *context = current->audit_context;
2625 2479 context->capset.pid = pid;
2626 if (likely(!audit_enabled || !context || context->dummy)) 2480 context->capset.cap.effective = new->cap_effective;
2627 return 0; 2481 context->capset.cap.inheritable = new->cap_effective;
2628 2482 context->capset.cap.permitted = new->cap_permitted;
2629 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2483 context->type = AUDIT_CAPSET;
2630 if (!ax)
2631 return -ENOMEM;
2632
2633 ax->d.type = AUDIT_CAPSET;
2634 ax->d.next = context->aux;
2635 context->aux = (void *)ax;
2636
2637 ax->pid = pid;
2638 ax->cap.effective = new->cap_effective;
2639 ax->cap.inheritable = new->cap_effective;
2640 ax->cap.permitted = new->cap_permitted;
2641
2642 return 0;
2643} 2484}
2644 2485
2645/** 2486/**
diff --git a/kernel/capability.c b/kernel/capability.c
index 36b4b4daebec..688926e496be 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
280 if (ret < 0) 280 if (ret < 0)
281 goto error; 281 goto error;
282 282
283 ret = audit_log_capset(pid, new, current_cred()); 283 audit_log_capset(pid, new, current_cred());
284 if (ret < 0)
285 return ret;
286 284
287 return commit_creds(new); 285 return commit_creds(new);
288 286
@@ -308,7 +306,7 @@ int capable(int cap)
308 BUG(); 306 BUG();
309 } 307 }
310 308
311 if (has_capability(current, cap)) { 309 if (security_capable(cap) == 0) {
312 current->flags |= PF_SUPERPRIV; 310 current->flags |= PF_SUPERPRIV;
313 return 1; 311 return 1;
314 } 312 }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48348dde6d81..c29831076e7a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
116 * be called. 116 * be called.
117 */ 117 */
118static int need_forkexit_callback __read_mostly; 118static int need_forkexit_callback __read_mostly;
119static int need_mm_owner_callback __read_mostly;
120 119
121/* convenient tests for these bits */ 120/* convenient tests for these bits */
122inline int cgroup_is_removed(const struct cgroup *cgrp) 121inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
149#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
150list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
151 150
152/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
153#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
154list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
155 154
156/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
272 271
273 rcu_read_lock(); 272 rcu_read_lock();
274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
275 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
276 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
277 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
278 if (taskexit) 277 if (taskexit)
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
385 return 0; 384 return 0;
386} 385}
387 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
388/* 406/*
389 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
390 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -400,7 +418,6 @@ static struct css_set *find_css_set(
400 int i; 418 int i;
401 419
402 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
403 struct cg_cgroup_link *link;
404 421
405 struct hlist_head *hhead; 422 struct hlist_head *hhead;
406 423
@@ -445,26 +462,11 @@ static struct css_set *find_css_set(
445 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
446 * hierarchy 463 * hierarchy
447 */ 464 */
448 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
449 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
450 link = list_entry(tmp_cg_links.next,
451 struct cg_cgroup_link,
452 cgrp_link_list);
453 list_del(&link->cgrp_link_list);
454 list_add(&link->cgrp_link_list, &cgrp->css_sets);
455 link->cg = res;
456 list_add(&link->cg_link_list, &res->cg_links);
457 }
458 }
459 if (list_empty(&rootnode.subsys_list)) {
460 link = list_entry(tmp_cg_links.next,
461 struct cg_cgroup_link,
462 cgrp_link_list);
463 list_del(&link->cgrp_link_list);
464 list_add(&link->cgrp_link_list, &dummytop->css_sets);
465 link->cg = res;
466 list_add(&link->cg_link_list, &res->cg_links);
467 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
468 470
469 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
470 472
@@ -573,7 +575,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
573 inode->i_mode = mode; 575 inode->i_mode = mode;
574 inode->i_uid = current_fsuid(); 576 inode->i_uid = current_fsuid();
575 inode->i_gid = current_fsgid(); 577 inode->i_gid = current_fsgid();
576 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 578 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 579 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 580 }
@@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
588{ 589{
589 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
590 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
591 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
592 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
593 return; 594 return;
594} 595}
595 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
596static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
597{ 605{
598 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
612 /* 620 /*
613 * Release the subsystem state objects. 621 * Release the subsystem state objects.
614 */ 622 */
615 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
616 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
617 ss->destroy(ss, cgrp);
618 }
619 625
620 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
621 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
622 628
623 /* Drop the active superblock reference that we took when we 629 /*
624 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
625 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
626 634
627 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
628 } 636 }
629 iput(inode); 637 iput(inode);
630} 638}
@@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
714 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
715 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
716 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
717 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
718 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
719 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
720 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
721 if (ss->bind) 730 if (ss->bind)
722 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
723 732 mutex_unlock(&ss->hierarchy_mutex);
724 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
725 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
726 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
727 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
728 if (ss->bind) 738 if (ss->bind)
729 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
730 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
731 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
732 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
733 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
734 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
735 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
736 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
992 root = NULL; 1003 root = NULL;
993 } else { 1004 } else {
994 /* New superblock */ 1005 /* New superblock */
995 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
996 struct inode *inode; 1007 struct inode *inode;
997 int i; 1008 int i;
998 1009
@@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1033 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1034 root_count++; 1045 root_count++;
1035 1046
1036 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1037 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1038 1049
1039 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1044 struct hlist_node *node; 1055 struct hlist_node *node;
1045 struct css_set *cg; 1056 struct css_set *cg;
1046 1057
1047 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1048 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1049
1050 BUG_ON(list_empty(&tmp_cg_links));
1051 link = list_entry(tmp_cg_links.next,
1052 struct cg_cgroup_link,
1053 cgrp_link_list);
1054 list_del(&link->cgrp_link_list);
1055 link->cg = cg;
1056 list_add(&link->cgrp_link_list,
1057 &root->top_cgroup.css_sets);
1058 list_add(&link->cg_link_list, &cg->cg_links);
1059 }
1060 } 1060 }
1061 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1062 1062
1063 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1064 1064
1065 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1066 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1067 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1068 1068
1069 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1070 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1071 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1072 } 1072 }
@@ -1115,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1115 } 1115 }
1116 write_unlock(&css_set_lock); 1116 write_unlock(&css_set_lock);
1117 1117
1118 if (!list_empty(&root->root_list)) { 1118 list_del(&root->root_list);
1119 list_del(&root->root_list); 1119 root_count--;
1120 root_count--; 1120
1121 }
1122 mutex_unlock(&cgroup_mutex); 1121 mutex_unlock(&cgroup_mutex);
1123 1122
1124 kfree(root); 1123 kfree(root);
@@ -1147,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1147 * @buf: the buffer to write the path into 1146 * @buf: the buffer to write the path into
1148 * @buflen: the length of the buffer 1147 * @buflen: the length of the buffer
1149 * 1148 *
1150 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1149 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1151 * Returns 0 on success, -errno on error. 1150 * reference. Writes path of cgroup into buf. Returns 0 on success,
1151 * -errno on error.
1152 */ 1152 */
1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1154{ 1154{
1155 char *start; 1155 char *start;
1156 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1156 1157
1157 if (cgrp == dummytop) { 1158 if (!dentry || cgrp == dummytop) {
1158 /* 1159 /*
1159 * Inactive subsystems have no dentry for their root 1160 * Inactive subsystems have no dentry for their root
1160 * cgroup 1161 * cgroup
@@ -1167,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1167 1168
1168 *--start = '\0'; 1169 *--start = '\0';
1169 for (;;) { 1170 for (;;) {
1170 int len = cgrp->dentry->d_name.len; 1171 int len = dentry->d_name.len;
1171 if ((start -= len) < buf) 1172 if ((start -= len) < buf)
1172 return -ENAMETOOLONG; 1173 return -ENAMETOOLONG;
1173 memcpy(start, cgrp->dentry->d_name.name, len); 1174 memcpy(start, cgrp->dentry->d_name.name, len);
1174 cgrp = cgrp->parent; 1175 cgrp = cgrp->parent;
1175 if (!cgrp) 1176 if (!cgrp)
1176 break; 1177 break;
1178 dentry = rcu_dereference(cgrp->dentry);
1177 if (!cgrp->parent) 1179 if (!cgrp->parent)
1178 continue; 1180 continue;
1179 if (--start < buf) 1181 if (--start < buf)
@@ -1218,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1218 int retval = 0; 1220 int retval = 0;
1219 struct cgroup_subsys *ss; 1221 struct cgroup_subsys *ss;
1220 struct cgroup *oldcgrp; 1222 struct cgroup *oldcgrp;
1221 struct css_set *cg = tsk->cgroups; 1223 struct css_set *cg;
1222 struct css_set *newcg; 1224 struct css_set *newcg;
1223 struct cgroupfs_root *root = cgrp->root; 1225 struct cgroupfs_root *root = cgrp->root;
1224 int subsys_id; 1226 int subsys_id;
@@ -1238,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1238 } 1240 }
1239 } 1241 }
1240 1242
1243 task_lock(tsk);
1244 cg = tsk->cgroups;
1245 get_css_set(cg);
1246 task_unlock(tsk);
1241 /* 1247 /*
1242 * Locate or allocate a new css_set for this task, 1248 * Locate or allocate a new css_set for this task,
1243 * based on its final set of cgroups 1249 * based on its final set of cgroups
1244 */ 1250 */
1245 newcg = find_css_set(cg, cgrp); 1251 newcg = find_css_set(cg, cgrp);
1252 put_css_set(cg);
1246 if (!newcg) 1253 if (!newcg)
1247 return -ENOMEM; 1254 return -ENOMEM;
1248 1255
@@ -1447,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1447 struct cftype *cft = __d_cft(file->f_dentry); 1454 struct cftype *cft = __d_cft(file->f_dentry);
1448 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1455 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1449 1456
1450 if (!cft || cgroup_is_removed(cgrp)) 1457 if (cgroup_is_removed(cgrp))
1451 return -ENODEV; 1458 return -ENODEV;
1452 if (cft->write) 1459 if (cft->write)
1453 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1460 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1492,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1492 struct cftype *cft = __d_cft(file->f_dentry); 1499 struct cftype *cft = __d_cft(file->f_dentry);
1493 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1500 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1494 1501
1495 if (!cft || cgroup_is_removed(cgrp)) 1502 if (cgroup_is_removed(cgrp))
1496 return -ENODEV; 1503 return -ENODEV;
1497 1504
1498 if (cft->read) 1505 if (cft->read)
@@ -1556,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1556 err = generic_file_open(inode, file); 1563 err = generic_file_open(inode, file);
1557 if (err) 1564 if (err)
1558 return err; 1565 return err;
1559
1560 cft = __d_cft(file->f_dentry); 1566 cft = __d_cft(file->f_dentry);
1561 if (!cft) 1567
1562 return -ENODEV;
1563 if (cft->read_map || cft->read_seq_string) { 1568 if (cft->read_map || cft->read_seq_string) {
1564 struct cgroup_seqfile_state *state = 1569 struct cgroup_seqfile_state *state =
1565 kzalloc(sizeof(*state), GFP_USER); 1570 kzalloc(sizeof(*state), GFP_USER);
@@ -1673,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1673 if (!error) { 1678 if (!error) {
1674 dentry->d_fsdata = cgrp; 1679 dentry->d_fsdata = cgrp;
1675 inc_nlink(parent->d_inode); 1680 inc_nlink(parent->d_inode);
1676 cgrp->dentry = dentry; 1681 rcu_assign_pointer(cgrp->dentry, dentry);
1677 dget(dentry); 1682 dget(dentry);
1678 } 1683 }
1679 dput(dentry); 1684 dput(dentry);
@@ -1814,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1814{ 1819{
1815 struct task_struct *res; 1820 struct task_struct *res;
1816 struct list_head *l = it->task; 1821 struct list_head *l = it->task;
1822 struct cg_cgroup_link *link;
1817 1823
1818 /* If the iterator cg is NULL, we have no tasks */ 1824 /* If the iterator cg is NULL, we have no tasks */
1819 if (!it->cg_link) 1825 if (!it->cg_link)
@@ -1821,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1821 res = list_entry(l, struct task_struct, cg_list); 1827 res = list_entry(l, struct task_struct, cg_list);
1822 /* Advance iterator to find next entry */ 1828 /* Advance iterator to find next entry */
1823 l = l->next; 1829 l = l->next;
1824 if (l == &res->cgroups->tasks) { 1830 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1831 if (l == &link->cg->tasks) {
1825 /* We reached the end of this task list - move on to 1832 /* We reached the end of this task list - move on to
1826 * the next cg_cgroup_link */ 1833 * the next cg_cgroup_link */
1827 cgroup_advance_iter(cgrp, it); 1834 cgroup_advance_iter(cgrp, it);
@@ -2015,14 +2022,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2015 */ 2022 */
2016static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2023static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2017{ 2024{
2018 int n = 0; 2025 int n = 0, pid;
2019 struct cgroup_iter it; 2026 struct cgroup_iter it;
2020 struct task_struct *tsk; 2027 struct task_struct *tsk;
2021 cgroup_iter_start(cgrp, &it); 2028 cgroup_iter_start(cgrp, &it);
2022 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2029 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2023 if (unlikely(n == npids)) 2030 if (unlikely(n == npids))
2024 break; 2031 break;
2025 pidarray[n++] = task_pid_vnr(tsk); 2032 pid = task_pid_vnr(tsk);
2033 if (pid > 0)
2034 pidarray[n++] = pid;
2026 } 2035 }
2027 cgroup_iter_end(cgrp, &it); 2036 cgroup_iter_end(cgrp, &it);
2028 return n; 2037 return n;
@@ -2054,7 +2063,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2054 2063
2055 ret = 0; 2064 ret = 0;
2056 cgrp = dentry->d_fsdata; 2065 cgrp = dentry->d_fsdata;
2057 rcu_read_lock();
2058 2066
2059 cgroup_iter_start(cgrp, &it); 2067 cgroup_iter_start(cgrp, &it);
2060 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2068 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2079,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2079 } 2087 }
2080 cgroup_iter_end(cgrp, &it); 2088 cgroup_iter_end(cgrp, &it);
2081 2089
2082 rcu_read_unlock();
2083err: 2090err:
2084 return ret; 2091 return ret;
2085} 2092}
@@ -2326,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2326 struct cgroup *cgrp) 2333 struct cgroup *cgrp)
2327{ 2334{
2328 css->cgroup = cgrp; 2335 css->cgroup = cgrp;
2329 atomic_set(&css->refcnt, 0); 2336 atomic_set(&css->refcnt, 1);
2330 css->flags = 0; 2337 css->flags = 0;
2331 if (cgrp == dummytop) 2338 if (cgrp == dummytop)
2332 set_bit(CSS_ROOT, &css->flags); 2339 set_bit(CSS_ROOT, &css->flags);
@@ -2334,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2334 cgrp->subsys[ss->subsys_id] = css; 2341 cgrp->subsys[ss->subsys_id] = css;
2335} 2342}
2336 2343
2344static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2345{
2346 /* We need to take each hierarchy_mutex in a consistent order */
2347 int i;
2348
2349 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2350 struct cgroup_subsys *ss = subsys[i];
2351 if (ss->root == root)
2352 mutex_lock_nested(&ss->hierarchy_mutex, i);
2353 }
2354}
2355
2356static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2357{
2358 int i;
2359
2360 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2361 struct cgroup_subsys *ss = subsys[i];
2362 if (ss->root == root)
2363 mutex_unlock(&ss->hierarchy_mutex);
2364 }
2365}
2366
2337/* 2367/*
2338 * cgroup_create - create a cgroup 2368 * cgroup_create - create a cgroup
2339 * @parent: cgroup that will be parent of the new cgroup 2369 * @parent: cgroup that will be parent of the new cgroup
@@ -2382,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2382 init_cgroup_css(css, ss, cgrp); 2412 init_cgroup_css(css, ss, cgrp);
2383 } 2413 }
2384 2414
2415 cgroup_lock_hierarchy(root);
2385 list_add(&cgrp->sibling, &cgrp->parent->children); 2416 list_add(&cgrp->sibling, &cgrp->parent->children);
2417 cgroup_unlock_hierarchy(root);
2386 root->number_of_cgroups++; 2418 root->number_of_cgroups++;
2387 2419
2388 err = cgroup_create_dir(cgrp, dentry, mode); 2420 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2433,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2433{ 2465{
2434 /* Check the reference count on each subsystem. Since we 2466 /* Check the reference count on each subsystem. Since we
2435 * already established that there are no tasks in the 2467 * already established that there are no tasks in the
2436 * cgroup, if the css refcount is also 0, then there should 2468 * cgroup, if the css refcount is also 1, then there should
2437 * be no outstanding references, so the subsystem is safe to 2469 * be no outstanding references, so the subsystem is safe to
2438 * destroy. We scan across all subsystems rather than using 2470 * destroy. We scan across all subsystems rather than using
2439 * the per-hierarchy linked list of mounted subsystems since 2471 * the per-hierarchy linked list of mounted subsystems since
@@ -2454,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2454 * matter, since it can only happen if the cgroup 2486 * matter, since it can only happen if the cgroup
2455 * has been deleted and hence no longer needs the 2487 * has been deleted and hence no longer needs the
2456 * release agent to be called anyway. */ 2488 * release agent to be called anyway. */
2457 if (css && atomic_read(&css->refcnt)) 2489 if (css && (atomic_read(&css->refcnt) > 1))
2458 return 1; 2490 return 1;
2459 } 2491 }
2460 return 0; 2492 return 0;
2461} 2493}
2462 2494
2495/*
2496 * Atomically mark all (or else none) of the cgroup's CSS objects as
2497 * CSS_REMOVED. Return true on success, or false if the cgroup has
2498 * busy subsystems. Call with cgroup_mutex held
2499 */
2500
2501static int cgroup_clear_css_refs(struct cgroup *cgrp)
2502{
2503 struct cgroup_subsys *ss;
2504 unsigned long flags;
2505 bool failed = false;
2506 local_irq_save(flags);
2507 for_each_subsys(cgrp->root, ss) {
2508 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2509 int refcnt;
2510 do {
2511 /* We can only remove a CSS with a refcnt==1 */
2512 refcnt = atomic_read(&css->refcnt);
2513 if (refcnt > 1) {
2514 failed = true;
2515 goto done;
2516 }
2517 BUG_ON(!refcnt);
2518 /*
2519 * Drop the refcnt to 0 while we check other
2520 * subsystems. This will cause any racing
2521 * css_tryget() to spin until we set the
2522 * CSS_REMOVED bits or abort
2523 */
2524 } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
2525 }
2526 done:
2527 for_each_subsys(cgrp->root, ss) {
2528 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2529 if (failed) {
2530 /*
2531 * Restore old refcnt if we previously managed
2532 * to clear it from 1 to 0
2533 */
2534 if (!atomic_read(&css->refcnt))
2535 atomic_set(&css->refcnt, 1);
2536 } else {
2537 /* Commit the fact that the CSS is removed */
2538 set_bit(CSS_REMOVED, &css->flags);
2539 }
2540 }
2541 local_irq_restore(flags);
2542 return !failed;
2543}
2544
2463static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2545static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2464{ 2546{
2465 struct cgroup *cgrp = dentry->d_fsdata; 2547 struct cgroup *cgrp = dentry->d_fsdata;
2466 struct dentry *d; 2548 struct dentry *d;
2467 struct cgroup *parent; 2549 struct cgroup *parent;
2468 struct super_block *sb;
2469 struct cgroupfs_root *root;
2470 2550
2471 /* the vfs holds both inode->i_mutex already */ 2551 /* the vfs holds both inode->i_mutex already */
2472 2552
@@ -2489,12 +2569,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2489 2569
2490 mutex_lock(&cgroup_mutex); 2570 mutex_lock(&cgroup_mutex);
2491 parent = cgrp->parent; 2571 parent = cgrp->parent;
2492 root = cgrp->root;
2493 sb = root->sb;
2494 2572
2495 if (atomic_read(&cgrp->count) 2573 if (atomic_read(&cgrp->count)
2496 || !list_empty(&cgrp->children) 2574 || !list_empty(&cgrp->children)
2497 || cgroup_has_css_refs(cgrp)) { 2575 || !cgroup_clear_css_refs(cgrp)) {
2498 mutex_unlock(&cgroup_mutex); 2576 mutex_unlock(&cgroup_mutex);
2499 return -EBUSY; 2577 return -EBUSY;
2500 } 2578 }
@@ -2504,8 +2582,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2504 if (!list_empty(&cgrp->release_list)) 2582 if (!list_empty(&cgrp->release_list))
2505 list_del(&cgrp->release_list); 2583 list_del(&cgrp->release_list);
2506 spin_unlock(&release_list_lock); 2584 spin_unlock(&release_list_lock);
2507 /* delete my sibling from parent->children */ 2585
2586 cgroup_lock_hierarchy(cgrp->root);
2587 /* delete this cgroup from parent->children */
2508 list_del(&cgrp->sibling); 2588 list_del(&cgrp->sibling);
2589 cgroup_unlock_hierarchy(cgrp->root);
2590
2509 spin_lock(&cgrp->dentry->d_lock); 2591 spin_lock(&cgrp->dentry->d_lock);
2510 d = dget(cgrp->dentry); 2592 d = dget(cgrp->dentry);
2511 spin_unlock(&d->d_lock); 2593 spin_unlock(&d->d_lock);
@@ -2527,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2527 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2609 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2528 2610
2529 /* Create the top cgroup state for this subsystem */ 2611 /* Create the top cgroup state for this subsystem */
2612 list_add(&ss->sibling, &rootnode.subsys_list);
2530 ss->root = &rootnode; 2613 ss->root = &rootnode;
2531 css = ss->create(ss, dummytop); 2614 css = ss->create(ss, dummytop);
2532 /* We don't handle early failures gracefully */ 2615 /* We don't handle early failures gracefully */
@@ -2540,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2540 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2623 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2541 2624
2542 need_forkexit_callback |= ss->fork || ss->exit; 2625 need_forkexit_callback |= ss->fork || ss->exit;
2543 need_mm_owner_callback |= !!ss->mm_owner_changed;
2544 2626
2545 /* At system boot, before all subsystems have been 2627 /* At system boot, before all subsystems have been
2546 * registered, no tasks have been forked, so we don't 2628 * registered, no tasks have been forked, so we don't
2547 * need to invoke fork callbacks here. */ 2629 * need to invoke fork callbacks here. */
2548 BUG_ON(!list_empty(&init_task.tasks)); 2630 BUG_ON(!list_empty(&init_task.tasks));
2549 2631
2632 mutex_init(&ss->hierarchy_mutex);
2550 ss->active = 1; 2633 ss->active = 1;
2551} 2634}
2552 2635
@@ -2565,7 +2648,6 @@ int __init cgroup_init_early(void)
2565 INIT_HLIST_NODE(&init_css_set.hlist); 2648 INIT_HLIST_NODE(&init_css_set.hlist);
2566 css_set_count = 1; 2649 css_set_count = 1;
2567 init_cgroup_root(&rootnode); 2650 init_cgroup_root(&rootnode);
2568 list_add(&rootnode.root_list, &roots);
2569 root_count = 1; 2651 root_count = 1;
2570 init_task.cgroups = &init_css_set; 2652 init_task.cgroups = &init_css_set;
2571 2653
@@ -2672,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2672 2754
2673 mutex_lock(&cgroup_mutex); 2755 mutex_lock(&cgroup_mutex);
2674 2756
2675 for_each_root(root) { 2757 for_each_active_root(root) {
2676 struct cgroup_subsys *ss; 2758 struct cgroup_subsys *ss;
2677 struct cgroup *cgrp; 2759 struct cgroup *cgrp;
2678 int subsys_id; 2760 int subsys_id;
2679 int count = 0; 2761 int count = 0;
2680 2762
2681 /* Skip this hierarchy if it has no active subsystems */
2682 if (!root->actual_subsys_bits)
2683 continue;
2684 seq_printf(m, "%lu:", root->subsys_bits); 2763 seq_printf(m, "%lu:", root->subsys_bits);
2685 for_each_subsys(root, ss) 2764 for_each_subsys(root, ss)
2686 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2765 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2790,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
2790 } 2869 }
2791} 2870}
2792 2871
2793#ifdef CONFIG_MM_OWNER
2794/**
2795 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2796 * @p: the new owner
2797 *
2798 * Called on every change to mm->owner. mm_init_owner() does not
2799 * invoke this routine, since it assigns the mm->owner the first time
2800 * and does not change it.
2801 *
2802 * The callbacks are invoked with mmap_sem held in read mode.
2803 */
2804void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2805{
2806 struct cgroup *oldcgrp, *newcgrp = NULL;
2807
2808 if (need_mm_owner_callback) {
2809 int i;
2810 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2811 struct cgroup_subsys *ss = subsys[i];
2812 oldcgrp = task_cgroup(old, ss->subsys_id);
2813 if (new)
2814 newcgrp = task_cgroup(new, ss->subsys_id);
2815 if (oldcgrp == newcgrp)
2816 continue;
2817 if (ss->mm_owner_changed)
2818 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2819 }
2820 }
2821}
2822#endif /* CONFIG_MM_OWNER */
2823
2824/** 2872/**
2825 * cgroup_post_fork - called on a new task after adding it to the task list 2873 * cgroup_post_fork - called on a new task after adding it to the task list
2826 * @child: the task in question 2874 * @child: the task in question
@@ -2834,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
2834{ 2882{
2835 if (use_task_css_set_links) { 2883 if (use_task_css_set_links) {
2836 write_lock(&css_set_lock); 2884 write_lock(&css_set_lock);
2885 task_lock(child);
2837 if (list_empty(&child->cg_list)) 2886 if (list_empty(&child->cg_list))
2838 list_add(&child->cg_list, &child->cgroups->tasks); 2887 list_add(&child->cg_list, &child->cgroups->tasks);
2888 task_unlock(child);
2839 write_unlock(&css_set_lock); 2889 write_unlock(&css_set_lock);
2840 } 2890 }
2841} 2891}
@@ -2941,14 +2991,20 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2941 mutex_unlock(&cgroup_mutex); 2991 mutex_unlock(&cgroup_mutex);
2942 return 0; 2992 return 0;
2943 } 2993 }
2994 task_lock(tsk);
2944 cg = tsk->cgroups; 2995 cg = tsk->cgroups;
2945 parent = task_cgroup(tsk, subsys->subsys_id); 2996 parent = task_cgroup(tsk, subsys->subsys_id);
2946 2997
2947 /* Pin the hierarchy */ 2998 /* Pin the hierarchy */
2948 atomic_inc(&parent->root->sb->s_active); 2999 if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
3000 /* We race with the final deactivate_super() */
3001 mutex_unlock(&cgroup_mutex);
3002 return 0;
3003 }
2949 3004
2950 /* Keep the cgroup alive */ 3005 /* Keep the cgroup alive */
2951 get_css_set(cg); 3006 get_css_set(cg);
3007 task_unlock(tsk);
2952 mutex_unlock(&cgroup_mutex); 3008 mutex_unlock(&cgroup_mutex);
2953 3009
2954 /* Now do the VFS work to create a cgroup */ 3010 /* Now do the VFS work to create a cgroup */
@@ -2967,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2967 } 3023 }
2968 3024
2969 /* Create the cgroup directory, which also creates the cgroup */ 3025 /* Create the cgroup directory, which also creates the cgroup */
2970 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3026 ret = vfs_mkdir(inode, dentry, 0755);
2971 child = __d_cgrp(dentry); 3027 child = __d_cgrp(dentry);
2972 dput(dentry); 3028 dput(dentry);
2973 if (ret) { 3029 if (ret) {
@@ -2977,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2977 goto out_release; 3033 goto out_release;
2978 } 3034 }
2979 3035
2980 if (!child) {
2981 printk(KERN_INFO
2982 "Couldn't find new cgroup %s\n", nodename);
2983 ret = -ENOMEM;
2984 goto out_release;
2985 }
2986
2987 /* The cgroup now exists. Retake cgroup_mutex and check 3036 /* The cgroup now exists. Retake cgroup_mutex and check
2988 * that we're still in the same state that we thought we 3037 * that we're still in the same state that we thought we
2989 * were. */ 3038 * were. */
@@ -3079,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
3079{ 3128{
3080 struct cgroup *cgrp = css->cgroup; 3129 struct cgroup *cgrp = css->cgroup;
3081 rcu_read_lock(); 3130 rcu_read_lock();
3082 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3131 if ((atomic_dec_return(&css->refcnt) == 1) &&
3132 notify_on_release(cgrp)) {
3083 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3133 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3084 check_for_release(cgrp); 3134 check_for_release(cgrp);
3085 } 3135 }
diff --git a/kernel/compat.c b/kernel/compat.c
index 8eafe3eb50d9..42d56544460f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h>
27 28
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29 30
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
229 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 230 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
230 return -EFAULT; 231 return -EFAULT;
231 } 232 }
233 force_successful_syscall_return();
232 return compat_jiffies_to_clock_t(jiffies); 234 return compat_jiffies_to_clock_t(jiffies);
233} 235}
234 236
@@ -454,16 +456,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
454} 456}
455 457
456static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, 458static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
457 unsigned len, cpumask_t *new_mask) 459 unsigned len, struct cpumask *new_mask)
458{ 460{
459 unsigned long *k; 461 unsigned long *k;
460 462
461 if (len < sizeof(cpumask_t)) 463 if (len < cpumask_size())
462 memset(new_mask, 0, sizeof(cpumask_t)); 464 memset(new_mask, 0, cpumask_size());
463 else if (len > sizeof(cpumask_t)) 465 else if (len > cpumask_size())
464 len = sizeof(cpumask_t); 466 len = cpumask_size();
465 467
466 k = cpus_addr(*new_mask); 468 k = cpumask_bits(new_mask);
467 return compat_get_bitmap(k, user_mask_ptr, len * 8); 469 return compat_get_bitmap(k, user_mask_ptr, len * 8);
468} 470}
469 471
@@ -471,40 +473,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
471 unsigned int len, 473 unsigned int len,
472 compat_ulong_t __user *user_mask_ptr) 474 compat_ulong_t __user *user_mask_ptr)
473{ 475{
474 cpumask_t new_mask; 476 cpumask_var_t new_mask;
475 int retval; 477 int retval;
476 478
477 retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); 479 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
480 return -ENOMEM;
481
482 retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
478 if (retval) 483 if (retval)
479 return retval; 484 goto out;
480 485
481 return sched_setaffinity(pid, &new_mask); 486 retval = sched_setaffinity(pid, new_mask);
487out:
488 free_cpumask_var(new_mask);
489 return retval;
482} 490}
483 491
484asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 492asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
485 compat_ulong_t __user *user_mask_ptr) 493 compat_ulong_t __user *user_mask_ptr)
486{ 494{
487 int ret; 495 int ret;
488 cpumask_t mask; 496 cpumask_var_t mask;
489 unsigned long *k; 497 unsigned long *k;
490 unsigned int min_length = sizeof(cpumask_t); 498 unsigned int min_length = cpumask_size();
491 499
492 if (NR_CPUS <= BITS_PER_COMPAT_LONG) 500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
493 min_length = sizeof(compat_ulong_t); 501 min_length = sizeof(compat_ulong_t);
494 502
495 if (len < min_length) 503 if (len < min_length)
496 return -EINVAL; 504 return -EINVAL;
497 505
498 ret = sched_getaffinity(pid, &mask); 506 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM;
508
509 ret = sched_getaffinity(pid, mask);
499 if (ret < 0) 510 if (ret < 0)
500 return ret; 511 goto out;
501 512
502 k = cpus_addr(mask); 513 k = cpumask_bits(mask);
503 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
504 if (ret) 515 if (ret == 0)
505 return ret; 516 ret = min_length;
506 517
507 return min_length; 518out:
519 free_cpumask_var(mask);
520 return ret;
508} 521}
509 522
510int get_compat_itimerspec(struct itimerspec *dst, 523int get_compat_itimerspec(struct itimerspec *dst,
@@ -883,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
883 896
884 if (tloc) { 897 if (tloc) {
885 if (put_user(i,tloc)) 898 if (put_user(i,tloc))
886 i = -EFAULT; 899 return -EFAULT;
887 } 900 }
901 force_successful_syscall_return();
888 return i; 902 return i;
889} 903}
890 904
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ea32e8d68b0..79e40f00dcb8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,29 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* 18#ifdef CONFIG_SMP
19 * Represents all cpu's present in the system 19/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
40/* Serializes the updates to cpu_online_map, cpu_present_map */
41static DEFINE_MUTEX(cpu_add_remove_lock); 20static DEFINE_MUTEX(cpu_add_remove_lock);
42 21
43static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -64,8 +43,6 @@ void __init cpu_hotplug_init(void)
64 cpu_hotplug.refcount = 0; 43 cpu_hotplug.refcount = 0;
65} 44}
66 45
67cpumask_t cpu_active_map;
68
69#ifdef CONFIG_HOTPLUG_CPU 46#ifdef CONFIG_HOTPLUG_CPU
70 47
71void get_online_cpus(void) 48void get_online_cpus(void)
@@ -96,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
96 73
97/* 74/*
98 * The following two API's must be used when attempting 75 * The following two API's must be used when attempting
99 * to serialize the updates to cpu_online_map, cpu_present_map. 76 * to serialize the updates to cpu_online_mask, cpu_present_mask.
100 */ 77 */
101void cpu_maps_update_begin(void) 78void cpu_maps_update_begin(void)
102{ 79{
@@ -217,7 +194,7 @@ static int __ref take_cpu_down(void *_param)
217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 194static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
218{ 195{
219 int err, nr_calls = 0; 196 int err, nr_calls = 0;
220 cpumask_t old_allowed, tmp; 197 cpumask_var_t old_allowed;
221 void *hcpu = (void *)(long)cpu; 198 void *hcpu = (void *)(long)cpu;
222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 199 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
223 struct take_cpu_down_param tcd_param = { 200 struct take_cpu_down_param tcd_param = {
@@ -231,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
231 if (!cpu_online(cpu)) 208 if (!cpu_online(cpu))
232 return -EINVAL; 209 return -EINVAL;
233 210
211 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
212 return -ENOMEM;
213
234 cpu_hotplug_begin(); 214 cpu_hotplug_begin();
235 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
236 hcpu, -1, &nr_calls); 216 hcpu, -1, &nr_calls);
@@ -245,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
245 } 225 }
246 226
247 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
248 old_allowed = current->cpus_allowed; 228 cpumask_copy(old_allowed, &current->cpus_allowed);
249 cpus_setall(tmp); 229 set_cpus_allowed_ptr(current,
250 cpu_clear(cpu, tmp); 230 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
253 231
254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp); 232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
255 if (err) { 233 if (err) {
256 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
@@ -277,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
277 check_for_tasks(cpu); 255 check_for_tasks(cpu);
278 256
279out_allowed: 257out_allowed:
280 set_cpus_allowed_ptr(current, &old_allowed); 258 set_cpus_allowed_ptr(current, old_allowed);
281out_release: 259out_release:
282 cpu_hotplug_done(); 260 cpu_hotplug_done();
283 if (!err) { 261 if (!err) {
@@ -285,13 +263,17 @@ out_release:
285 hcpu) == NOTIFY_BAD) 263 hcpu) == NOTIFY_BAD)
286 BUG(); 264 BUG();
287 } 265 }
266 free_cpumask_var(old_allowed);
288 return err; 267 return err;
289} 268}
290 269
291int __ref cpu_down(unsigned int cpu) 270int __ref cpu_down(unsigned int cpu)
292{ 271{
293 int err = 0; 272 int err;
294 273
274 err = stop_machine_create();
275 if (err)
276 return err;
295 cpu_maps_update_begin(); 277 cpu_maps_update_begin();
296 278
297 if (cpu_hotplug_disabled) { 279 if (cpu_hotplug_disabled) {
@@ -303,7 +285,7 @@ int __ref cpu_down(unsigned int cpu)
303 285
304 /* 286 /*
305 * Make sure the all cpus did the reschedule and are not 287 * Make sure the all cpus did the reschedule and are not
306 * using stale version of the cpu_active_map. 288 * using stale version of the cpu_active_mask.
307 * This is not strictly necessary becuase stop_machine() 289 * This is not strictly necessary becuase stop_machine()
308 * that we run down the line already provides the required 290 * that we run down the line already provides the required
309 * synchronization. But it's really a side effect and we do not 291 * synchronization. But it's really a side effect and we do not
@@ -318,6 +300,7 @@ int __ref cpu_down(unsigned int cpu)
318 300
319out: 301out:
320 cpu_maps_update_done(); 302 cpu_maps_update_done();
303 stop_machine_destroy();
321 return err; 304 return err;
322} 305}
323EXPORT_SYMBOL(cpu_down); 306EXPORT_SYMBOL(cpu_down);
@@ -367,7 +350,7 @@ out_notify:
367int __cpuinit cpu_up(unsigned int cpu) 350int __cpuinit cpu_up(unsigned int cpu)
368{ 351{
369 int err = 0; 352 int err = 0;
370 if (!cpu_isset(cpu, cpu_possible_map)) { 353 if (!cpu_possible(cpu)) {
371 printk(KERN_ERR "can't online cpu %d because it is not " 354 printk(KERN_ERR "can't online cpu %d because it is not "
372 "configured as may-hotadd at boot time\n", cpu); 355 "configured as may-hotadd at boot time\n", cpu);
373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 356#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
@@ -392,25 +375,28 @@ out:
392} 375}
393 376
394#ifdef CONFIG_PM_SLEEP_SMP 377#ifdef CONFIG_PM_SLEEP_SMP
395static cpumask_t frozen_cpus; 378static cpumask_var_t frozen_cpus;
396 379
397int disable_nonboot_cpus(void) 380int disable_nonboot_cpus(void)
398{ 381{
399 int cpu, first_cpu, error = 0; 382 int cpu, first_cpu, error;
400 383
384 error = stop_machine_create();
385 if (error)
386 return error;
401 cpu_maps_update_begin(); 387 cpu_maps_update_begin();
402 first_cpu = first_cpu(cpu_online_map); 388 first_cpu = cpumask_first(cpu_online_mask);
403 /* We take down all of the non-boot CPUs in one shot to avoid races 389 /* We take down all of the non-boot CPUs in one shot to avoid races
404 * with the userspace trying to use the CPU hotplug at the same time 390 * with the userspace trying to use the CPU hotplug at the same time
405 */ 391 */
406 cpus_clear(frozen_cpus); 392 cpumask_clear(frozen_cpus);
407 printk("Disabling non-boot CPUs ...\n"); 393 printk("Disabling non-boot CPUs ...\n");
408 for_each_online_cpu(cpu) { 394 for_each_online_cpu(cpu) {
409 if (cpu == first_cpu) 395 if (cpu == first_cpu)
410 continue; 396 continue;
411 error = _cpu_down(cpu, 1); 397 error = _cpu_down(cpu, 1);
412 if (!error) { 398 if (!error) {
413 cpu_set(cpu, frozen_cpus); 399 cpumask_set_cpu(cpu, frozen_cpus);
414 printk("CPU%d is down\n", cpu); 400 printk("CPU%d is down\n", cpu);
415 } else { 401 } else {
416 printk(KERN_ERR "Error taking CPU%d down: %d\n", 402 printk(KERN_ERR "Error taking CPU%d down: %d\n",
@@ -426,6 +412,7 @@ int disable_nonboot_cpus(void)
426 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 412 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
427 } 413 }
428 cpu_maps_update_done(); 414 cpu_maps_update_done();
415 stop_machine_destroy();
429 return error; 416 return error;
430} 417}
431 418
@@ -436,11 +423,11 @@ void __ref enable_nonboot_cpus(void)
436 /* Allow everyone to use the CPU hotplug again */ 423 /* Allow everyone to use the CPU hotplug again */
437 cpu_maps_update_begin(); 424 cpu_maps_update_begin();
438 cpu_hotplug_disabled = 0; 425 cpu_hotplug_disabled = 0;
439 if (cpus_empty(frozen_cpus)) 426 if (cpumask_empty(frozen_cpus))
440 goto out; 427 goto out;
441 428
442 printk("Enabling non-boot CPUs ...\n"); 429 printk("Enabling non-boot CPUs ...\n");
443 for_each_cpu_mask_nr(cpu, frozen_cpus) { 430 for_each_cpu(cpu, frozen_cpus) {
444 error = _cpu_up(cpu, 1); 431 error = _cpu_up(cpu, 1);
445 if (!error) { 432 if (!error) {
446 printk("CPU%d is up\n", cpu); 433 printk("CPU%d is up\n", cpu);
@@ -448,10 +435,18 @@ void __ref enable_nonboot_cpus(void)
448 } 435 }
449 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 436 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
450 } 437 }
451 cpus_clear(frozen_cpus); 438 cpumask_clear(frozen_cpus);
452out: 439out:
453 cpu_maps_update_done(); 440 cpu_maps_update_done();
454} 441}
442
443static int alloc_frozen_cpus(void)
444{
445 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
446 return -ENOMEM;
447 return 0;
448}
449core_initcall(alloc_frozen_cpus);
455#endif /* CONFIG_PM_SLEEP_SMP */ 450#endif /* CONFIG_PM_SLEEP_SMP */
456 451
457/** 452/**
@@ -467,7 +462,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
467 unsigned long val = CPU_STARTING; 462 unsigned long val = CPU_STARTING;
468 463
469#ifdef CONFIG_PM_SLEEP_SMP 464#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus)) 465 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN; 466 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */ 467#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 468 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
@@ -479,7 +474,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that 474 * cpu_bit_bitmap[] is a special, "compressed" data structure that
480 * represents all NR_CPUS bits binary values of 1<<nr. 475 * represents all NR_CPUS bits binary values of 1<<nr.
481 * 476 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU 477 * It is used by cpumask_of() to get a constant address to a CPU
483 * mask value that has a single bit set only. 478 * mask value that has a single bit set only.
484 */ 479 */
485 480
@@ -502,3 +497,71 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
502 497
503const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; 498const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
504EXPORT_SYMBOL(cpu_all_bits); 499EXPORT_SYMBOL(cpu_all_bits);
500
501#ifdef CONFIG_INIT_ALL_POSSIBLE
502static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
503 = CPU_BITS_ALL;
504#else
505static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
506#endif
507const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
508EXPORT_SYMBOL(cpu_possible_mask);
509
510static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
511const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
512EXPORT_SYMBOL(cpu_online_mask);
513
514static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
515const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
516EXPORT_SYMBOL(cpu_present_mask);
517
518static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
519const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
520EXPORT_SYMBOL(cpu_active_mask);
521
522void set_cpu_possible(unsigned int cpu, bool possible)
523{
524 if (possible)
525 cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
526 else
527 cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
528}
529
530void set_cpu_present(unsigned int cpu, bool present)
531{
532 if (present)
533 cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
534 else
535 cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
536}
537
538void set_cpu_online(unsigned int cpu, bool online)
539{
540 if (online)
541 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
542 else
543 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
544}
545
546void set_cpu_active(unsigned int cpu, bool active)
547{
548 if (active)
549 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
550 else
551 cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
552}
553
554void init_cpu_present(const struct cpumask *src)
555{
556 cpumask_copy(to_cpumask(cpu_present_bits), src);
557}
558
559void init_cpu_possible(const struct cpumask *src)
560{
561 cpumask_copy(to_cpumask(cpu_possible_bits), src);
562}
563
564void init_cpu_online(const struct cpumask *src)
565{
566 cpumask_copy(to_cpumask(cpu_online_bits), src);
567}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8cd..647c77a88fcb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -84,7 +84,7 @@ struct cpuset {
84 struct cgroup_subsys_state css; 84 struct cgroup_subsys_state css;
85 85
86 unsigned long flags; /* "unsigned long" so bitops work */ 86 unsigned long flags; /* "unsigned long" so bitops work */
87 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 89
90 struct cpuset *parent; /* my parent */ 90 struct cpuset *parent; /* my parent */
@@ -195,8 +195,6 @@ static int cpuset_mems_generation;
195 195
196static struct cpuset top_cpuset = { 196static struct cpuset top_cpuset = {
197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
198 .cpus_allowed = CPU_MASK_ALL,
199 .mems_allowed = NODE_MASK_ALL,
200}; 198};
201 199
202/* 200/*
@@ -240,6 +238,17 @@ static struct cpuset top_cpuset = {
240static DEFINE_MUTEX(callback_mutex); 238static DEFINE_MUTEX(callback_mutex);
241 239
242/* 240/*
241 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
242 * buffers. They are statically allocated to prevent using excess stack
243 * when calling cpuset_print_task_mems_allowed().
244 */
245#define CPUSET_NAME_LEN (128)
246#define CPUSET_NODELIST_LEN (256)
247static char cpuset_name[CPUSET_NAME_LEN];
248static char cpuset_nodelist[CPUSET_NODELIST_LEN];
249static DEFINE_SPINLOCK(cpuset_buffer_lock);
250
251/*
243 * This is ugly, but preserves the userspace API for existing cpuset 252 * This is ugly, but preserves the userspace API for existing cpuset
244 * users. If someone tries to mount the "cpuset" filesystem, we 253 * users. If someone tries to mount the "cpuset" filesystem, we
245 * silently switch it to mount "cgroup" instead 254 * silently switch it to mount "cgroup" instead
@@ -267,7 +276,7 @@ static struct file_system_type cpuset_fs_type = {
267}; 276};
268 277
269/* 278/*
270 * Return in *pmask the portion of a cpusets's cpus_allowed that 279 * Return in pmask the portion of a cpusets's cpus_allowed that
271 * are online. If none are online, walk up the cpuset hierarchy 280 * are online. If none are online, walk up the cpuset hierarchy
272 * until we find one that does have some online cpus. If we get 281 * until we find one that does have some online cpus. If we get
273 * all the way to the top and still haven't found any online cpus, 282 * all the way to the top and still haven't found any online cpus,
@@ -280,15 +289,16 @@ static struct file_system_type cpuset_fs_type = {
280 * Call with callback_mutex held. 289 * Call with callback_mutex held.
281 */ 290 */
282 291
283static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 292static void guarantee_online_cpus(const struct cpuset *cs,
293 struct cpumask *pmask)
284{ 294{
285 while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) 295 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
286 cs = cs->parent; 296 cs = cs->parent;
287 if (cs) 297 if (cs)
288 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); 298 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
289 else 299 else
290 *pmask = cpu_online_map; 300 cpumask_copy(pmask, cpu_online_mask);
291 BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); 301 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
292} 302}
293 303
294/* 304/*
@@ -364,14 +374,9 @@ void cpuset_update_task_memory_state(void)
364 struct task_struct *tsk = current; 374 struct task_struct *tsk = current;
365 struct cpuset *cs; 375 struct cpuset *cs;
366 376
367 if (task_cs(tsk) == &top_cpuset) { 377 rcu_read_lock();
368 /* Don't need rcu for top_cpuset. It's never freed. */ 378 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 379 rcu_read_unlock();
370 } else {
371 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock();
374 }
375 380
376 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 381 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
377 mutex_lock(&callback_mutex); 382 mutex_lock(&callback_mutex);
@@ -403,12 +408,43 @@ void cpuset_update_task_memory_state(void)
403 408
404static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 409static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
405{ 410{
406 return cpus_subset(p->cpus_allowed, q->cpus_allowed) && 411 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
407 nodes_subset(p->mems_allowed, q->mems_allowed) && 412 nodes_subset(p->mems_allowed, q->mems_allowed) &&
408 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 413 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
409 is_mem_exclusive(p) <= is_mem_exclusive(q); 414 is_mem_exclusive(p) <= is_mem_exclusive(q);
410} 415}
411 416
417/**
418 * alloc_trial_cpuset - allocate a trial cpuset
419 * @cs: the cpuset that the trial cpuset duplicates
420 */
421static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
422{
423 struct cpuset *trial;
424
425 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
426 if (!trial)
427 return NULL;
428
429 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
430 kfree(trial);
431 return NULL;
432 }
433 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
434
435 return trial;
436}
437
438/**
439 * free_trial_cpuset - free the trial cpuset
440 * @trial: the trial cpuset to be freed
441 */
442static void free_trial_cpuset(struct cpuset *trial)
443{
444 free_cpumask_var(trial->cpus_allowed);
445 kfree(trial);
446}
447
412/* 448/*
413 * validate_change() - Used to validate that any proposed cpuset change 449 * validate_change() - Used to validate that any proposed cpuset change
414 * follows the structural rules for cpusets. 450 * follows the structural rules for cpusets.
@@ -458,7 +494,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
458 c = cgroup_cs(cont); 494 c = cgroup_cs(cont);
459 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 495 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
460 c != cur && 496 c != cur &&
461 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 497 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
462 return -EINVAL; 498 return -EINVAL;
463 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 499 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
464 c != cur && 500 c != cur &&
@@ -468,7 +504,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
468 504
469 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 505 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
470 if (cgroup_task_count(cur->css.cgroup)) { 506 if (cgroup_task_count(cur->css.cgroup)) {
471 if (cpus_empty(trial->cpus_allowed) || 507 if (cpumask_empty(trial->cpus_allowed) ||
472 nodes_empty(trial->mems_allowed)) { 508 nodes_empty(trial->mems_allowed)) {
473 return -ENOSPC; 509 return -ENOSPC;
474 } 510 }
@@ -483,7 +519,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
483 */ 519 */
484static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 520static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
485{ 521{
486 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 522 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
487} 523}
488 524
489static void 525static void
@@ -508,7 +544,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
508 cp = list_first_entry(&q, struct cpuset, stack_list); 544 cp = list_first_entry(&q, struct cpuset, stack_list);
509 list_del(q.next); 545 list_del(q.next);
510 546
511 if (cpus_empty(cp->cpus_allowed)) 547 if (cpumask_empty(cp->cpus_allowed))
512 continue; 548 continue;
513 549
514 if (is_sched_load_balance(cp)) 550 if (is_sched_load_balance(cp))
@@ -575,7 +611,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
575 * element of the partition (one sched domain) to be passed to 611 * element of the partition (one sched domain) to be passed to
576 * partition_sched_domains(). 612 * partition_sched_domains().
577 */ 613 */
578static int generate_sched_domains(cpumask_t **domains, 614/* FIXME: see the FIXME in partition_sched_domains() */
615static int generate_sched_domains(struct cpumask **domains,
579 struct sched_domain_attr **attributes) 616 struct sched_domain_attr **attributes)
580{ 617{
581 LIST_HEAD(q); /* queue of cpusets to be scanned */ 618 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -583,10 +620,10 @@ static int generate_sched_domains(cpumask_t **domains,
583 struct cpuset **csa; /* array of all cpuset ptrs */ 620 struct cpuset **csa; /* array of all cpuset ptrs */
584 int csn; /* how many cpuset ptrs in csa so far */ 621 int csn; /* how many cpuset ptrs in csa so far */
585 int i, j, k; /* indices for partition finding loops */ 622 int i, j, k; /* indices for partition finding loops */
586 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 623 struct cpumask *doms; /* resulting partition; i.e. sched domains */
587 struct sched_domain_attr *dattr; /* attributes for custom domains */ 624 struct sched_domain_attr *dattr; /* attributes for custom domains */
588 int ndoms = 0; /* number of sched domains in result */ 625 int ndoms = 0; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */ 626 int nslot; /* next empty doms[] struct cpumask slot */
590 627
591 doms = NULL; 628 doms = NULL;
592 dattr = NULL; 629 dattr = NULL;
@@ -594,7 +631,7 @@ static int generate_sched_domains(cpumask_t **domains,
594 631
595 /* Special case for the 99% of systems with one, full, sched domain */ 632 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) { 633 if (is_sched_load_balance(&top_cpuset)) {
597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 634 doms = kmalloc(cpumask_size(), GFP_KERNEL);
598 if (!doms) 635 if (!doms)
599 goto done; 636 goto done;
600 637
@@ -603,7 +640,7 @@ static int generate_sched_domains(cpumask_t **domains,
603 *dattr = SD_ATTR_INIT; 640 *dattr = SD_ATTR_INIT;
604 update_domain_attr_tree(dattr, &top_cpuset); 641 update_domain_attr_tree(dattr, &top_cpuset);
605 } 642 }
606 *doms = top_cpuset.cpus_allowed; 643 cpumask_copy(doms, top_cpuset.cpus_allowed);
607 644
608 ndoms = 1; 645 ndoms = 1;
609 goto done; 646 goto done;
@@ -622,7 +659,7 @@ static int generate_sched_domains(cpumask_t **domains,
622 cp = list_first_entry(&q, struct cpuset, stack_list); 659 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next); 660 list_del(q.next);
624 661
625 if (cpus_empty(cp->cpus_allowed)) 662 if (cpumask_empty(cp->cpus_allowed))
626 continue; 663 continue;
627 664
628 /* 665 /*
@@ -673,7 +710,7 @@ restart:
673 * Now we know how many domains to create. 710 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 711 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 712 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 713 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
677 if (!doms) 714 if (!doms)
678 goto done; 715 goto done;
679 716
@@ -685,7 +722,7 @@ restart:
685 722
686 for (nslot = 0, i = 0; i < csn; i++) { 723 for (nslot = 0, i = 0; i < csn; i++) {
687 struct cpuset *a = csa[i]; 724 struct cpuset *a = csa[i];
688 cpumask_t *dp; 725 struct cpumask *dp;
689 int apn = a->pn; 726 int apn = a->pn;
690 727
691 if (apn < 0) { 728 if (apn < 0) {
@@ -708,14 +745,14 @@ restart:
708 continue; 745 continue;
709 } 746 }
710 747
711 cpus_clear(*dp); 748 cpumask_clear(dp);
712 if (dattr) 749 if (dattr)
713 *(dattr + nslot) = SD_ATTR_INIT; 750 *(dattr + nslot) = SD_ATTR_INIT;
714 for (j = i; j < csn; j++) { 751 for (j = i; j < csn; j++) {
715 struct cpuset *b = csa[j]; 752 struct cpuset *b = csa[j];
716 753
717 if (apn == b->pn) { 754 if (apn == b->pn) {
718 cpus_or(*dp, *dp, b->cpus_allowed); 755 cpumask_or(dp, dp, b->cpus_allowed);
719 if (dattr) 756 if (dattr)
720 update_domain_attr_tree(dattr + nslot, b); 757 update_domain_attr_tree(dattr + nslot, b);
721 758
@@ -755,7 +792,7 @@ done:
755static void do_rebuild_sched_domains(struct work_struct *unused) 792static void do_rebuild_sched_domains(struct work_struct *unused)
756{ 793{
757 struct sched_domain_attr *attr; 794 struct sched_domain_attr *attr;
758 cpumask_t *doms; 795 struct cpumask *doms;
759 int ndoms; 796 int ndoms;
760 797
761 get_online_cpus(); 798 get_online_cpus();
@@ -824,7 +861,7 @@ void rebuild_sched_domains(void)
824static int cpuset_test_cpumask(struct task_struct *tsk, 861static int cpuset_test_cpumask(struct task_struct *tsk,
825 struct cgroup_scanner *scan) 862 struct cgroup_scanner *scan)
826{ 863{
827 return !cpus_equal(tsk->cpus_allowed, 864 return !cpumask_equal(&tsk->cpus_allowed,
828 (cgroup_cs(scan->cg))->cpus_allowed); 865 (cgroup_cs(scan->cg))->cpus_allowed);
829} 866}
830 867
@@ -842,7 +879,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
842static void cpuset_change_cpumask(struct task_struct *tsk, 879static void cpuset_change_cpumask(struct task_struct *tsk,
843 struct cgroup_scanner *scan) 880 struct cgroup_scanner *scan)
844{ 881{
845 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 882 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
846} 883}
847 884
848/** 885/**
@@ -874,10 +911,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
874 * @cs: the cpuset to consider 911 * @cs: the cpuset to consider
875 * @buf: buffer of cpu numbers written to this cpuset 912 * @buf: buffer of cpu numbers written to this cpuset
876 */ 913 */
877static int update_cpumask(struct cpuset *cs, const char *buf) 914static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
915 const char *buf)
878{ 916{
879 struct ptr_heap heap; 917 struct ptr_heap heap;
880 struct cpuset trialcs;
881 int retval; 918 int retval;
882 int is_load_balanced; 919 int is_load_balanced;
883 920
@@ -885,8 +922,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
885 if (cs == &top_cpuset) 922 if (cs == &top_cpuset)
886 return -EACCES; 923 return -EACCES;
887 924
888 trialcs = *cs;
889
890 /* 925 /*
891 * An empty cpus_allowed is ok only if the cpuset has no tasks. 926 * An empty cpus_allowed is ok only if the cpuset has no tasks.
892 * Since cpulist_parse() fails on an empty mask, we special case 927 * Since cpulist_parse() fails on an empty mask, we special case
@@ -894,31 +929,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
894 * with tasks have cpus. 929 * with tasks have cpus.
895 */ 930 */
896 if (!*buf) { 931 if (!*buf) {
897 cpus_clear(trialcs.cpus_allowed); 932 cpumask_clear(trialcs->cpus_allowed);
898 } else { 933 } else {
899 retval = cpulist_parse(buf, trialcs.cpus_allowed); 934 retval = cpulist_parse(buf, trialcs->cpus_allowed);
900 if (retval < 0) 935 if (retval < 0)
901 return retval; 936 return retval;
902 937
903 if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) 938 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
904 return -EINVAL; 939 return -EINVAL;
905 } 940 }
906 retval = validate_change(cs, &trialcs); 941 retval = validate_change(cs, trialcs);
907 if (retval < 0) 942 if (retval < 0)
908 return retval; 943 return retval;
909 944
910 /* Nothing to do if the cpus didn't change */ 945 /* Nothing to do if the cpus didn't change */
911 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 946 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
912 return 0; 947 return 0;
913 948
914 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 949 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
915 if (retval) 950 if (retval)
916 return retval; 951 return retval;
917 952
918 is_load_balanced = is_sched_load_balance(&trialcs); 953 is_load_balanced = is_sched_load_balance(trialcs);
919 954
920 mutex_lock(&callback_mutex); 955 mutex_lock(&callback_mutex);
921 cs->cpus_allowed = trialcs.cpus_allowed; 956 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
922 mutex_unlock(&callback_mutex); 957 mutex_unlock(&callback_mutex);
923 958
924 /* 959 /*
@@ -1006,7 +1041,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
1006 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1041 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1007 1042
1008 fudge = 10; /* spare mmarray[] slots */ 1043 fudge = 10; /* spare mmarray[] slots */
1009 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 1044 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
1010 retval = -ENOMEM; 1045 retval = -ENOMEM;
1011 1046
1012 /* 1047 /*
@@ -1093,9 +1128,9 @@ done:
1093 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1128 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1094 * their mempolicies to the cpusets new mems_allowed. 1129 * their mempolicies to the cpusets new mems_allowed.
1095 */ 1130 */
1096static int update_nodemask(struct cpuset *cs, const char *buf) 1131static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1132 const char *buf)
1097{ 1133{
1098 struct cpuset trialcs;
1099 nodemask_t oldmem; 1134 nodemask_t oldmem;
1100 int retval; 1135 int retval;
1101 1136
@@ -1106,8 +1141,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1106 if (cs == &top_cpuset) 1141 if (cs == &top_cpuset)
1107 return -EACCES; 1142 return -EACCES;
1108 1143
1109 trialcs = *cs;
1110
1111 /* 1144 /*
1112 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1145 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1113 * Since nodelist_parse() fails on an empty mask, we special case 1146 * Since nodelist_parse() fails on an empty mask, we special case
@@ -1115,27 +1148,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1115 * with tasks have memory. 1148 * with tasks have memory.
1116 */ 1149 */
1117 if (!*buf) { 1150 if (!*buf) {
1118 nodes_clear(trialcs.mems_allowed); 1151 nodes_clear(trialcs->mems_allowed);
1119 } else { 1152 } else {
1120 retval = nodelist_parse(buf, trialcs.mems_allowed); 1153 retval = nodelist_parse(buf, trialcs->mems_allowed);
1121 if (retval < 0) 1154 if (retval < 0)
1122 goto done; 1155 goto done;
1123 1156
1124 if (!nodes_subset(trialcs.mems_allowed, 1157 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) 1158 node_states[N_HIGH_MEMORY]))
1126 return -EINVAL; 1159 return -EINVAL;
1127 } 1160 }
1128 oldmem = cs->mems_allowed; 1161 oldmem = cs->mems_allowed;
1129 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 1162 if (nodes_equal(oldmem, trialcs->mems_allowed)) {
1130 retval = 0; /* Too easy - nothing to do */ 1163 retval = 0; /* Too easy - nothing to do */
1131 goto done; 1164 goto done;
1132 } 1165 }
1133 retval = validate_change(cs, &trialcs); 1166 retval = validate_change(cs, trialcs);
1134 if (retval < 0) 1167 if (retval < 0)
1135 goto done; 1168 goto done;
1136 1169
1137 mutex_lock(&callback_mutex); 1170 mutex_lock(&callback_mutex);
1138 cs->mems_allowed = trialcs.mems_allowed; 1171 cs->mems_allowed = trialcs->mems_allowed;
1139 cs->mems_generation = cpuset_mems_generation++; 1172 cs->mems_generation = cpuset_mems_generation++;
1140 mutex_unlock(&callback_mutex); 1173 mutex_unlock(&callback_mutex);
1141 1174
@@ -1156,7 +1189,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1156 1189
1157 if (val != cs->relax_domain_level) { 1190 if (val != cs->relax_domain_level) {
1158 cs->relax_domain_level = val; 1191 cs->relax_domain_level = val;
1159 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1192 if (!cpumask_empty(cs->cpus_allowed) &&
1193 is_sched_load_balance(cs))
1160 async_rebuild_sched_domains(); 1194 async_rebuild_sched_domains();
1161 } 1195 }
1162 1196
@@ -1175,31 +1209,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1175static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1209static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1176 int turning_on) 1210 int turning_on)
1177{ 1211{
1178 struct cpuset trialcs; 1212 struct cpuset *trialcs;
1179 int err; 1213 int err;
1180 int balance_flag_changed; 1214 int balance_flag_changed;
1181 1215
1182 trialcs = *cs; 1216 trialcs = alloc_trial_cpuset(cs);
1217 if (!trialcs)
1218 return -ENOMEM;
1219
1183 if (turning_on) 1220 if (turning_on)
1184 set_bit(bit, &trialcs.flags); 1221 set_bit(bit, &trialcs->flags);
1185 else 1222 else
1186 clear_bit(bit, &trialcs.flags); 1223 clear_bit(bit, &trialcs->flags);
1187 1224
1188 err = validate_change(cs, &trialcs); 1225 err = validate_change(cs, trialcs);
1189 if (err < 0) 1226 if (err < 0)
1190 return err; 1227 goto out;
1191 1228
1192 balance_flag_changed = (is_sched_load_balance(cs) != 1229 balance_flag_changed = (is_sched_load_balance(cs) !=
1193 is_sched_load_balance(&trialcs)); 1230 is_sched_load_balance(trialcs));
1194 1231
1195 mutex_lock(&callback_mutex); 1232 mutex_lock(&callback_mutex);
1196 cs->flags = trialcs.flags; 1233 cs->flags = trialcs->flags;
1197 mutex_unlock(&callback_mutex); 1234 mutex_unlock(&callback_mutex);
1198 1235
1199 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) 1236 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1200 async_rebuild_sched_domains(); 1237 async_rebuild_sched_domains();
1201 1238
1202 return 0; 1239out:
1240 free_trial_cpuset(trialcs);
1241 return err;
1203} 1242}
1204 1243
1205/* 1244/*
@@ -1300,42 +1339,47 @@ static int fmeter_getrate(struct fmeter *fmp)
1300 return val; 1339 return val;
1301} 1340}
1302 1341
1342/* Protected by cgroup_lock */
1343static cpumask_var_t cpus_attach;
1344
1303/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1345/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1304static int cpuset_can_attach(struct cgroup_subsys *ss, 1346static int cpuset_can_attach(struct cgroup_subsys *ss,
1305 struct cgroup *cont, struct task_struct *tsk) 1347 struct cgroup *cont, struct task_struct *tsk)
1306{ 1348{
1307 struct cpuset *cs = cgroup_cs(cont); 1349 struct cpuset *cs = cgroup_cs(cont);
1350 int ret = 0;
1308 1351
1309 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1352 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1310 return -ENOSPC; 1353 return -ENOSPC;
1311 if (tsk->flags & PF_THREAD_BOUND) {
1312 cpumask_t mask;
1313 1354
1355 if (tsk->flags & PF_THREAD_BOUND) {
1314 mutex_lock(&callback_mutex); 1356 mutex_lock(&callback_mutex);
1315 mask = cs->cpus_allowed; 1357 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
1358 ret = -EINVAL;
1316 mutex_unlock(&callback_mutex); 1359 mutex_unlock(&callback_mutex);
1317 if (!cpus_equal(tsk->cpus_allowed, mask))
1318 return -EINVAL;
1319 } 1360 }
1320 1361
1321 return security_task_setscheduler(tsk, 0, NULL); 1362 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
1322} 1363}
1323 1364
1324static void cpuset_attach(struct cgroup_subsys *ss, 1365static void cpuset_attach(struct cgroup_subsys *ss,
1325 struct cgroup *cont, struct cgroup *oldcont, 1366 struct cgroup *cont, struct cgroup *oldcont,
1326 struct task_struct *tsk) 1367 struct task_struct *tsk)
1327{ 1368{
1328 cpumask_t cpus;
1329 nodemask_t from, to; 1369 nodemask_t from, to;
1330 struct mm_struct *mm; 1370 struct mm_struct *mm;
1331 struct cpuset *cs = cgroup_cs(cont); 1371 struct cpuset *cs = cgroup_cs(cont);
1332 struct cpuset *oldcs = cgroup_cs(oldcont); 1372 struct cpuset *oldcs = cgroup_cs(oldcont);
1333 int err; 1373 int err;
1334 1374
1335 mutex_lock(&callback_mutex); 1375 if (cs == &top_cpuset) {
1336 guarantee_online_cpus(cs, &cpus); 1376 cpumask_copy(cpus_attach, cpu_possible_mask);
1337 err = set_cpus_allowed_ptr(tsk, &cpus); 1377 } else {
1338 mutex_unlock(&callback_mutex); 1378 mutex_lock(&callback_mutex);
1379 guarantee_online_cpus(cs, cpus_attach);
1380 mutex_unlock(&callback_mutex);
1381 }
1382 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1339 if (err) 1383 if (err)
1340 return; 1384 return;
1341 1385
@@ -1348,7 +1392,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1348 cpuset_migrate_mm(mm, &from, &to); 1392 cpuset_migrate_mm(mm, &from, &to);
1349 mmput(mm); 1393 mmput(mm);
1350 } 1394 }
1351
1352} 1395}
1353 1396
1354/* The various types of files and directories in a cpuset file system */ 1397/* The various types of files and directories in a cpuset file system */
@@ -1443,21 +1486,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1443 const char *buf) 1486 const char *buf)
1444{ 1487{
1445 int retval = 0; 1488 int retval = 0;
1489 struct cpuset *cs = cgroup_cs(cgrp);
1490 struct cpuset *trialcs;
1446 1491
1447 if (!cgroup_lock_live_group(cgrp)) 1492 if (!cgroup_lock_live_group(cgrp))
1448 return -ENODEV; 1493 return -ENODEV;
1449 1494
1495 trialcs = alloc_trial_cpuset(cs);
1496 if (!trialcs)
1497 return -ENOMEM;
1498
1450 switch (cft->private) { 1499 switch (cft->private) {
1451 case FILE_CPULIST: 1500 case FILE_CPULIST:
1452 retval = update_cpumask(cgroup_cs(cgrp), buf); 1501 retval = update_cpumask(cs, trialcs, buf);
1453 break; 1502 break;
1454 case FILE_MEMLIST: 1503 case FILE_MEMLIST:
1455 retval = update_nodemask(cgroup_cs(cgrp), buf); 1504 retval = update_nodemask(cs, trialcs, buf);
1456 break; 1505 break;
1457 default: 1506 default:
1458 retval = -EINVAL; 1507 retval = -EINVAL;
1459 break; 1508 break;
1460 } 1509 }
1510
1511 free_trial_cpuset(trialcs);
1461 cgroup_unlock(); 1512 cgroup_unlock();
1462 return retval; 1513 return retval;
1463} 1514}
@@ -1476,13 +1527,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1476 1527
1477static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1528static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1478{ 1529{
1479 cpumask_t mask; 1530 int ret;
1480 1531
1481 mutex_lock(&callback_mutex); 1532 mutex_lock(&callback_mutex);
1482 mask = cs->cpus_allowed; 1533 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1483 mutex_unlock(&callback_mutex); 1534 mutex_unlock(&callback_mutex);
1484 1535
1485 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1536 return ret;
1486} 1537}
1487 1538
1488static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1539static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1718,7 +1769,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1718 parent_cs = cgroup_cs(parent); 1769 parent_cs = cgroup_cs(parent);
1719 1770
1720 cs->mems_allowed = parent_cs->mems_allowed; 1771 cs->mems_allowed = parent_cs->mems_allowed;
1721 cs->cpus_allowed = parent_cs->cpus_allowed; 1772 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1722 return; 1773 return;
1723} 1774}
1724 1775
@@ -1744,6 +1795,10 @@ static struct cgroup_subsys_state *cpuset_create(
1744 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1795 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1745 if (!cs) 1796 if (!cs)
1746 return ERR_PTR(-ENOMEM); 1797 return ERR_PTR(-ENOMEM);
1798 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1799 kfree(cs);
1800 return ERR_PTR(-ENOMEM);
1801 }
1747 1802
1748 cpuset_update_task_memory_state(); 1803 cpuset_update_task_memory_state();
1749 cs->flags = 0; 1804 cs->flags = 0;
@@ -1752,7 +1807,7 @@ static struct cgroup_subsys_state *cpuset_create(
1752 if (is_spread_slab(parent)) 1807 if (is_spread_slab(parent))
1753 set_bit(CS_SPREAD_SLAB, &cs->flags); 1808 set_bit(CS_SPREAD_SLAB, &cs->flags);
1754 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1809 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1755 cpus_clear(cs->cpus_allowed); 1810 cpumask_clear(cs->cpus_allowed);
1756 nodes_clear(cs->mems_allowed); 1811 nodes_clear(cs->mems_allowed);
1757 cs->mems_generation = cpuset_mems_generation++; 1812 cs->mems_generation = cpuset_mems_generation++;
1758 fmeter_init(&cs->fmeter); 1813 fmeter_init(&cs->fmeter);
@@ -1779,6 +1834,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1779 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1834 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1780 1835
1781 number_of_cpusets--; 1836 number_of_cpusets--;
1837 free_cpumask_var(cs->cpus_allowed);
1782 kfree(cs); 1838 kfree(cs);
1783} 1839}
1784 1840
@@ -1802,6 +1858,8 @@ struct cgroup_subsys cpuset_subsys = {
1802 1858
1803int __init cpuset_init_early(void) 1859int __init cpuset_init_early(void)
1804{ 1860{
1861 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1862
1805 top_cpuset.mems_generation = cpuset_mems_generation++; 1863 top_cpuset.mems_generation = cpuset_mems_generation++;
1806 return 0; 1864 return 0;
1807} 1865}
@@ -1817,7 +1875,7 @@ int __init cpuset_init(void)
1817{ 1875{
1818 int err = 0; 1876 int err = 0;
1819 1877
1820 cpus_setall(top_cpuset.cpus_allowed); 1878 cpumask_setall(top_cpuset.cpus_allowed);
1821 nodes_setall(top_cpuset.mems_allowed); 1879 nodes_setall(top_cpuset.mems_allowed);
1822 1880
1823 fmeter_init(&top_cpuset.fmeter); 1881 fmeter_init(&top_cpuset.fmeter);
@@ -1829,6 +1887,9 @@ int __init cpuset_init(void)
1829 if (err < 0) 1887 if (err < 0)
1830 return err; 1888 return err;
1831 1889
1890 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1891 BUG();
1892
1832 number_of_cpusets = 1; 1893 number_of_cpusets = 1;
1833 return 0; 1894 return 0;
1834} 1895}
@@ -1903,7 +1964,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1903 * has online cpus, so can't be empty). 1964 * has online cpus, so can't be empty).
1904 */ 1965 */
1905 parent = cs->parent; 1966 parent = cs->parent;
1906 while (cpus_empty(parent->cpus_allowed) || 1967 while (cpumask_empty(parent->cpus_allowed) ||
1907 nodes_empty(parent->mems_allowed)) 1968 nodes_empty(parent->mems_allowed))
1908 parent = parent->parent; 1969 parent = parent->parent;
1909 1970
@@ -1944,7 +2005,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1944 } 2005 }
1945 2006
1946 /* Continue past cpusets with all cpus, mems online */ 2007 /* Continue past cpusets with all cpus, mems online */
1947 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 2008 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
1948 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2009 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1949 continue; 2010 continue;
1950 2011
@@ -1952,13 +2013,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1952 2013
1953 /* Remove offline cpus and mems from this cpuset. */ 2014 /* Remove offline cpus and mems from this cpuset. */
1954 mutex_lock(&callback_mutex); 2015 mutex_lock(&callback_mutex);
1955 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 2016 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2017 cpu_online_mask);
1956 nodes_and(cp->mems_allowed, cp->mems_allowed, 2018 nodes_and(cp->mems_allowed, cp->mems_allowed,
1957 node_states[N_HIGH_MEMORY]); 2019 node_states[N_HIGH_MEMORY]);
1958 mutex_unlock(&callback_mutex); 2020 mutex_unlock(&callback_mutex);
1959 2021
1960 /* Move tasks from the empty cpuset to a parent */ 2022 /* Move tasks from the empty cpuset to a parent */
1961 if (cpus_empty(cp->cpus_allowed) || 2023 if (cpumask_empty(cp->cpus_allowed) ||
1962 nodes_empty(cp->mems_allowed)) 2024 nodes_empty(cp->mems_allowed))
1963 remove_tasks_in_empty_cpuset(cp); 2025 remove_tasks_in_empty_cpuset(cp);
1964 else { 2026 else {
@@ -1984,7 +2046,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1984 unsigned long phase, void *unused_cpu) 2046 unsigned long phase, void *unused_cpu)
1985{ 2047{
1986 struct sched_domain_attr *attr; 2048 struct sched_domain_attr *attr;
1987 cpumask_t *doms; 2049 struct cpumask *doms;
1988 int ndoms; 2050 int ndoms;
1989 2051
1990 switch (phase) { 2052 switch (phase) {
@@ -1999,7 +2061,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1999 } 2061 }
2000 2062
2001 cgroup_lock(); 2063 cgroup_lock();
2002 top_cpuset.cpus_allowed = cpu_online_map; 2064 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2003 scan_for_empty_cpusets(&top_cpuset); 2065 scan_for_empty_cpusets(&top_cpuset);
2004 ndoms = generate_sched_domains(&doms, &attr); 2066 ndoms = generate_sched_domains(&doms, &attr);
2005 cgroup_unlock(); 2067 cgroup_unlock();
@@ -2044,7 +2106,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2044 2106
2045void __init cpuset_init_smp(void) 2107void __init cpuset_init_smp(void)
2046{ 2108{
2047 top_cpuset.cpus_allowed = cpu_online_map; 2109 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2110 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2049 2111
2050 hotcpu_notifier(cpuset_track_online_cpus, 0); 2112 hotcpu_notifier(cpuset_track_online_cpus, 0);
@@ -2054,15 +2116,15 @@ void __init cpuset_init_smp(void)
2054/** 2116/**
2055 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2117 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2056 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2118 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2057 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2119 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2058 * 2120 *
2059 * Description: Returns the cpumask_t cpus_allowed of the cpuset 2121 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2060 * attached to the specified @tsk. Guaranteed to return some non-empty 2122 * attached to the specified @tsk. Guaranteed to return some non-empty
2061 * subset of cpu_online_map, even if this means going outside the 2123 * subset of cpu_online_map, even if this means going outside the
2062 * tasks cpuset. 2124 * tasks cpuset.
2063 **/ 2125 **/
2064 2126
2065void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) 2127void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2066{ 2128{
2067 mutex_lock(&callback_mutex); 2129 mutex_lock(&callback_mutex);
2068 cpuset_cpus_allowed_locked(tsk, pmask); 2130 cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2073,7 +2135,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
2073 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 2135 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2074 * Must be called with callback_mutex held. 2136 * Must be called with callback_mutex held.
2075 **/ 2137 **/
2076void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) 2138void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2077{ 2139{
2078 task_lock(tsk); 2140 task_lock(tsk);
2079 guarantee_online_cpus(task_cs(tsk), pmask); 2141 guarantee_online_cpus(task_cs(tsk), pmask);
@@ -2356,6 +2418,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2356 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2418 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2357} 2419}
2358 2420
2421/**
2422 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2423 * @task: pointer to task_struct of some task.
2424 *
2425 * Description: Prints @task's name, cpuset name, and cached copy of its
2426 * mems_allowed to the kernel log. Must hold task_lock(task) to allow
2427 * dereferencing task_cs(task).
2428 */
2429void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2430{
2431 struct dentry *dentry;
2432
2433 dentry = task_cs(tsk)->css.cgroup->dentry;
2434 spin_lock(&cpuset_buffer_lock);
2435 snprintf(cpuset_name, CPUSET_NAME_LEN,
2436 dentry ? (const char *)dentry->d_name.name : "/");
2437 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2438 tsk->mems_allowed);
2439 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2440 tsk->comm, cpuset_name, cpuset_nodelist);
2441 spin_unlock(&cpuset_buffer_lock);
2442}
2443
2359/* 2444/*
2360 * Collection of memory_pressure is suppressed unless 2445 * Collection of memory_pressure is suppressed unless
2361 * this flag is enabled by writing "1" to the special 2446 * this flag is enabled by writing "1" to the special
diff --git a/kernel/cred.c b/kernel/cred.c
index ff7bc071991c..3a039189d707 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,7 +372,8 @@ int commit_creds(struct cred *new)
372 old->fsuid != new->fsuid || 372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid || 373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable); 375 if (task->mm)
376 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0; 377 task->pdeath_signal = 0;
377 smp_wmb(); 378 smp_wmb();
378 } 379 }
@@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
506 else 507 else
507 old = get_cred(&init_cred); 508 old = get_cred(&init_cred);
508 509
510 *new = *old;
509 get_uid(new->user); 511 get_uid(new->user);
510 get_group_info(new->group_info); 512 get_group_info(new->group_info);
511 513
@@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
529 531
530error: 532error:
531 put_cred(new); 533 put_cred(new);
534 put_cred(old);
532 return NULL; 535 return NULL;
533} 536}
534EXPORT_SYMBOL(prepare_kernel_cred); 537EXPORT_SYMBOL(prepare_kernel_cred);
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index f013a0c2e111..038707404b76 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
109int dma_alloc_from_coherent(struct device *dev, ssize_t size, 109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret) 110 dma_addr_t *dma_handle, void **ret)
111{ 111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; 112 struct dma_coherent_mem *mem;
113 int order = get_order(size); 113 int order = get_order(size);
114 int pageno;
114 115
115 if (mem) { 116 if (!dev)
116 int page = bitmap_find_free_region(mem->bitmap, mem->size, 117 return 0;
117 order); 118 mem = dev->dma_mem;
118 if (page >= 0) { 119 if (!mem)
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT); 120 return 0;
120 *ret = mem->virt_base + (page << PAGE_SHIFT); 121 if (unlikely(size > mem->size))
121 memset(*ret, 0, size); 122 return 0;
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) 123
123 *ret = NULL; 124 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
125 if (pageno >= 0) {
126 /*
127 * Memory was found in the per-device arena.
128 */
129 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
130 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
131 memset(*ret, 0, size);
132 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
133 /*
134 * The per-device arena is exhausted and we are not
135 * permitted to fall back to generic memory.
136 */
137 *ret = NULL;
138 } else {
139 /*
140 * The per-device arena is exhausted and we are
141 * permitted to fall back to generic memory.
142 */
143 return 0;
124 } 144 }
125 return (mem != NULL); 145 return 1;
126} 146}
127EXPORT_SYMBOL(dma_alloc_from_coherent); 147EXPORT_SYMBOL(dma_alloc_from_coherent);
128 148
diff --git a/kernel/exit.c b/kernel/exit.c
index ad8d04d83a2e..cbdb39a498eb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -645,35 +645,31 @@ retry:
645 /* 645 /*
646 * We found no owner yet mm_users > 1: this implies that we are 646 * We found no owner yet mm_users > 1: this implies that we are
647 * most likely racing with swapoff (try_to_unuse()) or /proc or 647 * most likely racing with swapoff (try_to_unuse()) or /proc or
648 * ptrace or page migration (get_task_mm()). Mark owner as NULL, 648 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
649 * so that subsystems can understand the callback and take action.
650 */ 649 */
651 down_write(&mm->mmap_sem);
652 cgroup_mm_owner_callbacks(mm->owner, NULL);
653 mm->owner = NULL; 650 mm->owner = NULL;
654 up_write(&mm->mmap_sem);
655 return; 651 return;
656 652
657assign_new_owner: 653assign_new_owner:
658 BUG_ON(c == p); 654 BUG_ON(c == p);
659 get_task_struct(c); 655 get_task_struct(c);
660 read_unlock(&tasklist_lock);
661 down_write(&mm->mmap_sem);
662 /* 656 /*
663 * The task_lock protects c->mm from changing. 657 * The task_lock protects c->mm from changing.
664 * We always want mm->owner->mm == mm 658 * We always want mm->owner->mm == mm
665 */ 659 */
666 task_lock(c); 660 task_lock(c);
661 /*
662 * Delay read_unlock() till we have the task_lock()
663 * to ensure that c does not slip away underneath us
664 */
665 read_unlock(&tasklist_lock);
667 if (c->mm != mm) { 666 if (c->mm != mm) {
668 task_unlock(c); 667 task_unlock(c);
669 up_write(&mm->mmap_sem);
670 put_task_struct(c); 668 put_task_struct(c);
671 goto retry; 669 goto retry;
672 } 670 }
673 cgroup_mm_owner_callbacks(mm->owner, c);
674 mm->owner = c; 671 mm->owner = c;
675 task_unlock(c); 672 task_unlock(c);
676 up_write(&mm->mmap_sem);
677 put_task_struct(c); 673 put_task_struct(c);
678} 674}
679#endif /* CONFIG_MM_OWNER */ 675#endif /* CONFIG_MM_OWNER */
@@ -1040,8 +1036,6 @@ NORET_TYPE void do_exit(long code)
1040 * task into the wait for ever nirwana as well. 1036 * task into the wait for ever nirwana as well.
1041 */ 1037 */
1042 tsk->flags |= PF_EXITPIDONE; 1038 tsk->flags |= PF_EXITPIDONE;
1043 if (tsk->io_context)
1044 exit_io_context();
1045 set_current_state(TASK_UNINTERRUPTIBLE); 1039 set_current_state(TASK_UNINTERRUPTIBLE);
1046 schedule(); 1040 schedule();
1047 } 1041 }
@@ -1060,10 +1054,7 @@ NORET_TYPE void do_exit(long code)
1060 preempt_count()); 1054 preempt_count());
1061 1055
1062 acct_update_integrals(tsk); 1056 acct_update_integrals(tsk);
1063 if (tsk->mm) { 1057
1064 update_hiwater_rss(tsk->mm);
1065 update_hiwater_vm(tsk->mm);
1066 }
1067 group_dead = atomic_dec_and_test(&tsk->signal->live); 1058 group_dead = atomic_dec_and_test(&tsk->signal->live);
1068 if (group_dead) { 1059 if (group_dead) {
1069 hrtimer_cancel(&tsk->signal->real_timer); 1060 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1327,10 +1318,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1327 * group, which consolidates times for all threads in the 1318 * group, which consolidates times for all threads in the
1328 * group including the group leader. 1319 * group including the group leader.
1329 */ 1320 */
1321 thread_group_cputime(p, &cputime);
1330 spin_lock_irq(&p->parent->sighand->siglock); 1322 spin_lock_irq(&p->parent->sighand->siglock);
1331 psig = p->parent->signal; 1323 psig = p->parent->signal;
1332 sig = p->signal; 1324 sig = p->signal;
1333 thread_group_cputime(p, &cputime);
1334 psig->cutime = 1325 psig->cutime =
1335 cputime_add(psig->cutime, 1326 cputime_add(psig->cutime,
1336 cputime_add(cputime.utime, 1327 cputime_add(cputime.utime,
diff --git a/kernel/extable.c b/kernel/extable.c
index feb0317cf09a..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -67,3 +67,19 @@ int kernel_text_address(unsigned long addr)
67 return 1; 67 return 1;
68 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
69} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
diff --git a/kernel/fork.c b/kernel/fork.c
index cb706599057f..b1f8609287eb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
402 402
403static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
404
405static int __init coredump_filter_setup(char *s)
406{
407 default_dump_filter =
408 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
409 MMF_DUMP_FILTER_MASK;
410 return 1;
411}
412
413__setup("coredump_filter=", coredump_filter_setup);
414
403#include <linux/init_task.h> 415#include <linux/init_task.h>
404 416
405static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 417static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@ -408,15 +420,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
408 atomic_set(&mm->mm_count, 1); 420 atomic_set(&mm->mm_count, 1);
409 init_rwsem(&mm->mmap_sem); 421 init_rwsem(&mm->mmap_sem);
410 INIT_LIST_HEAD(&mm->mmlist); 422 INIT_LIST_HEAD(&mm->mmlist);
411 mm->flags = (current->mm) ? current->mm->flags 423 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
412 : MMF_DUMP_FILTER_DEFAULT;
413 mm->core_state = NULL; 424 mm->core_state = NULL;
414 mm->nr_ptes = 0; 425 mm->nr_ptes = 0;
415 set_mm_counter(mm, file_rss, 0); 426 set_mm_counter(mm, file_rss, 0);
416 set_mm_counter(mm, anon_rss, 0); 427 set_mm_counter(mm, anon_rss, 0);
417 spin_lock_init(&mm->page_table_lock); 428 spin_lock_init(&mm->page_table_lock);
418 rwlock_init(&mm->ioctx_list_lock); 429 spin_lock_init(&mm->ioctx_lock);
419 mm->ioctx_list = NULL; 430 INIT_HLIST_HEAD(&mm->ioctx_list);
420 mm->free_area_cache = TASK_UNMAPPED_BASE; 431 mm->free_area_cache = TASK_UNMAPPED_BASE;
421 mm->cached_hole_size = ~0UL; 432 mm->cached_hole_size = ~0UL;
422 mm_init_owner(mm, p); 433 mm_init_owner(mm, p);
@@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
758{ 769{
759 struct sighand_struct *sig; 770 struct sighand_struct *sig;
760 771
761 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { 772 if (clone_flags & CLONE_SIGHAND) {
762 atomic_inc(&current->sighand->count); 773 atomic_inc(&current->sighand->count);
763 return 0; 774 return 0;
764 } 775 }
@@ -1116,12 +1127,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1116 1127
1117 if (pid != &init_struct_pid) { 1128 if (pid != &init_struct_pid) {
1118 retval = -ENOMEM; 1129 retval = -ENOMEM;
1119 pid = alloc_pid(task_active_pid_ns(p)); 1130 pid = alloc_pid(p->nsproxy->pid_ns);
1120 if (!pid) 1131 if (!pid)
1121 goto bad_fork_cleanup_io; 1132 goto bad_fork_cleanup_io;
1122 1133
1123 if (clone_flags & CLONE_NEWPID) { 1134 if (clone_flags & CLONE_NEWPID) {
1124 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1135 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1125 if (retval < 0) 1136 if (retval < 0)
1126 goto bad_fork_free_pid; 1137 goto bad_fork_free_pid;
1127 } 1138 }
@@ -1471,12 +1482,10 @@ void __init proc_caches_init(void)
1471 fs_cachep = kmem_cache_create("fs_cache", 1482 fs_cachep = kmem_cache_create("fs_cache",
1472 sizeof(struct fs_struct), 0, 1483 sizeof(struct fs_struct), 0,
1473 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1484 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1474 vm_area_cachep = kmem_cache_create("vm_area_struct",
1475 sizeof(struct vm_area_struct), 0,
1476 SLAB_PANIC, NULL);
1477 mm_cachep = kmem_cache_create("mm_struct", 1485 mm_cachep = kmem_cache_create("mm_struct",
1478 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1486 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1479 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1487 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1488 mmap_init();
1480} 1489}
1481 1490
1482/* 1491/*
diff --git a/kernel/futex.c b/kernel/futex.c
index 4fe790e89d0f..002aa189eb09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -123,24 +124,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 125
125/* 126/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 127 * We hash on the keys returned from get_futex_key (see below).
145 */ 128 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 129static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +144,48 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 144 && key1->both.offset == key2->both.offset);
162} 145}
163 146
147/*
148 * Take a reference to the resource addressed by a key.
149 * Can be called while holding spinlocks.
150 *
151 */
152static void get_futex_key_refs(union futex_key *key)
153{
154 if (!key->both.ptr)
155 return;
156
157 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
158 case FUT_OFF_INODE:
159 atomic_inc(&key->shared.inode->i_count);
160 break;
161 case FUT_OFF_MMSHARED:
162 atomic_inc(&key->private.mm->mm_count);
163 break;
164 }
165}
166
167/*
168 * Drop a reference to the resource addressed by a key.
169 * The hash bucket spinlock must not be held.
170 */
171static void drop_futex_key_refs(union futex_key *key)
172{
173 if (!key->both.ptr) {
174 /* If we're here then we tried to put a key we failed to get */
175 WARN_ON_ONCE(1);
176 return;
177 }
178
179 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
180 case FUT_OFF_INODE:
181 iput(key->shared.inode);
182 break;
183 case FUT_OFF_MMSHARED:
184 mmdrop(key->private.mm);
185 break;
186 }
187}
188
164/** 189/**
165 * get_futex_key - Get parameters which are the keys for a futex. 190 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 191 * @uaddr: virtual address of the futex
@@ -179,12 +204,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 204 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 205 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 206 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 207static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 208{
185 unsigned long address = (unsigned long)uaddr; 209 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 210 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 211 struct page *page;
189 int err; 212 int err;
190 213
@@ -208,100 +231,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 231 return -EFAULT;
209 key->private.mm = mm; 232 key->private.mm = mm;
210 key->private.address = address; 233 key->private.address = address;
234 get_futex_key_refs(key);
211 return 0; 235 return 0;
212 } 236 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 237
221 /* 238again:
222 * Permissions. 239 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 240 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 241 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 242
243 lock_page(page);
244 if (!page->mapping) {
245 unlock_page(page);
246 put_page(page);
247 goto again;
248 }
226 249
227 /* 250 /*
228 * Private mappings are handled in a simple way. 251 * Private mappings are handled in a simple way.
229 * 252 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 253 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 254 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 255 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 256 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 257 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 258 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 259 key->private.mm = mm;
239 key->private.address = address; 260 key->private.address = address;
240 return 0; 261 } else {
241 } 262 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
242 263 key->shared.inode = page->mapping->host;
243 /* 264 key->shared.pgoff = page->index;
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 } 265 }
253 266
254 /* 267 get_futex_key_refs(key);
255 * We could walk the page table to read the non-linear
256 * pte, and get the page index without fetching the page
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269 268
270/* 269 unlock_page(page);
271 * Take a reference to the resource addressed by a key. 270 put_page(page);
272 * Can be called while holding spinlocks. 271 return 0;
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 272}
288 273
289/* 274static inline
290 * Drop a reference to the resource addressed by a key. 275void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 276{
295 if (!key->both.ptr) 277 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 278}
306 279
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 280static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +301,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 301
329/* 302/*
330 * Fault handling. 303 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 304 */
333static int futex_handle_fault(unsigned long address, 305static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 306{
336 struct vm_area_struct * vma; 307 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 308 struct mm_struct *mm = current->mm;
@@ -340,8 +311,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 311 if (attempt > 2)
341 return ret; 312 return ret;
342 313
343 if (!fshared) 314 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 315 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 316 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 317 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +331,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 331 current->min_flt++;
362 } 332 }
363 } 333 }
364 if (!fshared) 334 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 335 return ret;
367} 336}
368 337
@@ -385,6 +354,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 354 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 355 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 356 atomic_set(&pi_state->refcount, 1);
357 pi_state->key = FUTEX_KEY_INIT;
388 358
389 current->pi_state_cache = pi_state; 359 current->pi_state_cache = pi_state;
390 360
@@ -469,7 +439,7 @@ void exit_pi_state_list(struct task_struct *curr)
469 struct list_head *next, *head = &curr->pi_state_list; 439 struct list_head *next, *head = &curr->pi_state_list;
470 struct futex_pi_state *pi_state; 440 struct futex_pi_state *pi_state;
471 struct futex_hash_bucket *hb; 441 struct futex_hash_bucket *hb;
472 union futex_key key; 442 union futex_key key = FUTEX_KEY_INIT;
473 443
474 if (!futex_cmpxchg_enabled) 444 if (!futex_cmpxchg_enabled)
475 return; 445 return;
@@ -614,7 +584,7 @@ static void wake_futex(struct futex_q *q)
614 * The lock in wake_up_all() is a crucial memory barrier after the 584 * The lock in wake_up_all() is a crucial memory barrier after the
615 * plist_del() and also before assigning to q->lock_ptr. 585 * plist_del() and also before assigning to q->lock_ptr.
616 */ 586 */
617 wake_up_all(&q->waiters); 587 wake_up(&q->waiter);
618 /* 588 /*
619 * The waiting task can free the futex_q as soon as this is written, 589 * The waiting task can free the futex_q as soon as this is written,
620 * without taking any locks. This must come last. 590 * without taking any locks. This must come last.
@@ -726,20 +696,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
726 * Wake up all waiters hashed on the physical page that is mapped 696 * Wake up all waiters hashed on the physical page that is mapped
727 * to this virtual address: 697 * to this virtual address:
728 */ 698 */
729static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 699static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
730 int nr_wake, u32 bitset)
731{ 700{
732 struct futex_hash_bucket *hb; 701 struct futex_hash_bucket *hb;
733 struct futex_q *this, *next; 702 struct futex_q *this, *next;
734 struct plist_head *head; 703 struct plist_head *head;
735 union futex_key key; 704 union futex_key key = FUTEX_KEY_INIT;
736 int ret; 705 int ret;
737 706
738 if (!bitset) 707 if (!bitset)
739 return -EINVAL; 708 return -EINVAL;
740 709
741 futex_lock_mm(fshared);
742
743 ret = get_futex_key(uaddr, fshared, &key); 710 ret = get_futex_key(uaddr, fshared, &key);
744 if (unlikely(ret != 0)) 711 if (unlikely(ret != 0))
745 goto out; 712 goto out;
@@ -766,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
766 } 733 }
767 734
768 spin_unlock(&hb->lock); 735 spin_unlock(&hb->lock);
736 put_futex_key(fshared, &key);
769out: 737out:
770 futex_unlock_mm(fshared);
771 return ret; 738 return ret;
772} 739}
773 740
@@ -776,25 +743,22 @@ out:
776 * to this virtual address: 743 * to this virtual address:
777 */ 744 */
778static int 745static int
779futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 746futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
780 u32 __user *uaddr2,
781 int nr_wake, int nr_wake2, int op) 747 int nr_wake, int nr_wake2, int op)
782{ 748{
783 union futex_key key1, key2; 749 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
784 struct futex_hash_bucket *hb1, *hb2; 750 struct futex_hash_bucket *hb1, *hb2;
785 struct plist_head *head; 751 struct plist_head *head;
786 struct futex_q *this, *next; 752 struct futex_q *this, *next;
787 int ret, op_ret, attempt = 0; 753 int ret, op_ret, attempt = 0;
788 754
789retryfull: 755retryfull:
790 futex_lock_mm(fshared);
791
792 ret = get_futex_key(uaddr1, fshared, &key1); 756 ret = get_futex_key(uaddr1, fshared, &key1);
793 if (unlikely(ret != 0)) 757 if (unlikely(ret != 0))
794 goto out; 758 goto out;
795 ret = get_futex_key(uaddr2, fshared, &key2); 759 ret = get_futex_key(uaddr2, fshared, &key2);
796 if (unlikely(ret != 0)) 760 if (unlikely(ret != 0))
797 goto out; 761 goto out_put_key1;
798 762
799 hb1 = hash_futex(&key1); 763 hb1 = hash_futex(&key1);
800 hb2 = hash_futex(&key2); 764 hb2 = hash_futex(&key2);
@@ -816,12 +780,12 @@ retry:
816 * but we might get them from range checking 780 * but we might get them from range checking
817 */ 781 */
818 ret = op_ret; 782 ret = op_ret;
819 goto out; 783 goto out_put_keys;
820#endif 784#endif
821 785
822 if (unlikely(op_ret != -EFAULT)) { 786 if (unlikely(op_ret != -EFAULT)) {
823 ret = op_ret; 787 ret = op_ret;
824 goto out; 788 goto out_put_keys;
825 } 789 }
826 790
827 /* 791 /*
@@ -833,18 +797,12 @@ retry:
833 */ 797 */
834 if (attempt++) { 798 if (attempt++) {
835 ret = futex_handle_fault((unsigned long)uaddr2, 799 ret = futex_handle_fault((unsigned long)uaddr2,
836 fshared, attempt); 800 attempt);
837 if (ret) 801 if (ret)
838 goto out; 802 goto out_put_keys;
839 goto retry; 803 goto retry;
840 } 804 }
841 805
842 /*
843 * If we would have faulted, release mmap_sem,
844 * fault it in and start all over again.
845 */
846 futex_unlock_mm(fshared);
847
848 ret = get_user(dummy, uaddr2); 806 ret = get_user(dummy, uaddr2);
849 if (ret) 807 if (ret)
850 return ret; 808 return ret;
@@ -879,9 +837,11 @@ retry:
879 spin_unlock(&hb1->lock); 837 spin_unlock(&hb1->lock);
880 if (hb1 != hb2) 838 if (hb1 != hb2)
881 spin_unlock(&hb2->lock); 839 spin_unlock(&hb2->lock);
840out_put_keys:
841 put_futex_key(fshared, &key2);
842out_put_key1:
843 put_futex_key(fshared, &key1);
882out: 844out:
883 futex_unlock_mm(fshared);
884
885 return ret; 845 return ret;
886} 846}
887 847
@@ -889,25 +849,22 @@ out:
889 * Requeue all waiters hashed on one physical page to another 849 * Requeue all waiters hashed on one physical page to another
890 * physical page. 850 * physical page.
891 */ 851 */
892static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 852static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
893 u32 __user *uaddr2,
894 int nr_wake, int nr_requeue, u32 *cmpval) 853 int nr_wake, int nr_requeue, u32 *cmpval)
895{ 854{
896 union futex_key key1, key2; 855 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
897 struct futex_hash_bucket *hb1, *hb2; 856 struct futex_hash_bucket *hb1, *hb2;
898 struct plist_head *head1; 857 struct plist_head *head1;
899 struct futex_q *this, *next; 858 struct futex_q *this, *next;
900 int ret, drop_count = 0; 859 int ret, drop_count = 0;
901 860
902 retry: 861retry:
903 futex_lock_mm(fshared);
904
905 ret = get_futex_key(uaddr1, fshared, &key1); 862 ret = get_futex_key(uaddr1, fshared, &key1);
906 if (unlikely(ret != 0)) 863 if (unlikely(ret != 0))
907 goto out; 864 goto out;
908 ret = get_futex_key(uaddr2, fshared, &key2); 865 ret = get_futex_key(uaddr2, fshared, &key2);
909 if (unlikely(ret != 0)) 866 if (unlikely(ret != 0))
910 goto out; 867 goto out_put_key1;
911 868
912 hb1 = hash_futex(&key1); 869 hb1 = hash_futex(&key1);
913 hb2 = hash_futex(&key2); 870 hb2 = hash_futex(&key2);
@@ -924,18 +881,12 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
924 if (hb1 != hb2) 881 if (hb1 != hb2)
925 spin_unlock(&hb2->lock); 882 spin_unlock(&hb2->lock);
926 883
927 /*
928 * If we would have faulted, release mmap_sem, fault
929 * it in and start all over again.
930 */
931 futex_unlock_mm(fshared);
932
933 ret = get_user(curval, uaddr1); 884 ret = get_user(curval, uaddr1);
934 885
935 if (!ret) 886 if (!ret)
936 goto retry; 887 goto retry;
937 888
938 return ret; 889 goto out_put_keys;
939 } 890 }
940 if (curval != *cmpval) { 891 if (curval != *cmpval) {
941 ret = -EAGAIN; 892 ret = -EAGAIN;
@@ -980,8 +931,11 @@ out_unlock:
980 while (--drop_count >= 0) 931 while (--drop_count >= 0)
981 drop_futex_key_refs(&key1); 932 drop_futex_key_refs(&key1);
982 933
934out_put_keys:
935 put_futex_key(fshared, &key2);
936out_put_key1:
937 put_futex_key(fshared, &key1);
983out: 938out:
984 futex_unlock_mm(fshared);
985 return ret; 939 return ret;
986} 940}
987 941
@@ -990,7 +944,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
990{ 944{
991 struct futex_hash_bucket *hb; 945 struct futex_hash_bucket *hb;
992 946
993 init_waitqueue_head(&q->waiters); 947 init_waitqueue_head(&q->waiter);
994 948
995 get_futex_key_refs(&q->key); 949 get_futex_key_refs(&q->key);
996 hb = hash_futex(&q->key); 950 hb = hash_futex(&q->key);
@@ -1042,7 +996,7 @@ static int unqueue_me(struct futex_q *q)
1042 int ret = 0; 996 int ret = 0;
1043 997
1044 /* In the common case we don't take the spinlock, which is nice. */ 998 /* In the common case we don't take the spinlock, which is nice. */
1045 retry: 999retry:
1046 lock_ptr = q->lock_ptr; 1000 lock_ptr = q->lock_ptr;
1047 barrier(); 1001 barrier();
1048 if (lock_ptr != NULL) { 1002 if (lock_ptr != NULL) {
@@ -1103,8 +1057,7 @@ static void unqueue_me_pi(struct futex_q *q)
1103 * private futexes. 1057 * private futexes.
1104 */ 1058 */
1105static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1059static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1106 struct task_struct *newowner, 1060 struct task_struct *newowner, int fshared)
1107 struct rw_semaphore *fshared)
1108{ 1061{
1109 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1062 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1110 struct futex_pi_state *pi_state = q->pi_state; 1063 struct futex_pi_state *pi_state = q->pi_state;
@@ -1183,7 +1136,7 @@ retry:
1183handle_fault: 1136handle_fault:
1184 spin_unlock(q->lock_ptr); 1137 spin_unlock(q->lock_ptr);
1185 1138
1186 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1139 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1187 1140
1188 spin_lock(q->lock_ptr); 1141 spin_lock(q->lock_ptr);
1189 1142
@@ -1203,12 +1156,13 @@ handle_fault:
1203 * In case we must use restart_block to restart a futex_wait, 1156 * In case we must use restart_block to restart a futex_wait,
1204 * we encode in the 'flags' shared capability 1157 * we encode in the 'flags' shared capability
1205 */ 1158 */
1206#define FLAGS_SHARED 1 1159#define FLAGS_SHARED 0x01
1160#define FLAGS_CLOCKRT 0x02
1207 1161
1208static long futex_wait_restart(struct restart_block *restart); 1162static long futex_wait_restart(struct restart_block *restart);
1209 1163
1210static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1164static int futex_wait(u32 __user *uaddr, int fshared,
1211 u32 val, ktime_t *abs_time, u32 bitset) 1165 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1212{ 1166{
1213 struct task_struct *curr = current; 1167 struct task_struct *curr = current;
1214 DECLARE_WAITQUEUE(wait, curr); 1168 DECLARE_WAITQUEUE(wait, curr);
@@ -1224,12 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1224 1178
1225 q.pi_state = NULL; 1179 q.pi_state = NULL;
1226 q.bitset = bitset; 1180 q.bitset = bitset;
1227 retry: 1181retry:
1228 futex_lock_mm(fshared); 1182 q.key = FUTEX_KEY_INIT;
1229
1230 ret = get_futex_key(uaddr, fshared, &q.key); 1183 ret = get_futex_key(uaddr, fshared, &q.key);
1231 if (unlikely(ret != 0)) 1184 if (unlikely(ret != 0))
1232 goto out_release_sem; 1185 goto out;
1233 1186
1234 hb = queue_lock(&q); 1187 hb = queue_lock(&q);
1235 1188
@@ -1257,12 +1210,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1257 1210
1258 if (unlikely(ret)) { 1211 if (unlikely(ret)) {
1259 queue_unlock(&q, hb); 1212 queue_unlock(&q, hb);
1260 1213 put_futex_key(fshared, &q.key);
1261 /*
1262 * If we would have faulted, release mmap_sem, fault it in and
1263 * start all over again.
1264 */
1265 futex_unlock_mm(fshared);
1266 1214
1267 ret = get_user(uval, uaddr); 1215 ret = get_user(uval, uaddr);
1268 1216
@@ -1272,18 +1220,12 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1272 } 1220 }
1273 ret = -EWOULDBLOCK; 1221 ret = -EWOULDBLOCK;
1274 if (uval != val) 1222 if (uval != val)
1275 goto out_unlock_release_sem; 1223 goto out_unlock_put_key;
1276 1224
1277 /* Only actually queue if *uaddr contained val. */ 1225 /* Only actually queue if *uaddr contained val. */
1278 queue_me(&q, hb); 1226 queue_me(&q, hb);
1279 1227
1280 /* 1228 /*
1281 * Now the futex is queued and we have checked the data, we
1282 * don't want to hold mmap_sem while we sleep.
1283 */
1284 futex_unlock_mm(fshared);
1285
1286 /*
1287 * There might have been scheduling since the queue_me(), as we 1229 * There might have been scheduling since the queue_me(), as we
1288 * cannot hold a spinlock across the get_user() in case it 1230 * cannot hold a spinlock across the get_user() in case it
1289 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1231 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1294,7 +1236,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1294 1236
1295 /* add_wait_queue is the barrier after __set_current_state. */ 1237 /* add_wait_queue is the barrier after __set_current_state. */
1296 __set_current_state(TASK_INTERRUPTIBLE); 1238 __set_current_state(TASK_INTERRUPTIBLE);
1297 add_wait_queue(&q.waiters, &wait); 1239 add_wait_queue(&q.waiter, &wait);
1298 /* 1240 /*
1299 * !plist_node_empty() is safe here without any lock. 1241 * !plist_node_empty() is safe here without any lock.
1300 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1242 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1307,8 +1249,10 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1307 slack = current->timer_slack_ns; 1249 slack = current->timer_slack_ns;
1308 if (rt_task(current)) 1250 if (rt_task(current))
1309 slack = 0; 1251 slack = 0;
1310 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1252 hrtimer_init_on_stack(&t.timer,
1311 HRTIMER_MODE_ABS); 1253 clockrt ? CLOCK_REALTIME :
1254 CLOCK_MONOTONIC,
1255 HRTIMER_MODE_ABS);
1312 hrtimer_init_sleeper(&t, current); 1256 hrtimer_init_sleeper(&t, current);
1313 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1257 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1314 1258
@@ -1363,14 +1307,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 1307
1364 if (fshared) 1308 if (fshared)
1365 restart->futex.flags |= FLAGS_SHARED; 1309 restart->futex.flags |= FLAGS_SHARED;
1310 if (clockrt)
1311 restart->futex.flags |= FLAGS_CLOCKRT;
1366 return -ERESTART_RESTARTBLOCK; 1312 return -ERESTART_RESTARTBLOCK;
1367 } 1313 }
1368 1314
1369 out_unlock_release_sem: 1315out_unlock_put_key:
1370 queue_unlock(&q, hb); 1316 queue_unlock(&q, hb);
1317 put_futex_key(fshared, &q.key);
1371 1318
1372 out_release_sem: 1319out:
1373 futex_unlock_mm(fshared);
1374 return ret; 1320 return ret;
1375} 1321}
1376 1322
@@ -1378,15 +1324,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1378static long futex_wait_restart(struct restart_block *restart) 1324static long futex_wait_restart(struct restart_block *restart)
1379{ 1325{
1380 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1326 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1381 struct rw_semaphore *fshared = NULL; 1327 int fshared = 0;
1382 ktime_t t; 1328 ktime_t t;
1383 1329
1384 t.tv64 = restart->futex.time; 1330 t.tv64 = restart->futex.time;
1385 restart->fn = do_no_restart_syscall; 1331 restart->fn = do_no_restart_syscall;
1386 if (restart->futex.flags & FLAGS_SHARED) 1332 if (restart->futex.flags & FLAGS_SHARED)
1387 fshared = &current->mm->mmap_sem; 1333 fshared = 1;
1388 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1334 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1389 restart->futex.bitset); 1335 restart->futex.bitset,
1336 restart->futex.flags & FLAGS_CLOCKRT);
1390} 1337}
1391 1338
1392 1339
@@ -1396,7 +1343,7 @@ static long futex_wait_restart(struct restart_block *restart)
1396 * if there are waiters then it will block, it does PI, etc. (Due to 1343 * if there are waiters then it will block, it does PI, etc. (Due to
1397 * races the kernel might see a 0 value of the futex too.) 1344 * races the kernel might see a 0 value of the futex too.)
1398 */ 1345 */
1399static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1346static int futex_lock_pi(u32 __user *uaddr, int fshared,
1400 int detect, ktime_t *time, int trylock) 1347 int detect, ktime_t *time, int trylock)
1401{ 1348{
1402 struct hrtimer_sleeper timeout, *to = NULL; 1349 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1418,17 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1418 } 1365 }
1419 1366
1420 q.pi_state = NULL; 1367 q.pi_state = NULL;
1421 retry: 1368retry:
1422 futex_lock_mm(fshared); 1369 q.key = FUTEX_KEY_INIT;
1423
1424 ret = get_futex_key(uaddr, fshared, &q.key); 1370 ret = get_futex_key(uaddr, fshared, &q.key);
1425 if (unlikely(ret != 0)) 1371 if (unlikely(ret != 0))
1426 goto out_release_sem; 1372 goto out;
1427 1373
1428 retry_unlocked: 1374retry_unlocked:
1429 hb = queue_lock(&q); 1375 hb = queue_lock(&q);
1430 1376
1431 retry_locked: 1377retry_locked:
1432 ret = lock_taken = 0; 1378 ret = lock_taken = 0;
1433 1379
1434 /* 1380 /*
@@ -1449,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1449 */ 1395 */
1450 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { 1396 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1451 ret = -EDEADLK; 1397 ret = -EDEADLK;
1452 goto out_unlock_release_sem; 1398 goto out_unlock_put_key;
1453 } 1399 }
1454 1400
1455 /* 1401 /*
1456 * Surprise - we got the lock. Just return to userspace: 1402 * Surprise - we got the lock. Just return to userspace:
1457 */ 1403 */
1458 if (unlikely(!curval)) 1404 if (unlikely(!curval))
1459 goto out_unlock_release_sem; 1405 goto out_unlock_put_key;
1460 1406
1461 uval = curval; 1407 uval = curval;
1462 1408
@@ -1492,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1492 * We took the lock due to owner died take over. 1438 * We took the lock due to owner died take over.
1493 */ 1439 */
1494 if (unlikely(lock_taken)) 1440 if (unlikely(lock_taken))
1495 goto out_unlock_release_sem; 1441 goto out_unlock_put_key;
1496 1442
1497 /* 1443 /*
1498 * We dont have the lock. Look up the PI state (or create it if 1444 * We dont have the lock. Look up the PI state (or create it if
@@ -1509,7 +1455,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1509 * exit to complete. 1455 * exit to complete.
1510 */ 1456 */
1511 queue_unlock(&q, hb); 1457 queue_unlock(&q, hb);
1512 futex_unlock_mm(fshared);
1513 cond_resched(); 1458 cond_resched();
1514 goto retry; 1459 goto retry;
1515 1460
@@ -1532,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1532 goto retry_locked; 1477 goto retry_locked;
1533 } 1478 }
1534 default: 1479 default:
1535 goto out_unlock_release_sem; 1480 goto out_unlock_put_key;
1536 } 1481 }
1537 } 1482 }
1538 1483
@@ -1541,12 +1486,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1541 */ 1486 */
1542 queue_me(&q, hb); 1487 queue_me(&q, hb);
1543 1488
1544 /*
1545 * Now the futex is queued and we have checked the data, we
1546 * don't want to hold mmap_sem while we sleep.
1547 */
1548 futex_unlock_mm(fshared);
1549
1550 WARN_ON(!q.pi_state); 1489 WARN_ON(!q.pi_state);
1551 /* 1490 /*
1552 * Block on the PI mutex: 1491 * Block on the PI mutex:
@@ -1559,7 +1498,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1559 ret = ret ? 0 : -EWOULDBLOCK; 1498 ret = ret ? 0 : -EWOULDBLOCK;
1560 } 1499 }
1561 1500
1562 futex_lock_mm(fshared);
1563 spin_lock(q.lock_ptr); 1501 spin_lock(q.lock_ptr);
1564 1502
1565 if (!ret) { 1503 if (!ret) {
@@ -1625,44 +1563,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1625 1563
1626 /* Unqueue and drop the lock */ 1564 /* Unqueue and drop the lock */
1627 unqueue_me_pi(&q); 1565 unqueue_me_pi(&q);
1628 futex_unlock_mm(fshared);
1629 1566
1630 if (to) 1567 if (to)
1631 destroy_hrtimer_on_stack(&to->timer); 1568 destroy_hrtimer_on_stack(&to->timer);
1632 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1633 1570
1634 out_unlock_release_sem: 1571out_unlock_put_key:
1635 queue_unlock(&q, hb); 1572 queue_unlock(&q, hb);
1636 1573
1637 out_release_sem: 1574out_put_key:
1638 futex_unlock_mm(fshared); 1575 put_futex_key(fshared, &q.key);
1576out:
1639 if (to) 1577 if (to)
1640 destroy_hrtimer_on_stack(&to->timer); 1578 destroy_hrtimer_on_stack(&to->timer);
1641 return ret; 1579 return ret;
1642 1580
1643 uaddr_faulted: 1581uaddr_faulted:
1644 /* 1582 /*
1645 * We have to r/w *(int __user *)uaddr, but we can't modify it 1583 * We have to r/w *(int __user *)uaddr, and we have to modify it
1646 * non-atomically. Therefore, if get_user below is not 1584 * atomically. Therefore, if we continue to fault after get_user()
1647 * enough, we need to handle the fault ourselves, while 1585 * below, we need to handle the fault ourselves, while still holding
1648 * still holding the mmap_sem. 1586 * the mmap_sem. This can occur if the uaddr is under contention as
1649 * 1587 * we have to drop the mmap_sem in order to call get_user().
1650 * ... and hb->lock. :-) --ANK
1651 */ 1588 */
1652 queue_unlock(&q, hb); 1589 queue_unlock(&q, hb);
1653 1590
1654 if (attempt++) { 1591 if (attempt++) {
1655 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1592 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1656 attempt);
1657 if (ret) 1593 if (ret)
1658 goto out_release_sem; 1594 goto out_put_key;
1659 goto retry_unlocked; 1595 goto retry_unlocked;
1660 } 1596 }
1661 1597
1662 futex_unlock_mm(fshared);
1663
1664 ret = get_user(uval, uaddr); 1598 ret = get_user(uval, uaddr);
1665 if (!ret && (uval != -EFAULT)) 1599 if (!ret)
1666 goto retry; 1600 goto retry;
1667 1601
1668 if (to) 1602 if (to)
@@ -1675,13 +1609,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1675 * This is the in-kernel slowpath: we look up the PI state (if any), 1609 * This is the in-kernel slowpath: we look up the PI state (if any),
1676 * and do the rt-mutex unlock. 1610 * and do the rt-mutex unlock.
1677 */ 1611 */
1678static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1612static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1679{ 1613{
1680 struct futex_hash_bucket *hb; 1614 struct futex_hash_bucket *hb;
1681 struct futex_q *this, *next; 1615 struct futex_q *this, *next;
1682 u32 uval; 1616 u32 uval;
1683 struct plist_head *head; 1617 struct plist_head *head;
1684 union futex_key key; 1618 union futex_key key = FUTEX_KEY_INIT;
1685 int ret, attempt = 0; 1619 int ret, attempt = 0;
1686 1620
1687retry: 1621retry:
@@ -1692,10 +1626,6 @@ retry:
1692 */ 1626 */
1693 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1627 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1694 return -EPERM; 1628 return -EPERM;
1695 /*
1696 * First take all the futex related locks:
1697 */
1698 futex_lock_mm(fshared);
1699 1629
1700 ret = get_futex_key(uaddr, fshared, &key); 1630 ret = get_futex_key(uaddr, fshared, &key);
1701 if (unlikely(ret != 0)) 1631 if (unlikely(ret != 0))
@@ -1753,35 +1683,31 @@ retry_unlocked:
1753 1683
1754out_unlock: 1684out_unlock:
1755 spin_unlock(&hb->lock); 1685 spin_unlock(&hb->lock);
1756out: 1686 put_futex_key(fshared, &key);
1757 futex_unlock_mm(fshared);
1758 1687
1688out:
1759 return ret; 1689 return ret;
1760 1690
1761pi_faulted: 1691pi_faulted:
1762 /* 1692 /*
1763 * We have to r/w *(int __user *)uaddr, but we can't modify it 1693 * We have to r/w *(int __user *)uaddr, and we have to modify it
1764 * non-atomically. Therefore, if get_user below is not 1694 * atomically. Therefore, if we continue to fault after get_user()
1765 * enough, we need to handle the fault ourselves, while 1695 * below, we need to handle the fault ourselves, while still holding
1766 * still holding the mmap_sem. 1696 * the mmap_sem. This can occur if the uaddr is under contention as
1767 * 1697 * we have to drop the mmap_sem in order to call get_user().
1768 * ... and hb->lock. --ANK
1769 */ 1698 */
1770 spin_unlock(&hb->lock); 1699 spin_unlock(&hb->lock);
1771 1700
1772 if (attempt++) { 1701 if (attempt++) {
1773 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1702 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1774 attempt);
1775 if (ret) 1703 if (ret)
1776 goto out; 1704 goto out;
1777 uval = 0; 1705 uval = 0;
1778 goto retry_unlocked; 1706 goto retry_unlocked;
1779 } 1707 }
1780 1708
1781 futex_unlock_mm(fshared);
1782
1783 ret = get_user(uval, uaddr); 1709 ret = get_user(uval, uaddr);
1784 if (!ret && (uval != -EFAULT)) 1710 if (!ret)
1785 goto retry; 1711 goto retry;
1786 1712
1787 return ret; 1713 return ret;
@@ -1908,8 +1834,7 @@ retry:
1908 * PI futexes happens in exit_pi_state(): 1834 * PI futexes happens in exit_pi_state():
1909 */ 1835 */
1910 if (!pi && (uval & FUTEX_WAITERS)) 1836 if (!pi && (uval & FUTEX_WAITERS))
1911 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1837 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1912 FUTEX_BITSET_MATCH_ANY);
1913 } 1838 }
1914 return 0; 1839 return 0;
1915} 1840}
@@ -2003,18 +1928,22 @@ void exit_robust_list(struct task_struct *curr)
2003long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1928long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2004 u32 __user *uaddr2, u32 val2, u32 val3) 1929 u32 __user *uaddr2, u32 val2, u32 val3)
2005{ 1930{
2006 int ret = -ENOSYS; 1931 int clockrt, ret = -ENOSYS;
2007 int cmd = op & FUTEX_CMD_MASK; 1932 int cmd = op & FUTEX_CMD_MASK;
2008 struct rw_semaphore *fshared = NULL; 1933 int fshared = 0;
2009 1934
2010 if (!(op & FUTEX_PRIVATE_FLAG)) 1935 if (!(op & FUTEX_PRIVATE_FLAG))
2011 fshared = &current->mm->mmap_sem; 1936 fshared = 1;
1937
1938 clockrt = op & FUTEX_CLOCK_REALTIME;
1939 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1940 return -ENOSYS;
2012 1941
2013 switch (cmd) { 1942 switch (cmd) {
2014 case FUTEX_WAIT: 1943 case FUTEX_WAIT:
2015 val3 = FUTEX_BITSET_MATCH_ANY; 1944 val3 = FUTEX_BITSET_MATCH_ANY;
2016 case FUTEX_WAIT_BITSET: 1945 case FUTEX_WAIT_BITSET:
2017 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1946 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
2018 break; 1947 break;
2019 case FUTEX_WAKE: 1948 case FUTEX_WAKE:
2020 val3 = FUTEX_BITSET_MATCH_ANY; 1949 val3 = FUTEX_BITSET_MATCH_ANY;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 47e63349d1b2..1455b7651b6b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,6 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
36#include <linux/module.h> 35#include <linux/module.h>
37#include <linux/percpu.h> 36#include <linux/percpu.h>
38#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
@@ -442,22 +441,6 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
442static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 441static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
443#endif 442#endif
444 443
445/*
446 * Check, whether the timer is on the callback pending list
447 */
448static inline int hrtimer_cb_pending(const struct hrtimer *timer)
449{
450 return timer->state & HRTIMER_STATE_PENDING;
451}
452
453/*
454 * Remove a timer from the callback pending list
455 */
456static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
457{
458 list_del_init(&timer->cb_entry);
459}
460
461/* High resolution timer related functions */ 444/* High resolution timer related functions */
462#ifdef CONFIG_HIGH_RES_TIMERS 445#ifdef CONFIG_HIGH_RES_TIMERS
463 446
@@ -651,6 +634,7 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651{ 634{
652} 635}
653 636
637
654/* 638/*
655 * When High resolution timers are active, try to reprogram. Note, that in case 639 * When High resolution timers are active, try to reprogram. Note, that in case
656 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry 640 * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
@@ -661,31 +645,10 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
661 struct hrtimer_clock_base *base) 645 struct hrtimer_clock_base *base)
662{ 646{
663 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
664 648 spin_unlock(&base->cpu_base->lock);
665 /* Timer is expired, act upon the callback mode */ 649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
666 switch(timer->cb_mode) { 650 spin_lock(&base->cpu_base->lock);
667 case HRTIMER_CB_IRQSAFE_PERCPU: 651 return 1;
668 case HRTIMER_CB_IRQSAFE_UNLOCKED:
669 /*
670 * This is solely for the sched tick emulation with
671 * dynamic tick support to ensure that we do not
672 * restart the tick right on the edge and end up with
673 * the tick timer in the softirq ! The calling site
674 * takes care of this. Also used for hrtimer sleeper !
675 */
676 debug_hrtimer_deactivate(timer);
677 return 1;
678 case HRTIMER_CB_SOFTIRQ:
679 /*
680 * Move everything else into the softirq pending list !
681 */
682 list_add_tail(&timer->cb_entry,
683 &base->cpu_base->cb_pending);
684 timer->state = HRTIMER_STATE_PENDING;
685 return 1;
686 default:
687 BUG();
688 }
689 } 652 }
690 return 0; 653 return 0;
691} 654}
@@ -724,11 +687,6 @@ static int hrtimer_switch_to_hres(void)
724 return 1; 687 return 1;
725} 688}
726 689
727static inline void hrtimer_raise_softirq(void)
728{
729 raise_softirq(HRTIMER_SOFTIRQ);
730}
731
732#else 690#else
733 691
734static inline int hrtimer_hres_active(void) { return 0; } 692static inline int hrtimer_hres_active(void) { return 0; }
@@ -742,12 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
742} 700}
743static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 701static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
744static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 702static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
745static inline int hrtimer_reprogram(struct hrtimer *timer,
746 struct hrtimer_clock_base *base)
747{
748 return 0;
749}
750static inline void hrtimer_raise_softirq(void) { }
751 703
752#endif /* CONFIG_HIGH_RES_TIMERS */ 704#endif /* CONFIG_HIGH_RES_TIMERS */
753 705
@@ -818,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
818 * 770 *
819 * The timer is inserted in expiry order. Insertion into the 771 * The timer is inserted in expiry order. Insertion into the
820 * red black tree is O(log(n)). Must hold the base lock. 772 * red black tree is O(log(n)). Must hold the base lock.
773 *
774 * Returns 1 when the new timer is the leftmost timer in the tree.
821 */ 775 */
822static void enqueue_hrtimer(struct hrtimer *timer, 776static int enqueue_hrtimer(struct hrtimer *timer,
823 struct hrtimer_clock_base *base, int reprogram) 777 struct hrtimer_clock_base *base)
824{ 778{
825 struct rb_node **link = &base->active.rb_node; 779 struct rb_node **link = &base->active.rb_node;
826 struct rb_node *parent = NULL; 780 struct rb_node *parent = NULL;
@@ -852,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
852 * Insert the timer to the rbtree and check whether it 806 * Insert the timer to the rbtree and check whether it
853 * replaces the first pending timer 807 * replaces the first pending timer
854 */ 808 */
855 if (leftmost) { 809 if (leftmost)
856 /*
857 * Reprogram the clock event device. When the timer is already
858 * expired hrtimer_enqueue_reprogram has either called the
859 * callback or added it to the pending list and raised the
860 * softirq.
861 *
862 * This is a NOP for !HIGHRES
863 */
864 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
865 return;
866
867 base->first = &timer->node; 810 base->first = &timer->node;
868 }
869 811
870 rb_link_node(&timer->node, parent, link); 812 rb_link_node(&timer->node, parent, link);
871 rb_insert_color(&timer->node, &base->active); 813 rb_insert_color(&timer->node, &base->active);
@@ -874,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
874 * state of a possibly running callback. 816 * state of a possibly running callback.
875 */ 817 */
876 timer->state |= HRTIMER_STATE_ENQUEUED; 818 timer->state |= HRTIMER_STATE_ENQUEUED;
819
820 return leftmost;
877} 821}
878 822
879/* 823/*
@@ -890,10 +834,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
890 struct hrtimer_clock_base *base, 834 struct hrtimer_clock_base *base,
891 unsigned long newstate, int reprogram) 835 unsigned long newstate, int reprogram)
892{ 836{
893 /* High res. callback list. NOP for !HIGHRES */ 837 if (timer->state & HRTIMER_STATE_ENQUEUED) {
894 if (hrtimer_cb_pending(timer))
895 hrtimer_remove_cb_pending(timer);
896 else {
897 /* 838 /*
898 * Remove the timer from the rbtree and replace the 839 * Remove the timer from the rbtree and replace the
899 * first entry pointer if necessary. 840 * first entry pointer if necessary.
@@ -953,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
953{ 894{
954 struct hrtimer_clock_base *base, *new_base; 895 struct hrtimer_clock_base *base, *new_base;
955 unsigned long flags; 896 unsigned long flags;
956 int ret, raise; 897 int ret, leftmost;
957 898
958 base = lock_hrtimer_base(timer, &flags); 899 base = lock_hrtimer_base(timer, &flags);
959 900
@@ -981,33 +922,19 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
981 922
982 timer_stats_hrtimer_set_start_info(timer); 923 timer_stats_hrtimer_set_start_info(timer);
983 924
925 leftmost = enqueue_hrtimer(timer, new_base);
926
984 /* 927 /*
985 * Only allow reprogramming if the new base is on this CPU. 928 * Only allow reprogramming if the new base is on this CPU.
986 * (it might still be on another CPU if the timer was pending) 929 * (it might still be on another CPU if the timer was pending)
930 *
931 * XXX send_remote_softirq() ?
987 */ 932 */
988 enqueue_hrtimer(timer, new_base, 933 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
989 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 934 hrtimer_enqueue_reprogram(timer, new_base);
990
991 /*
992 * The timer may be expired and moved to the cb_pending
993 * list. We can not raise the softirq with base lock held due
994 * to a possible deadlock with runqueue lock.
995 */
996 raise = timer->state == HRTIMER_STATE_PENDING;
997
998 /*
999 * We use preempt_disable to prevent this task from migrating after
1000 * setting up the softirq and raising it. Otherwise, if me migrate
1001 * we will raise the softirq on the wrong CPU.
1002 */
1003 preempt_disable();
1004 935
1005 unlock_hrtimer_base(timer, &flags); 936 unlock_hrtimer_base(timer, &flags);
1006 937
1007 if (raise)
1008 hrtimer_raise_softirq();
1009 preempt_enable();
1010
1011 return ret; 938 return ret;
1012} 939}
1013EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 940EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
@@ -1192,75 +1119,6 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1192} 1119}
1193EXPORT_SYMBOL_GPL(hrtimer_get_res); 1120EXPORT_SYMBOL_GPL(hrtimer_get_res);
1194 1121
1195static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1196{
1197 spin_lock_irq(&cpu_base->lock);
1198
1199 while (!list_empty(&cpu_base->cb_pending)) {
1200 enum hrtimer_restart (*fn)(struct hrtimer *);
1201 struct hrtimer *timer;
1202 int restart;
1203 int emulate_hardirq_ctx = 0;
1204
1205 timer = list_entry(cpu_base->cb_pending.next,
1206 struct hrtimer, cb_entry);
1207
1208 debug_hrtimer_deactivate(timer);
1209 timer_stats_account_hrtimer(timer);
1210
1211 fn = timer->function;
1212 /*
1213 * A timer might have been added to the cb_pending list
1214 * when it was migrated during a cpu-offline operation.
1215 * Emulate hardirq context for such timers.
1216 */
1217 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1218 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED)
1219 emulate_hardirq_ctx = 1;
1220
1221 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1222 spin_unlock_irq(&cpu_base->lock);
1223
1224 if (unlikely(emulate_hardirq_ctx)) {
1225 local_irq_disable();
1226 restart = fn(timer);
1227 local_irq_enable();
1228 } else
1229 restart = fn(timer);
1230
1231 spin_lock_irq(&cpu_base->lock);
1232
1233 timer->state &= ~HRTIMER_STATE_CALLBACK;
1234 if (restart == HRTIMER_RESTART) {
1235 BUG_ON(hrtimer_active(timer));
1236 /*
1237 * Enqueue the timer, allow reprogramming of the event
1238 * device
1239 */
1240 enqueue_hrtimer(timer, timer->base, 1);
1241 } else if (hrtimer_active(timer)) {
1242 /*
1243 * If the timer was rearmed on another CPU, reprogram
1244 * the event device.
1245 */
1246 struct hrtimer_clock_base *base = timer->base;
1247
1248 if (base->first == &timer->node &&
1249 hrtimer_reprogram(timer, base)) {
1250 /*
1251 * Timer is expired. Thus move it from tree to
1252 * pending list again.
1253 */
1254 __remove_hrtimer(timer, base,
1255 HRTIMER_STATE_PENDING, 0);
1256 list_add_tail(&timer->cb_entry,
1257 &base->cpu_base->cb_pending);
1258 }
1259 }
1260 }
1261 spin_unlock_irq(&cpu_base->lock);
1262}
1263
1264static void __run_hrtimer(struct hrtimer *timer) 1122static void __run_hrtimer(struct hrtimer *timer)
1265{ 1123{
1266 struct hrtimer_clock_base *base = timer->base; 1124 struct hrtimer_clock_base *base = timer->base;
@@ -1268,34 +1126,30 @@ static void __run_hrtimer(struct hrtimer *timer)
1268 enum hrtimer_restart (*fn)(struct hrtimer *); 1126 enum hrtimer_restart (*fn)(struct hrtimer *);
1269 int restart; 1127 int restart;
1270 1128
1129 WARN_ON(!irqs_disabled());
1130
1271 debug_hrtimer_deactivate(timer); 1131 debug_hrtimer_deactivate(timer);
1272 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1132 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1273 timer_stats_account_hrtimer(timer); 1133 timer_stats_account_hrtimer(timer);
1274
1275 fn = timer->function; 1134 fn = timer->function;
1276 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1277 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1278 /*
1279 * Used for scheduler timers, avoid lock inversion with
1280 * rq->lock and tasklist_lock.
1281 *
1282 * These timers are required to deal with enqueue expiry
1283 * themselves and are not allowed to migrate.
1284 */
1285 spin_unlock(&cpu_base->lock);
1286 restart = fn(timer);
1287 spin_lock(&cpu_base->lock);
1288 } else
1289 restart = fn(timer);
1290 1135
1291 /* 1136 /*
1292 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1137 * Because we run timers from hardirq context, there is no chance
1293 * reprogramming of the event hardware. This happens at the end of this 1138 * they get migrated to another cpu, therefore its safe to unlock
1294 * function anyway. 1139 * the timer base.
1140 */
1141 spin_unlock(&cpu_base->lock);
1142 restart = fn(timer);
1143 spin_lock(&cpu_base->lock);
1144
1145 /*
1146 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
1147 * we do not reprogramm the event hardware. Happens either in
1148 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1295 */ 1149 */
1296 if (restart != HRTIMER_NORESTART) { 1150 if (restart != HRTIMER_NORESTART) {
1297 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1151 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1298 enqueue_hrtimer(timer, base, 0); 1152 enqueue_hrtimer(timer, base);
1299 } 1153 }
1300 timer->state &= ~HRTIMER_STATE_CALLBACK; 1154 timer->state &= ~HRTIMER_STATE_CALLBACK;
1301} 1155}
@@ -1311,7 +1165,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1311 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1165 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1312 struct hrtimer_clock_base *base; 1166 struct hrtimer_clock_base *base;
1313 ktime_t expires_next, now; 1167 ktime_t expires_next, now;
1314 int i, raise = 0; 1168 int i;
1315 1169
1316 BUG_ON(!cpu_base->hres_active); 1170 BUG_ON(!cpu_base->hres_active);
1317 cpu_base->nr_events++; 1171 cpu_base->nr_events++;
@@ -1360,16 +1214,6 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1360 break; 1214 break;
1361 } 1215 }
1362 1216
1363 /* Move softirq callbacks to the pending list */
1364 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1365 __remove_hrtimer(timer, base,
1366 HRTIMER_STATE_PENDING, 0);
1367 list_add_tail(&timer->cb_entry,
1368 &base->cpu_base->cb_pending);
1369 raise = 1;
1370 continue;
1371 }
1372
1373 __run_hrtimer(timer); 1217 __run_hrtimer(timer);
1374 } 1218 }
1375 spin_unlock(&cpu_base->lock); 1219 spin_unlock(&cpu_base->lock);
@@ -1383,10 +1227,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1383 if (tick_program_event(expires_next, 0)) 1227 if (tick_program_event(expires_next, 0))
1384 goto retry; 1228 goto retry;
1385 } 1229 }
1230}
1231
1232/*
1233 * local version of hrtimer_peek_ahead_timers() called with interrupts
1234 * disabled.
1235 */
1236static void __hrtimer_peek_ahead_timers(void)
1237{
1238 struct tick_device *td;
1239
1240 if (!hrtimer_hres_active())
1241 return;
1386 1242
1387 /* Raise softirq ? */ 1243 td = &__get_cpu_var(tick_cpu_device);
1388 if (raise) 1244 if (td && td->evtdev)
1389 raise_softirq(HRTIMER_SOFTIRQ); 1245 hrtimer_interrupt(td->evtdev);
1390} 1246}
1391 1247
1392/** 1248/**
@@ -1400,25 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1400 */ 1256 */
1401void hrtimer_peek_ahead_timers(void) 1257void hrtimer_peek_ahead_timers(void)
1402{ 1258{
1403 struct tick_device *td;
1404 unsigned long flags; 1259 unsigned long flags;
1405 1260
1406 if (!hrtimer_hres_active())
1407 return;
1408
1409 local_irq_save(flags); 1261 local_irq_save(flags);
1410 td = &__get_cpu_var(tick_cpu_device); 1262 __hrtimer_peek_ahead_timers();
1411 if (td && td->evtdev)
1412 hrtimer_interrupt(td->evtdev);
1413 local_irq_restore(flags); 1263 local_irq_restore(flags);
1414} 1264}
1415 1265
1416static void run_hrtimer_softirq(struct softirq_action *h) 1266static void run_hrtimer_softirq(struct softirq_action *h)
1417{ 1267{
1418 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); 1268 hrtimer_peek_ahead_timers();
1419} 1269}
1420 1270
1421#endif /* CONFIG_HIGH_RES_TIMERS */ 1271#else /* CONFIG_HIGH_RES_TIMERS */
1272
1273static inline void __hrtimer_peek_ahead_timers(void) { }
1274
1275#endif /* !CONFIG_HIGH_RES_TIMERS */
1422 1276
1423/* 1277/*
1424 * Called from timer softirq every jiffy, expire hrtimers: 1278 * Called from timer softirq every jiffy, expire hrtimers:
@@ -1429,8 +1283,6 @@ static void run_hrtimer_softirq(struct softirq_action *h)
1429 */ 1283 */
1430void hrtimer_run_pending(void) 1284void hrtimer_run_pending(void)
1431{ 1285{
1432 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1433
1434 if (hrtimer_hres_active()) 1286 if (hrtimer_hres_active())
1435 return; 1287 return;
1436 1288
@@ -1444,8 +1296,6 @@ void hrtimer_run_pending(void)
1444 */ 1296 */
1445 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) 1297 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1446 hrtimer_switch_to_hres(); 1298 hrtimer_switch_to_hres();
1447
1448 run_hrtimer_pending(cpu_base);
1449} 1299}
1450 1300
1451/* 1301/*
@@ -1482,14 +1332,6 @@ void hrtimer_run_queues(void)
1482 hrtimer_get_expires_tv64(timer)) 1332 hrtimer_get_expires_tv64(timer))
1483 break; 1333 break;
1484 1334
1485 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1486 __remove_hrtimer(timer, base,
1487 HRTIMER_STATE_PENDING, 0);
1488 list_add_tail(&timer->cb_entry,
1489 &base->cpu_base->cb_pending);
1490 continue;
1491 }
1492
1493 __run_hrtimer(timer); 1335 __run_hrtimer(timer);
1494 } 1336 }
1495 spin_unlock(&cpu_base->lock); 1337 spin_unlock(&cpu_base->lock);
@@ -1516,9 +1358,6 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1516{ 1358{
1517 sl->timer.function = hrtimer_wakeup; 1359 sl->timer.function = hrtimer_wakeup;
1518 sl->task = task; 1360 sl->task = task;
1519#ifdef CONFIG_HIGH_RES_TIMERS
1520 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1521#endif
1522} 1361}
1523 1362
1524static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1363static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
@@ -1655,18 +1494,16 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1494 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1656 cpu_base->clock_base[i].cpu_base = cpu_base; 1495 cpu_base->clock_base[i].cpu_base = cpu_base;
1657 1496
1658 INIT_LIST_HEAD(&cpu_base->cb_pending);
1659 hrtimer_init_hres(cpu_base); 1497 hrtimer_init_hres(cpu_base);
1660} 1498}
1661 1499
1662#ifdef CONFIG_HOTPLUG_CPU 1500#ifdef CONFIG_HOTPLUG_CPU
1663 1501
1664static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1502static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1665 struct hrtimer_clock_base *new_base, int dcpu) 1503 struct hrtimer_clock_base *new_base)
1666{ 1504{
1667 struct hrtimer *timer; 1505 struct hrtimer *timer;
1668 struct rb_node *node; 1506 struct rb_node *node;
1669 int raise = 0;
1670 1507
1671 while ((node = rb_first(&old_base->active))) { 1508 while ((node = rb_first(&old_base->active))) {
1672 timer = rb_entry(node, struct hrtimer, node); 1509 timer = rb_entry(node, struct hrtimer, node);
@@ -1674,18 +1511,6 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1674 debug_hrtimer_deactivate(timer); 1511 debug_hrtimer_deactivate(timer);
1675 1512
1676 /* 1513 /*
1677 * Should not happen. Per CPU timers should be
1678 * canceled _before_ the migration code is called
1679 */
1680 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1681 __remove_hrtimer(timer, old_base,
1682 HRTIMER_STATE_INACTIVE, 0);
1683 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1684 timer, timer->function, dcpu);
1685 continue;
1686 }
1687
1688 /*
1689 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1514 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1690 * timer could be seen as !active and just vanish away 1515 * timer could be seen as !active and just vanish away
1691 * under us on another CPU 1516 * under us on another CPU
@@ -1693,112 +1518,73 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1693 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1518 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1694 timer->base = new_base; 1519 timer->base = new_base;
1695 /* 1520 /*
1696 * Enqueue the timer. Allow reprogramming of the event device 1521 * Enqueue the timers on the new cpu. This does not
1522 * reprogram the event device in case the timer
1523 * expires before the earliest on this CPU, but we run
1524 * hrtimer_interrupt after we migrated everything to
1525 * sort out already expired timers and reprogram the
1526 * event device.
1697 */ 1527 */
1698 enqueue_hrtimer(timer, new_base, 1); 1528 enqueue_hrtimer(timer, new_base);
1699 1529
1700#ifdef CONFIG_HIGH_RES_TIMERS
1701 /*
1702 * Happens with high res enabled when the timer was
1703 * already expired and the callback mode is
1704 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1705 * enqueue code does not move them to the soft irq
1706 * pending list for performance/latency reasons, but
1707 * in the migration state, we need to do that
1708 * otherwise we end up with a stale timer.
1709 */
1710 if (timer->state == HRTIMER_STATE_MIGRATE) {
1711 timer->state = HRTIMER_STATE_PENDING;
1712 list_add_tail(&timer->cb_entry,
1713 &new_base->cpu_base->cb_pending);
1714 raise = 1;
1715 }
1716#endif
1717 /* Clear the migration state bit */ 1530 /* Clear the migration state bit */
1718 timer->state &= ~HRTIMER_STATE_MIGRATE; 1531 timer->state &= ~HRTIMER_STATE_MIGRATE;
1719 } 1532 }
1720 return raise;
1721}
1722
1723#ifdef CONFIG_HIGH_RES_TIMERS
1724static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1725 struct hrtimer_cpu_base *new_base)
1726{
1727 struct hrtimer *timer;
1728 int raise = 0;
1729
1730 while (!list_empty(&old_base->cb_pending)) {
1731 timer = list_entry(old_base->cb_pending.next,
1732 struct hrtimer, cb_entry);
1733
1734 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1735 timer->base = &new_base->clock_base[timer->base->index];
1736 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1737 raise = 1;
1738 }
1739 return raise;
1740}
1741#else
1742static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1743 struct hrtimer_cpu_base *new_base)
1744{
1745 return 0;
1746} 1533}
1747#endif
1748 1534
1749static void migrate_hrtimers(int cpu) 1535static void migrate_hrtimers(int scpu)
1750{ 1536{
1751 struct hrtimer_cpu_base *old_base, *new_base; 1537 struct hrtimer_cpu_base *old_base, *new_base;
1752 int i, raise = 0; 1538 int i;
1753 1539
1754 BUG_ON(cpu_online(cpu)); 1540 BUG_ON(cpu_online(scpu));
1755 old_base = &per_cpu(hrtimer_bases, cpu); 1541 tick_cancel_sched_timer(scpu);
1756 new_base = &get_cpu_var(hrtimer_bases);
1757 1542
1758 tick_cancel_sched_timer(cpu); 1543 local_irq_disable();
1544 old_base = &per_cpu(hrtimer_bases, scpu);
1545 new_base = &__get_cpu_var(hrtimer_bases);
1759 /* 1546 /*
1760 * The caller is globally serialized and nobody else 1547 * The caller is globally serialized and nobody else
1761 * takes two locks at once, deadlock is not possible. 1548 * takes two locks at once, deadlock is not possible.
1762 */ 1549 */
1763 spin_lock_irq(&new_base->lock); 1550 spin_lock(&new_base->lock);
1764 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1551 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1765 1552
1766 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1553 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1767 if (migrate_hrtimer_list(&old_base->clock_base[i], 1554 migrate_hrtimer_list(&old_base->clock_base[i],
1768 &new_base->clock_base[i], cpu)) 1555 &new_base->clock_base[i]);
1769 raise = 1;
1770 } 1556 }
1771 1557
1772 if (migrate_hrtimer_pending(old_base, new_base))
1773 raise = 1;
1774
1775 spin_unlock(&old_base->lock); 1558 spin_unlock(&old_base->lock);
1776 spin_unlock_irq(&new_base->lock); 1559 spin_unlock(&new_base->lock);
1777 put_cpu_var(hrtimer_bases);
1778 1560
1779 if (raise) 1561 /* Check, if we got expired work to do */
1780 hrtimer_raise_softirq(); 1562 __hrtimer_peek_ahead_timers();
1563 local_irq_enable();
1781} 1564}
1565
1782#endif /* CONFIG_HOTPLUG_CPU */ 1566#endif /* CONFIG_HOTPLUG_CPU */
1783 1567
1784static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, 1568static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1785 unsigned long action, void *hcpu) 1569 unsigned long action, void *hcpu)
1786{ 1570{
1787 unsigned int cpu = (long)hcpu; 1571 int scpu = (long)hcpu;
1788 1572
1789 switch (action) { 1573 switch (action) {
1790 1574
1791 case CPU_UP_PREPARE: 1575 case CPU_UP_PREPARE:
1792 case CPU_UP_PREPARE_FROZEN: 1576 case CPU_UP_PREPARE_FROZEN:
1793 init_hrtimers_cpu(cpu); 1577 init_hrtimers_cpu(scpu);
1794 break; 1578 break;
1795 1579
1796#ifdef CONFIG_HOTPLUG_CPU 1580#ifdef CONFIG_HOTPLUG_CPU
1797 case CPU_DEAD: 1581 case CPU_DEAD:
1798 case CPU_DEAD_FROZEN: 1582 case CPU_DEAD_FROZEN:
1799 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1583 {
1800 migrate_hrtimers(cpu); 1584 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1585 migrate_hrtimers(scpu);
1801 break; 1586 break;
1587 }
1802#endif 1588#endif
1803 1589
1804 default: 1590 default:
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 681c52dbfe22..4dd5b1edac98 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..1de9700f416e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/async.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
34 unsigned int status; 35 unsigned int status;
35 int i; 36 int i;
36 37
38 /*
39 * quiesce the kernel, or at least the asynchronous portion
40 */
41 async_synchronize_full();
37 mutex_lock(&probing_active); 42 mutex_lock(&probing_active);
38 /* 43 /*
39 * something may have generated an irq long ago and we want to 44 * something may have generated an irq long ago and we want to
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 10b5092e9bfe..f63c706d25e1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,9 +24,10 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc = irq_to_desc(irq); 27 struct irq_desc *desc;
28 unsigned long flags; 28 unsigned long flags;
29 29
30 desc = irq_to_desc(irq);
30 if (!desc) { 31 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 32 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 33 return;
@@ -45,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
45 desc->irq_count = 0; 46 desc->irq_count = 0;
46 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
47#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
48 cpus_setall(desc->affinity); 49 cpumask_setall(&desc->affinity);
49#endif 50#endif
50 spin_unlock_irqrestore(&desc->lock, flags); 51 spin_unlock_irqrestore(&desc->lock, flags);
51} 52}
@@ -124,6 +125,7 @@ int set_irq_type(unsigned int irq, unsigned int type)
124 return -ENODEV; 125 return -ENODEV;
125 } 126 }
126 127
128 type &= IRQ_TYPE_SENSE_MASK;
127 if (type == IRQ_TYPE_NONE) 129 if (type == IRQ_TYPE_NONE)
128 return 0; 130 return 0;
129 131
@@ -352,6 +354,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
352 354
353 spin_lock(&desc->lock); 355 spin_lock(&desc->lock);
354 mask_ack_irq(desc, irq); 356 mask_ack_irq(desc, irq);
357 desc = irq_remap_to_desc(irq, desc);
355 358
356 if (unlikely(desc->status & IRQ_INPROGRESS)) 359 if (unlikely(desc->status & IRQ_INPROGRESS))
357 goto out_unlock; 360 goto out_unlock;
@@ -429,6 +432,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
429 desc->status &= ~IRQ_INPROGRESS; 432 desc->status &= ~IRQ_INPROGRESS;
430out: 433out:
431 desc->chip->eoi(irq); 434 desc->chip->eoi(irq);
435 desc = irq_remap_to_desc(irq, desc);
432 436
433 spin_unlock(&desc->lock); 437 spin_unlock(&desc->lock);
434} 438}
@@ -465,12 +469,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
465 !desc->action)) { 469 !desc->action)) {
466 desc->status |= (IRQ_PENDING | IRQ_MASKED); 470 desc->status |= (IRQ_PENDING | IRQ_MASKED);
467 mask_ack_irq(desc, irq); 471 mask_ack_irq(desc, irq);
472 desc = irq_remap_to_desc(irq, desc);
468 goto out_unlock; 473 goto out_unlock;
469 } 474 }
470 kstat_incr_irqs_this_cpu(irq, desc); 475 kstat_incr_irqs_this_cpu(irq, desc);
471 476
472 /* Start handling the irq */ 477 /* Start handling the irq */
473 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 desc = irq_remap_to_desc(irq, desc);
474 480
475 /* Mark the IRQ currently in progress.*/ 481 /* Mark the IRQ currently in progress.*/
476 desc->status |= IRQ_INPROGRESS; 482 desc->status |= IRQ_INPROGRESS;
@@ -531,8 +537,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
531 if (!noirqdebug) 537 if (!noirqdebug)
532 note_interrupt(irq, desc, action_ret); 538 note_interrupt(irq, desc, action_ret);
533 539
534 if (desc->chip->eoi) 540 if (desc->chip->eoi) {
535 desc->chip->eoi(irq); 541 desc->chip->eoi(irq);
542 desc = irq_remap_to_desc(irq, desc);
543 }
536} 544}
537 545
538void 546void
@@ -567,8 +575,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
567 575
568 /* Uninstall? */ 576 /* Uninstall? */
569 if (handle == handle_bad_irq) { 577 if (handle == handle_bad_irq) {
570 if (desc->chip != &no_irq_chip) 578 if (desc->chip != &no_irq_chip) {
571 mask_ack_irq(desc, irq); 579 mask_ack_irq(desc, irq);
580 desc = irq_remap_to_desc(irq, desc);
581 }
572 desc->status |= IRQ_DISABLED; 582 desc->status |= IRQ_DISABLED;
573 desc->depth = 1; 583 desc->depth = 1;
574 } 584 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c815b42d0f5b..c20db0be9173 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h>
19#include <linux/hash.h>
18 20
19#include "internals.h" 21#include "internals.h"
20 22
23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */
26struct lock_class_key irq_desc_lock_class;
27
21/** 28/**
22 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 30 * @irq: the interrupt number
@@ -49,6 +56,150 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
49int nr_irqs = NR_IRQS; 56int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs); 57EXPORT_SYMBOL_GPL(nr_irqs);
51 58
59#ifdef CONFIG_SPARSE_IRQ
60static struct irq_desc irq_desc_init = {
61 .irq = -1,
62 .status = IRQ_DISABLED,
63 .chip = &no_irq_chip,
64 .handle_irq = handle_bad_irq,
65 .depth = 1,
66 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
67#ifdef CONFIG_SMP
68 .affinity = CPU_MASK_ALL
69#endif
70};
71
72void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
73{
74 unsigned long bytes;
75 char *ptr;
76 int node;
77
78 /* Compute how many bytes we need per irq and allocate them */
79 bytes = nr * sizeof(unsigned int);
80
81 node = cpu_to_node(cpu);
82 ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
83 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
84
85 if (ptr)
86 desc->kstat_irqs = (unsigned int *)ptr;
87}
88
89static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
90{
91 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
92
93 spin_lock_init(&desc->lock);
94 desc->irq = irq;
95#ifdef CONFIG_SMP
96 desc->cpu = cpu;
97#endif
98 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
99 init_kstat_irqs(desc, cpu, nr_cpu_ids);
100 if (!desc->kstat_irqs) {
101 printk(KERN_ERR "can not alloc kstat_irqs\n");
102 BUG_ON(1);
103 }
104 arch_init_chip_data(desc, cpu);
105}
106
107/*
108 * Protect the sparse_irqs:
109 */
110DEFINE_SPINLOCK(sparse_irq_lock);
111
112struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
113
114static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
115 [0 ... NR_IRQS_LEGACY-1] = {
116 .irq = -1,
117 .status = IRQ_DISABLED,
118 .chip = &no_irq_chip,
119 .handle_irq = handle_bad_irq,
120 .depth = 1,
121 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
122#ifdef CONFIG_SMP
123 .affinity = CPU_MASK_ALL
124#endif
125 }
126};
127
128/* FIXME: use bootmem alloc ...*/
129static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
130
131int __init early_irq_init(void)
132{
133 struct irq_desc *desc;
134 int legacy_count;
135 int i;
136
137 desc = irq_desc_legacy;
138 legacy_count = ARRAY_SIZE(irq_desc_legacy);
139
140 for (i = 0; i < legacy_count; i++) {
141 desc[i].irq = i;
142 desc[i].kstat_irqs = kstat_irqs_legacy[i];
143 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
144
145 irq_desc_ptrs[i] = desc + i;
146 }
147
148 for (i = legacy_count; i < NR_IRQS; i++)
149 irq_desc_ptrs[i] = NULL;
150
151 return arch_early_irq_init();
152}
153
154struct irq_desc *irq_to_desc(unsigned int irq)
155{
156 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
157}
158
159struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
160{
161 struct irq_desc *desc;
162 unsigned long flags;
163 int node;
164
165 if (irq >= NR_IRQS) {
166 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
167 irq, NR_IRQS);
168 WARN_ON(1);
169 return NULL;
170 }
171
172 desc = irq_desc_ptrs[irq];
173 if (desc)
174 return desc;
175
176 spin_lock_irqsave(&sparse_irq_lock, flags);
177
178 /* We have to check it to avoid races with another CPU */
179 desc = irq_desc_ptrs[irq];
180 if (desc)
181 goto out_unlock;
182
183 node = cpu_to_node(cpu);
184 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
185 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
186 irq, cpu, node);
187 if (!desc) {
188 printk(KERN_ERR "can not alloc irq_desc\n");
189 BUG_ON(1);
190 }
191 init_one_irq_desc(irq, desc, cpu);
192
193 irq_desc_ptrs[irq] = desc;
194
195out_unlock:
196 spin_unlock_irqrestore(&sparse_irq_lock, flags);
197
198 return desc;
199}
200
201#else /* !CONFIG_SPARSE_IRQ */
202
52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 203struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
53 [0 ... NR_IRQS-1] = { 204 [0 ... NR_IRQS-1] = {
54 .status = IRQ_DISABLED, 205 .status = IRQ_DISABLED,
@@ -62,6 +213,32 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
62 } 213 }
63}; 214};
64 215
216int __init early_irq_init(void)
217{
218 struct irq_desc *desc;
219 int count;
220 int i;
221
222 desc = irq_desc;
223 count = ARRAY_SIZE(irq_desc);
224
225 for (i = 0; i < count; i++)
226 desc[i].irq = i;
227
228 return arch_early_irq_init();
229}
230
231struct irq_desc *irq_to_desc(unsigned int irq)
232{
233 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
234}
235
236struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
237{
238 return irq_to_desc(irq);
239}
240#endif /* !CONFIG_SPARSE_IRQ */
241
65/* 242/*
66 * What should we do if we get a hw irq event on an illegal vector? 243 * What should we do if we get a hw irq event on an illegal vector?
67 * Each architecture has to answer this themself. 244 * Each architecture has to answer this themself.
@@ -179,8 +356,11 @@ unsigned int __do_IRQ(unsigned int irq)
179 /* 356 /*
180 * No locking required for CPU-local interrupts: 357 * No locking required for CPU-local interrupts:
181 */ 358 */
182 if (desc->chip->ack) 359 if (desc->chip->ack) {
183 desc->chip->ack(irq); 360 desc->chip->ack(irq);
361 /* get new one */
362 desc = irq_remap_to_desc(irq, desc);
363 }
184 if (likely(!(desc->status & IRQ_DISABLED))) { 364 if (likely(!(desc->status & IRQ_DISABLED))) {
185 action_ret = handle_IRQ_event(irq, desc->action); 365 action_ret = handle_IRQ_event(irq, desc->action);
186 if (!noirqdebug) 366 if (!noirqdebug)
@@ -191,8 +371,10 @@ unsigned int __do_IRQ(unsigned int irq)
191 } 371 }
192 372
193 spin_lock(&desc->lock); 373 spin_lock(&desc->lock);
194 if (desc->chip->ack) 374 if (desc->chip->ack) {
195 desc->chip->ack(irq); 375 desc->chip->ack(irq);
376 desc = irq_remap_to_desc(irq, desc);
377 }
196 /* 378 /*
197 * REPLAY is when Linux resends an IRQ that was dropped earlier 379 * REPLAY is when Linux resends an IRQ that was dropped earlier
198 * WAITING is used by probe to mark irqs that are being tested 380 * WAITING is used by probe to mark irqs that are being tested
@@ -259,19 +441,22 @@ out:
259} 441}
260#endif 442#endif
261 443
262
263#ifdef CONFIG_TRACE_IRQFLAGS
264/*
265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
266 */
267static struct lock_class_key irq_desc_lock_class;
268
269void early_init_irq_lock_class(void) 444void early_init_irq_lock_class(void)
270{ 445{
271 struct irq_desc *desc; 446 struct irq_desc *desc;
272 int i; 447 int i;
273 448
274 for_each_irq_desc(i, desc) 449 for_each_irq_desc(i, desc) {
275 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 450 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
451 }
452}
453
454#ifdef CONFIG_SPARSE_IRQ
455unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
456{
457 struct irq_desc *desc = irq_to_desc(irq);
458 return desc ? desc->kstat_irqs[cpu] : 0;
276} 459}
277#endif 460#endif
461EXPORT_SYMBOL(kstat_irqs_cpu);
462
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 64c1c7253dae..e6d0a43cc125 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags); 14 unsigned long flags);
15 15
16extern struct lock_class_key irq_desc_lock_class;
17extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
18extern spinlock_t sparse_irq_lock;
19extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
20
16#ifdef CONFIG_PROC_FS 21#ifdef CONFIG_PROC_FS
17extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 22extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
18extern void register_handler_proc(unsigned int irq, struct irqaction *action); 23extern void register_handler_proc(unsigned int irq, struct irqaction *action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c43..cd0cd8dcb345 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,15 @@
16#include "internals.h" 16#include "internals.h"
17 17
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19cpumask_var_t irq_default_affinity;
19 20
20cpumask_t irq_default_affinity = CPU_MASK_ALL; 21static int init_irq_default_affinity(void)
22{
23 alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
24 cpumask_setall(irq_default_affinity);
25 return 0;
26}
27core_initcall(init_irq_default_affinity);
21 28
22/** 29/**
23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 30 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -79,7 +86,7 @@ int irq_can_set_affinity(unsigned int irq)
79 * @cpumask: cpumask 86 * @cpumask: cpumask
80 * 87 *
81 */ 88 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 89int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
83{ 90{
84 struct irq_desc *desc = irq_to_desc(irq); 91 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 92 unsigned long flags;
@@ -91,14 +98,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
91 98
92#ifdef CONFIG_GENERIC_PENDING_IRQ 99#ifdef CONFIG_GENERIC_PENDING_IRQ
93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 100 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
94 desc->affinity = cpumask; 101 cpumask_copy(&desc->affinity, cpumask);
95 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
96 } else { 103 } else {
97 desc->status |= IRQ_MOVE_PENDING; 104 desc->status |= IRQ_MOVE_PENDING;
98 desc->pending_mask = cpumask; 105 cpumask_copy(&desc->pending_mask, cpumask);
99 } 106 }
100#else 107#else
101 desc->affinity = cpumask; 108 cpumask_copy(&desc->affinity, cpumask);
102 desc->chip->set_affinity(irq, cpumask); 109 desc->chip->set_affinity(irq, cpumask);
103#endif 110#endif
104 desc->status |= IRQ_AFFINITY_SET; 111 desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +119,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
112 */ 119 */
113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 120int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
114{ 121{
115 cpumask_t mask;
116
117 if (!irq_can_set_affinity(irq)) 122 if (!irq_can_set_affinity(irq))
118 return 0; 123 return 0;
119 124
120 cpus_and(mask, cpu_online_map, irq_default_affinity);
121
122 /* 125 /*
123 * Preserve an userspace affinity setup, but make sure that 126 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online. 127 * one of the targets is online.
125 */ 128 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 129 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map)) 130 if (cpumask_any_and(&desc->affinity, cpu_online_mask)
128 mask = desc->affinity; 131 < nr_cpu_ids)
132 goto set_affinity;
129 else 133 else
130 desc->status &= ~IRQ_AFFINITY_SET; 134 desc->status &= ~IRQ_AFFINITY_SET;
131 } 135 }
132 136
133 desc->affinity = mask; 137 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
134 desc->chip->set_affinity(irq, mask); 138set_affinity:
139 desc->chip->set_affinity(irq, &desc->affinity);
135 140
136 return 0; 141 return 0;
137} 142}
@@ -370,16 +375,18 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
370 return 0; 375 return 0;
371 } 376 }
372 377
373 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); 378 /* caller masked out all except trigger mode flags */
379 ret = chip->set_type(irq, flags);
374 380
375 if (ret) 381 if (ret)
376 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 382 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
377 (int)(flags & IRQF_TRIGGER_MASK), 383 (int)flags, irq, chip->set_type);
378 irq, chip->set_type);
379 else { 384 else {
385 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
386 flags |= IRQ_LEVEL;
380 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 387 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
381 desc->status &= ~IRQ_TYPE_SENSE_MASK; 388 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
382 desc->status |= flags & IRQ_TYPE_SENSE_MASK; 389 desc->status |= flags;
383 } 390 }
384 391
385 return ret; 392 return ret;
@@ -459,7 +466,8 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
459 466
460 /* Setup the type (level, edge polarity) if configured: */ 467 /* Setup the type (level, edge polarity) if configured: */
461 if (new->flags & IRQF_TRIGGER_MASK) { 468 if (new->flags & IRQF_TRIGGER_MASK) {
462 ret = __irq_set_trigger(desc, irq, new->flags); 469 ret = __irq_set_trigger(desc, irq,
470 new->flags & IRQF_TRIGGER_MASK);
463 471
464 if (ret) { 472 if (ret) {
465 spin_unlock_irqrestore(&desc->lock, flags); 473 spin_unlock_irqrestore(&desc->lock, flags);
@@ -673,6 +681,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
673 struct irq_desc *desc; 681 struct irq_desc *desc;
674 int retval; 682 int retval;
675 683
684 /*
685 * handle_IRQ_event() always ignores IRQF_DISABLED except for
686 * the _first_ irqaction (sigh). That can cause oopsing, but
687 * the behavior is classified as "will not fix" so we need to
688 * start nudging drivers away from using that idiom.
689 */
690 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
691 == (IRQF_SHARED|IRQF_DISABLED))
692 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
693 "guaranteed on shared IRQs\n",
694 irq, devname);
695
676#ifdef CONFIG_LOCKDEP 696#ifdef CONFIG_LOCKDEP
677 /* 697 /*
678 * Lockdep wants atomic interrupt handlers: 698 * Lockdep wants atomic interrupt handlers:
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d95814..bd72329e630c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
4void move_masked_irq(int irq) 4void move_masked_irq(int irq)
5{ 5{
6 struct irq_desc *desc = irq_to_desc(irq); 6 struct irq_desc *desc = irq_to_desc(irq);
7 cpumask_t tmp;
8 7
9 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 8 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
10 return; 9 return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
19 18
20 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
21 20
22 if (unlikely(cpus_empty(desc->pending_mask))) 21 if (unlikely(cpumask_empty(&desc->pending_mask)))
23 return; 22 return;
24 23
25 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
27 26
28 assert_spin_locked(&desc->lock); 27 assert_spin_locked(&desc->lock);
29 28
30 cpus_and(tmp, desc->pending_mask, cpu_online_map);
31
32 /* 29 /*
33 * If there was a valid mask to work with, please 30 * If there was a valid mask to work with, please
34 * do the disable, re-program, enable sequence. 31 * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
41 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
42 * masking the irqs. 39 * masking the irqs.
43 */ 40 */
44 if (likely(!cpus_empty(tmp))) { 41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
45 desc->chip->set_affinity(irq,tmp); 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity,
44 &desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity);
46 } 46 }
47 cpus_clear(desc->pending_mask); 47 cpumask_clear(&desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644
index 000000000000..ecf765c6a77a
--- /dev/null
+++ b/kernel/irq/numa_migrate.c
@@ -0,0 +1,119 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/module.h>
10#include <linux/random.h>
11#include <linux/interrupt.h>
12#include <linux/kernel_stat.h>
13
14#include "internals.h"
15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc,
18 int cpu, int nr)
19{
20 unsigned long bytes;
21
22 init_kstat_irqs(desc, cpu, nr);
23
24 if (desc->kstat_irqs != old_desc->kstat_irqs) {
25 /* Compute how many bytes we need per irq and allocate them */
26 bytes = nr * sizeof(unsigned int);
27
28 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
29 }
30}
31
32static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
33{
34 if (old_desc->kstat_irqs == desc->kstat_irqs)
35 return;
36
37 kfree(old_desc->kstat_irqs);
38 old_desc->kstat_irqs = NULL;
39}
40
41static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu)
43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
49 arch_init_copy_chip_data(old_desc, desc, cpu);
50}
51
52static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
53{
54 free_kstat_irqs(old_desc, desc);
55 arch_free_chip_data(old_desc, desc);
56}
57
58static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
59 int cpu)
60{
61 struct irq_desc *desc;
62 unsigned int irq;
63 unsigned long flags;
64 int node;
65
66 irq = old_desc->irq;
67
68 spin_lock_irqsave(&sparse_irq_lock, flags);
69
70 /* We have to check it to avoid races with another CPU */
71 desc = irq_desc_ptrs[irq];
72
73 if (desc && old_desc != desc)
74 goto out_unlock;
75
76 node = cpu_to_node(cpu);
77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
78 if (!desc) {
79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
80 /* still use old one */
81 desc = old_desc;
82 goto out_unlock;
83 }
84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
85
86 irq_desc_ptrs[irq] = desc;
87
88 /* free the old one */
89 free_one_irq_desc(old_desc, desc);
90 kfree(old_desc);
91
92out_unlock:
93 spin_unlock_irqrestore(&sparse_irq_lock, flags);
94
95 return desc;
96}
97
98struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
99{
100 int old_cpu;
101 int node, old_node;
102
103 /* those all static, do move them */
104 if (desc->irq < NR_IRQS_LEGACY)
105 return desc;
106
107 old_cpu = desc->cpu;
108 if (old_cpu != cpu) {
109 node = cpu_to_node(cpu);
110 old_node = cpu_to_node(old_cpu);
111 if (old_node != node)
112 desc = __real_move_irq_desc(desc, cpu);
113 else
114 desc->cpu = cpu;
115 }
116
117 return desc;
118}
119
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d257e7d6a8a4..aae3f742bcec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 const struct cpumask *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
40 const char __user *buffer, size_t count, loff_t *pos) 40 const char __user *buffer, size_t count, loff_t *pos)
41{ 41{
42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
43 cpumask_t new_value; 43 cpumask_var_t new_value;
44 int err; 44 int err;
45 45
46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
47 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
48 return -EIO; 48 return -EIO;
49 49
50 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
51 return -ENOMEM;
52
50 err = cpumask_parse_user(buffer, count, new_value); 53 err = cpumask_parse_user(buffer, count, new_value);
51 if (err) 54 if (err)
52 return err; 55 goto free_cpumask;
53 56
54 if (!is_affinity_mask_valid(new_value)) 57 if (!is_affinity_mask_valid(new_value)) {
55 return -EINVAL; 58 err = -EINVAL;
59 goto free_cpumask;
60 }
56 61
57 /* 62 /*
58 * Do not allow disabling IRQs completely - it's a too easy 63 * Do not allow disabling IRQs completely - it's a too easy
59 * way to make the system unusable accidentally :-) At least 64 * way to make the system unusable accidentally :-) At least
60 * one online CPU still has to be targeted. 65 * one online CPU still has to be targeted.
61 */ 66 */
62 if (!cpus_intersects(new_value, cpu_online_map)) 67 if (!cpumask_intersects(new_value, cpu_online_mask)) {
63 /* Special case for empty set - allow the architecture 68 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 69 code to set default SMP affinity. */
65 return irq_select_affinity_usr(irq) ? -EINVAL : count; 70 err = irq_select_affinity_usr(irq) ? -EINVAL : count;
66 71 } else {
67 irq_set_affinity(irq, new_value); 72 irq_set_affinity(irq, new_value);
73 err = count;
74 }
68 75
69 return count; 76free_cpumask:
77 free_cpumask_var(new_value);
78 return err;
70} 79}
71 80
72static int irq_affinity_proc_open(struct inode *inode, struct file *file) 81static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -84,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
84 93
85static int default_affinity_show(struct seq_file *m, void *v) 94static int default_affinity_show(struct seq_file *m, void *v)
86{ 95{
87 seq_cpumask(m, &irq_default_affinity); 96 seq_cpumask(m, irq_default_affinity);
88 seq_putc(m, '\n'); 97 seq_putc(m, '\n');
89 return 0; 98 return 0;
90} 99}
@@ -92,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
92static ssize_t default_affinity_write(struct file *file, 101static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos) 102 const char __user *buffer, size_t count, loff_t *ppos)
94{ 103{
95 cpumask_t new_value; 104 cpumask_var_t new_value;
96 int err; 105 int err;
97 106
107 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
108 return -ENOMEM;
109
98 err = cpumask_parse_user(buffer, count, new_value); 110 err = cpumask_parse_user(buffer, count, new_value);
99 if (err) 111 if (err)
100 return err; 112 goto out;
101 113
102 if (!is_affinity_mask_valid(new_value)) 114 if (!is_affinity_mask_valid(new_value)) {
103 return -EINVAL; 115 err = -EINVAL;
116 goto out;
117 }
104 118
105 /* 119 /*
106 * Do not allow disabling IRQs completely - it's a too easy 120 * Do not allow disabling IRQs completely - it's a too easy
107 * way to make the system unusable accidentally :-) At least 121 * way to make the system unusable accidentally :-) At least
108 * one online CPU still has to be targeted. 122 * one online CPU still has to be targeted.
109 */ 123 */
110 if (!cpus_intersects(new_value, cpu_online_map)) 124 if (!cpumask_intersects(new_value, cpu_online_mask)) {
111 return -EINVAL; 125 err = -EINVAL;
126 goto out;
127 }
112 128
113 irq_default_affinity = new_value; 129 cpumask_copy(irq_default_affinity, new_value);
130 err = count;
114 131
115 return count; 132out:
133 free_cpumask_var(new_value);
134 return err;
116} 135}
117 136
118static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
@@ -243,7 +262,11 @@ void init_irq_proc(void)
243 /* 262 /*
244 * Create entries for all existing IRQs. 263 * Create entries for all existing IRQs.
245 */ 264 */
246 for_each_irq_desc(irq, desc) 265 for_each_irq_desc(irq, desc) {
266 if (!desc)
267 continue;
268
247 register_irq_proc(irq, desc); 269 register_irq_proc(irq, desc);
270 }
248} 271}
249 272
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ac0fde7b54d0..3fb855ad6aa0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1116 struct elf_prstatus prstatus; 1116 struct elf_prstatus prstatus;
1117 u32 *buf; 1117 u32 *buf;
1118 1118
1119 if ((cpu < 0) || (cpu >= NR_CPUS)) 1119 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1120 return; 1120 return;
1121 1121
1122 /* Using ELF notes here is opportunistic. 1122 /* Using ELF notes here is opportunistic.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b46dbb908669..a27a5f64443d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * request_module - try to load a kernel module
54 * @fmt: printf style format string for the name of the module 54 * @fmt: printf style format string for the name of the module
55 * @varargs: arguements as specified in the format string 55 * @...: arguments as specified in the format string
56 * 56 *
57 * Load a module using the user mode module loader. The function returns 57 * Load a module using the user mode module loader. The function returns
58 * zero on success or a negative errno code on failure. Note that a 58 * zero on success or a negative errno code on failure. Note that a
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f8a3f25259a..1b9cbdc0127a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69/* NOTE: change this value only with kprobe_mutex held */ 69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 70static bool kprobe_enabled;
71 71
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned_in_smp; 75 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -115,6 +115,7 @@ enum kprobe_slot_state {
115 SLOT_USED = 2, 115 SLOT_USED = 2,
116}; 116};
117 117
118static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
118static struct hlist_head kprobe_insn_pages; 119static struct hlist_head kprobe_insn_pages;
119static int kprobe_garbage_slots; 120static int kprobe_garbage_slots;
120static int collect_garbage_slots(void); 121static int collect_garbage_slots(void);
@@ -144,10 +145,10 @@ loop_end:
144} 145}
145 146
146/** 147/**
147 * get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
148 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
149 */ 150 */
150kprobe_opcode_t __kprobes *get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(void)
151{ 152{
152 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
153 struct hlist_node *pos; 154 struct hlist_node *pos;
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
196 return kip->insns; 197 return kip->insns;
197} 198}
198 199
200kprobe_opcode_t __kprobes *get_insn_slot(void)
201{
202 kprobe_opcode_t *ret;
203 mutex_lock(&kprobe_insn_mutex);
204 ret = __get_insn_slot();
205 mutex_unlock(&kprobe_insn_mutex);
206 return ret;
207}
208
199/* Return 1 if all garbages are collected, otherwise 0. */ 209/* Return 1 if all garbages are collected, otherwise 0. */
200static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 210static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
201{ 211{
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void)
226{ 236{
227 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
228 struct hlist_node *pos, *next; 238 struct hlist_node *pos, *next;
239 int safety;
229 240
230 /* Ensure no-one is preepmted on the garbages */ 241 /* Ensure no-one is preepmted on the garbages */
231 if (check_safety() != 0) 242 mutex_unlock(&kprobe_insn_mutex);
243 safety = check_safety();
244 mutex_lock(&kprobe_insn_mutex);
245 if (safety != 0)
232 return -EAGAIN; 246 return -EAGAIN;
233 247
234 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 248 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
251 struct kprobe_insn_page *kip; 265 struct kprobe_insn_page *kip;
252 struct hlist_node *pos; 266 struct hlist_node *pos;
253 267
268 mutex_lock(&kprobe_insn_mutex);
254 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 269 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
255 if (kip->insns <= slot && 270 if (kip->insns <= slot &&
256 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 271 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
267 282
268 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 283 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
269 collect_garbage_slots(); 284 collect_garbage_slots();
285
286 mutex_unlock(&kprobe_insn_mutex);
270} 287}
271#endif 288#endif
272 289
@@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
310 struct kprobe *kp; 327 struct kprobe *kp;
311 328
312 list_for_each_entry_rcu(kp, &p->list, list) { 329 list_for_each_entry_rcu(kp, &p->list, list) {
313 if (kp->pre_handler) { 330 if (kp->pre_handler && !kprobe_gone(kp)) {
314 set_kprobe_instance(kp); 331 set_kprobe_instance(kp);
315 if (kp->pre_handler(kp, regs)) 332 if (kp->pre_handler(kp, regs))
316 return 1; 333 return 1;
@@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
326 struct kprobe *kp; 343 struct kprobe *kp;
327 344
328 list_for_each_entry_rcu(kp, &p->list, list) { 345 list_for_each_entry_rcu(kp, &p->list, list) {
329 if (kp->post_handler) { 346 if (kp->post_handler && !kprobe_gone(kp)) {
330 set_kprobe_instance(kp); 347 set_kprobe_instance(kp);
331 kp->post_handler(kp, regs, flags); 348 kp->post_handler(kp, regs, flags);
332 reset_kprobe_instance(); 349 reset_kprobe_instance();
@@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
393 hlist_add_head(&ri->hlist, head); 410 hlist_add_head(&ri->hlist, head);
394} 411}
395 412
396void kretprobe_hash_lock(struct task_struct *tsk, 413void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags) 414 struct hlist_head **head, unsigned long *flags)
398{ 415{
399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 416 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 421 spin_lock_irqsave(hlist_lock, *flags);
405} 422}
406 423
407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 424static void __kprobes kretprobe_table_lock(unsigned long hash,
425 unsigned long *flags)
408{ 426{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 427 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 428 spin_lock_irqsave(hlist_lock, *flags);
411} 429}
412 430
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) 431void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
432 unsigned long *flags)
414{ 433{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 434 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock; 435 spinlock_t *hlist_lock;
@@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
419 spin_unlock_irqrestore(hlist_lock, *flags); 438 spin_unlock_irqrestore(hlist_lock, *flags);
420} 439}
421 440
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 441void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{ 442{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 443 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags); 444 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
526 ap->addr = p->addr; 545 ap->addr = p->addr;
527 ap->pre_handler = aggr_pre_handler; 546 ap->pre_handler = aggr_pre_handler;
528 ap->fault_handler = aggr_fault_handler; 547 ap->fault_handler = aggr_fault_handler;
529 if (p->post_handler) 548 /* We don't care the kprobe which has gone. */
549 if (p->post_handler && !kprobe_gone(p))
530 ap->post_handler = aggr_post_handler; 550 ap->post_handler = aggr_post_handler;
531 if (p->break_handler) 551 if (p->break_handler && !kprobe_gone(p))
532 ap->break_handler = aggr_break_handler; 552 ap->break_handler = aggr_break_handler;
533 553
534 INIT_LIST_HEAD(&ap->list); 554 INIT_LIST_HEAD(&ap->list);
@@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
547 int ret = 0; 567 int ret = 0;
548 struct kprobe *ap; 568 struct kprobe *ap;
549 569
570 if (kprobe_gone(old_p)) {
571 /*
572 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe.
576 */
577 ret = arch_prepare_kprobe(old_p);
578 if (ret)
579 return ret;
580 }
550 if (old_p->pre_handler == aggr_pre_handler) { 581 if (old_p->pre_handler == aggr_pre_handler) {
551 copy_kprobe(old_p, p); 582 copy_kprobe(old_p, p);
552 ret = add_new_kprobe(old_p, p); 583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
553 } else { 585 } else {
554 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
555 if (!ap) 587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
556 return -ENOMEM; 590 return -ENOMEM;
591 }
557 add_aggr_kprobe(ap, old_p); 592 add_aggr_kprobe(ap, old_p);
558 copy_kprobe(ap, p); 593 copy_kprobe(ap, p);
559 ret = add_new_kprobe(ap, p); 594 ret = add_new_kprobe(ap, p);
560 } 595 }
596 if (kprobe_gone(old_p)) {
597 /*
598 * If the old_p has gone, its breakpoint has been disarmed.
599 * We have to arm it again after preparing real kprobes.
600 */
601 ap->flags &= ~KPROBE_FLAG_GONE;
602 if (kprobe_enabled)
603 arch_arm_kprobe(ap);
604 }
561 return ret; 605 return ret;
562} 606}
563 607
@@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
600 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 644 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
601} 645}
602 646
603static int __kprobes __register_kprobe(struct kprobe *p, 647int __kprobes register_kprobe(struct kprobe *p)
604 unsigned long called_from)
605{ 648{
606 int ret = 0; 649 int ret = 0;
607 struct kprobe *old_p; 650 struct kprobe *old_p;
@@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p,
620 return -EINVAL; 663 return -EINVAL;
621 } 664 }
622 665
623 p->mod_refcounted = 0; 666 p->flags = 0;
624
625 /* 667 /*
626 * Check if are we probing a module. 668 * Check if are we probing a module.
627 */ 669 */
628 probed_mod = __module_text_address((unsigned long) p->addr); 670 probed_mod = __module_text_address((unsigned long) p->addr);
629 if (probed_mod) { 671 if (probed_mod) {
630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
632 /* 672 /*
633 * We must allow modules to probe themself and in this case 673 * We must hold a refcount of the probed module while updating
634 * avoid incrementing the module refcount, so as to allow 674 * its code to prohibit unexpected unloading.
635 * unloading of self probing modules.
636 */ 675 */
637 if (calling_mod && calling_mod != probed_mod) { 676 if (unlikely(!try_module_get(probed_mod))) {
638 if (unlikely(!try_module_get(probed_mod))) { 677 preempt_enable();
639 preempt_enable(); 678 return -EINVAL;
640 return -EINVAL; 679 }
641 } 680 /*
642 p->mod_refcounted = 1; 681 * If the module freed .init.text, we couldn't insert
643 } else 682 * kprobes in there.
644 probed_mod = NULL; 683 */
684 if (within_module_init((unsigned long)p->addr, probed_mod) &&
685 probed_mod->state != MODULE_STATE_COMING) {
686 module_put(probed_mod);
687 preempt_enable();
688 return -EINVAL;
689 }
645 } 690 }
646 preempt_enable(); 691 preempt_enable();
647 692
@@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
668out: 713out:
669 mutex_unlock(&kprobe_mutex); 714 mutex_unlock(&kprobe_mutex);
670 715
671 if (ret && probed_mod) 716 if (probed_mod)
672 module_put(probed_mod); 717 module_put(probed_mod);
718
673 return ret; 719 return ret;
674} 720}
675 721
@@ -697,16 +743,16 @@ valid_p:
697 list_is_singular(&old_p->list))) { 743 list_is_singular(&old_p->list))) {
698 /* 744 /*
699 * Only probe on the hash list. Disarm only if kprobes are 745 * Only probe on the hash list. Disarm only if kprobes are
700 * enabled - otherwise, the breakpoint would already have 746 * enabled and not gone - otherwise, the breakpoint would
701 * been removed. We save on flushing icache. 747 * already have been removed. We save on flushing icache.
702 */ 748 */
703 if (kprobe_enabled) 749 if (kprobe_enabled && !kprobe_gone(old_p))
704 arch_disarm_kprobe(p); 750 arch_disarm_kprobe(p);
705 hlist_del_rcu(&old_p->hlist); 751 hlist_del_rcu(&old_p->hlist);
706 } else { 752 } else {
707 if (p->break_handler) 753 if (p->break_handler && !kprobe_gone(p))
708 old_p->break_handler = NULL; 754 old_p->break_handler = NULL;
709 if (p->post_handler) { 755 if (p->post_handler && !kprobe_gone(p)) {
710 list_for_each_entry_rcu(list_p, &old_p->list, list) { 756 list_for_each_entry_rcu(list_p, &old_p->list, list) {
711 if ((list_p != p) && (list_p->post_handler)) 757 if ((list_p != p) && (list_p->post_handler))
712 goto noclean; 758 goto noclean;
@@ -721,39 +767,27 @@ noclean:
721 767
722static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 768static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
723{ 769{
724 struct module *mod;
725 struct kprobe *old_p; 770 struct kprobe *old_p;
726 771
727 if (p->mod_refcounted) { 772 if (list_empty(&p->list))
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
732 mod = module_text_address((unsigned long)p->addr);
733 if (mod)
734 module_put(mod);
735 }
736
737 if (list_empty(&p->list) || list_is_singular(&p->list)) {
738 if (!list_empty(&p->list)) {
739 /* "p" is the last child of an aggr_kprobe */
740 old_p = list_entry(p->list.next, struct kprobe, list);
741 list_del(&p->list);
742 kfree(old_p);
743 }
744 arch_remove_kprobe(p); 773 arch_remove_kprobe(p);
774 else if (list_is_singular(&p->list)) {
775 /* "p" is the last child of an aggr_kprobe */
776 old_p = list_entry(p->list.next, struct kprobe, list);
777 list_del(&p->list);
778 arch_remove_kprobe(old_p);
779 kfree(old_p);
745 } 780 }
746} 781}
747 782
748static int __register_kprobes(struct kprobe **kps, int num, 783int __kprobes register_kprobes(struct kprobe **kps, int num)
749 unsigned long called_from)
750{ 784{
751 int i, ret = 0; 785 int i, ret = 0;
752 786
753 if (num <= 0) 787 if (num <= 0)
754 return -EINVAL; 788 return -EINVAL;
755 for (i = 0; i < num; i++) { 789 for (i = 0; i < num; i++) {
756 ret = __register_kprobe(kps[i], called_from); 790 ret = register_kprobe(kps[i]);
757 if (ret < 0) { 791 if (ret < 0) {
758 if (i > 0) 792 if (i > 0)
759 unregister_kprobes(kps, i); 793 unregister_kprobes(kps, i);
@@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num,
763 return ret; 797 return ret;
764} 798}
765 799
766/*
767 * Registration and unregistration functions for kprobe.
768 */
769int __kprobes register_kprobe(struct kprobe *p)
770{
771 return __register_kprobes(&p, 1,
772 (unsigned long)__builtin_return_address(0));
773}
774
775void __kprobes unregister_kprobe(struct kprobe *p) 800void __kprobes unregister_kprobe(struct kprobe *p)
776{ 801{
777 unregister_kprobes(&p, 1); 802 unregister_kprobes(&p, 1);
778} 803}
779 804
780int __kprobes register_kprobes(struct kprobe **kps, int num)
781{
782 return __register_kprobes(kps, num,
783 (unsigned long)__builtin_return_address(0));
784}
785
786void __kprobes unregister_kprobes(struct kprobe **kps, int num) 805void __kprobes unregister_kprobes(struct kprobe **kps, int num)
787{ 806{
788 int i; 807 int i;
@@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
811 return (unsigned long)entry; 830 return (unsigned long)entry;
812} 831}
813 832
814static int __register_jprobes(struct jprobe **jps, int num, 833int __kprobes register_jprobes(struct jprobe **jps, int num)
815 unsigned long called_from)
816{ 834{
817 struct jprobe *jp; 835 struct jprobe *jp;
818 int ret = 0, i; 836 int ret = 0, i;
@@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
830 /* Todo: Verify probepoint is a function entry point */ 848 /* Todo: Verify probepoint is a function entry point */
831 jp->kp.pre_handler = setjmp_pre_handler; 849 jp->kp.pre_handler = setjmp_pre_handler;
832 jp->kp.break_handler = longjmp_break_handler; 850 jp->kp.break_handler = longjmp_break_handler;
833 ret = __register_kprobe(&jp->kp, called_from); 851 ret = register_kprobe(&jp->kp);
834 } 852 }
835 if (ret < 0) { 853 if (ret < 0) {
836 if (i > 0) 854 if (i > 0)
@@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
843 861
844int __kprobes register_jprobe(struct jprobe *jp) 862int __kprobes register_jprobe(struct jprobe *jp)
845{ 863{
846 return __register_jprobes(&jp, 1, 864 return register_jprobes(&jp, 1);
847 (unsigned long)__builtin_return_address(0));
848} 865}
849 866
850void __kprobes unregister_jprobe(struct jprobe *jp) 867void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
852 unregister_jprobes(&jp, 1); 869 unregister_jprobes(&jp, 1);
853} 870}
854 871
855int __kprobes register_jprobes(struct jprobe **jps, int num)
856{
857 return __register_jprobes(jps, num,
858 (unsigned long)__builtin_return_address(0));
859}
860
861void __kprobes unregister_jprobes(struct jprobe **jps, int num) 872void __kprobes unregister_jprobes(struct jprobe **jps, int num)
862{ 873{
863 int i; 874 int i;
@@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
920 return 0; 931 return 0;
921} 932}
922 933
923static int __kprobes __register_kretprobe(struct kretprobe *rp, 934int __kprobes register_kretprobe(struct kretprobe *rp)
924 unsigned long called_from)
925{ 935{
926 int ret = 0; 936 int ret = 0;
927 struct kretprobe_instance *inst; 937 struct kretprobe_instance *inst;
@@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
967 977
968 rp->nmissed = 0; 978 rp->nmissed = 0;
969 /* Establish function entry probe point */ 979 /* Establish function entry probe point */
970 ret = __register_kprobe(&rp->kp, called_from); 980 ret = register_kprobe(&rp->kp);
971 if (ret != 0) 981 if (ret != 0)
972 free_rp_inst(rp); 982 free_rp_inst(rp);
973 return ret; 983 return ret;
974} 984}
975 985
976static int __register_kretprobes(struct kretprobe **rps, int num, 986int __kprobes register_kretprobes(struct kretprobe **rps, int num)
977 unsigned long called_from)
978{ 987{
979 int ret = 0, i; 988 int ret = 0, i;
980 989
981 if (num <= 0) 990 if (num <= 0)
982 return -EINVAL; 991 return -EINVAL;
983 for (i = 0; i < num; i++) { 992 for (i = 0; i < num; i++) {
984 ret = __register_kretprobe(rps[i], called_from); 993 ret = register_kretprobe(rps[i]);
985 if (ret < 0) { 994 if (ret < 0) {
986 if (i > 0) 995 if (i > 0)
987 unregister_kretprobes(rps, i); 996 unregister_kretprobes(rps, i);
@@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
991 return ret; 1000 return ret;
992} 1001}
993 1002
994int __kprobes register_kretprobe(struct kretprobe *rp)
995{
996 return __register_kretprobes(&rp, 1,
997 (unsigned long)__builtin_return_address(0));
998}
999
1000void __kprobes unregister_kretprobe(struct kretprobe *rp) 1003void __kprobes unregister_kretprobe(struct kretprobe *rp)
1001{ 1004{
1002 unregister_kretprobes(&rp, 1); 1005 unregister_kretprobes(&rp, 1);
1003} 1006}
1004 1007
1005int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1006{
1007 return __register_kretprobes(rps, num,
1008 (unsigned long)__builtin_return_address(0));
1009}
1010
1011void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1012{ 1009{
1013 int i; 1010 int i;
@@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1055 1052
1056#endif /* CONFIG_KRETPROBES */ 1053#endif /* CONFIG_KRETPROBES */
1057 1054
1055/* Set the kprobe gone and remove its instruction buffer. */
1056static void __kprobes kill_kprobe(struct kprobe *p)
1057{
1058 struct kprobe *kp;
1059 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) {
1061 /*
1062 * If this is an aggr_kprobe, we have to list all the
1063 * chained probes and mark them GONE.
1064 */
1065 list_for_each_entry_rcu(kp, &p->list, list)
1066 kp->flags |= KPROBE_FLAG_GONE;
1067 p->post_handler = NULL;
1068 p->break_handler = NULL;
1069 }
1070 /*
1071 * Here, we can remove insn_slot safely, because no thread calls
1072 * the original probed function (which will be freed soon) any more.
1073 */
1074 arch_remove_kprobe(p);
1075}
1076
1077/* Module notifier call back, checking kprobes on the module */
1078static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1079 unsigned long val, void *data)
1080{
1081 struct module *mod = data;
1082 struct hlist_head *head;
1083 struct hlist_node *node;
1084 struct kprobe *p;
1085 unsigned int i;
1086 int checkcore = (val == MODULE_STATE_GOING);
1087
1088 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
1089 return NOTIFY_DONE;
1090
1091 /*
1092 * When MODULE_STATE_GOING was notified, both of module .text and
1093 * .init.text sections would be freed. When MODULE_STATE_LIVE was
1094 * notified, only .init.text section would be freed. We need to
1095 * disable kprobes which have been inserted in the sections.
1096 */
1097 mutex_lock(&kprobe_mutex);
1098 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1099 head = &kprobe_table[i];
1100 hlist_for_each_entry_rcu(p, node, head, hlist)
1101 if (within_module_init((unsigned long)p->addr, mod) ||
1102 (checkcore &&
1103 within_module_core((unsigned long)p->addr, mod))) {
1104 /*
1105 * The vaddr this probe is installed will soon
1106 * be vfreed buy not synced to disk. Hence,
1107 * disarming the breakpoint isn't needed.
1108 */
1109 kill_kprobe(p);
1110 }
1111 }
1112 mutex_unlock(&kprobe_mutex);
1113 return NOTIFY_DONE;
1114}
1115
1116static struct notifier_block kprobe_module_nb = {
1117 .notifier_call = kprobes_module_callback,
1118 .priority = 0
1119};
1120
1058static int __init init_kprobes(void) 1121static int __init init_kprobes(void)
1059{ 1122{
1060 int i, err = 0; 1123 int i, err = 0;
@@ -1111,6 +1174,9 @@ static int __init init_kprobes(void)
1111 err = arch_init_kprobes(); 1174 err = arch_init_kprobes();
1112 if (!err) 1175 if (!err)
1113 err = register_die_notifier(&kprobe_exceptions_nb); 1176 err = register_die_notifier(&kprobe_exceptions_nb);
1177 if (!err)
1178 err = register_module_notifier(&kprobe_module_nb);
1179
1114 kprobes_initialized = (err == 0); 1180 kprobes_initialized = (err == 0);
1115 1181
1116 if (!err) 1182 if (!err)
@@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1131 else 1197 else
1132 kprobe_type = "k"; 1198 kprobe_type = "k";
1133 if (sym) 1199 if (sym)
1134 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, 1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
1135 sym, offset, (modname ? modname : " ")); 1201 sym, offset, (modname ? modname : " "),
1202 (kprobe_gone(p) ? "[GONE]" : ""));
1136 else 1203 else
1137 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); 1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
1205 (kprobe_gone(p) ? "[GONE]" : ""));
1138} 1206}
1139 1207
1140static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void)
1215 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1216 head = &kprobe_table[i]; 1284 head = &kprobe_table[i];
1217 hlist_for_each_entry_rcu(p, node, head, hlist) 1285 hlist_for_each_entry_rcu(p, node, head, hlist)
1218 arch_arm_kprobe(p); 1286 if (!kprobe_gone(p))
1287 arch_arm_kprobe(p);
1219 } 1288 }
1220 1289
1221 kprobe_enabled = true; 1290 kprobe_enabled = true;
@@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void)
1244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1245 head = &kprobe_table[i]; 1314 head = &kprobe_table[i];
1246 hlist_for_each_entry_rcu(p, node, head, hlist) { 1315 hlist_for_each_entry_rcu(p, node, head, hlist) {
1247 if (!arch_trampoline_kprobe(p)) 1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
1248 arch_disarm_kprobe(p); 1317 arch_disarm_kprobe(p);
1249 } 1318 }
1250 } 1319 }
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 08dd8ed86c77..528dd78e7e7e 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
24static struct kobj_attribute _name##_attr = \ 24static struct kobj_attribute _name##_attr = \
25 __ATTR(_name, 0644, _name##_show, _name##_store) 25 __ATTR(_name, 0644, _name##_show, _name##_store)
26 26
27#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 27#if defined(CONFIG_HOTPLUG)
28/* current uevent sequence number */ 28/* current uevent sequence number */
29static ssize_t uevent_seqnum_show(struct kobject *kobj, 29static ssize_t uevent_seqnum_show(struct kobject *kobj,
30 struct kobj_attribute *attr, char *buf) 30 struct kobj_attribute *attr, char *buf)
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj;
137EXPORT_SYMBOL_GPL(kernel_kobj); 137EXPORT_SYMBOL_GPL(kernel_kobj);
138 138
139static struct attribute * kernel_attrs[] = { 139static struct attribute * kernel_attrs[] = {
140#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 140#if defined(CONFIG_HOTPLUG)
141 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
142 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
143#endif 143#endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 74b1878b8bb8..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -137,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
137#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
139 139
140static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
141{ 141{
142 int i; 142 int i;
143 143
144 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
145 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
146 class->contention_point[i] = ip; 146 points[i] = ip;
147 break; 147 break;
148 } 148 }
149 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
150 break; 150 break;
151 } 151 }
152 152
@@ -186,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
187 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
188 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
189 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
190 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
191 194
@@ -210,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
210 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
211 } 214 }
212 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
213} 217}
214 218
215static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -288,14 +292,12 @@ void lockdep_off(void)
288{ 292{
289 current->lockdep_recursion++; 293 current->lockdep_recursion++;
290} 294}
291
292EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
293 296
294void lockdep_on(void) 297void lockdep_on(void)
295{ 298{
296 current->lockdep_recursion--; 299 current->lockdep_recursion--;
297} 300}
298
299EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
300 302
301/* 303/*
@@ -577,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
577/* 579/*
578 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
579 */ 581 */
580static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
581{ 584{
582 struct lock_list *entry; 585 struct lock_list *entry;
583 586
@@ -2509,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2509 if (subclass) 2512 if (subclass)
2510 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2511} 2514}
2512
2513EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2514 2516
2515/* 2517/*
@@ -2690,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2690} 2692}
2691 2693
2692static int 2694static int
2693__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2694 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2695{ 2698{
2696 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2697 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2718,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2718 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2719 2722
2720found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2721 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2722 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2723 2727
@@ -2902,9 +2906,9 @@ static void check_flags(unsigned long flags)
2902#endif 2906#endif
2903} 2907}
2904 2908
2905void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2906lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2907 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2908{ 2912{
2909 unsigned long flags; 2913 unsigned long flags;
2910 2914
@@ -2914,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2914 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2915 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2916 check_flags(flags); 2920 check_flags(flags);
2917 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2918 check_chain_key(current); 2922 check_chain_key(current);
2919 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2920 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2921} 2925}
2922 2926EXPORT_SYMBOL_GPL(lock_set_class);
2923EXPORT_SYMBOL_GPL(lock_set_subclass);
2924 2927
2925/* 2928/*
2926 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2944,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2944 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2945 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2946} 2949}
2947
2948EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2949 2951
2950void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2962,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2962 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2963 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2964} 2966}
2965
2966EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2967 2968
2968#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -3000,7 +3001,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3000 struct held_lock *hlock, *prev_hlock; 3001 struct held_lock *hlock, *prev_hlock;
3001 struct lock_class_stats *stats; 3002 struct lock_class_stats *stats;
3002 unsigned int depth; 3003 unsigned int depth;
3003 int i, point; 3004 int i, contention_point, contending_point;
3004 3005
3005 depth = curr->lockdep_depth; 3006 depth = curr->lockdep_depth;
3006 if (DEBUG_LOCKS_WARN_ON(!depth)) 3007 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3024,18 +3025,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3024found_it: 3025found_it:
3025 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
3026 3027
3027 point = lock_contention_point(hlock_class(hlock), ip); 3028 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3029 contending_point = lock_point(hlock_class(hlock)->contending_point,
3030 lock->ip);
3028 3031
3029 stats = get_lock_stats(hlock_class(hlock)); 3032 stats = get_lock_stats(hlock_class(hlock));
3030 if (point < ARRAY_SIZE(stats->contention_point)) 3033 if (contention_point < LOCKSTAT_POINTS)
3031 stats->contention_point[point]++; 3034 stats->contention_point[contention_point]++;
3035 if (contending_point < LOCKSTAT_POINTS)
3036 stats->contending_point[contending_point]++;
3032 if (lock->cpu != smp_processor_id()) 3037 if (lock->cpu != smp_processor_id())
3033 stats->bounces[bounce_contended + !!hlock->read]++; 3038 stats->bounces[bounce_contended + !!hlock->read]++;
3034 put_lock_stats(stats); 3039 put_lock_stats(stats);
3035} 3040}
3036 3041
3037static void 3042static void
3038__lock_acquired(struct lockdep_map *lock) 3043__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3039{ 3044{
3040 struct task_struct *curr = current; 3045 struct task_struct *curr = current;
3041 struct held_lock *hlock, *prev_hlock; 3046 struct held_lock *hlock, *prev_hlock;
@@ -3084,6 +3089,7 @@ found_it:
3084 put_lock_stats(stats); 3089 put_lock_stats(stats);
3085 3090
3086 lock->cpu = cpu; 3091 lock->cpu = cpu;
3092 lock->ip = ip;
3087} 3093}
3088 3094
3089void lock_contended(struct lockdep_map *lock, unsigned long ip) 3095void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3105,7 +3111,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3105} 3111}
3106EXPORT_SYMBOL_GPL(lock_contended); 3112EXPORT_SYMBOL_GPL(lock_contended);
3107 3113
3108void lock_acquired(struct lockdep_map *lock) 3114void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3109{ 3115{
3110 unsigned long flags; 3116 unsigned long flags;
3111 3117
@@ -3118,7 +3124,7 @@ void lock_acquired(struct lockdep_map *lock)
3118 raw_local_irq_save(flags); 3124 raw_local_irq_save(flags);
3119 check_flags(flags); 3125 check_flags(flags);
3120 current->lockdep_recursion = 1; 3126 current->lockdep_recursion = 1;
3121 __lock_acquired(lock); 3127 __lock_acquired(lock, ip);
3122 current->lockdep_recursion = 0; 3128 current->lockdep_recursion = 0;
3123 raw_local_irq_restore(flags); 3129 raw_local_irq_restore(flags);
3124} 3130}
@@ -3442,7 +3448,6 @@ retry:
3442 if (unlock) 3448 if (unlock)
3443 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3444} 3450}
3445
3446EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3447 3452
3448/* 3453/*
@@ -3463,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3463{ 3468{
3464 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3465} 3470}
3466
3467EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3468 3472
3469void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/module.c b/kernel/module.c
index dd2a54155b54..c9332c90d5a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/device.h> 43#include <linux/device.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/unwind.h>
47#include <linux/rculist.h> 46#include <linux/rculist.h>
48#include <asm/uaccess.h> 47#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
@@ -51,6 +50,7 @@
51#include <asm/sections.h> 50#include <asm/sections.h>
52#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
53#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h>
54 54
55#if 0 55#if 0
56#define DEBUGP printk 56#define DEBUGP printk
@@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
757 return -EFAULT; 757 return -EFAULT;
758 name[MODULE_NAME_LEN-1] = '\0'; 758 name[MODULE_NAME_LEN-1] = '\0';
759 759
760 if (mutex_lock_interruptible(&module_mutex) != 0) 760 /* Create stop_machine threads since free_module relies on
761 return -EINTR; 761 * a non-failing stop_machine call. */
762 ret = stop_machine_create();
763 if (ret)
764 return ret;
765
766 if (mutex_lock_interruptible(&module_mutex) != 0) {
767 ret = -EINTR;
768 goto out_stop;
769 }
762 770
763 mod = find_module(name); 771 mod = find_module(name);
764 if (!mod) { 772 if (!mod) {
@@ -809,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
809 mod->exit(); 817 mod->exit();
810 blocking_notifier_call_chain(&module_notify_list, 818 blocking_notifier_call_chain(&module_notify_list,
811 MODULE_STATE_GOING, mod); 819 MODULE_STATE_GOING, mod);
820 async_synchronize_full();
812 mutex_lock(&module_mutex); 821 mutex_lock(&module_mutex);
813 /* Store the name of the last unloaded module for diagnostic purposes */ 822 /* Store the name of the last unloaded module for diagnostic purposes */
814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
@@ -817,10 +826,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
817 826
818 out: 827 out:
819 mutex_unlock(&module_mutex); 828 mutex_unlock(&module_mutex);
829out_stop:
830 stop_machine_destroy();
820 return ret; 831 return ret;
821} 832}
822 833
823static void print_unload_info(struct seq_file *m, struct module *mod) 834static inline void print_unload_info(struct seq_file *m, struct module *mod)
824{ 835{
825 struct module_use *use; 836 struct module_use *use;
826 int printed_something = 0; 837 int printed_something = 0;
@@ -893,7 +904,7 @@ void module_put(struct module *module)
893EXPORT_SYMBOL(module_put); 904EXPORT_SYMBOL(module_put);
894 905
895#else /* !CONFIG_MODULE_UNLOAD */ 906#else /* !CONFIG_MODULE_UNLOAD */
896static void print_unload_info(struct seq_file *m, struct module *mod) 907static inline void print_unload_info(struct seq_file *m, struct module *mod)
897{ 908{
898 /* We don't know the usage count, or what modules are using. */ 909 /* We don't know the usage count, or what modules are using. */
899 seq_printf(m, " - -"); 910 seq_printf(m, " - -");
@@ -1439,8 +1450,6 @@ static void free_module(struct module *mod)
1439 remove_sect_attrs(mod); 1450 remove_sect_attrs(mod);
1440 mod_kobject_remove(mod); 1451 mod_kobject_remove(mod);
1441 1452
1442 unwind_remove_table(mod->unwind_info, 0);
1443
1444 /* Arch-specific cleanup. */ 1453 /* Arch-specific cleanup. */
1445 module_arch_cleanup(mod); 1454 module_arch_cleanup(mod);
1446 1455
@@ -1578,11 +1587,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1578 return ret; 1587 return ret;
1579} 1588}
1580 1589
1590/* Additional bytes needed by arch in front of individual sections */
1591unsigned int __weak arch_mod_section_prepend(struct module *mod,
1592 unsigned int section)
1593{
1594 /* default implementation just returns zero */
1595 return 0;
1596}
1597
1581/* Update size with this section: return offset. */ 1598/* Update size with this section: return offset. */
1582static long get_offset(unsigned int *size, Elf_Shdr *sechdr) 1599static long get_offset(struct module *mod, unsigned int *size,
1600 Elf_Shdr *sechdr, unsigned int section)
1583{ 1601{
1584 long ret; 1602 long ret;
1585 1603
1604 *size += arch_mod_section_prepend(mod, section);
1586 ret = ALIGN(*size, sechdr->sh_addralign ?: 1); 1605 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1587 *size = ret + sechdr->sh_size; 1606 *size = ret + sechdr->sh_size;
1588 return ret; 1607 return ret;
@@ -1622,7 +1641,7 @@ static void layout_sections(struct module *mod,
1622 || strncmp(secstrings + s->sh_name, 1641 || strncmp(secstrings + s->sh_name,
1623 ".init", 5) == 0) 1642 ".init", 5) == 0)
1624 continue; 1643 continue;
1625 s->sh_entsize = get_offset(&mod->core_size, s); 1644 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1626 DEBUGP("\t%s\n", secstrings + s->sh_name); 1645 DEBUGP("\t%s\n", secstrings + s->sh_name);
1627 } 1646 }
1628 if (m == 0) 1647 if (m == 0)
@@ -1640,7 +1659,7 @@ static void layout_sections(struct module *mod,
1640 || strncmp(secstrings + s->sh_name, 1659 || strncmp(secstrings + s->sh_name,
1641 ".init", 5) != 0) 1660 ".init", 5) != 0)
1642 continue; 1661 continue;
1643 s->sh_entsize = (get_offset(&mod->init_size, s) 1662 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1644 | INIT_OFFSET_MASK); 1663 | INIT_OFFSET_MASK);
1645 DEBUGP("\t%s\n", secstrings + s->sh_name); 1664 DEBUGP("\t%s\n", secstrings + s->sh_name);
1646 } 1665 }
@@ -1725,15 +1744,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
1725 return NULL; 1744 return NULL;
1726} 1745}
1727 1746
1728static int is_exported(const char *name, const struct module *mod) 1747static int is_exported(const char *name, unsigned long value,
1748 const struct module *mod)
1729{ 1749{
1730 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1750 const struct kernel_symbol *ks;
1731 return 1; 1751 if (!mod)
1752 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
1732 else 1753 else
1733 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1754 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
1734 return 1; 1755 return ks != NULL && ks->value == value;
1735 else
1736 return 0;
1737} 1756}
1738 1757
1739/* As per nm */ 1758/* As per nm */
@@ -1847,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod,
1847 unsigned int symindex = 0; 1866 unsigned int symindex = 0;
1848 unsigned int strindex = 0; 1867 unsigned int strindex = 0;
1849 unsigned int modindex, versindex, infoindex, pcpuindex; 1868 unsigned int modindex, versindex, infoindex, pcpuindex;
1850 unsigned int unwindex = 0;
1851 unsigned int num_kp, num_mcount; 1869 unsigned int num_kp, num_mcount;
1852 struct kernel_param *kp; 1870 struct kernel_param *kp;
1853 struct module *mod; 1871 struct module *mod;
@@ -1865,6 +1883,13 @@ static noinline struct module *load_module(void __user *umod,
1865 /* vmalloc barfs on "unusual" numbers. Check here */ 1883 /* vmalloc barfs on "unusual" numbers. Check here */
1866 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1884 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1867 return ERR_PTR(-ENOMEM); 1885 return ERR_PTR(-ENOMEM);
1886
1887 /* Create stop_machine threads since the error path relies on
1888 * a non-failing stop_machine call. */
1889 err = stop_machine_create();
1890 if (err)
1891 goto free_hdr;
1892
1868 if (copy_from_user(hdr, umod, len) != 0) { 1893 if (copy_from_user(hdr, umod, len) != 0) {
1869 err = -EFAULT; 1894 err = -EFAULT;
1870 goto free_hdr; 1895 goto free_hdr;
@@ -1930,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod,
1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1955 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1956 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1957 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1933#ifdef ARCH_UNWIND_SECTION_NAME
1934 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1935#endif
1936 1958
1937 /* Don't keep modinfo and version sections. */ 1959 /* Don't keep modinfo and version sections. */
1938 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1960 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1942,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod,
1942 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1964 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1943 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1965 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1944#endif 1966#endif
1945 if (unwindex)
1946 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1947 1967
1948 /* Check module struct version now, before we try to use module. */ 1968 /* Check module struct version now, before we try to use module. */
1949 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1969 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2240,14 +2260,10 @@ static noinline struct module *load_module(void __user *umod,
2240 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2260 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2241 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2261 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2242 2262
2243 /* Size of section 0 is 0, so this works well if no unwind info. */
2244 mod->unwind_info = unwind_add_table(mod,
2245 (void *)sechdrs[unwindex].sh_addr,
2246 sechdrs[unwindex].sh_size);
2247
2248 /* Get rid of temporary copy */ 2263 /* Get rid of temporary copy */
2249 vfree(hdr); 2264 vfree(hdr);
2250 2265
2266 stop_machine_destroy();
2251 /* Done! */ 2267 /* Done! */
2252 return mod; 2268 return mod;
2253 2269
@@ -2270,6 +2286,7 @@ static noinline struct module *load_module(void __user *umod,
2270 kfree(args); 2286 kfree(args);
2271 free_hdr: 2287 free_hdr:
2272 vfree(hdr); 2288 vfree(hdr);
2289 stop_machine_destroy();
2273 return ERR_PTR(err); 2290 return ERR_PTR(err);
2274 2291
2275 truncated: 2292 truncated:
@@ -2337,11 +2354,12 @@ sys_init_module(void __user *umod,
2337 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 2354 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2338 mod->state = MODULE_STATE_LIVE; 2355 mod->state = MODULE_STATE_LIVE;
2339 wake_up(&module_wq); 2356 wake_up(&module_wq);
2357 blocking_notifier_call_chain(&module_notify_list,
2358 MODULE_STATE_LIVE, mod);
2340 2359
2341 mutex_lock(&module_mutex); 2360 mutex_lock(&module_mutex);
2342 /* Drop initial reference. */ 2361 /* Drop initial reference. */
2343 module_put(mod); 2362 module_put(mod);
2344 unwind_remove_table(mod->unwind_info, 1);
2345 module_free(mod, mod->module_init); 2363 module_free(mod, mod->module_init);
2346 mod->module_init = NULL; 2364 mod->module_init = NULL;
2347 mod->init_size = 0; 2365 mod->init_size = 0;
@@ -2376,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod,
2376 unsigned long nextval; 2394 unsigned long nextval;
2377 2395
2378 /* At worse, next value is at end of module */ 2396 /* At worse, next value is at end of module */
2379 if (within(addr, mod->module_init, mod->init_size)) 2397 if (within_module_init(addr, mod))
2380 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2398 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2381 else 2399 else
2382 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2400 nextval = (unsigned long)mod->module_core+mod->core_text_size;
@@ -2424,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr,
2424 2442
2425 preempt_disable(); 2443 preempt_disable();
2426 list_for_each_entry_rcu(mod, &modules, list) { 2444 list_for_each_entry_rcu(mod, &modules, list) {
2427 if (within(addr, mod->module_init, mod->init_size) 2445 if (within_module_init(addr, mod) ||
2428 || within(addr, mod->module_core, mod->core_size)) { 2446 within_module_core(addr, mod)) {
2429 if (modname) 2447 if (modname)
2430 *modname = mod->name; 2448 *modname = mod->name;
2431 ret = get_ksymbol(mod, addr, size, offset); 2449 ret = get_ksymbol(mod, addr, size, offset);
@@ -2447,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2447 2465
2448 preempt_disable(); 2466 preempt_disable();
2449 list_for_each_entry_rcu(mod, &modules, list) { 2467 list_for_each_entry_rcu(mod, &modules, list) {
2450 if (within(addr, mod->module_init, mod->init_size) || 2468 if (within_module_init(addr, mod) ||
2451 within(addr, mod->module_core, mod->core_size)) { 2469 within_module_core(addr, mod)) {
2452 const char *sym; 2470 const char *sym;
2453 2471
2454 sym = get_ksymbol(mod, addr, NULL, NULL); 2472 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -2471,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2471 2489
2472 preempt_disable(); 2490 preempt_disable();
2473 list_for_each_entry_rcu(mod, &modules, list) { 2491 list_for_each_entry_rcu(mod, &modules, list) {
2474 if (within(addr, mod->module_init, mod->init_size) || 2492 if (within_module_init(addr, mod) ||
2475 within(addr, mod->module_core, mod->core_size)) { 2493 within_module_core(addr, mod)) {
2476 const char *sym; 2494 const char *sym;
2477 2495
2478 sym = get_ksymbol(mod, addr, size, offset); 2496 sym = get_ksymbol(mod, addr, size, offset);
@@ -2504,7 +2522,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2504 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2522 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2505 KSYM_NAME_LEN); 2523 KSYM_NAME_LEN);
2506 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 2524 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2507 *exported = is_exported(name, mod); 2525 *exported = is_exported(name, *value, mod);
2508 preempt_enable(); 2526 preempt_enable();
2509 return 0; 2527 return 0;
2510 } 2528 }
@@ -2691,7 +2709,7 @@ int is_module_address(unsigned long addr)
2691 preempt_disable(); 2709 preempt_disable();
2692 2710
2693 list_for_each_entry_rcu(mod, &modules, list) { 2711 list_for_each_entry_rcu(mod, &modules, list) {
2694 if (within(addr, mod->module_core, mod->core_size)) { 2712 if (within_module_core(addr, mod)) {
2695 preempt_enable(); 2713 preempt_enable();
2696 return 1; 2714 return 1;
2697 } 2715 }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54d..78bc3fdac0d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
13 13
14struct ns_cgroup { 14struct ns_cgroup {
15 struct cgroup_subsys_state css; 15 struct cgroup_subsys_state css;
16 spinlock_t lock;
17}; 16};
18 17
19struct cgroup_subsys ns_subsys; 18struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
84 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
85 if (!ns_cgroup) 84 if (!ns_cgroup)
86 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
87 spin_lock_init(&ns_cgroup->lock);
88 return &ns_cgroup->css; 86 return &ns_cgroup->css;
89} 87}
90 88
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d5088355bfe..2a2ff36ff44d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26static unsigned long tainted_mask; 27static unsigned long tainted_mask;
@@ -298,6 +299,8 @@ static int init_oops_id(void)
298{ 299{
299 if (!oops_id) 300 if (!oops_id)
300 get_random_bytes(&oops_id, sizeof(oops_id)); 301 get_random_bytes(&oops_id, sizeof(oops_id));
302 else
303 oops_id++;
301 304
302 return 0; 305 return 0;
303} 306}
@@ -321,36 +324,27 @@ void oops_exit(void)
321} 324}
322 325
323#ifdef WANT_WARN_ON_SLOWPATH 326#ifdef WANT_WARN_ON_SLOWPATH
324void warn_on_slowpath(const char *file, int line)
325{
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long) __builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 print_modules();
334 dump_stack();
335 print_oops_end_marker();
336 add_taint(TAINT_WARN);
337}
338EXPORT_SYMBOL(warn_on_slowpath);
339
340
341void warn_slowpath(const char *file, int line, const char *fmt, ...) 327void warn_slowpath(const char *file, int line, const char *fmt, ...)
342{ 328{
343 va_list args; 329 va_list args;
344 char function[KSYM_SYMBOL_LEN]; 330 char function[KSYM_SYMBOL_LEN];
345 unsigned long caller = (unsigned long)__builtin_return_address(0); 331 unsigned long caller = (unsigned long)__builtin_return_address(0);
332 const char *board;
333
346 sprint_symbol(function, caller); 334 sprint_symbol(function, caller);
347 335
348 printk(KERN_WARNING "------------[ cut here ]------------\n"); 336 printk(KERN_WARNING "------------[ cut here ]------------\n");
349 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 337 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
350 line, function); 338 line, function);
351 va_start(args, fmt); 339 board = dmi_get_system_info(DMI_PRODUCT_NAME);
352 vprintk(fmt, args); 340 if (board)
353 va_end(args); 341 printk(KERN_WARNING "Hardware name: %s\n", board);
342
343 if (fmt) {
344 va_start(args, fmt);
345 vprintk(fmt, args);
346 va_end(args);
347 }
354 348
355 print_modules(); 349 print_modules();
356 dump_stack(); 350 dump_stack();
diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76afa507..1b3586fe753a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
474} 474}
475EXPORT_SYMBOL(task_session_nr_ns); 475EXPORT_SYMBOL(task_session_nr_ns);
476 476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{
479 return ns_of_pid(task_pid(tsk));
480}
481EXPORT_SYMBOL_GPL(task_active_pid_ns);
482
477/* 483/*
478 * Used by proc to find the first pid that is greater then or equal to nr. 484 * Used by proc to find the first pid that is greater than or equal to nr.
479 * 485 *
480 * If there is a pid at nr this function is exactly the same as find_pid_ns. 486 * If there is a pid at nr this function is exactly the same as find_pid_ns.
481 */ 487 */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a831de..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index a140e44eebba..887c63787de6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -116,7 +116,7 @@ static DEFINE_SPINLOCK(idr_lock);
116 * must supply functions here, even if the function just returns 116 * must supply functions here, even if the function just returns
117 * ENOSYS. The standard POSIX timer management code assumes the 117 * ENOSYS. The standard POSIX timer management code assumes the
118 * following: 1.) The k_itimer struct (sched.h) is used for the 118 * following: 1.) The k_itimer struct (sched.h) is used for the
119 * timer. 2.) The list, it_lock, it_clock, it_id and it_process 119 * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
120 * fields are not modified by timer code. 120 * fields are not modified by timer code.
121 * 121 *
122 * At this time all functions EXCEPT clock_nanosleep can be 122 * At this time all functions EXCEPT clock_nanosleep can be
@@ -319,7 +319,8 @@ void do_schedule_next_timer(struct siginfo *info)
319 319
320int posix_timer_event(struct k_itimer *timr, int si_private) 320int posix_timer_event(struct k_itimer *timr, int si_private)
321{ 321{
322 int shared, ret; 322 struct task_struct *task;
323 int shared, ret = -1;
323 /* 324 /*
324 * FIXME: if ->sigq is queued we can race with 325 * FIXME: if ->sigq is queued we can race with
325 * dequeue_signal()->do_schedule_next_timer(). 326 * dequeue_signal()->do_schedule_next_timer().
@@ -333,8 +334,13 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
333 */ 334 */
334 timr->sigq->info.si_sys_private = si_private; 335 timr->sigq->info.si_sys_private = si_private;
335 336
336 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); 337 rcu_read_lock();
337 ret = send_sigqueue(timr->sigq, timr->it_process, shared); 338 task = pid_task(timr->it_pid, PIDTYPE_PID);
339 if (task) {
340 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
341 ret = send_sigqueue(timr->sigq, task, shared);
342 }
343 rcu_read_unlock();
338 /* If we failed to send the signal the timer stops. */ 344 /* If we failed to send the signal the timer stops. */
339 return ret > 0; 345 return ret > 0;
340} 346}
@@ -411,7 +417,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
411 return ret; 417 return ret;
412} 418}
413 419
414static struct task_struct * good_sigevent(sigevent_t * event) 420static struct pid *good_sigevent(sigevent_t * event)
415{ 421{
416 struct task_struct *rtn = current->group_leader; 422 struct task_struct *rtn = current->group_leader;
417 423
@@ -425,7 +431,7 @@ static struct task_struct * good_sigevent(sigevent_t * event)
425 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) 431 ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX)))
426 return NULL; 432 return NULL;
427 433
428 return rtn; 434 return task_pid(rtn);
429} 435}
430 436
431void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock) 437void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
@@ -464,6 +470,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
464 idr_remove(&posix_timers_id, tmr->it_id); 470 idr_remove(&posix_timers_id, tmr->it_id);
465 spin_unlock_irqrestore(&idr_lock, flags); 471 spin_unlock_irqrestore(&idr_lock, flags);
466 } 472 }
473 put_pid(tmr->it_pid);
467 sigqueue_free(tmr->sigq); 474 sigqueue_free(tmr->sigq);
468 kmem_cache_free(posix_timers_cache, tmr); 475 kmem_cache_free(posix_timers_cache, tmr);
469} 476}
@@ -477,7 +484,6 @@ sys_timer_create(const clockid_t which_clock,
477{ 484{
478 struct k_itimer *new_timer; 485 struct k_itimer *new_timer;
479 int error, new_timer_id; 486 int error, new_timer_id;
480 struct task_struct *process;
481 sigevent_t event; 487 sigevent_t event;
482 int it_id_set = IT_ID_NOT_SET; 488 int it_id_set = IT_ID_NOT_SET;
483 489
@@ -531,11 +537,9 @@ sys_timer_create(const clockid_t which_clock,
531 goto out; 537 goto out;
532 } 538 }
533 rcu_read_lock(); 539 rcu_read_lock();
534 process = good_sigevent(&event); 540 new_timer->it_pid = get_pid(good_sigevent(&event));
535 if (process)
536 get_task_struct(process);
537 rcu_read_unlock(); 541 rcu_read_unlock();
538 if (!process) { 542 if (!new_timer->it_pid) {
539 error = -EINVAL; 543 error = -EINVAL;
540 goto out; 544 goto out;
541 } 545 }
@@ -543,8 +547,7 @@ sys_timer_create(const clockid_t which_clock,
543 event.sigev_notify = SIGEV_SIGNAL; 547 event.sigev_notify = SIGEV_SIGNAL;
544 event.sigev_signo = SIGALRM; 548 event.sigev_signo = SIGALRM;
545 event.sigev_value.sival_int = new_timer->it_id; 549 event.sigev_value.sival_int = new_timer->it_id;
546 process = current->group_leader; 550 new_timer->it_pid = get_pid(task_tgid(current));
547 get_task_struct(process);
548 } 551 }
549 552
550 new_timer->it_sigev_notify = event.sigev_notify; 553 new_timer->it_sigev_notify = event.sigev_notify;
@@ -554,7 +557,7 @@ sys_timer_create(const clockid_t which_clock,
554 new_timer->sigq->info.si_code = SI_TIMER; 557 new_timer->sigq->info.si_code = SI_TIMER;
555 558
556 spin_lock_irq(&current->sighand->siglock); 559 spin_lock_irq(&current->sighand->siglock);
557 new_timer->it_process = process; 560 new_timer->it_signal = current->signal;
558 list_add(&new_timer->list, &current->signal->posix_timers); 561 list_add(&new_timer->list, &current->signal->posix_timers);
559 spin_unlock_irq(&current->sighand->siglock); 562 spin_unlock_irq(&current->sighand->siglock);
560 563
@@ -589,8 +592,7 @@ static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
589 timr = idr_find(&posix_timers_id, (int)timer_id); 592 timr = idr_find(&posix_timers_id, (int)timer_id);
590 if (timr) { 593 if (timr) {
591 spin_lock(&timr->it_lock); 594 spin_lock(&timr->it_lock);
592 if (timr->it_process && 595 if (timr->it_signal == current->signal) {
593 same_thread_group(timr->it_process, current)) {
594 spin_unlock(&idr_lock); 596 spin_unlock(&idr_lock);
595 return timr; 597 return timr;
596 } 598 }
@@ -837,8 +839,7 @@ retry_delete:
837 * This keeps any tasks waiting on the spin lock from thinking 839 * This keeps any tasks waiting on the spin lock from thinking
838 * they got something (see the lock code above). 840 * they got something (see the lock code above).
839 */ 841 */
840 put_task_struct(timer->it_process); 842 timer->it_signal = NULL;
841 timer->it_process = NULL;
842 843
843 unlock_timer(timer, flags); 844 unlock_timer(timer, flags);
844 release_posix_timer(timer, IT_ID_SET); 845 release_posix_timer(timer, IT_ID_SET);
@@ -864,8 +865,7 @@ retry_delete:
864 * This keeps any tasks waiting on the spin lock from thinking 865 * This keeps any tasks waiting on the spin lock from thinking
865 * they got something (see the lock code above). 866 * they got something (see the lock code above).
866 */ 867 */
867 put_task_struct(timer->it_process); 868 timer->it_signal = NULL;
868 timer->it_process = NULL;
869 869
870 unlock_timer(timer, flags); 870 unlock_timer(timer, flags);
871 release_posix_timer(timer, IT_ID_SET); 871 release_posix_timer(timer, IT_ID_SET);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f77d3819ef57..45e8541ab7e3 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode)
258{ 258{
259 int error; 259 int error;
260 260
261 /* Free memory before shutting down devices. */ 261 error = platform_begin(platform_mode);
262 error = swsusp_shrink_memory();
263 if (error) 262 if (error)
264 return error; 263 return error;
265 264
266 error = platform_begin(platform_mode); 265 /* Free memory before shutting down devices. */
266 error = swsusp_shrink_memory();
267 if (error) 267 if (error)
268 goto Close; 268 goto Close;
269 269
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 613f16941b85..239988873971 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
615 /* this may fail if the RTC hasn't been initialized */ 615 /* this may fail if the RTC hasn't been initialized */
616 status = rtc_read_time(rtc, &alm.time); 616 status = rtc_read_time(rtc, &alm.time);
617 if (status < 0) { 617 if (status < 0) {
618 printk(err_readtime, rtc->dev.bus_id, status); 618 printk(err_readtime, dev_name(&rtc->dev), status);
619 return; 619 return;
620 } 620 }
621 rtc_tm_to_time(&alm.time, &now); 621 rtc_tm_to_time(&alm.time, &now);
@@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
626 626
627 status = rtc_set_alarm(rtc, &alm); 627 status = rtc_set_alarm(rtc, &alm);
628 if (status < 0) { 628 if (status < 0) {
629 printk(err_wakealarm, rtc->dev.bus_id, status); 629 printk(err_wakealarm, dev_name(&rtc->dev), status);
630 return; 630 return;
631 } 631 }
632 632
@@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
660 if (!device_may_wakeup(candidate->dev.parent)) 660 if (!device_may_wakeup(candidate->dev.parent))
661 return 0; 661 return 0;
662 662
663 *(char **)name_ptr = dev->bus_id; 663 *(const char **)name_ptr = dev_name(dev);
664 return 1; 664 return 1;
665} 665}
666 666
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f051477..97890831e1b5 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
31} 31}
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5d2ab836e998..f5fc2d7680f2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -25,6 +25,7 @@
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/mmu_context.h> 31#include <asm/mmu_context.h>
@@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
192 return ret; 193 return ret;
193} 194}
194 195
195static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
196{
197 free_list_of_pages(ca->chain, clear_page_nosave);
198 memset(ca, 0, sizeof(struct chain_allocator));
199}
200
201/** 196/**
202 * Data types related to memory bitmaps. 197 * Data types related to memory bitmaps.
203 * 198 *
@@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
233#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 228#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
234 229
235struct bm_block { 230struct bm_block {
236 struct bm_block *next; /* next element of the list */ 231 struct list_head hook; /* hook into a list of bitmap blocks */
237 unsigned long start_pfn; /* pfn represented by the first bit */ 232 unsigned long start_pfn; /* pfn represented by the first bit */
238 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 233 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
239 unsigned long *data; /* bitmap representing pages */ 234 unsigned long *data; /* bitmap representing pages */
@@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb)
244 return bb->end_pfn - bb->start_pfn; 239 return bb->end_pfn - bb->start_pfn;
245} 240}
246 241
247struct zone_bitmap {
248 struct zone_bitmap *next; /* next element of the list */
249 unsigned long start_pfn; /* minimal pfn in this zone */
250 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
251 struct bm_block *bm_blocks; /* list of bitmap blocks */
252 struct bm_block *cur_block; /* recently used bitmap block */
253};
254
255/* strcut bm_position is used for browsing memory bitmaps */ 242/* strcut bm_position is used for browsing memory bitmaps */
256 243
257struct bm_position { 244struct bm_position {
258 struct zone_bitmap *zone_bm;
259 struct bm_block *block; 245 struct bm_block *block;
260 int bit; 246 int bit;
261}; 247};
262 248
263struct memory_bitmap { 249struct memory_bitmap {
264 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ 250 struct list_head blocks; /* list of bitmap blocks */
265 struct linked_page *p_list; /* list of pages used to store zone 251 struct linked_page *p_list; /* list of pages used to store zone
266 * bitmap objects and bitmap block 252 * bitmap objects and bitmap block
267 * objects 253 * objects
@@ -273,11 +259,7 @@ struct memory_bitmap {
273 259
274static void memory_bm_position_reset(struct memory_bitmap *bm) 260static void memory_bm_position_reset(struct memory_bitmap *bm)
275{ 261{
276 struct zone_bitmap *zone_bm; 262 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
277
278 zone_bm = bm->zone_bm_list;
279 bm->cur.zone_bm = zone_bm;
280 bm->cur.block = zone_bm->bm_blocks;
281 bm->cur.bit = 0; 263 bm->cur.bit = 0;
282} 264}
283 265
@@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
285 267
286/** 268/**
287 * create_bm_block_list - create a list of block bitmap objects 269 * create_bm_block_list - create a list of block bitmap objects
270 * @nr_blocks - number of blocks to allocate
271 * @list - list to put the allocated blocks into
272 * @ca - chain allocator to be used for allocating memory
288 */ 273 */
289 274static int create_bm_block_list(unsigned long pages,
290static inline struct bm_block * 275 struct list_head *list,
291create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) 276 struct chain_allocator *ca)
292{ 277{
293 struct bm_block *bblist = NULL; 278 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
294 279
295 while (nr_blocks-- > 0) { 280 while (nr_blocks-- > 0) {
296 struct bm_block *bb; 281 struct bm_block *bb;
297 282
298 bb = chain_alloc(ca, sizeof(struct bm_block)); 283 bb = chain_alloc(ca, sizeof(struct bm_block));
299 if (!bb) 284 if (!bb)
300 return NULL; 285 return -ENOMEM;
301 286 list_add(&bb->hook, list);
302 bb->next = bblist;
303 bblist = bb;
304 } 287 }
305 return bblist; 288
289 return 0;
306} 290}
307 291
292struct mem_extent {
293 struct list_head hook;
294 unsigned long start;
295 unsigned long end;
296};
297
308/** 298/**
309 * create_zone_bm_list - create a list of zone bitmap objects 299 * free_mem_extents - free a list of memory extents
300 * @list - list of extents to empty
310 */ 301 */
302static void free_mem_extents(struct list_head *list)
303{
304 struct mem_extent *ext, *aux;
311 305
312static inline struct zone_bitmap * 306 list_for_each_entry_safe(ext, aux, list, hook) {
313create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) 307 list_del(&ext->hook);
308 kfree(ext);
309 }
310}
311
312/**
313 * create_mem_extents - create a list of memory extents representing
314 * contiguous ranges of PFNs
315 * @list - list to put the extents into
316 * @gfp_mask - mask to use for memory allocations
317 */
318static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
314{ 319{
315 struct zone_bitmap *zbmlist = NULL; 320 struct zone *zone;
316 321
317 while (nr_zones-- > 0) { 322 INIT_LIST_HEAD(list);
318 struct zone_bitmap *zbm;
319 323
320 zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); 324 for_each_zone(zone) {
321 if (!zbm) 325 unsigned long zone_start, zone_end;
322 return NULL; 326 struct mem_extent *ext, *cur, *aux;
327
328 if (!populated_zone(zone))
329 continue;
323 330
324 zbm->next = zbmlist; 331 zone_start = zone->zone_start_pfn;
325 zbmlist = zbm; 332 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333
334 list_for_each_entry(ext, list, hook)
335 if (zone_start <= ext->end)
336 break;
337
338 if (&ext->hook == list || zone_end < ext->start) {
339 /* New extent is necessary */
340 struct mem_extent *new_ext;
341
342 new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
343 if (!new_ext) {
344 free_mem_extents(list);
345 return -ENOMEM;
346 }
347 new_ext->start = zone_start;
348 new_ext->end = zone_end;
349 list_add_tail(&new_ext->hook, &ext->hook);
350 continue;
351 }
352
353 /* Merge this zone's range of PFNs with the existing one */
354 if (zone_start < ext->start)
355 ext->start = zone_start;
356 if (zone_end > ext->end)
357 ext->end = zone_end;
358
359 /* More merging may be possible */
360 cur = ext;
361 list_for_each_entry_safe_continue(cur, aux, list, hook) {
362 if (zone_end < cur->start)
363 break;
364 if (zone_end < cur->end)
365 ext->end = cur->end;
366 list_del(&cur->hook);
367 kfree(cur);
368 }
326 } 369 }
327 return zbmlist; 370
371 return 0;
328} 372}
329 373
330/** 374/**
331 * memory_bm_create - allocate memory for a memory bitmap 375 * memory_bm_create - allocate memory for a memory bitmap
332 */ 376 */
333
334static int 377static int
335memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 378memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
336{ 379{
337 struct chain_allocator ca; 380 struct chain_allocator ca;
338 struct zone *zone; 381 struct list_head mem_extents;
339 struct zone_bitmap *zone_bm; 382 struct mem_extent *ext;
340 struct bm_block *bb; 383 int error;
341 unsigned int nr;
342 384
343 chain_init(&ca, gfp_mask, safe_needed); 385 chain_init(&ca, gfp_mask, safe_needed);
386 INIT_LIST_HEAD(&bm->blocks);
344 387
345 /* Compute the number of zones */ 388 error = create_mem_extents(&mem_extents, gfp_mask);
346 nr = 0; 389 if (error)
347 for_each_zone(zone) 390 return error;
348 if (populated_zone(zone))
349 nr++;
350
351 /* Allocate the list of zones bitmap objects */
352 zone_bm = create_zone_bm_list(nr, &ca);
353 bm->zone_bm_list = zone_bm;
354 if (!zone_bm) {
355 chain_free(&ca, PG_UNSAFE_CLEAR);
356 return -ENOMEM;
357 }
358
359 /* Initialize the zone bitmap objects */
360 for_each_zone(zone) {
361 unsigned long pfn;
362 391
363 if (!populated_zone(zone)) 392 list_for_each_entry(ext, &mem_extents, hook) {
364 continue; 393 struct bm_block *bb;
394 unsigned long pfn = ext->start;
395 unsigned long pages = ext->end - ext->start;
365 396
366 zone_bm->start_pfn = zone->zone_start_pfn; 397 bb = list_entry(bm->blocks.prev, struct bm_block, hook);
367 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 /* Allocate the list of bitmap block objects */
369 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
370 bb = create_bm_block_list(nr, &ca);
371 zone_bm->bm_blocks = bb;
372 zone_bm->cur_block = bb;
373 if (!bb)
374 goto Free;
375 398
376 nr = zone->spanned_pages; 399 error = create_bm_block_list(pages, bm->blocks.prev, &ca);
377 pfn = zone->zone_start_pfn; 400 if (error)
378 /* Initialize the bitmap block objects */ 401 goto Error;
379 while (bb) {
380 unsigned long *ptr;
381 402
382 ptr = get_image_page(gfp_mask, safe_needed); 403 list_for_each_entry_continue(bb, &bm->blocks, hook) {
383 bb->data = ptr; 404 bb->data = get_image_page(gfp_mask, safe_needed);
384 if (!ptr) 405 if (!bb->data) {
385 goto Free; 406 error = -ENOMEM;
407 goto Error;
408 }
386 409
387 bb->start_pfn = pfn; 410 bb->start_pfn = pfn;
388 if (nr >= BM_BITS_PER_BLOCK) { 411 if (pages >= BM_BITS_PER_BLOCK) {
389 pfn += BM_BITS_PER_BLOCK; 412 pfn += BM_BITS_PER_BLOCK;
390 nr -= BM_BITS_PER_BLOCK; 413 pages -= BM_BITS_PER_BLOCK;
391 } else { 414 } else {
392 /* This is executed only once in the loop */ 415 /* This is executed only once in the loop */
393 pfn += nr; 416 pfn += pages;
394 } 417 }
395 bb->end_pfn = pfn; 418 bb->end_pfn = pfn;
396 bb = bb->next;
397 } 419 }
398 zone_bm = zone_bm->next;
399 } 420 }
421
400 bm->p_list = ca.chain; 422 bm->p_list = ca.chain;
401 memory_bm_position_reset(bm); 423 memory_bm_position_reset(bm);
402 return 0; 424 Exit:
425 free_mem_extents(&mem_extents);
426 return error;
403 427
404 Free: 428 Error:
405 bm->p_list = ca.chain; 429 bm->p_list = ca.chain;
406 memory_bm_free(bm, PG_UNSAFE_CLEAR); 430 memory_bm_free(bm, PG_UNSAFE_CLEAR);
407 return -ENOMEM; 431 goto Exit;
408} 432}
409 433
410/** 434/**
411 * memory_bm_free - free memory occupied by the memory bitmap @bm 435 * memory_bm_free - free memory occupied by the memory bitmap @bm
412 */ 436 */
413
414static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 437static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
415{ 438{
416 struct zone_bitmap *zone_bm; 439 struct bm_block *bb;
417 440
418 /* Free the list of bit blocks for each zone_bitmap object */ 441 list_for_each_entry(bb, &bm->blocks, hook)
419 zone_bm = bm->zone_bm_list; 442 if (bb->data)
420 while (zone_bm) { 443 free_image_page(bb->data, clear_nosave_free);
421 struct bm_block *bb;
422 444
423 bb = zone_bm->bm_blocks;
424 while (bb) {
425 if (bb->data)
426 free_image_page(bb->data, clear_nosave_free);
427 bb = bb->next;
428 }
429 zone_bm = zone_bm->next;
430 }
431 free_list_of_pages(bm->p_list, clear_nosave_free); 445 free_list_of_pages(bm->p_list, clear_nosave_free);
432 bm->zone_bm_list = NULL; 446
447 INIT_LIST_HEAD(&bm->blocks);
433} 448}
434 449
435/** 450/**
@@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
437 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 452 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
438 * of @bm->cur_zone_bm are updated. 453 * of @bm->cur_zone_bm are updated.
439 */ 454 */
440
441static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 455static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
442 void **addr, unsigned int *bit_nr) 456 void **addr, unsigned int *bit_nr)
443{ 457{
444 struct zone_bitmap *zone_bm;
445 struct bm_block *bb; 458 struct bm_block *bb;
446 459
447 /* Check if the pfn is from the current zone */ 460 /*
448 zone_bm = bm->cur.zone_bm; 461 * Check if the pfn corresponds to the current bitmap block and find
449 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { 462 * the block where it fits if this is not the case.
450 zone_bm = bm->zone_bm_list; 463 */
451 /* We don't assume that the zones are sorted by pfns */ 464 bb = bm->cur.block;
452 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
453 zone_bm = zone_bm->next;
454
455 if (!zone_bm)
456 return -EFAULT;
457 }
458 bm->cur.zone_bm = zone_bm;
459 }
460 /* Check if the pfn corresponds to the current bitmap block */
461 bb = zone_bm->cur_block;
462 if (pfn < bb->start_pfn) 465 if (pfn < bb->start_pfn)
463 bb = zone_bm->bm_blocks; 466 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
467 if (pfn >= bb->start_pfn)
468 break;
464 469
465 while (pfn >= bb->end_pfn) { 470 if (pfn >= bb->end_pfn)
466 bb = bb->next; 471 list_for_each_entry_continue(bb, &bm->blocks, hook)
472 if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
473 break;
467 474
468 BUG_ON(!bb); 475 if (&bb->hook == &bm->blocks)
469 } 476 return -EFAULT;
470 zone_bm->cur_block = bb; 477
478 /* The block has been found */
479 bm->cur.block = bb;
471 pfn -= bb->start_pfn; 480 pfn -= bb->start_pfn;
481 bm->cur.bit = pfn + 1;
472 *bit_nr = pfn; 482 *bit_nr = pfn;
473 *addr = bb->data; 483 *addr = bb->data;
474 return 0; 484 return 0;
@@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
519 return test_bit(bit, addr); 529 return test_bit(bit, addr);
520} 530}
521 531
532static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
533{
534 void *addr;
535 unsigned int bit;
536
537 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
538}
539
522/** 540/**
523 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 541 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
524 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 542 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
530 548
531static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 549static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
532{ 550{
533 struct zone_bitmap *zone_bm;
534 struct bm_block *bb; 551 struct bm_block *bb;
535 int bit; 552 int bit;
536 553
554 bb = bm->cur.block;
537 do { 555 do {
538 bb = bm->cur.block; 556 bit = bm->cur.bit;
539 do { 557 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
540 bit = bm->cur.bit; 558 if (bit < bm_block_bits(bb))
541 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 559 goto Return_pfn;
542 if (bit < bm_block_bits(bb)) 560
543 goto Return_pfn; 561 bb = list_entry(bb->hook.next, struct bm_block, hook);
544 562 bm->cur.block = bb;
545 bb = bb->next; 563 bm->cur.bit = 0;
546 bm->cur.block = bb; 564 } while (&bb->hook != &bm->blocks);
547 bm->cur.bit = 0; 565
548 } while (bb);
549 zone_bm = bm->cur.zone_bm->next;
550 if (zone_bm) {
551 bm->cur.zone_bm = zone_bm;
552 bm->cur.block = zone_bm->bm_blocks;
553 bm->cur.bit = 0;
554 }
555 } while (zone_bm);
556 memory_bm_position_reset(bm); 566 memory_bm_position_reset(bm);
557 return BM_END_OF_MAP; 567 return BM_END_OF_MAP;
558 568
@@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void)
808 * We should save the page if it isn't Nosave or NosaveFree, or Reserved, 818 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
809 * and it isn't a part of a free chunk of pages. 819 * and it isn't a part of a free chunk of pages.
810 */ 820 */
811 821static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
812static struct page *saveable_highmem_page(unsigned long pfn)
813{ 822{
814 struct page *page; 823 struct page *page;
815 824
@@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
817 return NULL; 826 return NULL;
818 827
819 page = pfn_to_page(pfn); 828 page = pfn_to_page(pfn);
829 if (page_zone(page) != zone)
830 return NULL;
820 831
821 BUG_ON(!PageHighMem(page)); 832 BUG_ON(!PageHighMem(page));
822 833
@@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void)
846 mark_free_pages(zone); 857 mark_free_pages(zone);
847 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 858 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
848 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 859 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
849 if (saveable_highmem_page(pfn)) 860 if (saveable_highmem_page(zone, pfn))
850 n++; 861 n++;
851 } 862 }
852 return n; 863 return n;
853} 864}
854#else 865#else
855static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } 866static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
867{
868 return NULL;
869}
856#endif /* CONFIG_HIGHMEM */ 870#endif /* CONFIG_HIGHMEM */
857 871
858/** 872/**
@@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
863 * of pages statically defined as 'unsaveable', and it isn't a part of 877 * of pages statically defined as 'unsaveable', and it isn't a part of
864 * a free chunk of pages. 878 * a free chunk of pages.
865 */ 879 */
866 880static struct page *saveable_page(struct zone *zone, unsigned long pfn)
867static struct page *saveable_page(unsigned long pfn)
868{ 881{
869 struct page *page; 882 struct page *page;
870 883
@@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn)
872 return NULL; 885 return NULL;
873 886
874 page = pfn_to_page(pfn); 887 page = pfn_to_page(pfn);
888 if (page_zone(page) != zone)
889 return NULL;
875 890
876 BUG_ON(PageHighMem(page)); 891 BUG_ON(PageHighMem(page));
877 892
@@ -903,7 +918,7 @@ unsigned int count_data_pages(void)
903 mark_free_pages(zone); 918 mark_free_pages(zone);
904 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 919 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
905 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 920 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
906 if(saveable_page(pfn)) 921 if (saveable_page(zone, pfn))
907 n++; 922 n++;
908 } 923 }
909 return n; 924 return n;
@@ -944,7 +959,7 @@ static inline struct page *
944page_is_saveable(struct zone *zone, unsigned long pfn) 959page_is_saveable(struct zone *zone, unsigned long pfn)
945{ 960{
946 return is_highmem(zone) ? 961 return is_highmem(zone) ?
947 saveable_highmem_page(pfn) : saveable_page(pfn); 962 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
948} 963}
949 964
950static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 965static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
@@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
966 * data modified by kmap_atomic() 981 * data modified by kmap_atomic()
967 */ 982 */
968 safe_copy_page(buffer, s_page); 983 safe_copy_page(buffer, s_page);
969 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); 984 dst = kmap_atomic(d_page, KM_USER0);
970 memcpy(dst, buffer, PAGE_SIZE); 985 memcpy(dst, buffer, PAGE_SIZE);
971 kunmap_atomic(dst, KM_USER0); 986 kunmap_atomic(dst, KM_USER0);
972 } else { 987 } else {
@@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
975 } 990 }
976} 991}
977#else 992#else
978#define page_is_saveable(zone, pfn) saveable_page(pfn) 993#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
979 994
980static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 995static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
981{ 996{
@@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info)
1459 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set 1474 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
1460 * the corresponding bit in the memory bitmap @bm 1475 * the corresponding bit in the memory bitmap @bm
1461 */ 1476 */
1462 1477static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1463static inline void
1464unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1465{ 1478{
1466 int j; 1479 int j;
1467 1480
@@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1469 if (unlikely(buf[j] == BM_END_OF_MAP)) 1482 if (unlikely(buf[j] == BM_END_OF_MAP))
1470 break; 1483 break;
1471 1484
1472 memory_bm_set_bit(bm, buf[j]); 1485 if (memory_bm_pfn_present(bm, buf[j]))
1486 memory_bm_set_bit(bm, buf[j]);
1487 else
1488 return -EFAULT;
1473 } 1489 }
1490
1491 return 0;
1474} 1492}
1475 1493
1476/* List of "safe" pages that may be used to store data loaded from the suspend 1494/* List of "safe" pages that may be used to store data loaded from the suspend
@@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1608 pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); 1626 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1609 if (!pbe) { 1627 if (!pbe) {
1610 swsusp_free(); 1628 swsusp_free();
1611 return NULL; 1629 return ERR_PTR(-ENOMEM);
1612 } 1630 }
1613 pbe->orig_page = page; 1631 pbe->orig_page = page;
1614 if (safe_highmem_pages > 0) { 1632 if (safe_highmem_pages > 0) {
@@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1677static inline void * 1695static inline void *
1678get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) 1696get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1679{ 1697{
1680 return NULL; 1698 return ERR_PTR(-EINVAL);
1681} 1699}
1682 1700
1683static inline void copy_last_highmem_page(void) {} 1701static inline void copy_last_highmem_page(void) {}
@@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1788static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) 1806static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1789{ 1807{
1790 struct pbe *pbe; 1808 struct pbe *pbe;
1791 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1809 struct page *page;
1810 unsigned long pfn = memory_bm_next_pfn(bm);
1792 1811
1812 if (pfn == BM_END_OF_MAP)
1813 return ERR_PTR(-EFAULT);
1814
1815 page = pfn_to_page(pfn);
1793 if (PageHighMem(page)) 1816 if (PageHighMem(page))
1794 return get_highmem_page_buffer(page, ca); 1817 return get_highmem_page_buffer(page, ca);
1795 1818
@@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1805 pbe = chain_alloc(ca, sizeof(struct pbe)); 1828 pbe = chain_alloc(ca, sizeof(struct pbe));
1806 if (!pbe) { 1829 if (!pbe) {
1807 swsusp_free(); 1830 swsusp_free();
1808 return NULL; 1831 return ERR_PTR(-ENOMEM);
1809 } 1832 }
1810 pbe->orig_address = page_address(page); 1833 pbe->orig_address = page_address(page);
1811 pbe->address = safe_pages_list; 1834 pbe->address = safe_pages_list;
@@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1868 return error; 1891 return error;
1869 1892
1870 } else if (handle->prev <= nr_meta_pages) { 1893 } else if (handle->prev <= nr_meta_pages) {
1871 unpack_orig_pfns(buffer, &copy_bm); 1894 error = unpack_orig_pfns(buffer, &copy_bm);
1895 if (error)
1896 return error;
1897
1872 if (handle->prev == nr_meta_pages) { 1898 if (handle->prev == nr_meta_pages) {
1873 error = prepare_image(&orig_bm, &copy_bm); 1899 error = prepare_image(&orig_bm, &copy_bm);
1874 if (error) 1900 if (error)
@@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1879 restore_pblist = NULL; 1905 restore_pblist = NULL;
1880 handle->buffer = get_buffer(&orig_bm, &ca); 1906 handle->buffer = get_buffer(&orig_bm, &ca);
1881 handle->sync_read = 0; 1907 handle->sync_read = 0;
1882 if (!handle->buffer) 1908 if (IS_ERR(handle->buffer))
1883 return -ENOMEM; 1909 return PTR_ERR(handle->buffer);
1884 } 1910 }
1885 } else { 1911 } else {
1886 copy_last_highmem_page(); 1912 copy_last_highmem_page();
1887 handle->buffer = get_buffer(&orig_bm, &ca); 1913 handle->buffer = get_buffer(&orig_bm, &ca);
1914 if (IS_ERR(handle->buffer))
1915 return PTR_ERR(handle->buffer);
1888 if (handle->buffer != buffer) 1916 if (handle->buffer != buffer)
1889 handle->sync_read = 0; 1917 handle->sync_read = 0;
1890 } 1918 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 023ff2a31d89..a92c91451559 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -262,3 +262,125 @@ int swsusp_shrink_memory(void)
262 262
263 return 0; 263 return 0;
264} 264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d77..7015733793e8 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
619static const char recursion_bug_msg [] = 619static const char recursion_bug_msg [] =
620 KERN_CRIT "BUG: recent printk recursion!\n"; 620 KERN_CRIT "BUG: recent printk recursion!\n";
621static int recursion_bug; 621static int recursion_bug;
622 static int new_text_line = 1; 622static int new_text_line = 1;
623static char printk_buf[1024]; 623static char printk_buf[1024];
624 624
625asmlinkage int vprintk(const char *fmt, va_list args) 625asmlinkage int vprintk(const char *fmt, va_list args)
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
662 if (recursion_bug) { 662 if (recursion_bug) {
663 recursion_bug = 0; 663 recursion_bug = 0;
664 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
665 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
diff --git a/kernel/profile.c b/kernel/profile.c
index 60adefb59b5e..784933acf5b8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift;
45int prof_on __read_mostly; 45int prof_on __read_mostly;
46EXPORT_SYMBOL_GPL(prof_on); 46EXPORT_SYMBOL_GPL(prof_on);
47 47
48static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 48static cpumask_var_t prof_cpu_mask;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
51static DEFINE_PER_CPU(int, cpu_profile_flip); 51static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -113,9 +113,13 @@ int __ref profile_init(void)
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
116 return 0; 117 return 0;
117 } 118 }
118 119
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM;
122
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer) 124 if (prof_buffer)
121 return 0; 125 return 0;
@@ -128,6 +132,7 @@ int __ref profile_init(void)
128 if (prof_buffer) 132 if (prof_buffer)
129 return 0; 133 return 0;
130 134
135 free_cpumask_var(prof_cpu_mask);
131 return -ENOMEM; 136 return -ENOMEM;
132} 137}
133 138
@@ -386,13 +391,15 @@ out_free:
386 return NOTIFY_BAD; 391 return NOTIFY_BAD;
387 case CPU_ONLINE: 392 case CPU_ONLINE:
388 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
389 cpu_set(cpu, prof_cpu_mask); 394 if (prof_cpu_mask != NULL)
395 cpumask_set_cpu(cpu, prof_cpu_mask);
390 break; 396 break;
391 case CPU_UP_CANCELED: 397 case CPU_UP_CANCELED:
392 case CPU_UP_CANCELED_FROZEN: 398 case CPU_UP_CANCELED_FROZEN:
393 case CPU_DEAD: 399 case CPU_DEAD:
394 case CPU_DEAD_FROZEN: 400 case CPU_DEAD_FROZEN:
395 cpu_clear(cpu, prof_cpu_mask); 401 if (prof_cpu_mask != NULL)
402 cpumask_clear_cpu(cpu, prof_cpu_mask);
396 if (per_cpu(cpu_profile_hits, cpu)[0]) { 403 if (per_cpu(cpu_profile_hits, cpu)[0]) {
397 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 404 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
398 per_cpu(cpu_profile_hits, cpu)[0] = NULL; 405 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
@@ -430,19 +437,19 @@ void profile_tick(int type)
430 437
431 if (type == CPU_PROFILING && timer_hook) 438 if (type == CPU_PROFILING && timer_hook)
432 timer_hook(regs); 439 timer_hook(regs);
433 if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) 440 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
434 profile_hit(type, (void *)profile_pc(regs)); 442 profile_hit(type, (void *)profile_pc(regs));
435} 443}
436 444
437#ifdef CONFIG_PROC_FS 445#ifdef CONFIG_PROC_FS
438#include <linux/proc_fs.h> 446#include <linux/proc_fs.h>
439#include <asm/uaccess.h> 447#include <asm/uaccess.h>
440#include <asm/ptrace.h>
441 448
442static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 449static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
443 int count, int *eof, void *data) 450 int count, int *eof, void *data)
444{ 451{
445 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 452 int len = cpumask_scnprintf(page, count, data);
446 if (count - len < 2) 453 if (count - len < 2)
447 return -EINVAL; 454 return -EINVAL;
448 len += sprintf(page + len, "\n"); 455 len += sprintf(page + len, "\n");
@@ -452,16 +459,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
452static int prof_cpu_mask_write_proc(struct file *file, 459static int prof_cpu_mask_write_proc(struct file *file,
453 const char __user *buffer, unsigned long count, void *data) 460 const char __user *buffer, unsigned long count, void *data)
454{ 461{
455 cpumask_t *mask = (cpumask_t *)data; 462 struct cpumask *mask = data;
456 unsigned long full_count = count, err; 463 unsigned long full_count = count, err;
457 cpumask_t new_value; 464 cpumask_var_t new_value;
458 465
459 err = cpumask_parse_user(buffer, count, new_value); 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
460 if (err) 467 return -ENOMEM;
461 return err;
462 468
463 *mask = new_value; 469 err = cpumask_parse_user(buffer, count, new_value);
464 return full_count; 470 if (!err) {
471 cpumask_copy(mask, new_value);
472 err = full_count;
473 }
474 free_cpumask_var(new_value);
475 return err;
465} 476}
466 477
467void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 478void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
@@ -472,7 +483,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
472 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 483 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
473 if (!entry) 484 if (!entry)
474 return; 485 return;
475 entry->data = (void *)&prof_cpu_mask; 486 entry->data = prof_cpu_mask;
476 entry->read_proc = prof_cpu_mask_read_proc; 487 entry->read_proc = prof_cpu_mask_read_proc;
477 entry->write_proc = prof_cpu_mask_write_proc; 488 entry->write_proc = prof_cpu_mask_write_proc;
478} 489}
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..490934fc7ac3 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
63 .completed = -300, 63 .completed = -300,
64 .pending = -300, 64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 69 .cur = -300,
70 .completed = -300, 70 .completed = -300,
71 .pending = -300, 71 .pending = -300,
72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
73 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_BITS_NONE,
74}; 74};
75 75
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
85 struct rcu_ctrlblk *rcp) 85 struct rcu_ctrlblk *rcp)
86{ 86{
87 int cpu; 87 int cpu;
88 cpumask_t cpumask;
89 unsigned long flags; 88 unsigned long flags;
90 89
91 set_need_resched(); 90 set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
96 * Don't send IPI to itself. With irqs disabled, 95 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu. 96 * rdp->cpu is the current cpu.
98 * 97 *
99 * cpu_online_map is updated by the _cpu_down() 98 * cpu_online_mask is updated by the _cpu_down()
100 * using __stop_machine(). Since we're in irqs disabled 99 * using __stop_machine(). Since we're in irqs disabled
101 * section, __stop_machine() is not exectuting, hence 100 * section, __stop_machine() is not exectuting, hence
102 * the cpu_online_map is stable. 101 * the cpu_online_mask is stable.
103 * 102 *
104 * However, a cpu might have been offlined _just_ before 103 * However, a cpu might have been offlined _just_ before
105 * we disabled irqs while entering here. 104 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
107 * notification, leading to the offlined cpu's bit 106 * notification, leading to the offlined cpu's bit
108 * being set in the rcp->cpumask. 107 * being set in the rcp->cpumask.
109 * 108 *
110 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent 109 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
111 * sending smp_reschedule() to an offlined CPU. 110 * sending smp_reschedule() to an offlined CPU.
112 */ 111 */
113 cpus_and(cpumask, rcp->cpumask, cpu_online_map); 112 for_each_cpu_and(cpu,
114 cpu_clear(rdp->cpu, cpumask); 113 to_cpumask(rcp->cpumask), cpu_online_mask) {
115 for_each_cpu_mask_nr(cpu, cpumask) 114 if (cpu != rdp->cpu)
116 smp_send_reschedule(cpu); 115 smp_send_reschedule(cpu);
116 }
117 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags); 118 spin_unlock_irqrestore(&rcp->lock, flags);
119} 119}
@@ -191,9 +191,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
198 } 198 }
199 printk(" (detected by %d, t=%ld jiffies)\n", 199 printk(" (detected by %d, t=%ld jiffies)\n",
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
221 long delta; 221 long delta;
222 222
223 delta = jiffies - rcp->jiffies_stall; 223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { 224 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
225 delta >= 0) {
225 226
226 /* We haven't checked in, so go dump stack. */ 227 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp); 228 print_cpu_stall(rcp);
@@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 394 * unnecessarily.
394 */ 395 */
395 smp_mb(); 396 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 397 cpumask_andnot(to_cpumask(rcp->cpumask),
398 cpu_online_mask, nohz_cpu_mask);
397 399
398 rcp->signaled = 0; 400 rcp->signaled = 0;
399 } 401 }
@@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
406 */ 408 */
407static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) 409static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
408{ 410{
409 cpu_clear(cpu, rcp->cpumask); 411 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
410 if (cpus_empty(rcp->cpumask)) { 412 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
411 /* batch completed ! */ 413 /* batch completed ! */
412 rcp->completed = rcp->cur; 414 rcp->completed = rcp->cur;
413 rcu_start_batch(rcp); 415 rcu_start_batch(rcp);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ad63af8b2521..d92a76a881aa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */ 80void synchronize_rcu(void)
81synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81{
82 struct rcu_synchronize rcu;
83 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu);
86 /* Wait for it. */
87 wait_for_completion(&rcu.completion);
88}
82EXPORT_SYMBOL_GPL(synchronize_rcu); 89EXPORT_SYMBOL_GPL(synchronize_rcu);
83 90
84static void rcu_barrier_callback(struct rcu_head *notused) 91static void rcu_barrier_callback(struct rcu_head *notused)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..33cfc50781f9 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
164 { "idle", "waitack", "waitzero", "waitmb" }; 164 { "idle", "waitack", "waitzero", "waitmb" };
165#endif /* #ifdef CONFIG_RCU_TRACE */ 165#endif /* #ifdef CONFIG_RCU_TRACE */
166 166
167static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; 167static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
168 = CPU_BITS_NONE;
168 169
169/* 170/*
170 * Enum and per-CPU flag to determine when each CPU has seen 171 * Enum and per-CPU flag to determine when each CPU has seen
@@ -551,6 +552,16 @@ void rcu_irq_exit(void)
551 } 552 }
552} 553}
553 554
555void rcu_nmi_enter(void)
556{
557 rcu_irq_enter();
558}
559
560void rcu_nmi_exit(void)
561{
562 rcu_irq_exit();
563}
564
554static void dyntick_save_progress_counter(int cpu) 565static void dyntick_save_progress_counter(int cpu)
555{ 566{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 567 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
@@ -748,7 +759,7 @@ rcu_try_flip_idle(void)
748 759
749 /* Now ask each CPU for acknowledgement of the flip. */ 760 /* Now ask each CPU for acknowledgement of the flip. */
750 761
751 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 762 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
752 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 763 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
753 dyntick_save_progress_counter(cpu); 764 dyntick_save_progress_counter(cpu);
754 } 765 }
@@ -766,7 +777,7 @@ rcu_try_flip_waitack(void)
766 int cpu; 777 int cpu;
767 778
768 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 779 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
769 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 780 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
770 if (rcu_try_flip_waitack_needed(cpu) && 781 if (rcu_try_flip_waitack_needed(cpu) &&
771 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 782 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
772 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 783 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -798,7 +809,7 @@ rcu_try_flip_waitzero(void)
798 /* Check to see if the sum of the "last" counters is zero. */ 809 /* Check to see if the sum of the "last" counters is zero. */
799 810
800 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 811 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
801 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 812 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
802 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 813 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
803 if (sum != 0) { 814 if (sum != 0) {
804 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 815 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -813,7 +824,7 @@ rcu_try_flip_waitzero(void)
813 smp_mb(); /* ^^^^^^^^^^^^ */ 824 smp_mb(); /* ^^^^^^^^^^^^ */
814 825
815 /* Call for a memory barrier from each CPU. */ 826 /* Call for a memory barrier from each CPU. */
816 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 827 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
817 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 828 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
818 dyntick_save_progress_counter(cpu); 829 dyntick_save_progress_counter(cpu);
819 } 830 }
@@ -833,7 +844,7 @@ rcu_try_flip_waitmb(void)
833 int cpu; 844 int cpu;
834 845
835 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 846 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
836 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 847 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
837 if (rcu_try_flip_waitmb_needed(cpu) && 848 if (rcu_try_flip_waitmb_needed(cpu) &&
838 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 849 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
839 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 850 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1022,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
1022 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; 1033 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1023 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; 1034 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1024 1035
1025 cpu_clear(cpu, rcu_cpu_online_map); 1036 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1026 1037
1027 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1038 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1028 1039
@@ -1062,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
1062 struct rcu_data *rdp; 1073 struct rcu_data *rdp;
1063 1074
1064 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1075 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1065 cpu_set(cpu, rcu_cpu_online_map); 1076 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1066 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1077 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1067 1078
1068 /* 1079 /*
@@ -1166,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
1166 * in -rt this does -not- necessarily result in all currently executing 1177 * in -rt this does -not- necessarily result in all currently executing
1167 * interrupt -handlers- having completed. 1178 * interrupt -handlers- having completed.
1168 */ 1179 */
1169synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) 1180void __synchronize_sched(void)
1181{
1182 struct rcu_synchronize rcu;
1183
1184 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1187 /* Wait for it. */
1188 wait_for_completion(&rcu.completion);
1189}
1170EXPORT_SYMBOL_GPL(__synchronize_sched); 1190EXPORT_SYMBOL_GPL(__synchronize_sched);
1171 1191
1172/* 1192/*
@@ -1420,7 +1440,7 @@ void __init __rcu_init(void)
1420 * We don't need protection against CPU-Hotplug here 1440 * We don't need protection against CPU-Hotplug here
1421 * since 1441 * since
1422 * a) If a CPU comes online while we are iterating over the 1442 * a) If a CPU comes online while we are iterating over the
1423 * cpu_online_map below, we would only end up making a 1443 * cpu_online_mask below, we would only end up making a
1424 * duplicate call to rcu_online_cpu() which sets the corresponding 1444 * duplicate call to rcu_online_cpu() which sets the corresponding
1425 * CPU's mask in the rcu_cpu_online_map. 1445 * CPU's mask in the rcu_cpu_online_map.
1426 * 1446 *
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a55..1cff28db56b6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,28 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 mutex_lock(&fullstop_mutex);
155 if (!fullstop)
156 fullstop = FULLSTOP_SHUTDOWN;
157 mutex_unlock(&fullstop_mutex);
158 return NOTIFY_DONE;
159}
160
139/* 161/*
140 * Allocate an element from the rcu_tortures pool. 162 * Allocate an element from the rcu_tortures pool.
141 */ 163 */
@@ -199,11 +221,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 221static void
200rcu_stutter_wait(void) 222rcu_stutter_wait(void)
201{ 223{
202 while (stutter_pause_test || !rcutorture_runnable) 224 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 225 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 226 schedule_timeout_interruptible(1);
205 else 227 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 228 schedule_timeout_interruptible(round_jiffies_relative(HZ));
229 }
207} 230}
208 231
209/* 232/*
@@ -599,7 +622,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 622 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 623 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 624 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 625 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
603 schedule_timeout_uninterruptible(1); 626 schedule_timeout_uninterruptible(1);
604 return 0; 627 return 0;
605} 628}
@@ -624,7 +647,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 647 } while (!kthread_should_stop() && !fullstop);
625 648
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 649 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 650 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
628 schedule_timeout_uninterruptible(1); 651 schedule_timeout_uninterruptible(1);
629 return 0; 652 return 0;
630} 653}
@@ -734,7 +757,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 757 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 759 del_timer_sync(&t);
737 while (!kthread_should_stop()) 760 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
738 schedule_timeout_uninterruptible(1); 761 schedule_timeout_uninterruptible(1);
739 return 0; 762 return 0;
740} 763}
@@ -831,7 +854,7 @@ rcu_torture_stats(void *arg)
831 do { 854 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 855 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 856 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 857 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 858 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 859 return 0;
837} 860}
@@ -843,49 +866,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
843 */ 866 */
844static void rcu_torture_shuffle_tasks(void) 867static void rcu_torture_shuffle_tasks(void)
845{ 868{
846 cpumask_t tmp_mask; 869 cpumask_var_t tmp_mask;
847 int i; 870 int i;
848 871
849 cpus_setall(tmp_mask); 872 if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
873 BUG();
874
875 cpumask_setall(tmp_mask);
850 get_online_cpus(); 876 get_online_cpus();
851 877
852 /* No point in shuffling if there is only one online CPU (ex: UP) */ 878 /* No point in shuffling if there is only one online CPU (ex: UP) */
853 if (num_online_cpus() == 1) { 879 if (num_online_cpus() == 1)
854 put_online_cpus(); 880 goto out;
855 return;
856 }
857 881
858 if (rcu_idle_cpu != -1) 882 if (rcu_idle_cpu != -1)
859 cpu_clear(rcu_idle_cpu, tmp_mask); 883 cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
860 884
861 set_cpus_allowed_ptr(current, &tmp_mask); 885 set_cpus_allowed_ptr(current, tmp_mask);
862 886
863 if (reader_tasks) { 887 if (reader_tasks) {
864 for (i = 0; i < nrealreaders; i++) 888 for (i = 0; i < nrealreaders; i++)
865 if (reader_tasks[i]) 889 if (reader_tasks[i])
866 set_cpus_allowed_ptr(reader_tasks[i], 890 set_cpus_allowed_ptr(reader_tasks[i],
867 &tmp_mask); 891 tmp_mask);
868 } 892 }
869 893
870 if (fakewriter_tasks) { 894 if (fakewriter_tasks) {
871 for (i = 0; i < nfakewriters; i++) 895 for (i = 0; i < nfakewriters; i++)
872 if (fakewriter_tasks[i]) 896 if (fakewriter_tasks[i])
873 set_cpus_allowed_ptr(fakewriter_tasks[i], 897 set_cpus_allowed_ptr(fakewriter_tasks[i],
874 &tmp_mask); 898 tmp_mask);
875 } 899 }
876 900
877 if (writer_task) 901 if (writer_task)
878 set_cpus_allowed_ptr(writer_task, &tmp_mask); 902 set_cpus_allowed_ptr(writer_task, tmp_mask);
879 903
880 if (stats_task) 904 if (stats_task)
881 set_cpus_allowed_ptr(stats_task, &tmp_mask); 905 set_cpus_allowed_ptr(stats_task, tmp_mask);
882 906
883 if (rcu_idle_cpu == -1) 907 if (rcu_idle_cpu == -1)
884 rcu_idle_cpu = num_online_cpus() - 1; 908 rcu_idle_cpu = num_online_cpus() - 1;
885 else 909 else
886 rcu_idle_cpu--; 910 rcu_idle_cpu--;
887 911
912out:
888 put_online_cpus(); 913 put_online_cpus();
914 free_cpumask_var(tmp_mask);
889} 915}
890 916
891/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 917/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
@@ -899,7 +925,7 @@ rcu_torture_shuffle(void *arg)
899 do { 925 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 926 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 927 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 928 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 929 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 930 return 0;
905} 931}
@@ -914,10 +940,10 @@ rcu_torture_stutter(void *arg)
914 do { 940 do {
915 schedule_timeout_interruptible(stutter * HZ); 941 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 942 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 943 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 944 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 945 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 946 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 947 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 948 return 0;
923} 949}
@@ -934,12 +960,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 960 stutter, irqreader);
935} 961}
936 962
963static struct notifier_block rcutorture_nb = {
964 .notifier_call = rcutorture_shutdown_notify,
965};
966
937static void 967static void
938rcu_torture_cleanup(void) 968rcu_torture_cleanup(void)
939{ 969{
940 int i; 970 int i;
941 971
942 fullstop = 1; 972 mutex_lock(&fullstop_mutex);
973 if (!fullstop) {
974 /* If being signaled, let it happen, then exit. */
975 mutex_unlock(&fullstop_mutex);
976 schedule_timeout_interruptible(10 * HZ);
977 if (cur_ops->cb_barrier != NULL)
978 cur_ops->cb_barrier();
979 return;
980 }
981 fullstop = FULLSTOP_CLEANUP;
982 mutex_unlock(&fullstop_mutex);
983 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 984 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 985 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 986 kthread_stop(stutter_task);
@@ -1015,6 +1056,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1056 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1057 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1058
1059 mutex_lock(&fullstop_mutex);
1060
1018 /* Process args and tell the world that the torturer is on the job. */ 1061 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1062 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1063 cur_ops = torture_ops[i];
@@ -1024,6 +1067,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1067 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1068 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1069 torture_type);
1070 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1071 return (-EINVAL);
1028 } 1072 }
1029 if (cur_ops->init) 1073 if (cur_ops->init)
@@ -1146,9 +1190,12 @@ rcu_torture_init(void)
1146 goto unwind; 1190 goto unwind;
1147 } 1191 }
1148 } 1192 }
1193 register_reboot_notifier(&rcutorture_nb);
1194 mutex_unlock(&fullstop_mutex);
1149 return 0; 1195 return 0;
1150 1196
1151unwind: 1197unwind:
1198 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1199 rcu_torture_cleanup();
1153 return firsterr; 1200 return firsterr;
1154} 1201}
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..f2d8638e6c60
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1,
84 .dynticks = 1,
85};
86#endif /* #ifdef CONFIG_NO_HZ */
87
88static int blimit = 10; /* Maximum callbacks per softirq. */
89static int qhimark = 10000; /* If this many pending, ignore blimit. */
90static int qlowmark = 100; /* Once only this many pending, use blimit. */
91
92static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
93
94/*
95 * Return the number of RCU batches processed thus far for debug & stats.
96 */
97long rcu_batches_completed(void)
98{
99 return rcu_state.completed;
100}
101EXPORT_SYMBOL_GPL(rcu_batches_completed);
102
103/*
104 * Return the number of RCU BH batches processed thus far for debug & stats.
105 */
106long rcu_batches_completed_bh(void)
107{
108 return rcu_bh_state.completed;
109}
110EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
111
112/*
113 * Does the CPU have callbacks ready to be invoked?
114 */
115static int
116cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
117{
118 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
119}
120
121/*
122 * Does the current CPU require a yet-as-unscheduled grace period?
123 */
124static int
125cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
126{
127 /* ACCESS_ONCE() because we are accessing outside of lock. */
128 return *rdp->nxttail[RCU_DONE_TAIL] &&
129 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
130}
131
132/*
133 * Return the root node of the specified rcu_state structure.
134 */
135static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
136{
137 return &rsp->node[0];
138}
139
140#ifdef CONFIG_SMP
141
142/*
143 * If the specified CPU is offline, tell the caller that it is in
144 * a quiescent state. Otherwise, whack it with a reschedule IPI.
145 * Grace periods can end up waiting on an offline CPU when that
146 * CPU is in the process of coming online -- it will be added to the
147 * rcu_node bitmasks before it actually makes it online. The same thing
148 * can happen while a CPU is in the process of coming online. Because this
149 * race is quite rare, we check for it after detecting that the grace
150 * period has been delayed rather than checking each and every CPU
151 * each and every time we start a new grace period.
152 */
153static int rcu_implicit_offline_qs(struct rcu_data *rdp)
154{
155 /*
156 * If the CPU is offline, it is in a quiescent state. We can
157 * trust its state not to change because interrupts are disabled.
158 */
159 if (cpu_is_offline(rdp->cpu)) {
160 rdp->offline_fqs++;
161 return 1;
162 }
163
164 /* The CPU is online, so send it a reschedule IPI. */
165 if (rdp->cpu != smp_processor_id())
166 smp_send_reschedule(rdp->cpu);
167 else
168 set_need_resched();
169 rdp->resched_ipi++;
170 return 0;
171}
172
173#endif /* #ifdef CONFIG_SMP */
174
175#ifdef CONFIG_NO_HZ
176static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
177
178/**
179 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
180 *
181 * Enter nohz mode, in other words, -leave- the mode in which RCU
182 * read-side critical sections can occur. (Though RCU read-side
183 * critical sections can occur in irq handlers in nohz mode, a possibility
184 * handled by rcu_irq_enter() and rcu_irq_exit()).
185 */
186void rcu_enter_nohz(void)
187{
188 unsigned long flags;
189 struct rcu_dynticks *rdtp;
190
191 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
192 local_irq_save(flags);
193 rdtp = &__get_cpu_var(rcu_dynticks);
194 rdtp->dynticks++;
195 rdtp->dynticks_nesting--;
196 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
197 local_irq_restore(flags);
198}
199
200/*
201 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
202 *
203 * Exit nohz mode, in other words, -enter- the mode in which RCU
204 * read-side critical sections normally occur.
205 */
206void rcu_exit_nohz(void)
207{
208 unsigned long flags;
209 struct rcu_dynticks *rdtp;
210
211 local_irq_save(flags);
212 rdtp = &__get_cpu_var(rcu_dynticks);
213 rdtp->dynticks++;
214 rdtp->dynticks_nesting++;
215 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
216 local_irq_restore(flags);
217 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
218}
219
220/**
221 * rcu_nmi_enter - inform RCU of entry to NMI context
222 *
223 * If the CPU was idle with dynamic ticks active, and there is no
224 * irq handler running, this updates rdtp->dynticks_nmi to let the
225 * RCU grace-period handling know that the CPU is active.
226 */
227void rcu_nmi_enter(void)
228{
229 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
230
231 if (rdtp->dynticks & 0x1)
232 return;
233 rdtp->dynticks_nmi++;
234 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
235 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
236}
237
238/**
239 * rcu_nmi_exit - inform RCU of exit from NMI context
240 *
241 * If the CPU was idle with dynamic ticks active, and there is no
242 * irq handler running, this updates rdtp->dynticks_nmi to let the
243 * RCU grace-period handling know that the CPU is no longer active.
244 */
245void rcu_nmi_exit(void)
246{
247 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
248
249 if (rdtp->dynticks & 0x1)
250 return;
251 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
252 rdtp->dynticks_nmi++;
253 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
254}
255
256/**
257 * rcu_irq_enter - inform RCU of entry to hard irq context
258 *
259 * If the CPU was idle with dynamic ticks active, this updates the
260 * rdtp->dynticks to let the RCU handling know that the CPU is active.
261 */
262void rcu_irq_enter(void)
263{
264 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
265
266 if (rdtp->dynticks_nesting++)
267 return;
268 rdtp->dynticks++;
269 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
270 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
271}
272
273/**
274 * rcu_irq_exit - inform RCU of exit from hard irq context
275 *
276 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
277 * to put let the RCU handling be aware that the CPU is going back to idle
278 * with no ticks.
279 */
280void rcu_irq_exit(void)
281{
282 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
283
284 if (--rdtp->dynticks_nesting)
285 return;
286 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
287 rdtp->dynticks++;
288 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
289
290 /* If the interrupt queued a callback, get out of dyntick mode. */
291 if (__get_cpu_var(rcu_data).nxtlist ||
292 __get_cpu_var(rcu_bh_data).nxtlist)
293 set_need_resched();
294}
295
296/*
297 * Record the specified "completed" value, which is later used to validate
298 * dynticks counter manipulations. Specify "rsp->completed - 1" to
299 * unconditionally invalidate any future dynticks manipulations (which is
300 * useful at the beginning of a grace period).
301 */
302static void dyntick_record_completed(struct rcu_state *rsp, long comp)
303{
304 rsp->dynticks_completed = comp;
305}
306
307#ifdef CONFIG_SMP
308
309/*
310 * Recall the previously recorded value of the completion for dynticks.
311 */
312static long dyntick_recall_completed(struct rcu_state *rsp)
313{
314 return rsp->dynticks_completed;
315}
316
317/*
318 * Snapshot the specified CPU's dynticks counter so that we can later
319 * credit them with an implicit quiescent state. Return 1 if this CPU
320 * is already in a quiescent state courtesy of dynticks idle mode.
321 */
322static int dyntick_save_progress_counter(struct rcu_data *rdp)
323{
324 int ret;
325 int snap;
326 int snap_nmi;
327
328 snap = rdp->dynticks->dynticks;
329 snap_nmi = rdp->dynticks->dynticks_nmi;
330 smp_mb(); /* Order sampling of snap with end of grace period. */
331 rdp->dynticks_snap = snap;
332 rdp->dynticks_nmi_snap = snap_nmi;
333 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
334 if (ret)
335 rdp->dynticks_fqs++;
336 return ret;
337}
338
339/*
340 * Return true if the specified CPU has passed through a quiescent
341 * state by virtue of being in or having passed through an dynticks
342 * idle state since the last call to dyntick_save_progress_counter()
343 * for this same CPU.
344 */
345static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
346{
347 long curr;
348 long curr_nmi;
349 long snap;
350 long snap_nmi;
351
352 curr = rdp->dynticks->dynticks;
353 snap = rdp->dynticks_snap;
354 curr_nmi = rdp->dynticks->dynticks_nmi;
355 snap_nmi = rdp->dynticks_nmi_snap;
356 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
357
358 /*
359 * If the CPU passed through or entered a dynticks idle phase with
360 * no active irq/NMI handlers, then we can safely pretend that the CPU
361 * already acknowledged the request to pass through a quiescent
362 * state. Either way, that CPU cannot possibly be in an RCU
363 * read-side critical section that started before the beginning
364 * of the current RCU grace period.
365 */
366 if ((curr != snap || (curr & 0x1) == 0) &&
367 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
368 rdp->dynticks_fqs++;
369 return 1;
370 }
371
372 /* Go check for the CPU being offline. */
373 return rcu_implicit_offline_qs(rdp);
374}
375
376#endif /* #ifdef CONFIG_SMP */
377
378#else /* #ifdef CONFIG_NO_HZ */
379
380static void dyntick_record_completed(struct rcu_state *rsp, long comp)
381{
382}
383
384#ifdef CONFIG_SMP
385
386/*
387 * If there are no dynticks, then the only way that a CPU can passively
388 * be in a quiescent state is to be offline. Unlike dynticks idle, which
389 * is a point in time during the prior (already finished) grace period,
390 * an offline CPU is always in a quiescent state, and thus can be
391 * unconditionally applied. So just return the current value of completed.
392 */
393static long dyntick_recall_completed(struct rcu_state *rsp)
394{
395 return rsp->completed;
396}
397
398static int dyntick_save_progress_counter(struct rcu_data *rdp)
399{
400 return 0;
401}
402
403static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
404{
405 return rcu_implicit_offline_qs(rdp);
406}
407
408#endif /* #ifdef CONFIG_SMP */
409
410#endif /* #else #ifdef CONFIG_NO_HZ */
411
412#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
413
414static void record_gp_stall_check_time(struct rcu_state *rsp)
415{
416 rsp->gp_start = jiffies;
417 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
418}
419
420static void print_other_cpu_stall(struct rcu_state *rsp)
421{
422 int cpu;
423 long delta;
424 unsigned long flags;
425 struct rcu_node *rnp = rcu_get_root(rsp);
426 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
427 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
428
429 /* Only let one CPU complain about others per time interval. */
430
431 spin_lock_irqsave(&rnp->lock, flags);
432 delta = jiffies - rsp->jiffies_stall;
433 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
434 spin_unlock_irqrestore(&rnp->lock, flags);
435 return;
436 }
437 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
438 spin_unlock_irqrestore(&rnp->lock, flags);
439
440 /* OK, time to rat on our buddy... */
441
442 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
443 for (; rnp_cur < rnp_end; rnp_cur++) {
444 if (rnp_cur->qsmask == 0)
445 continue;
446 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
447 if (rnp_cur->qsmask & (1UL << cpu))
448 printk(" %d", rnp_cur->grplo + cpu);
449 }
450 printk(" (detected by %d, t=%ld jiffies)\n",
451 smp_processor_id(), (long)(jiffies - rsp->gp_start));
452 force_quiescent_state(rsp, 0); /* Kick them all. */
453}
454
455static void print_cpu_stall(struct rcu_state *rsp)
456{
457 unsigned long flags;
458 struct rcu_node *rnp = rcu_get_root(rsp);
459
460 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
461 smp_processor_id(), jiffies - rsp->gp_start);
462 dump_stack();
463 spin_lock_irqsave(&rnp->lock, flags);
464 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
465 rsp->jiffies_stall =
466 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
467 spin_unlock_irqrestore(&rnp->lock, flags);
468 set_need_resched(); /* kick ourselves to get things going. */
469}
470
471static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
472{
473 long delta;
474 struct rcu_node *rnp;
475
476 delta = jiffies - rsp->jiffies_stall;
477 rnp = rdp->mynode;
478 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
479
480 /* We haven't checked in, so go dump stack. */
481 print_cpu_stall(rsp);
482
483 } else if (rsp->gpnum != rsp->completed &&
484 delta >= RCU_STALL_RAT_DELAY) {
485
486 /* They had two time units to dump stack, so complain. */
487 print_other_cpu_stall(rsp);
488 }
489}
490
491#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
492
493static void record_gp_stall_check_time(struct rcu_state *rsp)
494{
495}
496
497static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
498{
499}
500
501#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
502
503/*
504 * Update CPU-local rcu_data state to record the newly noticed grace period.
505 * This is used both when we started the grace period and when we notice
506 * that someone else started the grace period.
507 */
508static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
509{
510 rdp->qs_pending = 1;
511 rdp->passed_quiesc = 0;
512 rdp->gpnum = rsp->gpnum;
513 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
514 RCU_JIFFIES_TILL_FORCE_QS;
515}
516
517/*
518 * Did someone else start a new RCU grace period start since we last
519 * checked? Update local state appropriately if so. Must be called
520 * on the CPU corresponding to rdp.
521 */
522static int
523check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
524{
525 unsigned long flags;
526 int ret = 0;
527
528 local_irq_save(flags);
529 if (rdp->gpnum != rsp->gpnum) {
530 note_new_gpnum(rsp, rdp);
531 ret = 1;
532 }
533 local_irq_restore(flags);
534 return ret;
535}
536
537/*
538 * Start a new RCU grace period if warranted, re-initializing the hierarchy
539 * in preparation for detecting the next grace period. The caller must hold
540 * the root node's ->lock, which is released before return. Hard irqs must
541 * be disabled.
542 */
543static void
544rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
545 __releases(rcu_get_root(rsp)->lock)
546{
547 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
548 struct rcu_node *rnp = rcu_get_root(rsp);
549 struct rcu_node *rnp_cur;
550 struct rcu_node *rnp_end;
551
552 if (!cpu_needs_another_gp(rsp, rdp)) {
553 spin_unlock_irqrestore(&rnp->lock, flags);
554 return;
555 }
556
557 /* Advance to a new grace period and initialize state. */
558 rsp->gpnum++;
559 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
560 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
561 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
562 RCU_JIFFIES_TILL_FORCE_QS;
563 record_gp_stall_check_time(rsp);
564 dyntick_record_completed(rsp, rsp->completed - 1);
565 note_new_gpnum(rsp, rdp);
566
567 /*
568 * Because we are first, we know that all our callbacks will
569 * be covered by this upcoming grace period, even the ones
570 * that were registered arbitrarily recently.
571 */
572 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
573 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
574
575 /* Special-case the common single-level case. */
576 if (NUM_RCU_NODES == 1) {
577 rnp->qsmask = rnp->qsmaskinit;
578 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
579 spin_unlock_irqrestore(&rnp->lock, flags);
580 return;
581 }
582
583 spin_unlock(&rnp->lock); /* leave irqs disabled. */
584
585
586 /* Exclude any concurrent CPU-hotplug operations. */
587 spin_lock(&rsp->onofflock); /* irqs already disabled. */
588
589 /*
590 * Set the quiescent-state-needed bits in all the non-leaf RCU
591 * nodes for all currently online CPUs. This operation relies
592 * on the layout of the hierarchy within the rsp->node[] array.
593 * Note that other CPUs will access only the leaves of the
594 * hierarchy, which still indicate that no grace period is in
595 * progress. In addition, we have excluded CPU-hotplug operations.
596 *
597 * We therefore do not need to hold any locks. Any required
598 * memory barriers will be supplied by the locks guarding the
599 * leaf rcu_nodes in the hierarchy.
600 */
601
602 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
603 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
604 rnp_cur->qsmask = rnp_cur->qsmaskinit;
605
606 /*
607 * Now set up the leaf nodes. Here we must be careful. First,
608 * we need to hold the lock in order to exclude other CPUs, which
609 * might be contending for the leaf nodes' locks. Second, as
610 * soon as we initialize a given leaf node, its CPUs might run
611 * up the rest of the hierarchy. We must therefore acquire locks
612 * for each node that we touch during this stage. (But we still
613 * are excluding CPU-hotplug operations.)
614 *
615 * Note that the grace period cannot complete until we finish
616 * the initialization process, as there will be at least one
617 * qsmask bit set in the root node until that time, namely the
618 * one corresponding to this CPU.
619 */
620 rnp_end = &rsp->node[NUM_RCU_NODES];
621 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
622 for (; rnp_cur < rnp_end; rnp_cur++) {
623 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
624 rnp_cur->qsmask = rnp_cur->qsmaskinit;
625 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
626 }
627
628 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
629 spin_unlock_irqrestore(&rsp->onofflock, flags);
630}
631
632/*
633 * Advance this CPU's callbacks, but only if the current grace period
634 * has ended. This may be called only from the CPU to whom the rdp
635 * belongs.
636 */
637static void
638rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
639{
640 long completed_snap;
641 unsigned long flags;
642
643 local_irq_save(flags);
644 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
645
646 /* Did another grace period end? */
647 if (rdp->completed != completed_snap) {
648
649 /* Advance callbacks. No harm if list empty. */
650 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
651 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
652 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
653
654 /* Remember that we saw this grace-period completion. */
655 rdp->completed = completed_snap;
656 }
657 local_irq_restore(flags);
658}
659
660/*
661 * Similar to cpu_quiet(), for which it is a helper function. Allows
662 * a group of CPUs to be quieted at one go, though all the CPUs in the
663 * group must be represented by the same leaf rcu_node structure.
664 * That structure's lock must be held upon entry, and it is released
665 * before return.
666 */
667static void
668cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
669 unsigned long flags)
670 __releases(rnp->lock)
671{
672 /* Walk up the rcu_node hierarchy. */
673 for (;;) {
674 if (!(rnp->qsmask & mask)) {
675
676 /* Our bit has already been cleared, so done. */
677 spin_unlock_irqrestore(&rnp->lock, flags);
678 return;
679 }
680 rnp->qsmask &= ~mask;
681 if (rnp->qsmask != 0) {
682
683 /* Other bits still set at this level, so done. */
684 spin_unlock_irqrestore(&rnp->lock, flags);
685 return;
686 }
687 mask = rnp->grpmask;
688 if (rnp->parent == NULL) {
689
690 /* No more levels. Exit loop holding root lock. */
691
692 break;
693 }
694 spin_unlock_irqrestore(&rnp->lock, flags);
695 rnp = rnp->parent;
696 spin_lock_irqsave(&rnp->lock, flags);
697 }
698
699 /*
700 * Get here if we are the last CPU to pass through a quiescent
701 * state for this grace period. Clean up and let rcu_start_gp()
702 * start up the next grace period if one is needed. Note that
703 * we still hold rnp->lock, as required by rcu_start_gp(), which
704 * will release it.
705 */
706 rsp->completed = rsp->gpnum;
707 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
708 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
709}
710
711/*
712 * Record a quiescent state for the specified CPU, which must either be
713 * the current CPU or an offline CPU. The lastcomp argument is used to
714 * make sure we are still in the grace period of interest. We don't want
715 * to end the current grace period based on quiescent states detected in
716 * an earlier grace period!
717 */
718static void
719cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
720{
721 unsigned long flags;
722 unsigned long mask;
723 struct rcu_node *rnp;
724
725 rnp = rdp->mynode;
726 spin_lock_irqsave(&rnp->lock, flags);
727 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
728
729 /*
730 * Someone beat us to it for this grace period, so leave.
731 * The race with GP start is resolved by the fact that we
732 * hold the leaf rcu_node lock, so that the per-CPU bits
733 * cannot yet be initialized -- so we would simply find our
734 * CPU's bit already cleared in cpu_quiet_msk() if this race
735 * occurred.
736 */
737 rdp->passed_quiesc = 0; /* try again later! */
738 spin_unlock_irqrestore(&rnp->lock, flags);
739 return;
740 }
741 mask = rdp->grpmask;
742 if ((rnp->qsmask & mask) == 0) {
743 spin_unlock_irqrestore(&rnp->lock, flags);
744 } else {
745 rdp->qs_pending = 0;
746
747 /*
748 * This GP can't end until cpu checks in, so all of our
749 * callbacks can be processed during the next GP.
750 */
751 rdp = rsp->rda[smp_processor_id()];
752 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
753
754 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
755 }
756}
757
758/*
759 * Check to see if there is a new grace period of which this CPU
760 * is not yet aware, and if so, set up local rcu_data state for it.
761 * Otherwise, see if this CPU has just passed through its first
762 * quiescent state for this grace period, and record that fact if so.
763 */
764static void
765rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
766{
767 /* If there is now a new grace period, record and return. */
768 if (check_for_new_grace_period(rsp, rdp))
769 return;
770
771 /*
772 * Does this CPU still need to do its part for current grace period?
773 * If no, return and let the other CPUs do their part as well.
774 */
775 if (!rdp->qs_pending)
776 return;
777
778 /*
779 * Was there a quiescent state since the beginning of the grace
780 * period? If no, then exit and wait for the next call.
781 */
782 if (!rdp->passed_quiesc)
783 return;
784
785 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
786 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
787}
788
789#ifdef CONFIG_HOTPLUG_CPU
790
791/*
792 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
793 * and move all callbacks from the outgoing CPU to the current one.
794 */
795static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
796{
797 int i;
798 unsigned long flags;
799 long lastcomp;
800 unsigned long mask;
801 struct rcu_data *rdp = rsp->rda[cpu];
802 struct rcu_data *rdp_me;
803 struct rcu_node *rnp;
804
805 /* Exclude any attempts to start a new grace period. */
806 spin_lock_irqsave(&rsp->onofflock, flags);
807
808 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
809 rnp = rdp->mynode;
810 mask = rdp->grpmask; /* rnp->grplo is constant. */
811 do {
812 spin_lock(&rnp->lock); /* irqs already disabled. */
813 rnp->qsmaskinit &= ~mask;
814 if (rnp->qsmaskinit != 0) {
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 break;
817 }
818 mask = rnp->grpmask;
819 spin_unlock(&rnp->lock); /* irqs already disabled. */
820 rnp = rnp->parent;
821 } while (rnp != NULL);
822 lastcomp = rsp->completed;
823
824 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
825
826 /* Being offline is a quiescent state, so go record it. */
827 cpu_quiet(cpu, rsp, rdp, lastcomp);
828
829 /*
830 * Move callbacks from the outgoing CPU to the running CPU.
831 * Note that the outgoing CPU is now quiscent, so it is now
832 * (uncharacteristically) safe to access it rcu_data structure.
833 * Note also that we must carefully retain the order of the
834 * outgoing CPU's callbacks in order for rcu_barrier() to work
835 * correctly. Finally, note that we start all the callbacks
836 * afresh, even those that have passed through a grace period
837 * and are therefore ready to invoke. The theory is that hotplug
838 * events are rare, and that if they are frequent enough to
839 * indefinitely delay callbacks, you have far worse things to
840 * be worrying about.
841 */
842 rdp_me = rsp->rda[smp_processor_id()];
843 if (rdp->nxtlist != NULL) {
844 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
845 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
846 rdp->nxtlist = NULL;
847 for (i = 0; i < RCU_NEXT_SIZE; i++)
848 rdp->nxttail[i] = &rdp->nxtlist;
849 rdp_me->qlen += rdp->qlen;
850 rdp->qlen = 0;
851 }
852 local_irq_restore(flags);
853}
854
855/*
856 * Remove the specified CPU from the RCU hierarchy and move any pending
857 * callbacks that it might have to the current CPU. This code assumes
858 * that at least one CPU in the system will remain running at all times.
859 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
860 */
861static void rcu_offline_cpu(int cpu)
862{
863 __rcu_offline_cpu(cpu, &rcu_state);
864 __rcu_offline_cpu(cpu, &rcu_bh_state);
865}
866
867#else /* #ifdef CONFIG_HOTPLUG_CPU */
868
869static void rcu_offline_cpu(int cpu)
870{
871}
872
873#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
874
875/*
876 * Invoke any RCU callbacks that have made it to the end of their grace
877 * period. Thottle as specified by rdp->blimit.
878 */
879static void rcu_do_batch(struct rcu_data *rdp)
880{
881 unsigned long flags;
882 struct rcu_head *next, *list, **tail;
883 int count;
884
885 /* If no callbacks are ready, just return.*/
886 if (!cpu_has_callbacks_ready_to_invoke(rdp))
887 return;
888
889 /*
890 * Extract the list of ready callbacks, disabling to prevent
891 * races with call_rcu() from interrupt handlers.
892 */
893 local_irq_save(flags);
894 list = rdp->nxtlist;
895 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
896 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
897 tail = rdp->nxttail[RCU_DONE_TAIL];
898 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
899 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
900 rdp->nxttail[count] = &rdp->nxtlist;
901 local_irq_restore(flags);
902
903 /* Invoke callbacks. */
904 count = 0;
905 while (list) {
906 next = list->next;
907 prefetch(next);
908 list->func(list);
909 list = next;
910 if (++count >= rdp->blimit)
911 break;
912 }
913
914 local_irq_save(flags);
915
916 /* Update count, and requeue any remaining callbacks. */
917 rdp->qlen -= count;
918 if (list != NULL) {
919 *tail = rdp->nxtlist;
920 rdp->nxtlist = list;
921 for (count = 0; count < RCU_NEXT_SIZE; count++)
922 if (&rdp->nxtlist == rdp->nxttail[count])
923 rdp->nxttail[count] = tail;
924 else
925 break;
926 }
927
928 /* Reinstate batch limit if we have worked down the excess. */
929 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
930 rdp->blimit = blimit;
931
932 local_irq_restore(flags);
933
934 /* Re-raise the RCU softirq if there are callbacks remaining. */
935 if (cpu_has_callbacks_ready_to_invoke(rdp))
936 raise_softirq(RCU_SOFTIRQ);
937}
938
939/*
940 * Check to see if this CPU is in a non-context-switch quiescent state
941 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
942 * Also schedule the RCU softirq handler.
943 *
944 * This function must be called with hardirqs disabled. It is normally
945 * invoked from the scheduling-clock interrupt. If rcu_pending returns
946 * false, there is no point in invoking rcu_check_callbacks().
947 */
948void rcu_check_callbacks(int cpu, int user)
949{
950 if (user ||
951 (idle_cpu(cpu) && !in_softirq() &&
952 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
953
954 /*
955 * Get here if this CPU took its interrupt from user
956 * mode or from the idle loop, and if this is not a
957 * nested interrupt. In this case, the CPU is in
958 * a quiescent state, so count it.
959 *
960 * No memory barrier is required here because both
961 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
962 * only CPU-local variables that other CPUs neither
963 * access nor modify, at least not while the corresponding
964 * CPU is online.
965 */
966
967 rcu_qsctr_inc(cpu);
968 rcu_bh_qsctr_inc(cpu);
969
970 } else if (!in_softirq()) {
971
972 /*
973 * Get here if this CPU did not take its interrupt from
974 * softirq, in other words, if it is not interrupting
975 * a rcu_bh read-side critical section. This is an _bh
976 * critical section, so count it.
977 */
978
979 rcu_bh_qsctr_inc(cpu);
980 }
981 raise_softirq(RCU_SOFTIRQ);
982}
983
984#ifdef CONFIG_SMP
985
986/*
987 * Scan the leaf rcu_node structures, processing dyntick state for any that
988 * have not yet encountered a quiescent state, using the function specified.
989 * Returns 1 if the current grace period ends while scanning (possibly
990 * because we made it end).
991 */
992static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
993 int (*f)(struct rcu_data *))
994{
995 unsigned long bit;
996 int cpu;
997 unsigned long flags;
998 unsigned long mask;
999 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
1000 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1001
1002 for (; rnp_cur < rnp_end; rnp_cur++) {
1003 mask = 0;
1004 spin_lock_irqsave(&rnp_cur->lock, flags);
1005 if (rsp->completed != lastcomp) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 return 1;
1008 }
1009 if (rnp_cur->qsmask == 0) {
1010 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1011 continue;
1012 }
1013 cpu = rnp_cur->grplo;
1014 bit = 1;
1015 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1016 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1017 mask |= bit;
1018 }
1019 if (mask != 0 && rsp->completed == lastcomp) {
1020
1021 /* cpu_quiet_msk() releases rnp_cur->lock. */
1022 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1023 continue;
1024 }
1025 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1026 }
1027 return 0;
1028}
1029
1030/*
1031 * Force quiescent states on reluctant CPUs, and also detect which
1032 * CPUs are in dyntick-idle mode.
1033 */
1034static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1035{
1036 unsigned long flags;
1037 long lastcomp;
1038 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1039 struct rcu_node *rnp = rcu_get_root(rsp);
1040 u8 signaled;
1041
1042 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1043 return; /* No grace period in progress, nothing to force. */
1044 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1045 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1046 return; /* Someone else is already on the job. */
1047 }
1048 if (relaxed &&
1049 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1050 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1051 goto unlock_ret; /* no emergency and done recently. */
1052 rsp->n_force_qs++;
1053 spin_lock(&rnp->lock);
1054 lastcomp = rsp->completed;
1055 signaled = rsp->signaled;
1056 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1057 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1058 RCU_JIFFIES_TILL_FORCE_QS;
1059 if (lastcomp == rsp->gpnum) {
1060 rsp->n_force_qs_ngp++;
1061 spin_unlock(&rnp->lock);
1062 goto unlock_ret; /* no GP in progress, time updated. */
1063 }
1064 spin_unlock(&rnp->lock);
1065 switch (signaled) {
1066 case RCU_GP_INIT:
1067
1068 break; /* grace period still initializing, ignore. */
1069
1070 case RCU_SAVE_DYNTICK:
1071
1072 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1073 break; /* So gcc recognizes the dead code. */
1074
1075 /* Record dyntick-idle state. */
1076 if (rcu_process_dyntick(rsp, lastcomp,
1077 dyntick_save_progress_counter))
1078 goto unlock_ret;
1079
1080 /* Update state, record completion counter. */
1081 spin_lock(&rnp->lock);
1082 if (lastcomp == rsp->completed) {
1083 rsp->signaled = RCU_FORCE_QS;
1084 dyntick_record_completed(rsp, lastcomp);
1085 }
1086 spin_unlock(&rnp->lock);
1087 break;
1088
1089 case RCU_FORCE_QS:
1090
1091 /* Check dyntick-idle state, send IPI to laggarts. */
1092 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1093 rcu_implicit_dynticks_qs))
1094 goto unlock_ret;
1095
1096 /* Leave state in case more forcing is required. */
1097
1098 break;
1099 }
1100unlock_ret:
1101 spin_unlock_irqrestore(&rsp->fqslock, flags);
1102}
1103
1104#else /* #ifdef CONFIG_SMP */
1105
1106static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1107{
1108 set_need_resched();
1109}
1110
1111#endif /* #else #ifdef CONFIG_SMP */
1112
1113/*
1114 * This does the RCU processing work from softirq context for the
1115 * specified rcu_state and rcu_data structures. This may be called
1116 * only from the CPU to whom the rdp belongs.
1117 */
1118static void
1119__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1120{
1121 unsigned long flags;
1122
1123 /*
1124 * If an RCU GP has gone long enough, go check for dyntick
1125 * idle CPUs and, if needed, send resched IPIs.
1126 */
1127 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1128 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1129 force_quiescent_state(rsp, 1);
1130
1131 /*
1132 * Advance callbacks in response to end of earlier grace
1133 * period that some other CPU ended.
1134 */
1135 rcu_process_gp_end(rsp, rdp);
1136
1137 /* Update RCU state based on any recent quiescent states. */
1138 rcu_check_quiescent_state(rsp, rdp);
1139
1140 /* Does this CPU require a not-yet-started grace period? */
1141 if (cpu_needs_another_gp(rsp, rdp)) {
1142 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1143 rcu_start_gp(rsp, flags); /* releases above lock */
1144 }
1145
1146 /* If there are callbacks ready, invoke them. */
1147 rcu_do_batch(rdp);
1148}
1149
1150/*
1151 * Do softirq processing for the current CPU.
1152 */
1153static void rcu_process_callbacks(struct softirq_action *unused)
1154{
1155 /*
1156 * Memory references from any prior RCU read-side critical sections
1157 * executed by the interrupted code must be seen before any RCU
1158 * grace-period manipulations below.
1159 */
1160 smp_mb(); /* See above block comment. */
1161
1162 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1163 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1164
1165 /*
1166 * Memory references from any later RCU read-side critical sections
1167 * executed by the interrupted code must be seen after any RCU
1168 * grace-period manipulations above.
1169 */
1170 smp_mb(); /* See above block comment. */
1171}
1172
1173static void
1174__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1175 struct rcu_state *rsp)
1176{
1177 unsigned long flags;
1178 struct rcu_data *rdp;
1179
1180 head->func = func;
1181 head->next = NULL;
1182
1183 smp_mb(); /* Ensure RCU update seen before callback registry. */
1184
1185 /*
1186 * Opportunistically note grace-period endings and beginnings.
1187 * Note that we might see a beginning right after we see an
1188 * end, but never vice versa, since this CPU has to pass through
1189 * a quiescent state betweentimes.
1190 */
1191 local_irq_save(flags);
1192 rdp = rsp->rda[smp_processor_id()];
1193 rcu_process_gp_end(rsp, rdp);
1194 check_for_new_grace_period(rsp, rdp);
1195
1196 /* Add the callback to our list. */
1197 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1198 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1199
1200 /* Start a new grace period if one not already started. */
1201 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1202 unsigned long nestflag;
1203 struct rcu_node *rnp_root = rcu_get_root(rsp);
1204
1205 spin_lock_irqsave(&rnp_root->lock, nestflag);
1206 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1207 }
1208
1209 /* Force the grace period if too many callbacks or too long waiting. */
1210 if (unlikely(++rdp->qlen > qhimark)) {
1211 rdp->blimit = LONG_MAX;
1212 force_quiescent_state(rsp, 0);
1213 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1214 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1215 force_quiescent_state(rsp, 1);
1216 local_irq_restore(flags);
1217}
1218
1219/*
1220 * Queue an RCU callback for invocation after a grace period.
1221 */
1222void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1223{
1224 __call_rcu(head, func, &rcu_state);
1225}
1226EXPORT_SYMBOL_GPL(call_rcu);
1227
1228/*
1229 * Queue an RCU for invocation after a quicker grace period.
1230 */
1231void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1232{
1233 __call_rcu(head, func, &rcu_bh_state);
1234}
1235EXPORT_SYMBOL_GPL(call_rcu_bh);
1236
1237/*
1238 * Check to see if there is any immediate RCU-related work to be done
1239 * by the current CPU, for the specified type of RCU, returning 1 if so.
1240 * The checks are in order of increasing expense: checks that can be
1241 * carried out against CPU-local state are performed first. However,
1242 * we must check for CPU stalls first, else we might not get a chance.
1243 */
1244static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1245{
1246 rdp->n_rcu_pending++;
1247
1248 /* Check for CPU stalls, if enabled. */
1249 check_cpu_stall(rsp, rdp);
1250
1251 /* Is the RCU core waiting for a quiescent state from this CPU? */
1252 if (rdp->qs_pending)
1253 return 1;
1254
1255 /* Does this CPU have callbacks ready to invoke? */
1256 if (cpu_has_callbacks_ready_to_invoke(rdp))
1257 return 1;
1258
1259 /* Has RCU gone idle with this CPU needing another grace period? */
1260 if (cpu_needs_another_gp(rsp, rdp))
1261 return 1;
1262
1263 /* Has another RCU grace period completed? */
1264 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1265 return 1;
1266
1267 /* Has a new RCU grace period started? */
1268 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1269 return 1;
1270
1271 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1272 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1273 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1274 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1275 return 1;
1276
1277 /* nothing to do */
1278 return 0;
1279}
1280
1281/*
1282 * Check to see if there is any immediate RCU-related work to be done
1283 * by the current CPU, returning 1 if so. This function is part of the
1284 * RCU implementation; it is -not- an exported member of the RCU API.
1285 */
1286int rcu_pending(int cpu)
1287{
1288 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1289 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1290}
1291
1292/*
1293 * Check to see if any future RCU-related work will need to be done
1294 * by the current CPU, even if none need be done immediately, returning
1295 * 1 if so. This function is part of the RCU implementation; it is -not-
1296 * an exported member of the RCU API.
1297 */
1298int rcu_needs_cpu(int cpu)
1299{
1300 /* RCU callbacks either ready or pending? */
1301 return per_cpu(rcu_data, cpu).nxtlist ||
1302 per_cpu(rcu_bh_data, cpu).nxtlist;
1303}
1304
1305/*
1306 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1307 * approach so that we don't have to worry about how long the CPU has
1308 * been gone, or whether it ever was online previously. We do trust the
1309 * ->mynode field, as it is constant for a given struct rcu_data and
1310 * initialized during early boot.
1311 *
1312 * Note that only one online or offline event can be happening at a given
1313 * time. Note also that we can accept some slop in the rsp->completed
1314 * access due to the fact that this CPU cannot possibly have any RCU
1315 * callbacks in flight yet.
1316 */
1317static void
1318rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1319{
1320 unsigned long flags;
1321 int i;
1322 long lastcomp;
1323 unsigned long mask;
1324 struct rcu_data *rdp = rsp->rda[cpu];
1325 struct rcu_node *rnp = rcu_get_root(rsp);
1326
1327 /* Set up local state, ensuring consistent view of global state. */
1328 spin_lock_irqsave(&rnp->lock, flags);
1329 lastcomp = rsp->completed;
1330 rdp->completed = lastcomp;
1331 rdp->gpnum = lastcomp;
1332 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1333 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1334 rdp->beenonline = 1; /* We have now been online. */
1335 rdp->passed_quiesc_completed = lastcomp - 1;
1336 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1337 rdp->nxtlist = NULL;
1338 for (i = 0; i < RCU_NEXT_SIZE; i++)
1339 rdp->nxttail[i] = &rdp->nxtlist;
1340 rdp->qlen = 0;
1341 rdp->blimit = blimit;
1342#ifdef CONFIG_NO_HZ
1343 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1344#endif /* #ifdef CONFIG_NO_HZ */
1345 rdp->cpu = cpu;
1346 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1347
1348 /*
1349 * A new grace period might start here. If so, we won't be part
1350 * of it, but that is OK, as we are currently in a quiescent state.
1351 */
1352
1353 /* Exclude any attempts to start a new GP on large systems. */
1354 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1355
1356 /* Add CPU to rcu_node bitmasks. */
1357 rnp = rdp->mynode;
1358 mask = rdp->grpmask;
1359 do {
1360 /* Exclude any attempts to start a new GP on small systems. */
1361 spin_lock(&rnp->lock); /* irqs already disabled. */
1362 rnp->qsmaskinit |= mask;
1363 mask = rnp->grpmask;
1364 spin_unlock(&rnp->lock); /* irqs already disabled. */
1365 rnp = rnp->parent;
1366 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1367
1368 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1369
1370 /*
1371 * A new grace period might start here. If so, we will be part of
1372 * it, and its gpnum will be greater than ours, so we will
1373 * participate. It is also possible for the gpnum to have been
1374 * incremented before this function was called, and the bitmasks
1375 * to not be filled out until now, in which case we will also
1376 * participate due to our gpnum being behind.
1377 */
1378
1379 /* Since it is coming online, the CPU is in a quiescent state. */
1380 cpu_quiet(cpu, rsp, rdp, lastcomp);
1381 local_irq_restore(flags);
1382}
1383
1384static void __cpuinit rcu_online_cpu(int cpu)
1385{
1386 rcu_init_percpu_data(cpu, &rcu_state);
1387 rcu_init_percpu_data(cpu, &rcu_bh_state);
1388 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1389}
1390
1391/*
1392 * Handle CPU online/offline notifcation events.
1393 */
1394static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1395 unsigned long action, void *hcpu)
1396{
1397 long cpu = (long)hcpu;
1398
1399 switch (action) {
1400 case CPU_UP_PREPARE:
1401 case CPU_UP_PREPARE_FROZEN:
1402 rcu_online_cpu(cpu);
1403 break;
1404 case CPU_DEAD:
1405 case CPU_DEAD_FROZEN:
1406 case CPU_UP_CANCELED:
1407 case CPU_UP_CANCELED_FROZEN:
1408 rcu_offline_cpu(cpu);
1409 break;
1410 default:
1411 break;
1412 }
1413 return NOTIFY_OK;
1414}
1415
1416/*
1417 * Compute the per-level fanout, either using the exact fanout specified
1418 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1419 */
1420#ifdef CONFIG_RCU_FANOUT_EXACT
1421static void __init rcu_init_levelspread(struct rcu_state *rsp)
1422{
1423 int i;
1424
1425 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1426 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1427}
1428#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1429static void __init rcu_init_levelspread(struct rcu_state *rsp)
1430{
1431 int ccur;
1432 int cprv;
1433 int i;
1434
1435 cprv = NR_CPUS;
1436 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1437 ccur = rsp->levelcnt[i];
1438 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1439 cprv = ccur;
1440 }
1441}
1442#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
1443
1444/*
1445 * Helper function for rcu_init() that initializes one rcu_state structure.
1446 */
1447static void __init rcu_init_one(struct rcu_state *rsp)
1448{
1449 int cpustride = 1;
1450 int i;
1451 int j;
1452 struct rcu_node *rnp;
1453
1454 /* Initialize the level-tracking arrays. */
1455
1456 for (i = 1; i < NUM_RCU_LVLS; i++)
1457 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1458 rcu_init_levelspread(rsp);
1459
1460 /* Initialize the elements themselves, starting from the leaves. */
1461
1462 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1463 cpustride *= rsp->levelspread[i];
1464 rnp = rsp->level[i];
1465 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1466 spin_lock_init(&rnp->lock);
1467 rnp->qsmask = 0;
1468 rnp->qsmaskinit = 0;
1469 rnp->grplo = j * cpustride;
1470 rnp->grphi = (j + 1) * cpustride - 1;
1471 if (rnp->grphi >= NR_CPUS)
1472 rnp->grphi = NR_CPUS - 1;
1473 if (i == 0) {
1474 rnp->grpnum = 0;
1475 rnp->grpmask = 0;
1476 rnp->parent = NULL;
1477 } else {
1478 rnp->grpnum = j % rsp->levelspread[i - 1];
1479 rnp->grpmask = 1UL << rnp->grpnum;
1480 rnp->parent = rsp->level[i - 1] +
1481 j / rsp->levelspread[i - 1];
1482 }
1483 rnp->level = i;
1484 }
1485 }
1486}
1487
1488/*
1489 * Helper macro for __rcu_init(). To be used nowhere else!
1490 * Assigns leaf node pointers into each CPU's rcu_data structure.
1491 */
1492#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1493do { \
1494 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1495 j = 0; \
1496 for_each_possible_cpu(i) { \
1497 if (i > rnp[j].grphi) \
1498 j++; \
1499 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1500 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1501 } \
1502} while (0)
1503
1504static struct notifier_block __cpuinitdata rcu_nb = {
1505 .notifier_call = rcu_cpu_notify,
1506};
1507
1508void __init __rcu_init(void)
1509{
1510 int i; /* All used by RCU_DATA_PTR_INIT(). */
1511 int j;
1512 struct rcu_node *rnp;
1513
1514 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1515#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1516 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1517#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1518 rcu_init_one(&rcu_state);
1519 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1520 rcu_init_one(&rcu_bh_state);
1521 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1522
1523 for_each_online_cpu(i)
1524 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1525 /* Register notifier for non-boot CPUs */
1526 register_cpu_notifier(&rcu_nb);
1527 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1528}
1529
1530module_param(blimit, int, 0);
1531module_param(qhimark, int, 0);
1532module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for classic implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca772..bf8e7534c803 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17 17
18void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = (unsigned long long)LLONG_MAX;
22 counter->parent = parent;
22} 23}
23 24
24int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
34 return 0; 35 return 0;
35} 36}
36 37
37int res_counter_charge(struct res_counter *counter, unsigned long val) 38int res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at)
38{ 40{
39 int ret; 41 int ret;
40 unsigned long flags; 42 unsigned long flags;
41 43 struct res_counter *c, *u;
42 spin_lock_irqsave(&counter->lock, flags); 44
43 ret = res_counter_charge_locked(counter, val); 45 *limit_fail_at = NULL;
44 spin_unlock_irqrestore(&counter->lock, flags); 46 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val);
50 spin_unlock(&c->lock);
51 if (ret < 0) {
52 *limit_fail_at = c;
53 goto undo;
54 }
55 }
56 ret = 0;
57 goto done;
58undo:
59 for (u = counter; u != c; u = u->parent) {
60 spin_lock(&u->lock);
61 res_counter_uncharge_locked(u, val);
62 spin_unlock(&u->lock);
63 }
64done:
65 local_irq_restore(flags);
45 return ret; 66 return ret;
46} 67}
47 68
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
56void res_counter_uncharge(struct res_counter *counter, unsigned long val) 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
57{ 78{
58 unsigned long flags; 79 unsigned long flags;
80 struct res_counter *c;
59 81
60 spin_lock_irqsave(&counter->lock, flags); 82 local_irq_save(flags);
61 res_counter_uncharge_locked(counter, val); 83 for (c = counter; c != NULL; c = c->parent) {
62 spin_unlock_irqrestore(&counter->lock, flags); 84 spin_lock(&c->lock);
85 res_counter_uncharge_locked(c, val);
86 spin_unlock(&c->lock);
87 }
88 local_irq_restore(flags);
63} 89}
64 90
65 91
diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663ef..ca6a1536b205 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
623 */ 623 */
624struct resource * __request_region(struct resource *parent, 624struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 625 resource_size_t start, resource_size_t n,
626 const char *name) 626 const char *name, int flags)
627{ 627{
628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
629 629
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
634 res->start = start; 634 res->start = start;
635 res->end = start + n - 1; 635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY; 636 res->flags = IORESOURCE_BUSY;
637 res->flags |= flags;
637 638
638 write_lock(&resource_lock); 639 write_lock(&resource_lock);
639 640
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
679{ 680{
680 struct resource * res; 681 struct resource * res;
681 682
682 res = __request_region(parent, start, n, "check-region"); 683 res = __request_region(parent, start, n, "check-region", 0);
683 if (!res) 684 if (!res)
684 return -EBUSY; 685 return -EBUSY;
685 686
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
776 dr->start = start; 777 dr->start = start;
777 dr->n = n; 778 dr->n = n;
778 779
779 res = __request_region(parent, start, n, name); 780 res = __request_region(parent, start, n, name, 0);
780 if (res) 781 if (res)
781 devres_add(dev, dr); 782 devres_add(dev, dr);
782 else 783 else
@@ -853,6 +854,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && 854 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) 855 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue; 856 continue;
857 /*
858 * if a resource is "BUSY", it's not a hardware resource
859 * but a driver mapping of such a resource; we don't want
860 * to warn for those; some drivers legitimately map only
861 * partial hardware resources. (example: vesafb)
862 */
863 if (p->flags & IORESOURCE_BUSY)
864 continue;
865
856 printk(KERN_WARNING "resource map sanity check conflict: " 866 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n", 867 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr, 868 (unsigned long long)addr,
@@ -867,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
867 877
868 return err; 878 return err;
869} 879}
880
881#ifdef CONFIG_STRICT_DEVMEM
882static int strict_iomem_checks = 1;
883#else
884static int strict_iomem_checks;
885#endif
886
887/*
888 * check if an address is reserved in the iomem resource tree
889 * returns 1 if reserved, 0 if not reserved.
890 */
891int iomem_is_exclusive(u64 addr)
892{
893 struct resource *p = &iomem_resource;
894 int err = 0;
895 loff_t l;
896 int size = PAGE_SIZE;
897
898 if (!strict_iomem_checks)
899 return 0;
900
901 addr = addr & PAGE_MASK;
902
903 read_lock(&resource_lock);
904 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
905 /*
906 * We can probably skip the resources without
907 * IORESOURCE_IO attribute?
908 */
909 if (p->start >= addr + size)
910 break;
911 if (p->end < addr)
912 continue;
913 if (p->flags & IORESOURCE_BUSY &&
914 p->flags & IORESOURCE_EXCLUSIVE) {
915 err = 1;
916 break;
917 }
918 }
919 read_unlock(&resource_lock);
920
921 return err;
922}
923
924static int __init strict_iomem(char *str)
925{
926 if (strstr(str, "relaxed"))
927 strict_iomem_checks = 0;
928 if (strstr(str, "strict"))
929 strict_iomem_checks = 1;
930 return 1;
931}
932
933__setup("iomem=", strict_iomem);
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dfbff5fb1ac..43fd21233b93 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -209,7 +209,6 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
209 hrtimer_init(&rt_b->rt_period_timer, 209 hrtimer_init(&rt_b->rt_period_timer,
210 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 210 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
211 rt_b->rt_period_timer.function = sched_rt_period_timer; 211 rt_b->rt_period_timer.function = sched_rt_period_timer;
212 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
213} 212}
214 213
215static inline int rt_bandwidth_enabled(void) 214static inline int rt_bandwidth_enabled(void)
@@ -499,18 +498,26 @@ struct rt_rq {
499 */ 498 */
500struct root_domain { 499struct root_domain {
501 atomic_t refcount; 500 atomic_t refcount;
502 cpumask_t span; 501 cpumask_var_t span;
503 cpumask_t online; 502 cpumask_var_t online;
504 503
505 /* 504 /*
506 * The "RT overload" flag: it gets set if a CPU has more than 505 * The "RT overload" flag: it gets set if a CPU has more than
507 * one runnable RT task. 506 * one runnable RT task.
508 */ 507 */
509 cpumask_t rto_mask; 508 cpumask_var_t rto_mask;
510 atomic_t rto_count; 509 atomic_t rto_count;
511#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
512 struct cpupri cpupri; 511 struct cpupri cpupri;
513#endif 512#endif
513#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
514 /*
515 * Preferred wake up cpu nominated by sched_mc balance that will be
516 * used when most cpus are idle in the system indicating overall very
517 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
518 */
519 unsigned int sched_mc_preferred_wakeup_cpu;
520#endif
514}; 521};
515 522
516/* 523/*
@@ -1159,7 +1166,6 @@ static void init_rq_hrtick(struct rq *rq)
1159 1166
1160 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1167 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1161 rq->hrtick_timer.function = hrtick; 1168 rq->hrtick_timer.function = hrtick;
1162 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1163} 1169}
1164#else /* CONFIG_SCHED_HRTICK */ 1170#else /* CONFIG_SCHED_HRTICK */
1165static inline void hrtick_clear(struct rq *rq) 1171static inline void hrtick_clear(struct rq *rq)
@@ -1536,7 +1542,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1536 struct sched_domain *sd = data; 1542 struct sched_domain *sd = data;
1537 int i; 1543 int i;
1538 1544
1539 for_each_cpu_mask(i, sd->span) { 1545 for_each_cpu(i, sched_domain_span(sd)) {
1540 /* 1546 /*
1541 * If there are currently no tasks on the cpu pretend there 1547 * If there are currently no tasks on the cpu pretend there
1542 * is one of average load so that when a new task gets to 1548 * is one of average load so that when a new task gets to
@@ -1557,7 +1563,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1557 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1563 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1558 shares = tg->shares; 1564 shares = tg->shares;
1559 1565
1560 for_each_cpu_mask(i, sd->span) 1566 for_each_cpu(i, sched_domain_span(sd))
1561 update_group_shares_cpu(tg, i, shares, rq_weight); 1567 update_group_shares_cpu(tg, i, shares, rq_weight);
1562 1568
1563 return 0; 1569 return 0;
@@ -2125,15 +2131,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2125 int i; 2131 int i;
2126 2132
2127 /* Skip over this group if it has no CPUs allowed */ 2133 /* Skip over this group if it has no CPUs allowed */
2128 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2134 if (!cpumask_intersects(sched_group_cpus(group),
2135 &p->cpus_allowed))
2129 continue; 2136 continue;
2130 2137
2131 local_group = cpu_isset(this_cpu, group->cpumask); 2138 local_group = cpumask_test_cpu(this_cpu,
2139 sched_group_cpus(group));
2132 2140
2133 /* Tally up the load of all CPUs in the group */ 2141 /* Tally up the load of all CPUs in the group */
2134 avg_load = 0; 2142 avg_load = 0;
2135 2143
2136 for_each_cpu_mask_nr(i, group->cpumask) { 2144 for_each_cpu(i, sched_group_cpus(group)) {
2137 /* Bias balancing toward cpus of our domain */ 2145 /* Bias balancing toward cpus of our domain */
2138 if (local_group) 2146 if (local_group)
2139 load = source_load(i, load_idx); 2147 load = source_load(i, load_idx);
@@ -2165,17 +2173,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2165 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2173 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2166 */ 2174 */
2167static int 2175static int
2168find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2176find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2169 cpumask_t *tmp)
2170{ 2177{
2171 unsigned long load, min_load = ULONG_MAX; 2178 unsigned long load, min_load = ULONG_MAX;
2172 int idlest = -1; 2179 int idlest = -1;
2173 int i; 2180 int i;
2174 2181
2175 /* Traverse only the allowed CPUs */ 2182 /* Traverse only the allowed CPUs */
2176 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2183 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2177
2178 for_each_cpu_mask_nr(i, *tmp) {
2179 load = weighted_cpuload(i); 2184 load = weighted_cpuload(i);
2180 2185
2181 if (load < min_load || (load == min_load && i == this_cpu)) { 2186 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2217,7 +2222,6 @@ static int sched_balance_self(int cpu, int flag)
2217 update_shares(sd); 2222 update_shares(sd);
2218 2223
2219 while (sd) { 2224 while (sd) {
2220 cpumask_t span, tmpmask;
2221 struct sched_group *group; 2225 struct sched_group *group;
2222 int new_cpu, weight; 2226 int new_cpu, weight;
2223 2227
@@ -2226,14 +2230,13 @@ static int sched_balance_self(int cpu, int flag)
2226 continue; 2230 continue;
2227 } 2231 }
2228 2232
2229 span = sd->span;
2230 group = find_idlest_group(sd, t, cpu); 2233 group = find_idlest_group(sd, t, cpu);
2231 if (!group) { 2234 if (!group) {
2232 sd = sd->child; 2235 sd = sd->child;
2233 continue; 2236 continue;
2234 } 2237 }
2235 2238
2236 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2239 new_cpu = find_idlest_cpu(group, t, cpu);
2237 if (new_cpu == -1 || new_cpu == cpu) { 2240 if (new_cpu == -1 || new_cpu == cpu) {
2238 /* Now try balancing at a lower domain level of cpu */ 2241 /* Now try balancing at a lower domain level of cpu */
2239 sd = sd->child; 2242 sd = sd->child;
@@ -2242,10 +2245,10 @@ static int sched_balance_self(int cpu, int flag)
2242 2245
2243 /* Now try balancing at a lower domain level of new_cpu */ 2246 /* Now try balancing at a lower domain level of new_cpu */
2244 cpu = new_cpu; 2247 cpu = new_cpu;
2248 weight = cpumask_weight(sched_domain_span(sd));
2245 sd = NULL; 2249 sd = NULL;
2246 weight = cpus_weight(span);
2247 for_each_domain(cpu, tmp) { 2250 for_each_domain(cpu, tmp) {
2248 if (weight <= cpus_weight(tmp->span)) 2251 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2249 break; 2252 break;
2250 if (tmp->flags & flag) 2253 if (tmp->flags & flag)
2251 sd = tmp; 2254 sd = tmp;
@@ -2311,7 +2314,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2311 cpu = task_cpu(p); 2314 cpu = task_cpu(p);
2312 2315
2313 for_each_domain(this_cpu, sd) { 2316 for_each_domain(this_cpu, sd) {
2314 if (cpu_isset(cpu, sd->span)) { 2317 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2315 update_shares(sd); 2318 update_shares(sd);
2316 break; 2319 break;
2317 } 2320 }
@@ -2360,7 +2363,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2360 else { 2363 else {
2361 struct sched_domain *sd; 2364 struct sched_domain *sd;
2362 for_each_domain(this_cpu, sd) { 2365 for_each_domain(this_cpu, sd) {
2363 if (cpu_isset(cpu, sd->span)) { 2366 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2364 schedstat_inc(sd, ttwu_wake_remote); 2367 schedstat_inc(sd, ttwu_wake_remote);
2365 break; 2368 break;
2366 } 2369 }
@@ -2893,7 +2896,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2893 struct rq *rq; 2896 struct rq *rq;
2894 2897
2895 rq = task_rq_lock(p, &flags); 2898 rq = task_rq_lock(p, &flags);
2896 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2899 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2897 || unlikely(!cpu_active(dest_cpu))) 2900 || unlikely(!cpu_active(dest_cpu)))
2898 goto out; 2901 goto out;
2899 2902
@@ -2958,7 +2961,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2958 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2961 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2959 * 3) are cache-hot on their current CPU. 2962 * 3) are cache-hot on their current CPU.
2960 */ 2963 */
2961 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2964 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2962 schedstat_inc(p, se.nr_failed_migrations_affine); 2965 schedstat_inc(p, se.nr_failed_migrations_affine);
2963 return 0; 2966 return 0;
2964 } 2967 }
@@ -3133,7 +3136,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3133static struct sched_group * 3136static struct sched_group *
3134find_busiest_group(struct sched_domain *sd, int this_cpu, 3137find_busiest_group(struct sched_domain *sd, int this_cpu,
3135 unsigned long *imbalance, enum cpu_idle_type idle, 3138 unsigned long *imbalance, enum cpu_idle_type idle,
3136 int *sd_idle, const cpumask_t *cpus, int *balance) 3139 int *sd_idle, const struct cpumask *cpus, int *balance)
3137{ 3140{
3138 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3141 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3139 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3142 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3169,10 +3172,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3169 unsigned long sum_avg_load_per_task; 3172 unsigned long sum_avg_load_per_task;
3170 unsigned long avg_load_per_task; 3173 unsigned long avg_load_per_task;
3171 3174
3172 local_group = cpu_isset(this_cpu, group->cpumask); 3175 local_group = cpumask_test_cpu(this_cpu,
3176 sched_group_cpus(group));
3173 3177
3174 if (local_group) 3178 if (local_group)
3175 balance_cpu = first_cpu(group->cpumask); 3179 balance_cpu = cpumask_first(sched_group_cpus(group));
3176 3180
3177 /* Tally up the load of all CPUs in the group */ 3181 /* Tally up the load of all CPUs in the group */
3178 sum_weighted_load = sum_nr_running = avg_load = 0; 3182 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3181,13 +3185,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3181 max_cpu_load = 0; 3185 max_cpu_load = 0;
3182 min_cpu_load = ~0UL; 3186 min_cpu_load = ~0UL;
3183 3187
3184 for_each_cpu_mask_nr(i, group->cpumask) { 3188 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3185 struct rq *rq; 3189 struct rq *rq = cpu_rq(i);
3186
3187 if (!cpu_isset(i, *cpus))
3188 continue;
3189
3190 rq = cpu_rq(i);
3191 3190
3192 if (*sd_idle && rq->nr_running) 3191 if (*sd_idle && rq->nr_running)
3193 *sd_idle = 0; 3192 *sd_idle = 0;
@@ -3298,8 +3297,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3298 */ 3297 */
3299 if ((sum_nr_running < min_nr_running) || 3298 if ((sum_nr_running < min_nr_running) ||
3300 (sum_nr_running == min_nr_running && 3299 (sum_nr_running == min_nr_running &&
3301 first_cpu(group->cpumask) < 3300 cpumask_first(sched_group_cpus(group)) >
3302 first_cpu(group_min->cpumask))) { 3301 cpumask_first(sched_group_cpus(group_min)))) {
3303 group_min = group; 3302 group_min = group;
3304 min_nr_running = sum_nr_running; 3303 min_nr_running = sum_nr_running;
3305 min_load_per_task = sum_weighted_load / 3304 min_load_per_task = sum_weighted_load /
@@ -3314,8 +3313,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3314 if (sum_nr_running <= group_capacity - 1) { 3313 if (sum_nr_running <= group_capacity - 1) {
3315 if (sum_nr_running > leader_nr_running || 3314 if (sum_nr_running > leader_nr_running ||
3316 (sum_nr_running == leader_nr_running && 3315 (sum_nr_running == leader_nr_running &&
3317 first_cpu(group->cpumask) > 3316 cpumask_first(sched_group_cpus(group)) <
3318 first_cpu(group_leader->cpumask))) { 3317 cpumask_first(sched_group_cpus(group_leader)))) {
3319 group_leader = group; 3318 group_leader = group;
3320 leader_nr_running = sum_nr_running; 3319 leader_nr_running = sum_nr_running;
3321 } 3320 }
@@ -3441,6 +3440,10 @@ out_balanced:
3441 3440
3442 if (this == group_leader && group_leader != group_min) { 3441 if (this == group_leader && group_leader != group_min) {
3443 *imbalance = min_load_per_task; 3442 *imbalance = min_load_per_task;
3443 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3444 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3445 cpumask_first(sched_group_cpus(group_leader));
3446 }
3444 return group_min; 3447 return group_min;
3445 } 3448 }
3446#endif 3449#endif
@@ -3454,16 +3457,16 @@ ret:
3454 */ 3457 */
3455static struct rq * 3458static struct rq *
3456find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3459find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3457 unsigned long imbalance, const cpumask_t *cpus) 3460 unsigned long imbalance, const struct cpumask *cpus)
3458{ 3461{
3459 struct rq *busiest = NULL, *rq; 3462 struct rq *busiest = NULL, *rq;
3460 unsigned long max_load = 0; 3463 unsigned long max_load = 0;
3461 int i; 3464 int i;
3462 3465
3463 for_each_cpu_mask_nr(i, group->cpumask) { 3466 for_each_cpu(i, sched_group_cpus(group)) {
3464 unsigned long wl; 3467 unsigned long wl;
3465 3468
3466 if (!cpu_isset(i, *cpus)) 3469 if (!cpumask_test_cpu(i, cpus))
3467 continue; 3470 continue;
3468 3471
3469 rq = cpu_rq(i); 3472 rq = cpu_rq(i);
@@ -3493,7 +3496,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3493 */ 3496 */
3494static int load_balance(int this_cpu, struct rq *this_rq, 3497static int load_balance(int this_cpu, struct rq *this_rq,
3495 struct sched_domain *sd, enum cpu_idle_type idle, 3498 struct sched_domain *sd, enum cpu_idle_type idle,
3496 int *balance, cpumask_t *cpus) 3499 int *balance, struct cpumask *cpus)
3497{ 3500{
3498 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3501 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3499 struct sched_group *group; 3502 struct sched_group *group;
@@ -3501,7 +3504,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3501 struct rq *busiest; 3504 struct rq *busiest;
3502 unsigned long flags; 3505 unsigned long flags;
3503 3506
3504 cpus_setall(*cpus); 3507 cpumask_setall(cpus);
3505 3508
3506 /* 3509 /*
3507 * When power savings policy is enabled for the parent domain, idle 3510 * When power savings policy is enabled for the parent domain, idle
@@ -3561,8 +3564,8 @@ redo:
3561 3564
3562 /* All tasks on this runqueue were pinned by CPU affinity */ 3565 /* All tasks on this runqueue were pinned by CPU affinity */
3563 if (unlikely(all_pinned)) { 3566 if (unlikely(all_pinned)) {
3564 cpu_clear(cpu_of(busiest), *cpus); 3567 cpumask_clear_cpu(cpu_of(busiest), cpus);
3565 if (!cpus_empty(*cpus)) 3568 if (!cpumask_empty(cpus))
3566 goto redo; 3569 goto redo;
3567 goto out_balanced; 3570 goto out_balanced;
3568 } 3571 }
@@ -3579,7 +3582,8 @@ redo:
3579 /* don't kick the migration_thread, if the curr 3582 /* don't kick the migration_thread, if the curr
3580 * task on busiest cpu can't be moved to this_cpu 3583 * task on busiest cpu can't be moved to this_cpu
3581 */ 3584 */
3582 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3585 if (!cpumask_test_cpu(this_cpu,
3586 &busiest->curr->cpus_allowed)) {
3583 spin_unlock_irqrestore(&busiest->lock, flags); 3587 spin_unlock_irqrestore(&busiest->lock, flags);
3584 all_pinned = 1; 3588 all_pinned = 1;
3585 goto out_one_pinned; 3589 goto out_one_pinned;
@@ -3654,7 +3658,7 @@ out:
3654 */ 3658 */
3655static int 3659static int
3656load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3660load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3657 cpumask_t *cpus) 3661 struct cpumask *cpus)
3658{ 3662{
3659 struct sched_group *group; 3663 struct sched_group *group;
3660 struct rq *busiest = NULL; 3664 struct rq *busiest = NULL;
@@ -3663,7 +3667,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3663 int sd_idle = 0; 3667 int sd_idle = 0;
3664 int all_pinned = 0; 3668 int all_pinned = 0;
3665 3669
3666 cpus_setall(*cpus); 3670 cpumask_setall(cpus);
3667 3671
3668 /* 3672 /*
3669 * When power savings policy is enabled for the parent domain, idle 3673 * When power savings policy is enabled for the parent domain, idle
@@ -3707,17 +3711,76 @@ redo:
3707 double_unlock_balance(this_rq, busiest); 3711 double_unlock_balance(this_rq, busiest);
3708 3712
3709 if (unlikely(all_pinned)) { 3713 if (unlikely(all_pinned)) {
3710 cpu_clear(cpu_of(busiest), *cpus); 3714 cpumask_clear_cpu(cpu_of(busiest), cpus);
3711 if (!cpus_empty(*cpus)) 3715 if (!cpumask_empty(cpus))
3712 goto redo; 3716 goto redo;
3713 } 3717 }
3714 } 3718 }
3715 3719
3716 if (!ld_moved) { 3720 if (!ld_moved) {
3721 int active_balance = 0;
3722
3717 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3723 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3718 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3724 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3719 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3725 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3720 return -1; 3726 return -1;
3727
3728 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3729 return -1;
3730
3731 if (sd->nr_balance_failed++ < 2)
3732 return -1;
3733
3734 /*
3735 * The only task running in a non-idle cpu can be moved to this
3736 * cpu in an attempt to completely freeup the other CPU
3737 * package. The same method used to move task in load_balance()
3738 * have been extended for load_balance_newidle() to speedup
3739 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
3740 *
3741 * The package power saving logic comes from
3742 * find_busiest_group(). If there are no imbalance, then
3743 * f_b_g() will return NULL. However when sched_mc={1,2} then
3744 * f_b_g() will select a group from which a running task may be
3745 * pulled to this cpu in order to make the other package idle.
3746 * If there is no opportunity to make a package idle and if
3747 * there are no imbalance, then f_b_g() will return NULL and no
3748 * action will be taken in load_balance_newidle().
3749 *
3750 * Under normal task pull operation due to imbalance, there
3751 * will be more than one task in the source run queue and
3752 * move_tasks() will succeed. ld_moved will be true and this
3753 * active balance code will not be triggered.
3754 */
3755
3756 /* Lock busiest in correct order while this_rq is held */
3757 double_lock_balance(this_rq, busiest);
3758
3759 /*
3760 * don't kick the migration_thread, if the curr
3761 * task on busiest cpu can't be moved to this_cpu
3762 */
3763 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
3764 double_unlock_balance(this_rq, busiest);
3765 all_pinned = 1;
3766 return ld_moved;
3767 }
3768
3769 if (!busiest->active_balance) {
3770 busiest->active_balance = 1;
3771 busiest->push_cpu = this_cpu;
3772 active_balance = 1;
3773 }
3774
3775 double_unlock_balance(this_rq, busiest);
3776 /*
3777 * Should not call ttwu while holding a rq->lock
3778 */
3779 spin_unlock(&this_rq->lock);
3780 if (active_balance)
3781 wake_up_process(busiest->migration_thread);
3782 spin_lock(&this_rq->lock);
3783
3721 } else 3784 } else
3722 sd->nr_balance_failed = 0; 3785 sd->nr_balance_failed = 0;
3723 3786
@@ -3743,7 +3806,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3743 struct sched_domain *sd; 3806 struct sched_domain *sd;
3744 int pulled_task = 0; 3807 int pulled_task = 0;
3745 unsigned long next_balance = jiffies + HZ; 3808 unsigned long next_balance = jiffies + HZ;
3746 cpumask_t tmpmask; 3809 cpumask_var_t tmpmask;
3810
3811 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3812 return;
3747 3813
3748 for_each_domain(this_cpu, sd) { 3814 for_each_domain(this_cpu, sd) {
3749 unsigned long interval; 3815 unsigned long interval;
@@ -3754,7 +3820,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3754 if (sd->flags & SD_BALANCE_NEWIDLE) 3820 if (sd->flags & SD_BALANCE_NEWIDLE)
3755 /* If we've pulled tasks over stop searching: */ 3821 /* If we've pulled tasks over stop searching: */
3756 pulled_task = load_balance_newidle(this_cpu, this_rq, 3822 pulled_task = load_balance_newidle(this_cpu, this_rq,
3757 sd, &tmpmask); 3823 sd, tmpmask);
3758 3824
3759 interval = msecs_to_jiffies(sd->balance_interval); 3825 interval = msecs_to_jiffies(sd->balance_interval);
3760 if (time_after(next_balance, sd->last_balance + interval)) 3826 if (time_after(next_balance, sd->last_balance + interval))
@@ -3769,6 +3835,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3769 */ 3835 */
3770 this_rq->next_balance = next_balance; 3836 this_rq->next_balance = next_balance;
3771 } 3837 }
3838 free_cpumask_var(tmpmask);
3772} 3839}
3773 3840
3774/* 3841/*
@@ -3806,7 +3873,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3806 /* Search for an sd spanning us and the target CPU. */ 3873 /* Search for an sd spanning us and the target CPU. */
3807 for_each_domain(target_cpu, sd) { 3874 for_each_domain(target_cpu, sd) {
3808 if ((sd->flags & SD_LOAD_BALANCE) && 3875 if ((sd->flags & SD_LOAD_BALANCE) &&
3809 cpu_isset(busiest_cpu, sd->span)) 3876 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3810 break; 3877 break;
3811 } 3878 }
3812 3879
@@ -3825,10 +3892,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3825#ifdef CONFIG_NO_HZ 3892#ifdef CONFIG_NO_HZ
3826static struct { 3893static struct {
3827 atomic_t load_balancer; 3894 atomic_t load_balancer;
3828 cpumask_t cpu_mask; 3895 cpumask_var_t cpu_mask;
3829} nohz ____cacheline_aligned = { 3896} nohz ____cacheline_aligned = {
3830 .load_balancer = ATOMIC_INIT(-1), 3897 .load_balancer = ATOMIC_INIT(-1),
3831 .cpu_mask = CPU_MASK_NONE,
3832}; 3898};
3833 3899
3834/* 3900/*
@@ -3856,7 +3922,7 @@ int select_nohz_load_balancer(int stop_tick)
3856 int cpu = smp_processor_id(); 3922 int cpu = smp_processor_id();
3857 3923
3858 if (stop_tick) { 3924 if (stop_tick) {
3859 cpu_set(cpu, nohz.cpu_mask); 3925 cpumask_set_cpu(cpu, nohz.cpu_mask);
3860 cpu_rq(cpu)->in_nohz_recently = 1; 3926 cpu_rq(cpu)->in_nohz_recently = 1;
3861 3927
3862 /* 3928 /*
@@ -3870,7 +3936,7 @@ int select_nohz_load_balancer(int stop_tick)
3870 } 3936 }
3871 3937
3872 /* time for ilb owner also to sleep */ 3938 /* time for ilb owner also to sleep */
3873 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3939 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3874 if (atomic_read(&nohz.load_balancer) == cpu) 3940 if (atomic_read(&nohz.load_balancer) == cpu)
3875 atomic_set(&nohz.load_balancer, -1); 3941 atomic_set(&nohz.load_balancer, -1);
3876 return 0; 3942 return 0;
@@ -3883,10 +3949,10 @@ int select_nohz_load_balancer(int stop_tick)
3883 } else if (atomic_read(&nohz.load_balancer) == cpu) 3949 } else if (atomic_read(&nohz.load_balancer) == cpu)
3884 return 1; 3950 return 1;
3885 } else { 3951 } else {
3886 if (!cpu_isset(cpu, nohz.cpu_mask)) 3952 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3887 return 0; 3953 return 0;
3888 3954
3889 cpu_clear(cpu, nohz.cpu_mask); 3955 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3890 3956
3891 if (atomic_read(&nohz.load_balancer) == cpu) 3957 if (atomic_read(&nohz.load_balancer) == cpu)
3892 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3958 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3914,7 +3980,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3914 unsigned long next_balance = jiffies + 60*HZ; 3980 unsigned long next_balance = jiffies + 60*HZ;
3915 int update_next_balance = 0; 3981 int update_next_balance = 0;
3916 int need_serialize; 3982 int need_serialize;
3917 cpumask_t tmp; 3983 cpumask_var_t tmp;
3984
3985 /* Fails alloc? Rebalancing probably not a priority right now. */
3986 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3987 return;
3918 3988
3919 for_each_domain(cpu, sd) { 3989 for_each_domain(cpu, sd) {
3920 if (!(sd->flags & SD_LOAD_BALANCE)) 3990 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3939,7 +4009,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3939 } 4009 }
3940 4010
3941 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4011 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3942 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 4012 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3943 /* 4013 /*
3944 * We've pulled tasks over so either we're no 4014 * We've pulled tasks over so either we're no
3945 * longer idle, or one of our SMT siblings is 4015 * longer idle, or one of our SMT siblings is
@@ -3973,6 +4043,8 @@ out:
3973 */ 4043 */
3974 if (likely(update_next_balance)) 4044 if (likely(update_next_balance))
3975 rq->next_balance = next_balance; 4045 rq->next_balance = next_balance;
4046
4047 free_cpumask_var(tmp);
3976} 4048}
3977 4049
3978/* 4050/*
@@ -3997,12 +4069,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3997 */ 4069 */
3998 if (this_rq->idle_at_tick && 4070 if (this_rq->idle_at_tick &&
3999 atomic_read(&nohz.load_balancer) == this_cpu) { 4071 atomic_read(&nohz.load_balancer) == this_cpu) {
4000 cpumask_t cpus = nohz.cpu_mask;
4001 struct rq *rq; 4072 struct rq *rq;
4002 int balance_cpu; 4073 int balance_cpu;
4003 4074
4004 cpu_clear(this_cpu, cpus); 4075 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4005 for_each_cpu_mask_nr(balance_cpu, cpus) { 4076 if (balance_cpu == this_cpu)
4077 continue;
4078
4006 /* 4079 /*
4007 * If this cpu gets work to do, stop the load balancing 4080 * If this cpu gets work to do, stop the load balancing
4008 * work being done for other cpus. Next load 4081 * work being done for other cpus. Next load
@@ -4040,7 +4113,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4040 rq->in_nohz_recently = 0; 4113 rq->in_nohz_recently = 0;
4041 4114
4042 if (atomic_read(&nohz.load_balancer) == cpu) { 4115 if (atomic_read(&nohz.load_balancer) == cpu) {
4043 cpu_clear(cpu, nohz.cpu_mask); 4116 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4044 atomic_set(&nohz.load_balancer, -1); 4117 atomic_set(&nohz.load_balancer, -1);
4045 } 4118 }
4046 4119
@@ -4053,7 +4126,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4053 * TBD: Traverse the sched domains and nominate 4126 * TBD: Traverse the sched domains and nominate
4054 * the nearest cpu in the nohz.cpu_mask. 4127 * the nearest cpu in the nohz.cpu_mask.
4055 */ 4128 */
4056 int ilb = first_cpu(nohz.cpu_mask); 4129 int ilb = cpumask_first(nohz.cpu_mask);
4057 4130
4058 if (ilb < nr_cpu_ids) 4131 if (ilb < nr_cpu_ids)
4059 resched_cpu(ilb); 4132 resched_cpu(ilb);
@@ -4065,7 +4138,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4065 * cpus with ticks stopped, is it time for that to stop? 4138 * cpus with ticks stopped, is it time for that to stop?
4066 */ 4139 */
4067 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4140 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4068 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4141 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4069 resched_cpu(cpu); 4142 resched_cpu(cpu);
4070 return; 4143 return;
4071 } 4144 }
@@ -4075,7 +4148,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4075 * someone else, then no need raise the SCHED_SOFTIRQ 4148 * someone else, then no need raise the SCHED_SOFTIRQ
4076 */ 4149 */
4077 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4150 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4078 cpu_isset(cpu, nohz.cpu_mask)) 4151 cpumask_test_cpu(cpu, nohz.cpu_mask))
4079 return; 4152 return;
4080#endif 4153#endif
4081 if (time_after_eq(jiffies, rq->next_balance)) 4154 if (time_after_eq(jiffies, rq->next_balance))
@@ -4150,13 +4223,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
4150 * Account user cpu time to a process. 4223 * Account user cpu time to a process.
4151 * @p: the process that the cpu time gets accounted to 4224 * @p: the process that the cpu time gets accounted to
4152 * @cputime: the cpu time spent in user space since the last update 4225 * @cputime: the cpu time spent in user space since the last update
4226 * @cputime_scaled: cputime scaled by cpu frequency
4153 */ 4227 */
4154void account_user_time(struct task_struct *p, cputime_t cputime) 4228void account_user_time(struct task_struct *p, cputime_t cputime,
4229 cputime_t cputime_scaled)
4155{ 4230{
4156 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4231 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4157 cputime64_t tmp; 4232 cputime64_t tmp;
4158 4233
4234 /* Add user time to process. */
4159 p->utime = cputime_add(p->utime, cputime); 4235 p->utime = cputime_add(p->utime, cputime);
4236 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4160 account_group_user_time(p, cputime); 4237 account_group_user_time(p, cputime);
4161 4238
4162 /* Add user time to cpustat. */ 4239 /* Add user time to cpustat. */
@@ -4173,51 +4250,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4173 * Account guest cpu time to a process. 4250 * Account guest cpu time to a process.
4174 * @p: the process that the cpu time gets accounted to 4251 * @p: the process that the cpu time gets accounted to
4175 * @cputime: the cpu time spent in virtual machine since the last update 4252 * @cputime: the cpu time spent in virtual machine since the last update
4253 * @cputime_scaled: cputime scaled by cpu frequency
4176 */ 4254 */
4177static void account_guest_time(struct task_struct *p, cputime_t cputime) 4255static void account_guest_time(struct task_struct *p, cputime_t cputime,
4256 cputime_t cputime_scaled)
4178{ 4257{
4179 cputime64_t tmp; 4258 cputime64_t tmp;
4180 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4259 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4181 4260
4182 tmp = cputime_to_cputime64(cputime); 4261 tmp = cputime_to_cputime64(cputime);
4183 4262
4263 /* Add guest time to process. */
4184 p->utime = cputime_add(p->utime, cputime); 4264 p->utime = cputime_add(p->utime, cputime);
4265 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4185 account_group_user_time(p, cputime); 4266 account_group_user_time(p, cputime);
4186 p->gtime = cputime_add(p->gtime, cputime); 4267 p->gtime = cputime_add(p->gtime, cputime);
4187 4268
4269 /* Add guest time to cpustat. */
4188 cpustat->user = cputime64_add(cpustat->user, tmp); 4270 cpustat->user = cputime64_add(cpustat->user, tmp);
4189 cpustat->guest = cputime64_add(cpustat->guest, tmp); 4271 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4190} 4272}
4191 4273
4192/* 4274/*
4193 * Account scaled user cpu time to a process.
4194 * @p: the process that the cpu time gets accounted to
4195 * @cputime: the cpu time spent in user space since the last update
4196 */
4197void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4198{
4199 p->utimescaled = cputime_add(p->utimescaled, cputime);
4200}
4201
4202/*
4203 * Account system cpu time to a process. 4275 * Account system cpu time to a process.
4204 * @p: the process that the cpu time gets accounted to 4276 * @p: the process that the cpu time gets accounted to
4205 * @hardirq_offset: the offset to subtract from hardirq_count() 4277 * @hardirq_offset: the offset to subtract from hardirq_count()
4206 * @cputime: the cpu time spent in kernel space since the last update 4278 * @cputime: the cpu time spent in kernel space since the last update
4279 * @cputime_scaled: cputime scaled by cpu frequency
4207 */ 4280 */
4208void account_system_time(struct task_struct *p, int hardirq_offset, 4281void account_system_time(struct task_struct *p, int hardirq_offset,
4209 cputime_t cputime) 4282 cputime_t cputime, cputime_t cputime_scaled)
4210{ 4283{
4211 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4284 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4212 struct rq *rq = this_rq();
4213 cputime64_t tmp; 4285 cputime64_t tmp;
4214 4286
4215 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 4287 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4216 account_guest_time(p, cputime); 4288 account_guest_time(p, cputime, cputime_scaled);
4217 return; 4289 return;
4218 } 4290 }
4219 4291
4292 /* Add system time to process. */
4220 p->stime = cputime_add(p->stime, cputime); 4293 p->stime = cputime_add(p->stime, cputime);
4294 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4221 account_group_system_time(p, cputime); 4295 account_group_system_time(p, cputime);
4222 4296
4223 /* Add system time to cpustat. */ 4297 /* Add system time to cpustat. */
@@ -4226,49 +4300,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4226 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4300 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4227 else if (softirq_count()) 4301 else if (softirq_count())
4228 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4302 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4229 else if (p != rq->idle)
4230 cpustat->system = cputime64_add(cpustat->system, tmp);
4231 else if (atomic_read(&rq->nr_iowait) > 0)
4232 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4233 else 4303 else
4234 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4304 cpustat->system = cputime64_add(cpustat->system, tmp);
4305
4235 /* Account for system time used */ 4306 /* Account for system time used */
4236 acct_update_integrals(p); 4307 acct_update_integrals(p);
4237} 4308}
4238 4309
4239/* 4310/*
4240 * Account scaled system cpu time to a process. 4311 * Account for involuntary wait time.
4241 * @p: the process that the cpu time gets accounted to 4312 * @steal: the cpu time spent in involuntary wait
4242 * @hardirq_offset: the offset to subtract from hardirq_count()
4243 * @cputime: the cpu time spent in kernel space since the last update
4244 */ 4313 */
4245void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 4314void account_steal_time(cputime_t cputime)
4246{ 4315{
4247 p->stimescaled = cputime_add(p->stimescaled, cputime); 4316 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4317 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4318
4319 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4248} 4320}
4249 4321
4250/* 4322/*
4251 * Account for involuntary wait time. 4323 * Account for idle time.
4252 * @p: the process from which the cpu time has been stolen 4324 * @cputime: the cpu time spent in idle wait
4253 * @steal: the cpu time spent in involuntary wait
4254 */ 4325 */
4255void account_steal_time(struct task_struct *p, cputime_t steal) 4326void account_idle_time(cputime_t cputime)
4256{ 4327{
4257 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4328 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4258 cputime64_t tmp = cputime_to_cputime64(steal); 4329 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4259 struct rq *rq = this_rq(); 4330 struct rq *rq = this_rq();
4260 4331
4261 if (p == rq->idle) { 4332 if (atomic_read(&rq->nr_iowait) > 0)
4262 p->stime = cputime_add(p->stime, steal); 4333 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4263 account_group_system_time(p, steal); 4334 else
4264 if (atomic_read(&rq->nr_iowait) > 0) 4335 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4265 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4266 else
4267 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4268 } else
4269 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4270} 4336}
4271 4337
4338#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4339
4340/*
4341 * Account a single tick of cpu time.
4342 * @p: the process that the cpu time gets accounted to
4343 * @user_tick: indicates if the tick is a user or a system tick
4344 */
4345void account_process_tick(struct task_struct *p, int user_tick)
4346{
4347 cputime_t one_jiffy = jiffies_to_cputime(1);
4348 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4349 struct rq *rq = this_rq();
4350
4351 if (user_tick)
4352 account_user_time(p, one_jiffy, one_jiffy_scaled);
4353 else if (p != rq->idle)
4354 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4355 one_jiffy_scaled);
4356 else
4357 account_idle_time(one_jiffy);
4358}
4359
4360/*
4361 * Account multiple ticks of steal time.
4362 * @p: the process from which the cpu time has been stolen
4363 * @ticks: number of stolen ticks
4364 */
4365void account_steal_ticks(unsigned long ticks)
4366{
4367 account_steal_time(jiffies_to_cputime(ticks));
4368}
4369
4370/*
4371 * Account multiple ticks of idle time.
4372 * @ticks: number of stolen ticks
4373 */
4374void account_idle_ticks(unsigned long ticks)
4375{
4376 account_idle_time(jiffies_to_cputime(ticks));
4377}
4378
4379#endif
4380
4272/* 4381/*
4273 * Use precise platform statistics if available: 4382 * Use precise platform statistics if available:
4274 */ 4383 */
@@ -4397,7 +4506,7 @@ void __kprobes sub_preempt_count(int val)
4397 /* 4506 /*
4398 * Underflow? 4507 * Underflow?
4399 */ 4508 */
4400 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4509 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4401 return; 4510 return;
4402 /* 4511 /*
4403 * Is the spinlock portion underflowing? 4512 * Is the spinlock portion underflowing?
@@ -5474,10 +5583,9 @@ out_unlock:
5474 return retval; 5583 return retval;
5475} 5584}
5476 5585
5477long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5586long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5478{ 5587{
5479 cpumask_t cpus_allowed; 5588 cpumask_var_t cpus_allowed, new_mask;
5480 cpumask_t new_mask = *in_mask;
5481 struct task_struct *p; 5589 struct task_struct *p;
5482 int retval; 5590 int retval;
5483 5591
@@ -5499,6 +5607,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5499 get_task_struct(p); 5607 get_task_struct(p);
5500 read_unlock(&tasklist_lock); 5608 read_unlock(&tasklist_lock);
5501 5609
5610 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5611 retval = -ENOMEM;
5612 goto out_put_task;
5613 }
5614 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5615 retval = -ENOMEM;
5616 goto out_free_cpus_allowed;
5617 }
5502 retval = -EPERM; 5618 retval = -EPERM;
5503 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5619 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5504 goto out_unlock; 5620 goto out_unlock;
@@ -5507,37 +5623,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5507 if (retval) 5623 if (retval)
5508 goto out_unlock; 5624 goto out_unlock;
5509 5625
5510 cpuset_cpus_allowed(p, &cpus_allowed); 5626 cpuset_cpus_allowed(p, cpus_allowed);
5511 cpus_and(new_mask, new_mask, cpus_allowed); 5627 cpumask_and(new_mask, in_mask, cpus_allowed);
5512 again: 5628 again:
5513 retval = set_cpus_allowed_ptr(p, &new_mask); 5629 retval = set_cpus_allowed_ptr(p, new_mask);
5514 5630
5515 if (!retval) { 5631 if (!retval) {
5516 cpuset_cpus_allowed(p, &cpus_allowed); 5632 cpuset_cpus_allowed(p, cpus_allowed);
5517 if (!cpus_subset(new_mask, cpus_allowed)) { 5633 if (!cpumask_subset(new_mask, cpus_allowed)) {
5518 /* 5634 /*
5519 * We must have raced with a concurrent cpuset 5635 * We must have raced with a concurrent cpuset
5520 * update. Just reset the cpus_allowed to the 5636 * update. Just reset the cpus_allowed to the
5521 * cpuset's cpus_allowed 5637 * cpuset's cpus_allowed
5522 */ 5638 */
5523 new_mask = cpus_allowed; 5639 cpumask_copy(new_mask, cpus_allowed);
5524 goto again; 5640 goto again;
5525 } 5641 }
5526 } 5642 }
5527out_unlock: 5643out_unlock:
5644 free_cpumask_var(new_mask);
5645out_free_cpus_allowed:
5646 free_cpumask_var(cpus_allowed);
5647out_put_task:
5528 put_task_struct(p); 5648 put_task_struct(p);
5529 put_online_cpus(); 5649 put_online_cpus();
5530 return retval; 5650 return retval;
5531} 5651}
5532 5652
5533static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5653static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5534 cpumask_t *new_mask) 5654 struct cpumask *new_mask)
5535{ 5655{
5536 if (len < sizeof(cpumask_t)) { 5656 if (len < cpumask_size())
5537 memset(new_mask, 0, sizeof(cpumask_t)); 5657 cpumask_clear(new_mask);
5538 } else if (len > sizeof(cpumask_t)) { 5658 else if (len > cpumask_size())
5539 len = sizeof(cpumask_t); 5659 len = cpumask_size();
5540 } 5660
5541 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5661 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5542} 5662}
5543 5663
@@ -5550,17 +5670,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5550asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5670asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5551 unsigned long __user *user_mask_ptr) 5671 unsigned long __user *user_mask_ptr)
5552{ 5672{
5553 cpumask_t new_mask; 5673 cpumask_var_t new_mask;
5554 int retval; 5674 int retval;
5555 5675
5556 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5676 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5557 if (retval) 5677 return -ENOMEM;
5558 return retval;
5559 5678
5560 return sched_setaffinity(pid, &new_mask); 5679 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5680 if (retval == 0)
5681 retval = sched_setaffinity(pid, new_mask);
5682 free_cpumask_var(new_mask);
5683 return retval;
5561} 5684}
5562 5685
5563long sched_getaffinity(pid_t pid, cpumask_t *mask) 5686long sched_getaffinity(pid_t pid, struct cpumask *mask)
5564{ 5687{
5565 struct task_struct *p; 5688 struct task_struct *p;
5566 int retval; 5689 int retval;
@@ -5577,7 +5700,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5577 if (retval) 5700 if (retval)
5578 goto out_unlock; 5701 goto out_unlock;
5579 5702
5580 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5703 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5581 5704
5582out_unlock: 5705out_unlock:
5583 read_unlock(&tasklist_lock); 5706 read_unlock(&tasklist_lock);
@@ -5596,19 +5719,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5596 unsigned long __user *user_mask_ptr) 5719 unsigned long __user *user_mask_ptr)
5597{ 5720{
5598 int ret; 5721 int ret;
5599 cpumask_t mask; 5722 cpumask_var_t mask;
5600 5723
5601 if (len < sizeof(cpumask_t)) 5724 if (len < cpumask_size())
5602 return -EINVAL; 5725 return -EINVAL;
5603 5726
5604 ret = sched_getaffinity(pid, &mask); 5727 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5605 if (ret < 0) 5728 return -ENOMEM;
5606 return ret;
5607 5729
5608 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5730 ret = sched_getaffinity(pid, mask);
5609 return -EFAULT; 5731 if (ret == 0) {
5732 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5733 ret = -EFAULT;
5734 else
5735 ret = cpumask_size();
5736 }
5737 free_cpumask_var(mask);
5610 5738
5611 return sizeof(cpumask_t); 5739 return ret;
5612} 5740}
5613 5741
5614/** 5742/**
@@ -5950,7 +6078,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5950 idle->se.exec_start = sched_clock(); 6078 idle->se.exec_start = sched_clock();
5951 6079
5952 idle->prio = idle->normal_prio = MAX_PRIO; 6080 idle->prio = idle->normal_prio = MAX_PRIO;
5953 idle->cpus_allowed = cpumask_of_cpu(cpu); 6081 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5954 __set_task_cpu(idle, cpu); 6082 __set_task_cpu(idle, cpu);
5955 6083
5956 rq->curr = rq->idle = idle; 6084 rq->curr = rq->idle = idle;
@@ -5977,9 +6105,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5977 * indicates which cpus entered this state. This is used 6105 * indicates which cpus entered this state. This is used
5978 * in the rcu update to wait only for active cpus. For system 6106 * in the rcu update to wait only for active cpus. For system
5979 * which do not switch off the HZ timer nohz_cpu_mask should 6107 * which do not switch off the HZ timer nohz_cpu_mask should
5980 * always be CPU_MASK_NONE. 6108 * always be CPU_BITS_NONE.
5981 */ 6109 */
5982cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 6110cpumask_var_t nohz_cpu_mask;
5983 6111
5984/* 6112/*
5985 * Increase the granularity value when there are more CPUs, 6113 * Increase the granularity value when there are more CPUs,
@@ -6034,7 +6162,7 @@ static inline void sched_init_granularity(void)
6034 * task must not exit() & deallocate itself prematurely. The 6162 * task must not exit() & deallocate itself prematurely. The
6035 * call is not atomic; no spinlocks may be held. 6163 * call is not atomic; no spinlocks may be held.
6036 */ 6164 */
6037int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 6165int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6038{ 6166{
6039 struct migration_req req; 6167 struct migration_req req;
6040 unsigned long flags; 6168 unsigned long flags;
@@ -6042,13 +6170,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6042 int ret = 0; 6170 int ret = 0;
6043 6171
6044 rq = task_rq_lock(p, &flags); 6172 rq = task_rq_lock(p, &flags);
6045 if (!cpus_intersects(*new_mask, cpu_online_map)) { 6173 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
6046 ret = -EINVAL; 6174 ret = -EINVAL;
6047 goto out; 6175 goto out;
6048 } 6176 }
6049 6177
6050 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6178 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
6051 !cpus_equal(p->cpus_allowed, *new_mask))) { 6179 !cpumask_equal(&p->cpus_allowed, new_mask))) {
6052 ret = -EINVAL; 6180 ret = -EINVAL;
6053 goto out; 6181 goto out;
6054 } 6182 }
@@ -6056,15 +6184,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
6056 if (p->sched_class->set_cpus_allowed) 6184 if (p->sched_class->set_cpus_allowed)
6057 p->sched_class->set_cpus_allowed(p, new_mask); 6185 p->sched_class->set_cpus_allowed(p, new_mask);
6058 else { 6186 else {
6059 p->cpus_allowed = *new_mask; 6187 cpumask_copy(&p->cpus_allowed, new_mask);
6060 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 6188 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6061 } 6189 }
6062 6190
6063 /* Can the task run on the task's current CPU? If so, we're done */ 6191 /* Can the task run on the task's current CPU? If so, we're done */
6064 if (cpu_isset(task_cpu(p), *new_mask)) 6192 if (cpumask_test_cpu(task_cpu(p), new_mask))
6065 goto out; 6193 goto out;
6066 6194
6067 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6195 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6068 /* Need help from migration thread: drop lock and wait. */ 6196 /* Need help from migration thread: drop lock and wait. */
6069 task_rq_unlock(rq, &flags); 6197 task_rq_unlock(rq, &flags);
6070 wake_up_process(rq->migration_thread); 6198 wake_up_process(rq->migration_thread);
@@ -6106,7 +6234,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6106 if (task_cpu(p) != src_cpu) 6234 if (task_cpu(p) != src_cpu)
6107 goto done; 6235 goto done;
6108 /* Affinity changed (again). */ 6236 /* Affinity changed (again). */
6109 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6237 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6110 goto fail; 6238 goto fail;
6111 6239
6112 on_rq = p->se.on_rq; 6240 on_rq = p->se.on_rq;
@@ -6203,50 +6331,41 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6203 */ 6331 */
6204static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6332static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6205{ 6333{
6206 unsigned long flags;
6207 cpumask_t mask;
6208 struct rq *rq;
6209 int dest_cpu; 6334 int dest_cpu;
6335 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
6210 6336
6211 do { 6337again:
6212 /* On same node? */ 6338 /* Look for allowed, online CPU in same node. */
6213 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6339 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6214 cpus_and(mask, mask, p->cpus_allowed); 6340 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6215 dest_cpu = any_online_cpu(mask); 6341 goto move;
6216 6342
6217 /* On any allowed CPU? */ 6343 /* Any allowed, online CPU? */
6218 if (dest_cpu >= nr_cpu_ids) 6344 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6219 dest_cpu = any_online_cpu(p->cpus_allowed); 6345 if (dest_cpu < nr_cpu_ids)
6346 goto move;
6220 6347
6221 /* No more Mr. Nice Guy. */ 6348 /* No more Mr. Nice Guy. */
6222 if (dest_cpu >= nr_cpu_ids) { 6349 if (dest_cpu >= nr_cpu_ids) {
6223 cpumask_t cpus_allowed; 6350 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6351 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6224 6352
6225 cpuset_cpus_allowed_locked(p, &cpus_allowed); 6353 /*
6226 /* 6354 * Don't tell them about moving exiting tasks or
6227 * Try to stay on the same cpuset, where the 6355 * kernel threads (both mm NULL), since they never
6228 * current cpuset may be a subset of all cpus. 6356 * leave kernel.
6229 * The cpuset_cpus_allowed_locked() variant of 6357 */
6230 * cpuset_cpus_allowed() will not block. It must be 6358 if (p->mm && printk_ratelimit()) {
6231 * called within calls to cpuset_lock/cpuset_unlock. 6359 printk(KERN_INFO "process %d (%s) no "
6232 */ 6360 "longer affine to cpu%d\n",
6233 rq = task_rq_lock(p, &flags); 6361 task_pid_nr(p), p->comm, dead_cpu);
6234 p->cpus_allowed = cpus_allowed;
6235 dest_cpu = any_online_cpu(p->cpus_allowed);
6236 task_rq_unlock(rq, &flags);
6237
6238 /*
6239 * Don't tell them about moving exiting tasks or
6240 * kernel threads (both mm NULL), since they never
6241 * leave kernel.
6242 */
6243 if (p->mm && printk_ratelimit()) {
6244 printk(KERN_INFO "process %d (%s) no "
6245 "longer affine to cpu%d\n",
6246 task_pid_nr(p), p->comm, dead_cpu);
6247 }
6248 } 6362 }
6249 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6363 }
6364
6365move:
6366 /* It can have affinity changed while we were choosing. */
6367 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6368 goto again;
6250} 6369}
6251 6370
6252/* 6371/*
@@ -6258,7 +6377,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6258 */ 6377 */
6259static void migrate_nr_uninterruptible(struct rq *rq_src) 6378static void migrate_nr_uninterruptible(struct rq *rq_src)
6260{ 6379{
6261 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6380 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6262 unsigned long flags; 6381 unsigned long flags;
6263 6382
6264 local_irq_save(flags); 6383 local_irq_save(flags);
@@ -6548,7 +6667,7 @@ static void set_rq_online(struct rq *rq)
6548 if (!rq->online) { 6667 if (!rq->online) {
6549 const struct sched_class *class; 6668 const struct sched_class *class;
6550 6669
6551 cpu_set(rq->cpu, rq->rd->online); 6670 cpumask_set_cpu(rq->cpu, rq->rd->online);
6552 rq->online = 1; 6671 rq->online = 1;
6553 6672
6554 for_each_class(class) { 6673 for_each_class(class) {
@@ -6568,7 +6687,7 @@ static void set_rq_offline(struct rq *rq)
6568 class->rq_offline(rq); 6687 class->rq_offline(rq);
6569 } 6688 }
6570 6689
6571 cpu_clear(rq->cpu, rq->rd->online); 6690 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6572 rq->online = 0; 6691 rq->online = 0;
6573 } 6692 }
6574} 6693}
@@ -6609,7 +6728,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6609 rq = cpu_rq(cpu); 6728 rq = cpu_rq(cpu);
6610 spin_lock_irqsave(&rq->lock, flags); 6729 spin_lock_irqsave(&rq->lock, flags);
6611 if (rq->rd) { 6730 if (rq->rd) {
6612 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6731 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6613 6732
6614 set_rq_online(rq); 6733 set_rq_online(rq);
6615 } 6734 }
@@ -6623,7 +6742,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6623 break; 6742 break;
6624 /* Unbind it from offline cpu so it can run. Fall thru. */ 6743 /* Unbind it from offline cpu so it can run. Fall thru. */
6625 kthread_bind(cpu_rq(cpu)->migration_thread, 6744 kthread_bind(cpu_rq(cpu)->migration_thread,
6626 any_online_cpu(cpu_online_map)); 6745 cpumask_any(cpu_online_mask));
6627 kthread_stop(cpu_rq(cpu)->migration_thread); 6746 kthread_stop(cpu_rq(cpu)->migration_thread);
6628 cpu_rq(cpu)->migration_thread = NULL; 6747 cpu_rq(cpu)->migration_thread = NULL;
6629 break; 6748 break;
@@ -6673,7 +6792,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6673 rq = cpu_rq(cpu); 6792 rq = cpu_rq(cpu);
6674 spin_lock_irqsave(&rq->lock, flags); 6793 spin_lock_irqsave(&rq->lock, flags);
6675 if (rq->rd) { 6794 if (rq->rd) {
6676 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6795 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6677 set_rq_offline(rq); 6796 set_rq_offline(rq);
6678 } 6797 }
6679 spin_unlock_irqrestore(&rq->lock, flags); 6798 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6712,13 +6831,13 @@ early_initcall(migration_init);
6712#ifdef CONFIG_SCHED_DEBUG 6831#ifdef CONFIG_SCHED_DEBUG
6713 6832
6714static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6833static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6715 cpumask_t *groupmask) 6834 struct cpumask *groupmask)
6716{ 6835{
6717 struct sched_group *group = sd->groups; 6836 struct sched_group *group = sd->groups;
6718 char str[256]; 6837 char str[256];
6719 6838
6720 cpulist_scnprintf(str, sizeof(str), sd->span); 6839 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6721 cpus_clear(*groupmask); 6840 cpumask_clear(groupmask);
6722 6841
6723 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6842 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6724 6843
@@ -6732,11 +6851,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6732 6851
6733 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6852 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6734 6853
6735 if (!cpu_isset(cpu, sd->span)) { 6854 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6736 printk(KERN_ERR "ERROR: domain->span does not contain " 6855 printk(KERN_ERR "ERROR: domain->span does not contain "
6737 "CPU%d\n", cpu); 6856 "CPU%d\n", cpu);
6738 } 6857 }
6739 if (!cpu_isset(cpu, group->cpumask)) { 6858 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6740 printk(KERN_ERR "ERROR: domain->groups does not contain" 6859 printk(KERN_ERR "ERROR: domain->groups does not contain"
6741 " CPU%d\n", cpu); 6860 " CPU%d\n", cpu);
6742 } 6861 }
@@ -6756,31 +6875,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6756 break; 6875 break;
6757 } 6876 }
6758 6877
6759 if (!cpus_weight(group->cpumask)) { 6878 if (!cpumask_weight(sched_group_cpus(group))) {
6760 printk(KERN_CONT "\n"); 6879 printk(KERN_CONT "\n");
6761 printk(KERN_ERR "ERROR: empty group\n"); 6880 printk(KERN_ERR "ERROR: empty group\n");
6762 break; 6881 break;
6763 } 6882 }
6764 6883
6765 if (cpus_intersects(*groupmask, group->cpumask)) { 6884 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6766 printk(KERN_CONT "\n"); 6885 printk(KERN_CONT "\n");
6767 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6886 printk(KERN_ERR "ERROR: repeated CPUs\n");
6768 break; 6887 break;
6769 } 6888 }
6770 6889
6771 cpus_or(*groupmask, *groupmask, group->cpumask); 6890 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6772 6891
6773 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6892 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6774 printk(KERN_CONT " %s", str); 6893 printk(KERN_CONT " %s", str);
6775 6894
6776 group = group->next; 6895 group = group->next;
6777 } while (group != sd->groups); 6896 } while (group != sd->groups);
6778 printk(KERN_CONT "\n"); 6897 printk(KERN_CONT "\n");
6779 6898
6780 if (!cpus_equal(sd->span, *groupmask)) 6899 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6781 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6900 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6782 6901
6783 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6902 if (sd->parent &&
6903 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6784 printk(KERN_ERR "ERROR: parent span is not a superset " 6904 printk(KERN_ERR "ERROR: parent span is not a superset "
6785 "of domain->span\n"); 6905 "of domain->span\n");
6786 return 0; 6906 return 0;
@@ -6788,7 +6908,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6788 6908
6789static void sched_domain_debug(struct sched_domain *sd, int cpu) 6909static void sched_domain_debug(struct sched_domain *sd, int cpu)
6790{ 6910{
6791 cpumask_t *groupmask; 6911 cpumask_var_t groupmask;
6792 int level = 0; 6912 int level = 0;
6793 6913
6794 if (!sd) { 6914 if (!sd) {
@@ -6798,8 +6918,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6798 6918
6799 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6919 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6800 6920
6801 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6921 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6802 if (!groupmask) {
6803 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6922 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6804 return; 6923 return;
6805 } 6924 }
@@ -6812,7 +6931,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6812 if (!sd) 6931 if (!sd)
6813 break; 6932 break;
6814 } 6933 }
6815 kfree(groupmask); 6934 free_cpumask_var(groupmask);
6816} 6935}
6817#else /* !CONFIG_SCHED_DEBUG */ 6936#else /* !CONFIG_SCHED_DEBUG */
6818# define sched_domain_debug(sd, cpu) do { } while (0) 6937# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6820,7 +6939,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6820 6939
6821static int sd_degenerate(struct sched_domain *sd) 6940static int sd_degenerate(struct sched_domain *sd)
6822{ 6941{
6823 if (cpus_weight(sd->span) == 1) 6942 if (cpumask_weight(sched_domain_span(sd)) == 1)
6824 return 1; 6943 return 1;
6825 6944
6826 /* Following flags need at least 2 groups */ 6945 /* Following flags need at least 2 groups */
@@ -6851,7 +6970,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6851 if (sd_degenerate(parent)) 6970 if (sd_degenerate(parent))
6852 return 1; 6971 return 1;
6853 6972
6854 if (!cpus_equal(sd->span, parent->span)) 6973 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6855 return 0; 6974 return 0;
6856 6975
6857 /* Does parent contain flags not in child? */ 6976 /* Does parent contain flags not in child? */
@@ -6875,6 +6994,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6875 return 1; 6994 return 1;
6876} 6995}
6877 6996
6997static void free_rootdomain(struct root_domain *rd)
6998{
6999 cpupri_cleanup(&rd->cpupri);
7000
7001 free_cpumask_var(rd->rto_mask);
7002 free_cpumask_var(rd->online);
7003 free_cpumask_var(rd->span);
7004 kfree(rd);
7005}
7006
6878static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7007static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6879{ 7008{
6880 unsigned long flags; 7009 unsigned long flags;
@@ -6884,38 +7013,62 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6884 if (rq->rd) { 7013 if (rq->rd) {
6885 struct root_domain *old_rd = rq->rd; 7014 struct root_domain *old_rd = rq->rd;
6886 7015
6887 if (cpu_isset(rq->cpu, old_rd->online)) 7016 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6888 set_rq_offline(rq); 7017 set_rq_offline(rq);
6889 7018
6890 cpu_clear(rq->cpu, old_rd->span); 7019 cpumask_clear_cpu(rq->cpu, old_rd->span);
6891 7020
6892 if (atomic_dec_and_test(&old_rd->refcount)) 7021 if (atomic_dec_and_test(&old_rd->refcount))
6893 kfree(old_rd); 7022 free_rootdomain(old_rd);
6894 } 7023 }
6895 7024
6896 atomic_inc(&rd->refcount); 7025 atomic_inc(&rd->refcount);
6897 rq->rd = rd; 7026 rq->rd = rd;
6898 7027
6899 cpu_set(rq->cpu, rd->span); 7028 cpumask_set_cpu(rq->cpu, rd->span);
6900 if (cpu_isset(rq->cpu, cpu_online_map)) 7029 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6901 set_rq_online(rq); 7030 set_rq_online(rq);
6902 7031
6903 spin_unlock_irqrestore(&rq->lock, flags); 7032 spin_unlock_irqrestore(&rq->lock, flags);
6904} 7033}
6905 7034
6906static void init_rootdomain(struct root_domain *rd) 7035static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
6907{ 7036{
6908 memset(rd, 0, sizeof(*rd)); 7037 memset(rd, 0, sizeof(*rd));
6909 7038
6910 cpus_clear(rd->span); 7039 if (bootmem) {
6911 cpus_clear(rd->online); 7040 alloc_bootmem_cpumask_var(&def_root_domain.span);
7041 alloc_bootmem_cpumask_var(&def_root_domain.online);
7042 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7043 cpupri_init(&rd->cpupri, true);
7044 return 0;
7045 }
6912 7046
6913 cpupri_init(&rd->cpupri); 7047 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
7048 goto out;
7049 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
7050 goto free_span;
7051 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
7052 goto free_online;
7053
7054 if (cpupri_init(&rd->cpupri, false) != 0)
7055 goto free_rto_mask;
7056 return 0;
7057
7058free_rto_mask:
7059 free_cpumask_var(rd->rto_mask);
7060free_online:
7061 free_cpumask_var(rd->online);
7062free_span:
7063 free_cpumask_var(rd->span);
7064out:
7065 return -ENOMEM;
6914} 7066}
6915 7067
6916static void init_defrootdomain(void) 7068static void init_defrootdomain(void)
6917{ 7069{
6918 init_rootdomain(&def_root_domain); 7070 init_rootdomain(&def_root_domain, true);
7071
6919 atomic_set(&def_root_domain.refcount, 1); 7072 atomic_set(&def_root_domain.refcount, 1);
6920} 7073}
6921 7074
@@ -6927,7 +7080,10 @@ static struct root_domain *alloc_rootdomain(void)
6927 if (!rd) 7080 if (!rd)
6928 return NULL; 7081 return NULL;
6929 7082
6930 init_rootdomain(rd); 7083 if (init_rootdomain(rd, false) != 0) {
7084 kfree(rd);
7085 return NULL;
7086 }
6931 7087
6932 return rd; 7088 return rd;
6933} 7089}
@@ -6969,19 +7125,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6969} 7125}
6970 7126
6971/* cpus with isolated domains */ 7127/* cpus with isolated domains */
6972static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 7128static cpumask_var_t cpu_isolated_map;
6973 7129
6974/* Setup the mask of cpus configured for isolated domains */ 7130/* Setup the mask of cpus configured for isolated domains */
6975static int __init isolated_cpu_setup(char *str) 7131static int __init isolated_cpu_setup(char *str)
6976{ 7132{
6977 static int __initdata ints[NR_CPUS]; 7133 cpulist_parse(str, cpu_isolated_map);
6978 int i;
6979
6980 str = get_options(str, ARRAY_SIZE(ints), ints);
6981 cpus_clear(cpu_isolated_map);
6982 for (i = 1; i <= ints[0]; i++)
6983 if (ints[i] < NR_CPUS)
6984 cpu_set(ints[i], cpu_isolated_map);
6985 return 1; 7134 return 1;
6986} 7135}
6987 7136
@@ -6990,42 +7139,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6990/* 7139/*
6991 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 7140 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6992 * to a function which identifies what group(along with sched group) a CPU 7141 * to a function which identifies what group(along with sched group) a CPU
6993 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 7142 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6994 * (due to the fact that we keep track of groups covered with a cpumask_t). 7143 * (due to the fact that we keep track of groups covered with a struct cpumask).
6995 * 7144 *
6996 * init_sched_build_groups will build a circular linked list of the groups 7145 * init_sched_build_groups will build a circular linked list of the groups
6997 * covered by the given span, and will set each group's ->cpumask correctly, 7146 * covered by the given span, and will set each group's ->cpumask correctly,
6998 * and ->cpu_power to 0. 7147 * and ->cpu_power to 0.
6999 */ 7148 */
7000static void 7149static void
7001init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 7150init_sched_build_groups(const struct cpumask *span,
7002 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 7151 const struct cpumask *cpu_map,
7152 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg, 7153 struct sched_group **sg,
7004 cpumask_t *tmpmask), 7154 struct cpumask *tmpmask),
7005 cpumask_t *covered, cpumask_t *tmpmask) 7155 struct cpumask *covered, struct cpumask *tmpmask)
7006{ 7156{
7007 struct sched_group *first = NULL, *last = NULL; 7157 struct sched_group *first = NULL, *last = NULL;
7008 int i; 7158 int i;
7009 7159
7010 cpus_clear(*covered); 7160 cpumask_clear(covered);
7011 7161
7012 for_each_cpu_mask_nr(i, *span) { 7162 for_each_cpu(i, span) {
7013 struct sched_group *sg; 7163 struct sched_group *sg;
7014 int group = group_fn(i, cpu_map, &sg, tmpmask); 7164 int group = group_fn(i, cpu_map, &sg, tmpmask);
7015 int j; 7165 int j;
7016 7166
7017 if (cpu_isset(i, *covered)) 7167 if (cpumask_test_cpu(i, covered))
7018 continue; 7168 continue;
7019 7169
7020 cpus_clear(sg->cpumask); 7170 cpumask_clear(sched_group_cpus(sg));
7021 sg->__cpu_power = 0; 7171 sg->__cpu_power = 0;
7022 7172
7023 for_each_cpu_mask_nr(j, *span) { 7173 for_each_cpu(j, span) {
7024 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 7174 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
7025 continue; 7175 continue;
7026 7176
7027 cpu_set(j, *covered); 7177 cpumask_set_cpu(j, covered);
7028 cpu_set(j, sg->cpumask); 7178 cpumask_set_cpu(j, sched_group_cpus(sg));
7029 } 7179 }
7030 if (!first) 7180 if (!first)
7031 first = sg; 7181 first = sg;
@@ -7089,23 +7239,21 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7089 * should be one that prevents unnecessary balancing, but also spreads tasks 7239 * should be one that prevents unnecessary balancing, but also spreads tasks
7090 * out optimally. 7240 * out optimally.
7091 */ 7241 */
7092static void sched_domain_node_span(int node, cpumask_t *span) 7242static void sched_domain_node_span(int node, struct cpumask *span)
7093{ 7243{
7094 nodemask_t used_nodes; 7244 nodemask_t used_nodes;
7095 node_to_cpumask_ptr(nodemask, node);
7096 int i; 7245 int i;
7097 7246
7098 cpus_clear(*span); 7247 cpumask_clear(span);
7099 nodes_clear(used_nodes); 7248 nodes_clear(used_nodes);
7100 7249
7101 cpus_or(*span, *span, *nodemask); 7250 cpumask_or(span, span, cpumask_of_node(node));
7102 node_set(node, used_nodes); 7251 node_set(node, used_nodes);
7103 7252
7104 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7253 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7105 int next_node = find_next_best_node(node, &used_nodes); 7254 int next_node = find_next_best_node(node, &used_nodes);
7106 7255
7107 node_to_cpumask_ptr_next(nodemask, next_node); 7256 cpumask_or(span, span, cpumask_of_node(next_node));
7108 cpus_or(*span, *span, *nodemask);
7109 } 7257 }
7110} 7258}
7111#endif /* CONFIG_NUMA */ 7259#endif /* CONFIG_NUMA */
@@ -7113,18 +7261,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7113int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7261int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7114 7262
7115/* 7263/*
7264 * The cpus mask in sched_group and sched_domain hangs off the end.
7265 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7266 * for nr_cpu_ids < CONFIG_NR_CPUS.
7267 */
7268struct static_sched_group {
7269 struct sched_group sg;
7270 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7271};
7272
7273struct static_sched_domain {
7274 struct sched_domain sd;
7275 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7276};
7277
7278/*
7116 * SMT sched-domains: 7279 * SMT sched-domains:
7117 */ 7280 */
7118#ifdef CONFIG_SCHED_SMT 7281#ifdef CONFIG_SCHED_SMT
7119static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7282static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7120static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7283static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7121 7284
7122static int 7285static int
7123cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7286cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7124 cpumask_t *unused) 7287 struct sched_group **sg, struct cpumask *unused)
7125{ 7288{
7126 if (sg) 7289 if (sg)
7127 *sg = &per_cpu(sched_group_cpus, cpu); 7290 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7128 return cpu; 7291 return cpu;
7129} 7292}
7130#endif /* CONFIG_SCHED_SMT */ 7293#endif /* CONFIG_SCHED_SMT */
@@ -7133,56 +7296,53 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7133 * multi-core sched-domains: 7296 * multi-core sched-domains:
7134 */ 7297 */
7135#ifdef CONFIG_SCHED_MC 7298#ifdef CONFIG_SCHED_MC
7136static DEFINE_PER_CPU(struct sched_domain, core_domains); 7299static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7137static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7300static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7138#endif /* CONFIG_SCHED_MC */ 7301#endif /* CONFIG_SCHED_MC */
7139 7302
7140#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7303#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7141static int 7304static int
7142cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7305cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7143 cpumask_t *mask) 7306 struct sched_group **sg, struct cpumask *mask)
7144{ 7307{
7145 int group; 7308 int group;
7146 7309
7147 *mask = per_cpu(cpu_sibling_map, cpu); 7310 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7148 cpus_and(*mask, *mask, *cpu_map); 7311 group = cpumask_first(mask);
7149 group = first_cpu(*mask);
7150 if (sg) 7312 if (sg)
7151 *sg = &per_cpu(sched_group_core, group); 7313 *sg = &per_cpu(sched_group_core, group).sg;
7152 return group; 7314 return group;
7153} 7315}
7154#elif defined(CONFIG_SCHED_MC) 7316#elif defined(CONFIG_SCHED_MC)
7155static int 7317static int
7156cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7318cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7157 cpumask_t *unused) 7319 struct sched_group **sg, struct cpumask *unused)
7158{ 7320{
7159 if (sg) 7321 if (sg)
7160 *sg = &per_cpu(sched_group_core, cpu); 7322 *sg = &per_cpu(sched_group_core, cpu).sg;
7161 return cpu; 7323 return cpu;
7162} 7324}
7163#endif 7325#endif
7164 7326
7165static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7327static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7166static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7328static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7167 7329
7168static int 7330static int
7169cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7331cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7170 cpumask_t *mask) 7332 struct sched_group **sg, struct cpumask *mask)
7171{ 7333{
7172 int group; 7334 int group;
7173#ifdef CONFIG_SCHED_MC 7335#ifdef CONFIG_SCHED_MC
7174 *mask = cpu_coregroup_map(cpu); 7336 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7175 cpus_and(*mask, *mask, *cpu_map); 7337 group = cpumask_first(mask);
7176 group = first_cpu(*mask);
7177#elif defined(CONFIG_SCHED_SMT) 7338#elif defined(CONFIG_SCHED_SMT)
7178 *mask = per_cpu(cpu_sibling_map, cpu); 7339 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7179 cpus_and(*mask, *mask, *cpu_map); 7340 group = cpumask_first(mask);
7180 group = first_cpu(*mask);
7181#else 7341#else
7182 group = cpu; 7342 group = cpu;
7183#endif 7343#endif
7184 if (sg) 7344 if (sg)
7185 *sg = &per_cpu(sched_group_phys, group); 7345 *sg = &per_cpu(sched_group_phys, group).sg;
7186 return group; 7346 return group;
7187} 7347}
7188 7348
@@ -7196,19 +7356,19 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7196static struct sched_group ***sched_group_nodes_bycpu; 7356static struct sched_group ***sched_group_nodes_bycpu;
7197 7357
7198static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7358static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7199static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7359static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7200 7360
7201static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7361static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7202 struct sched_group **sg, cpumask_t *nodemask) 7362 struct sched_group **sg,
7363 struct cpumask *nodemask)
7203{ 7364{
7204 int group; 7365 int group;
7205 7366
7206 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7367 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7207 cpus_and(*nodemask, *nodemask, *cpu_map); 7368 group = cpumask_first(nodemask);
7208 group = first_cpu(*nodemask);
7209 7369
7210 if (sg) 7370 if (sg)
7211 *sg = &per_cpu(sched_group_allnodes, group); 7371 *sg = &per_cpu(sched_group_allnodes, group).sg;
7212 return group; 7372 return group;
7213} 7373}
7214 7374
@@ -7220,11 +7380,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7220 if (!sg) 7380 if (!sg)
7221 return; 7381 return;
7222 do { 7382 do {
7223 for_each_cpu_mask_nr(j, sg->cpumask) { 7383 for_each_cpu(j, sched_group_cpus(sg)) {
7224 struct sched_domain *sd; 7384 struct sched_domain *sd;
7225 7385
7226 sd = &per_cpu(phys_domains, j); 7386 sd = &per_cpu(phys_domains, j).sd;
7227 if (j != first_cpu(sd->groups->cpumask)) { 7387 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7228 /* 7388 /*
7229 * Only add "power" once for each 7389 * Only add "power" once for each
7230 * physical package. 7390 * physical package.
@@ -7241,11 +7401,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7241 7401
7242#ifdef CONFIG_NUMA 7402#ifdef CONFIG_NUMA
7243/* Free memory allocated for various sched_group structures */ 7403/* Free memory allocated for various sched_group structures */
7244static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7404static void free_sched_groups(const struct cpumask *cpu_map,
7405 struct cpumask *nodemask)
7245{ 7406{
7246 int cpu, i; 7407 int cpu, i;
7247 7408
7248 for_each_cpu_mask_nr(cpu, *cpu_map) { 7409 for_each_cpu(cpu, cpu_map) {
7249 struct sched_group **sched_group_nodes 7410 struct sched_group **sched_group_nodes
7250 = sched_group_nodes_bycpu[cpu]; 7411 = sched_group_nodes_bycpu[cpu];
7251 7412
@@ -7255,9 +7416,8 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7255 for (i = 0; i < nr_node_ids; i++) { 7416 for (i = 0; i < nr_node_ids; i++) {
7256 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7417 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7257 7418
7258 *nodemask = node_to_cpumask(i); 7419 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7259 cpus_and(*nodemask, *nodemask, *cpu_map); 7420 if (cpumask_empty(nodemask))
7260 if (cpus_empty(*nodemask))
7261 continue; 7421 continue;
7262 7422
7263 if (sg == NULL) 7423 if (sg == NULL)
@@ -7275,7 +7435,8 @@ next_sg:
7275 } 7435 }
7276} 7436}
7277#else /* !CONFIG_NUMA */ 7437#else /* !CONFIG_NUMA */
7278static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7438static void free_sched_groups(const struct cpumask *cpu_map,
7439 struct cpumask *nodemask)
7279{ 7440{
7280} 7441}
7281#endif /* CONFIG_NUMA */ 7442#endif /* CONFIG_NUMA */
@@ -7301,7 +7462,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7301 7462
7302 WARN_ON(!sd || !sd->groups); 7463 WARN_ON(!sd || !sd->groups);
7303 7464
7304 if (cpu != first_cpu(sd->groups->cpumask)) 7465 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7305 return; 7466 return;
7306 7467
7307 child = sd->child; 7468 child = sd->child;
@@ -7366,48 +7527,6 @@ SD_INIT_FUNC(CPU)
7366 SD_INIT_FUNC(MC) 7527 SD_INIT_FUNC(MC)
7367#endif 7528#endif
7368 7529
7369/*
7370 * To minimize stack usage kmalloc room for cpumasks and share the
7371 * space as the usage in build_sched_domains() dictates. Used only
7372 * if the amount of space is significant.
7373 */
7374struct allmasks {
7375 cpumask_t tmpmask; /* make this one first */
7376 union {
7377 cpumask_t nodemask;
7378 cpumask_t this_sibling_map;
7379 cpumask_t this_core_map;
7380 };
7381 cpumask_t send_covered;
7382
7383#ifdef CONFIG_NUMA
7384 cpumask_t domainspan;
7385 cpumask_t covered;
7386 cpumask_t notcovered;
7387#endif
7388};
7389
7390#if NR_CPUS > 128
7391#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7392static inline void sched_cpumask_alloc(struct allmasks **masks)
7393{
7394 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7395}
7396static inline void sched_cpumask_free(struct allmasks *masks)
7397{
7398 kfree(masks);
7399}
7400#else
7401#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7402static inline void sched_cpumask_alloc(struct allmasks **masks)
7403{ }
7404static inline void sched_cpumask_free(struct allmasks *masks)
7405{ }
7406#endif
7407
7408#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7409 ((unsigned long)(a) + offsetof(struct allmasks, v))
7410
7411static int default_relax_domain_level = -1; 7530static int default_relax_domain_level = -1;
7412 7531
7413static int __init setup_relax_domain_level(char *str) 7532static int __init setup_relax_domain_level(char *str)
@@ -7447,17 +7566,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7447 * Build sched domains for a given set of cpus and attach the sched domains 7566 * Build sched domains for a given set of cpus and attach the sched domains
7448 * to the individual cpus 7567 * to the individual cpus
7449 */ 7568 */
7450static int __build_sched_domains(const cpumask_t *cpu_map, 7569static int __build_sched_domains(const struct cpumask *cpu_map,
7451 struct sched_domain_attr *attr) 7570 struct sched_domain_attr *attr)
7452{ 7571{
7453 int i; 7572 int i, err = -ENOMEM;
7454 struct root_domain *rd; 7573 struct root_domain *rd;
7455 SCHED_CPUMASK_DECLARE(allmasks); 7574 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7456 cpumask_t *tmpmask; 7575 tmpmask;
7457#ifdef CONFIG_NUMA 7576#ifdef CONFIG_NUMA
7577 cpumask_var_t domainspan, covered, notcovered;
7458 struct sched_group **sched_group_nodes = NULL; 7578 struct sched_group **sched_group_nodes = NULL;
7459 int sd_allnodes = 0; 7579 int sd_allnodes = 0;
7460 7580
7581 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7582 goto out;
7583 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7584 goto free_domainspan;
7585 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7586 goto free_covered;
7587#endif
7588
7589 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7590 goto free_notcovered;
7591 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7592 goto free_nodemask;
7593 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7594 goto free_this_sibling_map;
7595 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7596 goto free_this_core_map;
7597 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7598 goto free_send_covered;
7599
7600#ifdef CONFIG_NUMA
7461 /* 7601 /*
7462 * Allocate the per-node list of sched groups 7602 * Allocate the per-node list of sched groups
7463 */ 7603 */
@@ -7465,54 +7605,35 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7465 GFP_KERNEL); 7605 GFP_KERNEL);
7466 if (!sched_group_nodes) { 7606 if (!sched_group_nodes) {
7467 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7607 printk(KERN_WARNING "Can not alloc sched group node list\n");
7468 return -ENOMEM; 7608 goto free_tmpmask;
7469 } 7609 }
7470#endif 7610#endif
7471 7611
7472 rd = alloc_rootdomain(); 7612 rd = alloc_rootdomain();
7473 if (!rd) { 7613 if (!rd) {
7474 printk(KERN_WARNING "Cannot alloc root domain\n"); 7614 printk(KERN_WARNING "Cannot alloc root domain\n");
7475#ifdef CONFIG_NUMA 7615 goto free_sched_groups;
7476 kfree(sched_group_nodes);
7477#endif
7478 return -ENOMEM;
7479 }
7480
7481 /* get space for all scratch cpumask variables */
7482 sched_cpumask_alloc(&allmasks);
7483 if (!allmasks) {
7484 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7485 kfree(rd);
7486#ifdef CONFIG_NUMA
7487 kfree(sched_group_nodes);
7488#endif
7489 return -ENOMEM;
7490 } 7616 }
7491 7617
7492 tmpmask = (cpumask_t *)allmasks;
7493
7494
7495#ifdef CONFIG_NUMA 7618#ifdef CONFIG_NUMA
7496 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 7619 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7497#endif 7620#endif
7498 7621
7499 /* 7622 /*
7500 * Set up domains for cpus specified by the cpu_map. 7623 * Set up domains for cpus specified by the cpu_map.
7501 */ 7624 */
7502 for_each_cpu_mask_nr(i, *cpu_map) { 7625 for_each_cpu(i, cpu_map) {
7503 struct sched_domain *sd = NULL, *p; 7626 struct sched_domain *sd = NULL, *p;
7504 SCHED_CPUMASK_VAR(nodemask, allmasks);
7505 7627
7506 *nodemask = node_to_cpumask(cpu_to_node(i)); 7628 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
7507 cpus_and(*nodemask, *nodemask, *cpu_map);
7508 7629
7509#ifdef CONFIG_NUMA 7630#ifdef CONFIG_NUMA
7510 if (cpus_weight(*cpu_map) > 7631 if (cpumask_weight(cpu_map) >
7511 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7632 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7512 sd = &per_cpu(allnodes_domains, i); 7633 sd = &per_cpu(allnodes_domains, i);
7513 SD_INIT(sd, ALLNODES); 7634 SD_INIT(sd, ALLNODES);
7514 set_domain_attribute(sd, attr); 7635 set_domain_attribute(sd, attr);
7515 sd->span = *cpu_map; 7636 cpumask_copy(sched_domain_span(sd), cpu_map);
7516 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7637 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7517 p = sd; 7638 p = sd;
7518 sd_allnodes = 1; 7639 sd_allnodes = 1;
@@ -7522,18 +7643,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7522 sd = &per_cpu(node_domains, i); 7643 sd = &per_cpu(node_domains, i);
7523 SD_INIT(sd, NODE); 7644 SD_INIT(sd, NODE);
7524 set_domain_attribute(sd, attr); 7645 set_domain_attribute(sd, attr);
7525 sched_domain_node_span(cpu_to_node(i), &sd->span); 7646 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7526 sd->parent = p; 7647 sd->parent = p;
7527 if (p) 7648 if (p)
7528 p->child = sd; 7649 p->child = sd;
7529 cpus_and(sd->span, sd->span, *cpu_map); 7650 cpumask_and(sched_domain_span(sd),
7651 sched_domain_span(sd), cpu_map);
7530#endif 7652#endif
7531 7653
7532 p = sd; 7654 p = sd;
7533 sd = &per_cpu(phys_domains, i); 7655 sd = &per_cpu(phys_domains, i).sd;
7534 SD_INIT(sd, CPU); 7656 SD_INIT(sd, CPU);
7535 set_domain_attribute(sd, attr); 7657 set_domain_attribute(sd, attr);
7536 sd->span = *nodemask; 7658 cpumask_copy(sched_domain_span(sd), nodemask);
7537 sd->parent = p; 7659 sd->parent = p;
7538 if (p) 7660 if (p)
7539 p->child = sd; 7661 p->child = sd;
@@ -7541,11 +7663,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7541 7663
7542#ifdef CONFIG_SCHED_MC 7664#ifdef CONFIG_SCHED_MC
7543 p = sd; 7665 p = sd;
7544 sd = &per_cpu(core_domains, i); 7666 sd = &per_cpu(core_domains, i).sd;
7545 SD_INIT(sd, MC); 7667 SD_INIT(sd, MC);
7546 set_domain_attribute(sd, attr); 7668 set_domain_attribute(sd, attr);
7547 sd->span = cpu_coregroup_map(i); 7669 cpumask_and(sched_domain_span(sd), cpu_map,
7548 cpus_and(sd->span, sd->span, *cpu_map); 7670 cpu_coregroup_mask(i));
7549 sd->parent = p; 7671 sd->parent = p;
7550 p->child = sd; 7672 p->child = sd;
7551 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7673 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7553,11 +7675,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7553 7675
7554#ifdef CONFIG_SCHED_SMT 7676#ifdef CONFIG_SCHED_SMT
7555 p = sd; 7677 p = sd;
7556 sd = &per_cpu(cpu_domains, i); 7678 sd = &per_cpu(cpu_domains, i).sd;
7557 SD_INIT(sd, SIBLING); 7679 SD_INIT(sd, SIBLING);
7558 set_domain_attribute(sd, attr); 7680 set_domain_attribute(sd, attr);
7559 sd->span = per_cpu(cpu_sibling_map, i); 7681 cpumask_and(sched_domain_span(sd),
7560 cpus_and(sd->span, sd->span, *cpu_map); 7682 &per_cpu(cpu_sibling_map, i), cpu_map);
7561 sd->parent = p; 7683 sd->parent = p;
7562 p->child = sd; 7684 p->child = sd;
7563 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7685 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7566,13 +7688,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7566 7688
7567#ifdef CONFIG_SCHED_SMT 7689#ifdef CONFIG_SCHED_SMT
7568 /* Set up CPU (sibling) groups */ 7690 /* Set up CPU (sibling) groups */
7569 for_each_cpu_mask_nr(i, *cpu_map) { 7691 for_each_cpu(i, cpu_map) {
7570 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7692 cpumask_and(this_sibling_map,
7571 SCHED_CPUMASK_VAR(send_covered, allmasks); 7693 &per_cpu(cpu_sibling_map, i), cpu_map);
7572 7694 if (i != cpumask_first(this_sibling_map))
7573 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7574 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7575 if (i != first_cpu(*this_sibling_map))
7576 continue; 7695 continue;
7577 7696
7578 init_sched_build_groups(this_sibling_map, cpu_map, 7697 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7583,13 +7702,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7583 7702
7584#ifdef CONFIG_SCHED_MC 7703#ifdef CONFIG_SCHED_MC
7585 /* Set up multi-core groups */ 7704 /* Set up multi-core groups */
7586 for_each_cpu_mask_nr(i, *cpu_map) { 7705 for_each_cpu(i, cpu_map) {
7587 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7706 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
7588 SCHED_CPUMASK_VAR(send_covered, allmasks); 7707 if (i != cpumask_first(this_core_map))
7589
7590 *this_core_map = cpu_coregroup_map(i);
7591 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7592 if (i != first_cpu(*this_core_map))
7593 continue; 7708 continue;
7594 7709
7595 init_sched_build_groups(this_core_map, cpu_map, 7710 init_sched_build_groups(this_core_map, cpu_map,
@@ -7600,12 +7715,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7600 7715
7601 /* Set up physical groups */ 7716 /* Set up physical groups */
7602 for (i = 0; i < nr_node_ids; i++) { 7717 for (i = 0; i < nr_node_ids; i++) {
7603 SCHED_CPUMASK_VAR(nodemask, allmasks); 7718 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7604 SCHED_CPUMASK_VAR(send_covered, allmasks); 7719 if (cpumask_empty(nodemask))
7605
7606 *nodemask = node_to_cpumask(i);
7607 cpus_and(*nodemask, *nodemask, *cpu_map);
7608 if (cpus_empty(*nodemask))
7609 continue; 7720 continue;
7610 7721
7611 init_sched_build_groups(nodemask, cpu_map, 7722 init_sched_build_groups(nodemask, cpu_map,
@@ -7616,8 +7727,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7616#ifdef CONFIG_NUMA 7727#ifdef CONFIG_NUMA
7617 /* Set up node groups */ 7728 /* Set up node groups */
7618 if (sd_allnodes) { 7729 if (sd_allnodes) {
7619 SCHED_CPUMASK_VAR(send_covered, allmasks);
7620
7621 init_sched_build_groups(cpu_map, cpu_map, 7730 init_sched_build_groups(cpu_map, cpu_map,
7622 &cpu_to_allnodes_group, 7731 &cpu_to_allnodes_group,
7623 send_covered, tmpmask); 7732 send_covered, tmpmask);
@@ -7626,58 +7735,53 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7626 for (i = 0; i < nr_node_ids; i++) { 7735 for (i = 0; i < nr_node_ids; i++) {
7627 /* Set up node groups */ 7736 /* Set up node groups */
7628 struct sched_group *sg, *prev; 7737 struct sched_group *sg, *prev;
7629 SCHED_CPUMASK_VAR(nodemask, allmasks);
7630 SCHED_CPUMASK_VAR(domainspan, allmasks);
7631 SCHED_CPUMASK_VAR(covered, allmasks);
7632 int j; 7738 int j;
7633 7739
7634 *nodemask = node_to_cpumask(i); 7740 cpumask_clear(covered);
7635 cpus_clear(*covered); 7741 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7636 7742 if (cpumask_empty(nodemask)) {
7637 cpus_and(*nodemask, *nodemask, *cpu_map);
7638 if (cpus_empty(*nodemask)) {
7639 sched_group_nodes[i] = NULL; 7743 sched_group_nodes[i] = NULL;
7640 continue; 7744 continue;
7641 } 7745 }
7642 7746
7643 sched_domain_node_span(i, domainspan); 7747 sched_domain_node_span(i, domainspan);
7644 cpus_and(*domainspan, *domainspan, *cpu_map); 7748 cpumask_and(domainspan, domainspan, cpu_map);
7645 7749
7646 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7750 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7751 GFP_KERNEL, i);
7647 if (!sg) { 7752 if (!sg) {
7648 printk(KERN_WARNING "Can not alloc domain group for " 7753 printk(KERN_WARNING "Can not alloc domain group for "
7649 "node %d\n", i); 7754 "node %d\n", i);
7650 goto error; 7755 goto error;
7651 } 7756 }
7652 sched_group_nodes[i] = sg; 7757 sched_group_nodes[i] = sg;
7653 for_each_cpu_mask_nr(j, *nodemask) { 7758 for_each_cpu(j, nodemask) {
7654 struct sched_domain *sd; 7759 struct sched_domain *sd;
7655 7760
7656 sd = &per_cpu(node_domains, j); 7761 sd = &per_cpu(node_domains, j);
7657 sd->groups = sg; 7762 sd->groups = sg;
7658 } 7763 }
7659 sg->__cpu_power = 0; 7764 sg->__cpu_power = 0;
7660 sg->cpumask = *nodemask; 7765 cpumask_copy(sched_group_cpus(sg), nodemask);
7661 sg->next = sg; 7766 sg->next = sg;
7662 cpus_or(*covered, *covered, *nodemask); 7767 cpumask_or(covered, covered, nodemask);
7663 prev = sg; 7768 prev = sg;
7664 7769
7665 for (j = 0; j < nr_node_ids; j++) { 7770 for (j = 0; j < nr_node_ids; j++) {
7666 SCHED_CPUMASK_VAR(notcovered, allmasks);
7667 int n = (i + j) % nr_node_ids; 7771 int n = (i + j) % nr_node_ids;
7668 node_to_cpumask_ptr(pnodemask, n);
7669 7772
7670 cpus_complement(*notcovered, *covered); 7773 cpumask_complement(notcovered, covered);
7671 cpus_and(*tmpmask, *notcovered, *cpu_map); 7774 cpumask_and(tmpmask, notcovered, cpu_map);
7672 cpus_and(*tmpmask, *tmpmask, *domainspan); 7775 cpumask_and(tmpmask, tmpmask, domainspan);
7673 if (cpus_empty(*tmpmask)) 7776 if (cpumask_empty(tmpmask))
7674 break; 7777 break;
7675 7778
7676 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7779 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
7677 if (cpus_empty(*tmpmask)) 7780 if (cpumask_empty(tmpmask))
7678 continue; 7781 continue;
7679 7782
7680 sg = kmalloc_node(sizeof(struct sched_group), 7783 sg = kmalloc_node(sizeof(struct sched_group) +
7784 cpumask_size(),
7681 GFP_KERNEL, i); 7785 GFP_KERNEL, i);
7682 if (!sg) { 7786 if (!sg) {
7683 printk(KERN_WARNING 7787 printk(KERN_WARNING
@@ -7685,9 +7789,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7685 goto error; 7789 goto error;
7686 } 7790 }
7687 sg->__cpu_power = 0; 7791 sg->__cpu_power = 0;
7688 sg->cpumask = *tmpmask; 7792 cpumask_copy(sched_group_cpus(sg), tmpmask);
7689 sg->next = prev->next; 7793 sg->next = prev->next;
7690 cpus_or(*covered, *covered, *tmpmask); 7794 cpumask_or(covered, covered, tmpmask);
7691 prev->next = sg; 7795 prev->next = sg;
7692 prev = sg; 7796 prev = sg;
7693 } 7797 }
@@ -7696,22 +7800,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7696 7800
7697 /* Calculate CPU power for physical packages and nodes */ 7801 /* Calculate CPU power for physical packages and nodes */
7698#ifdef CONFIG_SCHED_SMT 7802#ifdef CONFIG_SCHED_SMT
7699 for_each_cpu_mask_nr(i, *cpu_map) { 7803 for_each_cpu(i, cpu_map) {
7700 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7804 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7701 7805
7702 init_sched_groups_power(i, sd); 7806 init_sched_groups_power(i, sd);
7703 } 7807 }
7704#endif 7808#endif
7705#ifdef CONFIG_SCHED_MC 7809#ifdef CONFIG_SCHED_MC
7706 for_each_cpu_mask_nr(i, *cpu_map) { 7810 for_each_cpu(i, cpu_map) {
7707 struct sched_domain *sd = &per_cpu(core_domains, i); 7811 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7708 7812
7709 init_sched_groups_power(i, sd); 7813 init_sched_groups_power(i, sd);
7710 } 7814 }
7711#endif 7815#endif
7712 7816
7713 for_each_cpu_mask_nr(i, *cpu_map) { 7817 for_each_cpu(i, cpu_map) {
7714 struct sched_domain *sd = &per_cpu(phys_domains, i); 7818 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7715 7819
7716 init_sched_groups_power(i, sd); 7820 init_sched_groups_power(i, sd);
7717 } 7821 }
@@ -7723,53 +7827,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7723 if (sd_allnodes) { 7827 if (sd_allnodes) {
7724 struct sched_group *sg; 7828 struct sched_group *sg;
7725 7829
7726 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7830 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7727 tmpmask); 7831 tmpmask);
7728 init_numa_sched_groups_power(sg); 7832 init_numa_sched_groups_power(sg);
7729 } 7833 }
7730#endif 7834#endif
7731 7835
7732 /* Attach the domains */ 7836 /* Attach the domains */
7733 for_each_cpu_mask_nr(i, *cpu_map) { 7837 for_each_cpu(i, cpu_map) {
7734 struct sched_domain *sd; 7838 struct sched_domain *sd;
7735#ifdef CONFIG_SCHED_SMT 7839#ifdef CONFIG_SCHED_SMT
7736 sd = &per_cpu(cpu_domains, i); 7840 sd = &per_cpu(cpu_domains, i).sd;
7737#elif defined(CONFIG_SCHED_MC) 7841#elif defined(CONFIG_SCHED_MC)
7738 sd = &per_cpu(core_domains, i); 7842 sd = &per_cpu(core_domains, i).sd;
7739#else 7843#else
7740 sd = &per_cpu(phys_domains, i); 7844 sd = &per_cpu(phys_domains, i).sd;
7741#endif 7845#endif
7742 cpu_attach_domain(sd, rd, i); 7846 cpu_attach_domain(sd, rd, i);
7743 } 7847 }
7744 7848
7745 sched_cpumask_free(allmasks); 7849 err = 0;
7746 return 0; 7850
7851free_tmpmask:
7852 free_cpumask_var(tmpmask);
7853free_send_covered:
7854 free_cpumask_var(send_covered);
7855free_this_core_map:
7856 free_cpumask_var(this_core_map);
7857free_this_sibling_map:
7858 free_cpumask_var(this_sibling_map);
7859free_nodemask:
7860 free_cpumask_var(nodemask);
7861free_notcovered:
7862#ifdef CONFIG_NUMA
7863 free_cpumask_var(notcovered);
7864free_covered:
7865 free_cpumask_var(covered);
7866free_domainspan:
7867 free_cpumask_var(domainspan);
7868out:
7869#endif
7870 return err;
7871
7872free_sched_groups:
7873#ifdef CONFIG_NUMA
7874 kfree(sched_group_nodes);
7875#endif
7876 goto free_tmpmask;
7747 7877
7748#ifdef CONFIG_NUMA 7878#ifdef CONFIG_NUMA
7749error: 7879error:
7750 free_sched_groups(cpu_map, tmpmask); 7880 free_sched_groups(cpu_map, tmpmask);
7751 sched_cpumask_free(allmasks); 7881 free_rootdomain(rd);
7752 kfree(rd); 7882 goto free_tmpmask;
7753 return -ENOMEM;
7754#endif 7883#endif
7755} 7884}
7756 7885
7757static int build_sched_domains(const cpumask_t *cpu_map) 7886static int build_sched_domains(const struct cpumask *cpu_map)
7758{ 7887{
7759 return __build_sched_domains(cpu_map, NULL); 7888 return __build_sched_domains(cpu_map, NULL);
7760} 7889}
7761 7890
7762static cpumask_t *doms_cur; /* current sched domains */ 7891static struct cpumask *doms_cur; /* current sched domains */
7763static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7892static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7764static struct sched_domain_attr *dattr_cur; 7893static struct sched_domain_attr *dattr_cur;
7765 /* attribues of custom domains in 'doms_cur' */ 7894 /* attribues of custom domains in 'doms_cur' */
7766 7895
7767/* 7896/*
7768 * Special case: If a kmalloc of a doms_cur partition (array of 7897 * Special case: If a kmalloc of a doms_cur partition (array of
7769 * cpumask_t) fails, then fallback to a single sched domain, 7898 * cpumask) fails, then fallback to a single sched domain,
7770 * as determined by the single cpumask_t fallback_doms. 7899 * as determined by the single cpumask fallback_doms.
7771 */ 7900 */
7772static cpumask_t fallback_doms; 7901static cpumask_var_t fallback_doms;
7773 7902
7774/* 7903/*
7775 * arch_update_cpu_topology lets virtualized architectures update the 7904 * arch_update_cpu_topology lets virtualized architectures update the
@@ -7786,16 +7915,16 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
7786 * For now this just excludes isolated cpus, but could be used to 7915 * For now this just excludes isolated cpus, but could be used to
7787 * exclude other special cases in the future. 7916 * exclude other special cases in the future.
7788 */ 7917 */
7789static int arch_init_sched_domains(const cpumask_t *cpu_map) 7918static int arch_init_sched_domains(const struct cpumask *cpu_map)
7790{ 7919{
7791 int err; 7920 int err;
7792 7921
7793 arch_update_cpu_topology(); 7922 arch_update_cpu_topology();
7794 ndoms_cur = 1; 7923 ndoms_cur = 1;
7795 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7924 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7796 if (!doms_cur) 7925 if (!doms_cur)
7797 doms_cur = &fallback_doms; 7926 doms_cur = fallback_doms;
7798 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7927 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7799 dattr_cur = NULL; 7928 dattr_cur = NULL;
7800 err = build_sched_domains(doms_cur); 7929 err = build_sched_domains(doms_cur);
7801 register_sched_domain_sysctl(); 7930 register_sched_domain_sysctl();
@@ -7803,8 +7932,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7803 return err; 7932 return err;
7804} 7933}
7805 7934
7806static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7935static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7807 cpumask_t *tmpmask) 7936 struct cpumask *tmpmask)
7808{ 7937{
7809 free_sched_groups(cpu_map, tmpmask); 7938 free_sched_groups(cpu_map, tmpmask);
7810} 7939}
@@ -7813,15 +7942,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7813 * Detach sched domains from a group of cpus specified in cpu_map 7942 * Detach sched domains from a group of cpus specified in cpu_map
7814 * These cpus will now be attached to the NULL domain 7943 * These cpus will now be attached to the NULL domain
7815 */ 7944 */
7816static void detach_destroy_domains(const cpumask_t *cpu_map) 7945static void detach_destroy_domains(const struct cpumask *cpu_map)
7817{ 7946{
7818 cpumask_t tmpmask; 7947 /* Save because hotplug lock held. */
7948 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7819 int i; 7949 int i;
7820 7950
7821 for_each_cpu_mask_nr(i, *cpu_map) 7951 for_each_cpu(i, cpu_map)
7822 cpu_attach_domain(NULL, &def_root_domain, i); 7952 cpu_attach_domain(NULL, &def_root_domain, i);
7823 synchronize_sched(); 7953 synchronize_sched();
7824 arch_destroy_sched_domains(cpu_map, &tmpmask); 7954 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7825} 7955}
7826 7956
7827/* handle null as "default" */ 7957/* handle null as "default" */
@@ -7846,7 +7976,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7846 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7976 * doms_new[] to the current sched domain partitioning, doms_cur[].
7847 * It destroys each deleted domain and builds each new domain. 7977 * It destroys each deleted domain and builds each new domain.
7848 * 7978 *
7849 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7979 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7850 * The masks don't intersect (don't overlap.) We should setup one 7980 * The masks don't intersect (don't overlap.) We should setup one
7851 * sched domain for each mask. CPUs not in any of the cpumasks will 7981 * sched domain for each mask. CPUs not in any of the cpumasks will
7852 * not be load balanced. If the same cpumask appears both in the 7982 * not be load balanced. If the same cpumask appears both in the
@@ -7860,13 +7990,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7860 * the single partition 'fallback_doms', it also forces the domains 7990 * the single partition 'fallback_doms', it also forces the domains
7861 * to be rebuilt. 7991 * to be rebuilt.
7862 * 7992 *
7863 * If doms_new == NULL it will be replaced with cpu_online_map. 7993 * If doms_new == NULL it will be replaced with cpu_online_mask.
7864 * ndoms_new == 0 is a special case for destroying existing domains, 7994 * ndoms_new == 0 is a special case for destroying existing domains,
7865 * and it will not create the default domain. 7995 * and it will not create the default domain.
7866 * 7996 *
7867 * Call with hotplug lock held 7997 * Call with hotplug lock held
7868 */ 7998 */
7869void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7999/* FIXME: Change to struct cpumask *doms_new[] */
8000void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7870 struct sched_domain_attr *dattr_new) 8001 struct sched_domain_attr *dattr_new)
7871{ 8002{
7872 int i, j, n; 8003 int i, j, n;
@@ -7885,7 +8016,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7885 /* Destroy deleted domains */ 8016 /* Destroy deleted domains */
7886 for (i = 0; i < ndoms_cur; i++) { 8017 for (i = 0; i < ndoms_cur; i++) {
7887 for (j = 0; j < n && !new_topology; j++) { 8018 for (j = 0; j < n && !new_topology; j++) {
7888 if (cpus_equal(doms_cur[i], doms_new[j]) 8019 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7889 && dattrs_equal(dattr_cur, i, dattr_new, j)) 8020 && dattrs_equal(dattr_cur, i, dattr_new, j))
7890 goto match1; 8021 goto match1;
7891 } 8022 }
@@ -7897,15 +8028,15 @@ match1:
7897 8028
7898 if (doms_new == NULL) { 8029 if (doms_new == NULL) {
7899 ndoms_cur = 0; 8030 ndoms_cur = 0;
7900 doms_new = &fallback_doms; 8031 doms_new = fallback_doms;
7901 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 8032 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7902 WARN_ON_ONCE(dattr_new); 8033 WARN_ON_ONCE(dattr_new);
7903 } 8034 }
7904 8035
7905 /* Build new domains */ 8036 /* Build new domains */
7906 for (i = 0; i < ndoms_new; i++) { 8037 for (i = 0; i < ndoms_new; i++) {
7907 for (j = 0; j < ndoms_cur && !new_topology; j++) { 8038 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7908 if (cpus_equal(doms_new[i], doms_cur[j]) 8039 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7909 && dattrs_equal(dattr_new, i, dattr_cur, j)) 8040 && dattrs_equal(dattr_new, i, dattr_cur, j))
7910 goto match2; 8041 goto match2;
7911 } 8042 }
@@ -7917,7 +8048,7 @@ match2:
7917 } 8048 }
7918 8049
7919 /* Remember the new sched domains */ 8050 /* Remember the new sched domains */
7920 if (doms_cur != &fallback_doms) 8051 if (doms_cur != fallback_doms)
7921 kfree(doms_cur); 8052 kfree(doms_cur);
7922 kfree(dattr_cur); /* kfree(NULL) is safe */ 8053 kfree(dattr_cur); /* kfree(NULL) is safe */
7923 doms_cur = doms_new; 8054 doms_cur = doms_new;
@@ -7930,7 +8061,7 @@ match2:
7930} 8061}
7931 8062
7932#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 8063#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7933int arch_reinit_sched_domains(void) 8064static void arch_reinit_sched_domains(void)
7934{ 8065{
7935 get_online_cpus(); 8066 get_online_cpus();
7936 8067
@@ -7939,25 +8070,33 @@ int arch_reinit_sched_domains(void)
7939 8070
7940 rebuild_sched_domains(); 8071 rebuild_sched_domains();
7941 put_online_cpus(); 8072 put_online_cpus();
7942
7943 return 0;
7944} 8073}
7945 8074
7946static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 8075static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7947{ 8076{
7948 int ret; 8077 unsigned int level = 0;
8078
8079 if (sscanf(buf, "%u", &level) != 1)
8080 return -EINVAL;
8081
8082 /*
8083 * level is always be positive so don't check for
8084 * level < POWERSAVINGS_BALANCE_NONE which is 0
8085 * What happens on 0 or 1 byte write,
8086 * need to check for count as well?
8087 */
7949 8088
7950 if (buf[0] != '0' && buf[0] != '1') 8089 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7951 return -EINVAL; 8090 return -EINVAL;
7952 8091
7953 if (smt) 8092 if (smt)
7954 sched_smt_power_savings = (buf[0] == '1'); 8093 sched_smt_power_savings = level;
7955 else 8094 else
7956 sched_mc_power_savings = (buf[0] == '1'); 8095 sched_mc_power_savings = level;
7957 8096
7958 ret = arch_reinit_sched_domains(); 8097 arch_reinit_sched_domains();
7959 8098
7960 return ret ? ret : count; 8099 return count;
7961} 8100}
7962 8101
7963#ifdef CONFIG_SCHED_MC 8102#ifdef CONFIG_SCHED_MC
@@ -7992,7 +8131,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7992 sched_smt_power_savings_store); 8131 sched_smt_power_savings_store);
7993#endif 8132#endif
7994 8133
7995int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 8134int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7996{ 8135{
7997 int err = 0; 8136 int err = 0;
7998 8137
@@ -8057,7 +8196,9 @@ static int update_runtime(struct notifier_block *nfb,
8057 8196
8058void __init sched_init_smp(void) 8197void __init sched_init_smp(void)
8059{ 8198{
8060 cpumask_t non_isolated_cpus; 8199 cpumask_var_t non_isolated_cpus;
8200
8201 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
8061 8202
8062#if defined(CONFIG_NUMA) 8203#if defined(CONFIG_NUMA)
8063 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8204 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -8066,10 +8207,10 @@ void __init sched_init_smp(void)
8066#endif 8207#endif
8067 get_online_cpus(); 8208 get_online_cpus();
8068 mutex_lock(&sched_domains_mutex); 8209 mutex_lock(&sched_domains_mutex);
8069 arch_init_sched_domains(&cpu_online_map); 8210 arch_init_sched_domains(cpu_online_mask);
8070 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8211 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8071 if (cpus_empty(non_isolated_cpus)) 8212 if (cpumask_empty(non_isolated_cpus))
8072 cpu_set(smp_processor_id(), non_isolated_cpus); 8213 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8073 mutex_unlock(&sched_domains_mutex); 8214 mutex_unlock(&sched_domains_mutex);
8074 put_online_cpus(); 8215 put_online_cpus();
8075 8216
@@ -8084,9 +8225,13 @@ void __init sched_init_smp(void)
8084 init_hrtick(); 8225 init_hrtick();
8085 8226
8086 /* Move init over to a non-isolated CPU */ 8227 /* Move init over to a non-isolated CPU */
8087 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8228 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8088 BUG(); 8229 BUG();
8089 sched_init_granularity(); 8230 sched_init_granularity();
8231 free_cpumask_var(non_isolated_cpus);
8232
8233 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8234 init_sched_rt_class();
8090} 8235}
8091#else 8236#else
8092void __init sched_init_smp(void) 8237void __init sched_init_smp(void)
@@ -8401,6 +8546,15 @@ void __init sched_init(void)
8401 */ 8546 */
8402 current->sched_class = &fair_sched_class; 8547 current->sched_class = &fair_sched_class;
8403 8548
8549 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8550 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8551#ifdef CONFIG_SMP
8552#ifdef CONFIG_NO_HZ
8553 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8554#endif
8555 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8556#endif /* SMP */
8557
8404 scheduler_running = 1; 8558 scheduler_running = 1;
8405} 8559}
8406 8560
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..a0b0852414cc 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
227 */ 227 */
228void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
229{ 229{
230 if (timekeeping_suspended)
231 return;
232
230 sched_clock_tick(); 233 sched_clock_tick();
231 touch_softlockup_watchdog(); 234 touch_softlockup_watchdog();
232} 235}
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..1e00bfacf9b8 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
154 * Returns: (void) 152 * Returns: -ENOMEM if memory fails.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5ad4440f0fc4..8e1352c75557 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
386#endif 386#endif
387 387
388/* 388/*
389 * delta *= P[w / rw]
390 */
391static inline unsigned long
392calc_delta_weight(unsigned long delta, struct sched_entity *se)
393{
394 for_each_sched_entity(se) {
395 delta = calc_delta_mine(delta,
396 se->load.weight, &cfs_rq_of(se)->load);
397 }
398
399 return delta;
400}
401
402/*
403 * delta /= w 389 * delta /= w
404 */ 390 */
405static inline unsigned long 391static inline unsigned long
@@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running)
440 */ 426 */
441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 427static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
442{ 428{
443 unsigned long nr_running = cfs_rq->nr_running; 429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
444 430
445 if (unlikely(!se->on_rq)) 431 for_each_sched_entity(se) {
446 nr_running++; 432 struct load_weight *load = &cfs_rq->load;
433
434 if (unlikely(!se->on_rq)) {
435 struct load_weight lw = cfs_rq->load;
447 436
448 return calc_delta_weight(__sched_period(nr_running), se); 437 update_load_add(&lw, se->load.weight);
438 load = &lw;
439 }
440 slice = calc_delta_mine(slice, se->load.weight, load);
441 }
442 return slice;
449} 443}
450 444
451/* 445/*
@@ -1019,16 +1013,33 @@ static void yield_task_fair(struct rq *rq)
1019 * search starts with cpus closest then further out as needed, 1013 * search starts with cpus closest then further out as needed,
1020 * so we always favor a closer, idle cpu. 1014 * so we always favor a closer, idle cpu.
1021 * Domains may include CPUs that are not usable for migration, 1015 * Domains may include CPUs that are not usable for migration,
1022 * hence we need to mask them out (cpu_active_map) 1016 * hence we need to mask them out (cpu_active_mask)
1023 * 1017 *
1024 * Returns the CPU we should wake onto. 1018 * Returns the CPU we should wake onto.
1025 */ 1019 */
1026#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1020#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1027static int wake_idle(int cpu, struct task_struct *p) 1021static int wake_idle(int cpu, struct task_struct *p)
1028{ 1022{
1029 cpumask_t tmp;
1030 struct sched_domain *sd; 1023 struct sched_domain *sd;
1031 int i; 1024 int i;
1025 unsigned int chosen_wakeup_cpu;
1026 int this_cpu;
1027
1028 /*
1029 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1030 * are idle and this is not a kernel thread and this task's affinity
1031 * allows it to be moved to preferred cpu, then just move!
1032 */
1033
1034 this_cpu = smp_processor_id();
1035 chosen_wakeup_cpu =
1036 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1037
1038 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1039 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1040 p->mm && !(p->flags & PF_KTHREAD) &&
1041 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1042 return chosen_wakeup_cpu;
1032 1043
1033 /* 1044 /*
1034 * If it is idle, then it is the best cpu to run this task. 1045 * If it is idle, then it is the best cpu to run this task.
@@ -1046,10 +1057,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1046 if ((sd->flags & SD_WAKE_IDLE) 1057 if ((sd->flags & SD_WAKE_IDLE)
1047 || ((sd->flags & SD_WAKE_IDLE_FAR) 1058 || ((sd->flags & SD_WAKE_IDLE_FAR)
1048 && !task_hot(p, task_rq(p)->clock, sd))) { 1059 && !task_hot(p, task_rq(p)->clock, sd))) {
1049 cpus_and(tmp, sd->span, p->cpus_allowed); 1060 for_each_cpu_and(i, sched_domain_span(sd),
1050 cpus_and(tmp, tmp, cpu_active_map); 1061 &p->cpus_allowed) {
1051 for_each_cpu_mask_nr(i, tmp) { 1062 if (cpu_active(i) && idle_cpu(i)) {
1052 if (idle_cpu(i)) {
1053 if (i != task_cpu(p)) { 1063 if (i != task_cpu(p)) {
1054 schedstat_inc(p, 1064 schedstat_inc(p,
1055 se.nr_wakeups_idle); 1065 se.nr_wakeups_idle);
@@ -1242,13 +1252,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1242 * this_cpu and prev_cpu are present in: 1252 * this_cpu and prev_cpu are present in:
1243 */ 1253 */
1244 for_each_domain(this_cpu, sd) { 1254 for_each_domain(this_cpu, sd) {
1245 if (cpu_isset(prev_cpu, sd->span)) { 1255 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1246 this_sd = sd; 1256 this_sd = sd;
1247 break; 1257 break;
1248 } 1258 }
1249 } 1259 }
1250 1260
1251 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1261 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1252 goto out; 1262 goto out;
1253 1263
1254 /* 1264 /*
@@ -1607,8 +1617,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1607 } 1617 }
1608} 1618}
1609 1619
1610#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1611
1612/* 1620/*
1613 * Share the fairness runtime between parent and child, thus the 1621 * Share the fairness runtime between parent and child, thus the
1614 * total amount of pressure for CPU stays equal - new tasks 1622 * total amount of pressure for CPU stays equal - new tasks
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 51d2af3e6191..954e1a81b796 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
@@ -914,7 +919,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
914static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 919static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
915{ 920{
916 if (!task_running(rq, p) && 921 if (!task_running(rq, p) &&
917 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 922 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
918 (p->rt.nr_cpus_allowed > 1)) 923 (p->rt.nr_cpus_allowed > 1))
919 return 1; 924 return 1;
920 return 0; 925 return 0;
@@ -953,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
953 return next; 958 return next;
954} 959}
955 960
956static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
957 962
958static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
959{ 964{
@@ -973,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
973static int find_lowest_rq(struct task_struct *task) 978static int find_lowest_rq(struct task_struct *task)
974{ 979{
975 struct sched_domain *sd; 980 struct sched_domain *sd;
976 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
977 int this_cpu = smp_processor_id(); 982 int this_cpu = smp_processor_id();
978 int cpu = task_cpu(task); 983 int cpu = task_cpu(task);
979 984
@@ -988,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
988 * I guess we might want to change cpupri_find() to ignore those 993 * I guess we might want to change cpupri_find() to ignore those
989 * in the first place. 994 * in the first place.
990 */ 995 */
991 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 996 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
992 997
993 /* 998 /*
994 * At this point we have built a mask of cpus representing the 999 * At this point we have built a mask of cpus representing the
@@ -998,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
998 * We prioritize the last cpu that the task executed on since 1003 * We prioritize the last cpu that the task executed on since
999 * it is most likely cache-hot in that location. 1004 * it is most likely cache-hot in that location.
1000 */ 1005 */
1001 if (cpu_isset(cpu, *lowest_mask)) 1006 if (cpumask_test_cpu(cpu, lowest_mask))
1002 return cpu; 1007 return cpu;
1003 1008
1004 /* 1009 /*
@@ -1013,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
1013 cpumask_t domain_mask; 1018 cpumask_t domain_mask;
1014 int best_cpu; 1019 int best_cpu;
1015 1020
1016 cpus_and(domain_mask, sd->span, *lowest_mask); 1021 cpumask_and(&domain_mask, sched_domain_span(sd),
1022 lowest_mask);
1017 1023
1018 best_cpu = pick_optimal_cpu(this_cpu, 1024 best_cpu = pick_optimal_cpu(this_cpu,
1019 &domain_mask); 1025 &domain_mask);
@@ -1054,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1054 * Also make sure that it wasn't scheduled on its rq. 1060 * Also make sure that it wasn't scheduled on its rq.
1055 */ 1061 */
1056 if (unlikely(task_rq(task) != rq || 1062 if (unlikely(task_rq(task) != rq ||
1057 !cpu_isset(lowest_rq->cpu, 1063 !cpumask_test_cpu(lowest_rq->cpu,
1058 task->cpus_allowed) || 1064 &task->cpus_allowed) ||
1059 task_running(rq, task) || 1065 task_running(rq, task) ||
1060 !task->se.on_rq)) { 1066 !task->se.on_rq)) {
1061 1067
@@ -1176,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
1176 1182
1177 next = pick_next_task_rt(this_rq); 1183 next = pick_next_task_rt(this_rq);
1178 1184
1179 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1185 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1180 if (this_cpu == cpu) 1186 if (this_cpu == cpu)
1181 continue; 1187 continue;
1182 1188
@@ -1305,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1305} 1311}
1306 1312
1307static void set_cpus_allowed_rt(struct task_struct *p, 1313static void set_cpus_allowed_rt(struct task_struct *p,
1308 const cpumask_t *new_mask) 1314 const struct cpumask *new_mask)
1309{ 1315{
1310 int weight = cpus_weight(*new_mask); 1316 int weight = cpumask_weight(new_mask);
1311 1317
1312 BUG_ON(!rt_task(p)); 1318 BUG_ON(!rt_task(p));
1313 1319
@@ -1328,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1328 update_rt_migration(rq); 1334 update_rt_migration(rq);
1329 } 1335 }
1330 1336
1331 p->cpus_allowed = *new_mask; 1337 cpumask_copy(&p->cpus_allowed, new_mask);
1332 p->rt.nr_cpus_allowed = weight; 1338 p->rt.nr_cpus_allowed = weight;
1333} 1339}
1334 1340
@@ -1371,6 +1377,15 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1371 if (!rq->rt.rt_nr_running) 1377 if (!rq->rt.rt_nr_running)
1372 pull_rt_task(rq); 1378 pull_rt_task(rq);
1373} 1379}
1380
1381static inline void init_sched_rt_class(void)
1382{
1383 unsigned int i;
1384
1385 for_each_possible_cpu(i)
1386 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1387 GFP_KERNEL, cpu_to_node(i));
1388}
1374#endif /* CONFIG_SMP */ 1389#endif /* CONFIG_SMP */
1375 1390
1376/* 1391/*
@@ -1541,3 +1556,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1541 rcu_read_unlock(); 1556 rcu_read_unlock();
1542} 1557}
1543#endif /* CONFIG_SCHED_DEBUG */ 1558#endif /* CONFIG_SCHED_DEBUG */
1559
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 3b01098164c8..f2773b5d1226 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 8e95855ff3cf..3152ac3b62e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
858 q->info.si_signo = sig; 858 q->info.si_signo = sig;
859 q->info.si_errno = 0; 859 q->info.si_errno = 0;
860 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
861 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_tgid_nr_ns(current,
862 task_active_pid_ns(t));
862 q->info.si_uid = current_uid(); 863 q->info.si_uid = current_uid();
863 break; 864 break;
864 case (unsigned long) SEND_SIG_PRIV: 865 case (unsigned long) SEND_SIG_PRIV:
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c55..5cfa0e5e3e88 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,8 +24,8 @@ struct call_function_data {
24 struct call_single_data csd; 24 struct call_single_data csd;
25 spinlock_t lock; 25 spinlock_t lock;
26 unsigned int refs; 26 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head; 27 struct rcu_head rcu_head;
28 unsigned long cpumask_bits[];
29}; 29};
30 30
31struct call_single_queue { 31struct call_single_queue {
@@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void)
110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) { 110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
111 int refs; 111 int refs;
112 112
113 if (!cpu_isset(cpu, data->cpumask)) 113 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
114 continue; 114 continue;
115 115
116 data->csd.func(data->csd.info); 116 data->csd.func(data->csd.info);
117 117
118 spin_lock(&data->lock); 118 spin_lock(&data->lock);
119 cpu_clear(cpu, data->cpumask); 119 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
120 WARN_ON(data->refs == 0); 120 WARN_ON(data->refs == 0);
121 data->refs--; 121 data->refs--;
122 refs = data->refs; 122 refs = data->refs;
@@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
223 local_irq_save(flags); 223 local_irq_save(flags);
224 func(info); 224 func(info);
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { 226 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
227 struct call_single_data *data = NULL; 227 struct call_single_data *data = NULL;
228 228
229 if (!wait) { 229 if (!wait) {
@@ -266,51 +266,19 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
266 generic_exec_single(cpu, data); 266 generic_exec_single(cpu, data);
267} 267}
268 268
269/* Dummy function */ 269/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
270static void quiesce_dummy(void *unused) 270#ifndef arch_send_call_function_ipi_mask
271{ 271#define arch_send_call_function_ipi_mask(maskp) \
272} 272 arch_send_call_function_ipi(*(maskp))
273 273#endif
274/*
275 * Ensure stack based data used in call function mask is safe to free.
276 *
277 * This is needed by smp_call_function_mask when using on-stack data, because
278 * a single call function queue is shared by all CPUs, and any CPU may pick up
279 * the data item on the queue at any time before it is deleted. So we need to
280 * ensure that all CPUs have transitioned through a quiescent state after
281 * this call.
282 *
283 * This is a very slow function, implemented by sending synchronous IPIs to
284 * all possible CPUs. For this reason, we have to alloc data rather than use
285 * stack based data even in the case of synchronous calls. The stack based
286 * data is then just used for deadlock/oom fallback which will be very rare.
287 *
288 * If a faster scheme can be made, we could go back to preferring stack based
289 * data -- the data allocation/free is non-zero cost.
290 */
291static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
292{
293 struct call_single_data data;
294 int cpu;
295
296 data.func = quiesce_dummy;
297 data.info = NULL;
298
299 for_each_cpu_mask(cpu, mask) {
300 data.flags = CSD_FLAG_WAIT;
301 generic_exec_single(cpu, &data);
302 }
303}
304 274
305/** 275/**
306 * smp_call_function_mask(): Run a function on a set of other CPUs. 276 * smp_call_function_many(): Run a function on a set of other CPUs.
307 * @mask: The set of cpus to run on. 277 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 278 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 279 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 280 * @wait: If true, wait (atomically) until function has completed on other CPUs.
311 * 281 *
312 * Returns 0 on success, else a negative status code.
313 *
314 * If @wait is true, then returns once @func has returned. Note that @wait 282 * If @wait is true, then returns once @func has returned. Note that @wait
315 * will be implicitly turned on in case of allocation failures, since 283 * will be implicitly turned on in case of allocation failures, since
316 * we fall back to on-stack allocation. 284 * we fall back to on-stack allocation.
@@ -319,53 +287,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
319 * hardware interrupt handler or from a bottom half handler. Preemption 287 * hardware interrupt handler or from a bottom half handler. Preemption
320 * must be disabled when calling this function. 288 * must be disabled when calling this function.
321 */ 289 */
322int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, 290void smp_call_function_many(const struct cpumask *mask,
323 int wait) 291 void (*func)(void *), void *info,
292 bool wait)
324{ 293{
325 struct call_function_data d; 294 struct call_function_data *data;
326 struct call_function_data *data = NULL;
327 cpumask_t allbutself;
328 unsigned long flags; 295 unsigned long flags;
329 int cpu, num_cpus; 296 int cpu, next_cpu;
330 int slowpath = 0;
331 297
332 /* Can deadlock when called with interrupts disabled */ 298 /* Can deadlock when called with interrupts disabled */
333 WARN_ON(irqs_disabled()); 299 WARN_ON(irqs_disabled());
334 300
335 cpu = smp_processor_id(); 301 /* So, what's a CPU they want? Ignoring this one. */
336 allbutself = cpu_online_map; 302 cpu = cpumask_first_and(mask, cpu_online_mask);
337 cpu_clear(cpu, allbutself); 303 if (cpu == smp_processor_id())
338 cpus_and(mask, mask, allbutself); 304 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
339 num_cpus = cpus_weight(mask); 305 /* No online cpus? We're done. */
340 306 if (cpu >= nr_cpu_ids)
341 /* 307 return;
342 * If zero CPUs, return. If just a single CPU, turn this request 308
343 * into a targetted single call instead since it's faster. 309 /* Do we have another CPU which isn't us? */
344 */ 310 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
345 if (!num_cpus) 311 if (next_cpu == smp_processor_id())
346 return 0; 312 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
347 else if (num_cpus == 1) { 313
348 cpu = first_cpu(mask); 314 /* Fastpath: do that cpu by itself. */
349 return smp_call_function_single(cpu, func, info, wait); 315 if (next_cpu >= nr_cpu_ids) {
316 smp_call_function_single(cpu, func, info, wait);
317 return;
350 } 318 }
351 319
352 data = kmalloc(sizeof(*data), GFP_ATOMIC); 320 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
353 if (data) { 321 if (unlikely(!data)) {
354 data->csd.flags = CSD_FLAG_ALLOC; 322 /* Slow path. */
355 if (wait) 323 for_each_online_cpu(cpu) {
356 data->csd.flags |= CSD_FLAG_WAIT; 324 if (cpu == smp_processor_id())
357 } else { 325 continue;
358 data = &d; 326 if (cpumask_test_cpu(cpu, mask))
359 data->csd.flags = CSD_FLAG_WAIT; 327 smp_call_function_single(cpu, func, info, wait);
360 wait = 1; 328 }
361 slowpath = 1; 329 return;
362 } 330 }
363 331
364 spin_lock_init(&data->lock); 332 spin_lock_init(&data->lock);
333 data->csd.flags = CSD_FLAG_ALLOC;
334 if (wait)
335 data->csd.flags |= CSD_FLAG_WAIT;
365 data->csd.func = func; 336 data->csd.func = func;
366 data->csd.info = info; 337 data->csd.info = info;
367 data->refs = num_cpus; 338 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
368 data->cpumask = mask; 339 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
340 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
369 341
370 spin_lock_irqsave(&call_function_lock, flags); 342 spin_lock_irqsave(&call_function_lock, flags);
371 list_add_tail_rcu(&data->csd.list, &call_function_queue); 343 list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +349,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
377 smp_mb(); 349 smp_mb();
378 350
379 /* Send a message to all CPUs in the map */ 351 /* Send a message to all CPUs in the map */
380 arch_send_call_function_ipi(mask); 352 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
381 353
382 /* optionally wait for the CPUs to complete */ 354 /* optionally wait for the CPUs to complete */
383 if (wait) { 355 if (wait)
384 csd_flag_wait(&data->csd); 356 csd_flag_wait(&data->csd);
385 if (unlikely(slowpath))
386 smp_call_function_mask_quiesce_stack(mask);
387 }
388
389 return 0;
390} 357}
391EXPORT_SYMBOL(smp_call_function_mask); 358EXPORT_SYMBOL(smp_call_function_many);
392 359
393/** 360/**
394 * smp_call_function(): Run a function on all other CPUs. 361 * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
396 * @info: An arbitrary pointer to pass to the function. 363 * @info: An arbitrary pointer to pass to the function.
397 * @wait: If true, wait (atomically) until function has completed on other CPUs. 364 * @wait: If true, wait (atomically) until function has completed on other CPUs.
398 * 365 *
399 * Returns 0 on success, else a negative status code. 366 * Returns 0.
400 * 367 *
401 * If @wait is true, then returns once @func has returned; otherwise 368 * If @wait is true, then returns once @func has returned; otherwise
402 * it returns just before the target cpu calls @func. In case of allocation 369 * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +374,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
407 */ 374 */
408int smp_call_function(void (*func)(void *), void *info, int wait) 375int smp_call_function(void (*func)(void *), void *info, int wait)
409{ 376{
410 int ret;
411
412 preempt_disable(); 377 preempt_disable();
413 ret = smp_call_function_mask(cpu_online_map, func, info, wait); 378 smp_call_function_many(cpu_online_mask, func, info, wait);
414 preempt_enable(); 379 preempt_enable();
415 return ret; 380 return 0;
416} 381}
417EXPORT_SYMBOL(smp_call_function); 382EXPORT_SYMBOL(smp_call_function);
418 383
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..bdbe9de9cd8d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
102 102
103EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
104 104
105void __local_bh_enable(void)
106{
107 WARN_ON_ONCE(in_irq());
108
109 /*
110 * softirqs should never be enabled by __local_bh_enable(),
111 * it always nests inside local_bh_enable() sections:
112 */
113 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
114
115 sub_preempt_count(SOFTIRQ_OFFSET);
116}
117EXPORT_SYMBOL_GPL(__local_bh_enable);
118
119/* 105/*
120 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
121 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -269,6 +255,7 @@ void irq_enter(void)
269{ 255{
270 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
271 257
258 rcu_irq_enter();
272 if (idle_cpu(cpu) && !in_interrupt()) { 259 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter(); 260 __irq_enter();
274 tick_check_idle(cpu); 261 tick_check_idle(cpu);
@@ -295,9 +282,9 @@ void irq_exit(void)
295 282
296#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
297 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
298 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
299 tick_nohz_stop_sched_tick(0);
300 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
301#endif 288#endif
302 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
303} 290}
@@ -746,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
746 break; 733 break;
747 /* Unbind so it can run. Fall thru. */ 734 /* Unbind so it can run. Fall thru. */
748 kthread_bind(per_cpu(ksoftirqd, hotcpu), 735 kthread_bind(per_cpu(ksoftirqd, hotcpu),
749 any_online_cpu(cpu_online_map)); 736 cpumask_any(cpu_online_mask));
750 case CPU_DEAD: 737 case CPU_DEAD:
751 case CPU_DEAD_FROZEN: { 738 case CPU_DEAD_FROZEN: {
752 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -797,3 +784,23 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
797} 784}
798EXPORT_SYMBOL(on_each_cpu); 785EXPORT_SYMBOL(on_each_cpu);
799#endif 786#endif
787
788/*
789 * [ These __weak aliases are kept in a separate compilation unit, so that
790 * GCC does not inline them incorrectly. ]
791 */
792
793int __init __weak early_irq_init(void)
794{
795 return 0;
796}
797
798int __init __weak arch_early_irq_init(void)
799{
800 return 0;
801}
802
803int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
804{
805 return 0;
806}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be6b7d5..d9188c66278a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
@@ -303,17 +303,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
303 break; 303 break;
304 case CPU_ONLINE: 304 case CPU_ONLINE:
305 case CPU_ONLINE_FROZEN: 305 case CPU_ONLINE_FROZEN:
306 check_cpu = any_online_cpu(cpu_online_map); 306 check_cpu = cpumask_any(cpu_online_mask);
307 wake_up_process(per_cpu(watchdog_task, hotcpu)); 307 wake_up_process(per_cpu(watchdog_task, hotcpu));
308 break; 308 break;
309#ifdef CONFIG_HOTPLUG_CPU 309#ifdef CONFIG_HOTPLUG_CPU
310 case CPU_DOWN_PREPARE: 310 case CPU_DOWN_PREPARE:
311 case CPU_DOWN_PREPARE_FROZEN: 311 case CPU_DOWN_PREPARE_FROZEN:
312 if (hotcpu == check_cpu) { 312 if (hotcpu == check_cpu) {
313 cpumask_t temp_cpu_online_map = cpu_online_map; 313 /* Pick any other online cpu. */
314 314 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
315 cpu_clear(hotcpu, temp_cpu_online_map);
316 check_cpu = any_online_cpu(temp_cpu_online_map);
317 } 315 }
318 break; 316 break;
319 317
@@ -323,7 +321,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
323 break; 321 break;
324 /* Unbind so it can run. Fall thru. */ 322 /* Unbind so it can run. Fall thru. */
325 kthread_bind(per_cpu(watchdog_task, hotcpu), 323 kthread_bind(per_cpu(watchdog_task, hotcpu),
326 any_online_cpu(cpu_online_map)); 324 cpumask_any(cpu_online_mask));
327 case CPU_DEAD: 325 case CPU_DEAD:
328 case CPU_DEAD_FROZEN: 326 case CPU_DEAD_FROZEN:
329 p = per_cpu(watchdog_task, hotcpu); 327 p = per_cpu(watchdog_task, hotcpu);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc388..0cd415ee62a2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -38,7 +38,10 @@ struct stop_machine_data {
38static unsigned int num_threads; 38static unsigned int num_threads;
39static atomic_t thread_ack; 39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock); 40static DEFINE_MUTEX(lock);
41 41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
42static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
43static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
44static const cpumask_t *active_cpus; 47static const cpumask_t *active_cpus;
@@ -69,10 +72,10 @@ static void stop_cpu(struct work_struct *unused)
69 int err; 72 int err;
70 73
71 if (!active_cpus) { 74 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map)) 75 if (cpu == cpumask_first(cpu_online_mask))
73 smdata = &active; 76 smdata = &active;
74 } else { 77 } else {
75 if (cpu_isset(cpu, *active_cpus)) 78 if (cpumask_test_cpu(cpu, active_cpus))
76 smdata = &active; 79 smdata = &active;
77 } 80 }
78 /* Simple state machine */ 81 /* Simple state machine */
@@ -109,7 +112,44 @@ static int chill(void *unused)
109 return 0; 112 return 0;
110} 113}
111 114
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
113{ 153{
114 struct work_struct *sm_work; 154 struct work_struct *sm_work;
115 int i, ret; 155 int i, ret;
@@ -142,23 +182,18 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
142 return ret; 182 return ret;
143} 183}
144 184
145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
146{ 186{
147 int ret; 187 int ret;
148 188
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
149 /* No CPUs can come up or down during this. */ 192 /* No CPUs can come up or down during this. */
150 get_online_cpus(); 193 get_online_cpus();
151 ret = __stop_machine(fn, data, cpus); 194 ret = __stop_machine(fn, data, cpus);
152 put_online_cpus(); 195 put_online_cpus();
153 196 stop_machine_destroy();
154 return ret; 197 return ret;
155} 198}
156EXPORT_SYMBOL_GPL(stop_machine); 199EXPORT_SYMBOL_GPL(stop_machine);
157
158static int __init stop_machine_init(void)
159{
160 stop_machine_wq = create_rt_workqueue("kstop");
161 stop_machine_work = alloc_percpu(struct work_struct);
162 return 0;
163}
164core_initcall(stop_machine_init);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1544c305751e..c2a951ae4223 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
34#include <linux/task_io_accounting_ops.h> 34#include <linux/task_io_accounting_ops.h>
35#include <linux/seccomp.h> 35#include <linux/seccomp.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/ptrace.h>
37 38
38#include <linux/compat.h> 39#include <linux/compat.h>
39#include <linux/syscalls.h> 40#include <linux/syscalls.h>
@@ -908,8 +909,8 @@ void do_sys_times(struct tms *tms)
908 struct task_cputime cputime; 909 struct task_cputime cputime;
909 cputime_t cutime, cstime; 910 cputime_t cutime, cstime;
910 911
911 spin_lock_irq(&current->sighand->siglock);
912 thread_group_cputime(current, &cputime); 912 thread_group_cputime(current, &cputime);
913 spin_lock_irq(&current->sighand->siglock);
913 cutime = current->signal->cutime; 914 cutime = current->signal->cutime;
914 cstime = current->signal->cstime; 915 cstime = current->signal->cstime;
915 spin_unlock_irq(&current->sighand->siglock); 916 spin_unlock_irq(&current->sighand->siglock);
@@ -928,6 +929,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
928 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 929 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
929 return -EFAULT; 930 return -EFAULT;
930 } 931 }
932 force_successful_syscall_return();
931 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 933 return (long) jiffies_64_to_clock_t(get_jiffies_64());
932} 934}
933 935
@@ -1628,6 +1630,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1628 utime = stime = cputime_zero; 1630 utime = stime = cputime_zero;
1629 1631
1630 if (who == RUSAGE_THREAD) { 1632 if (who == RUSAGE_THREAD) {
1633 utime = task_utime(current);
1634 stime = task_stime(current);
1631 accumulate_thread_rusage(p, r); 1635 accumulate_thread_rusage(p, r);
1632 goto out; 1636 goto out;
1633 } 1637 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0b627d9c93d8..89d74436318c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,15 +82,14 @@ extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int latencytop_enabled; 83extern int latencytop_enabled;
84extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
85#ifndef CONFIG_MMU
86extern int sysctl_nr_trim_pages;
87#endif
85#ifdef CONFIG_RCU_TORTURE_TEST 88#ifdef CONFIG_RCU_TORTURE_TEST
86extern int rcutorture_runnable; 89extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 90#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
88 91
89/* Constants used for minimum and maximum */ 92/* Constants used for minimum and maximum */
90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
91static int one = 1;
92#endif
93
94#ifdef CONFIG_DETECT_SOFTLOCKUP 93#ifdef CONFIG_DETECT_SOFTLOCKUP
95static int sixty = 60; 94static int sixty = 60;
96static int neg_one = -1; 95static int neg_one = -1;
@@ -101,6 +100,7 @@ static int two = 2;
101#endif 100#endif
102 101
103static int zero; 102static int zero;
103static int one = 1;
104static int one_hundred = 100; 104static int one_hundred = 100;
105 105
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -121,6 +121,10 @@ extern int sg_big_buff;
121#include <asm/system.h> 121#include <asm/system.h>
122#endif 122#endif
123 123
124#ifdef CONFIG_SPARC64
125extern int sysctl_tsb_ratio;
126#endif
127
124#ifdef __hppa__ 128#ifdef __hppa__
125extern int pwrsw_enabled; 129extern int pwrsw_enabled;
126extern int unaligned_enabled; 130extern int unaligned_enabled;
@@ -451,6 +455,16 @@ static struct ctl_table kern_table[] = {
451 .proc_handler = &proc_dointvec, 455 .proc_handler = &proc_dointvec,
452 }, 456 },
453#endif 457#endif
458#ifdef CONFIG_SPARC64
459 {
460 .ctl_name = CTL_UNNUMBERED,
461 .procname = "tsb-ratio",
462 .data = &sysctl_tsb_ratio,
463 .maxlen = sizeof (int),
464 .mode = 0644,
465 .proc_handler = &proc_dointvec,
466 },
467#endif
454#ifdef __hppa__ 468#ifdef __hppa__
455 { 469 {
456 .ctl_name = KERN_HPPA_PWRSW, 470 .ctl_name = KERN_HPPA_PWRSW,
@@ -938,12 +952,22 @@ static struct ctl_table vm_table[] = {
938 .data = &dirty_background_ratio, 952 .data = &dirty_background_ratio,
939 .maxlen = sizeof(dirty_background_ratio), 953 .maxlen = sizeof(dirty_background_ratio),
940 .mode = 0644, 954 .mode = 0644,
941 .proc_handler = &proc_dointvec_minmax, 955 .proc_handler = &dirty_background_ratio_handler,
942 .strategy = &sysctl_intvec, 956 .strategy = &sysctl_intvec,
943 .extra1 = &zero, 957 .extra1 = &zero,
944 .extra2 = &one_hundred, 958 .extra2 = &one_hundred,
945 }, 959 },
946 { 960 {
961 .ctl_name = CTL_UNNUMBERED,
962 .procname = "dirty_background_bytes",
963 .data = &dirty_background_bytes,
964 .maxlen = sizeof(dirty_background_bytes),
965 .mode = 0644,
966 .proc_handler = &dirty_background_bytes_handler,
967 .strategy = &sysctl_intvec,
968 .extra1 = &one,
969 },
970 {
947 .ctl_name = VM_DIRTY_RATIO, 971 .ctl_name = VM_DIRTY_RATIO,
948 .procname = "dirty_ratio", 972 .procname = "dirty_ratio",
949 .data = &vm_dirty_ratio, 973 .data = &vm_dirty_ratio,
@@ -955,6 +979,16 @@ static struct ctl_table vm_table[] = {
955 .extra2 = &one_hundred, 979 .extra2 = &one_hundred,
956 }, 980 },
957 { 981 {
982 .ctl_name = CTL_UNNUMBERED,
983 .procname = "dirty_bytes",
984 .data = &vm_dirty_bytes,
985 .maxlen = sizeof(vm_dirty_bytes),
986 .mode = 0644,
987 .proc_handler = &dirty_bytes_handler,
988 .strategy = &sysctl_intvec,
989 .extra1 = &one,
990 },
991 {
958 .procname = "dirty_writeback_centisecs", 992 .procname = "dirty_writeback_centisecs",
959 .data = &dirty_writeback_interval, 993 .data = &dirty_writeback_interval,
960 .maxlen = sizeof(dirty_writeback_interval), 994 .maxlen = sizeof(dirty_writeback_interval),
@@ -1071,6 +1105,17 @@ static struct ctl_table vm_table[] = {
1071 .mode = 0644, 1105 .mode = 0644,
1072 .proc_handler = &proc_dointvec 1106 .proc_handler = &proc_dointvec
1073 }, 1107 },
1108#else
1109 {
1110 .ctl_name = CTL_UNNUMBERED,
1111 .procname = "nr_trim_pages",
1112 .data = &sysctl_nr_trim_pages,
1113 .maxlen = sizeof(sysctl_nr_trim_pages),
1114 .mode = 0644,
1115 .proc_handler = &proc_dointvec_minmax,
1116 .strategy = &sysctl_intvec,
1117 .extra1 = &zero,
1118 },
1074#endif 1119#endif
1075 { 1120 {
1076 .ctl_name = VM_LAPTOP_MODE, 1121 .ctl_name = VM_LAPTOP_MODE,
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c35da23ab8fb..fafeb48f27c0 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
730}; 730};
731 731
732static const struct trans_ctl_table trans_fs_xfs_table[] = { 732static const struct trans_ctl_table trans_fs_xfs_table[] = {
733 { XFS_RESTRICT_CHOWN, "restrict_chown" },
734 { XFS_SGID_INHERIT, "irix_sgid_inherit" }, 733 { XFS_SGID_INHERIT, "irix_sgid_inherit" },
735 { XFS_SYMLINK_MODE, "irix_symlink_mode" }, 734 { XFS_SYMLINK_MODE, "irix_symlink_mode" },
736 { XFS_PANIC_MASK, "panic_mask" }, 735 { XFS_PANIC_MASK, "panic_mask" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bd6be76303cf..888adbcca30c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
290 return; 290 return;
291} 291}
292 292
293static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) 293static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294{ 294{
295 struct listener_list *listeners; 295 struct listener_list *listeners;
296 struct listener *s, *tmp; 296 struct listener *s, *tmp;
297 unsigned int cpu; 297 unsigned int cpu;
298 cpumask_t mask = *maskp;
299 298
300 if (!cpus_subset(mask, cpu_possible_map)) 299 if (!cpumask_subset(mask, cpu_possible_mask))
301 return -EINVAL; 300 return -EINVAL;
302 301
303 if (isadd == REGISTER) { 302 if (isadd == REGISTER) {
304 for_each_cpu_mask_nr(cpu, mask) { 303 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 304 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 305 cpu_to_node(cpu));
307 if (!s) 306 if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 319
321 /* Deregister or cleanup */ 320 /* Deregister or cleanup */
322cleanup: 321cleanup:
323 for_each_cpu_mask_nr(cpu, mask) { 322 for_each_cpu(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 323 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 324 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
335 return 0; 334 return 0;
336} 335}
337 336
338static int parse(struct nlattr *na, cpumask_t *mask) 337static int parse(struct nlattr *na, struct cpumask *mask)
339{ 338{
340 char *data; 339 char *data;
341 int len; 340 int len;
@@ -352,7 +351,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
352 if (!data) 351 if (!data)
353 return -ENOMEM; 352 return -ENOMEM;
354 nla_strlcpy(data, na, len); 353 nla_strlcpy(data, na, len);
355 ret = cpulist_parse(data, *mask); 354 ret = cpulist_parse(data, mask);
356 kfree(data); 355 kfree(data);
357 return ret; 356 return ret;
358} 357}
@@ -428,23 +427,33 @@ err:
428 427
429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 428static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
430{ 429{
431 int rc = 0; 430 int rc;
432 struct sk_buff *rep_skb; 431 struct sk_buff *rep_skb;
433 struct taskstats *stats; 432 struct taskstats *stats;
434 size_t size; 433 size_t size;
435 cpumask_t mask; 434 cpumask_var_t mask;
435
436 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
437 return -ENOMEM;
436 438
437 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 439 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
438 if (rc < 0) 440 if (rc < 0)
439 return rc; 441 goto free_return_rc;
440 if (rc == 0) 442 if (rc == 0) {
441 return add_del_listener(info->snd_pid, &mask, REGISTER); 443 rc = add_del_listener(info->snd_pid, mask, REGISTER);
444 goto free_return_rc;
445 }
442 446
443 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); 447 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
444 if (rc < 0) 448 if (rc < 0)
449 goto free_return_rc;
450 if (rc == 0) {
451 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
452free_return_rc:
453 free_cpumask_var(mask);
445 return rc; 454 return rc;
446 if (rc == 0) 455 }
447 return add_del_listener(info->snd_pid, &mask, DEREGISTER); 456 free_cpumask_var(mask);
448 457
449 /* 458 /*
450 * Size includes space for nested attributes 459 * Size includes space for nested attributes
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 06b6395b45b2..4f104515a19b 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,21 +22,11 @@
22 22
23static u32 rand1, preh_val, posth_val, jph_val; 23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests; 24static int errors, handler_errors, num_tests;
25static u32 (*target)(u32 value);
26static u32 (*target2)(u32 value);
25 27
26static noinline u32 kprobe_target(u32 value) 28static noinline u32 kprobe_target(u32 value)
27{ 29{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor); 30 return (value / div_factor);
41} 31}
42 32
@@ -74,7 +64,7 @@ static int test_kprobe(void)
74 return ret; 64 return ret;
75 } 65 }
76 66
77 ret = kprobe_target(rand1); 67 ret = target(rand1);
78 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
79 69
80 if (preh_val == 0) { 70 if (preh_val == 0) {
@@ -92,6 +82,84 @@ static int test_kprobe(void)
92 return 0; 82 return 0;
93} 83}
94 84
85static noinline u32 kprobe_target2(u32 value)
86{
87 return (value / div_factor) + 1;
88}
89
90static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
91{
92 preh_val = (rand1 / div_factor) + 1;
93 return 0;
94}
95
96static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
97 unsigned long flags)
98{
99 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: "
102 "incorrect value in post_handler2\n");
103 }
104 posth_val = preh_val + div_factor;
105}
106
107static struct kprobe kp2 = {
108 .symbol_name = "kprobe_target2",
109 .pre_handler = kp_pre_handler2,
110 .post_handler = kp_post_handler2
111};
112
113static int test_kprobes(void)
114{
115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2};
117
118 kp.addr = 0; /* addr should be cleard for reusing kprobe. */
119 ret = register_kprobes(kps, 2);
120 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: "
122 "register_kprobes returned %d\n", ret);
123 return ret;
124 }
125
126 preh_val = 0;
127 posth_val = 0;
128 ret = target(rand1);
129
130 if (preh_val == 0) {
131 printk(KERN_ERR "Kprobe smoke test failed: "
132 "kprobe pre_handler not called\n");
133 handler_errors++;
134 }
135
136 if (posth_val == 0) {
137 printk(KERN_ERR "Kprobe smoke test failed: "
138 "kprobe post_handler not called\n");
139 handler_errors++;
140 }
141
142 preh_val = 0;
143 posth_val = 0;
144 ret = target2(rand1);
145
146 if (preh_val == 0) {
147 printk(KERN_ERR "Kprobe smoke test failed: "
148 "kprobe pre_handler2 not called\n");
149 handler_errors++;
150 }
151
152 if (posth_val == 0) {
153 printk(KERN_ERR "Kprobe smoke test failed: "
154 "kprobe post_handler2 not called\n");
155 handler_errors++;
156 }
157
158 unregister_kprobes(kps, 2);
159 return 0;
160
161}
162
95static u32 j_kprobe_target(u32 value) 163static u32 j_kprobe_target(u32 value)
96{ 164{
97 if (value != rand1) { 165 if (value != rand1) {
@@ -121,7 +189,7 @@ static int test_jprobe(void)
121 return ret; 189 return ret;
122 } 190 }
123 191
124 ret = kprobe_target(rand1); 192 ret = target(rand1);
125 unregister_jprobe(&jp); 193 unregister_jprobe(&jp);
126 if (jph_val == 0) { 194 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: " 195 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -132,6 +200,43 @@ static int test_jprobe(void)
132 return 0; 200 return 0;
133} 201}
134 202
203static struct jprobe jp2 = {
204 .entry = j_kprobe_target,
205 .kp.symbol_name = "kprobe_target2"
206};
207
208static int test_jprobes(void)
209{
210 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2};
212
213 jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
214 ret = register_jprobes(jps, 2);
215 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: "
217 "register_jprobes returned %d\n", ret);
218 return ret;
219 }
220
221 jph_val = 0;
222 ret = target(rand1);
223 if (jph_val == 0) {
224 printk(KERN_ERR "Kprobe smoke test failed: "
225 "jprobe handler not called\n");
226 handler_errors++;
227 }
228
229 jph_val = 0;
230 ret = target2(rand1);
231 if (jph_val == 0) {
232 printk(KERN_ERR "Kprobe smoke test failed: "
233 "jprobe handler2 not called\n");
234 handler_errors++;
235 }
236 unregister_jprobes(jps, 2);
237
238 return 0;
239}
135#ifdef CONFIG_KRETPROBES 240#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 241static u32 krph_val;
137 242
@@ -177,7 +282,7 @@ static int test_kretprobe(void)
177 return ret; 282 return ret;
178 } 283 }
179 284
180 ret = kprobe_target(rand1); 285 ret = target(rand1);
181 unregister_kretprobe(&rp); 286 unregister_kretprobe(&rp);
182 if (krph_val != rand1) { 287 if (krph_val != rand1) {
183 printk(KERN_ERR "Kprobe smoke test failed: " 288 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -187,12 +292,72 @@ static int test_kretprobe(void)
187 292
188 return 0; 293 return 0;
189} 294}
295
296static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
297{
298 unsigned long ret = regs_return_value(regs);
299
300 if (ret != (rand1 / div_factor) + 1) {
301 handler_errors++;
302 printk(KERN_ERR "Kprobe smoke test failed: "
303 "incorrect value in kretprobe handler2\n");
304 }
305 if (krph_val == 0) {
306 handler_errors++;
307 printk(KERN_ERR "Kprobe smoke test failed: "
308 "call to kretprobe entry handler failed\n");
309 }
310
311 krph_val = rand1;
312 return 0;
313}
314
315static struct kretprobe rp2 = {
316 .handler = return_handler2,
317 .entry_handler = entry_handler,
318 .kp.symbol_name = "kprobe_target2"
319};
320
321static int test_kretprobes(void)
322{
323 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2};
325
326 rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */
327 ret = register_kretprobes(rps, 2);
328 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: "
330 "register_kretprobe returned %d\n", ret);
331 return ret;
332 }
333
334 krph_val = 0;
335 ret = target(rand1);
336 if (krph_val != rand1) {
337 printk(KERN_ERR "Kprobe smoke test failed: "
338 "kretprobe handler not called\n");
339 handler_errors++;
340 }
341
342 krph_val = 0;
343 ret = target2(rand1);
344 if (krph_val != rand1) {
345 printk(KERN_ERR "Kprobe smoke test failed: "
346 "kretprobe handler2 not called\n");
347 handler_errors++;
348 }
349 unregister_kretprobes(rps, 2);
350 return 0;
351}
190#endif /* CONFIG_KRETPROBES */ 352#endif /* CONFIG_KRETPROBES */
191 353
192int init_test_probes(void) 354int init_test_probes(void)
193{ 355{
194 int ret; 356 int ret;
195 357
358 target = kprobe_target;
359 target2 = kprobe_target2;
360
196 do { 361 do {
197 rand1 = random32(); 362 rand1 = random32();
198 } while (rand1 <= div_factor); 363 } while (rand1 <= div_factor);
@@ -204,15 +369,30 @@ int init_test_probes(void)
204 errors++; 369 errors++;
205 370
206 num_tests++; 371 num_tests++;
372 ret = test_kprobes();
373 if (ret < 0)
374 errors++;
375
376 num_tests++;
207 ret = test_jprobe(); 377 ret = test_jprobe();
208 if (ret < 0) 378 if (ret < 0)
209 errors++; 379 errors++;
210 380
381 num_tests++;
382 ret = test_jprobes();
383 if (ret < 0)
384 errors++;
385
211#ifdef CONFIG_KRETPROBES 386#ifdef CONFIG_KRETPROBES
212 num_tests++; 387 num_tests++;
213 ret = test_kretprobe(); 388 ret = test_kretprobe();
214 if (ret < 0) 389 if (ret < 0)
215 errors++; 390 errors++;
391
392 num_tests++;
393 ret = test_kretprobes();
394 if (ret < 0)
395 errors++;
216#endif /* CONFIG_KRETPROBES */ 396#endif /* CONFIG_KRETPROBES */
217 397
218 if (errors) 398 if (errors)
diff --git a/kernel/time.c b/kernel/time.c
index d63a4336fad6..4886e3ce83a4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -37,6 +37,7 @@
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/math64.h> 39#include <linux/math64.h>
40#include <linux/ptrace.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc)
65 66
66 if (tloc) { 67 if (tloc) {
67 if (put_user(i,tloc)) 68 if (put_user(i,tloc))
68 i = -EFAULT; 69 return -EFAULT;
69 } 70 }
71 force_successful_syscall_return();
70 return i; 72 return i;
71} 73}
72 74
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f8d968063cea..ea2f48af83cf 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
166void clockevents_register_device(struct clock_event_device *dev) 166void clockevents_register_device(struct clock_event_device *dev)
167{ 167{
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask);
170
169 /* 171 /*
170 * A nsec2cyc multiplicator of 0 is invalid and we'd crash 172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
171 * on it, so fix it up and emit a warning: 173 * on it, so fix it up and emit a warning:
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec97526..ca89e1593f08 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,10 +145,11 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = cpumask_next(raw_smp_processor_id(),
149 cpu_online_mask);
149 150
150 if (next_cpu >= nr_cpu_ids) 151 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 152 next_cpu = cpumask_first(cpu_online_mask);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 153 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 154 add_timer_on(&watchdog_timer, next_cpu);
154 } 155 }
@@ -173,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
173 watchdog_last = watchdog->read(); 174 watchdog_last = watchdog->read();
174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 175 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
175 add_timer_on(&watchdog_timer, 176 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map)); 177 cpumask_first(cpu_online_mask));
177 } 178 }
178 } else { 179 } else {
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 180 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +196,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
195 watchdog_timer.expires = 196 watchdog_timer.expires =
196 jiffies + WATCHDOG_INTERVAL; 197 jiffies + WATCHDOG_INTERVAL;
197 add_timer_on(&watchdog_timer, 198 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map)); 199 cpumask_first(cpu_online_mask));
199 } 200 }
200 } 201 }
201 } 202 }
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1ca99557e929..06f197560f3b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -45,7 +45,7 @@
45 * 45 *
46 * The value 8 is somewhat carefully chosen, as anything 46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as 47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater then 8 overflow 32bits when 48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100. 49 * HZ=100.
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8ff15e5d486b..f5f793d92415 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -131,7 +131,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
131{ 131{
132 enum hrtimer_restart res = HRTIMER_NORESTART; 132 enum hrtimer_restart res = HRTIMER_NORESTART;
133 133
134 write_seqlock_irq(&xtime_lock); 134 write_seqlock(&xtime_lock);
135 135
136 switch (time_state) { 136 switch (time_state) {
137 case TIME_OK: 137 case TIME_OK:
@@ -164,7 +164,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
164 } 164 }
165 update_vsyscall(&xtime, clock); 165 update_vsyscall(&xtime, clock);
166 166
167 write_sequnlock_irq(&xtime_lock); 167 write_sequnlock(&xtime_lock);
168 168
169 return res; 169 return res;
170} 170}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f98a1b7b16e9..118a3b3b3f9a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
32static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force; 35static int tick_broadcast_force;
34 36
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
46 return &tick_broadcast_device; 48 return &tick_broadcast_device;
47} 49}
48 50
49cpumask_t *tick_get_broadcast_mask(void) 51struct cpumask *tick_get_broadcast_mask(void)
50{ 52{
51 return &tick_broadcast_mask; 53 return to_cpumask(tick_broadcast_mask);
52} 54}
53 55
54/* 56/*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
72 74
73 clockevents_exchange_device(NULL, dev); 75 clockevents_exchange_device(NULL, dev);
74 tick_broadcast_device.evtdev = dev; 76 tick_broadcast_device.evtdev = dev;
75 if (!cpus_empty(tick_broadcast_mask)) 77 if (!cpumask_empty(tick_get_broadcast_mask()))
76 tick_broadcast_start_periodic(dev); 78 tick_broadcast_start_periodic(dev);
77 return 1; 79 return 1;
78} 80}
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
104 */ 106 */
105 if (!tick_device_is_functional(dev)) { 107 if (!tick_device_is_functional(dev)) {
106 dev->event_handler = tick_handle_periodic; 108 dev->event_handler = tick_handle_periodic;
107 cpu_set(cpu, tick_broadcast_mask); 109 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
108 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 110 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
109 ret = 1; 111 ret = 1;
110 } else { 112 } else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 118 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
117 int cpu = smp_processor_id(); 119 int cpu = smp_processor_id();
118 120
119 cpu_clear(cpu, tick_broadcast_mask); 121 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
120 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
121 } 123 }
122 } 124 }
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125} 127}
126 128
127/* 129/*
128 * Broadcast the event to the cpus, which are set in the mask 130 * Broadcast the event to the cpus, which are set in the mask (mangled).
129 */ 131 */
130static void tick_do_broadcast(cpumask_t mask) 132static void tick_do_broadcast(struct cpumask *mask)
131{ 133{
132 int cpu = smp_processor_id(); 134 int cpu = smp_processor_id();
133 struct tick_device *td; 135 struct tick_device *td;
@@ -135,21 +137,20 @@ static void tick_do_broadcast(cpumask_t mask)
135 /* 137 /*
136 * Check, if the current cpu is in the mask 138 * Check, if the current cpu is in the mask
137 */ 139 */
138 if (cpu_isset(cpu, mask)) { 140 if (cpumask_test_cpu(cpu, mask)) {
139 cpu_clear(cpu, mask); 141 cpumask_clear_cpu(cpu, mask);
140 td = &per_cpu(tick_cpu_device, cpu); 142 td = &per_cpu(tick_cpu_device, cpu);
141 td->evtdev->event_handler(td->evtdev); 143 td->evtdev->event_handler(td->evtdev);
142 } 144 }
143 145
144 if (!cpus_empty(mask)) { 146 if (!cpumask_empty(mask)) {
145 /* 147 /*
146 * It might be necessary to actually check whether the devices 148 * It might be necessary to actually check whether the devices
147 * have different broadcast functions. For now, just use the 149 * have different broadcast functions. For now, just use the
148 * one of the first device. This works as long as we have this 150 * one of the first device. This works as long as we have this
149 * misfeature only on x86 (lapic) 151 * misfeature only on x86 (lapic)
150 */ 152 */
151 cpu = first_cpu(mask); 153 td = &per_cpu(tick_cpu_device, cpumask_first(mask));
152 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 154 td->evtdev->broadcast(mask);
154 } 155 }
155} 156}
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
160 */ 161 */
161static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
162{ 163{
163 cpumask_t mask;
164
165 spin_lock(&tick_broadcast_lock); 164 spin_lock(&tick_broadcast_lock);
166 165
167 cpus_and(mask, cpu_online_map, tick_broadcast_mask); 166 cpumask_and(to_cpumask(tmpmask),
168 tick_do_broadcast(mask); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 spin_unlock(&tick_broadcast_lock);
171} 171}
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
228 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
229 goto out; 229 goto out;
230 230
231 bc_stopped = cpus_empty(tick_broadcast_mask); 231 bc_stopped = cpumask_empty(tick_get_broadcast_mask());
232 232
233 switch (*reason) { 233 switch (*reason) {
234 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
238 if (tick_broadcast_device.mode == 238 if (tick_broadcast_device.mode ==
239 TICKDEV_MODE_PERIODIC) 239 TICKDEV_MODE_PERIODIC)
240 clockevents_shutdown(dev); 240 clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
244 break; 244 break;
245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
249 if (tick_broadcast_device.mode == 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC) 250 TICKDEV_MODE_PERIODIC)
251 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
253 break; 253 break;
254 } 254 }
255 255
256 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpumask_empty(tick_get_broadcast_mask())) {
257 if (!bc_stopped) 257 if (!bc_stopped)
258 clockevents_shutdown(bc); 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) { 259 } else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
272 */ 272 */
273void tick_broadcast_on_off(unsigned long reason, int *oncpu) 273void tick_broadcast_on_off(unsigned long reason, int *oncpu)
274{ 274{
275 if (!cpu_isset(*oncpu, cpu_online_map)) 275 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
303 spin_lock_irqsave(&tick_broadcast_lock, flags); 303 spin_lock_irqsave(&tick_broadcast_lock, flags);
304 304
305 bc = tick_broadcast_device.evtdev; 305 bc = tick_broadcast_device.evtdev;
306 cpu_clear(cpu, tick_broadcast_mask); 306 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
307 307
308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
309 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpumask_empty(tick_get_broadcast_mask()))
310 clockevents_shutdown(bc); 310 clockevents_shutdown(bc);
311 } 311 }
312 312
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
342 342
343 switch (tick_broadcast_device.mode) { 343 switch (tick_broadcast_device.mode) {
344 case TICKDEV_MODE_PERIODIC: 344 case TICKDEV_MODE_PERIODIC:
345 if(!cpus_empty(tick_broadcast_mask)) 345 if (!cpumask_empty(tick_get_broadcast_mask()))
346 tick_broadcast_start_periodic(bc); 346 tick_broadcast_start_periodic(bc);
347 broadcast = cpu_isset(smp_processor_id(), 347 broadcast = cpumask_test_cpu(smp_processor_id(),
348 tick_broadcast_mask); 348 tick_get_broadcast_mask());
349 break; 349 break;
350 case TICKDEV_MODE_ONESHOT: 350 case TICKDEV_MODE_ONESHOT:
351 broadcast = tick_resume_broadcast_oneshot(bc); 351 broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
360 360
361#ifdef CONFIG_TICK_ONESHOT 361#ifdef CONFIG_TICK_ONESHOT
362 362
363static cpumask_t tick_broadcast_oneshot_mask; 363/* FIXME: use cpumask_var_t. */
364static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
364 365
365/* 366/*
366 * Debugging: see timer_list.c 367 * Exposed for debugging: see timer_list.c
367 */ 368 */
368cpumask_t *tick_get_broadcast_oneshot_mask(void) 369struct cpumask *tick_get_broadcast_oneshot_mask(void)
369{ 370{
370 return &tick_broadcast_oneshot_mask; 371 return to_cpumask(tick_broadcast_oneshot_mask);
371} 372}
372 373
373static int tick_broadcast_set_event(ktime_t expires, int force) 374static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
389 */ 390 */
390void tick_check_oneshot_broadcast(int cpu) 391void tick_check_oneshot_broadcast(int cpu)
391{ 392{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 393 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 394 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394 395
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 396 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 403static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
403{ 404{
404 struct tick_device *td; 405 struct tick_device *td;
405 cpumask_t mask;
406 ktime_t now, next_event; 406 ktime_t now, next_event;
407 int cpu; 407 int cpu;
408 408
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
410again: 410again:
411 dev->next_event.tv64 = KTIME_MAX; 411 dev->next_event.tv64 = KTIME_MAX;
412 next_event.tv64 = KTIME_MAX; 412 next_event.tv64 = KTIME_MAX;
413 mask = CPU_MASK_NONE; 413 cpumask_clear(to_cpumask(tmpmask));
414 now = ktime_get(); 414 now = ktime_get();
415 /* Find all expired events */ 415 /* Find all expired events */
416 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { 416 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
417 td = &per_cpu(tick_cpu_device, cpu); 417 td = &per_cpu(tick_cpu_device, cpu);
418 if (td->evtdev->next_event.tv64 <= now.tv64) 418 if (td->evtdev->next_event.tv64 <= now.tv64)
419 cpu_set(cpu, mask); 419 cpumask_set_cpu(cpu, to_cpumask(tmpmask));
420 else if (td->evtdev->next_event.tv64 < next_event.tv64) 420 else if (td->evtdev->next_event.tv64 < next_event.tv64)
421 next_event.tv64 = td->evtdev->next_event.tv64; 421 next_event.tv64 = td->evtdev->next_event.tv64;
422 } 422 }
@@ -424,7 +424,7 @@ again:
424 /* 424 /*
425 * Wakeup the cpus which have an expired event. 425 * Wakeup the cpus which have an expired event.
426 */ 426 */
427 tick_do_broadcast(mask); 427 tick_do_broadcast(to_cpumask(tmpmask));
428 428
429 /* 429 /*
430 * Two reasons for reprogram: 430 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
476 goto out; 476 goto out;
477 477
478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
479 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 479 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
480 cpu_set(cpu, tick_broadcast_oneshot_mask); 480 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
482 if (dev->next_event.tv64 < bc->next_event.tv64) 482 if (dev->next_event.tv64 < bc->next_event.tv64)
483 tick_broadcast_set_event(dev->next_event, 1); 483 tick_broadcast_set_event(dev->next_event, 1);
484 } 484 }
485 } else { 485 } else {
486 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 486 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
487 cpu_clear(cpu, tick_broadcast_oneshot_mask); 487 cpumask_clear_cpu(cpu,
488 tick_get_broadcast_oneshot_mask());
488 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 489 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
489 if (dev->next_event.tv64 != KTIME_MAX) 490 if (dev->next_event.tv64 != KTIME_MAX)
490 tick_program_event(dev->next_event, 1); 491 tick_program_event(dev->next_event, 1);
@@ -502,15 +503,16 @@ out:
502 */ 503 */
503static void tick_broadcast_clear_oneshot(int cpu) 504static void tick_broadcast_clear_oneshot(int cpu)
504{ 505{
505 cpu_clear(cpu, tick_broadcast_oneshot_mask); 506 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
506} 507}
507 508
508static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) 509static void tick_broadcast_init_next_event(struct cpumask *mask,
510 ktime_t expires)
509{ 511{
510 struct tick_device *td; 512 struct tick_device *td;
511 int cpu; 513 int cpu;
512 514
513 for_each_cpu_mask_nr(cpu, *mask) { 515 for_each_cpu(cpu, mask) {
514 td = &per_cpu(tick_cpu_device, cpu); 516 td = &per_cpu(tick_cpu_device, cpu);
515 if (td->evtdev) 517 if (td->evtdev)
516 td->evtdev->next_event = expires; 518 td->evtdev->next_event = expires;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 528 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id(); 530 int cpu = smp_processor_id();
529 cpumask_t mask;
530 531
531 bc->event_handler = tick_handle_oneshot_broadcast; 532 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 533 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
540 * oneshot_mask bits for those and program the 541 * oneshot_mask bits for those and program the
541 * broadcast device to fire. 542 * broadcast device to fire.
542 */ 543 */
543 mask = tick_broadcast_mask; 544 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
544 cpu_clear(cpu, mask); 545 cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
545 cpus_or(tick_broadcast_oneshot_mask, 546 cpumask_or(tick_get_broadcast_oneshot_mask(),
546 tick_broadcast_oneshot_mask, mask); 547 tick_get_broadcast_oneshot_mask(),
547 548 to_cpumask(tmpmask));
548 if (was_periodic && !cpus_empty(mask)) { 549
549 tick_broadcast_init_next_event(&mask, tick_next_period); 550 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
551 tick_broadcast_init_next_event(to_cpumask(tmpmask),
552 tick_next_period);
550 tick_broadcast_set_event(tick_next_period, 1); 553 tick_broadcast_set_event(tick_next_period, 1);
551 } else 554 } else
552 bc->next_event.tv64 = KTIME_MAX; 555 bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
585 * Clear the broadcast mask flag for the dead cpu, but do not 588 * Clear the broadcast mask flag for the dead cpu, but do not
586 * stop the broadcast device! 589 * stop the broadcast device!
587 */ 590 */
588 cpu_clear(cpu, tick_broadcast_oneshot_mask); 591 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
589 592
590 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 593 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
591} 594}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43ca..63e05d423a09 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
136 */ 136 */
137static void tick_setup_device(struct tick_device *td, 137static void tick_setup_device(struct tick_device *td,
138 struct clock_event_device *newdev, int cpu, 138 struct clock_event_device *newdev, int cpu,
139 const cpumask_t *cpumask) 139 const struct cpumask *cpumask)
140{ 140{
141 ktime_t next_event; 141 ktime_t next_event;
142 void (*handler)(struct clock_event_device *) = NULL; 142 void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
171 * When the device is not per cpu, pin the interrupt to the 171 * When the device is not per cpu, pin the interrupt to the
172 * current cpu: 172 * current cpu:
173 */ 173 */
174 if (!cpus_equal(newdev->cpumask, *cpumask)) 174 if (!cpumask_equal(newdev->cpumask, cpumask))
175 irq_set_affinity(newdev->irq, *cpumask); 175 irq_set_affinity(newdev->irq, cpumask);
176 176
177 /* 177 /*
178 * When global broadcasting is active, check if the current 178 * When global broadcasting is active, check if the current
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
202 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
203 203
204 cpu = smp_processor_id(); 204 cpu = smp_processor_id();
205 if (!cpu_isset(cpu, newdev->cpumask)) 205 if (!cpumask_test_cpu(cpu, newdev->cpumask))
206 goto out_bc; 206 goto out_bc;
207 207
208 td = &per_cpu(tick_cpu_device, cpu); 208 td = &per_cpu(tick_cpu_device, cpu);
209 curdev = td->evtdev; 209 curdev = td->evtdev;
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { 212 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) 225 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); 257 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup)
299 } 299 }
300 /* Transfer the do_timer job away from this cpu */ 300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = cpumask_first(cpu_online_mask);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : 304 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
305 TICK_DO_TIMER_NONE; 305 TICK_DO_TIMER_NONE;
306 } 306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab46..1b6c05bd0d0a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -247,7 +247,7 @@ void tick_nohz_stop_sched_tick(int inidle)
247 if (need_resched()) 247 if (need_resched())
248 goto end; 248 goto end;
249 249
250 if (unlikely(local_softirq_pending())) { 250 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
251 static int ratelimit; 251 static int ratelimit;
252 252
253 if (ratelimit < 10) { 253 if (ratelimit < 10) {
@@ -282,8 +282,31 @@ void tick_nohz_stop_sched_tick(int inidle)
282 /* Schedule the tick, if we are at least one jiffie off */ 282 /* Schedule the tick, if we are at least one jiffie off */
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 /*
286 * calculate the expiry time for the next timer wheel
287 * timer
288 */
289 expires = ktime_add_ns(last_update, tick_period.tv64 *
290 delta_jiffies);
291
292 /*
293 * If this cpu is the one which updates jiffies, then
294 * give up the assignment and let it be taken by the
295 * cpu which runs the tick timer next, which might be
296 * this cpu as well. If we don't drop this here the
297 * jiffies might be stale and do_timer() never
298 * invoked.
299 */
300 if (cpu == tick_do_timer_cpu)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302
285 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 304 cpumask_set_cpu(cpu, nohz_cpu_mask);
305
306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
308 goto out;
309
287 /* 310 /*
288 * nohz_stop_sched_tick can be called several times before 311 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 312 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +319,7 @@ void tick_nohz_stop_sched_tick(int inidle)
296 /* 319 /*
297 * sched tick not stopped! 320 * sched tick not stopped!
298 */ 321 */
299 cpu_clear(cpu, nohz_cpu_mask); 322 cpumask_clear_cpu(cpu, nohz_cpu_mask);
300 goto out; 323 goto out;
301 } 324 }
302 325
@@ -306,17 +329,6 @@ void tick_nohz_stop_sched_tick(int inidle)
306 rcu_enter_nohz(); 329 rcu_enter_nohz();
307 } 330 }
308 331
309 /*
310 * If this cpu is the one which updates jiffies, then
311 * give up the assignment and let it be taken by the
312 * cpu which runs the tick timer next, which might be
313 * this cpu as well. If we don't drop this here the
314 * jiffies might be stale and do_timer() never
315 * invoked.
316 */
317 if (cpu == tick_do_timer_cpu)
318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
319
320 ts->idle_sleeps++; 332 ts->idle_sleeps++;
321 333
322 /* 334 /*
@@ -332,12 +344,7 @@ void tick_nohz_stop_sched_tick(int inidle)
332 goto out; 344 goto out;
333 } 345 }
334 346
335 /* 347 /* Mark expiries */
336 * calculate the expiry time for the next timer wheel
337 * timer
338 */
339 expires = ktime_add_ns(last_update, tick_period.tv64 *
340 delta_jiffies);
341 ts->idle_expires = expires; 348 ts->idle_expires = expires;
342 349
343 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
@@ -354,7 +361,7 @@ void tick_nohz_stop_sched_tick(int inidle)
354 * softirq. 361 * softirq.
355 */ 362 */
356 tick_do_update_jiffies64(ktime_get()); 363 tick_do_update_jiffies64(ktime_get());
357 cpu_clear(cpu, nohz_cpu_mask); 364 cpumask_clear_cpu(cpu, nohz_cpu_mask);
358 } 365 }
359 raise_softirq_irqoff(TIMER_SOFTIRQ); 366 raise_softirq_irqoff(TIMER_SOFTIRQ);
360out: 367out:
@@ -412,7 +419,9 @@ void tick_nohz_restart_sched_tick(void)
412{ 419{
413 int cpu = smp_processor_id(); 420 int cpu = smp_processor_id();
414 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
422#ifndef CONFIG_VIRT_CPU_ACCOUNTING
415 unsigned long ticks; 423 unsigned long ticks;
424#endif
416 ktime_t now; 425 ktime_t now;
417 426
418 local_irq_disable(); 427 local_irq_disable();
@@ -432,8 +441,9 @@ void tick_nohz_restart_sched_tick(void)
432 select_nohz_load_balancer(0); 441 select_nohz_load_balancer(0);
433 now = ktime_get(); 442 now = ktime_get();
434 tick_do_update_jiffies64(now); 443 tick_do_update_jiffies64(now);
435 cpu_clear(cpu, nohz_cpu_mask); 444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
436 445
446#ifndef CONFIG_VIRT_CPU_ACCOUNTING
437 /* 447 /*
438 * We stopped the tick in idle. Update process times would miss the 448 * We stopped the tick in idle. Update process times would miss the
439 * time we slept as update_process_times does only a 1 tick 449 * time we slept as update_process_times does only a 1 tick
@@ -443,12 +453,9 @@ void tick_nohz_restart_sched_tick(void)
443 /* 453 /*
444 * We might be one off. Do not randomly account a huge number of ticks! 454 * We might be one off. Do not randomly account a huge number of ticks!
445 */ 455 */
446 if (ticks && ticks < LONG_MAX) { 456 if (ticks && ticks < LONG_MAX)
447 add_preempt_count(HARDIRQ_OFFSET); 457 account_idle_ticks(ticks);
448 account_system_time(current, HARDIRQ_OFFSET, 458#endif
449 jiffies_to_cputime(ticks));
450 sub_preempt_count(HARDIRQ_OFFSET);
451 }
452 459
453 touch_softlockup_watchdog(); 460 touch_softlockup_watchdog();
454 /* 461 /*
@@ -681,7 +688,6 @@ void tick_setup_sched_timer(void)
681 */ 688 */
682 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 689 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
683 ts->sched_timer.function = tick_sched_timer; 690 ts->sched_timer.function = tick_sched_timer;
684 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
685 691
686 /* Get the next period (per cpu) */ 692 /* Get the next period (per cpu) */
687 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); 693 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88aa76f..900f1b6598d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 46struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
48 48
49/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended;
51
49static struct timespec xtime_cache __attribute__ ((aligned (16))); 52static struct timespec xtime_cache __attribute__ ((aligned (16)));
50void update_xtime_cache(u64 nsec) 53void update_xtime_cache(u64 nsec)
51{ 54{
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts)
92 unsigned long seq; 95 unsigned long seq;
93 s64 nsecs; 96 s64 nsecs;
94 97
98 WARN_ON(timekeeping_suspended);
99
95 do { 100 do {
96 seq = read_seqbegin(&xtime_lock); 101 seq = read_seqbegin(&xtime_lock);
97 102
@@ -299,8 +304,6 @@ void __init timekeeping_init(void)
299 write_sequnlock_irqrestore(&xtime_lock, flags); 304 write_sequnlock_irqrestore(&xtime_lock, flags);
300} 305}
301 306
302/* flag for if timekeeping is suspended */
303static int timekeeping_suspended;
304/* time in seconds when suspend began */ 307/* time in seconds when suspend began */
305static unsigned long timekeeping_suspend_time; 308static unsigned long timekeeping_suspend_time;
306 309
diff --git a/kernel/timer.c b/kernel/timer.c
index 566257d1dc10..dee3f641a7a7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1018,21 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1018} 1018}
1019#endif 1019#endif
1020 1020
1021#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1022void account_process_tick(struct task_struct *p, int user_tick)
1023{
1024 cputime_t one_jiffy = jiffies_to_cputime(1);
1025
1026 if (user_tick) {
1027 account_user_time(p, one_jiffy);
1028 account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
1029 } else {
1030 account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
1031 account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
1032 }
1033}
1034#endif
1035
1036/* 1021/*
1037 * Called from the timer interrupt handler to charge one tick to the current 1022 * Called from the timer interrupt handler to charge one tick to the current
1038 * process. user_tick is 1 if the tick is user time, 0 for system. 1023 * process. user_tick is 1 if the tick is user time, 0 for system.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 76f34c0ef29c..8b0daf0662ef 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -69,6 +69,7 @@ void tracing_on(void)
69{ 69{
70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); 70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
71} 71}
72EXPORT_SYMBOL_GPL(tracing_on);
72 73
73/** 74/**
74 * tracing_off - turn off all tracing buffers 75 * tracing_off - turn off all tracing buffers
@@ -82,6 +83,7 @@ void tracing_off(void)
82{ 83{
83 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); 84 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
84} 85}
86EXPORT_SYMBOL_GPL(tracing_off);
85 87
86/** 88/**
87 * tracing_off_permanent - permanently disable ring buffers 89 * tracing_off_permanent - permanently disable ring buffers
@@ -111,12 +113,14 @@ u64 ring_buffer_time_stamp(int cpu)
111 113
112 return time; 114 return time;
113} 115}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
114 117
115void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
116{ 119{
117 /* Just stupid testing the normalize function and deltas */ 120 /* Just stupid testing the normalize function and deltas */
118 *ts >>= DEBUG_SHIFT; 121 *ts >>= DEBUG_SHIFT;
119} 122}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
120 124
121#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
122#define RB_ALIGNMENT_SHIFT 2 126#define RB_ALIGNMENT_SHIFT 2
@@ -164,8 +168,15 @@ rb_event_length(struct ring_buffer_event *event)
164 */ 168 */
165unsigned ring_buffer_event_length(struct ring_buffer_event *event) 169unsigned ring_buffer_event_length(struct ring_buffer_event *event)
166{ 170{
167 return rb_event_length(event); 171 unsigned length = rb_event_length(event);
172 if (event->type != RINGBUF_TYPE_DATA)
173 return length;
174 length -= RB_EVNT_HDR_SIZE;
175 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
176 length -= sizeof(event->array[0]);
177 return length;
168} 178}
179EXPORT_SYMBOL_GPL(ring_buffer_event_length);
169 180
170/* inline for ring buffer fast paths */ 181/* inline for ring buffer fast paths */
171static inline void * 182static inline void *
@@ -187,9 +198,10 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
187{ 198{
188 return rb_event_data(event); 199 return rb_event_data(event);
189} 200}
201EXPORT_SYMBOL_GPL(ring_buffer_event_data);
190 202
191#define for_each_buffer_cpu(buffer, cpu) \ 203#define for_each_buffer_cpu(buffer, cpu) \
192 for_each_cpu_mask(cpu, buffer->cpumask) 204 for_each_cpu(cpu, buffer->cpumask)
193 205
194#define TS_SHIFT 27 206#define TS_SHIFT 27
195#define TS_MASK ((1ULL << TS_SHIFT) - 1) 207#define TS_MASK ((1ULL << TS_SHIFT) - 1)
@@ -261,7 +273,7 @@ struct ring_buffer {
261 unsigned pages; 273 unsigned pages;
262 unsigned flags; 274 unsigned flags;
263 int cpus; 275 int cpus;
264 cpumask_t cpumask; 276 cpumask_var_t cpumask;
265 atomic_t record_disabled; 277 atomic_t record_disabled;
266 278
267 struct mutex mutex; 279 struct mutex mutex;
@@ -427,7 +439,7 @@ extern int ring_buffer_page_too_big(void);
427 439
428/** 440/**
429 * ring_buffer_alloc - allocate a new ring_buffer 441 * ring_buffer_alloc - allocate a new ring_buffer
430 * @size: the size in bytes that is needed. 442 * @size: the size in bytes per cpu that is needed.
431 * @flags: attributes to set for the ring buffer. 443 * @flags: attributes to set for the ring buffer.
432 * 444 *
433 * Currently the only flag that is available is the RB_FL_OVERWRITE 445 * Currently the only flag that is available is the RB_FL_OVERWRITE
@@ -452,6 +464,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
452 if (!buffer) 464 if (!buffer)
453 return NULL; 465 return NULL;
454 466
467 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
468 goto fail_free_buffer;
469
455 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 470 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
456 buffer->flags = flags; 471 buffer->flags = flags;
457 472
@@ -459,14 +474,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
459 if (buffer->pages == 1) 474 if (buffer->pages == 1)
460 buffer->pages++; 475 buffer->pages++;
461 476
462 buffer->cpumask = cpu_possible_map; 477 cpumask_copy(buffer->cpumask, cpu_possible_mask);
463 buffer->cpus = nr_cpu_ids; 478 buffer->cpus = nr_cpu_ids;
464 479
465 bsize = sizeof(void *) * nr_cpu_ids; 480 bsize = sizeof(void *) * nr_cpu_ids;
466 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), 481 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
467 GFP_KERNEL); 482 GFP_KERNEL);
468 if (!buffer->buffers) 483 if (!buffer->buffers)
469 goto fail_free_buffer; 484 goto fail_free_cpumask;
470 485
471 for_each_buffer_cpu(buffer, cpu) { 486 for_each_buffer_cpu(buffer, cpu) {
472 buffer->buffers[cpu] = 487 buffer->buffers[cpu] =
@@ -486,10 +501,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
486 } 501 }
487 kfree(buffer->buffers); 502 kfree(buffer->buffers);
488 503
504 fail_free_cpumask:
505 free_cpumask_var(buffer->cpumask);
506
489 fail_free_buffer: 507 fail_free_buffer:
490 kfree(buffer); 508 kfree(buffer);
491 return NULL; 509 return NULL;
492} 510}
511EXPORT_SYMBOL_GPL(ring_buffer_alloc);
493 512
494/** 513/**
495 * ring_buffer_free - free a ring buffer. 514 * ring_buffer_free - free a ring buffer.
@@ -503,8 +522,11 @@ ring_buffer_free(struct ring_buffer *buffer)
503 for_each_buffer_cpu(buffer, cpu) 522 for_each_buffer_cpu(buffer, cpu)
504 rb_free_cpu_buffer(buffer->buffers[cpu]); 523 rb_free_cpu_buffer(buffer->buffers[cpu]);
505 524
525 free_cpumask_var(buffer->cpumask);
526
506 kfree(buffer); 527 kfree(buffer);
507} 528}
529EXPORT_SYMBOL_GPL(ring_buffer_free);
508 530
509static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 531static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
510 532
@@ -680,6 +702,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
680 mutex_unlock(&buffer->mutex); 702 mutex_unlock(&buffer->mutex);
681 return -ENOMEM; 703 return -ENOMEM;
682} 704}
705EXPORT_SYMBOL_GPL(ring_buffer_resize);
683 706
684static inline int rb_null_event(struct ring_buffer_event *event) 707static inline int rb_null_event(struct ring_buffer_event *event)
685{ 708{
@@ -1274,7 +1297,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1274 1297
1275 cpu = raw_smp_processor_id(); 1298 cpu = raw_smp_processor_id();
1276 1299
1277 if (!cpu_isset(cpu, buffer->cpumask)) 1300 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1278 goto out; 1301 goto out;
1279 1302
1280 cpu_buffer = buffer->buffers[cpu]; 1303 cpu_buffer = buffer->buffers[cpu];
@@ -1304,6 +1327,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1304 ftrace_preempt_enable(resched); 1327 ftrace_preempt_enable(resched);
1305 return NULL; 1328 return NULL;
1306} 1329}
1330EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1307 1331
1308static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1332static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1309 struct ring_buffer_event *event) 1333 struct ring_buffer_event *event)
@@ -1350,6 +1374,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1350 1374
1351 return 0; 1375 return 0;
1352} 1376}
1377EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
1353 1378
1354/** 1379/**
1355 * ring_buffer_write - write data to the buffer without reserving 1380 * ring_buffer_write - write data to the buffer without reserving
@@ -1385,7 +1410,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1385 1410
1386 cpu = raw_smp_processor_id(); 1411 cpu = raw_smp_processor_id();
1387 1412
1388 if (!cpu_isset(cpu, buffer->cpumask)) 1413 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1389 goto out; 1414 goto out;
1390 1415
1391 cpu_buffer = buffer->buffers[cpu]; 1416 cpu_buffer = buffer->buffers[cpu];
@@ -1411,6 +1436,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1411 1436
1412 return ret; 1437 return ret;
1413} 1438}
1439EXPORT_SYMBOL_GPL(ring_buffer_write);
1414 1440
1415static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1441static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1416{ 1442{
@@ -1437,6 +1463,7 @@ void ring_buffer_record_disable(struct ring_buffer *buffer)
1437{ 1463{
1438 atomic_inc(&buffer->record_disabled); 1464 atomic_inc(&buffer->record_disabled);
1439} 1465}
1466EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
1440 1467
1441/** 1468/**
1442 * ring_buffer_record_enable - enable writes to the buffer 1469 * ring_buffer_record_enable - enable writes to the buffer
@@ -1449,6 +1476,7 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
1449{ 1476{
1450 atomic_dec(&buffer->record_disabled); 1477 atomic_dec(&buffer->record_disabled);
1451} 1478}
1479EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
1452 1480
1453/** 1481/**
1454 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer 1482 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
@@ -1464,12 +1492,13 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1464{ 1492{
1465 struct ring_buffer_per_cpu *cpu_buffer; 1493 struct ring_buffer_per_cpu *cpu_buffer;
1466 1494
1467 if (!cpu_isset(cpu, buffer->cpumask)) 1495 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1468 return; 1496 return;
1469 1497
1470 cpu_buffer = buffer->buffers[cpu]; 1498 cpu_buffer = buffer->buffers[cpu];
1471 atomic_inc(&cpu_buffer->record_disabled); 1499 atomic_inc(&cpu_buffer->record_disabled);
1472} 1500}
1501EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
1473 1502
1474/** 1503/**
1475 * ring_buffer_record_enable_cpu - enable writes to the buffer 1504 * ring_buffer_record_enable_cpu - enable writes to the buffer
@@ -1483,12 +1512,13 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1483{ 1512{
1484 struct ring_buffer_per_cpu *cpu_buffer; 1513 struct ring_buffer_per_cpu *cpu_buffer;
1485 1514
1486 if (!cpu_isset(cpu, buffer->cpumask)) 1515 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1487 return; 1516 return;
1488 1517
1489 cpu_buffer = buffer->buffers[cpu]; 1518 cpu_buffer = buffer->buffers[cpu];
1490 atomic_dec(&cpu_buffer->record_disabled); 1519 atomic_dec(&cpu_buffer->record_disabled);
1491} 1520}
1521EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1492 1522
1493/** 1523/**
1494 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 1524 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
@@ -1499,12 +1529,13 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1499{ 1529{
1500 struct ring_buffer_per_cpu *cpu_buffer; 1530 struct ring_buffer_per_cpu *cpu_buffer;
1501 1531
1502 if (!cpu_isset(cpu, buffer->cpumask)) 1532 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1503 return 0; 1533 return 0;
1504 1534
1505 cpu_buffer = buffer->buffers[cpu]; 1535 cpu_buffer = buffer->buffers[cpu];
1506 return cpu_buffer->entries; 1536 return cpu_buffer->entries;
1507} 1537}
1538EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1508 1539
1509/** 1540/**
1510 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer 1541 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
@@ -1515,12 +1546,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1515{ 1546{
1516 struct ring_buffer_per_cpu *cpu_buffer; 1547 struct ring_buffer_per_cpu *cpu_buffer;
1517 1548
1518 if (!cpu_isset(cpu, buffer->cpumask)) 1549 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1519 return 0; 1550 return 0;
1520 1551
1521 cpu_buffer = buffer->buffers[cpu]; 1552 cpu_buffer = buffer->buffers[cpu];
1522 return cpu_buffer->overrun; 1553 return cpu_buffer->overrun;
1523} 1554}
1555EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1524 1556
1525/** 1557/**
1526 * ring_buffer_entries - get the number of entries in a buffer 1558 * ring_buffer_entries - get the number of entries in a buffer
@@ -1543,6 +1575,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1543 1575
1544 return entries; 1576 return entries;
1545} 1577}
1578EXPORT_SYMBOL_GPL(ring_buffer_entries);
1546 1579
1547/** 1580/**
1548 * ring_buffer_overrun_cpu - get the number of overruns in buffer 1581 * ring_buffer_overrun_cpu - get the number of overruns in buffer
@@ -1565,6 +1598,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1565 1598
1566 return overruns; 1599 return overruns;
1567} 1600}
1601EXPORT_SYMBOL_GPL(ring_buffer_overruns);
1568 1602
1569static void rb_iter_reset(struct ring_buffer_iter *iter) 1603static void rb_iter_reset(struct ring_buffer_iter *iter)
1570{ 1604{
@@ -1600,6 +1634,7 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1600 rb_iter_reset(iter); 1634 rb_iter_reset(iter);
1601 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 1635 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1602} 1636}
1637EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
1603 1638
1604/** 1639/**
1605 * ring_buffer_iter_empty - check if an iterator has no more to read 1640 * ring_buffer_iter_empty - check if an iterator has no more to read
@@ -1614,6 +1649,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1614 return iter->head_page == cpu_buffer->commit_page && 1649 return iter->head_page == cpu_buffer->commit_page &&
1615 iter->head == rb_commit_index(cpu_buffer); 1650 iter->head == rb_commit_index(cpu_buffer);
1616} 1651}
1652EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
1617 1653
1618static void 1654static void
1619rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, 1655rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1828,7 +1864,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1828 struct buffer_page *reader; 1864 struct buffer_page *reader;
1829 int nr_loops = 0; 1865 int nr_loops = 0;
1830 1866
1831 if (!cpu_isset(cpu, buffer->cpumask)) 1867 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1832 return NULL; 1868 return NULL;
1833 1869
1834 cpu_buffer = buffer->buffers[cpu]; 1870 cpu_buffer = buffer->buffers[cpu];
@@ -1880,6 +1916,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1880 1916
1881 return NULL; 1917 return NULL;
1882} 1918}
1919EXPORT_SYMBOL_GPL(ring_buffer_peek);
1883 1920
1884static struct ring_buffer_event * 1921static struct ring_buffer_event *
1885rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) 1922rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
@@ -1940,6 +1977,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1940 1977
1941 return NULL; 1978 return NULL;
1942} 1979}
1980EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
1943 1981
1944/** 1982/**
1945 * ring_buffer_peek - peek at the next event to be read 1983 * ring_buffer_peek - peek at the next event to be read
@@ -2001,7 +2039,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2001 struct ring_buffer_event *event; 2039 struct ring_buffer_event *event;
2002 unsigned long flags; 2040 unsigned long flags;
2003 2041
2004 if (!cpu_isset(cpu, buffer->cpumask)) 2042 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2005 return NULL; 2043 return NULL;
2006 2044
2007 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2045 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2017,6 +2055,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2017 2055
2018 return event; 2056 return event;
2019} 2057}
2058EXPORT_SYMBOL_GPL(ring_buffer_consume);
2020 2059
2021/** 2060/**
2022 * ring_buffer_read_start - start a non consuming read of the buffer 2061 * ring_buffer_read_start - start a non consuming read of the buffer
@@ -2037,7 +2076,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2037 struct ring_buffer_iter *iter; 2076 struct ring_buffer_iter *iter;
2038 unsigned long flags; 2077 unsigned long flags;
2039 2078
2040 if (!cpu_isset(cpu, buffer->cpumask)) 2079 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2041 return NULL; 2080 return NULL;
2042 2081
2043 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 2082 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
@@ -2059,6 +2098,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2059 2098
2060 return iter; 2099 return iter;
2061} 2100}
2101EXPORT_SYMBOL_GPL(ring_buffer_read_start);
2062 2102
2063/** 2103/**
2064 * ring_buffer_finish - finish reading the iterator of the buffer 2104 * ring_buffer_finish - finish reading the iterator of the buffer
@@ -2075,6 +2115,7 @@ ring_buffer_read_finish(struct ring_buffer_iter *iter)
2075 atomic_dec(&cpu_buffer->record_disabled); 2115 atomic_dec(&cpu_buffer->record_disabled);
2076 kfree(iter); 2116 kfree(iter);
2077} 2117}
2118EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
2078 2119
2079/** 2120/**
2080 * ring_buffer_read - read the next item in the ring buffer by the iterator 2121 * ring_buffer_read - read the next item in the ring buffer by the iterator
@@ -2101,6 +2142,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2101 2142
2102 return event; 2143 return event;
2103} 2144}
2145EXPORT_SYMBOL_GPL(ring_buffer_read);
2104 2146
2105/** 2147/**
2106 * ring_buffer_size - return the size of the ring buffer (in bytes) 2148 * ring_buffer_size - return the size of the ring buffer (in bytes)
@@ -2110,6 +2152,7 @@ unsigned long ring_buffer_size(struct ring_buffer *buffer)
2110{ 2152{
2111 return BUF_PAGE_SIZE * buffer->pages; 2153 return BUF_PAGE_SIZE * buffer->pages;
2112} 2154}
2155EXPORT_SYMBOL_GPL(ring_buffer_size);
2113 2156
2114static void 2157static void
2115rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 2158rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
@@ -2143,7 +2186,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2143 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2186 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2144 unsigned long flags; 2187 unsigned long flags;
2145 2188
2146 if (!cpu_isset(cpu, buffer->cpumask)) 2189 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2147 return; 2190 return;
2148 2191
2149 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2192 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2156,6 +2199,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2156 2199
2157 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2200 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2158} 2201}
2202EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2159 2203
2160/** 2204/**
2161 * ring_buffer_reset - reset a ring buffer 2205 * ring_buffer_reset - reset a ring buffer
@@ -2168,6 +2212,7 @@ void ring_buffer_reset(struct ring_buffer *buffer)
2168 for_each_buffer_cpu(buffer, cpu) 2212 for_each_buffer_cpu(buffer, cpu)
2169 ring_buffer_reset_cpu(buffer, cpu); 2213 ring_buffer_reset_cpu(buffer, cpu);
2170} 2214}
2215EXPORT_SYMBOL_GPL(ring_buffer_reset);
2171 2216
2172/** 2217/**
2173 * rind_buffer_empty - is the ring buffer empty? 2218 * rind_buffer_empty - is the ring buffer empty?
@@ -2186,6 +2231,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
2186 } 2231 }
2187 return 1; 2232 return 1;
2188} 2233}
2234EXPORT_SYMBOL_GPL(ring_buffer_empty);
2189 2235
2190/** 2236/**
2191 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? 2237 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
@@ -2196,12 +2242,13 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2196{ 2242{
2197 struct ring_buffer_per_cpu *cpu_buffer; 2243 struct ring_buffer_per_cpu *cpu_buffer;
2198 2244
2199 if (!cpu_isset(cpu, buffer->cpumask)) 2245 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2200 return 1; 2246 return 1;
2201 2247
2202 cpu_buffer = buffer->buffers[cpu]; 2248 cpu_buffer = buffer->buffers[cpu];
2203 return rb_per_cpu_empty(cpu_buffer); 2249 return rb_per_cpu_empty(cpu_buffer);
2204} 2250}
2251EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2205 2252
2206/** 2253/**
2207 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 2254 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
@@ -2219,8 +2266,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2219 struct ring_buffer_per_cpu *cpu_buffer_a; 2266 struct ring_buffer_per_cpu *cpu_buffer_a;
2220 struct ring_buffer_per_cpu *cpu_buffer_b; 2267 struct ring_buffer_per_cpu *cpu_buffer_b;
2221 2268
2222 if (!cpu_isset(cpu, buffer_a->cpumask) || 2269 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2223 !cpu_isset(cpu, buffer_b->cpumask)) 2270 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2224 return -EINVAL; 2271 return -EINVAL;
2225 2272
2226 /* At least make sure the two buffers are somewhat the same */ 2273 /* At least make sure the two buffers are somewhat the same */
@@ -2250,6 +2297,7 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2250 2297
2251 return 0; 2298 return 0;
2252} 2299}
2300EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2253 2301
2254static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, 2302static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2255 struct buffer_data_page *bpage) 2303 struct buffer_data_page *bpage)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f4bb3800318b..c580233add95 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,7 +30,6 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/seq_file.h>
34#include <linux/writeback.h> 33#include <linux/writeback.h>
35 34
36#include <linux/stacktrace.h> 35#include <linux/stacktrace.h>
@@ -90,10 +89,10 @@ static inline void ftrace_enable_cpu(void)
90 preempt_enable(); 89 preempt_enable();
91} 90}
92 91
93static cpumask_t __read_mostly tracing_buffer_mask; 92static cpumask_var_t __read_mostly tracing_buffer_mask;
94 93
95#define for_each_tracing_cpu(cpu) \ 94#define for_each_tracing_cpu(cpu) \
96 for_each_cpu_mask(cpu, tracing_buffer_mask) 95 for_each_cpu(cpu, tracing_buffer_mask)
97 96
98/* 97/*
99 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -1310,7 +1309,7 @@ enum trace_file_type {
1310 TRACE_FILE_ANNOTATE = 2, 1309 TRACE_FILE_ANNOTATE = 2,
1311}; 1310};
1312 1311
1313static void trace_iterator_increment(struct trace_iterator *iter, int cpu) 1312static void trace_iterator_increment(struct trace_iterator *iter)
1314{ 1313{
1315 /* Don't allow ftrace to trace into the ring buffers */ 1314 /* Don't allow ftrace to trace into the ring buffers */
1316 ftrace_disable_cpu(); 1315 ftrace_disable_cpu();
@@ -1389,7 +1388,7 @@ static void *find_next_entry_inc(struct trace_iterator *iter)
1389 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1388 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
1390 1389
1391 if (iter->ent) 1390 if (iter->ent)
1392 trace_iterator_increment(iter, iter->cpu); 1391 trace_iterator_increment(iter);
1393 1392
1394 return iter->ent ? iter : NULL; 1393 return iter->ent ? iter : NULL;
1395} 1394}
@@ -1812,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1812 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) 1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1813 return; 1812 return;
1814 1813
1815 if (cpu_isset(iter->cpu, iter->started)) 1814 if (cpumask_test_cpu(iter->cpu, iter->started))
1816 return; 1815 return;
1817 1816
1818 cpu_set(iter->cpu, iter->started); 1817 cpumask_set_cpu(iter->cpu, iter->started);
1819 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1820} 1819}
1821 1820
@@ -2647,13 +2646,7 @@ static struct file_operations show_traces_fops = {
2647/* 2646/*
2648 * Only trace on a CPU if the bitmask is set: 2647 * Only trace on a CPU if the bitmask is set:
2649 */ 2648 */
2650static cpumask_t tracing_cpumask = CPU_MASK_ALL; 2649static cpumask_var_t tracing_cpumask;
2651
2652/*
2653 * When tracing/tracing_cpu_mask is modified then this holds
2654 * the new bitmask we are about to install:
2655 */
2656static cpumask_t tracing_cpumask_new;
2657 2650
2658/* 2651/*
2659 * The tracer itself will not take this lock, but still we want 2652 * The tracer itself will not take this lock, but still we want
@@ -2694,6 +2687,10 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2694 size_t count, loff_t *ppos) 2687 size_t count, loff_t *ppos)
2695{ 2688{
2696 int err, cpu; 2689 int err, cpu;
2690 cpumask_var_t tracing_cpumask_new;
2691
2692 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2693 return -ENOMEM;
2697 2694
2698 mutex_lock(&tracing_cpumask_update_lock); 2695 mutex_lock(&tracing_cpumask_update_lock);
2699 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2696 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -2707,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2707 * Increase/decrease the disabled counter if we are 2704 * Increase/decrease the disabled counter if we are
2708 * about to flip a bit in the cpumask: 2705 * about to flip a bit in the cpumask:
2709 */ 2706 */
2710 if (cpu_isset(cpu, tracing_cpumask) && 2707 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2711 !cpu_isset(cpu, tracing_cpumask_new)) { 2708 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2712 atomic_inc(&global_trace.data[cpu]->disabled); 2709 atomic_inc(&global_trace.data[cpu]->disabled);
2713 } 2710 }
2714 if (!cpu_isset(cpu, tracing_cpumask) && 2711 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2715 cpu_isset(cpu, tracing_cpumask_new)) { 2712 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2716 atomic_dec(&global_trace.data[cpu]->disabled); 2713 atomic_dec(&global_trace.data[cpu]->disabled);
2717 } 2714 }
2718 } 2715 }
2719 __raw_spin_unlock(&ftrace_max_lock); 2716 __raw_spin_unlock(&ftrace_max_lock);
2720 local_irq_enable(); 2717 local_irq_enable();
2721 2718
2722 tracing_cpumask = tracing_cpumask_new; 2719 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
2723 2720
2724 mutex_unlock(&tracing_cpumask_update_lock); 2721 mutex_unlock(&tracing_cpumask_update_lock);
2722 free_cpumask_var(tracing_cpumask_new);
2725 2723
2726 return count; 2724 return count;
2727 2725
2728err_unlock: 2726err_unlock:
2729 mutex_unlock(&tracing_cpumask_update_lock); 2727 mutex_unlock(&tracing_cpumask_update_lock);
2728 free_cpumask_var(tracing_cpumask);
2730 2729
2731 return err; 2730 return err;
2732} 2731}
@@ -3115,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3115 if (!iter) 3114 if (!iter)
3116 return -ENOMEM; 3115 return -ENOMEM;
3117 3116
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3118 kfree(iter);
3119 return -ENOMEM;
3120 }
3121
3118 mutex_lock(&trace_types_lock); 3122 mutex_lock(&trace_types_lock);
3119 3123
3120 /* trace pipe does not show start of buffer */ 3124 /* trace pipe does not show start of buffer */
3121 cpus_setall(iter->started); 3125 cpumask_setall(iter->started);
3122 3126
3123 iter->tr = &global_trace; 3127 iter->tr = &global_trace;
3124 iter->trace = current_trace; 3128 iter->trace = current_trace;
@@ -3135,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3135{ 3139{
3136 struct trace_iterator *iter = file->private_data; 3140 struct trace_iterator *iter = file->private_data;
3137 3141
3142 free_cpumask_var(iter->started);
3138 kfree(iter); 3143 kfree(iter);
3139 atomic_dec(&tracing_reader); 3144 atomic_dec(&tracing_reader);
3140 3145
@@ -3753,7 +3758,6 @@ void ftrace_dump(void)
3753 static DEFINE_SPINLOCK(ftrace_dump_lock); 3758 static DEFINE_SPINLOCK(ftrace_dump_lock);
3754 /* use static because iter can be a bit big for the stack */ 3759 /* use static because iter can be a bit big for the stack */
3755 static struct trace_iterator iter; 3760 static struct trace_iterator iter;
3756 static cpumask_t mask;
3757 static int dump_ran; 3761 static int dump_ran;
3758 unsigned long flags; 3762 unsigned long flags;
3759 int cnt = 0, cpu; 3763 int cnt = 0, cpu;
@@ -3787,8 +3791,6 @@ void ftrace_dump(void)
3787 * and then release the locks again. 3791 * and then release the locks again.
3788 */ 3792 */
3789 3793
3790 cpus_clear(mask);
3791
3792 while (!trace_empty(&iter)) { 3794 while (!trace_empty(&iter)) {
3793 3795
3794 if (!cnt) 3796 if (!cnt)
@@ -3824,19 +3826,28 @@ __init static int tracer_alloc_buffers(void)
3824{ 3826{
3825 struct trace_array_cpu *data; 3827 struct trace_array_cpu *data;
3826 int i; 3828 int i;
3829 int ret = -ENOMEM;
3827 3830
3828 /* TODO: make the number of buffers hot pluggable with CPUS */ 3831 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
3829 tracing_buffer_mask = cpu_possible_map; 3832 goto out;
3833
3834 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3835 goto out_free_buffer_mask;
3836
3837 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3838 cpumask_copy(tracing_cpumask, cpu_all_mask);
3830 3839
3840 /* TODO: make the number of buffers hot pluggable with CPUS */
3831 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 3841 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3832 TRACE_BUFFER_FLAGS); 3842 TRACE_BUFFER_FLAGS);
3833 if (!global_trace.buffer) { 3843 if (!global_trace.buffer) {
3834 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 3844 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3835 WARN_ON(1); 3845 WARN_ON(1);
3836 return 0; 3846 goto out_free_cpumask;
3837 } 3847 }
3838 global_trace.entries = ring_buffer_size(global_trace.buffer); 3848 global_trace.entries = ring_buffer_size(global_trace.buffer);
3839 3849
3850
3840#ifdef CONFIG_TRACER_MAX_TRACE 3851#ifdef CONFIG_TRACER_MAX_TRACE
3841 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 3852 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3842 TRACE_BUFFER_FLAGS); 3853 TRACE_BUFFER_FLAGS);
@@ -3844,7 +3855,7 @@ __init static int tracer_alloc_buffers(void)
3844 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 3855 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3845 WARN_ON(1); 3856 WARN_ON(1);
3846 ring_buffer_free(global_trace.buffer); 3857 ring_buffer_free(global_trace.buffer);
3847 return 0; 3858 goto out_free_cpumask;
3848 } 3859 }
3849 max_tr.entries = ring_buffer_size(max_tr.buffer); 3860 max_tr.entries = ring_buffer_size(max_tr.buffer);
3850 WARN_ON(max_tr.entries != global_trace.entries); 3861 WARN_ON(max_tr.entries != global_trace.entries);
@@ -3874,8 +3885,14 @@ __init static int tracer_alloc_buffers(void)
3874 &trace_panic_notifier); 3885 &trace_panic_notifier);
3875 3886
3876 register_die_notifier(&trace_die_notifier); 3887 register_die_notifier(&trace_die_notifier);
3888 ret = 0;
3877 3889
3878 return 0; 3890out_free_cpumask:
3891 free_cpumask_var(tracing_cpumask);
3892out_free_buffer_mask:
3893 free_cpumask_var(tracing_buffer_mask);
3894out:
3895 return ret;
3879} 3896}
3880early_initcall(tracer_alloc_buffers); 3897early_initcall(tracer_alloc_buffers);
3881fs_initcall(tracer_init_debugfs); 3898fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f864036..4d3d381bfd95 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -368,7 +368,7 @@ struct trace_iterator {
368 loff_t pos; 368 loff_t pos;
369 long idx; 369 long idx;
370 370
371 cpumask_t started; 371 cpumask_var_t started;
372}; 372};
373 373
374int tracing_is_enabled(void); 374int tracing_is_enabled(void);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3ccebde28482..366c8c333e13 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr)
42 int cpu; 42 int cpu;
43 boot_trace = tr; 43 boot_trace = tr;
44 44
45 for_each_cpu_mask(cpu, cpu_possible_map) 45 for_each_cpu(cpu, cpu_possible_mask)
46 tracing_reset(tr, cpu); 46 tracing_reset(tr, cpu);
47 47
48 tracing_sched_switch_assign_trace(tr); 48 tracing_sched_switch_assign_trace(tr);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4bf39fcae97a..930c08e5b38e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu)
79 int i; 79 int i;
80 int ret; 80 int ret;
81 int log10_this = log10_cpu(cpu); 81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); 82 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
83 83
84 84
85 /* 85 /*
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index b6a3e20a49a9..649df22d435f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr)
46 46
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48 48
49 for_each_cpu_mask(cpu, cpu_possible_map) 49 for_each_cpu(cpu, cpu_possible_mask)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51} 51}
52 52
@@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr)
62{ 62{
63 int cpu; 63 int cpu;
64 64
65 for_each_cpu_mask(cpu, cpu_possible_map) 65 for_each_cpu(cpu, cpu_possible_mask)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67} 67}
68 68
@@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter)
172{ 172{
173 int cpu; 173 int cpu;
174 174
175 for_each_cpu_mask(cpu, cpu_possible_map) 175 for_each_cpu(cpu, cpu_possible_mask)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); 176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177} 177}
178 178
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a352f62..7bda248daf55 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr)
39 39
40 trace_power_enabled = 1; 40 trace_power_enabled = 1;
41 41
42 for_each_cpu_mask(cpu, cpu_possible_map) 42 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 43 tracing_reset(tr, cpu);
44 return 0; 44 return 0;
45} 45}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 01becf1f19ff..eaca5ad803ff 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -196,27 +196,19 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
196 return HRTIMER_RESTART; 196 return HRTIMER_RESTART;
197} 197}
198 198
199static void start_stack_timer(int cpu) 199static void start_stack_timer(void *unused)
200{ 200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); 201 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 205
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 207}
209 208
210static void start_stack_timers(void) 209static void start_stack_timers(void)
211{ 210{
212 cpumask_t saved_mask = current->cpus_allowed; 211 on_each_cpu(start_stack_timer, NULL, 1);
213 int cpu;
214
215 for_each_online_cpu(cpu) {
216 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
217 start_stack_timer(cpu);
218 }
219 set_cpus_allowed_ptr(current, &saved_mask);
220} 212}
221 213
222static void stop_stack_timer(int cpu) 214static void stop_stack_timer(int cpu)
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 2dc06ab35716..43f891b05a4b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
92 mm = get_task_mm(p); 92 mm = get_task_mm(p);
93 if (mm) { 93 if (mm) {
94 /* adjust to KB unit */ 94 /* adjust to KB unit */
95 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; 95 stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
96 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
97 mmput(mm); 97 mmput(mm);
98 } 98 }
99 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4952322cba45..2f445833ae37 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 73static LIST_HEAD(workqueues);
74 74
75static int singlethread_cpu __read_mostly; 75static int singlethread_cpu __read_mostly;
76static cpumask_t cpu_singlethread_map __read_mostly; 76static const struct cpumask *cpu_singlethread_map __read_mostly;
77/* 77/*
78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
@@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly;
81 * use cpu_possible_map, the cpumask below is more a documentation 81 * use cpu_possible_map, the cpumask below is more a documentation
82 * than optimization. 82 * than optimization.
83 */ 83 */
84static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_var_t cpu_populated_map __read_mostly;
85 85
86/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
87static inline int is_wq_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
@@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq)
89 return wq->singlethread; 89 return wq->singlethread;
90} 90}
91 91
92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
93{ 93{
94 return is_wq_single_threaded(wq) 94 return is_wq_single_threaded(wq)
95 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? cpu_singlethread_map : cpu_populated_map;
96} 96}
97 97
98static 98static
@@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
410 */ 410 */
411void flush_workqueue(struct workqueue_struct *wq) 411void flush_workqueue(struct workqueue_struct *wq)
412{ 412{
413 const cpumask_t *cpu_map = wq_cpu_map(wq); 413 const struct cpumask *cpu_map = wq_cpu_map(wq);
414 int cpu; 414 int cpu;
415 415
416 might_sleep(); 416 might_sleep();
@@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work)
532{ 532{
533 struct cpu_workqueue_struct *cwq; 533 struct cpu_workqueue_struct *cwq;
534 struct workqueue_struct *wq; 534 struct workqueue_struct *wq;
535 const cpumask_t *cpu_map; 535 const struct cpumask *cpu_map;
536 int cpu; 536 int cpu;
537 537
538 might_sleep(); 538 might_sleep();
@@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
903 */ 903 */
904void destroy_workqueue(struct workqueue_struct *wq) 904void destroy_workqueue(struct workqueue_struct *wq)
905{ 905{
906 const cpumask_t *cpu_map = wq_cpu_map(wq); 906 const struct cpumask *cpu_map = wq_cpu_map(wq);
907 int cpu; 907 int cpu;
908 908
909 cpu_maps_update_begin(); 909 cpu_maps_update_begin();
@@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
933 933
934 switch (action) { 934 switch (action) {
935 case CPU_UP_PREPARE: 935 case CPU_UP_PREPARE:
936 cpu_set(cpu, cpu_populated_map); 936 cpumask_set_cpu(cpu, cpu_populated_map);
937 } 937 }
938undo: 938undo:
939 list_for_each_entry(wq, &workqueues, list) { 939 list_for_each_entry(wq, &workqueues, list) {
@@ -964,7 +964,7 @@ undo:
964 switch (action) { 964 switch (action) {
965 case CPU_UP_CANCELED: 965 case CPU_UP_CANCELED:
966 case CPU_POST_DEAD: 966 case CPU_POST_DEAD:
967 cpu_clear(cpu, cpu_populated_map); 967 cpumask_clear_cpu(cpu, cpu_populated_map);
968 } 968 }
969 969
970 return ret; 970 return ret;
@@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
1017 1017
1018void __init init_workqueues(void) 1018void __init init_workqueues(void)
1019{ 1019{
1020 cpu_populated_map = cpu_online_map; 1020 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1021 singlethread_cpu = first_cpu(cpu_possible_map); 1021
1022 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); 1022 cpumask_copy(cpu_populated_map, cpu_online_mask);
1023 singlethread_cpu = cpumask_first(cpu_possible_mask);
1024 cpu_singlethread_map = cpumask_of(singlethread_cpu);
1023 hotcpu_notifier(workqueue_cpu_callback, 0); 1025 hotcpu_notifier(workqueue_cpu_callback, 0);
1024 keventd_wq = create_workqueue("events"); 1026 keventd_wq = create_workqueue("events");
1025 BUG_ON(!keventd_wq); 1027 BUG_ON(!keventd_wq);