Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/async.c | 321
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 3
-rw-r--r--  kernel/auditfilter.c | 325
-rw-r--r--  kernel/auditsc.c | 739
-rw-r--r--  kernel/capability.c | 6
-rw-r--r--  kernel/cgroup.c | 40
-rw-r--r--  kernel/compat.c | 54
-rw-r--r--  kernel/cpu.c | 157
-rw-r--r--  kernel/cpuset.c | 38
-rw-r--r--  kernel/dma-coherent.c | 42
-rw-r--r--  kernel/exit.c | 21
-rw-r--r--  kernel/fork.c | 17
-rw-r--r--  kernel/futex.c | 72
-rw-r--r--  kernel/hrtimer.c | 142
-rw-r--r--  kernel/irq/autoprobe.c | 5
-rw-r--r--  kernel/irq/chip.c | 2
-rw-r--r--  kernel/irq/manage.c | 31
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/proc.c | 57
-rw-r--r--  kernel/kexec.c | 2
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 281
-rw-r--r--  kernel/ksysfs.c | 4
-rw-r--r--  kernel/module.c | 94
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid.c | 2
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/printk.c | 2
-rw-r--r--  kernel/profile.c | 39
-rw-r--r--  kernel/rcuclassic.c | 32
-rw-r--r--  kernel/rcupdate.c | 11
-rw-r--r--  kernel/rcupreempt.c | 30
-rw-r--r--  kernel/rcutorture.c | 45
-rw-r--r--  kernel/rcutree.c | 13
-rw-r--r--  kernel/resource.c | 61
-rw-r--r--  kernel/sched.c | 1105
-rw-r--r--  kernel/sched_clock.c | 5
-rw-r--r--  kernel/sched_cpupri.c | 39
-rw-r--r--  kernel/sched_cpupri.h | 5
-rw-r--r--  kernel/sched_fair.c | 62
-rw-r--r--  kernel/sched_rt.c | 74
-rw-r--r--  kernel/sched_stats.h | 3
-rw-r--r--  kernel/signal.c | 3
-rw-r--r--  kernel/smp.c | 145
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/softlockup.c | 10
-rw-r--r--  kernel/stop_machine.c | 63
-rw-r--r--  kernel/sys.c | 4
-rw-r--r--  kernel/sysctl.c | 27
-rw-r--r--  kernel/taskstats.c | 41
-rw-r--r--  kernel/test_kprobes.c | 210
-rw-r--r--  kernel/time.c | 4
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 9
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 113
-rw-r--r--  kernel/time/tick-common.c | 18
-rw-r--r--  kernel/time/tick-sched.c | 22
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/timer.c | 15
-rw-r--r--  kernel/trace/ring_buffer.c | 42
-rw-r--r--  kernel/trace/trace.c | 68
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_boot.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_hw_branches.c | 6
-rw-r--r--  kernel/trace/trace_power.c | 2
-rw-r--r--  kernel/trace/trace_sysprof.c | 13
-rw-r--r--  kernel/tsacct.c | 4
-rw-r--r--  kernel/workqueue.c | 26
73 files changed, 2892 insertions(+), 1989 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c0..2921d90ce32f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
+	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+	    async.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..97373380c9e7
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,321 @@
+/*
+ * async.c: Asynchronous function calls for boot performance
+ *
+ * (C) Copyright 2009 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+
+/*
+
+Goals and Theory of Operation
+
+The primary goal of this feature is to reduce the kernel boot time,
+by doing various independent hardware delays and discovery operations
+decoupled and not strictly serialized.
+
+More specifically, the asynchronous function call concept allows
+certain operations (primarily during system boot) to happen
+asynchronously, out of order, while these operations still
+have their externally visible parts happen sequentially and in-order.
+(not unlike how out-of-order CPUs retire their instructions in order)
+
+Key to the asynchronous function call implementation is the concept of
+a "sequence cookie" (which, although it has an abstracted type, can be
+thought of as a monotonically incrementing number).
+
+The async core will assign each scheduled event such a sequence cookie and
+pass this to the called functions.
+
+The asynchronously called function should before doing a globally visible
+operation, such as registering device numbers, call the
+async_synchronize_cookie() function and pass in its own cookie. The
+async_synchronize_cookie() function will make sure that all asynchronous
+operations that were scheduled prior to the operation corresponding with the
+cookie have completed.
+
+Subsystem/driver initialization code that scheduled asynchronous probe
+functions, but which shares global resources with other drivers/subsystems
+that do not use the asynchronous call feature, need to do a full
+synchronization with the async_synchronize_full() function, before returning
+from their init function. This is to maintain strict ordering between the
+asynchronous and synchronous parts of the kernel.
+
+*/
+
+#include <linux/async.h>
+#include <linux/module.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/kthread.h>
+#include <asm/atomic.h>
+
+static async_cookie_t next_cookie = 1;
+
+#define MAX_THREADS	256
+#define MAX_WORK	32768
+
+static LIST_HEAD(async_pending);
+static LIST_HEAD(async_running);
+static DEFINE_SPINLOCK(async_lock);
+
+struct async_entry {
+	struct list_head list;
+	async_cookie_t cookie;
+	async_func_ptr *func;
+	void *data;
+	struct list_head *running;
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(async_done);
+static DECLARE_WAIT_QUEUE_HEAD(async_new);
+
+static atomic_t entry_count;
+static atomic_t thread_count;
+
+extern int initcall_debug;
+
+
+/*
+ * MUST be called with the lock held!
+ */
+static async_cookie_t __lowest_in_progress(struct list_head *running)
+{
+	struct async_entry *entry;
+	if (!list_empty(&async_pending)) {
+		entry = list_first_entry(&async_pending,
+			struct async_entry, list);
+		return entry->cookie;
+	} else if (!list_empty(running)) {
+		entry = list_first_entry(running,
+			struct async_entry, list);
+		return entry->cookie;
+	} else {
+		/* nothing in progress... next_cookie is "infinity" */
+		return next_cookie;
+	}
+
+}
+/*
+ * pick the first pending entry and run it
+ */
+static void run_one_entry(void)
+{
+	unsigned long flags;
+	struct async_entry *entry;
+	ktime_t calltime, delta, rettime;
+
+	/* 1) pick one task from the pending queue */
+
+	spin_lock_irqsave(&async_lock, flags);
+	if (list_empty(&async_pending))
+		goto out;
+	entry = list_first_entry(&async_pending, struct async_entry, list);
+
+	/* 2) move it to the running queue */
+	list_del(&entry->list);
+	list_add_tail(&entry->list, &async_running);
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* 3) run it (and print duration)*/
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
+		calltime = ktime_get();
+	}
+	entry->func(entry->data, entry->cookie);
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		rettime = ktime_get();
+		delta = ktime_sub(rettime, calltime);
+		printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
+			entry->func, ktime_to_ns(delta) >> 10);
+	}
+
+	/* 4) remove it from the running queue */
+	spin_lock_irqsave(&async_lock, flags);
+	list_del(&entry->list);
+
+	/* 5) free the entry */
+	kfree(entry);
+	atomic_dec(&entry_count);
+
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* 6) wake up any waiters. */
+	wake_up(&async_done);
+	return;
+
+out:
+	spin_unlock_irqrestore(&async_lock, flags);
+}
+
+
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+	struct async_entry *entry;
+	unsigned long flags;
+	async_cookie_t newcookie;
+
+
+	/* allow irq-off callers */
+	entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
+
+	/*
+	 * If we're out of memory or if there's too much work
+	 * pending already, we execute synchronously.
+	 */
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+		kfree(entry);
+		spin_lock_irqsave(&async_lock, flags);
+		newcookie = next_cookie++;
+		spin_unlock_irqrestore(&async_lock, flags);
+
+		/* low on memory.. run synchronously */
+		ptr(data, newcookie);
+		return newcookie;
+	}
+	entry->func = ptr;
+	entry->data = data;
+	entry->running = running;
+
+	spin_lock_irqsave(&async_lock, flags);
+	newcookie = entry->cookie = next_cookie++;
+	list_add_tail(&entry->list, &async_pending);
+	atomic_inc(&entry_count);
+	spin_unlock_irqrestore(&async_lock, flags);
+	wake_up(&async_new);
+	return newcookie;
+}
+
+async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+{
+	return __async_schedule(ptr, data, &async_pending);
+}
+EXPORT_SYMBOL_GPL(async_schedule);
+
+async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+	return __async_schedule(ptr, data, running);
+}
+EXPORT_SYMBOL_GPL(async_schedule_special);
+
+void async_synchronize_full(void)
+{
+	async_synchronize_cookie(next_cookie);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full);
+
+void async_synchronize_full_special(struct list_head *list)
+{
+	async_synchronize_cookie_special(next_cookie, list);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+
+void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+{
+	ktime_t starttime, delta, endtime;
+
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		printk("async_waiting @ %i\n", task_pid_nr(current));
+		starttime = ktime_get();
+	}
+
+	wait_event(async_done, __lowest_in_progress(running) >= cookie);
+
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		endtime = ktime_get();
+		delta = ktime_sub(endtime, starttime);
+
+		printk("async_continuing @ %i after %lli usec\n",
+			task_pid_nr(current), ktime_to_ns(delta) >> 10);
+	}
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+
+void async_synchronize_cookie(async_cookie_t cookie)
+{
+	async_synchronize_cookie_special(cookie, &async_running);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie);
+
+
+static int async_thread(void *unused)
+{
+	DECLARE_WAITQUEUE(wq, current);
+	add_wait_queue(&async_new, &wq);
+
+	while (!kthread_should_stop()) {
+		int ret = HZ;
+		set_current_state(TASK_INTERRUPTIBLE);
+		/*
+		 * check the list head without lock.. false positives
+		 * are dealt with inside run_one_entry() while holding
+		 * the lock.
+		 */
+		rmb();
+		if (!list_empty(&async_pending))
+			run_one_entry();
+		else
+			ret = schedule_timeout(HZ);
+
+		if (ret == 0) {
+			/*
+			 * we timed out, this means we as thread are redundant.
+			 * we sign off and die, but we to avoid any races there
+			 * is a last-straw check to see if work snuck in.
+			 */
+			atomic_dec(&thread_count);
+			wmb(); /* manager must see our departure first */
+			if (list_empty(&async_pending))
+				break;
+			/*
+			 * woops work came in between us timing out and us
+			 * signing off; we need to stay alive and keep working.
+			 */
+			atomic_inc(&thread_count);
+		}
+	}
+	remove_wait_queue(&async_new, &wq);
+
+	return 0;
+}
+
+static int async_manager_thread(void *unused)
+{
+	DECLARE_WAITQUEUE(wq, current);
+	add_wait_queue(&async_new, &wq);
+
+	while (!kthread_should_stop()) {
+		int tc, ec;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		tc = atomic_read(&thread_count);
+		rmb();
+		ec = atomic_read(&entry_count);
+
+		while (tc < ec && tc < MAX_THREADS) {
+			kthread_run(async_thread, NULL, "async/%i", tc);
+			atomic_inc(&thread_count);
+			tc++;
+		}
+
+		schedule();
+	}
+	remove_wait_queue(&async_new, &wq);
+
+	return 0;
+}
+
+static int __init async_init(void)
+{
+	kthread_run(async_manager_thread, NULL, "async/mgr");
+	return 0;
+}
+
+core_initcall(async_init);
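
As a minimal illustration of the "Goals and Theory of Operation" comment above, the following hypothetical sketch shows how a driver probe path might use this API; struct my_device, my_slow_hardware_init() and my_register_device() are invented placeholders, not part of this patch:

/* Hypothetical usage sketch -- not part of this patch. */
#include <linux/async.h>
#include <linux/init.h>

struct my_device { int id; };				/* placeholder driver type */
void my_slow_hardware_init(struct my_device *dev);	/* placeholder, may sleep */
void my_register_device(struct my_device *dev);		/* placeholder, globally visible */

static struct my_device my_dev;

static void my_probe_async(void *data, async_cookie_t cookie)
{
	struct my_device *dev = data;

	/* slow, independent work: may run out of order vs. other probes */
	my_slow_hardware_init(dev);

	/* wait for all work scheduled before this cookie, then do the visible part */
	async_synchronize_cookie(cookie);
	my_register_device(dev);
}

static int __init my_subsys_init(void)
{
	async_schedule(my_probe_async, &my_dev);	/* returns immediately */

	/* this code shares state with non-async drivers, so drain fully
	 * before returning from the init function, as described above */
	async_synchronize_full();
	return 0;
}

The cookie handed to the callback is the same one async_schedule() returned, so the externally visible registration still happens in scheduling order even though the slow hardware initialization ran concurrently.
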
diff --git a/kernel/audit.h b/kernel/audit.h
index 9d6717412fec..16f18cac661b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
 	return __audit_signal_info(sig, t);
 	return 0;
 }
-extern enum audit_state audit_filter_inodes(struct task_struct *,
-					    struct audit_context *);
-extern void audit_set_auditable(struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
 #else
 #define audit_signal_info(s,t) AUDIT_DISABLED
 #define audit_filter_inodes(t,c) AUDIT_DISABLED
-#define audit_set_auditable(c)
 #endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8b509441f49a..8ad9545b8db9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree)
 			audit_log_end(ab);
 			rule->tree = NULL;
 			list_del_rcu(&entry->list);
+			list_del(&entry->rule.list);
 			call_rcu(&entry->rcu, audit_free_rule_rcu);
 		}
 	}
@@ -617,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
 
 	if (pathname[0] != '/' ||
 	    rule->listnr != AUDIT_FILTER_EXIT ||
-	    op & ~AUDIT_EQUAL ||
+	    op != Audit_equal ||
 	    rule->inode_f || rule->watch || rule->tree)
 		return -EINVAL;
 	rule->tree = alloc_tree(pathname);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9fd85a4640a0..fbf24d121d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
86#error Fix audit_filter_list initialiser 86#error Fix audit_filter_list initialiser
87#endif 87#endif
88}; 88};
89static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
90 LIST_HEAD_INIT(audit_rules_list[0]),
91 LIST_HEAD_INIT(audit_rules_list[1]),
92 LIST_HEAD_INIT(audit_rules_list[2]),
93 LIST_HEAD_INIT(audit_rules_list[3]),
94 LIST_HEAD_INIT(audit_rules_list[4]),
95 LIST_HEAD_INIT(audit_rules_list[5]),
96};
89 97
90DEFINE_MUTEX(audit_filter_mutex); 98DEFINE_MUTEX(audit_filter_mutex);
91 99
@@ -244,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule,
244 struct audit_field *f) 252 struct audit_field *f)
245{ 253{
246 if (krule->listnr != AUDIT_FILTER_EXIT || 254 if (krule->listnr != AUDIT_FILTER_EXIT ||
247 krule->watch || krule->inode_f || krule->tree) 255 krule->watch || krule->inode_f || krule->tree ||
256 (f->op != Audit_equal && f->op != Audit_not_equal))
248 return -EINVAL; 257 return -EINVAL;
249 258
250 krule->inode_f = f; 259 krule->inode_f = f;
@@ -262,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
262 271
263 if (path[0] != '/' || path[len-1] == '/' || 272 if (path[0] != '/' || path[len-1] == '/' ||
264 krule->listnr != AUDIT_FILTER_EXIT || 273 krule->listnr != AUDIT_FILTER_EXIT ||
265 op & ~AUDIT_EQUAL || 274 op != Audit_equal ||
266 krule->inode_f || krule->watch || krule->tree) 275 krule->inode_f || krule->watch || krule->tree)
267 return -EINVAL; 276 return -EINVAL;
268 277
@@ -412,12 +421,32 @@ exit_err:
412 return ERR_PTR(err); 421 return ERR_PTR(err);
413} 422}
414 423
424static u32 audit_ops[] =
425{
426 [Audit_equal] = AUDIT_EQUAL,
427 [Audit_not_equal] = AUDIT_NOT_EQUAL,
428 [Audit_bitmask] = AUDIT_BIT_MASK,
429 [Audit_bittest] = AUDIT_BIT_TEST,
430 [Audit_lt] = AUDIT_LESS_THAN,
431 [Audit_gt] = AUDIT_GREATER_THAN,
432 [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
433 [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
434};
435
436static u32 audit_to_op(u32 op)
437{
438 u32 n;
439 for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
440 ;
441 return n;
442}
443
444
415/* Translate struct audit_rule to kernel's rule respresentation. 445/* Translate struct audit_rule to kernel's rule respresentation.
416 * Exists for backward compatibility with userspace. */ 446 * Exists for backward compatibility with userspace. */
417static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 447static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
418{ 448{
419 struct audit_entry *entry; 449 struct audit_entry *entry;
420 struct audit_field *ino_f;
421 int err = 0; 450 int err = 0;
422 int i; 451 int i;
423 452
@@ -427,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
427 456
428 for (i = 0; i < rule->field_count; i++) { 457 for (i = 0; i < rule->field_count; i++) {
429 struct audit_field *f = &entry->rule.fields[i]; 458 struct audit_field *f = &entry->rule.fields[i];
459 u32 n;
460
461 n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
462
463 /* Support for legacy operators where
464 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
465 if (n & AUDIT_NEGATE)
466 f->op = Audit_not_equal;
467 else if (!n)
468 f->op = Audit_equal;
469 else
470 f->op = audit_to_op(n);
471
472 entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
430 473
431 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
432 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 474 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
433 f->val = rule->values[i]; 475 f->val = rule->values[i];
434 476
435 err = -EINVAL; 477 err = -EINVAL;
478 if (f->op == Audit_bad)
479 goto exit_free;
480
436 switch(f->type) { 481 switch(f->type) {
437 default: 482 default:
438 goto exit_free; 483 goto exit_free;
@@ -454,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
454 case AUDIT_EXIT: 499 case AUDIT_EXIT:
455 case AUDIT_SUCCESS: 500 case AUDIT_SUCCESS:
456 /* bit ops are only useful on syscall args */ 501 /* bit ops are only useful on syscall args */
457 if (f->op == AUDIT_BIT_MASK || 502 if (f->op == Audit_bitmask || f->op == Audit_bittest)
458 f->op == AUDIT_BIT_TEST) {
459 err = -EINVAL;
460 goto exit_free; 503 goto exit_free;
461 }
462 break; 504 break;
463 case AUDIT_ARG0: 505 case AUDIT_ARG0:
464 case AUDIT_ARG1: 506 case AUDIT_ARG1:
@@ -467,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
467 break; 509 break;
468 /* arch is only allowed to be = or != */ 510 /* arch is only allowed to be = or != */
469 case AUDIT_ARCH: 511 case AUDIT_ARCH:
470 if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) 512 if (f->op != Audit_not_equal && f->op != Audit_equal)
471 && (f->op != AUDIT_NEGATE) && (f->op)) {
472 err = -EINVAL;
473 goto exit_free; 513 goto exit_free;
474 }
475 entry->rule.arch_f = f; 514 entry->rule.arch_f = f;
476 break; 515 break;
477 case AUDIT_PERM: 516 case AUDIT_PERM:
@@ -488,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
488 goto exit_free; 527 goto exit_free;
489 break; 528 break;
490 } 529 }
491
492 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
493
494 /* Support for legacy operators where
495 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
496 if (f->op & AUDIT_NEGATE)
497 f->op = AUDIT_NOT_EQUAL;
498 else if (!f->op)
499 f->op = AUDIT_EQUAL;
500 else if (f->op == AUDIT_OPERATORS) {
501 err = -EINVAL;
502 goto exit_free;
503 }
504 } 530 }
505 531
506 ino_f = entry->rule.inode_f; 532 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
507 if (ino_f) { 533 entry->rule.inode_f = NULL;
508 switch(ino_f->op) {
509 case AUDIT_NOT_EQUAL:
510 entry->rule.inode_f = NULL;
511 case AUDIT_EQUAL:
512 break;
513 default:
514 err = -EINVAL;
515 goto exit_free;
516 }
517 }
518 534
519exit_nofree: 535exit_nofree:
520 return entry; 536 return entry;
@@ -530,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
530{ 546{
531 int err = 0; 547 int err = 0;
532 struct audit_entry *entry; 548 struct audit_entry *entry;
533 struct audit_field *ino_f;
534 void *bufp; 549 void *bufp;
535 size_t remain = datasz - sizeof(struct audit_rule_data); 550 size_t remain = datasz - sizeof(struct audit_rule_data);
536 int i; 551 int i;
@@ -546,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
546 struct audit_field *f = &entry->rule.fields[i]; 561 struct audit_field *f = &entry->rule.fields[i];
547 562
548 err = -EINVAL; 563 err = -EINVAL;
549 if (!(data->fieldflags[i] & AUDIT_OPERATORS) || 564
550 data->fieldflags[i] & ~AUDIT_OPERATORS) 565 f->op = audit_to_op(data->fieldflags[i]);
566 if (f->op == Audit_bad)
551 goto exit_free; 567 goto exit_free;
552 568
553 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
554 f->type = data->fields[i]; 569 f->type = data->fields[i];
555 f->val = data->values[i]; 570 f->val = data->values[i];
556 f->lsm_str = NULL; 571 f->lsm_str = NULL;
@@ -662,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
662 } 677 }
663 } 678 }
664 679
665 ino_f = entry->rule.inode_f; 680 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
666 if (ino_f) { 681 entry->rule.inode_f = NULL;
667 switch(ino_f->op) {
668 case AUDIT_NOT_EQUAL:
669 entry->rule.inode_f = NULL;
670 case AUDIT_EQUAL:
671 break;
672 default:
673 err = -EINVAL;
674 goto exit_free;
675 }
676 }
677 682
678exit_nofree: 683exit_nofree:
679 return entry; 684 return entry;
@@ -713,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
713 rule->fields[i] = krule->fields[i].type; 718 rule->fields[i] = krule->fields[i].type;
714 719
715 if (krule->vers_ops == 1) { 720 if (krule->vers_ops == 1) {
716 if (krule->fields[i].op & AUDIT_NOT_EQUAL) 721 if (krule->fields[i].op == Audit_not_equal)
717 rule->fields[i] |= AUDIT_NEGATE; 722 rule->fields[i] |= AUDIT_NEGATE;
718 } else { 723 } else {
719 rule->fields[i] |= krule->fields[i].op; 724 rule->fields[i] |= audit_ops[krule->fields[i].op];
720 } 725 }
721 } 726 }
722 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; 727 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
@@ -744,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
744 struct audit_field *f = &krule->fields[i]; 749 struct audit_field *f = &krule->fields[i];
745 750
746 data->fields[i] = f->type; 751 data->fields[i] = f->type;
747 data->fieldflags[i] = f->op; 752 data->fieldflags[i] = audit_ops[f->op];
748 switch(f->type) { 753 switch(f->type) {
749 case AUDIT_SUBJ_USER: 754 case AUDIT_SUBJ_USER:
750 case AUDIT_SUBJ_ROLE: 755 case AUDIT_SUBJ_ROLE:
@@ -919,6 +924,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
919 new->action = old->action; 924 new->action = old->action;
920 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 925 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
921 new->mask[i] = old->mask[i]; 926 new->mask[i] = old->mask[i];
927 new->prio = old->prio;
922 new->buflen = old->buflen; 928 new->buflen = old->buflen;
923 new->inode_f = old->inode_f; 929 new->inode_f = old->inode_f;
924 new->watch = NULL; 930 new->watch = NULL;
@@ -987,9 +993,8 @@ static void audit_update_watch(struct audit_parent *parent,
987 993
988 /* If the update involves invalidating rules, do the inode-based 994 /* If the update involves invalidating rules, do the inode-based
989 * filtering now, so we don't omit records. */ 995 * filtering now, so we don't omit records. */
990 if (invalidating && current->audit_context && 996 if (invalidating && current->audit_context)
991 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) 997 audit_filter_inodes(current, current->audit_context);
992 audit_set_auditable(current->audit_context);
993 998
994 nwatch = audit_dupe_watch(owatch); 999 nwatch = audit_dupe_watch(owatch);
995 if (IS_ERR(nwatch)) { 1000 if (IS_ERR(nwatch)) {
@@ -1007,12 +1012,15 @@ static void audit_update_watch(struct audit_parent *parent,
1007 list_del_rcu(&oentry->list); 1012 list_del_rcu(&oentry->list);
1008 1013
1009 nentry = audit_dupe_rule(&oentry->rule, nwatch); 1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1010 if (IS_ERR(nentry)) 1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1011 audit_panic("error updating watch, removing"); 1017 audit_panic("error updating watch, removing");
1012 else { 1018 } else {
1013 int h = audit_hash_ino((u32)ino); 1019 int h = audit_hash_ino((u32)ino);
1014 list_add(&nentry->rule.rlist, &nwatch->rules); 1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1015 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1016 } 1024 }
1017 1025
1018 call_rcu(&oentry->rcu, audit_free_rule_rcu); 1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
@@ -1077,6 +1085,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1077 audit_log_end(ab); 1085 audit_log_end(ab);
1078 } 1086 }
1079 list_del(&r->rlist); 1087 list_del(&r->rlist);
1088 list_del(&r->list);
1080 list_del_rcu(&e->list); 1089 list_del_rcu(&e->list);
1081 call_rcu(&e->rcu, audit_free_rule_rcu); 1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1082 } 1091 }
@@ -1102,12 +1111,16 @@ static void audit_inotify_unregister(struct list_head *in_list)
1102/* Find an existing audit rule. 1111/* Find an existing audit rule.
1103 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1104static struct audit_entry *audit_find_rule(struct audit_entry *entry, 1113static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1105 struct list_head *list) 1114 struct list_head **p)
1106{ 1115{
1107 struct audit_entry *e, *found = NULL; 1116 struct audit_entry *e, *found = NULL;
1117 struct list_head *list;
1108 int h; 1118 int h;
1109 1119
1110 if (entry->rule.watch) { 1120 if (entry->rule.inode_f) {
1121 h = audit_hash_ino(entry->rule.inode_f->val);
1122 *p = list = &audit_inode_hash[h];
1123 } else if (entry->rule.watch) {
1111 /* we don't know the inode number, so must walk entire hash */ 1124 /* we don't know the inode number, so must walk entire hash */
1112 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { 1125 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
1113 list = &audit_inode_hash[h]; 1126 list = &audit_inode_hash[h];
@@ -1118,6 +1131,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1118 } 1131 }
1119 } 1132 }
1120 goto out; 1133 goto out;
1134 } else {
1135 *p = list = &audit_filter_list[entry->rule.listnr];
1121 } 1136 }
1122 1137
1123 list_for_each_entry(e, list, list) 1138 list_for_each_entry(e, list, list)
@@ -1258,15 +1273,17 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1258 return ret; 1273 return ret;
1259} 1274}
1260 1275
1276static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1;
1278
1261/* Add rule to given filterlist if not a duplicate. */ 1279/* Add rule to given filterlist if not a duplicate. */
1262static inline int audit_add_rule(struct audit_entry *entry, 1280static inline int audit_add_rule(struct audit_entry *entry)
1263 struct list_head *list)
1264{ 1281{
1265 struct audit_entry *e; 1282 struct audit_entry *e;
1266 struct audit_field *inode_f = entry->rule.inode_f;
1267 struct audit_watch *watch = entry->rule.watch; 1283 struct audit_watch *watch = entry->rule.watch;
1268 struct audit_tree *tree = entry->rule.tree; 1284 struct audit_tree *tree = entry->rule.tree;
1269 struct nameidata *ndp = NULL, *ndw = NULL; 1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list;
1270 int h, err; 1287 int h, err;
1271#ifdef CONFIG_AUDITSYSCALL 1288#ifdef CONFIG_AUDITSYSCALL
1272 int dont_count = 0; 1289 int dont_count = 0;
@@ -1277,13 +1294,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
1277 dont_count = 1; 1294 dont_count = 1;
1278#endif 1295#endif
1279 1296
1280 if (inode_f) {
1281 h = audit_hash_ino(inode_f->val);
1282 list = &audit_inode_hash[h];
1283 }
1284
1285 mutex_lock(&audit_filter_mutex); 1297 mutex_lock(&audit_filter_mutex);
1286 e = audit_find_rule(entry, list); 1298 e = audit_find_rule(entry, &list);
1287 mutex_unlock(&audit_filter_mutex); 1299 mutex_unlock(&audit_filter_mutex);
1288 if (e) { 1300 if (e) {
1289 err = -EEXIST; 1301 err = -EEXIST;
@@ -1319,10 +1331,22 @@ static inline int audit_add_rule(struct audit_entry *entry,
1319 } 1331 }
1320 } 1332 }
1321 1333
1334 entry->rule.prio = ~0ULL;
1335 if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
1336 if (entry->rule.flags & AUDIT_FILTER_PREPEND)
1337 entry->rule.prio = ++prio_high;
1338 else
1339 entry->rule.prio = --prio_low;
1340 }
1341
1322 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1342 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
1343 list_add(&entry->rule.list,
1344 &audit_rules_list[entry->rule.listnr]);
1323 list_add_rcu(&entry->list, list); 1345 list_add_rcu(&entry->list, list);
1324 entry->rule.flags &= ~AUDIT_FILTER_PREPEND; 1346 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
1325 } else { 1347 } else {
1348 list_add_tail(&entry->rule.list,
1349 &audit_rules_list[entry->rule.listnr]);
1326 list_add_tail_rcu(&entry->list, list); 1350 list_add_tail_rcu(&entry->list, list);
1327 } 1351 }
1328#ifdef CONFIG_AUDITSYSCALL 1352#ifdef CONFIG_AUDITSYSCALL
@@ -1345,15 +1369,14 @@ error:
1345} 1369}
1346 1370
1347/* Remove an existing rule from filterlist. */ 1371/* Remove an existing rule from filterlist. */
1348static inline int audit_del_rule(struct audit_entry *entry, 1372static inline int audit_del_rule(struct audit_entry *entry)
1349 struct list_head *list)
1350{ 1373{
1351 struct audit_entry *e; 1374 struct audit_entry *e;
1352 struct audit_field *inode_f = entry->rule.inode_f;
1353 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1354 struct audit_tree *tree = entry->rule.tree; 1376 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list;
1355 LIST_HEAD(inotify_list); 1378 LIST_HEAD(inotify_list);
1356 int h, ret = 0; 1379 int ret = 0;
1357#ifdef CONFIG_AUDITSYSCALL 1380#ifdef CONFIG_AUDITSYSCALL
1358 int dont_count = 0; 1381 int dont_count = 0;
1359 1382
@@ -1363,13 +1386,8 @@ static inline int audit_del_rule(struct audit_entry *entry,
1363 dont_count = 1; 1386 dont_count = 1;
1364#endif 1387#endif
1365 1388
1366 if (inode_f) {
1367 h = audit_hash_ino(inode_f->val);
1368 list = &audit_inode_hash[h];
1369 }
1370
1371 mutex_lock(&audit_filter_mutex); 1389 mutex_lock(&audit_filter_mutex);
1372 e = audit_find_rule(entry, list); 1390 e = audit_find_rule(entry, &list);
1373 if (!e) { 1391 if (!e) {
1374 mutex_unlock(&audit_filter_mutex); 1392 mutex_unlock(&audit_filter_mutex);
1375 ret = -ENOENT; 1393 ret = -ENOENT;
@@ -1404,6 +1422,7 @@ static inline int audit_del_rule(struct audit_entry *entry,
1404 audit_remove_tree_rule(&e->rule); 1422 audit_remove_tree_rule(&e->rule);
1405 1423
1406 list_del_rcu(&e->list); 1424 list_del_rcu(&e->list);
1425 list_del(&e->rule.list);
1407 call_rcu(&e->rcu, audit_free_rule_rcu); 1426 call_rcu(&e->rcu, audit_free_rule_rcu);
1408 1427
1409#ifdef CONFIG_AUDITSYSCALL 1428#ifdef CONFIG_AUDITSYSCALL
@@ -1432,30 +1451,16 @@ out:
1432static void audit_list(int pid, int seq, struct sk_buff_head *q) 1451static void audit_list(int pid, int seq, struct sk_buff_head *q)
1433{ 1452{
1434 struct sk_buff *skb; 1453 struct sk_buff *skb;
1435 struct audit_entry *entry; 1454 struct audit_krule *r;
1436 int i; 1455 int i;
1437 1456
1438 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1457 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1439 * iterator to sync with list writers. */ 1458 * iterator to sync with list writers. */
1440 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1459 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1441 list_for_each_entry(entry, &audit_filter_list[i], list) { 1460 list_for_each_entry(r, &audit_rules_list[i], list) {
1442 struct audit_rule *rule;
1443
1444 rule = audit_krule_to_rule(&entry->rule);
1445 if (unlikely(!rule))
1446 break;
1447 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1448 rule, sizeof(*rule));
1449 if (skb)
1450 skb_queue_tail(q, skb);
1451 kfree(rule);
1452 }
1453 }
1454 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
1455 list_for_each_entry(entry, &audit_inode_hash[i], list) {
1456 struct audit_rule *rule; 1461 struct audit_rule *rule;
1457 1462
1458 rule = audit_krule_to_rule(&entry->rule); 1463 rule = audit_krule_to_rule(r);
1459 if (unlikely(!rule)) 1464 if (unlikely(!rule))
1460 break; 1465 break;
1461 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, 1466 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
@@ -1474,30 +1479,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
1474static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 1479static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1475{ 1480{
1476 struct sk_buff *skb; 1481 struct sk_buff *skb;
1477 struct audit_entry *e; 1482 struct audit_krule *r;
1478 int i; 1483 int i;
1479 1484
1480 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1485 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1481 * iterator to sync with list writers. */ 1486 * iterator to sync with list writers. */
1482 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1487 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1483 list_for_each_entry(e, &audit_filter_list[i], list) { 1488 list_for_each_entry(r, &audit_rules_list[i], list) {
1484 struct audit_rule_data *data;
1485
1486 data = audit_krule_to_data(&e->rule);
1487 if (unlikely(!data))
1488 break;
1489 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1490 data, sizeof(*data) + data->buflen);
1491 if (skb)
1492 skb_queue_tail(q, skb);
1493 kfree(data);
1494 }
1495 }
1496 for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
1497 list_for_each_entry(e, &audit_inode_hash[i], list) {
1498 struct audit_rule_data *data; 1489 struct audit_rule_data *data;
1499 1490
1500 data = audit_krule_to_data(&e->rule); 1491 data = audit_krule_to_data(r);
1501 if (unlikely(!data)) 1492 if (unlikely(!data))
1502 break; 1493 break;
1503 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1494 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
@@ -1603,8 +1594,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1603 if (IS_ERR(entry)) 1594 if (IS_ERR(entry))
1604 return PTR_ERR(entry); 1595 return PTR_ERR(entry);
1605 1596
1606 err = audit_add_rule(entry, 1597 err = audit_add_rule(entry);
1607 &audit_filter_list[entry->rule.listnr]);
1608 audit_log_rule_change(loginuid, sessionid, sid, "add", 1598 audit_log_rule_change(loginuid, sessionid, sid, "add",
1609 &entry->rule, !err); 1599 &entry->rule, !err);
1610 1600
@@ -1620,8 +1610,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1620 if (IS_ERR(entry)) 1610 if (IS_ERR(entry))
1621 return PTR_ERR(entry); 1611 return PTR_ERR(entry);
1622 1612
1623 err = audit_del_rule(entry, 1613 err = audit_del_rule(entry);
1624 &audit_filter_list[entry->rule.listnr]);
1625 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1614 audit_log_rule_change(loginuid, sessionid, sid, "remove",
1626 &entry->rule, !err); 1615 &entry->rule, !err);
1627 1616
@@ -1634,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1634 return err; 1623 return err;
1635} 1624}
1636 1625
1637int audit_comparator(const u32 left, const u32 op, const u32 right) 1626int audit_comparator(u32 left, u32 op, u32 right)
1638{ 1627{
1639 switch (op) { 1628 switch (op) {
1640 case AUDIT_EQUAL: 1629 case Audit_equal:
1641 return (left == right); 1630 return (left == right);
1642 case AUDIT_NOT_EQUAL: 1631 case Audit_not_equal:
1643 return (left != right); 1632 return (left != right);
1644 case AUDIT_LESS_THAN: 1633 case Audit_lt:
1645 return (left < right); 1634 return (left < right);
1646 case AUDIT_LESS_THAN_OR_EQUAL: 1635 case Audit_le:
1647 return (left <= right); 1636 return (left <= right);
1648 case AUDIT_GREATER_THAN: 1637 case Audit_gt:
1649 return (left > right); 1638 return (left > right);
1650 case AUDIT_GREATER_THAN_OR_EQUAL: 1639 case Audit_ge:
1651 return (left >= right); 1640 return (left >= right);
1652 case AUDIT_BIT_MASK: 1641 case Audit_bitmask:
1653 return (left & right); 1642 return (left & right);
1654 case AUDIT_BIT_TEST: 1643 case Audit_bittest:
1655 return ((left & right) == right); 1644 return ((left & right) == right);
1645 default:
1646 BUG();
1647 return 0;
1656 } 1648 }
1657 BUG();
1658 return 0;
1659} 1649}
1660 1650
1661/* Compare given dentry name with last component in given path, 1651/* Compare given dentry name with last component in given path,
@@ -1778,6 +1768,43 @@ unlock_and_return:
1778 return result; 1768 return result;
1779} 1769}
1780 1770
1771static int update_lsm_rule(struct audit_krule *r)
1772{
1773 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1774 struct audit_entry *nentry;
1775 struct audit_watch *watch;
1776 struct audit_tree *tree;
1777 int err = 0;
1778
1779 if (!security_audit_rule_known(r))
1780 return 0;
1781
1782 watch = r->watch;
1783 tree = r->tree;
1784 nentry = audit_dupe_rule(r, watch);
1785 if (IS_ERR(nentry)) {
1786 /* save the first error encountered for the
1787 * return value */
1788 err = PTR_ERR(nentry);
1789 audit_panic("error updating LSM filters");
1790 if (watch)
1791 list_del(&r->rlist);
1792 list_del_rcu(&entry->list);
1793 list_del(&r->list);
1794 } else {
1795 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules);
1797 list_del(&r->rlist);
1798 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist);
1800 list_replace_rcu(&entry->list, &nentry->list);
1801 list_replace(&r->list, &nentry->rule.list);
1802 }
1803 call_rcu(&entry->rcu, audit_free_rule_rcu);
1804
1805 return err;
1806}
1807
1781/* This function will re-initialize the lsm_rule field of all applicable rules. 1808/* This function will re-initialize the lsm_rule field of all applicable rules.
1782 * It will traverse the filter lists serarching for rules that contain LSM 1809 * It will traverse the filter lists serarching for rules that contain LSM
1783 * specific filter fields. When such a rule is found, it is copied, the 1810 * specific filter fields. When such a rule is found, it is copied, the
@@ -1785,45 +1812,19 @@ unlock_and_return:
1785 * updated rule. */ 1812 * updated rule. */
1786int audit_update_lsm_rules(void) 1813int audit_update_lsm_rules(void)
1787{ 1814{
1788 struct audit_entry *entry, *n, *nentry; 1815 struct audit_krule *r, *n;
1789 struct audit_watch *watch;
1790 struct audit_tree *tree;
1791 int i, err = 0; 1816 int i, err = 0;
1792 1817
1793 /* audit_filter_mutex synchronizes the writers */ 1818 /* audit_filter_mutex synchronizes the writers */
1794 mutex_lock(&audit_filter_mutex); 1819 mutex_lock(&audit_filter_mutex);
1795 1820
1796 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1821 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
1797 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1822 list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
1798 if (!security_audit_rule_known(&entry->rule)) 1823 int res = update_lsm_rule(r);
1799 continue; 1824 if (!err)
1800 1825 err = res;
1801 watch = entry->rule.watch;
1802 tree = entry->rule.tree;
1803 nentry = audit_dupe_rule(&entry->rule, watch);
1804 if (IS_ERR(nentry)) {
1805 /* save the first error encountered for the
1806 * return value */
1807 if (!err)
1808 err = PTR_ERR(nentry);
1809 audit_panic("error updating LSM filters");
1810 if (watch)
1811 list_del(&entry->rule.rlist);
1812 list_del_rcu(&entry->list);
1813 } else {
1814 if (watch) {
1815 list_add(&nentry->rule.rlist,
1816 &watch->rules);
1817 list_del(&entry->rule.rlist);
1818 } else if (tree)
1819 list_replace_init(&entry->rule.rlist,
1820 &nentry->rule.rlist);
1821 list_replace_rcu(&entry->list, &nentry->list);
1822 }
1823 call_rcu(&entry->rcu, audit_free_rule_rcu);
1824 } 1826 }
1825 } 1827 }
1826
1827 mutex_unlock(&audit_filter_mutex); 1828 mutex_unlock(&audit_filter_mutex);
1828 1829
1829 return err; 1830 return err;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4819f3711973..8cbddff6c283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -124,43 +124,6 @@ struct audit_aux_data {
124/* Number of target pids per aux struct. */ 124/* Number of target pids per aux struct. */
125#define AUDIT_AUX_PIDS 16 125#define AUDIT_AUX_PIDS 16
126 126
127struct audit_aux_data_mq_open {
128 struct audit_aux_data d;
129 int oflag;
130 mode_t mode;
131 struct mq_attr attr;
132};
133
134struct audit_aux_data_mq_sendrecv {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 size_t msg_len;
138 unsigned int msg_prio;
139 struct timespec abs_timeout;
140};
141
142struct audit_aux_data_mq_notify {
143 struct audit_aux_data d;
144 mqd_t mqdes;
145 struct sigevent notification;
146};
147
148struct audit_aux_data_mq_getsetattr {
149 struct audit_aux_data d;
150 mqd_t mqdes;
151 struct mq_attr mqstat;
152};
153
154struct audit_aux_data_ipcctl {
155 struct audit_aux_data d;
156 struct ipc_perm p;
157 unsigned long qbytes;
158 uid_t uid;
159 gid_t gid;
160 mode_t mode;
161 u32 osid;
162};
163
164struct audit_aux_data_execve { 127struct audit_aux_data_execve {
165 struct audit_aux_data d; 128 struct audit_aux_data d;
166 int argc; 129 int argc;
@@ -168,23 +131,6 @@ struct audit_aux_data_execve {
168 struct mm_struct *mm; 131 struct mm_struct *mm;
169}; 132};
170 133
171struct audit_aux_data_socketcall {
172 struct audit_aux_data d;
173 int nargs;
174 unsigned long args[0];
175};
176
177struct audit_aux_data_sockaddr {
178 struct audit_aux_data d;
179 int len;
180 char a[0];
181};
182
183struct audit_aux_data_fd_pair {
184 struct audit_aux_data d;
185 int fd[2];
186};
187
188struct audit_aux_data_pids { 134struct audit_aux_data_pids {
189 struct audit_aux_data d; 135 struct audit_aux_data d;
190 pid_t target_pid[AUDIT_AUX_PIDS]; 136 pid_t target_pid[AUDIT_AUX_PIDS];
@@ -219,14 +165,14 @@ struct audit_tree_refs {
219struct audit_context { 165struct audit_context {
220 int dummy; /* must be the first element */ 166 int dummy; /* must be the first element */
221 int in_syscall; /* 1 if task is in a syscall */ 167 int in_syscall; /* 1 if task is in a syscall */
222 enum audit_state state; 168 enum audit_state state, current_state;
223 unsigned int serial; /* serial number for record */ 169 unsigned int serial; /* serial number for record */
224 struct timespec ctime; /* time of syscall entry */ 170 struct timespec ctime; /* time of syscall entry */
225 int major; /* syscall number */ 171 int major; /* syscall number */
226 unsigned long argv[4]; /* syscall arguments */ 172 unsigned long argv[4]; /* syscall arguments */
227 int return_valid; /* return code is valid */ 173 int return_valid; /* return code is valid */
228 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
229 int auditable; /* 1 if record should be written */ 175 u64 prio;
230 int name_count; 176 int name_count;
231 struct audit_names names[AUDIT_NAMES]; 177 struct audit_names names[AUDIT_NAMES];
232 char * filterkey; /* key for rule that triggered record */ 178 char * filterkey; /* key for rule that triggered record */
@@ -234,7 +180,8 @@ struct audit_context {
234 struct audit_context *previous; /* For nested syscalls */ 180 struct audit_context *previous; /* For nested syscalls */
235 struct audit_aux_data *aux; 181 struct audit_aux_data *aux;
236 struct audit_aux_data *aux_pids; 182 struct audit_aux_data *aux_pids;
237 183 struct sockaddr_storage *sockaddr;
184 size_t sockaddr_len;
238 /* Save things to print about task_struct */ 185 /* Save things to print about task_struct */
239 pid_t pid, ppid; 186 pid_t pid, ppid;
240 uid_t uid, euid, suid, fsuid; 187 uid_t uid, euid, suid, fsuid;
@@ -252,6 +199,49 @@ struct audit_context {
252 struct audit_tree_refs *trees, *first_trees; 199 struct audit_tree_refs *trees, *first_trees;
253 int tree_count; 200 int tree_count;
254 201
202 int type;
203 union {
204 struct {
205 int nargs;
206 long args[6];
207 } socketcall;
208 struct {
209 uid_t uid;
210 gid_t gid;
211 mode_t mode;
212 u32 osid;
213 int has_perm;
214 uid_t perm_uid;
215 gid_t perm_gid;
216 mode_t perm_mode;
217 unsigned long qbytes;
218 } ipc;
219 struct {
220 mqd_t mqdes;
221 struct mq_attr mqstat;
222 } mq_getsetattr;
223 struct {
224 mqd_t mqdes;
225 int sigev_signo;
226 } mq_notify;
227 struct {
228 mqd_t mqdes;
229 size_t msg_len;
230 unsigned int msg_prio;
231 struct timespec abs_timeout;
232 } mq_sendrecv;
233 struct {
234 int oflag;
235 mode_t mode;
236 struct mq_attr attr;
237 } mq_open;
238 struct {
239 pid_t pid;
240 struct audit_cap_data cap;
241 } capset;
242 };
243 int fds[2];
244
255#if AUDIT_DEBUG 245#if AUDIT_DEBUG
256 int put_count; 246 int put_count;
257 int ino_count; 247 int ino_count;
@@ -608,19 +598,12 @@ static int audit_filter_rules(struct task_struct *tsk,
608 } 598 }
609 } 599 }
610 /* Find ipc objects that match */ 600 /* Find ipc objects that match */
611 if (ctx) { 601 if (!ctx || ctx->type != AUDIT_IPC)
612 struct audit_aux_data *aux; 602 break;
613 for (aux = ctx->aux; aux; 603 if (security_audit_rule_match(ctx->ipc.osid,
614 aux = aux->next) { 604 f->type, f->op,
615 if (aux->type == AUDIT_IPC) { 605 f->lsm_rule, ctx))
616 struct audit_aux_data_ipcctl *axi = (void *)aux; 606 ++result;
617 if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
618 ++result;
619 break;
620 }
621 }
622 }
623 }
624 } 607 }
625 break; 608 break;
626 case AUDIT_ARG0: 609 case AUDIT_ARG0:
@@ -647,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk,
647 return 0; 630 return 0;
648 } 631 }
649 } 632 }
650 if (rule->filterkey && ctx) 633
651 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 634 if (ctx) {
635 if (rule->prio <= ctx->prio)
636 return 0;
637 if (rule->filterkey) {
638 kfree(ctx->filterkey);
639 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
640 }
641 ctx->prio = rule->prio;
642 }
652 switch (rule->action) { 643 switch (rule->action) {
653 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 644 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
654 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 645 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
@@ -661,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk,
661 * completely disabled for this task. Since we only have the task 652 * completely disabled for this task. Since we only have the task
662 * structure at this point, we can only check uid and gid. 653 * structure at this point, we can only check uid and gid.
663 */ 654 */
664static enum audit_state audit_filter_task(struct task_struct *tsk) 655static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
665{ 656{
666 struct audit_entry *e; 657 struct audit_entry *e;
667 enum audit_state state; 658 enum audit_state state;
@@ -669,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
669 rcu_read_lock(); 660 rcu_read_lock();
670 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 661 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
671 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 662 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
663 if (state == AUDIT_RECORD_CONTEXT)
664 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
672 rcu_read_unlock(); 665 rcu_read_unlock();
673 return state; 666 return state;
674 } 667 }
@@ -702,6 +695,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
702 audit_filter_rules(tsk, &e->rule, ctx, NULL, 695 audit_filter_rules(tsk, &e->rule, ctx, NULL,
703 &state)) { 696 &state)) {
704 rcu_read_unlock(); 697 rcu_read_unlock();
698 ctx->current_state = state;
705 return state; 699 return state;
706 } 700 }
707 } 701 }
@@ -715,15 +709,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
715 * buckets applicable to the inode numbers in audit_names[]. 709 * buckets applicable to the inode numbers in audit_names[].
716 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 710 * Regarding audit_state, same rules apply as for audit_filter_syscall().
717 */ 711 */
718enum audit_state audit_filter_inodes(struct task_struct *tsk, 712void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
719 struct audit_context *ctx)
720{ 713{
721 int i; 714 int i;
722 struct audit_entry *e; 715 struct audit_entry *e;
723 enum audit_state state; 716 enum audit_state state;
724 717
725 if (audit_pid && tsk->tgid == audit_pid) 718 if (audit_pid && tsk->tgid == audit_pid)
726 return AUDIT_DISABLED; 719 return;
727 720
728 rcu_read_lock(); 721 rcu_read_lock();
729 for (i = 0; i < ctx->name_count; i++) { 722 for (i = 0; i < ctx->name_count; i++) {
@@ -740,17 +733,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk,
740 if ((e->rule.mask[word] & bit) == bit && 733 if ((e->rule.mask[word] & bit) == bit &&
741 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 734 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
742 rcu_read_unlock(); 735 rcu_read_unlock();
743 return state; 736 ctx->current_state = state;
737 return;
744 } 738 }
745 } 739 }
746 } 740 }
747 rcu_read_unlock(); 741 rcu_read_unlock();
748 return AUDIT_BUILD_CONTEXT;
749} 742}
750 743
751void audit_set_auditable(struct audit_context *ctx) 744static void audit_set_auditable(struct audit_context *ctx)
752{ 745{
753 ctx->auditable = 1; 746 if (!ctx->prio) {
747 ctx->prio = 1;
748 ctx->current_state = AUDIT_RECORD_CONTEXT;
749 }
754} 750}
755 751
756static inline struct audit_context *audit_get_context(struct task_struct *tsk, 752static inline struct audit_context *audit_get_context(struct task_struct *tsk,
@@ -781,23 +777,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
781 else 777 else
782 context->return_code = return_code; 778 context->return_code = return_code;
783 779
784 if (context->in_syscall && !context->dummy && !context->auditable) { 780 if (context->in_syscall && !context->dummy) {
785 enum audit_state state; 781 audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
786 782 audit_filter_inodes(tsk, context);
787 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
788 if (state == AUDIT_RECORD_CONTEXT) {
789 context->auditable = 1;
790 goto get_context;
791 }
792
793 state = audit_filter_inodes(tsk, context);
794 if (state == AUDIT_RECORD_CONTEXT)
795 context->auditable = 1;
796
797 } 783 }
798 784
799get_context:
800
801 tsk->audit_context = NULL; 785 tsk->audit_context = NULL;
802 return context; 786 return context;
803} 787}
@@ -807,8 +791,7 @@ static inline void audit_free_names(struct audit_context *context)
807 int i; 791 int i;
808 792
809#if AUDIT_DEBUG == 2 793#if AUDIT_DEBUG == 2
810 if (context->auditable 794 if (context->put_count + context->ino_count != context->name_count) {
811 ||context->put_count + context->ino_count != context->name_count) {
812 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" 795 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
813 " name_count=%d put_count=%d" 796 " name_count=%d put_count=%d"
814 " ino_count=%d [NOT freeing]\n", 797 " ino_count=%d [NOT freeing]\n",
@@ -859,6 +842,7 @@ static inline void audit_zero_context(struct audit_context *context,
859{ 842{
860 memset(context, 0, sizeof(*context)); 843 memset(context, 0, sizeof(*context));
861 context->state = state; 844 context->state = state;
845 context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
862} 846}
863 847
864static inline struct audit_context *audit_alloc_context(enum audit_state state) 848static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -884,18 +868,21 @@ int audit_alloc(struct task_struct *tsk)
884{ 868{
885 struct audit_context *context; 869 struct audit_context *context;
886 enum audit_state state; 870 enum audit_state state;
871 char *key = NULL;
887 872
888 if (likely(!audit_ever_enabled)) 873 if (likely(!audit_ever_enabled))
889 return 0; /* Return if not auditing. */ 874 return 0; /* Return if not auditing. */
890 875
891 state = audit_filter_task(tsk); 876 state = audit_filter_task(tsk, &key);
892 if (likely(state == AUDIT_DISABLED)) 877 if (likely(state == AUDIT_DISABLED))
893 return 0; 878 return 0;
894 879
895 if (!(context = audit_alloc_context(state))) { 880 if (!(context = audit_alloc_context(state))) {
881 kfree(key);
896 audit_log_lost("out of memory in audit_alloc"); 882 audit_log_lost("out of memory in audit_alloc");
897 return -ENOMEM; 883 return -ENOMEM;
898 } 884 }
885 context->filterkey = key;
899 886
900 tsk->audit_context = context; 887 tsk->audit_context = context;
901 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); 888 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
@@ -921,6 +908,7 @@ static inline void audit_free_context(struct audit_context *context)
921 free_tree_refs(context); 908 free_tree_refs(context);
922 audit_free_aux(context); 909 audit_free_aux(context);
923 kfree(context->filterkey); 910 kfree(context->filterkey);
911 kfree(context->sockaddr);
924 kfree(context); 912 kfree(context);
925 context = previous; 913 context = previous;
926 } while (context); 914 } while (context);
@@ -1230,6 +1218,97 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1230 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); 1218 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1231} 1219}
1232 1220
1221static void show_special(struct audit_context *context, int *call_panic)
1222{
1223 struct audit_buffer *ab;
1224 int i;
1225
1226 ab = audit_log_start(context, GFP_KERNEL, context->type);
1227 if (!ab)
1228 return;
1229
1230 switch (context->type) {
1231 case AUDIT_SOCKETCALL: {
1232 int nargs = context->socketcall.nargs;
1233 audit_log_format(ab, "nargs=%d", nargs);
1234 for (i = 0; i < nargs; i++)
1235 audit_log_format(ab, " a%d=%lx", i,
1236 context->socketcall.args[i]);
1237 break; }
1238 case AUDIT_IPC: {
1239 u32 osid = context->ipc.osid;
1240
1241 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
1242 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1243 if (osid) {
1244 char *ctx = NULL;
1245 u32 len;
1246 if (security_secid_to_secctx(osid, &ctx, &len)) {
1247 audit_log_format(ab, " osid=%u", osid);
1248 *call_panic = 1;
1249 } else {
1250 audit_log_format(ab, " obj=%s", ctx);
1251 security_release_secctx(ctx, len);
1252 }
1253 }
1254 if (context->ipc.has_perm) {
1255 audit_log_end(ab);
1256 ab = audit_log_start(context, GFP_KERNEL,
1257 AUDIT_IPC_SET_PERM);
1258 audit_log_format(ab,
1259 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1260 context->ipc.qbytes,
1261 context->ipc.perm_uid,
1262 context->ipc.perm_gid,
1263 context->ipc.perm_mode);
1264 if (!ab)
1265 return;
1266 }
1267 break; }
1268 case AUDIT_MQ_OPEN: {
1269 audit_log_format(ab,
1270 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1271 "mq_msgsize=%ld mq_curmsgs=%ld",
1272 context->mq_open.oflag, context->mq_open.mode,
1273 context->mq_open.attr.mq_flags,
1274 context->mq_open.attr.mq_maxmsg,
1275 context->mq_open.attr.mq_msgsize,
1276 context->mq_open.attr.mq_curmsgs);
1277 break; }
1278 case AUDIT_MQ_SENDRECV: {
1279 audit_log_format(ab,
1280 "mqdes=%d msg_len=%zd msg_prio=%u "
1281 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1282 context->mq_sendrecv.mqdes,
1283 context->mq_sendrecv.msg_len,
1284 context->mq_sendrecv.msg_prio,
1285 context->mq_sendrecv.abs_timeout.tv_sec,
1286 context->mq_sendrecv.abs_timeout.tv_nsec);
1287 break; }
1288 case AUDIT_MQ_NOTIFY: {
1289 audit_log_format(ab, "mqdes=%d sigev_signo=%d",
1290 context->mq_notify.mqdes,
1291 context->mq_notify.sigev_signo);
1292 break; }
1293 case AUDIT_MQ_GETSETATTR: {
1294 struct mq_attr *attr = &context->mq_getsetattr.mqstat;
1295 audit_log_format(ab,
1296 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1297 "mq_curmsgs=%ld ",
1298 context->mq_getsetattr.mqdes,
1299 attr->mq_flags, attr->mq_maxmsg,
1300 attr->mq_msgsize, attr->mq_curmsgs);
1301 break; }
1302 case AUDIT_CAPSET: {
1303 audit_log_format(ab, "pid=%d", context->capset.pid);
1304 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; }
1308 }
1309 audit_log_end(ab);
1310}
1311
1233static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1312static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1234{ 1313{
1235 const struct cred *cred; 1314 const struct cred *cred;
@@ -1307,94 +1386,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1307 continue; /* audit_panic has been called */ 1386 continue; /* audit_panic has been called */
1308 1387
1309 switch (aux->type) { 1388 switch (aux->type) {
1310 case AUDIT_MQ_OPEN: {
1311 struct audit_aux_data_mq_open *axi = (void *)aux;
1312 audit_log_format(ab,
1313 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1314 "mq_msgsize=%ld mq_curmsgs=%ld",
1315 axi->oflag, axi->mode, axi->attr.mq_flags,
1316 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
1317 axi->attr.mq_curmsgs);
1318 break; }
1319
1320 case AUDIT_MQ_SENDRECV: {
1321 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
1322 audit_log_format(ab,
1323 "mqdes=%d msg_len=%zd msg_prio=%u "
1324 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1325 axi->mqdes, axi->msg_len, axi->msg_prio,
1326 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
1327 break; }
1328
1329 case AUDIT_MQ_NOTIFY: {
1330 struct audit_aux_data_mq_notify *axi = (void *)aux;
1331 audit_log_format(ab,
1332 "mqdes=%d sigev_signo=%d",
1333 axi->mqdes,
1334 axi->notification.sigev_signo);
1335 break; }
1336
1337 case AUDIT_MQ_GETSETATTR: {
1338 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
1339 audit_log_format(ab,
1340 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1341 "mq_curmsgs=%ld ",
1342 axi->mqdes,
1343 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
1344 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
1345 break; }
1346
1347 case AUDIT_IPC: {
1348 struct audit_aux_data_ipcctl *axi = (void *)aux;
1349 audit_log_format(ab,
1350 "ouid=%u ogid=%u mode=%#o",
1351 axi->uid, axi->gid, axi->mode);
1352 if (axi->osid != 0) {
1353 char *ctx = NULL;
1354 u32 len;
1355 if (security_secid_to_secctx(
1356 axi->osid, &ctx, &len)) {
1357 audit_log_format(ab, " osid=%u",
1358 axi->osid);
1359 call_panic = 1;
1360 } else {
1361 audit_log_format(ab, " obj=%s", ctx);
1362 security_release_secctx(ctx, len);
1363 }
1364 }
1365 break; }
1366
1367 case AUDIT_IPC_SET_PERM: {
1368 struct audit_aux_data_ipcctl *axi = (void *)aux;
1369 audit_log_format(ab,
1370 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1371 axi->qbytes, axi->uid, axi->gid, axi->mode);
1372 break; }
1373 1389
1374 case AUDIT_EXECVE: { 1390 case AUDIT_EXECVE: {
1375 struct audit_aux_data_execve *axi = (void *)aux; 1391 struct audit_aux_data_execve *axi = (void *)aux;
1376 audit_log_execve_info(context, &ab, axi); 1392 audit_log_execve_info(context, &ab, axi);
1377 break; } 1393 break; }
1378 1394
1379 case AUDIT_SOCKETCALL: {
1380 struct audit_aux_data_socketcall *axs = (void *)aux;
1381 audit_log_format(ab, "nargs=%d", axs->nargs);
1382 for (i=0; i<axs->nargs; i++)
1383 audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
1384 break; }
1385
1386 case AUDIT_SOCKADDR: {
1387 struct audit_aux_data_sockaddr *axs = (void *)aux;
1388
1389 audit_log_format(ab, "saddr=");
1390 audit_log_n_hex(ab, axs->a, axs->len);
1391 break; }
1392
1393 case AUDIT_FD_PAIR: {
1394 struct audit_aux_data_fd_pair *axs = (void *)aux;
1395 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
1396 break; }
1397
1398 case AUDIT_BPRM_FCAPS: { 1395 case AUDIT_BPRM_FCAPS: {
1399 struct audit_aux_data_bprm_fcaps *axs = (void *)aux; 1396 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1400 audit_log_format(ab, "fver=%x", axs->fcap_ver); 1397 audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1409,18 +1406,32 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1409 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); 1406 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1410 break; } 1407 break; }
1411 1408
1412 case AUDIT_CAPSET: {
1413 struct audit_aux_data_capset *axs = (void *)aux;
1414 audit_log_format(ab, "pid=%d", axs->pid);
1415 audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
1416 audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
1417 audit_log_cap(ab, "cap_pe", &axs->cap.effective);
1418 break; }
1419
1420 } 1409 }
1421 audit_log_end(ab); 1410 audit_log_end(ab);
1422 } 1411 }
1423 1412
1413 if (context->type)
1414 show_special(context, &call_panic);
1415
1416 if (context->fds[0] >= 0) {
1417 ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
1418 if (ab) {
1419 audit_log_format(ab, "fd0=%d fd1=%d",
1420 context->fds[0], context->fds[1]);
1421 audit_log_end(ab);
1422 }
1423 }
1424
1425 if (context->sockaddr_len) {
1426 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
1427 if (ab) {
1428 audit_log_format(ab, "saddr=");
1429 audit_log_n_hex(ab, (void *)context->sockaddr,
1430 context->sockaddr_len);
1431 audit_log_end(ab);
1432 }
1433 }
1434
1424 for (aux = context->aux_pids; aux; aux = aux->next) { 1435 for (aux = context->aux_pids; aux; aux = aux->next) {
1425 struct audit_aux_data_pids *axs = (void *)aux; 1436 struct audit_aux_data_pids *axs = (void *)aux;
1426 1437
@@ -1536,7 +1547,7 @@ void audit_free(struct task_struct *tsk)
1536 * We use GFP_ATOMIC here because we might be doing this 1547 * We use GFP_ATOMIC here because we might be doing this
1537 * in the context of the idle thread */ 1548 * in the context of the idle thread */
1538 /* that can happen only if we are called from do_exit() */ 1549 /* that can happen only if we are called from do_exit() */
1539 if (context->in_syscall && context->auditable) 1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1540 audit_log_exit(context, tsk); 1551 audit_log_exit(context, tsk);
1541 1552
1542 audit_free_context(context); 1553 audit_free_context(context);
@@ -1620,15 +1631,17 @@ void audit_syscall_entry(int arch, int major,
1620 1631
1621 state = context->state; 1632 state = context->state;
1622 context->dummy = !audit_n_rules; 1633 context->dummy = !audit_n_rules;
1623 if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) 1634 if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
1635 context->prio = 0;
1624 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1636 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1637 }
1625 if (likely(state == AUDIT_DISABLED)) 1638 if (likely(state == AUDIT_DISABLED))
1626 return; 1639 return;
1627 1640
1628 context->serial = 0; 1641 context->serial = 0;
1629 context->ctime = CURRENT_TIME; 1642 context->ctime = CURRENT_TIME;
1630 context->in_syscall = 1; 1643 context->in_syscall = 1;
1631 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 1644 context->current_state = state;
1632 context->ppid = 0; 1645 context->ppid = 0;
1633} 1646}
1634 1647
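This series replaces the single auditable flag with a (current_state, prio) pair: a decision only sticks if it arrives with a higher priority than whatever already claimed the context, auditsc_get_stamp() claims it with prio 1 only when nothing else has, and ~0ULL at syscall exit pins a permanently-auditable task. A minimal userspace model of that rule, for illustration only; the struct/function names here are invented and the real enum audit_state also has AUDIT_SETUP_CONTEXT:

#include <stdio.h>

enum audit_state { AUDIT_DISABLED, AUDIT_BUILD_CONTEXT, AUDIT_RECORD_CONTEXT };

struct ctx_model {
	enum audit_state current_state;
	unsigned long long prio;	/* priority of the decision stored so far */
};

/* A new decision only overrides the stored one if its priority is higher. */
static void apply_decision(struct ctx_model *ctx, enum audit_state state,
			   unsigned long long prio)
{
	if (prio <= ctx->prio)
		return;
	ctx->prio = prio;
	ctx->current_state = state;
}

int main(void)
{
	struct ctx_model ctx = { AUDIT_BUILD_CONTEXT, 0 };

	apply_decision(&ctx, AUDIT_RECORD_CONTEXT, 10);	/* matching rule wins */
	apply_decision(&ctx, AUDIT_DISABLED, 5);	/* lower priority: ignored */
	printf("state=%d prio=%llu\n", ctx.current_state, ctx.prio);
	return 0;
}
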
@@ -1636,17 +1649,20 @@ void audit_finish_fork(struct task_struct *child)
1636{ 1649{
1637 struct audit_context *ctx = current->audit_context; 1650 struct audit_context *ctx = current->audit_context;
1638 struct audit_context *p = child->audit_context; 1651 struct audit_context *p = child->audit_context;
1639 if (!p || !ctx || !ctx->auditable) 1652 if (!p || !ctx)
1653 return;
1654 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1640 return; 1655 return;
1641 p->arch = ctx->arch; 1656 p->arch = ctx->arch;
1642 p->major = ctx->major; 1657 p->major = ctx->major;
1643 memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); 1658 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1644 p->ctime = ctx->ctime; 1659 p->ctime = ctx->ctime;
1645 p->dummy = ctx->dummy; 1660 p->dummy = ctx->dummy;
1646 p->auditable = ctx->auditable;
1647 p->in_syscall = ctx->in_syscall; 1661 p->in_syscall = ctx->in_syscall;
1648 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); 1662 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1649 p->ppid = current->pid; 1663 p->ppid = current->pid;
1664 p->prio = ctx->prio;
1665 p->current_state = ctx->current_state;
1650} 1666}
1651 1667
1652/** 1668/**
@@ -1670,11 +1686,11 @@ void audit_syscall_exit(int valid, long return_code)
1670 if (likely(!context)) 1686 if (likely(!context))
1671 return; 1687 return;
1672 1688
1673 if (context->in_syscall && context->auditable) 1689 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1674 audit_log_exit(context, tsk); 1690 audit_log_exit(context, tsk);
1675 1691
1676 context->in_syscall = 0; 1692 context->in_syscall = 0;
1677 context->auditable = 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1678 1694
1679 if (context->previous) { 1695 if (context->previous) {
1680 struct audit_context *new_context = context->previous; 1696 struct audit_context *new_context = context->previous;
@@ -1689,8 +1705,13 @@ void audit_syscall_exit(int valid, long return_code)
1689 context->aux_pids = NULL; 1705 context->aux_pids = NULL;
1690 context->target_pid = 0; 1706 context->target_pid = 0;
1691 context->target_sid = 0; 1707 context->target_sid = 0;
1692 kfree(context->filterkey); 1708 context->sockaddr_len = 0;
1693 context->filterkey = NULL; 1709 context->type = 0;
1710 context->fds[0] = -1;
1711 if (context->state != AUDIT_RECORD_CONTEXT) {
1712 kfree(context->filterkey);
1713 context->filterkey = NULL;
1714 }
1694 tsk->audit_context = context; 1715 tsk->audit_context = context;
1695 } 1716 }
1696} 1717}
@@ -2081,7 +2102,10 @@ int auditsc_get_stamp(struct audit_context *ctx,
2081 t->tv_sec = ctx->ctime.tv_sec; 2102 t->tv_sec = ctx->ctime.tv_sec;
2082 t->tv_nsec = ctx->ctime.tv_nsec; 2103 t->tv_nsec = ctx->ctime.tv_nsec;
2083 *serial = ctx->serial; 2104 *serial = ctx->serial;
2084 ctx->auditable = 1; 2105 if (!ctx->prio) {
2106 ctx->prio = 1;
2107 ctx->current_state = AUDIT_RECORD_CONTEXT;
2108 }
2085 return 1; 2109 return 1;
2086} 2110}
2087 2111
@@ -2127,132 +2151,46 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2127 * @mode: mode bits 2151 * @mode: mode bits
2128 * @u_attr: queue attributes 2152 * @u_attr: queue attributes
2129 * 2153 *
2130 * Returns 0 for success or NULL context or < 0 on error.
2131 */ 2154 */
2132int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) 2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
2133{ 2156{
2134 struct audit_aux_data_mq_open *ax;
2135 struct audit_context *context = current->audit_context; 2157 struct audit_context *context = current->audit_context;
2136 2158
2137 if (!audit_enabled) 2159 if (attr)
2138 return 0; 2160 memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
2139 2161 else
2140 if (likely(!context)) 2162 memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
2141 return 0;
2142
2143 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2144 if (!ax)
2145 return -ENOMEM;
2146
2147 if (u_attr != NULL) {
2148 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
2149 kfree(ax);
2150 return -EFAULT;
2151 }
2152 } else
2153 memset(&ax->attr, 0, sizeof(ax->attr));
2154 2163
2155 ax->oflag = oflag; 2164 context->mq_open.oflag = oflag;
2156 ax->mode = mode; 2165 context->mq_open.mode = mode;
2157 2166
2158 ax->d.type = AUDIT_MQ_OPEN; 2167 context->type = AUDIT_MQ_OPEN;
2159 ax->d.next = context->aux;
2160 context->aux = (void *)ax;
2161 return 0;
2162} 2168}
2163 2169
2164/** 2170/**
2165 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send 2171 * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
2166 * @mqdes: MQ descriptor 2172 * @mqdes: MQ descriptor
2167 * @msg_len: Message length 2173 * @msg_len: Message length
2168 * @msg_prio: Message priority 2174 * @msg_prio: Message priority
2169 * @u_abs_timeout: Message timeout in absolute time 2175 * @abs_timeout: Message timeout in absolute time
2170 * 2176 *
2171 * Returns 0 for success or NULL context or < 0 on error.
2172 */ 2177 */
2173int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, 2178void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
2174 const struct timespec __user *u_abs_timeout) 2179 const struct timespec *abs_timeout)
2175{ 2180{
2176 struct audit_aux_data_mq_sendrecv *ax;
2177 struct audit_context *context = current->audit_context; 2181 struct audit_context *context = current->audit_context;
2182 struct timespec *p = &context->mq_sendrecv.abs_timeout;
2178 2183
2179 if (!audit_enabled) 2184 if (abs_timeout)
2180 return 0; 2185 memcpy(p, abs_timeout, sizeof(struct timespec));
2181 2186 else
2182 if (likely(!context)) 2187 memset(p, 0, sizeof(struct timespec));
2183 return 0;
2184
2185 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2186 if (!ax)
2187 return -ENOMEM;
2188
2189 if (u_abs_timeout != NULL) {
2190 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2191 kfree(ax);
2192 return -EFAULT;
2193 }
2194 } else
2195 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2196
2197 ax->mqdes = mqdes;
2198 ax->msg_len = msg_len;
2199 ax->msg_prio = msg_prio;
2200
2201 ax->d.type = AUDIT_MQ_SENDRECV;
2202 ax->d.next = context->aux;
2203 context->aux = (void *)ax;
2204 return 0;
2205}
2206
2207/**
2208 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
2209 * @mqdes: MQ descriptor
2210 * @msg_len: Message length
2211 * @u_msg_prio: Message priority
2212 * @u_abs_timeout: Message timeout in absolute time
2213 *
2214 * Returns 0 for success or NULL context or < 0 on error.
2215 */
2216int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2217 unsigned int __user *u_msg_prio,
2218 const struct timespec __user *u_abs_timeout)
2219{
2220 struct audit_aux_data_mq_sendrecv *ax;
2221 struct audit_context *context = current->audit_context;
2222
2223 if (!audit_enabled)
2224 return 0;
2225
2226 if (likely(!context))
2227 return 0;
2228
2229 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2230 if (!ax)
2231 return -ENOMEM;
2232
2233 if (u_msg_prio != NULL) {
2234 if (get_user(ax->msg_prio, u_msg_prio)) {
2235 kfree(ax);
2236 return -EFAULT;
2237 }
2238 } else
2239 ax->msg_prio = 0;
2240
2241 if (u_abs_timeout != NULL) {
2242 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2243 kfree(ax);
2244 return -EFAULT;
2245 }
2246 } else
2247 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2248 2188
2249 ax->mqdes = mqdes; 2189 context->mq_sendrecv.mqdes = mqdes;
2250 ax->msg_len = msg_len; 2190 context->mq_sendrecv.msg_len = msg_len;
2191 context->mq_sendrecv.msg_prio = msg_prio;
2251 2192
2252 ax->d.type = AUDIT_MQ_SENDRECV; 2193 context->type = AUDIT_MQ_SENDRECV;
2253 ax->d.next = context->aux;
2254 context->aux = (void *)ax;
2255 return 0;
2256} 2194}
2257 2195
2258/** 2196/**
@@ -2260,38 +2198,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2260 * @mqdes: MQ descriptor 2198 * @mqdes: MQ descriptor
2261 * @u_notification: Notification event 2199 * @u_notification: Notification event
2262 * 2200 *
2263 * Returns 0 for success or NULL context or < 0 on error.
2264 */ 2201 */
2265 2202
2266int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) 2203void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
2267{ 2204{
2268 struct audit_aux_data_mq_notify *ax;
2269 struct audit_context *context = current->audit_context; 2205 struct audit_context *context = current->audit_context;
2270 2206
2271 if (!audit_enabled) 2207 if (notification)
2272 return 0; 2208 context->mq_notify.sigev_signo = notification->sigev_signo;
2273 2209 else
2274 if (likely(!context)) 2210 context->mq_notify.sigev_signo = 0;
2275 return 0;
2276
2277 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2278 if (!ax)
2279 return -ENOMEM;
2280
2281 if (u_notification != NULL) {
2282 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
2283 kfree(ax);
2284 return -EFAULT;
2285 }
2286 } else
2287 memset(&ax->notification, 0, sizeof(ax->notification));
2288
2289 ax->mqdes = mqdes;
2290 2211
2291 ax->d.type = AUDIT_MQ_NOTIFY; 2212 context->mq_notify.mqdes = mqdes;
2292 ax->d.next = context->aux; 2213 context->type = AUDIT_MQ_NOTIFY;
2293 context->aux = (void *)ax;
2294 return 0;
2295} 2214}
2296 2215
2297/** 2216/**
@@ -2299,55 +2218,29 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
2299 * @mqdes: MQ descriptor 2218 * @mqdes: MQ descriptor
2300 * @mqstat: MQ flags 2219 * @mqstat: MQ flags
2301 * 2220 *
2302 * Returns 0 for success or NULL context or < 0 on error.
2303 */ 2221 */
2304int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) 2222void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
2305{ 2223{
2306 struct audit_aux_data_mq_getsetattr *ax;
2307 struct audit_context *context = current->audit_context; 2224 struct audit_context *context = current->audit_context;
2308 2225 context->mq_getsetattr.mqdes = mqdes;
2309 if (!audit_enabled) 2226 context->mq_getsetattr.mqstat = *mqstat;
2310 return 0; 2227 context->type = AUDIT_MQ_GETSETATTR;
2311
2312 if (likely(!context))
2313 return 0;
2314
2315 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2316 if (!ax)
2317 return -ENOMEM;
2318
2319 ax->mqdes = mqdes;
2320 ax->mqstat = *mqstat;
2321
2322 ax->d.type = AUDIT_MQ_GETSETATTR;
2323 ax->d.next = context->aux;
2324 context->aux = (void *)ax;
2325 return 0;
2326} 2228}
2327 2229
2328/** 2230/**
2329 * audit_ipc_obj - record audit data for ipc object 2231 * audit_ipc_obj - record audit data for ipc object
2330 * @ipcp: ipc permissions 2232 * @ipcp: ipc permissions
2331 * 2233 *
2332 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2234 */
2334int __audit_ipc_obj(struct kern_ipc_perm *ipcp) 2235void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2335{ 2236{
2336 struct audit_aux_data_ipcctl *ax;
2337 struct audit_context *context = current->audit_context; 2237 struct audit_context *context = current->audit_context;
2338 2238 context->ipc.uid = ipcp->uid;
2339 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2239 context->ipc.gid = ipcp->gid;
2340 if (!ax) 2240 context->ipc.mode = ipcp->mode;
2341 return -ENOMEM; 2241 context->ipc.has_perm = 0;
2342 2242 security_ipc_getsecid(ipcp, &context->ipc.osid);
2343 ax->uid = ipcp->uid; 2243 context->type = AUDIT_IPC;
2344 ax->gid = ipcp->gid;
2345 ax->mode = ipcp->mode;
2346 security_ipc_getsecid(ipcp, &ax->osid);
2347 ax->d.type = AUDIT_IPC;
2348 ax->d.next = context->aux;
2349 context->aux = (void *)ax;
2350 return 0;
2351} 2244}
2352 2245
2353/** 2246/**
@@ -2357,26 +2250,17 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2357 * @gid: msgq group id 2250 * @gid: msgq group id
2358 * @mode: msgq mode (permissions) 2251 * @mode: msgq mode (permissions)
2359 * 2252 *
2360 * Returns 0 for success or NULL context or < 0 on error. 2253 * Called only after audit_ipc_obj().
2361 */ 2254 */
2362int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2255void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
2363{ 2256{
2364 struct audit_aux_data_ipcctl *ax;
2365 struct audit_context *context = current->audit_context; 2257 struct audit_context *context = current->audit_context;
2366 2258
2367 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2259 context->ipc.qbytes = qbytes;
2368 if (!ax) 2260 context->ipc.perm_uid = uid;
2369 return -ENOMEM; 2261 context->ipc.perm_gid = gid;
2370 2262 context->ipc.perm_mode = mode;
2371 ax->qbytes = qbytes; 2263 context->ipc.has_perm = 1;
2372 ax->uid = uid;
2373 ax->gid = gid;
2374 ax->mode = mode;
2375
2376 ax->d.type = AUDIT_IPC_SET_PERM;
2377 ax->d.next = context->aux;
2378 context->aux = (void *)ax;
2379 return 0;
2380} 2264}
2381 2265
2382int audit_bprm(struct linux_binprm *bprm) 2266int audit_bprm(struct linux_binprm *bprm)
@@ -2406,27 +2290,17 @@ int audit_bprm(struct linux_binprm *bprm)
2406 * @nargs: number of args 2290 * @nargs: number of args
2407 * @args: args array 2291 * @args: args array
2408 * 2292 *
2409 * Returns 0 for success or NULL context or < 0 on error.
2410 */ 2293 */
2411int audit_socketcall(int nargs, unsigned long *args) 2294void audit_socketcall(int nargs, unsigned long *args)
2412{ 2295{
2413 struct audit_aux_data_socketcall *ax;
2414 struct audit_context *context = current->audit_context; 2296 struct audit_context *context = current->audit_context;
2415 2297
2416 if (likely(!context || context->dummy)) 2298 if (likely(!context || context->dummy))
2417 return 0; 2299 return;
2418
2419 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
2420 if (!ax)
2421 return -ENOMEM;
2422
2423 ax->nargs = nargs;
2424 memcpy(ax->args, args, nargs * sizeof(unsigned long));
2425 2300
2426 ax->d.type = AUDIT_SOCKETCALL; 2301 context->type = AUDIT_SOCKETCALL;
2427 ax->d.next = context->aux; 2302 context->socketcall.nargs = nargs;
2428 context->aux = (void *)ax; 2303 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
2429 return 0;
2430} 2304}
2431 2305
2432/** 2306/**
@@ -2434,29 +2308,12 @@ int audit_socketcall(int nargs, unsigned long *args)
2434 * @fd1: the first file descriptor 2308 * @fd1: the first file descriptor
2435 * @fd2: the second file descriptor 2309 * @fd2: the second file descriptor
2436 * 2310 *
2437 * Returns 0 for success or NULL context or < 0 on error.
2438 */ 2311 */
2439int __audit_fd_pair(int fd1, int fd2) 2312void __audit_fd_pair(int fd1, int fd2)
2440{ 2313{
2441 struct audit_context *context = current->audit_context; 2314 struct audit_context *context = current->audit_context;
2442 struct audit_aux_data_fd_pair *ax; 2315 context->fds[0] = fd1;
2443 2316 context->fds[1] = fd2;
2444 if (likely(!context)) {
2445 return 0;
2446 }
2447
2448 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2449 if (!ax) {
2450 return -ENOMEM;
2451 }
2452
2453 ax->fd[0] = fd1;
2454 ax->fd[1] = fd2;
2455
2456 ax->d.type = AUDIT_FD_PAIR;
2457 ax->d.next = context->aux;
2458 context->aux = (void *)ax;
2459 return 0;
2460} 2317}
2461 2318
2462/** 2319/**
@@ -2468,22 +2325,20 @@ int __audit_fd_pair(int fd1, int fd2)
2468 */ 2325 */
2469int audit_sockaddr(int len, void *a) 2326int audit_sockaddr(int len, void *a)
2470{ 2327{
2471 struct audit_aux_data_sockaddr *ax;
2472 struct audit_context *context = current->audit_context; 2328 struct audit_context *context = current->audit_context;
2473 2329
2474 if (likely(!context || context->dummy)) 2330 if (likely(!context || context->dummy))
2475 return 0; 2331 return 0;
2476 2332
2477 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); 2333 if (!context->sockaddr) {
2478 if (!ax) 2334 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2479 return -ENOMEM; 2335 if (!p)
2480 2336 return -ENOMEM;
2481 ax->len = len; 2337 context->sockaddr = p;
2482 memcpy(ax->a, a, len); 2338 }
2483 2339
2484 ax->d.type = AUDIT_SOCKADDR; 2340 context->sockaddr_len = len;
2485 ax->d.next = context->aux; 2341 memcpy(context->sockaddr, a, len);
2486 context->aux = (void *)ax;
2487 return 0; 2342 return 0;
2488} 2343}
2489 2344
@@ -2617,29 +2472,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2617 * Record the arguments userspace sent to sys_capset for later printing by the 2472 * Record the arguments userspace sent to sys_capset for later printing by the
2618 * audit system if applicable 2473 * audit system if applicable
2619 */ 2474 */
2620int __audit_log_capset(pid_t pid, 2475void __audit_log_capset(pid_t pid,
2621 const struct cred *new, const struct cred *old) 2476 const struct cred *new, const struct cred *old)
2622{ 2477{
2623 struct audit_aux_data_capset *ax;
2624 struct audit_context *context = current->audit_context; 2478 struct audit_context *context = current->audit_context;
2625 2479 context->capset.pid = pid;
2626 if (likely(!audit_enabled || !context || context->dummy)) 2480 context->capset.cap.effective = new->cap_effective;
2627 return 0; 2481 context->capset.cap.inheritable = new->cap_effective;
2628 2482 context->capset.cap.permitted = new->cap_permitted;
2629 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2483 context->type = AUDIT_CAPSET;
2630 if (!ax)
2631 return -ENOMEM;
2632
2633 ax->d.type = AUDIT_CAPSET;
2634 ax->d.next = context->aux;
2635 context->aux = (void *)ax;
2636
2637 ax->pid = pid;
2638 ax->cap.effective = new->cap_effective;
2639 ax->cap.inheritable = new->cap_effective;
2640 ax->cap.permitted = new->cap_permitted;
2641
2642 return 0;
2643} 2484}
2644 2485
2645/** 2486/**
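With this rework the MQ/IPC hooks no longer allocate aux records or touch userspace themselves; the caller copies the data in, passes kernel pointers, and the now-void hook only stashes the values in current->audit_context for show_special() to emit at syscall exit. A sketch of what a call site looks like under the new API; the function name mq_notify_site and the trimmed error handling are illustrative, the real caller lives in ipc/mqueue.c, outside this diffstat:

static long mq_notify_site(mqd_t mqdes,
			   const struct sigevent __user *u_notification)
{
	struct sigevent notification;

	if (u_notification &&
	    copy_from_user(&notification, u_notification, sizeof(notification)))
		return -EFAULT;

	/* void hook: records mqdes and sigev_signo, sets context->type */
	audit_mq_notify(mqdes, u_notification ? &notification : NULL);

	/* ... rest of the syscall ... */
	return 0;
}
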
diff --git a/kernel/capability.c b/kernel/capability.c
index 36b4b4daebec..688926e496be 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
280 if (ret < 0) 280 if (ret < 0)
281 goto error; 281 goto error;
282 282
283 ret = audit_log_capset(pid, new, current_cred()); 283 audit_log_capset(pid, new, current_cred());
284 if (ret < 0)
285 return ret;
286 284
287 return commit_creds(new); 285 return commit_creds(new);
288 286
@@ -308,7 +306,7 @@ int capable(int cap)
308 BUG(); 306 BUG();
309 } 307 }
310 308
311 if (has_capability(current, cap)) { 309 if (security_capable(cap) == 0) {
312 current->flags |= PF_SUPERPRIV; 310 current->flags |= PF_SUPERPRIV;
313 return 1; 311 return 1;
314 } 312 }
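The "!audit_enabled || !context || context->dummy" guard dropped from __audit_log_capset() does not disappear; with the void API it belongs in the static inline wrapper that sys_capset now calls unconditionally. Roughly, the wrapper in include/linux/audit.h looks like the sketch below (that header is outside this diffstat, so treat it as a sketch rather than a quote):

static inline void audit_log_capset(pid_t pid, const struct cred *new,
				    const struct cred *old)
{
	/* skip the record entirely for dummy or absent contexts */
	if (unlikely(!audit_dummy_context()))
		__audit_log_capset(pid, new, old);
}
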
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48348dde6d81..f221446aa02d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -116,7 +116,6 @@ static int root_count;
116 * be called. 116 * be called.
117 */ 117 */
118static int need_forkexit_callback __read_mostly; 118static int need_forkexit_callback __read_mostly;
119static int need_mm_owner_callback __read_mostly;
120 119
121/* convenient tests for these bits */ 120/* convenient tests for these bits */
122inline int cgroup_is_removed(const struct cgroup *cgrp) 121inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -573,7 +572,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
573 inode->i_mode = mode; 572 inode->i_mode = mode;
574 inode->i_uid = current_fsuid(); 573 inode->i_uid = current_fsuid();
575 inode->i_gid = current_fsgid(); 574 inode->i_gid = current_fsgid();
576 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 575 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 576 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 577 }
@@ -2540,7 +2538,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2540 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2538 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2541 2539
2542 need_forkexit_callback |= ss->fork || ss->exit; 2540 need_forkexit_callback |= ss->fork || ss->exit;
2543 need_mm_owner_callback |= !!ss->mm_owner_changed;
2544 2541
2545 /* At system boot, before all subsystems have been 2542 /* At system boot, before all subsystems have been
2546 * registered, no tasks have been forked, so we don't 2543 * registered, no tasks have been forked, so we don't
@@ -2790,37 +2787,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
2790 } 2787 }
2791} 2788}
2792 2789
2793#ifdef CONFIG_MM_OWNER
2794/**
2795 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2796 * @p: the new owner
2797 *
2798 * Called on every change to mm->owner. mm_init_owner() does not
2799 * invoke this routine, since it assigns the mm->owner the first time
2800 * and does not change it.
2801 *
2802 * The callbacks are invoked with mmap_sem held in read mode.
2803 */
2804void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2805{
2806 struct cgroup *oldcgrp, *newcgrp = NULL;
2807
2808 if (need_mm_owner_callback) {
2809 int i;
2810 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2811 struct cgroup_subsys *ss = subsys[i];
2812 oldcgrp = task_cgroup(old, ss->subsys_id);
2813 if (new)
2814 newcgrp = task_cgroup(new, ss->subsys_id);
2815 if (oldcgrp == newcgrp)
2816 continue;
2817 if (ss->mm_owner_changed)
2818 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2819 }
2820 }
2821}
2822#endif /* CONFIG_MM_OWNER */
2823
2824/** 2790/**
2825 * cgroup_post_fork - called on a new task after adding it to the task list 2791 * cgroup_post_fork - called on a new task after adding it to the task list
2826 * @child: the task in question 2792 * @child: the task in question
@@ -2945,7 +2911,11 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2945 parent = task_cgroup(tsk, subsys->subsys_id); 2911 parent = task_cgroup(tsk, subsys->subsys_id);
2946 2912
2947 /* Pin the hierarchy */ 2913 /* Pin the hierarchy */
2948 atomic_inc(&parent->root->sb->s_active); 2914 if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
2915 /* We race with the final deactivate_super() */
2916 mutex_unlock(&cgroup_mutex);
2917 return 0;
2918 }
2949 2919
2950 /* Keep the cgroup alive */ 2920 /* Keep the cgroup alive */
2951 get_css_set(cg); 2921 get_css_set(cg);
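The cgroup_clone() hunk swaps an unconditional atomic_inc() of sb->s_active for atomic_inc_not_zero(), so a hierarchy whose superblock is already in its final deactivate_super() is simply not pinned. The idiom in isolation looks like the sketch below; struct my_obj, its refcount field and the mutex are placeholders, not cgroup code:

static int pin_object(struct my_obj *obj, struct mutex *lock)
{
	/* take a reference only while the count has not yet hit zero */
	if (!atomic_inc_not_zero(&obj->refcount)) {
		/* lost the race with the final put: back out cleanly */
		mutex_unlock(lock);
		return 0;
	}
	/* ... safe to use obj; drop it later with the matching put ... */
	return 1;
}
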
diff --git a/kernel/compat.c b/kernel/compat.c
index 8eafe3eb50d9..42d56544460f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h>
27 28
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29 30
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
229 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 230 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
230 return -EFAULT; 231 return -EFAULT;
231 } 232 }
233 force_successful_syscall_return();
232 return compat_jiffies_to_clock_t(jiffies); 234 return compat_jiffies_to_clock_t(jiffies);
233} 235}
234 236
@@ -454,16 +456,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
454} 456}
455 457
456static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, 458static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
457 unsigned len, cpumask_t *new_mask) 459 unsigned len, struct cpumask *new_mask)
458{ 460{
459 unsigned long *k; 461 unsigned long *k;
460 462
461 if (len < sizeof(cpumask_t)) 463 if (len < cpumask_size())
462 memset(new_mask, 0, sizeof(cpumask_t)); 464 memset(new_mask, 0, cpumask_size());
463 else if (len > sizeof(cpumask_t)) 465 else if (len > cpumask_size())
464 len = sizeof(cpumask_t); 466 len = cpumask_size();
465 467
466 k = cpus_addr(*new_mask); 468 k = cpumask_bits(new_mask);
467 return compat_get_bitmap(k, user_mask_ptr, len * 8); 469 return compat_get_bitmap(k, user_mask_ptr, len * 8);
468} 470}
469 471
@@ -471,40 +473,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
471 unsigned int len, 473 unsigned int len,
472 compat_ulong_t __user *user_mask_ptr) 474 compat_ulong_t __user *user_mask_ptr)
473{ 475{
474 cpumask_t new_mask; 476 cpumask_var_t new_mask;
475 int retval; 477 int retval;
476 478
477 retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); 479 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
480 return -ENOMEM;
481
482 retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
478 if (retval) 483 if (retval)
479 return retval; 484 goto out;
480 485
481 return sched_setaffinity(pid, &new_mask); 486 retval = sched_setaffinity(pid, new_mask);
487out:
488 free_cpumask_var(new_mask);
489 return retval;
482} 490}
483 491
484asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 492asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
485 compat_ulong_t __user *user_mask_ptr) 493 compat_ulong_t __user *user_mask_ptr)
486{ 494{
487 int ret; 495 int ret;
488 cpumask_t mask; 496 cpumask_var_t mask;
489 unsigned long *k; 497 unsigned long *k;
490 unsigned int min_length = sizeof(cpumask_t); 498 unsigned int min_length = cpumask_size();
491 499
492 if (NR_CPUS <= BITS_PER_COMPAT_LONG) 500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
493 min_length = sizeof(compat_ulong_t); 501 min_length = sizeof(compat_ulong_t);
494 502
495 if (len < min_length) 503 if (len < min_length)
496 return -EINVAL; 504 return -EINVAL;
497 505
498 ret = sched_getaffinity(pid, &mask); 506 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM;
508
509 ret = sched_getaffinity(pid, mask);
499 if (ret < 0) 510 if (ret < 0)
500 return ret; 511 goto out;
501 512
502 k = cpus_addr(mask); 513 k = cpumask_bits(mask);
503 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
504 if (ret) 515 if (ret == 0)
505 return ret; 516 ret = min_length;
506 517
507 return min_length; 518out:
519 free_cpumask_var(mask);
520 return ret;
508} 521}
509 522
510int get_compat_itimerspec(struct itimerspec *dst, 523int get_compat_itimerspec(struct itimerspec *dst,
@@ -883,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
883 896
884 if (tloc) { 897 if (tloc) {
885 if (put_user(i,tloc)) 898 if (put_user(i,tloc))
886 i = -EFAULT; 899 return -EFAULT;
887 } 900 }
901 force_successful_syscall_return();
888 return i; 902 return i;
889} 903}
890 904
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ea32e8d68b0..79e40f00dcb8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,29 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* 18#ifdef CONFIG_SMP
19 * Represents all cpu's present in the system 19/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
40/* Serializes the updates to cpu_online_map, cpu_present_map */
41static DEFINE_MUTEX(cpu_add_remove_lock); 20static DEFINE_MUTEX(cpu_add_remove_lock);
42 21
43static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -64,8 +43,6 @@ void __init cpu_hotplug_init(void)
64 cpu_hotplug.refcount = 0; 43 cpu_hotplug.refcount = 0;
65} 44}
66 45
67cpumask_t cpu_active_map;
68
69#ifdef CONFIG_HOTPLUG_CPU 46#ifdef CONFIG_HOTPLUG_CPU
70 47
71void get_online_cpus(void) 48void get_online_cpus(void)
@@ -96,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
96 73
97/* 74/*
98 * The following two API's must be used when attempting 75 * The following two API's must be used when attempting
99 * to serialize the updates to cpu_online_map, cpu_present_map. 76 * to serialize the updates to cpu_online_mask, cpu_present_mask.
100 */ 77 */
101void cpu_maps_update_begin(void) 78void cpu_maps_update_begin(void)
102{ 79{
@@ -217,7 +194,7 @@ static int __ref take_cpu_down(void *_param)
217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 194static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
218{ 195{
219 int err, nr_calls = 0; 196 int err, nr_calls = 0;
220 cpumask_t old_allowed, tmp; 197 cpumask_var_t old_allowed;
221 void *hcpu = (void *)(long)cpu; 198 void *hcpu = (void *)(long)cpu;
222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 199 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
223 struct take_cpu_down_param tcd_param = { 200 struct take_cpu_down_param tcd_param = {
@@ -231,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
231 if (!cpu_online(cpu)) 208 if (!cpu_online(cpu))
232 return -EINVAL; 209 return -EINVAL;
233 210
211 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
212 return -ENOMEM;
213
234 cpu_hotplug_begin(); 214 cpu_hotplug_begin();
235 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
236 hcpu, -1, &nr_calls); 216 hcpu, -1, &nr_calls);
@@ -245,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
245 } 225 }
246 226
247 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
248 old_allowed = current->cpus_allowed; 228 cpumask_copy(old_allowed, &current->cpus_allowed);
249 cpus_setall(tmp); 229 set_cpus_allowed_ptr(current,
250 cpu_clear(cpu, tmp); 230 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
253 231
254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp); 232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
255 if (err) { 233 if (err) {
256 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
@@ -277,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
277 check_for_tasks(cpu); 255 check_for_tasks(cpu);
278 256
279out_allowed: 257out_allowed:
280 set_cpus_allowed_ptr(current, &old_allowed); 258 set_cpus_allowed_ptr(current, old_allowed);
281out_release: 259out_release:
282 cpu_hotplug_done(); 260 cpu_hotplug_done();
283 if (!err) { 261 if (!err) {
@@ -285,13 +263,17 @@ out_release:
285 hcpu) == NOTIFY_BAD) 263 hcpu) == NOTIFY_BAD)
286 BUG(); 264 BUG();
287 } 265 }
266 free_cpumask_var(old_allowed);
288 return err; 267 return err;
289} 268}
290 269
291int __ref cpu_down(unsigned int cpu) 270int __ref cpu_down(unsigned int cpu)
292{ 271{
293 int err = 0; 272 int err;
294 273
274 err = stop_machine_create();
275 if (err)
276 return err;
295 cpu_maps_update_begin(); 277 cpu_maps_update_begin();
296 278
297 if (cpu_hotplug_disabled) { 279 if (cpu_hotplug_disabled) {
@@ -303,7 +285,7 @@ int __ref cpu_down(unsigned int cpu)
303 285
304 /* 286 /*
305 * Make sure that all cpus did the reschedule and are not 287 * Make sure that all cpus did the reschedule and are not
306 * using a stale version of the cpu_active_map. 288 * using a stale version of the cpu_active_mask.
307 * This is not strictly necessary because stop_machine() 289 * This is not strictly necessary because stop_machine()
308 * that we run down the line already provides the required 290 * that we run down the line already provides the required
309 * synchronization. But it's really a side effect and we do not 291 * synchronization. But it's really a side effect and we do not
@@ -318,6 +300,7 @@ int __ref cpu_down(unsigned int cpu)
318 300
319out: 301out:
320 cpu_maps_update_done(); 302 cpu_maps_update_done();
303 stop_machine_destroy();
321 return err; 304 return err;
322} 305}
323EXPORT_SYMBOL(cpu_down); 306EXPORT_SYMBOL(cpu_down);
@@ -367,7 +350,7 @@ out_notify:
367int __cpuinit cpu_up(unsigned int cpu) 350int __cpuinit cpu_up(unsigned int cpu)
368{ 351{
369 int err = 0; 352 int err = 0;
370 if (!cpu_isset(cpu, cpu_possible_map)) { 353 if (!cpu_possible(cpu)) {
371 printk(KERN_ERR "can't online cpu %d because it is not " 354 printk(KERN_ERR "can't online cpu %d because it is not "
372 "configured as may-hotadd at boot time\n", cpu); 355 "configured as may-hotadd at boot time\n", cpu);
373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 356#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
@@ -392,25 +375,28 @@ out:
392} 375}
393 376
394#ifdef CONFIG_PM_SLEEP_SMP 377#ifdef CONFIG_PM_SLEEP_SMP
395static cpumask_t frozen_cpus; 378static cpumask_var_t frozen_cpus;
396 379
397int disable_nonboot_cpus(void) 380int disable_nonboot_cpus(void)
398{ 381{
399 int cpu, first_cpu, error = 0; 382 int cpu, first_cpu, error;
400 383
384 error = stop_machine_create();
385 if (error)
386 return error;
401 cpu_maps_update_begin(); 387 cpu_maps_update_begin();
402 first_cpu = first_cpu(cpu_online_map); 388 first_cpu = cpumask_first(cpu_online_mask);
403 /* We take down all of the non-boot CPUs in one shot to avoid races 389 /* We take down all of the non-boot CPUs in one shot to avoid races
404 * with the userspace trying to use the CPU hotplug at the same time 390 * with the userspace trying to use the CPU hotplug at the same time
405 */ 391 */
406 cpus_clear(frozen_cpus); 392 cpumask_clear(frozen_cpus);
407 printk("Disabling non-boot CPUs ...\n"); 393 printk("Disabling non-boot CPUs ...\n");
408 for_each_online_cpu(cpu) { 394 for_each_online_cpu(cpu) {
409 if (cpu == first_cpu) 395 if (cpu == first_cpu)
410 continue; 396 continue;
411 error = _cpu_down(cpu, 1); 397 error = _cpu_down(cpu, 1);
412 if (!error) { 398 if (!error) {
413 cpu_set(cpu, frozen_cpus); 399 cpumask_set_cpu(cpu, frozen_cpus);
414 printk("CPU%d is down\n", cpu); 400 printk("CPU%d is down\n", cpu);
415 } else { 401 } else {
416 printk(KERN_ERR "Error taking CPU%d down: %d\n", 402 printk(KERN_ERR "Error taking CPU%d down: %d\n",
@@ -426,6 +412,7 @@ int disable_nonboot_cpus(void)
426 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 412 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
427 } 413 }
428 cpu_maps_update_done(); 414 cpu_maps_update_done();
415 stop_machine_destroy();
429 return error; 416 return error;
430} 417}
431 418
@@ -436,11 +423,11 @@ void __ref enable_nonboot_cpus(void)
436 /* Allow everyone to use the CPU hotplug again */ 423 /* Allow everyone to use the CPU hotplug again */
437 cpu_maps_update_begin(); 424 cpu_maps_update_begin();
438 cpu_hotplug_disabled = 0; 425 cpu_hotplug_disabled = 0;
439 if (cpus_empty(frozen_cpus)) 426 if (cpumask_empty(frozen_cpus))
440 goto out; 427 goto out;
441 428
442 printk("Enabling non-boot CPUs ...\n"); 429 printk("Enabling non-boot CPUs ...\n");
443 for_each_cpu_mask_nr(cpu, frozen_cpus) { 430 for_each_cpu(cpu, frozen_cpus) {
444 error = _cpu_up(cpu, 1); 431 error = _cpu_up(cpu, 1);
445 if (!error) { 432 if (!error) {
446 printk("CPU%d is up\n", cpu); 433 printk("CPU%d is up\n", cpu);
@@ -448,10 +435,18 @@ void __ref enable_nonboot_cpus(void)
448 } 435 }
449 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 436 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
450 } 437 }
451 cpus_clear(frozen_cpus); 438 cpumask_clear(frozen_cpus);
452out: 439out:
453 cpu_maps_update_done(); 440 cpu_maps_update_done();
454} 441}
442
443static int alloc_frozen_cpus(void)
444{
445 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
446 return -ENOMEM;
447 return 0;
448}
449core_initcall(alloc_frozen_cpus);
455#endif /* CONFIG_PM_SLEEP_SMP */ 450#endif /* CONFIG_PM_SLEEP_SMP */
456 451
457/** 452/**
@@ -467,7 +462,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
467 unsigned long val = CPU_STARTING; 462 unsigned long val = CPU_STARTING;
468 463
469#ifdef CONFIG_PM_SLEEP_SMP 464#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus)) 465 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN; 466 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */ 467#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 468 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
@@ -479,7 +474,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that 474 * cpu_bit_bitmap[] is a special, "compressed" data structure that
480 * represents all NR_CPUS bits binary values of 1<<nr. 475 * represents all NR_CPUS bits binary values of 1<<nr.
481 * 476 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU 477 * It is used by cpumask_of() to get a constant address to a CPU
483 * mask value that has a single bit set only. 478 * mask value that has a single bit set only.
484 */ 479 */
485 480
@@ -502,3 +497,71 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
502 497
503const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; 498const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
504EXPORT_SYMBOL(cpu_all_bits); 499EXPORT_SYMBOL(cpu_all_bits);
500
501#ifdef CONFIG_INIT_ALL_POSSIBLE
502static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
503 = CPU_BITS_ALL;
504#else
505static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
506#endif
507const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
508EXPORT_SYMBOL(cpu_possible_mask);
509
510static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
511const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
512EXPORT_SYMBOL(cpu_online_mask);
513
514static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
515const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
516EXPORT_SYMBOL(cpu_present_mask);
517
518static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
519const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
520EXPORT_SYMBOL(cpu_active_mask);
521
522void set_cpu_possible(unsigned int cpu, bool possible)
523{
524 if (possible)
525 cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
526 else
527 cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
528}
529
530void set_cpu_present(unsigned int cpu, bool present)
531{
532 if (present)
533 cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
534 else
535 cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
536}
537
538void set_cpu_online(unsigned int cpu, bool online)
539{
540 if (online)
541 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
542 else
543 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
544}
545
546void set_cpu_active(unsigned int cpu, bool active)
547{
548 if (active)
549 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
550 else
551 cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
552}
553
554void init_cpu_present(const struct cpumask *src)
555{
556 cpumask_copy(to_cpumask(cpu_present_bits), src);
557}
558
559void init_cpu_possible(const struct cpumask *src)
560{
561 cpumask_copy(to_cpumask(cpu_possible_bits), src);
562}
563
564void init_cpu_online(const struct cpumask *src)
565{
566 cpumask_copy(to_cpumask(cpu_online_bits), src);
567}
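With cpu_possible/online/present/active now private bitmaps in kernel/cpu.c, architecture code is expected to go through the new accessors instead of assigning to the old cpu_*_map variables. A hedged sketch of the boot-side usage; the helper name publish_boot_cpu is invented and the real arch call sites are not part of this diffstat:

static void __init publish_boot_cpu(void)
{
	/* start with just the boot CPU until the others are discovered */
	init_cpu_possible(cpumask_of(0));
	set_cpu_present(0, true);
	set_cpu_online(0, true);
}
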
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8cd..345ace5117de 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,6 +240,17 @@ static struct cpuset top_cpuset = {
240static DEFINE_MUTEX(callback_mutex); 240static DEFINE_MUTEX(callback_mutex);
241 241
242/* 242/*
243 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
244 * buffers. They are statically allocated to prevent using excess stack
245 * when calling cpuset_print_task_mems_allowed().
246 */
247#define CPUSET_NAME_LEN (128)
248#define CPUSET_NODELIST_LEN (256)
249static char cpuset_name[CPUSET_NAME_LEN];
250static char cpuset_nodelist[CPUSET_NODELIST_LEN];
251static DEFINE_SPINLOCK(cpuset_buffer_lock);
252
253/*
243 * This is ugly, but preserves the userspace API for existing cpuset 254 * This is ugly, but preserves the userspace API for existing cpuset
244 * users. If someone tries to mount the "cpuset" filesystem, we 255 * users. If someone tries to mount the "cpuset" filesystem, we
245 * silently switch it to mount "cgroup" instead 256 * silently switch it to mount "cgroup" instead
@@ -896,7 +907,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
896 if (!*buf) { 907 if (!*buf) {
897 cpus_clear(trialcs.cpus_allowed); 908 cpus_clear(trialcs.cpus_allowed);
898 } else { 909 } else {
899 retval = cpulist_parse(buf, trialcs.cpus_allowed); 910 retval = cpulist_parse(buf, &trialcs.cpus_allowed);
900 if (retval < 0) 911 if (retval < 0)
901 return retval; 912 return retval;
902 913
@@ -1482,7 +1493,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1482 mask = cs->cpus_allowed; 1493 mask = cs->cpus_allowed;
1483 mutex_unlock(&callback_mutex); 1494 mutex_unlock(&callback_mutex);
1484 1495
1485 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1496 return cpulist_scnprintf(page, PAGE_SIZE, &mask);
1486} 1497}
1487 1498
1488static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1499static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -2356,6 +2367,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2356 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2367 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2357} 2368}
2358 2369
2370/**
2371 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
2372 * @tsk: pointer to task_struct of some task.
2373 *
2374 * Description: Prints @tsk's name, cpuset name, and cached copy of its
2375 * mems_allowed to the kernel log. Must hold task_lock(task) to allow
2376 * dereferencing task_cs(task).
2377 */
2378void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2379{
2380 struct dentry *dentry;
2381
2382 dentry = task_cs(tsk)->css.cgroup->dentry;
2383 spin_lock(&cpuset_buffer_lock);
2384 snprintf(cpuset_name, CPUSET_NAME_LEN,
2385 "%s", dentry ? (const char *)dentry->d_name.name : "/");
2386 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2387 tsk->mems_allowed);
2388 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2389 tsk->comm, cpuset_name, cpuset_nodelist);
2390 spin_unlock(&cpuset_buffer_lock);
2391}
2392
2359/* 2393/*
2360 * Collection of memory_pressure is suppressed unless 2394 * Collection of memory_pressure is suppressed unless
2361 * this flag is enabled by writing "1" to the special 2395 * this flag is enabled by writing "1" to the special
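cpuset_print_task_mems_allowed() deliberately formats into static buffers under cpuset_buffer_lock rather than using stack space, because it is meant to be called from the OOM path; per its kernel-doc the caller must also hold task_lock() so task_cs(tsk) stays valid. A sketch of the expected call site; dump_task_cpuset is an invented name and the actual OOM-killer caller is not in this diffstat:

static void dump_task_cpuset(struct task_struct *p)
{
	task_lock(p);
	printk(KERN_WARNING "%s invoked oom-killer\n", p->comm);
	cpuset_print_task_mems_allowed(p);
	task_unlock(p);
}
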
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index f013a0c2e111..038707404b76 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -109,20 +109,40 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
109int dma_alloc_from_coherent(struct device *dev, ssize_t size, 109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret) 110 dma_addr_t *dma_handle, void **ret)
111{ 111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; 112 struct dma_coherent_mem *mem;
113 int order = get_order(size); 113 int order = get_order(size);
114 int pageno;
114 115
115 if (mem) { 116 if (!dev)
116 int page = bitmap_find_free_region(mem->bitmap, mem->size, 117 return 0;
117 order); 118 mem = dev->dma_mem;
118 if (page >= 0) { 119 if (!mem)
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT); 120 return 0;
120 *ret = mem->virt_base + (page << PAGE_SHIFT); 121 if (unlikely(size > mem->size))
121 memset(*ret, 0, size); 122 return 0;
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) 123
123 *ret = NULL; 124 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
125 if (pageno >= 0) {
126 /*
127 * Memory was found in the per-device arena.
128 */
129 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
130 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
131 memset(*ret, 0, size);
132 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
133 /*
134 * The per-device arena is exhausted and we are not
135 * permitted to fall back to generic memory.
136 */
137 *ret = NULL;
138 } else {
139 /*
140 * The per-device arena is exhausted and we are
141 * permitted to fall back to generic memory.
142 */
143 return 0;
124 } 144 }
125 return (mem != NULL); 145 return 1;
126} 146}
127EXPORT_SYMBOL(dma_alloc_from_coherent); 147EXPORT_SYMBOL(dma_alloc_from_coherent);
128 148
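The rewritten dma_alloc_from_coherent() keeps the old calling convention: a return of 1 means "handled by the per-device arena" (with *ret left NULL when an exclusive arena is exhausted), while 0 means "fall back to the generic allocator". A sketch of the arch-side consumer of that contract; fallback_alloc() is a placeholder, not a real kernel function:

void *dma_alloc_coherent(struct device *dev, size_t size,
			 dma_addr_t *handle, gfp_t gfp)
{
	void *ret;

	/* try the per-device coherent arena first */
	if (dma_alloc_from_coherent(dev, size, handle, &ret))
		return ret;	/* handled; NULL if an exclusive arena is full */

	/* otherwise use the normal page-allocator based path */
	return fallback_alloc(dev, size, handle, gfp);	/* placeholder */
}
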
diff --git a/kernel/exit.c b/kernel/exit.c
index c9e5a1c14e08..c7740fa3252c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -642,35 +642,31 @@ retry:
642 /* 642 /*
643 * We found no owner yet mm_users > 1: this implies that we are 643 * We found no owner yet mm_users > 1: this implies that we are
644 * most likely racing with swapoff (try_to_unuse()) or /proc or 644 * most likely racing with swapoff (try_to_unuse()) or /proc or
645 * ptrace or page migration (get_task_mm()). Mark owner as NULL, 645 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
646 * so that subsystems can understand the callback and take action.
647 */ 646 */
648 down_write(&mm->mmap_sem);
649 cgroup_mm_owner_callbacks(mm->owner, NULL);
650 mm->owner = NULL; 647 mm->owner = NULL;
651 up_write(&mm->mmap_sem);
652 return; 648 return;
653 649
654assign_new_owner: 650assign_new_owner:
655 BUG_ON(c == p); 651 BUG_ON(c == p);
656 get_task_struct(c); 652 get_task_struct(c);
657 read_unlock(&tasklist_lock);
658 down_write(&mm->mmap_sem);
659 /* 653 /*
660 * The task_lock protects c->mm from changing. 654 * The task_lock protects c->mm from changing.
661 * We always want mm->owner->mm == mm 655 * We always want mm->owner->mm == mm
662 */ 656 */
663 task_lock(c); 657 task_lock(c);
658 /*
659 * Delay read_unlock() till we have the task_lock()
660 * to ensure that c does not slip away underneath us
661 */
662 read_unlock(&tasklist_lock);
664 if (c->mm != mm) { 663 if (c->mm != mm) {
665 task_unlock(c); 664 task_unlock(c);
666 up_write(&mm->mmap_sem);
667 put_task_struct(c); 665 put_task_struct(c);
668 goto retry; 666 goto retry;
669 } 667 }
670 cgroup_mm_owner_callbacks(mm->owner, c);
671 mm->owner = c; 668 mm->owner = c;
672 task_unlock(c); 669 task_unlock(c);
673 up_write(&mm->mmap_sem);
674 put_task_struct(c); 670 put_task_struct(c);
675} 671}
676#endif /* CONFIG_MM_OWNER */ 672#endif /* CONFIG_MM_OWNER */
@@ -1055,10 +1051,7 @@ NORET_TYPE void do_exit(long code)
1055 preempt_count()); 1051 preempt_count());
1056 1052
1057 acct_update_integrals(tsk); 1053 acct_update_integrals(tsk);
1058 if (tsk->mm) { 1054
1059 update_hiwater_rss(tsk->mm);
1060 update_hiwater_vm(tsk->mm);
1061 }
1062 group_dead = atomic_dec_and_test(&tsk->signal->live); 1055 group_dead = atomic_dec_and_test(&tsk->signal->live);
1063 if (group_dead) { 1056 if (group_dead) {
1064 hrtimer_cancel(&tsk->signal->real_timer); 1057 hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index 43cbf30669e6..7b8f2a78be3d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
402 402
403static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
404
405static int __init coredump_filter_setup(char *s)
406{
407 default_dump_filter =
408 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
409 MMF_DUMP_FILTER_MASK;
410 return 1;
411}
412
413__setup("coredump_filter=", coredump_filter_setup);
414
403#include <linux/init_task.h> 415#include <linux/init_task.h>
404 416
405static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 417static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
408 atomic_set(&mm->mm_count, 1); 420 atomic_set(&mm->mm_count, 1);
409 init_rwsem(&mm->mmap_sem); 421 init_rwsem(&mm->mmap_sem);
410 INIT_LIST_HEAD(&mm->mmlist); 422 INIT_LIST_HEAD(&mm->mmlist);
411 mm->flags = (current->mm) ? current->mm->flags 423 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
412 : MMF_DUMP_FILTER_DEFAULT;
413 mm->core_state = NULL; 424 mm->core_state = NULL;
414 mm->nr_ptes = 0; 425 mm->nr_ptes = 0;
415 set_mm_counter(mm, file_rss, 0); 426 set_mm_counter(mm, file_rss, 0);
@@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
758{ 769{
759 struct sighand_struct *sig; 770 struct sighand_struct *sig;
760 771
761 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { 772 if (clone_flags & CLONE_SIGHAND) {
762 atomic_inc(&current->sighand->count); 773 atomic_inc(&current->sighand->count);
763 return 0; 774 return 0;
764 } 775 }
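The new coredump_filter= boot parameter only seeds default_dump_filter, the mask used when a new mm cannot inherit flags from current->mm; per process the same bits remain writable through /proc/<pid>/coredump_filter. A small userspace example of the runtime knob, with coredump_filter=0x23 on the kernel command line as the boot-time equivalent; the 0x23 mask is just an illustration, see Documentation/filesystems/proc.txt for the bit meanings:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/coredump_filter", "w");

	if (!f) {
		perror("coredump_filter");
		return 1;
	}
	fprintf(f, "0x23\n");	/* example mask only */
	fclose(f);
	return 0;
}
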
diff --git a/kernel/futex.c b/kernel/futex.c
index 7c6cbabe52b3..002aa189eb09 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key)
170 */ 170 */
171static void drop_futex_key_refs(union futex_key *key) 171static void drop_futex_key_refs(union futex_key *key)
172{ 172{
173 if (!key->both.ptr) 173 if (!key->both.ptr) {
174 /* If we're here then we tried to put a key we failed to get */
175 WARN_ON_ONCE(1);
174 return; 176 return;
177 }
175 178
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 179 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE: 180 case FUT_OFF_INODE:
@@ -730,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
730 } 733 }
731 734
732 spin_unlock(&hb->lock); 735 spin_unlock(&hb->lock);
733out:
734 put_futex_key(fshared, &key); 736 put_futex_key(fshared, &key);
737out:
735 return ret; 738 return ret;
736} 739}
737 740
@@ -755,7 +758,7 @@ retryfull:
755 goto out; 758 goto out;
756 ret = get_futex_key(uaddr2, fshared, &key2); 759 ret = get_futex_key(uaddr2, fshared, &key2);
757 if (unlikely(ret != 0)) 760 if (unlikely(ret != 0))
758 goto out; 761 goto out_put_key1;
759 762
760 hb1 = hash_futex(&key1); 763 hb1 = hash_futex(&key1);
761 hb2 = hash_futex(&key2); 764 hb2 = hash_futex(&key2);
@@ -777,12 +780,12 @@ retry:
777 * but we might get them from range checking 780 * but we might get them from range checking
778 */ 781 */
779 ret = op_ret; 782 ret = op_ret;
780 goto out; 783 goto out_put_keys;
781#endif 784#endif
782 785
783 if (unlikely(op_ret != -EFAULT)) { 786 if (unlikely(op_ret != -EFAULT)) {
784 ret = op_ret; 787 ret = op_ret;
785 goto out; 788 goto out_put_keys;
786 } 789 }
787 790
788 /* 791 /*
@@ -796,7 +799,7 @@ retry:
796 ret = futex_handle_fault((unsigned long)uaddr2, 799 ret = futex_handle_fault((unsigned long)uaddr2,
797 attempt); 800 attempt);
798 if (ret) 801 if (ret)
799 goto out; 802 goto out_put_keys;
800 goto retry; 803 goto retry;
801 } 804 }
802 805
@@ -834,10 +837,11 @@ retry:
834 spin_unlock(&hb1->lock); 837 spin_unlock(&hb1->lock);
835 if (hb1 != hb2) 838 if (hb1 != hb2)
836 spin_unlock(&hb2->lock); 839 spin_unlock(&hb2->lock);
837out: 840out_put_keys:
838 put_futex_key(fshared, &key2); 841 put_futex_key(fshared, &key2);
842out_put_key1:
839 put_futex_key(fshared, &key1); 843 put_futex_key(fshared, &key1);
840 844out:
841 return ret; 845 return ret;
842} 846}
843 847
@@ -854,13 +858,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
854 struct futex_q *this, *next; 858 struct futex_q *this, *next;
855 int ret, drop_count = 0; 859 int ret, drop_count = 0;
856 860
857 retry: 861retry:
858 ret = get_futex_key(uaddr1, fshared, &key1); 862 ret = get_futex_key(uaddr1, fshared, &key1);
859 if (unlikely(ret != 0)) 863 if (unlikely(ret != 0))
860 goto out; 864 goto out;
861 ret = get_futex_key(uaddr2, fshared, &key2); 865 ret = get_futex_key(uaddr2, fshared, &key2);
862 if (unlikely(ret != 0)) 866 if (unlikely(ret != 0))
863 goto out; 867 goto out_put_key1;
864 868
865 hb1 = hash_futex(&key1); 869 hb1 = hash_futex(&key1);
866 hb2 = hash_futex(&key2); 870 hb2 = hash_futex(&key2);
@@ -882,7 +886,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
882 if (!ret) 886 if (!ret)
883 goto retry; 887 goto retry;
884 888
885 return ret; 889 goto out_put_keys;
886 } 890 }
887 if (curval != *cmpval) { 891 if (curval != *cmpval) {
888 ret = -EAGAIN; 892 ret = -EAGAIN;
@@ -927,9 +931,11 @@ out_unlock:
927 while (--drop_count >= 0) 931 while (--drop_count >= 0)
928 drop_futex_key_refs(&key1); 932 drop_futex_key_refs(&key1);
929 933
930out: 934out_put_keys:
931 put_futex_key(fshared, &key2); 935 put_futex_key(fshared, &key2);
936out_put_key1:
932 put_futex_key(fshared, &key1); 937 put_futex_key(fshared, &key1);
938out:
933 return ret; 939 return ret;
934} 940}
935 941
@@ -990,7 +996,7 @@ static int unqueue_me(struct futex_q *q)
990 int ret = 0; 996 int ret = 0;
991 997
992 /* In the common case we don't take the spinlock, which is nice. */ 998 /* In the common case we don't take the spinlock, which is nice. */
993 retry: 999retry:
994 lock_ptr = q->lock_ptr; 1000 lock_ptr = q->lock_ptr;
995 barrier(); 1001 barrier();
996 if (lock_ptr != NULL) { 1002 if (lock_ptr != NULL) {
@@ -1172,11 +1178,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1172 1178
1173 q.pi_state = NULL; 1179 q.pi_state = NULL;
1174 q.bitset = bitset; 1180 q.bitset = bitset;
1175 retry: 1181retry:
1176 q.key = FUTEX_KEY_INIT; 1182 q.key = FUTEX_KEY_INIT;
1177 ret = get_futex_key(uaddr, fshared, &q.key); 1183 ret = get_futex_key(uaddr, fshared, &q.key);
1178 if (unlikely(ret != 0)) 1184 if (unlikely(ret != 0))
1179 goto out_release_sem; 1185 goto out;
1180 1186
1181 hb = queue_lock(&q); 1187 hb = queue_lock(&q);
1182 1188
@@ -1204,6 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1204 1210
1205 if (unlikely(ret)) { 1211 if (unlikely(ret)) {
1206 queue_unlock(&q, hb); 1212 queue_unlock(&q, hb);
1213 put_futex_key(fshared, &q.key);
1207 1214
1208 ret = get_user(uval, uaddr); 1215 ret = get_user(uval, uaddr);
1209 1216
@@ -1213,7 +1220,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1213 } 1220 }
1214 ret = -EWOULDBLOCK; 1221 ret = -EWOULDBLOCK;
1215 if (uval != val) 1222 if (uval != val)
1216 goto out_unlock_release_sem; 1223 goto out_unlock_put_key;
1217 1224
1218 /* Only actually queue if *uaddr contained val. */ 1225 /* Only actually queue if *uaddr contained val. */
1219 queue_me(&q, hb); 1226 queue_me(&q, hb);
@@ -1305,11 +1312,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1305 return -ERESTART_RESTARTBLOCK; 1312 return -ERESTART_RESTARTBLOCK;
1306 } 1313 }
1307 1314
1308 out_unlock_release_sem: 1315out_unlock_put_key:
1309 queue_unlock(&q, hb); 1316 queue_unlock(&q, hb);
1310
1311 out_release_sem:
1312 put_futex_key(fshared, &q.key); 1317 put_futex_key(fshared, &q.key);
1318
1319out:
1313 return ret; 1320 return ret;
1314} 1321}
1315 1322
@@ -1358,16 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1358 } 1365 }
1359 1366
1360 q.pi_state = NULL; 1367 q.pi_state = NULL;
1361 retry: 1368retry:
1362 q.key = FUTEX_KEY_INIT; 1369 q.key = FUTEX_KEY_INIT;
1363 ret = get_futex_key(uaddr, fshared, &q.key); 1370 ret = get_futex_key(uaddr, fshared, &q.key);
1364 if (unlikely(ret != 0)) 1371 if (unlikely(ret != 0))
1365 goto out_release_sem; 1372 goto out;
1366 1373
1367 retry_unlocked: 1374retry_unlocked:
1368 hb = queue_lock(&q); 1375 hb = queue_lock(&q);
1369 1376
1370 retry_locked: 1377retry_locked:
1371 ret = lock_taken = 0; 1378 ret = lock_taken = 0;
1372 1379
1373 /* 1380 /*
@@ -1388,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1388 */ 1395 */
1389 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { 1396 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1390 ret = -EDEADLK; 1397 ret = -EDEADLK;
1391 goto out_unlock_release_sem; 1398 goto out_unlock_put_key;
1392 } 1399 }
1393 1400
1394 /* 1401 /*
1395 * Surprise - we got the lock. Just return to userspace: 1402 * Surprise - we got the lock. Just return to userspace:
1396 */ 1403 */
1397 if (unlikely(!curval)) 1404 if (unlikely(!curval))
1398 goto out_unlock_release_sem; 1405 goto out_unlock_put_key;
1399 1406
1400 uval = curval; 1407 uval = curval;
1401 1408
@@ -1431,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1431 * We took the lock due to owner died take over. 1438 * We took the lock due to owner died take over.
1432 */ 1439 */
1433 if (unlikely(lock_taken)) 1440 if (unlikely(lock_taken))
1434 goto out_unlock_release_sem; 1441 goto out_unlock_put_key;
1435 1442
1436 /* 1443 /*
1437 * We dont have the lock. Look up the PI state (or create it if 1444 * We dont have the lock. Look up the PI state (or create it if
@@ -1470,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1470 goto retry_locked; 1477 goto retry_locked;
1471 } 1478 }
1472 default: 1479 default:
1473 goto out_unlock_release_sem; 1480 goto out_unlock_put_key;
1474 } 1481 }
1475 } 1482 }
1476 1483
@@ -1561,16 +1568,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1561 destroy_hrtimer_on_stack(&to->timer); 1568 destroy_hrtimer_on_stack(&to->timer);
1562 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1563 1570
1564 out_unlock_release_sem: 1571out_unlock_put_key:
1565 queue_unlock(&q, hb); 1572 queue_unlock(&q, hb);
1566 1573
1567 out_release_sem: 1574out_put_key:
1568 put_futex_key(fshared, &q.key); 1575 put_futex_key(fshared, &q.key);
1576out:
1569 if (to) 1577 if (to)
1570 destroy_hrtimer_on_stack(&to->timer); 1578 destroy_hrtimer_on_stack(&to->timer);
1571 return ret; 1579 return ret;
1572 1580
1573 uaddr_faulted: 1581uaddr_faulted:
1574 /* 1582 /*
1575 * We have to r/w *(int __user *)uaddr, and we have to modify it 1583 * We have to r/w *(int __user *)uaddr, and we have to modify it
1576 * atomically. Therefore, if we continue to fault after get_user() 1584 * atomically. Therefore, if we continue to fault after get_user()
@@ -1583,7 +1591,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1583 if (attempt++) { 1591 if (attempt++) {
1584 ret = futex_handle_fault((unsigned long)uaddr, attempt); 1592 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1585 if (ret) 1593 if (ret)
1586 goto out_release_sem; 1594 goto out_put_key;
1587 goto retry_unlocked; 1595 goto retry_unlocked;
1588 } 1596 }
1589 1597
@@ -1675,9 +1683,9 @@ retry_unlocked:
1675 1683
1676out_unlock: 1684out_unlock:
1677 spin_unlock(&hb->lock); 1685 spin_unlock(&hb->lock);
1678out:
1679 put_futex_key(fshared, &key); 1686 put_futex_key(fshared, &key);
1680 1687
1688out:
1681 return ret; 1689 return ret;
1682 1690
1683pi_faulted: 1691pi_faulted:
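
The futex.c changes above converge every path on the same stacked-unwind shape, so each exit drops exactly the key references taken before the corresponding failure point. A minimal sketch of that shape (illustrative only; two_key_op and its body are hypothetical, the helpers are the ones used in the hunks):

static int two_key_op(u32 __user *uaddr1, u32 __user *uaddr2, int fshared)
{
	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
	int ret;

	ret = get_futex_key(uaddr1, fshared, &key1);
	if (unlikely(ret != 0))
		goto out;			/* nothing acquired yet */
	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out_put_key1;		/* only key1 is held */

	/* ... hash-bucket work under the hb locks ... */

	put_futex_key(fshared, &key2);		/* success also falls through */
out_put_key1:
	put_futex_key(fshared, &key1);
out:
	return ret;
}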
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index eb2bfefa6dcc..1455b7651b6b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -634,7 +634,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
634{ 634{
635} 635}
636 636
637static void __run_hrtimer(struct hrtimer *timer);
638 637
639/* 638/*
640 * When High resolution timers are active, try to reprogram. Note, that in case 639 * When High resolution timers are active, try to reprogram. Note, that in case
@@ -646,13 +645,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
646 struct hrtimer_clock_base *base) 645 struct hrtimer_clock_base *base)
647{ 646{
648 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 647 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
649 /* 648 spin_unlock(&base->cpu_base->lock);
650 * XXX: recursion check? 649 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
651 * hrtimer_forward() should round up with timer granularity 650 spin_lock(&base->cpu_base->lock);
652 * so that we never get into inf recursion here,
653 * it doesn't do that though
654 */
655 __run_hrtimer(timer);
656 return 1; 651 return 1;
657 } 652 }
658 return 0; 653 return 0;
@@ -705,11 +700,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
705} 700}
706static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 701static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
707static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 702static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
708static inline int hrtimer_reprogram(struct hrtimer *timer,
709 struct hrtimer_clock_base *base)
710{
711 return 0;
712}
713 703
714#endif /* CONFIG_HIGH_RES_TIMERS */ 704#endif /* CONFIG_HIGH_RES_TIMERS */
715 705
@@ -780,9 +770,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
780 * 770 *
781 * The timer is inserted in expiry order. Insertion into the 771 * The timer is inserted in expiry order. Insertion into the
782 * red black tree is O(log(n)). Must hold the base lock. 772 * red black tree is O(log(n)). Must hold the base lock.
773 *
774 * Returns 1 when the new timer is the leftmost timer in the tree.
783 */ 775 */
784static void enqueue_hrtimer(struct hrtimer *timer, 776static int enqueue_hrtimer(struct hrtimer *timer,
785 struct hrtimer_clock_base *base, int reprogram) 777 struct hrtimer_clock_base *base)
786{ 778{
787 struct rb_node **link = &base->active.rb_node; 779 struct rb_node **link = &base->active.rb_node;
788 struct rb_node *parent = NULL; 780 struct rb_node *parent = NULL;
@@ -814,20 +806,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
814 * Insert the timer to the rbtree and check whether it 806 * Insert the timer to the rbtree and check whether it
815 * replaces the first pending timer 807 * replaces the first pending timer
816 */ 808 */
817 if (leftmost) { 809 if (leftmost)
818 /*
819 * Reprogram the clock event device. When the timer is already
820 * expired hrtimer_enqueue_reprogram has either called the
821 * callback or added it to the pending list and raised the
822 * softirq.
823 *
824 * This is a NOP for !HIGHRES
825 */
826 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
827 return;
828
829 base->first = &timer->node; 810 base->first = &timer->node;
830 }
831 811
832 rb_link_node(&timer->node, parent, link); 812 rb_link_node(&timer->node, parent, link);
833 rb_insert_color(&timer->node, &base->active); 813 rb_insert_color(&timer->node, &base->active);
@@ -836,6 +816,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
836 * state of a possibly running callback. 816 * state of a possibly running callback.
837 */ 817 */
838 timer->state |= HRTIMER_STATE_ENQUEUED; 818 timer->state |= HRTIMER_STATE_ENQUEUED;
819
820 return leftmost;
839} 821}
840 822
841/* 823/*
@@ -912,7 +894,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
912{ 894{
913 struct hrtimer_clock_base *base, *new_base; 895 struct hrtimer_clock_base *base, *new_base;
914 unsigned long flags; 896 unsigned long flags;
915 int ret; 897 int ret, leftmost;
916 898
917 base = lock_hrtimer_base(timer, &flags); 899 base = lock_hrtimer_base(timer, &flags);
918 900
@@ -940,12 +922,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
940 922
941 timer_stats_hrtimer_set_start_info(timer); 923 timer_stats_hrtimer_set_start_info(timer);
942 924
925 leftmost = enqueue_hrtimer(timer, new_base);
926
943 /* 927 /*
944 * Only allow reprogramming if the new base is on this CPU. 928 * Only allow reprogramming if the new base is on this CPU.
945 * (it might still be on another CPU if the timer was pending) 929 * (it might still be on another CPU if the timer was pending)
930 *
931 * XXX send_remote_softirq() ?
946 */ 932 */
947 enqueue_hrtimer(timer, new_base, 933 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
948 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 934 hrtimer_enqueue_reprogram(timer, new_base);
949 935
950 unlock_hrtimer_base(timer, &flags); 936 unlock_hrtimer_base(timer, &flags);
951 937
@@ -1157,13 +1143,13 @@ static void __run_hrtimer(struct hrtimer *timer)
1157 spin_lock(&cpu_base->lock); 1143 spin_lock(&cpu_base->lock);
1158 1144
1159 /* 1145 /*
1160 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1146 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
1161 * reprogramming of the event hardware. This happens at the end of this 1147 * we do not reprogram the event hardware. Happens either in
1162 * function anyway. 1148 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1163 */ 1149 */
1164 if (restart != HRTIMER_NORESTART) { 1150 if (restart != HRTIMER_NORESTART) {
1165 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1151 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1166 enqueue_hrtimer(timer, base, 0); 1152 enqueue_hrtimer(timer, base);
1167 } 1153 }
1168 timer->state &= ~HRTIMER_STATE_CALLBACK; 1154 timer->state &= ~HRTIMER_STATE_CALLBACK;
1169} 1155}
@@ -1243,6 +1229,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1243 } 1229 }
1244} 1230}
1245 1231
1232/*
1233 * local version of hrtimer_peek_ahead_timers() called with interrupts
1234 * disabled.
1235 */
1236static void __hrtimer_peek_ahead_timers(void)
1237{
1238 struct tick_device *td;
1239
1240 if (!hrtimer_hres_active())
1241 return;
1242
1243 td = &__get_cpu_var(tick_cpu_device);
1244 if (td && td->evtdev)
1245 hrtimer_interrupt(td->evtdev);
1246}
1247
1246/** 1248/**
1247 * hrtimer_peek_ahead_timers -- run soft-expired timers now 1249 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1248 * 1250 *
@@ -1254,20 +1256,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1254 */ 1256 */
1255void hrtimer_peek_ahead_timers(void) 1257void hrtimer_peek_ahead_timers(void)
1256{ 1258{
1257 struct tick_device *td;
1258 unsigned long flags; 1259 unsigned long flags;
1259 1260
1260 if (!hrtimer_hres_active())
1261 return;
1262
1263 local_irq_save(flags); 1261 local_irq_save(flags);
1264 td = &__get_cpu_var(tick_cpu_device); 1262 __hrtimer_peek_ahead_timers();
1265 if (td && td->evtdev)
1266 hrtimer_interrupt(td->evtdev);
1267 local_irq_restore(flags); 1263 local_irq_restore(flags);
1268} 1264}
1269 1265
1270#endif /* CONFIG_HIGH_RES_TIMERS */ 1266static void run_hrtimer_softirq(struct softirq_action *h)
1267{
1268 hrtimer_peek_ahead_timers();
1269}
1270
1271#else /* CONFIG_HIGH_RES_TIMERS */
1272
1273static inline void __hrtimer_peek_ahead_timers(void) { }
1274
1275#endif /* !CONFIG_HIGH_RES_TIMERS */
1271 1276
1272/* 1277/*
1273 * Called from timer softirq every jiffy, expire hrtimers: 1278 * Called from timer softirq every jiffy, expire hrtimers:
@@ -1513,39 +1518,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1513 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1518 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1514 timer->base = new_base; 1519 timer->base = new_base;
1515 /* 1520 /*
1516 * Enqueue the timers on the new cpu, but do not reprogram 1521 * Enqueue the timers on the new cpu. This does not
1517 * the timer as that would enable a deadlock between 1522 * reprogram the event device in case the timer
1518 * hrtimer_enqueue_reprogramm() running the timer and us still 1523 * expires before the earliest on this CPU, but we run
1519 * holding a nested base lock. 1524 * hrtimer_interrupt after we migrated everything to
1520 * 1525 * sort out already expired timers and reprogram the
1521 * Instead we tickle the hrtimer interrupt after the migration 1526 * event device.
1522 * is done, which will run all expired timers and re-programm
1523 * the timer device.
1524 */ 1527 */
1525 enqueue_hrtimer(timer, new_base, 0); 1528 enqueue_hrtimer(timer, new_base);
1526 1529
1527 /* Clear the migration state bit */ 1530 /* Clear the migration state bit */
1528 timer->state &= ~HRTIMER_STATE_MIGRATE; 1531 timer->state &= ~HRTIMER_STATE_MIGRATE;
1529 } 1532 }
1530} 1533}
1531 1534
1532static int migrate_hrtimers(int scpu) 1535static void migrate_hrtimers(int scpu)
1533{ 1536{
1534 struct hrtimer_cpu_base *old_base, *new_base; 1537 struct hrtimer_cpu_base *old_base, *new_base;
1535 int dcpu, i; 1538 int i;
1536 1539
1537 BUG_ON(cpu_online(scpu)); 1540 BUG_ON(cpu_online(scpu));
1538 old_base = &per_cpu(hrtimer_bases, scpu);
1539 new_base = &get_cpu_var(hrtimer_bases);
1540
1541 dcpu = smp_processor_id();
1542
1543 tick_cancel_sched_timer(scpu); 1541 tick_cancel_sched_timer(scpu);
1542
1543 local_irq_disable();
1544 old_base = &per_cpu(hrtimer_bases, scpu);
1545 new_base = &__get_cpu_var(hrtimer_bases);
1544 /* 1546 /*
1545 * The caller is globally serialized and nobody else 1547 * The caller is globally serialized and nobody else
1546 * takes two locks at once, deadlock is not possible. 1548 * takes two locks at once, deadlock is not possible.
1547 */ 1549 */
1548 spin_lock_irq(&new_base->lock); 1550 spin_lock(&new_base->lock);
1549 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1551 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1550 1552
1551 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1553 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1554,15 +1556,11 @@ static int migrate_hrtimers(int scpu)
1554 } 1556 }
1555 1557
1556 spin_unlock(&old_base->lock); 1558 spin_unlock(&old_base->lock);
1557 spin_unlock_irq(&new_base->lock); 1559 spin_unlock(&new_base->lock);
1558 put_cpu_var(hrtimer_bases);
1559 1560
1560 return dcpu; 1561 /* Check, if we got expired work to do */
1561} 1562 __hrtimer_peek_ahead_timers();
1562 1563 local_irq_enable();
1563static void tickle_timers(void *arg)
1564{
1565 hrtimer_peek_ahead_timers();
1566} 1564}
1567 1565
1568#endif /* CONFIG_HOTPLUG_CPU */ 1566#endif /* CONFIG_HOTPLUG_CPU */
@@ -1583,11 +1581,8 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1583 case CPU_DEAD: 1581 case CPU_DEAD:
1584 case CPU_DEAD_FROZEN: 1582 case CPU_DEAD_FROZEN:
1585 { 1583 {
1586 int dcpu;
1587
1588 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); 1584 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1589 dcpu = migrate_hrtimers(scpu); 1585 migrate_hrtimers(scpu);
1590 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1591 break; 1586 break;
1592 } 1587 }
1593#endif 1588#endif
@@ -1608,6 +1603,9 @@ void __init hrtimers_init(void)
1608 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1603 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1609 (void *)(long)smp_processor_id()); 1604 (void *)(long)smp_processor_id());
1610 register_cpu_notifier(&hrtimers_nb); 1605 register_cpu_notifier(&hrtimers_nb);
1606#ifdef CONFIG_HIGH_RES_TIMERS
1607 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1608#endif
1611} 1609}
1612 1610
1613/** 1611/**
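
Taken together, the hrtimer.c hunks split enqueueing from reprogramming: enqueue_hrtimer() now only reports whether the timer became the leftmost one, the caller decides whether to reprogram, and an already-expired timer is no longer run from the enqueue path but deferred to HRTIMER_SOFTIRQ. A condensed restatement of the new start path, taken from the hunks above, for orientation:

	/* inside hrtimer_start_range_ns(), once the timer sits on new_base */
	leftmost = enqueue_hrtimer(timer, new_base);

	/*
	 * Reprogram only when this timer is now the earliest one and it was
	 * enqueued on the local CPU; if reprogramming finds it already
	 * expired, hrtimer_enqueue_reprogram() raises HRTIMER_SOFTIRQ and
	 * run_hrtimer_softirq() -> hrtimer_peek_ahead_timers() runs it.
	 */
	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
		hrtimer_enqueue_reprogram(timer, new_base);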
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..1de9700f416e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/async.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
@@ -34,6 +35,10 @@ unsigned long probe_irq_on(void)
34 unsigned int status; 35 unsigned int status;
35 int i; 36 int i;
36 37
38 /*
39 * quiesce the kernel, or at least the asynchronous portion
40 */
41 async_synchronize_full();
37 mutex_lock(&probing_active); 42 mutex_lock(&probing_active);
38 /* 43 /*
39 * something may have generated an irq long ago and we want to 44 * something may have generated an irq long ago and we want to
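
The async_synchronize_full() call matters because boot-time initialisation can now run asynchronously; probe_irq_on() has to wait for that work so that interrupts from half-initialised hardware do not poison autoprobing. A hypothetical driver-side counterpart (the function names below are made up; only the async API calls come from this series):

#include <linux/async.h>

/* slow, interrupt-generating hardware bring-up, run off the boot path */
static void slow_hw_init(void *data, async_cookie_t cookie)
{
	/* ... lengthy device initialisation ... */
}

static int __init my_driver_init(void)
{
	async_schedule(slow_hw_init, NULL);	/* returns immediately */
	return 0;
}

probe_irq_on() now flushes all such outstanding work with async_synchronize_full() before it starts sampling pending interrupts.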
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6eb3c7952b64..f63c706d25e1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
46 desc->irq_count = 0; 46 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 cpus_setall(desc->affinity); 49 cpumask_setall(&desc->affinity);
50#endif 50#endif
51 spin_unlock_irqrestore(&desc->lock, flags); 51 spin_unlock_irqrestore(&desc->lock, flags);
52} 52}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 540f6c49f3fa..cd0cd8dcb345 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -16,8 +16,15 @@
16#include "internals.h" 16#include "internals.h"
17 17
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19cpumask_var_t irq_default_affinity;
19 20
20cpumask_t irq_default_affinity = CPU_MASK_ALL; 21static int init_irq_default_affinity(void)
22{
23 alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
24 cpumask_setall(irq_default_affinity);
25 return 0;
26}
27core_initcall(init_irq_default_affinity);
21 28
22/** 29/**
23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 30 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -79,7 +86,7 @@ int irq_can_set_affinity(unsigned int irq)
79 * @cpumask: cpumask 86 * @cpumask: cpumask
80 * 87 *
81 */ 88 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 89int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
83{ 90{
84 struct irq_desc *desc = irq_to_desc(irq); 91 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 92 unsigned long flags;
@@ -91,14 +98,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
91 98
92#ifdef CONFIG_GENERIC_PENDING_IRQ 99#ifdef CONFIG_GENERIC_PENDING_IRQ
93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 100 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
94 desc->affinity = cpumask; 101 cpumask_copy(&desc->affinity, cpumask);
95 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
96 } else { 103 } else {
97 desc->status |= IRQ_MOVE_PENDING; 104 desc->status |= IRQ_MOVE_PENDING;
98 desc->pending_mask = cpumask; 105 cpumask_copy(&desc->pending_mask, cpumask);
99 } 106 }
100#else 107#else
101 desc->affinity = cpumask; 108 cpumask_copy(&desc->affinity, cpumask);
102 desc->chip->set_affinity(irq, cpumask); 109 desc->chip->set_affinity(irq, cpumask);
103#endif 110#endif
104 desc->status |= IRQ_AFFINITY_SET; 111 desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +119,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
112 */ 119 */
113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 120int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
114{ 121{
115 cpumask_t mask;
116
117 if (!irq_can_set_affinity(irq)) 122 if (!irq_can_set_affinity(irq))
118 return 0; 123 return 0;
119 124
120 cpus_and(mask, cpu_online_map, irq_default_affinity);
121
122 /* 125 /*
123 * Preserve an userspace affinity setup, but make sure that 126 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online. 127 * one of the targets is online.
125 */ 128 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 129 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map)) 130 if (cpumask_any_and(&desc->affinity, cpu_online_mask)
128 mask = desc->affinity; 131 < nr_cpu_ids)
132 goto set_affinity;
129 else 133 else
130 desc->status &= ~IRQ_AFFINITY_SET; 134 desc->status &= ~IRQ_AFFINITY_SET;
131 } 135 }
132 136
133 desc->affinity = mask; 137 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
134 desc->chip->set_affinity(irq, mask); 138set_affinity:
139 desc->chip->set_affinity(irq, &desc->affinity);
135 140
136 return 0; 141 return 0;
137} 142}
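
Note the new irq_set_affinity() prototype: it takes a const struct cpumask * rather than a cpumask_t by value, and irq_default_affinity is now an allocated cpumask_var_t. Callers can therefore pass shared, read-only masks directly, for instance (illustrative fragment):

	/* pin an interrupt to CPU 0; cpumask_of() returns a shared const mask */
	irq_set_affinity(irq, cpumask_of(0));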
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d95814..bd72329e630c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
4void move_masked_irq(int irq) 4void move_masked_irq(int irq)
5{ 5{
6 struct irq_desc *desc = irq_to_desc(irq); 6 struct irq_desc *desc = irq_to_desc(irq);
7 cpumask_t tmp;
8 7
9 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 8 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
10 return; 9 return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
19 18
20 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
21 20
22 if (unlikely(cpus_empty(desc->pending_mask))) 21 if (unlikely(cpumask_empty(&desc->pending_mask)))
23 return; 22 return;
24 23
25 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
27 26
28 assert_spin_locked(&desc->lock); 27 assert_spin_locked(&desc->lock);
29 28
30 cpus_and(tmp, desc->pending_mask, cpu_online_map);
31
32 /* 29 /*
33 * If there was a valid mask to work with, please 30 * If there was a valid mask to work with, please
34 * do the disable, re-program, enable sequence. 31 * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
41 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
42 * masking the irqs. 39 * masking the irqs.
43 */ 40 */
44 if (likely(!cpus_empty(tmp))) { 41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
45 desc->chip->set_affinity(irq,tmp); 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity,
44 &desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity);
46 } 46 }
47 cpus_clear(desc->pending_mask); 47 cpumask_clear(&desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f6b3440f05bc..aae3f742bcec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 const struct cpumask *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
40 const char __user *buffer, size_t count, loff_t *pos) 40 const char __user *buffer, size_t count, loff_t *pos)
41{ 41{
42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
43 cpumask_t new_value; 43 cpumask_var_t new_value;
44 int err; 44 int err;
45 45
46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
47 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
48 return -EIO; 48 return -EIO;
49 49
50 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
51 return -ENOMEM;
52
50 err = cpumask_parse_user(buffer, count, new_value); 53 err = cpumask_parse_user(buffer, count, new_value);
51 if (err) 54 if (err)
52 return err; 55 goto free_cpumask;
53 56
54 if (!is_affinity_mask_valid(new_value)) 57 if (!is_affinity_mask_valid(new_value)) {
55 return -EINVAL; 58 err = -EINVAL;
59 goto free_cpumask;
60 }
56 61
57 /* 62 /*
58 * Do not allow disabling IRQs completely - it's a too easy 63 * Do not allow disabling IRQs completely - it's a too easy
59 * way to make the system unusable accidentally :-) At least 64 * way to make the system unusable accidentally :-) At least
60 * one online CPU still has to be targeted. 65 * one online CPU still has to be targeted.
61 */ 66 */
62 if (!cpus_intersects(new_value, cpu_online_map)) 67 if (!cpumask_intersects(new_value, cpu_online_mask)) {
63 /* Special case for empty set - allow the architecture 68 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 69 code to set default SMP affinity. */
65 return irq_select_affinity_usr(irq) ? -EINVAL : count; 70 err = irq_select_affinity_usr(irq) ? -EINVAL : count;
66 71 } else {
67 irq_set_affinity(irq, new_value); 72 irq_set_affinity(irq, new_value);
73 err = count;
74 }
68 75
69 return count; 76free_cpumask:
77 free_cpumask_var(new_value);
78 return err;
70} 79}
71 80
72static int irq_affinity_proc_open(struct inode *inode, struct file *file) 81static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -84,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
84 93
85static int default_affinity_show(struct seq_file *m, void *v) 94static int default_affinity_show(struct seq_file *m, void *v)
86{ 95{
87 seq_cpumask(m, &irq_default_affinity); 96 seq_cpumask(m, irq_default_affinity);
88 seq_putc(m, '\n'); 97 seq_putc(m, '\n');
89 return 0; 98 return 0;
90} 99}
@@ -92,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
92static ssize_t default_affinity_write(struct file *file, 101static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos) 102 const char __user *buffer, size_t count, loff_t *ppos)
94{ 103{
95 cpumask_t new_value; 104 cpumask_var_t new_value;
96 int err; 105 int err;
97 106
107 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
108 return -ENOMEM;
109
98 err = cpumask_parse_user(buffer, count, new_value); 110 err = cpumask_parse_user(buffer, count, new_value);
99 if (err) 111 if (err)
100 return err; 112 goto out;
101 113
102 if (!is_affinity_mask_valid(new_value)) 114 if (!is_affinity_mask_valid(new_value)) {
103 return -EINVAL; 115 err = -EINVAL;
116 goto out;
117 }
104 118
105 /* 119 /*
106 * Do not allow disabling IRQs completely - it's a too easy 120 * Do not allow disabling IRQs completely - it's a too easy
107 * way to make the system unusable accidentally :-) At least 121 * way to make the system unusable accidentally :-) At least
108 * one online CPU still has to be targeted. 122 * one online CPU still has to be targeted.
109 */ 123 */
110 if (!cpus_intersects(new_value, cpu_online_map)) 124 if (!cpumask_intersects(new_value, cpu_online_mask)) {
111 return -EINVAL; 125 err = -EINVAL;
126 goto out;
127 }
112 128
113 irq_default_affinity = new_value; 129 cpumask_copy(irq_default_affinity, new_value);
130 err = count;
114 131
115 return count; 132out:
133 free_cpumask_var(new_value);
134 return err;
116} 135}
117 136
118static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
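
Both proc write handlers now follow the same alloc/parse/validate shape with a single exit label, so the temporary cpumask_var_t is freed on every branch. A condensed sketch of that shape (write_cpumask is a hypothetical helper; the cpumask calls are the ones used above):

static ssize_t write_cpumask(const char __user *buffer, size_t count,
			     struct cpumask *target)
{
	cpumask_var_t new_value;
	int err;

	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buffer, count, new_value);
	if (err)
		goto out;

	if (!cpumask_intersects(new_value, cpu_online_mask)) {
		err = -EINVAL;		/* never allow an all-offline mask */
		goto out;
	}

	cpumask_copy(target, new_value);
	err = count;
out:
	free_cpumask_var(new_value);	/* freed on success and on every error */
	return err;
}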
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ac0fde7b54d0..3fb855ad6aa0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1116,7 +1116,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1116 struct elf_prstatus prstatus; 1116 struct elf_prstatus prstatus;
1117 u32 *buf; 1117 u32 *buf;
1118 1118
1119 if ((cpu < 0) || (cpu >= NR_CPUS)) 1119 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1120 return; 1120 return;
1121 1121
1122 /* Using ELF notes here is opportunistic. 1122 /* Using ELF notes here is opportunistic.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b46dbb908669..a27a5f64443d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * request_module - try to load a kernel module
54 * @fmt: printf style format string for the name of the module 54 * @fmt: printf style format string for the name of the module
55 * @varargs: arguements as specified in the format string 55 * @...: arguments as specified in the format string
56 * 56 *
57 * Load a module using the user mode module loader. The function returns 57 * Load a module using the user mode module loader. The function returns
58 * zero on success or a negative errno code on failure. Note that a 58 * zero on success or a negative errno code on failure. Note that a
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f8a3f25259a..1b9cbdc0127a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69/* NOTE: change this value only with kprobe_mutex held */ 69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 70static bool kprobe_enabled;
71 71
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned_in_smp; 75 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -115,6 +115,7 @@ enum kprobe_slot_state {
115 SLOT_USED = 2, 115 SLOT_USED = 2,
116}; 116};
117 117
118static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
118static struct hlist_head kprobe_insn_pages; 119static struct hlist_head kprobe_insn_pages;
119static int kprobe_garbage_slots; 120static int kprobe_garbage_slots;
120static int collect_garbage_slots(void); 121static int collect_garbage_slots(void);
@@ -144,10 +145,10 @@ loop_end:
144} 145}
145 146
146/** 147/**
147 * get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
148 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
149 */ 150 */
150kprobe_opcode_t __kprobes *get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(void)
151{ 152{
152 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
153 struct hlist_node *pos; 154 struct hlist_node *pos;
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
196 return kip->insns; 197 return kip->insns;
197} 198}
198 199
200kprobe_opcode_t __kprobes *get_insn_slot(void)
201{
202 kprobe_opcode_t *ret;
203 mutex_lock(&kprobe_insn_mutex);
204 ret = __get_insn_slot();
205 mutex_unlock(&kprobe_insn_mutex);
206 return ret;
207}
208
199/* Return 1 if all garbages are collected, otherwise 0. */ 209/* Return 1 if all garbages are collected, otherwise 0. */
200static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 210static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
201{ 211{
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void)
226{ 236{
227 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
228 struct hlist_node *pos, *next; 238 struct hlist_node *pos, *next;
239 int safety;
229 240
230 /* Ensure no-one is preempted on the garbages */ 241 mutex_unlock(&kprobe_insn_mutex);
231 if (check_safety() != 0) 242 mutex_unlock(&kprobe_insn_mutex);
243 safety = check_safety();
244 mutex_lock(&kprobe_insn_mutex);
245 if (safety != 0)
232 return -EAGAIN; 246 return -EAGAIN;
233 247
234 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 248 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
251 struct kprobe_insn_page *kip; 265 struct kprobe_insn_page *kip;
252 struct hlist_node *pos; 266 struct hlist_node *pos;
253 267
268 mutex_lock(&kprobe_insn_mutex);
254 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 269 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
255 if (kip->insns <= slot && 270 if (kip->insns <= slot &&
256 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 271 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
267 282
268 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 283 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
269 collect_garbage_slots(); 284 collect_garbage_slots();
285
286 mutex_unlock(&kprobe_insn_mutex);
270} 287}
271#endif 288#endif
272 289
@@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
310 struct kprobe *kp; 327 struct kprobe *kp;
311 328
312 list_for_each_entry_rcu(kp, &p->list, list) { 329 list_for_each_entry_rcu(kp, &p->list, list) {
313 if (kp->pre_handler) { 330 if (kp->pre_handler && !kprobe_gone(kp)) {
314 set_kprobe_instance(kp); 331 set_kprobe_instance(kp);
315 if (kp->pre_handler(kp, regs)) 332 if (kp->pre_handler(kp, regs))
316 return 1; 333 return 1;
@@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
326 struct kprobe *kp; 343 struct kprobe *kp;
327 344
328 list_for_each_entry_rcu(kp, &p->list, list) { 345 list_for_each_entry_rcu(kp, &p->list, list) {
329 if (kp->post_handler) { 346 if (kp->post_handler && !kprobe_gone(kp)) {
330 set_kprobe_instance(kp); 347 set_kprobe_instance(kp);
331 kp->post_handler(kp, regs, flags); 348 kp->post_handler(kp, regs, flags);
332 reset_kprobe_instance(); 349 reset_kprobe_instance();
@@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
393 hlist_add_head(&ri->hlist, head); 410 hlist_add_head(&ri->hlist, head);
394} 411}
395 412
396void kretprobe_hash_lock(struct task_struct *tsk, 413void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags) 414 struct hlist_head **head, unsigned long *flags)
398{ 415{
399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 416 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 421 spin_lock_irqsave(hlist_lock, *flags);
405} 422}
406 423
407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 424static void __kprobes kretprobe_table_lock(unsigned long hash,
425 unsigned long *flags)
408{ 426{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 427 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 428 spin_lock_irqsave(hlist_lock, *flags);
411} 429}
412 430
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) 431void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
432 unsigned long *flags)
414{ 433{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 434 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock; 435 spinlock_t *hlist_lock;
@@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
419 spin_unlock_irqrestore(hlist_lock, *flags); 438 spin_unlock_irqrestore(hlist_lock, *flags);
420} 439}
421 440
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 441void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{ 442{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 443 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags); 444 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
526 ap->addr = p->addr; 545 ap->addr = p->addr;
527 ap->pre_handler = aggr_pre_handler; 546 ap->pre_handler = aggr_pre_handler;
528 ap->fault_handler = aggr_fault_handler; 547 ap->fault_handler = aggr_fault_handler;
529 if (p->post_handler) 548 /* We don't care about a kprobe which has gone. */
549 if (p->post_handler && !kprobe_gone(p))
530 ap->post_handler = aggr_post_handler; 550 ap->post_handler = aggr_post_handler;
531 if (p->break_handler) 551 if (p->break_handler && !kprobe_gone(p))
532 ap->break_handler = aggr_break_handler; 552 ap->break_handler = aggr_break_handler;
533 553
534 INIT_LIST_HEAD(&ap->list); 554 INIT_LIST_HEAD(&ap->list);
@@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
547 int ret = 0; 567 int ret = 0;
548 struct kprobe *ap; 568 struct kprobe *ap;
549 569
570 if (kprobe_gone(old_p)) {
571 /*
572 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe.
576 */
577 ret = arch_prepare_kprobe(old_p);
578 if (ret)
579 return ret;
580 }
550 if (old_p->pre_handler == aggr_pre_handler) { 581 if (old_p->pre_handler == aggr_pre_handler) {
551 copy_kprobe(old_p, p); 582 copy_kprobe(old_p, p);
552 ret = add_new_kprobe(old_p, p); 583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
553 } else { 585 } else {
554 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
555 if (!ap) 587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
556 return -ENOMEM; 590 return -ENOMEM;
591 }
557 add_aggr_kprobe(ap, old_p); 592 add_aggr_kprobe(ap, old_p);
558 copy_kprobe(ap, p); 593 copy_kprobe(ap, p);
559 ret = add_new_kprobe(ap, p); 594 ret = add_new_kprobe(ap, p);
560 } 595 }
596 if (kprobe_gone(old_p)) {
597 /*
598 * If the old_p has gone, its breakpoint has been disarmed.
599 * We have to arm it again after preparing real kprobes.
600 */
601 ap->flags &= ~KPROBE_FLAG_GONE;
602 if (kprobe_enabled)
603 arch_arm_kprobe(ap);
604 }
561 return ret; 605 return ret;
562} 606}
563 607
@@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
600 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 644 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
601} 645}
602 646
603static int __kprobes __register_kprobe(struct kprobe *p, 647int __kprobes register_kprobe(struct kprobe *p)
604 unsigned long called_from)
605{ 648{
606 int ret = 0; 649 int ret = 0;
607 struct kprobe *old_p; 650 struct kprobe *old_p;
@@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p,
620 return -EINVAL; 663 return -EINVAL;
621 } 664 }
622 665
623 p->mod_refcounted = 0; 666 p->flags = 0;
624
625 /* 667 /*
626 * Check if we are probing a module. 668 * Check if we are probing a module.
627 */ 669 */
628 probed_mod = __module_text_address((unsigned long) p->addr); 670 probed_mod = __module_text_address((unsigned long) p->addr);
629 if (probed_mod) { 671 if (probed_mod) {
630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
632 /* 672 /*
633 * We must allow modules to probe themself and in this case 673 * We must hold a refcount of the probed module while updating
634 * avoid incrementing the module refcount, so as to allow 674 * its code to prohibit unexpected unloading.
635 * unloading of self probing modules.
636 */ 675 */
637 if (calling_mod && calling_mod != probed_mod) { 676 if (unlikely(!try_module_get(probed_mod))) {
638 if (unlikely(!try_module_get(probed_mod))) { 677 preempt_enable();
639 preempt_enable(); 678 return -EINVAL;
640 return -EINVAL; 679 }
641 } 680 /*
642 p->mod_refcounted = 1; 681 * If the module freed .init.text, we couldn't insert
643 } else 682 * kprobes in there.
644 probed_mod = NULL; 683 */
684 if (within_module_init((unsigned long)p->addr, probed_mod) &&
685 probed_mod->state != MODULE_STATE_COMING) {
686 module_put(probed_mod);
687 preempt_enable();
688 return -EINVAL;
689 }
645 } 690 }
646 preempt_enable(); 691 preempt_enable();
647 692
@@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
668out: 713out:
669 mutex_unlock(&kprobe_mutex); 714 mutex_unlock(&kprobe_mutex);
670 715
671 if (ret && probed_mod) 716 if (probed_mod)
672 module_put(probed_mod); 717 module_put(probed_mod);
718
673 return ret; 719 return ret;
674} 720}
675 721
@@ -697,16 +743,16 @@ valid_p:
697 list_is_singular(&old_p->list))) { 743 list_is_singular(&old_p->list))) {
698 /* 744 /*
699 * Only probe on the hash list. Disarm only if kprobes are 745 * Only probe on the hash list. Disarm only if kprobes are
700 * enabled - otherwise, the breakpoint would already have 746 * enabled and not gone - otherwise, the breakpoint would
701 * been removed. We save on flushing icache. 747 * already have been removed. We save on flushing icache.
702 */ 748 */
703 if (kprobe_enabled) 749 if (kprobe_enabled && !kprobe_gone(old_p))
704 arch_disarm_kprobe(p); 750 arch_disarm_kprobe(p);
705 hlist_del_rcu(&old_p->hlist); 751 hlist_del_rcu(&old_p->hlist);
706 } else { 752 } else {
707 if (p->break_handler) 753 if (p->break_handler && !kprobe_gone(p))
708 old_p->break_handler = NULL; 754 old_p->break_handler = NULL;
709 if (p->post_handler) { 755 if (p->post_handler && !kprobe_gone(p)) {
710 list_for_each_entry_rcu(list_p, &old_p->list, list) { 756 list_for_each_entry_rcu(list_p, &old_p->list, list) {
711 if ((list_p != p) && (list_p->post_handler)) 757 if ((list_p != p) && (list_p->post_handler))
712 goto noclean; 758 goto noclean;
@@ -721,39 +767,27 @@ noclean:
721 767
722static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 768static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
723{ 769{
724 struct module *mod;
725 struct kprobe *old_p; 770 struct kprobe *old_p;
726 771
727 if (p->mod_refcounted) { 772 if (list_empty(&p->list))
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
732 mod = module_text_address((unsigned long)p->addr);
733 if (mod)
734 module_put(mod);
735 }
736
737 if (list_empty(&p->list) || list_is_singular(&p->list)) {
738 if (!list_empty(&p->list)) {
739 /* "p" is the last child of an aggr_kprobe */
740 old_p = list_entry(p->list.next, struct kprobe, list);
741 list_del(&p->list);
742 kfree(old_p);
743 }
744 arch_remove_kprobe(p); 773 arch_remove_kprobe(p);
774 else if (list_is_singular(&p->list)) {
775 /* "p" is the last child of an aggr_kprobe */
776 old_p = list_entry(p->list.next, struct kprobe, list);
777 list_del(&p->list);
778 arch_remove_kprobe(old_p);
779 kfree(old_p);
745 } 780 }
746} 781}
747 782
748static int __register_kprobes(struct kprobe **kps, int num, 783int __kprobes register_kprobes(struct kprobe **kps, int num)
749 unsigned long called_from)
750{ 784{
751 int i, ret = 0; 785 int i, ret = 0;
752 786
753 if (num <= 0) 787 if (num <= 0)
754 return -EINVAL; 788 return -EINVAL;
755 for (i = 0; i < num; i++) { 789 for (i = 0; i < num; i++) {
756 ret = __register_kprobe(kps[i], called_from); 790 ret = register_kprobe(kps[i]);
757 if (ret < 0) { 791 if (ret < 0) {
758 if (i > 0) 792 if (i > 0)
759 unregister_kprobes(kps, i); 793 unregister_kprobes(kps, i);
@@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num,
763 return ret; 797 return ret;
764} 798}
765 799
766/*
767 * Registration and unregistration functions for kprobe.
768 */
769int __kprobes register_kprobe(struct kprobe *p)
770{
771 return __register_kprobes(&p, 1,
772 (unsigned long)__builtin_return_address(0));
773}
774
775void __kprobes unregister_kprobe(struct kprobe *p) 800void __kprobes unregister_kprobe(struct kprobe *p)
776{ 801{
777 unregister_kprobes(&p, 1); 802 unregister_kprobes(&p, 1);
778} 803}
779 804
780int __kprobes register_kprobes(struct kprobe **kps, int num)
781{
782 return __register_kprobes(kps, num,
783 (unsigned long)__builtin_return_address(0));
784}
785
786void __kprobes unregister_kprobes(struct kprobe **kps, int num) 805void __kprobes unregister_kprobes(struct kprobe **kps, int num)
787{ 806{
788 int i; 807 int i;
@@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
811 return (unsigned long)entry; 830 return (unsigned long)entry;
812} 831}
813 832
814static int __register_jprobes(struct jprobe **jps, int num, 833int __kprobes register_jprobes(struct jprobe **jps, int num)
815 unsigned long called_from)
816{ 834{
817 struct jprobe *jp; 835 struct jprobe *jp;
818 int ret = 0, i; 836 int ret = 0, i;
@@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
830 /* Todo: Verify probepoint is a function entry point */ 848 /* Todo: Verify probepoint is a function entry point */
831 jp->kp.pre_handler = setjmp_pre_handler; 849 jp->kp.pre_handler = setjmp_pre_handler;
832 jp->kp.break_handler = longjmp_break_handler; 850 jp->kp.break_handler = longjmp_break_handler;
833 ret = __register_kprobe(&jp->kp, called_from); 851 ret = register_kprobe(&jp->kp);
834 } 852 }
835 if (ret < 0) { 853 if (ret < 0) {
836 if (i > 0) 854 if (i > 0)
@@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
843 861
844int __kprobes register_jprobe(struct jprobe *jp) 862int __kprobes register_jprobe(struct jprobe *jp)
845{ 863{
846 return __register_jprobes(&jp, 1, 864 return register_jprobes(&jp, 1);
847 (unsigned long)__builtin_return_address(0));
848} 865}
849 866
850void __kprobes unregister_jprobe(struct jprobe *jp) 867void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
852 unregister_jprobes(&jp, 1); 869 unregister_jprobes(&jp, 1);
853} 870}
854 871
855int __kprobes register_jprobes(struct jprobe **jps, int num)
856{
857 return __register_jprobes(jps, num,
858 (unsigned long)__builtin_return_address(0));
859}
860
861void __kprobes unregister_jprobes(struct jprobe **jps, int num) 872void __kprobes unregister_jprobes(struct jprobe **jps, int num)
862{ 873{
863 int i; 874 int i;
@@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
920 return 0; 931 return 0;
921} 932}
922 933
923static int __kprobes __register_kretprobe(struct kretprobe *rp, 934int __kprobes register_kretprobe(struct kretprobe *rp)
924 unsigned long called_from)
925{ 935{
926 int ret = 0; 936 int ret = 0;
927 struct kretprobe_instance *inst; 937 struct kretprobe_instance *inst;
@@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
967 977
968 rp->nmissed = 0; 978 rp->nmissed = 0;
969 /* Establish function entry probe point */ 979 /* Establish function entry probe point */
970 ret = __register_kprobe(&rp->kp, called_from); 980 ret = register_kprobe(&rp->kp);
971 if (ret != 0) 981 if (ret != 0)
972 free_rp_inst(rp); 982 free_rp_inst(rp);
973 return ret; 983 return ret;
974} 984}
975 985
976static int __register_kretprobes(struct kretprobe **rps, int num, 986int __kprobes register_kretprobes(struct kretprobe **rps, int num)
977 unsigned long called_from)
978{ 987{
979 int ret = 0, i; 988 int ret = 0, i;
980 989
981 if (num <= 0) 990 if (num <= 0)
982 return -EINVAL; 991 return -EINVAL;
983 for (i = 0; i < num; i++) { 992 for (i = 0; i < num; i++) {
984 ret = __register_kretprobe(rps[i], called_from); 993 ret = register_kretprobe(rps[i]);
985 if (ret < 0) { 994 if (ret < 0) {
986 if (i > 0) 995 if (i > 0)
987 unregister_kretprobes(rps, i); 996 unregister_kretprobes(rps, i);
@@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
991 return ret; 1000 return ret;
992} 1001}
993 1002
994int __kprobes register_kretprobe(struct kretprobe *rp)
995{
996 return __register_kretprobes(&rp, 1,
997 (unsigned long)__builtin_return_address(0));
998}
999
1000void __kprobes unregister_kretprobe(struct kretprobe *rp) 1003void __kprobes unregister_kretprobe(struct kretprobe *rp)
1001{ 1004{
1002 unregister_kretprobes(&rp, 1); 1005 unregister_kretprobes(&rp, 1);
1003} 1006}
1004 1007
1005int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1006{
1007 return __register_kretprobes(rps, num,
1008 (unsigned long)__builtin_return_address(0));
1009}
1010
1011void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1012{ 1009{
1013 int i; 1010 int i;
@@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1055 1052
1056#endif /* CONFIG_KRETPROBES */ 1053#endif /* CONFIG_KRETPROBES */
1057 1054
1055/* Set the kprobe gone and remove its instruction buffer. */
1056static void __kprobes kill_kprobe(struct kprobe *p)
1057{
1058 struct kprobe *kp;
1059 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) {
1061 /*
1062 * If this is an aggr_kprobe, we have to list all the
1063 * chained probes and mark them GONE.
1064 */
1065 list_for_each_entry_rcu(kp, &p->list, list)
1066 kp->flags |= KPROBE_FLAG_GONE;
1067 p->post_handler = NULL;
1068 p->break_handler = NULL;
1069 }
1070 /*
1071 * Here, we can remove insn_slot safely, because no thread calls
1072 * the original probed function (which will be freed soon) any more.
1073 */
1074 arch_remove_kprobe(p);
1075}
1076
1077/* Module notifier call back, checking kprobes on the module */
1078static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1079 unsigned long val, void *data)
1080{
1081 struct module *mod = data;
1082 struct hlist_head *head;
1083 struct hlist_node *node;
1084 struct kprobe *p;
1085 unsigned int i;
1086 int checkcore = (val == MODULE_STATE_GOING);
1087
1088 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
1089 return NOTIFY_DONE;
1090
1091 /*
1092 * When MODULE_STATE_GOING is notified, both the module's .text and
1093 * .init.text sections are about to be freed. When MODULE_STATE_LIVE is
1094 * notified, only the .init.text section is freed. We need to
1095 * disable kprobes which have been inserted in the sections.
1096 */
1097 mutex_lock(&kprobe_mutex);
1098 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1099 head = &kprobe_table[i];
1100 hlist_for_each_entry_rcu(p, node, head, hlist)
1101 if (within_module_init((unsigned long)p->addr, mod) ||
1102 (checkcore &&
1103 within_module_core((unsigned long)p->addr, mod))) {
1104 /*
1105 * The vaddr this probe is installed will soon
1106 * be vfreed buy not synced to disk. Hence,
1107 * disarming the breakpoint isn't needed.
1108 */
1109 kill_kprobe(p);
1110 }
1111 }
1112 mutex_unlock(&kprobe_mutex);
1113 return NOTIFY_DONE;
1114}
1115
1116static struct notifier_block kprobe_module_nb = {
1117 .notifier_call = kprobes_module_callback,
1118 .priority = 0
1119};
1120
1058static int __init init_kprobes(void) 1121static int __init init_kprobes(void)
1059{ 1122{
1060 int i, err = 0; 1123 int i, err = 0;
@@ -1111,6 +1174,9 @@ static int __init init_kprobes(void)
1111 err = arch_init_kprobes(); 1174 err = arch_init_kprobes();
1112 if (!err) 1175 if (!err)
1113 err = register_die_notifier(&kprobe_exceptions_nb); 1176 err = register_die_notifier(&kprobe_exceptions_nb);
1177 if (!err)
1178 err = register_module_notifier(&kprobe_module_nb);
1179
1114 kprobes_initialized = (err == 0); 1180 kprobes_initialized = (err == 0);
1115 1181
1116 if (!err) 1182 if (!err)
@@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1131 else 1197 else
1132 kprobe_type = "k"; 1198 kprobe_type = "k";
1133 if (sym) 1199 if (sym)
1134 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, 1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
1135 sym, offset, (modname ? modname : " ")); 1201 sym, offset, (modname ? modname : " "),
1202 (kprobe_gone(p) ? "[GONE]" : ""));
1136 else 1203 else
1137 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); 1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
1205 (kprobe_gone(p) ? "[GONE]" : ""));
1138} 1206}
1139 1207
1140static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void)
1215 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1216 head = &kprobe_table[i]; 1284 head = &kprobe_table[i];
1217 hlist_for_each_entry_rcu(p, node, head, hlist) 1285 hlist_for_each_entry_rcu(p, node, head, hlist)
1218 arch_arm_kprobe(p); 1286 if (!kprobe_gone(p))
1287 arch_arm_kprobe(p);
1219 } 1288 }
1220 1289
1221 kprobe_enabled = true; 1290 kprobe_enabled = true;
@@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void)
1244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1245 head = &kprobe_table[i]; 1314 head = &kprobe_table[i];
1246 hlist_for_each_entry_rcu(p, node, head, hlist) { 1315 hlist_for_each_entry_rcu(p, node, head, hlist) {
1247 if (!arch_trampoline_kprobe(p)) 1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
1248 arch_disarm_kprobe(p); 1317 arch_disarm_kprobe(p);
1249 } 1318 }
1250 } 1319 }
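
The notifier registered above goes through the generic module-notifier machinery; a minimal sketch of another consumer of the same interface (every name below is hypothetical, only register_module_notifier(), struct notifier_block and the MODULE_STATE_* values are taken from the kernel API used by this patch):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

/* Hypothetical callback: invoked on every module state transition. */
static int example_module_callback(struct notifier_block *nb,
				   unsigned long val, void *data)
{
	struct module *mod = data;

	if (val == MODULE_STATE_GOING)
		pr_info("module %s is going away\n", mod->name);
	return NOTIFY_DONE;
}

static struct notifier_block example_module_nb = {
	.notifier_call = example_module_callback,
	.priority = 0,
};

static int __init example_init(void)
{
	/* Same registration call that init_kprobes() now makes. */
	return register_module_notifier(&example_module_nb);
}

static void __exit example_exit(void)
{
	unregister_module_notifier(&example_module_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
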
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 08dd8ed86c77..528dd78e7e7e 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
24static struct kobj_attribute _name##_attr = \ 24static struct kobj_attribute _name##_attr = \
25 __ATTR(_name, 0644, _name##_show, _name##_store) 25 __ATTR(_name, 0644, _name##_show, _name##_store)
26 26
27#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 27#if defined(CONFIG_HOTPLUG)
28/* current uevent sequence number */ 28/* current uevent sequence number */
29static ssize_t uevent_seqnum_show(struct kobject *kobj, 29static ssize_t uevent_seqnum_show(struct kobject *kobj,
30 struct kobj_attribute *attr, char *buf) 30 struct kobj_attribute *attr, char *buf)
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj;
137EXPORT_SYMBOL_GPL(kernel_kobj); 137EXPORT_SYMBOL_GPL(kernel_kobj);
138 138
139static struct attribute * kernel_attrs[] = { 139static struct attribute * kernel_attrs[] = {
140#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 140#if defined(CONFIG_HOTPLUG)
141 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
142 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
143#endif 143#endif
diff --git a/kernel/module.c b/kernel/module.c
index dd2a54155b54..c9332c90d5a0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/device.h> 43#include <linux/device.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/unwind.h>
47#include <linux/rculist.h> 46#include <linux/rculist.h>
48#include <asm/uaccess.h> 47#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
@@ -51,6 +50,7 @@
51#include <asm/sections.h> 50#include <asm/sections.h>
52#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
53#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h>
54 54
55#if 0 55#if 0
56#define DEBUGP printk 56#define DEBUGP printk
@@ -757,8 +757,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
757 return -EFAULT; 757 return -EFAULT;
758 name[MODULE_NAME_LEN-1] = '\0'; 758 name[MODULE_NAME_LEN-1] = '\0';
759 759
760 if (mutex_lock_interruptible(&module_mutex) != 0) 760 /* Create stop_machine threads since free_module relies on
761 return -EINTR; 761 * a non-failing stop_machine call. */
762 ret = stop_machine_create();
763 if (ret)
764 return ret;
765
766 if (mutex_lock_interruptible(&module_mutex) != 0) {
767 ret = -EINTR;
768 goto out_stop;
769 }
762 770
763 mod = find_module(name); 771 mod = find_module(name);
764 if (!mod) { 772 if (!mod) {
@@ -809,6 +817,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
809 mod->exit(); 817 mod->exit();
810 blocking_notifier_call_chain(&module_notify_list, 818 blocking_notifier_call_chain(&module_notify_list,
811 MODULE_STATE_GOING, mod); 819 MODULE_STATE_GOING, mod);
820 async_synchronize_full();
812 mutex_lock(&module_mutex); 821 mutex_lock(&module_mutex);
813 /* Store the name of the last unloaded module for diagnostic purposes */ 822 /* Store the name of the last unloaded module for diagnostic purposes */
814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 823 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
@@ -817,10 +826,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
817 826
818 out: 827 out:
819 mutex_unlock(&module_mutex); 828 mutex_unlock(&module_mutex);
829out_stop:
830 stop_machine_destroy();
820 return ret; 831 return ret;
821} 832}
822 833
823static void print_unload_info(struct seq_file *m, struct module *mod) 834static inline void print_unload_info(struct seq_file *m, struct module *mod)
824{ 835{
825 struct module_use *use; 836 struct module_use *use;
826 int printed_something = 0; 837 int printed_something = 0;
@@ -893,7 +904,7 @@ void module_put(struct module *module)
893EXPORT_SYMBOL(module_put); 904EXPORT_SYMBOL(module_put);
894 905
895#else /* !CONFIG_MODULE_UNLOAD */ 906#else /* !CONFIG_MODULE_UNLOAD */
896static void print_unload_info(struct seq_file *m, struct module *mod) 907static inline void print_unload_info(struct seq_file *m, struct module *mod)
897{ 908{
898 /* We don't know the usage count, or what modules are using. */ 909 /* We don't know the usage count, or what modules are using. */
899 seq_printf(m, " - -"); 910 seq_printf(m, " - -");
@@ -1439,8 +1450,6 @@ static void free_module(struct module *mod)
1439 remove_sect_attrs(mod); 1450 remove_sect_attrs(mod);
1440 mod_kobject_remove(mod); 1451 mod_kobject_remove(mod);
1441 1452
1442 unwind_remove_table(mod->unwind_info, 0);
1443
1444 /* Arch-specific cleanup. */ 1453 /* Arch-specific cleanup. */
1445 module_arch_cleanup(mod); 1454 module_arch_cleanup(mod);
1446 1455
@@ -1578,11 +1587,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1578 return ret; 1587 return ret;
1579} 1588}
1580 1589
1590/* Additional bytes needed by arch in front of individual sections */
1591unsigned int __weak arch_mod_section_prepend(struct module *mod,
1592 unsigned int section)
1593{
1594 /* default implementation just returns zero */
1595 return 0;
1596}
1597
1581/* Update size with this section: return offset. */ 1598/* Update size with this section: return offset. */
1582static long get_offset(unsigned int *size, Elf_Shdr *sechdr) 1599static long get_offset(struct module *mod, unsigned int *size,
1600 Elf_Shdr *sechdr, unsigned int section)
1583{ 1601{
1584 long ret; 1602 long ret;
1585 1603
1604 *size += arch_mod_section_prepend(mod, section);
1586 ret = ALIGN(*size, sechdr->sh_addralign ?: 1); 1605 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1587 *size = ret + sechdr->sh_size; 1606 *size = ret + sechdr->sh_size;
1588 return ret; 1607 return ret;
@@ -1622,7 +1641,7 @@ static void layout_sections(struct module *mod,
1622 || strncmp(secstrings + s->sh_name, 1641 || strncmp(secstrings + s->sh_name,
1623 ".init", 5) == 0) 1642 ".init", 5) == 0)
1624 continue; 1643 continue;
1625 s->sh_entsize = get_offset(&mod->core_size, s); 1644 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1626 DEBUGP("\t%s\n", secstrings + s->sh_name); 1645 DEBUGP("\t%s\n", secstrings + s->sh_name);
1627 } 1646 }
1628 if (m == 0) 1647 if (m == 0)
@@ -1640,7 +1659,7 @@ static void layout_sections(struct module *mod,
1640 || strncmp(secstrings + s->sh_name, 1659 || strncmp(secstrings + s->sh_name,
1641 ".init", 5) != 0) 1660 ".init", 5) != 0)
1642 continue; 1661 continue;
1643 s->sh_entsize = (get_offset(&mod->init_size, s) 1662 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1644 | INIT_OFFSET_MASK); 1663 | INIT_OFFSET_MASK);
1645 DEBUGP("\t%s\n", secstrings + s->sh_name); 1664 DEBUGP("\t%s\n", secstrings + s->sh_name);
1646 } 1665 }
@@ -1725,15 +1744,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
1725 return NULL; 1744 return NULL;
1726} 1745}
1727 1746
1728static int is_exported(const char *name, const struct module *mod) 1747static int is_exported(const char *name, unsigned long value,
1748 const struct module *mod)
1729{ 1749{
1730 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1750 const struct kernel_symbol *ks;
1731 return 1; 1751 if (!mod)
1752 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
1732 else 1753 else
1733 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1754 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
1734 return 1; 1755 return ks != NULL && ks->value == value;
1735 else
1736 return 0;
1737} 1756}
1738 1757
1739/* As per nm */ 1758/* As per nm */
@@ -1847,7 +1866,6 @@ static noinline struct module *load_module(void __user *umod,
1847 unsigned int symindex = 0; 1866 unsigned int symindex = 0;
1848 unsigned int strindex = 0; 1867 unsigned int strindex = 0;
1849 unsigned int modindex, versindex, infoindex, pcpuindex; 1868 unsigned int modindex, versindex, infoindex, pcpuindex;
1850 unsigned int unwindex = 0;
1851 unsigned int num_kp, num_mcount; 1869 unsigned int num_kp, num_mcount;
1852 struct kernel_param *kp; 1870 struct kernel_param *kp;
1853 struct module *mod; 1871 struct module *mod;
@@ -1865,6 +1883,13 @@ static noinline struct module *load_module(void __user *umod,
1865 /* vmalloc barfs on "unusual" numbers. Check here */ 1883 /* vmalloc barfs on "unusual" numbers. Check here */
1866 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1884 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1867 return ERR_PTR(-ENOMEM); 1885 return ERR_PTR(-ENOMEM);
1886
1887 /* Create stop_machine threads since the error path relies on
1888 * a non-failing stop_machine call. */
1889 err = stop_machine_create();
1890 if (err)
1891 goto free_hdr;
1892
1868 if (copy_from_user(hdr, umod, len) != 0) { 1893 if (copy_from_user(hdr, umod, len) != 0) {
1869 err = -EFAULT; 1894 err = -EFAULT;
1870 goto free_hdr; 1895 goto free_hdr;
@@ -1930,9 +1955,6 @@ static noinline struct module *load_module(void __user *umod,
1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1955 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1956 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1957 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1933#ifdef ARCH_UNWIND_SECTION_NAME
1934 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1935#endif
1936 1958
1937 /* Don't keep modinfo and version sections. */ 1959 /* Don't keep modinfo and version sections. */
1938 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1960 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1942,8 +1964,6 @@ static noinline struct module *load_module(void __user *umod,
1942 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1964 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1943 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1965 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1944#endif 1966#endif
1945 if (unwindex)
1946 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1947 1967
1948 /* Check module struct version now, before we try to use module. */ 1968 /* Check module struct version now, before we try to use module. */
1949 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1969 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2240,14 +2260,10 @@ static noinline struct module *load_module(void __user *umod,
2240 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2260 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2241 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2261 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2242 2262
2243 /* Size of section 0 is 0, so this works well if no unwind info. */
2244 mod->unwind_info = unwind_add_table(mod,
2245 (void *)sechdrs[unwindex].sh_addr,
2246 sechdrs[unwindex].sh_size);
2247
2248 /* Get rid of temporary copy */ 2263 /* Get rid of temporary copy */
2249 vfree(hdr); 2264 vfree(hdr);
2250 2265
2266 stop_machine_destroy();
2251 /* Done! */ 2267 /* Done! */
2252 return mod; 2268 return mod;
2253 2269
@@ -2270,6 +2286,7 @@ static noinline struct module *load_module(void __user *umod,
2270 kfree(args); 2286 kfree(args);
2271 free_hdr: 2287 free_hdr:
2272 vfree(hdr); 2288 vfree(hdr);
2289 stop_machine_destroy();
2273 return ERR_PTR(err); 2290 return ERR_PTR(err);
2274 2291
2275 truncated: 2292 truncated:
@@ -2337,11 +2354,12 @@ sys_init_module(void __user *umod,
2337 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 2354 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2338 mod->state = MODULE_STATE_LIVE; 2355 mod->state = MODULE_STATE_LIVE;
2339 wake_up(&module_wq); 2356 wake_up(&module_wq);
2357 blocking_notifier_call_chain(&module_notify_list,
2358 MODULE_STATE_LIVE, mod);
2340 2359
2341 mutex_lock(&module_mutex); 2360 mutex_lock(&module_mutex);
2342 /* Drop initial reference. */ 2361 /* Drop initial reference. */
2343 module_put(mod); 2362 module_put(mod);
2344 unwind_remove_table(mod->unwind_info, 1);
2345 module_free(mod, mod->module_init); 2363 module_free(mod, mod->module_init);
2346 mod->module_init = NULL; 2364 mod->module_init = NULL;
2347 mod->init_size = 0; 2365 mod->init_size = 0;
@@ -2376,7 +2394,7 @@ static const char *get_ksymbol(struct module *mod,
2376 unsigned long nextval; 2394 unsigned long nextval;
2377 2395
2378 /* At worst, next value is at end of module */ 2396 /* At worst, next value is at end of module */
2379 if (within(addr, mod->module_init, mod->init_size)) 2397 if (within_module_init(addr, mod))
2380 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2398 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2381 else 2399 else
2382 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2400 nextval = (unsigned long)mod->module_core+mod->core_text_size;
@@ -2424,8 +2442,8 @@ const char *module_address_lookup(unsigned long addr,
2424 2442
2425 preempt_disable(); 2443 preempt_disable();
2426 list_for_each_entry_rcu(mod, &modules, list) { 2444 list_for_each_entry_rcu(mod, &modules, list) {
2427 if (within(addr, mod->module_init, mod->init_size) 2445 if (within_module_init(addr, mod) ||
2428 || within(addr, mod->module_core, mod->core_size)) { 2446 within_module_core(addr, mod)) {
2429 if (modname) 2447 if (modname)
2430 *modname = mod->name; 2448 *modname = mod->name;
2431 ret = get_ksymbol(mod, addr, size, offset); 2449 ret = get_ksymbol(mod, addr, size, offset);
@@ -2447,8 +2465,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2447 2465
2448 preempt_disable(); 2466 preempt_disable();
2449 list_for_each_entry_rcu(mod, &modules, list) { 2467 list_for_each_entry_rcu(mod, &modules, list) {
2450 if (within(addr, mod->module_init, mod->init_size) || 2468 if (within_module_init(addr, mod) ||
2451 within(addr, mod->module_core, mod->core_size)) { 2469 within_module_core(addr, mod)) {
2452 const char *sym; 2470 const char *sym;
2453 2471
2454 sym = get_ksymbol(mod, addr, NULL, NULL); 2472 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -2471,8 +2489,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2471 2489
2472 preempt_disable(); 2490 preempt_disable();
2473 list_for_each_entry_rcu(mod, &modules, list) { 2491 list_for_each_entry_rcu(mod, &modules, list) {
2474 if (within(addr, mod->module_init, mod->init_size) || 2492 if (within_module_init(addr, mod) ||
2475 within(addr, mod->module_core, mod->core_size)) { 2493 within_module_core(addr, mod)) {
2476 const char *sym; 2494 const char *sym;
2477 2495
2478 sym = get_ksymbol(mod, addr, size, offset); 2496 sym = get_ksymbol(mod, addr, size, offset);
@@ -2504,7 +2522,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2504 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2522 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2505 KSYM_NAME_LEN); 2523 KSYM_NAME_LEN);
2506 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 2524 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2507 *exported = is_exported(name, mod); 2525 *exported = is_exported(name, *value, mod);
2508 preempt_enable(); 2526 preempt_enable();
2509 return 0; 2527 return 0;
2510 } 2528 }
@@ -2691,7 +2709,7 @@ int is_module_address(unsigned long addr)
2691 preempt_disable(); 2709 preempt_disable();
2692 2710
2693 list_for_each_entry_rcu(mod, &modules, list) { 2711 list_for_each_entry_rcu(mod, &modules, list) {
2694 if (within(addr, mod->module_core, mod->core_size)) { 2712 if (within_module_core(addr, mod)) {
2695 preempt_enable(); 2713 preempt_enable();
2696 return 1; 2714 return 1;
2697 } 2715 }
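
Several hunks above replace the open-coded within(addr, mod->module_init, mod->init_size) range checks with within_module_init()/within_module_core(). A sketch of what those helpers are assumed to do, reconstructed from the ranges the old code tested (the real definitions live in include/linux/module.h and may differ in detail):

/* Sketch only: checks whether addr falls inside the module's core
 * or init allocation, mirroring the ranges the old within() calls used. */
static inline int within_module_core(unsigned long addr, struct module *mod)
{
	return (unsigned long)mod->module_core <= addr &&
	       addr < (unsigned long)mod->module_core + mod->core_size;
}

static inline int within_module_init(unsigned long addr, struct module *mod)
{
	return (unsigned long)mod->module_init <= addr &&
	       addr < (unsigned long)mod->module_init + mod->init_size;
}
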
diff --git a/kernel/panic.c b/kernel/panic.c
index 13f06349a786..2a2ff36ff44d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -299,6 +299,8 @@ static int init_oops_id(void)
299{ 299{
300 if (!oops_id) 300 if (!oops_id)
301 get_random_bytes(&oops_id, sizeof(oops_id)); 301 get_random_bytes(&oops_id, sizeof(oops_id));
302 else
303 oops_id++;
302 304
303 return 0; 305 return 0;
304} 306}
diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76afa507..af9224cdd6c0 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -475,7 +475,7 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
475EXPORT_SYMBOL(task_session_nr_ns); 475EXPORT_SYMBOL(task_session_nr_ns);
476 476
477/* 477/*
478 * Used by proc to find the first pid that is greater then or equal to nr. 478 * Used by proc to find the first pid that is greater than or equal to nr.
479 * 479 *
480 * If there is a pid at nr this function is exactly the same as find_pid_ns. 480 * If there is a pid at nr this function is exactly the same as find_pid_ns.
481 */ 481 */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 613f16941b85..239988873971 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
615 /* this may fail if the RTC hasn't been initialized */ 615 /* this may fail if the RTC hasn't been initialized */
616 status = rtc_read_time(rtc, &alm.time); 616 status = rtc_read_time(rtc, &alm.time);
617 if (status < 0) { 617 if (status < 0) {
618 printk(err_readtime, rtc->dev.bus_id, status); 618 printk(err_readtime, dev_name(&rtc->dev), status);
619 return; 619 return;
620 } 620 }
621 rtc_tm_to_time(&alm.time, &now); 621 rtc_tm_to_time(&alm.time, &now);
@@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
626 626
627 status = rtc_set_alarm(rtc, &alm); 627 status = rtc_set_alarm(rtc, &alm);
628 if (status < 0) { 628 if (status < 0) {
629 printk(err_wakealarm, rtc->dev.bus_id, status); 629 printk(err_wakealarm, dev_name(&rtc->dev), status);
630 return; 630 return;
631 } 631 }
632 632
@@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
660 if (!device_may_wakeup(candidate->dev.parent)) 660 if (!device_may_wakeup(candidate->dev.parent))
661 return 0; 661 return 0;
662 662
663 *(char **)name_ptr = dev->bus_id; 663 *(const char **)name_ptr = dev_name(dev);
664 return 1; 664 return 1;
665} 665}
666 666
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f051477..97890831e1b5 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
31} 31}
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
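
cpumask_first(cpu_online_mask) replaces first_cpu(cpu_online_map) here. A short, self-contained sketch of the same schedule_work_on()/cpumask_first() combination (the work item and its handler are made up for illustration):

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

/* Hypothetical work handler: just reports which CPU ran it. */
static void example_work_fn(struct work_struct *work)
{
	pr_info("example work running on CPU %d\n", smp_processor_id());
}

static DECLARE_WORK(example_work, example_work_fn);

static void example_kick_first_online_cpu(void)
{
	/* cpumask_first() returns the lowest-numbered CPU set in the
	 * mask, so the work is queued on the first online CPU. */
	schedule_work_on(cpumask_first(cpu_online_mask), &example_work);
}
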
diff --git a/kernel/printk.c b/kernel/printk.c
index e651ab05655f..7015733793e8 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
619static const char recursion_bug_msg [] = 619static const char recursion_bug_msg [] =
620 KERN_CRIT "BUG: recent printk recursion!\n"; 620 KERN_CRIT "BUG: recent printk recursion!\n";
621static int recursion_bug; 621static int recursion_bug;
622 static int new_text_line = 1; 622static int new_text_line = 1;
623static char printk_buf[1024]; 623static char printk_buf[1024];
624 624
625asmlinkage int vprintk(const char *fmt, va_list args) 625asmlinkage int vprintk(const char *fmt, va_list args)
diff --git a/kernel/profile.c b/kernel/profile.c
index 60adefb59b5e..784933acf5b8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift;
45int prof_on __read_mostly; 45int prof_on __read_mostly;
46EXPORT_SYMBOL_GPL(prof_on); 46EXPORT_SYMBOL_GPL(prof_on);
47 47
48static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 48static cpumask_var_t prof_cpu_mask;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
51static DEFINE_PER_CPU(int, cpu_profile_flip); 51static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -113,9 +113,13 @@ int __ref profile_init(void)
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
116 return 0; 117 return 0;
117 } 118 }
118 119
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM;
122
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer) 124 if (prof_buffer)
121 return 0; 125 return 0;
@@ -128,6 +132,7 @@ int __ref profile_init(void)
128 if (prof_buffer) 132 if (prof_buffer)
129 return 0; 133 return 0;
130 134
135 free_cpumask_var(prof_cpu_mask);
131 return -ENOMEM; 136 return -ENOMEM;
132} 137}
133 138
@@ -386,13 +391,15 @@ out_free:
386 return NOTIFY_BAD; 391 return NOTIFY_BAD;
387 case CPU_ONLINE: 392 case CPU_ONLINE:
388 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
389 cpu_set(cpu, prof_cpu_mask); 394 if (prof_cpu_mask != NULL)
395 cpumask_set_cpu(cpu, prof_cpu_mask);
390 break; 396 break;
391 case CPU_UP_CANCELED: 397 case CPU_UP_CANCELED:
392 case CPU_UP_CANCELED_FROZEN: 398 case CPU_UP_CANCELED_FROZEN:
393 case CPU_DEAD: 399 case CPU_DEAD:
394 case CPU_DEAD_FROZEN: 400 case CPU_DEAD_FROZEN:
395 cpu_clear(cpu, prof_cpu_mask); 401 if (prof_cpu_mask != NULL)
402 cpumask_clear_cpu(cpu, prof_cpu_mask);
396 if (per_cpu(cpu_profile_hits, cpu)[0]) { 403 if (per_cpu(cpu_profile_hits, cpu)[0]) {
397 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 404 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
398 per_cpu(cpu_profile_hits, cpu)[0] = NULL; 405 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
@@ -430,19 +437,19 @@ void profile_tick(int type)
430 437
431 if (type == CPU_PROFILING && timer_hook) 438 if (type == CPU_PROFILING && timer_hook)
432 timer_hook(regs); 439 timer_hook(regs);
433 if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) 440 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
434 profile_hit(type, (void *)profile_pc(regs)); 442 profile_hit(type, (void *)profile_pc(regs));
435} 443}
436 444
437#ifdef CONFIG_PROC_FS 445#ifdef CONFIG_PROC_FS
438#include <linux/proc_fs.h> 446#include <linux/proc_fs.h>
439#include <asm/uaccess.h> 447#include <asm/uaccess.h>
440#include <asm/ptrace.h>
441 448
442static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 449static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
443 int count, int *eof, void *data) 450 int count, int *eof, void *data)
444{ 451{
445 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 452 int len = cpumask_scnprintf(page, count, data);
446 if (count - len < 2) 453 if (count - len < 2)
447 return -EINVAL; 454 return -EINVAL;
448 len += sprintf(page + len, "\n"); 455 len += sprintf(page + len, "\n");
@@ -452,16 +459,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
452static int prof_cpu_mask_write_proc(struct file *file, 459static int prof_cpu_mask_write_proc(struct file *file,
453 const char __user *buffer, unsigned long count, void *data) 460 const char __user *buffer, unsigned long count, void *data)
454{ 461{
455 cpumask_t *mask = (cpumask_t *)data; 462 struct cpumask *mask = data;
456 unsigned long full_count = count, err; 463 unsigned long full_count = count, err;
457 cpumask_t new_value; 464 cpumask_var_t new_value;
458 465
459 err = cpumask_parse_user(buffer, count, new_value); 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
460 if (err) 467 return -ENOMEM;
461 return err;
462 468
463 *mask = new_value; 469 err = cpumask_parse_user(buffer, count, new_value);
464 return full_count; 470 if (!err) {
471 cpumask_copy(mask, new_value);
472 err = full_count;
473 }
474 free_cpumask_var(new_value);
475 return err;
465} 476}
466 477
467void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 478void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
@@ -472,7 +483,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
472 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 483 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
473 if (!entry) 484 if (!entry)
474 return; 485 return;
475 entry->data = (void *)&prof_cpu_mask; 486 entry->data = prof_cpu_mask;
476 entry->read_proc = prof_cpu_mask_read_proc; 487 entry->read_proc = prof_cpu_mask_read_proc;
477 entry->write_proc = prof_cpu_mask_write_proc; 488 entry->write_proc = prof_cpu_mask_write_proc;
478} 489}
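
prof_cpu_mask is converted from a fixed cpumask_t to a cpumask_var_t, which must be explicitly allocated and freed. A minimal sketch of that lifecycle using only the helpers this hunk relies on (the function name is illustrative):

#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/smp.h>

static int example_cpumask_usage(void)
{
	cpumask_var_t mask;

	/* With CONFIG_CPUMASK_OFFSTACK=y this allocates the bitmap;
	 * otherwise it is a cheap wrapper around on-stack storage. */
	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(smp_processor_id(), mask);

	if (cpumask_test_cpu(smp_processor_id(), mask))
		pr_debug("current CPU is set in the mask\n");

	free_cpumask_var(mask);
	return 0;
}
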
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f330..490934fc7ac3 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
63 .completed = -300, 63 .completed = -300,
64 .pending = -300, 64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 69 .cur = -300,
70 .completed = -300, 70 .completed = -300,
71 .pending = -300, 71 .pending = -300,
72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
73 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_BITS_NONE,
74}; 74};
75 75
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
85 struct rcu_ctrlblk *rcp) 85 struct rcu_ctrlblk *rcp)
86{ 86{
87 int cpu; 87 int cpu;
88 cpumask_t cpumask;
89 unsigned long flags; 88 unsigned long flags;
90 89
91 set_need_resched(); 90 set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
96 * Don't send IPI to itself. With irqs disabled, 95 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu. 96 * rdp->cpu is the current cpu.
98 * 97 *
99 * cpu_online_map is updated by the _cpu_down() 98 * cpu_online_mask is updated by the _cpu_down()
100 * using __stop_machine(). Since we're in irqs disabled 99 * using __stop_machine(). Since we're in irqs disabled
101 * section, __stop_machine() is not executing, hence 100 * section, __stop_machine() is not executing, hence
102 * the cpu_online_map is stable. 101 * the cpu_online_mask is stable.
103 * 102 *
104 * However, a cpu might have been offlined _just_ before 103 * However, a cpu might have been offlined _just_ before
105 * we disabled irqs while entering here. 104 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
107 * notification, leading to the offlined cpu's bit 106 * notification, leading to the offlined cpu's bit
108 * being set in the rcp->cpumask. 107 * being set in the rcp->cpumask.
109 * 108 *
110 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent 109 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
111 * sending smp_send_reschedule() to an offlined CPU. 110 * sending smp_send_reschedule() to an offlined CPU.
112 */ 111 */
113 cpus_and(cpumask, rcp->cpumask, cpu_online_map); 112 for_each_cpu_and(cpu,
114 cpu_clear(rdp->cpu, cpumask); 113 to_cpumask(rcp->cpumask), cpu_online_mask) {
115 for_each_cpu_mask_nr(cpu, cpumask) 114 if (cpu != rdp->cpu)
116 smp_send_reschedule(cpu); 115 smp_send_reschedule(cpu);
116 }
117 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags); 118 spin_unlock_irqrestore(&rcp->lock, flags);
119} 119}
@@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
193 193
194 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
198 } 198 }
199 printk(" (detected by %d, t=%ld jiffies)\n", 199 printk(" (detected by %d, t=%ld jiffies)\n",
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
221 long delta; 221 long delta;
222 222
223 delta = jiffies - rcp->jiffies_stall; 223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { 224 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
225 delta >= 0) {
225 226
226 /* We haven't checked in, so go dump stack. */ 227 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp); 228 print_cpu_stall(rcp);
@@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 394 * unnecessarily.
394 */ 395 */
395 smp_mb(); 396 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 397 cpumask_andnot(to_cpumask(rcp->cpumask),
398 cpu_online_mask, nohz_cpu_mask);
397 399
398 rcp->signaled = 0; 400 rcp->signaled = 0;
399 } 401 }
@@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
406 */ 408 */
407static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) 409static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
408{ 410{
409 cpu_clear(cpu, rcp->cpumask); 411 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
410 if (cpus_empty(rcp->cpumask)) { 412 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
411 /* batch completed ! */ 413 /* batch completed ! */
412 rcp->completed = rcp->cur; 414 rcp->completed = rcp->cur;
413 rcu_start_batch(rcp); 415 rcu_start_batch(rcp);
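
The rcu_ctrlblk cpumask becomes a raw bitmap initialised with CPU_BITS_NONE and converted with to_cpumask() at each use. A small sketch of the same idiom on a hypothetical structure:

#include <linux/cpumask.h>

/* Hypothetical structure mirroring the ctrlblk usage above. */
struct example_ctrl {
	DECLARE_BITMAP(cpumask, NR_CPUS);	/* raw bitmap storage */
};

static struct example_ctrl ctrl = {
	.cpumask = CPU_BITS_NONE,		/* all bits clear */
};

static void example_mark_cpu(int cpu)
{
	/* to_cpumask() reinterprets the bitmap as a struct cpumask *
	 * so the regular cpumask_* helpers can operate on it. */
	cpumask_set_cpu(cpu, to_cpumask(ctrl.cpumask));
}
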
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ad63af8b2521..d92a76a881aa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */ 80void synchronize_rcu(void)
81synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81{
82 struct rcu_synchronize rcu;
83 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu);
86 /* Wait for it. */
87 wait_for_completion(&rcu.completion);
88}
82EXPORT_SYMBOL_GPL(synchronize_rcu); 89EXPORT_SYMBOL_GPL(synchronize_rcu);
83 90
84static void rcu_barrier_callback(struct rcu_head *notused) 91static void rcu_barrier_callback(struct rcu_head *notused)
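
synchronize_rcu() is now open-coded instead of being generated by the synchronize_rcu_xxx() macro, but callers are unaffected. For context, a hedged sketch of the usual update-side pattern that relies on it (the data type and names are invented; readers are assumed to dereference the pointer under rcu_read_lock()):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_data {
	int value;
};

static struct example_data *example_ptr;

static void example_update(int new_value)
{
	struct example_data *newp, *oldp;

	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return;
	newp->value = new_value;

	oldp = example_ptr;
	rcu_assign_pointer(example_ptr, newp);

	/* Wait until every pre-existing RCU read-side critical section
	 * has completed; only then is it safe to free the old copy. */
	synchronize_rcu();
	kfree(oldp);
}
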
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 04982659875a..33cfc50781f9 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
164 { "idle", "waitack", "waitzero", "waitmb" }; 164 { "idle", "waitack", "waitzero", "waitmb" };
165#endif /* #ifdef CONFIG_RCU_TRACE */ 165#endif /* #ifdef CONFIG_RCU_TRACE */
166 166
167static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; 167static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
168 = CPU_BITS_NONE;
168 169
169/* 170/*
170 * Enum and per-CPU flag to determine when each CPU has seen 171 * Enum and per-CPU flag to determine when each CPU has seen
@@ -758,7 +759,7 @@ rcu_try_flip_idle(void)
758 759
759 /* Now ask each CPU for acknowledgement of the flip. */ 760 /* Now ask each CPU for acknowledgement of the flip. */
760 761
761 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 762 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
762 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 763 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
763 dyntick_save_progress_counter(cpu); 764 dyntick_save_progress_counter(cpu);
764 } 765 }
@@ -776,7 +777,7 @@ rcu_try_flip_waitack(void)
776 int cpu; 777 int cpu;
777 778
778 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 779 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
779 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 780 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
780 if (rcu_try_flip_waitack_needed(cpu) && 781 if (rcu_try_flip_waitack_needed(cpu) &&
781 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 782 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
782 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 783 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void)
808 /* Check to see if the sum of the "last" counters is zero. */ 809 /* Check to see if the sum of the "last" counters is zero. */
809 810
810 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 811 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
811 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 812 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
812 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 813 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
813 if (sum != 0) { 814 if (sum != 0) {
814 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 815 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void)
823 smp_mb(); /* ^^^^^^^^^^^^ */ 824 smp_mb(); /* ^^^^^^^^^^^^ */
824 825
825 /* Call for a memory barrier from each CPU. */ 826 /* Call for a memory barrier from each CPU. */
826 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 827 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
827 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 828 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
828 dyntick_save_progress_counter(cpu); 829 dyntick_save_progress_counter(cpu);
829 } 830 }
@@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void)
843 int cpu; 844 int cpu;
844 845
845 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 846 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
846 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 847 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
847 if (rcu_try_flip_waitmb_needed(cpu) && 848 if (rcu_try_flip_waitmb_needed(cpu) &&
848 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 849 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
849 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 850 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
1032 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; 1033 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1033 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; 1034 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1034 1035
1035 cpu_clear(cpu, rcu_cpu_online_map); 1036 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1036 1037
1037 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1038 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1038 1039
@@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
1072 struct rcu_data *rdp; 1073 struct rcu_data *rdp;
1073 1074
1074 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1075 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1075 cpu_set(cpu, rcu_cpu_online_map); 1076 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1076 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1077 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1077 1078
1078 /* 1079 /*
@@ -1176,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
1176 * in -rt this does -not- necessarily result in all currently executing 1177 * in -rt this does -not- necessarily result in all currently executing
1177 * interrupt -handlers- having completed. 1178 * interrupt -handlers- having completed.
1178 */ 1179 */
1179synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) 1180void __synchronize_sched(void)
1181{
1182 struct rcu_synchronize rcu;
1183
1184 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1187 /* Wait for it. */
1188 wait_for_completion(&rcu.completion);
1189}
1180EXPORT_SYMBOL_GPL(__synchronize_sched); 1190EXPORT_SYMBOL_GPL(__synchronize_sched);
1181 1191
1182/* 1192/*
@@ -1430,7 +1440,7 @@ void __init __rcu_init(void)
1430 * We don't need protection against CPU-Hotplug here 1440 * We don't need protection against CPU-Hotplug here
1431 * since 1441 * since
1432 * a) If a CPU comes online while we are iterating over the 1442 * a) If a CPU comes online while we are iterating over the
1433 * cpu_online_map below, we would only end up making a 1443 * cpu_online_mask below, we would only end up making a
1434 * duplicate call to rcu_online_cpu() which sets the corresponding 1444 * duplicate call to rcu_online_cpu() which sets the corresponding
1435 * CPU's mask in the rcu_cpu_online_map. 1445 * CPU's mask in the rcu_cpu_online_map.
1436 * 1446 *
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b31065522104..1cff28db56b6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -136,7 +136,7 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ 139#define FULLSTOP_SHUTDOWN 1 /* Bail due to system shutdown/panic. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ 140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */ 141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ 142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
@@ -151,12 +151,10 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
151{ 151{
152 if (fullstop) 152 if (fullstop)
153 return NOTIFY_DONE; 153 return NOTIFY_DONE;
154 if (signal_pending(current)) { 154 mutex_lock(&fullstop_mutex);
155 mutex_lock(&fullstop_mutex); 155 if (!fullstop)
156 if (!ACCESS_ONCE(fullstop)) 156 fullstop = FULLSTOP_SHUTDOWN;
157 fullstop = FULLSTOP_SIGNALED; 157 mutex_unlock(&fullstop_mutex);
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE; 158 return NOTIFY_DONE;
161} 159}
162 160
@@ -624,7 +622,7 @@ rcu_torture_writer(void *arg)
624 rcu_stutter_wait(); 622 rcu_stutter_wait();
625 } while (!kthread_should_stop() && !fullstop); 623 } while (!kthread_should_stop() && !fullstop);
626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 624 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 625 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
628 schedule_timeout_uninterruptible(1); 626 schedule_timeout_uninterruptible(1);
629 return 0; 627 return 0;
630} 628}
@@ -649,7 +647,7 @@ rcu_torture_fakewriter(void *arg)
649 } while (!kthread_should_stop() && !fullstop); 647 } while (!kthread_should_stop() && !fullstop);
650 648
651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 649 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 650 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
653 schedule_timeout_uninterruptible(1); 651 schedule_timeout_uninterruptible(1);
654 return 0; 652 return 0;
655} 653}
@@ -759,7 +757,7 @@ rcu_torture_reader(void *arg)
759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 757 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
760 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irqcapable)
761 del_timer_sync(&t); 759 del_timer_sync(&t);
762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 760 while (!kthread_should_stop() && fullstop != FULLSTOP_SHUTDOWN)
763 schedule_timeout_uninterruptible(1); 761 schedule_timeout_uninterruptible(1);
764 return 0; 762 return 0;
765} 763}
@@ -868,49 +866,52 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
868 */ 866 */
869static void rcu_torture_shuffle_tasks(void) 867static void rcu_torture_shuffle_tasks(void)
870{ 868{
871 cpumask_t tmp_mask; 869 cpumask_var_t tmp_mask;
872 int i; 870 int i;
873 871
874 cpus_setall(tmp_mask); 872 if (!alloc_cpumask_var(&tmp_mask, GFP_KERNEL))
873 BUG();
874
875 cpumask_setall(tmp_mask);
875 get_online_cpus(); 876 get_online_cpus();
876 877
877 /* No point in shuffling if there is only one online CPU (ex: UP) */ 878 /* No point in shuffling if there is only one online CPU (ex: UP) */
878 if (num_online_cpus() == 1) { 879 if (num_online_cpus() == 1)
879 put_online_cpus(); 880 goto out;
880 return;
881 }
882 881
883 if (rcu_idle_cpu != -1) 882 if (rcu_idle_cpu != -1)
884 cpu_clear(rcu_idle_cpu, tmp_mask); 883 cpumask_clear_cpu(rcu_idle_cpu, tmp_mask);
885 884
886 set_cpus_allowed_ptr(current, &tmp_mask); 885 set_cpus_allowed_ptr(current, tmp_mask);
887 886
888 if (reader_tasks) { 887 if (reader_tasks) {
889 for (i = 0; i < nrealreaders; i++) 888 for (i = 0; i < nrealreaders; i++)
890 if (reader_tasks[i]) 889 if (reader_tasks[i])
891 set_cpus_allowed_ptr(reader_tasks[i], 890 set_cpus_allowed_ptr(reader_tasks[i],
892 &tmp_mask); 891 tmp_mask);
893 } 892 }
894 893
895 if (fakewriter_tasks) { 894 if (fakewriter_tasks) {
896 for (i = 0; i < nfakewriters; i++) 895 for (i = 0; i < nfakewriters; i++)
897 if (fakewriter_tasks[i]) 896 if (fakewriter_tasks[i])
898 set_cpus_allowed_ptr(fakewriter_tasks[i], 897 set_cpus_allowed_ptr(fakewriter_tasks[i],
899 &tmp_mask); 898 tmp_mask);
900 } 899 }
901 900
902 if (writer_task) 901 if (writer_task)
903 set_cpus_allowed_ptr(writer_task, &tmp_mask); 902 set_cpus_allowed_ptr(writer_task, tmp_mask);
904 903
905 if (stats_task) 904 if (stats_task)
906 set_cpus_allowed_ptr(stats_task, &tmp_mask); 905 set_cpus_allowed_ptr(stats_task, tmp_mask);
907 906
908 if (rcu_idle_cpu == -1) 907 if (rcu_idle_cpu == -1)
909 rcu_idle_cpu = num_online_cpus() - 1; 908 rcu_idle_cpu = num_online_cpus() - 1;
910 else 909 else
911 rcu_idle_cpu--; 910 rcu_idle_cpu--;
912 911
912out:
913 put_online_cpus(); 913 put_online_cpus();
914 free_cpumask_var(tmp_mask);
914} 915}
915 916
916/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 917/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a342b032112c..f2d8638e6c60 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81#ifdef CONFIG_NO_HZ 81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); 82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1,
84 .dynticks = 1,
85};
83#endif /* #ifdef CONFIG_NO_HZ */ 86#endif /* #ifdef CONFIG_NO_HZ */
84 87
85static int blimit = 10; /* Maximum callbacks per softirq. */ 88static int blimit = 10; /* Maximum callbacks per softirq. */
@@ -572,6 +575,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
572 /* Special-case the common single-level case. */ 575 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) { 576 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit; 577 rnp->qsmask = rnp->qsmaskinit;
578 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
575 spin_unlock_irqrestore(&rnp->lock, flags); 579 spin_unlock_irqrestore(&rnp->lock, flags);
576 return; 580 return;
577 } 581 }
@@ -1379,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1379 1383
1380static void __cpuinit rcu_online_cpu(int cpu) 1384static void __cpuinit rcu_online_cpu(int cpu)
1381{ 1385{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state); 1386 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state); 1387 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1388 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
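
The dynticks state is now initialised statically through the DEFINE_PER_CPU initializer rather than in rcu_online_cpu(). A tiny sketch of that pattern on a hypothetical per-CPU structure:

#include <linux/percpu.h>

struct example_state {
	int nesting;
	int counter;
};

/* Static per-CPU initialisation: every CPU's copy starts from this
 * value, so no hotplug-time setup is needed for these fields. */
static DEFINE_PER_CPU(struct example_state, example_state) = {
	.nesting = 1,
	.counter = 1,
};
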
diff --git a/kernel/resource.c b/kernel/resource.c
index e633106b12f6..ca6a1536b205 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -623,7 +623,7 @@ resource_size_t resource_alignment(struct resource *res)
623 */ 623 */
624struct resource * __request_region(struct resource *parent, 624struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 625 resource_size_t start, resource_size_t n,
626 const char *name) 626 const char *name, int flags)
627{ 627{
628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
629 629
@@ -634,6 +634,7 @@ struct resource * __request_region(struct resource *parent,
634 res->start = start; 634 res->start = start;
635 res->end = start + n - 1; 635 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY; 636 res->flags = IORESOURCE_BUSY;
637 res->flags |= flags;
637 638
638 write_lock(&resource_lock); 639 write_lock(&resource_lock);
639 640
@@ -679,7 +680,7 @@ int __check_region(struct resource *parent, resource_size_t start,
679{ 680{
680 struct resource * res; 681 struct resource * res;
681 682
682 res = __request_region(parent, start, n, "check-region"); 683 res = __request_region(parent, start, n, "check-region", 0);
683 if (!res) 684 if (!res)
684 return -EBUSY; 685 return -EBUSY;
685 686
@@ -776,7 +777,7 @@ struct resource * __devm_request_region(struct device *dev,
776 dr->start = start; 777 dr->start = start;
777 dr->n = n; 778 dr->n = n;
778 779
779 res = __request_region(parent, start, n, name); 780 res = __request_region(parent, start, n, name, 0);
780 if (res) 781 if (res)
781 devres_add(dev, dr); 782 devres_add(dev, dr);
782 else 783 else
@@ -876,3 +877,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
876 877
877 return err; 878 return err;
878} 879}
880
881#ifdef CONFIG_STRICT_DEVMEM
882static int strict_iomem_checks = 1;
883#else
884static int strict_iomem_checks;
885#endif
886
887/*
888 * Check if an address is reserved in the iomem resource tree;
889 * returns 1 if reserved, 0 if not reserved.
890 */
891int iomem_is_exclusive(u64 addr)
892{
893 struct resource *p = &iomem_resource;
894 int err = 0;
895 loff_t l;
896 int size = PAGE_SIZE;
897
898 if (!strict_iomem_checks)
899 return 0;
900
901 addr = addr & PAGE_MASK;
902
903 read_lock(&resource_lock);
904 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
905 /*
906 * We can probably skip the resources without
907 * IORESOURCE_IO attribute?
908 */
909 if (p->start >= addr + size)
910 break;
911 if (p->end < addr)
912 continue;
913 if (p->flags & IORESOURCE_BUSY &&
914 p->flags & IORESOURCE_EXCLUSIVE) {
915 err = 1;
916 break;
917 }
918 }
919 read_unlock(&resource_lock);
920
921 return err;
922}
923
924static int __init strict_iomem(char *str)
925{
926 if (strstr(str, "relaxed"))
927 strict_iomem_checks = 0;
928 if (strstr(str, "strict"))
929 strict_iomem_checks = 1;
930 return 1;
931}
932
933__setup("iomem=", strict_iomem);
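
__request_region() grows a flags argument, and iomem_is_exclusive() reports ranges claimed with IORESOURCE_BUSY | IORESOURCE_EXCLUSIVE. A hedged sketch of a driver claiming a range exclusively through the updated interface (the address and device name are made up):

#include <linux/ioport.h>

#define EXAMPLE_MMIO_START	0xfed00000	/* hypothetical address */
#define EXAMPLE_MMIO_SIZE	0x1000

static int example_claim_mmio(void)
{
	struct resource *res;

	/* Passing IORESOURCE_EXCLUSIVE makes iomem_is_exclusive()
	 * return 1 for this range, so e.g. /dev/mem users can be kept
	 * away from it while the driver owns it. */
	res = __request_region(&iomem_resource, EXAMPLE_MMIO_START,
			       EXAMPLE_MMIO_SIZE, "example-device",
			       IORESOURCE_EXCLUSIVE);
	if (!res)
		return -EBUSY;
	return 0;
}
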
diff --git a/kernel/sched.c b/kernel/sched.c
index fff1c4a20b65..deb5ac8c12f3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -498,18 +498,26 @@ struct rt_rq {
498 */ 498 */
499struct root_domain { 499struct root_domain {
500 atomic_t refcount; 500 atomic_t refcount;
501 cpumask_t span; 501 cpumask_var_t span;
502 cpumask_t online; 502 cpumask_var_t online;
503 503
504 /* 504 /*
505 * The "RT overload" flag: it gets set if a CPU has more than 505 * The "RT overload" flag: it gets set if a CPU has more than
506 * one runnable RT task. 506 * one runnable RT task.
507 */ 507 */
508 cpumask_t rto_mask; 508 cpumask_var_t rto_mask;
509 atomic_t rto_count; 509 atomic_t rto_count;
510#ifdef CONFIG_SMP 510#ifdef CONFIG_SMP
511 struct cpupri cpupri; 511 struct cpupri cpupri;
512#endif 512#endif
513#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
514 /*
515 * Preferred wake-up cpu nominated by sched_mc balancing, used when
516 * most cpus in the system are idle, indicating very low overall
517 * utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2).
518 */
519 unsigned int sched_mc_preferred_wakeup_cpu;
520#endif
513}; 521};
514 522
515/* 523/*
@@ -1514,7 +1522,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1514 struct sched_domain *sd = data; 1522 struct sched_domain *sd = data;
1515 int i; 1523 int i;
1516 1524
1517 for_each_cpu_mask(i, sd->span) { 1525 for_each_cpu(i, sched_domain_span(sd)) {
1518 /* 1526 /*
1519 * If there are currently no tasks on the cpu pretend there 1527 * If there are currently no tasks on the cpu pretend there
1520 * is one of average load so that when a new task gets to 1528 * is one of average load so that when a new task gets to
@@ -1535,7 +1543,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1535 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1543 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1536 shares = tg->shares; 1544 shares = tg->shares;
1537 1545
1538 for_each_cpu_mask(i, sd->span) 1546 for_each_cpu(i, sched_domain_span(sd))
1539 update_group_shares_cpu(tg, i, shares, rq_weight); 1547 update_group_shares_cpu(tg, i, shares, rq_weight);
1540 1548
1541 return 0; 1549 return 0;
@@ -2101,15 +2109,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2101 int i; 2109 int i;
2102 2110
2103 /* Skip over this group if it has no CPUs allowed */ 2111 /* Skip over this group if it has no CPUs allowed */
2104 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2112 if (!cpumask_intersects(sched_group_cpus(group),
2113 &p->cpus_allowed))
2105 continue; 2114 continue;
2106 2115
2107 local_group = cpu_isset(this_cpu, group->cpumask); 2116 local_group = cpumask_test_cpu(this_cpu,
2117 sched_group_cpus(group));
2108 2118
2109 /* Tally up the load of all CPUs in the group */ 2119 /* Tally up the load of all CPUs in the group */
2110 avg_load = 0; 2120 avg_load = 0;
2111 2121
2112 for_each_cpu_mask_nr(i, group->cpumask) { 2122 for_each_cpu(i, sched_group_cpus(group)) {
2113 /* Bias balancing toward cpus of our domain */ 2123 /* Bias balancing toward cpus of our domain */
2114 if (local_group) 2124 if (local_group)
2115 load = source_load(i, load_idx); 2125 load = source_load(i, load_idx);
@@ -2141,17 +2151,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2141 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2151 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2142 */ 2152 */
2143static int 2153static int
2144find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2154find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2145 cpumask_t *tmp)
2146{ 2155{
2147 unsigned long load, min_load = ULONG_MAX; 2156 unsigned long load, min_load = ULONG_MAX;
2148 int idlest = -1; 2157 int idlest = -1;
2149 int i; 2158 int i;
2150 2159
2151 /* Traverse only the allowed CPUs */ 2160 /* Traverse only the allowed CPUs */
2152 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2161 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2153
2154 for_each_cpu_mask_nr(i, *tmp) {
2155 load = weighted_cpuload(i); 2162 load = weighted_cpuload(i);
2156 2163
2157 if (load < min_load || (load == min_load && i == this_cpu)) { 2164 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2193,7 +2200,6 @@ static int sched_balance_self(int cpu, int flag)
2193 update_shares(sd); 2200 update_shares(sd);
2194 2201
2195 while (sd) { 2202 while (sd) {
2196 cpumask_t span, tmpmask;
2197 struct sched_group *group; 2203 struct sched_group *group;
2198 int new_cpu, weight; 2204 int new_cpu, weight;
2199 2205
@@ -2202,14 +2208,13 @@ static int sched_balance_self(int cpu, int flag)
2202 continue; 2208 continue;
2203 } 2209 }
2204 2210
2205 span = sd->span;
2206 group = find_idlest_group(sd, t, cpu); 2211 group = find_idlest_group(sd, t, cpu);
2207 if (!group) { 2212 if (!group) {
2208 sd = sd->child; 2213 sd = sd->child;
2209 continue; 2214 continue;
2210 } 2215 }
2211 2216
2212 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2217 new_cpu = find_idlest_cpu(group, t, cpu);
2213 if (new_cpu == -1 || new_cpu == cpu) { 2218 if (new_cpu == -1 || new_cpu == cpu) {
2214 /* Now try balancing at a lower domain level of cpu */ 2219 /* Now try balancing at a lower domain level of cpu */
2215 sd = sd->child; 2220 sd = sd->child;
@@ -2218,10 +2223,10 @@ static int sched_balance_self(int cpu, int flag)
2218 2223
2219 /* Now try balancing at a lower domain level of new_cpu */ 2224 /* Now try balancing at a lower domain level of new_cpu */
2220 cpu = new_cpu; 2225 cpu = new_cpu;
2226 weight = cpumask_weight(sched_domain_span(sd));
2221 sd = NULL; 2227 sd = NULL;
2222 weight = cpus_weight(span);
2223 for_each_domain(cpu, tmp) { 2228 for_each_domain(cpu, tmp) {
2224 if (weight <= cpus_weight(tmp->span)) 2229 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2225 break; 2230 break;
2226 if (tmp->flags & flag) 2231 if (tmp->flags & flag)
2227 sd = tmp; 2232 sd = tmp;
@@ -2266,7 +2271,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2266 cpu = task_cpu(p); 2271 cpu = task_cpu(p);
2267 2272
2268 for_each_domain(this_cpu, sd) { 2273 for_each_domain(this_cpu, sd) {
2269 if (cpu_isset(cpu, sd->span)) { 2274 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2270 update_shares(sd); 2275 update_shares(sd);
2271 break; 2276 break;
2272 } 2277 }
@@ -2315,7 +2320,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2315 else { 2320 else {
2316 struct sched_domain *sd; 2321 struct sched_domain *sd;
2317 for_each_domain(this_cpu, sd) { 2322 for_each_domain(this_cpu, sd) {
2318 if (cpu_isset(cpu, sd->span)) { 2323 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2319 schedstat_inc(sd, ttwu_wake_remote); 2324 schedstat_inc(sd, ttwu_wake_remote);
2320 break; 2325 break;
2321 } 2326 }
@@ -2846,7 +2851,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2846 struct rq *rq; 2851 struct rq *rq;
2847 2852
2848 rq = task_rq_lock(p, &flags); 2853 rq = task_rq_lock(p, &flags);
2849 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2854 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2850 || unlikely(!cpu_active(dest_cpu))) 2855 || unlikely(!cpu_active(dest_cpu)))
2851 goto out; 2856 goto out;
2852 2857
@@ -2911,7 +2916,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2911 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2916 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2912 * 3) are cache-hot on their current CPU. 2917 * 3) are cache-hot on their current CPU.
2913 */ 2918 */
2914 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2919 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2915 schedstat_inc(p, se.nr_failed_migrations_affine); 2920 schedstat_inc(p, se.nr_failed_migrations_affine);
2916 return 0; 2921 return 0;
2917 } 2922 }
@@ -3086,7 +3091,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3086static struct sched_group * 3091static struct sched_group *
3087find_busiest_group(struct sched_domain *sd, int this_cpu, 3092find_busiest_group(struct sched_domain *sd, int this_cpu,
3088 unsigned long *imbalance, enum cpu_idle_type idle, 3093 unsigned long *imbalance, enum cpu_idle_type idle,
3089 int *sd_idle, const cpumask_t *cpus, int *balance) 3094 int *sd_idle, const struct cpumask *cpus, int *balance)
3090{ 3095{
3091 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3096 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3092 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3097 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3122,10 +3127,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3122 unsigned long sum_avg_load_per_task; 3127 unsigned long sum_avg_load_per_task;
3123 unsigned long avg_load_per_task; 3128 unsigned long avg_load_per_task;
3124 3129
3125 local_group = cpu_isset(this_cpu, group->cpumask); 3130 local_group = cpumask_test_cpu(this_cpu,
3131 sched_group_cpus(group));
3126 3132
3127 if (local_group) 3133 if (local_group)
3128 balance_cpu = first_cpu(group->cpumask); 3134 balance_cpu = cpumask_first(sched_group_cpus(group));
3129 3135
3130 /* Tally up the load of all CPUs in the group */ 3136 /* Tally up the load of all CPUs in the group */
3131 sum_weighted_load = sum_nr_running = avg_load = 0; 3137 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3134,13 +3140,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3134 max_cpu_load = 0; 3140 max_cpu_load = 0;
3135 min_cpu_load = ~0UL; 3141 min_cpu_load = ~0UL;
3136 3142
3137 for_each_cpu_mask_nr(i, group->cpumask) { 3143 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3138 struct rq *rq; 3144 struct rq *rq = cpu_rq(i);
3139
3140 if (!cpu_isset(i, *cpus))
3141 continue;
3142
3143 rq = cpu_rq(i);
3144 3145
3145 if (*sd_idle && rq->nr_running) 3146 if (*sd_idle && rq->nr_running)
3146 *sd_idle = 0; 3147 *sd_idle = 0;
@@ -3251,8 +3252,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3251 */ 3252 */
3252 if ((sum_nr_running < min_nr_running) || 3253 if ((sum_nr_running < min_nr_running) ||
3253 (sum_nr_running == min_nr_running && 3254 (sum_nr_running == min_nr_running &&
3254 first_cpu(group->cpumask) < 3255 cpumask_first(sched_group_cpus(group)) >
3255 first_cpu(group_min->cpumask))) { 3256 cpumask_first(sched_group_cpus(group_min)))) {
3256 group_min = group; 3257 group_min = group;
3257 min_nr_running = sum_nr_running; 3258 min_nr_running = sum_nr_running;
3258 min_load_per_task = sum_weighted_load / 3259 min_load_per_task = sum_weighted_load /
@@ -3267,8 +3268,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3267 if (sum_nr_running <= group_capacity - 1) { 3268 if (sum_nr_running <= group_capacity - 1) {
3268 if (sum_nr_running > leader_nr_running || 3269 if (sum_nr_running > leader_nr_running ||
3269 (sum_nr_running == leader_nr_running && 3270 (sum_nr_running == leader_nr_running &&
3270 first_cpu(group->cpumask) > 3271 cpumask_first(sched_group_cpus(group)) <
3271 first_cpu(group_leader->cpumask))) { 3272 cpumask_first(sched_group_cpus(group_leader)))) {
3272 group_leader = group; 3273 group_leader = group;
3273 leader_nr_running = sum_nr_running; 3274 leader_nr_running = sum_nr_running;
3274 } 3275 }
@@ -3394,6 +3395,10 @@ out_balanced:
3394 3395
3395 if (this == group_leader && group_leader != group_min) { 3396 if (this == group_leader && group_leader != group_min) {
3396 *imbalance = min_load_per_task; 3397 *imbalance = min_load_per_task;
3398 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3399 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3400 cpumask_first(sched_group_cpus(group_leader));
3401 }
3397 return group_min; 3402 return group_min;
3398 } 3403 }
3399#endif 3404#endif
@@ -3407,16 +3412,16 @@ ret:
3407 */ 3412 */
3408static struct rq * 3413static struct rq *
3409find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3414find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3410 unsigned long imbalance, const cpumask_t *cpus) 3415 unsigned long imbalance, const struct cpumask *cpus)
3411{ 3416{
3412 struct rq *busiest = NULL, *rq; 3417 struct rq *busiest = NULL, *rq;
3413 unsigned long max_load = 0; 3418 unsigned long max_load = 0;
3414 int i; 3419 int i;
3415 3420
3416 for_each_cpu_mask_nr(i, group->cpumask) { 3421 for_each_cpu(i, sched_group_cpus(group)) {
3417 unsigned long wl; 3422 unsigned long wl;
3418 3423
3419 if (!cpu_isset(i, *cpus)) 3424 if (!cpumask_test_cpu(i, cpus))
3420 continue; 3425 continue;
3421 3426
3422 rq = cpu_rq(i); 3427 rq = cpu_rq(i);
@@ -3446,7 +3451,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3446 */ 3451 */
3447static int load_balance(int this_cpu, struct rq *this_rq, 3452static int load_balance(int this_cpu, struct rq *this_rq,
3448 struct sched_domain *sd, enum cpu_idle_type idle, 3453 struct sched_domain *sd, enum cpu_idle_type idle,
3449 int *balance, cpumask_t *cpus) 3454 int *balance, struct cpumask *cpus)
3450{ 3455{
3451 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3456 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3452 struct sched_group *group; 3457 struct sched_group *group;
@@ -3454,7 +3459,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3454 struct rq *busiest; 3459 struct rq *busiest;
3455 unsigned long flags; 3460 unsigned long flags;
3456 3461
3457 cpus_setall(*cpus); 3462 cpumask_setall(cpus);
3458 3463
3459 /* 3464 /*
3460 * When power savings policy is enabled for the parent domain, idle 3465 * When power savings policy is enabled for the parent domain, idle
@@ -3514,8 +3519,8 @@ redo:
3514 3519
3515 /* All tasks on this runqueue were pinned by CPU affinity */ 3520 /* All tasks on this runqueue were pinned by CPU affinity */
3516 if (unlikely(all_pinned)) { 3521 if (unlikely(all_pinned)) {
3517 cpu_clear(cpu_of(busiest), *cpus); 3522 cpumask_clear_cpu(cpu_of(busiest), cpus);
3518 if (!cpus_empty(*cpus)) 3523 if (!cpumask_empty(cpus))
3519 goto redo; 3524 goto redo;
3520 goto out_balanced; 3525 goto out_balanced;
3521 } 3526 }
@@ -3532,7 +3537,8 @@ redo:
3532 /* don't kick the migration_thread, if the curr 3537 /* don't kick the migration_thread, if the curr
3533 * task on busiest cpu can't be moved to this_cpu 3538 * task on busiest cpu can't be moved to this_cpu
3534 */ 3539 */
3535 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3540 if (!cpumask_test_cpu(this_cpu,
3541 &busiest->curr->cpus_allowed)) {
3536 spin_unlock_irqrestore(&busiest->lock, flags); 3542 spin_unlock_irqrestore(&busiest->lock, flags);
3537 all_pinned = 1; 3543 all_pinned = 1;
3538 goto out_one_pinned; 3544 goto out_one_pinned;
@@ -3607,7 +3613,7 @@ out:
3607 */ 3613 */
3608static int 3614static int
3609load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3615load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3610 cpumask_t *cpus) 3616 struct cpumask *cpus)
3611{ 3617{
3612 struct sched_group *group; 3618 struct sched_group *group;
3613 struct rq *busiest = NULL; 3619 struct rq *busiest = NULL;
@@ -3616,7 +3622,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3616 int sd_idle = 0; 3622 int sd_idle = 0;
3617 int all_pinned = 0; 3623 int all_pinned = 0;
3618 3624
3619 cpus_setall(*cpus); 3625 cpumask_setall(cpus);
3620 3626
3621 /* 3627 /*
3622 * When power savings policy is enabled for the parent domain, idle 3628 * When power savings policy is enabled for the parent domain, idle
@@ -3660,17 +3666,76 @@ redo:
3660 double_unlock_balance(this_rq, busiest); 3666 double_unlock_balance(this_rq, busiest);
3661 3667
3662 if (unlikely(all_pinned)) { 3668 if (unlikely(all_pinned)) {
3663 cpu_clear(cpu_of(busiest), *cpus); 3669 cpumask_clear_cpu(cpu_of(busiest), cpus);
3664 if (!cpus_empty(*cpus)) 3670 if (!cpumask_empty(cpus))
3665 goto redo; 3671 goto redo;
3666 } 3672 }
3667 } 3673 }
3668 3674
3669 if (!ld_moved) { 3675 if (!ld_moved) {
3676 int active_balance = 0;
3677
3670 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3678 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3671 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3679 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3672 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3673 return -1; 3681 return -1;
3682
3683 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3684 return -1;
3685
3686 if (sd->nr_balance_failed++ < 2)
3687 return -1;
3688
3689 /*
3690 * The only task running in a non-idle cpu can be moved to this
 3691 * cpu in an attempt to completely free up the other CPU
 3692 * package. The same method used to move a task in load_balance()
 3693 * has been extended for load_balance_newidle() to speed up
3694 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
3695 *
3696 * The package power saving logic comes from
 3697 * find_busiest_group(). If there is no imbalance, then
 3698 * f_b_g() will return NULL. However, when sched_mc={1,2},
3699 * f_b_g() will select a group from which a running task may be
3700 * pulled to this cpu in order to make the other package idle.
3701 * If there is no opportunity to make a package idle and if
 3702 * there is no imbalance, then f_b_g() will return NULL and no
3703 * action will be taken in load_balance_newidle().
3704 *
3705 * Under normal task pull operation due to imbalance, there
3706 * will be more than one task in the source run queue and
3707 * move_tasks() will succeed. ld_moved will be true and this
3708 * active balance code will not be triggered.
3709 */
3710
3711 /* Lock busiest in correct order while this_rq is held */
3712 double_lock_balance(this_rq, busiest);
3713
3714 /*
3715 * don't kick the migration_thread, if the curr
3716 * task on busiest cpu can't be moved to this_cpu
3717 */
3718 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
3719 double_unlock_balance(this_rq, busiest);
3720 all_pinned = 1;
3721 return ld_moved;
3722 }
3723
3724 if (!busiest->active_balance) {
3725 busiest->active_balance = 1;
3726 busiest->push_cpu = this_cpu;
3727 active_balance = 1;
3728 }
3729
3730 double_unlock_balance(this_rq, busiest);
3731 /*
3732 * Should not call ttwu while holding a rq->lock
3733 */
3734 spin_unlock(&this_rq->lock);
3735 if (active_balance)
3736 wake_up_process(busiest->migration_thread);
3737 spin_lock(&this_rq->lock);
3738
3674 } else 3739 } else
3675 sd->nr_balance_failed = 0; 3740 sd->nr_balance_failed = 0;
3676 3741
@@ -3696,7 +3761,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3696 struct sched_domain *sd; 3761 struct sched_domain *sd;
3697 int pulled_task = 0; 3762 int pulled_task = 0;
3698 unsigned long next_balance = jiffies + HZ; 3763 unsigned long next_balance = jiffies + HZ;
3699 cpumask_t tmpmask; 3764 cpumask_var_t tmpmask;
3765
3766 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3767 return;
3700 3768
3701 for_each_domain(this_cpu, sd) { 3769 for_each_domain(this_cpu, sd) {
3702 unsigned long interval; 3770 unsigned long interval;
@@ -3707,7 +3775,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3707 if (sd->flags & SD_BALANCE_NEWIDLE) 3775 if (sd->flags & SD_BALANCE_NEWIDLE)
3708 /* If we've pulled tasks over stop searching: */ 3776 /* If we've pulled tasks over stop searching: */
3709 pulled_task = load_balance_newidle(this_cpu, this_rq, 3777 pulled_task = load_balance_newidle(this_cpu, this_rq,
3710 sd, &tmpmask); 3778 sd, tmpmask);
3711 3779
3712 interval = msecs_to_jiffies(sd->balance_interval); 3780 interval = msecs_to_jiffies(sd->balance_interval);
3713 if (time_after(next_balance, sd->last_balance + interval)) 3781 if (time_after(next_balance, sd->last_balance + interval))
@@ -3722,6 +3790,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3722 */ 3790 */
3723 this_rq->next_balance = next_balance; 3791 this_rq->next_balance = next_balance;
3724 } 3792 }
3793 free_cpumask_var(tmpmask);
3725} 3794}
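idle_balance() above also shows the allocation pattern that replaces every remaining on-stack cpumask_t in this file: a cpumask_var_t obtained with alloc_cpumask_var() and released with free_cpumask_var() on every exit path. A stripped-down sketch of that pattern, assuming a hypothetical function that merely needs a scratch mask (not code from sched.c):

#include <linux/cpumask.h>
#include <linux/gfp.h>

static void use_scratch_mask(void)
{
        cpumask_var_t scratch;

        /* GFP_ATOMIC, since callers of this kind may hold a runqueue lock. */
        if (!alloc_cpumask_var(&scratch, GFP_ATOMIC))
                return;                 /* degrade gracefully on failure */

        cpumask_setall(scratch);
        /* ... work with the scratch mask here ... */

        free_cpumask_var(scratch);
}
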
3726 3795
3727/* 3796/*
@@ -3759,7 +3828,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3759 /* Search for an sd spanning us and the target CPU. */ 3828 /* Search for an sd spanning us and the target CPU. */
3760 for_each_domain(target_cpu, sd) { 3829 for_each_domain(target_cpu, sd) {
3761 if ((sd->flags & SD_LOAD_BALANCE) && 3830 if ((sd->flags & SD_LOAD_BALANCE) &&
3762 cpu_isset(busiest_cpu, sd->span)) 3831 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3763 break; 3832 break;
3764 } 3833 }
3765 3834
@@ -3778,10 +3847,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3778#ifdef CONFIG_NO_HZ 3847#ifdef CONFIG_NO_HZ
3779static struct { 3848static struct {
3780 atomic_t load_balancer; 3849 atomic_t load_balancer;
3781 cpumask_t cpu_mask; 3850 cpumask_var_t cpu_mask;
3782} nohz ____cacheline_aligned = { 3851} nohz ____cacheline_aligned = {
3783 .load_balancer = ATOMIC_INIT(-1), 3852 .load_balancer = ATOMIC_INIT(-1),
3784 .cpu_mask = CPU_MASK_NONE,
3785}; 3853};
3786 3854
3787/* 3855/*
@@ -3809,7 +3877,7 @@ int select_nohz_load_balancer(int stop_tick)
3809 int cpu = smp_processor_id(); 3877 int cpu = smp_processor_id();
3810 3878
3811 if (stop_tick) { 3879 if (stop_tick) {
3812 cpu_set(cpu, nohz.cpu_mask); 3880 cpumask_set_cpu(cpu, nohz.cpu_mask);
3813 cpu_rq(cpu)->in_nohz_recently = 1; 3881 cpu_rq(cpu)->in_nohz_recently = 1;
3814 3882
3815 /* 3883 /*
@@ -3823,7 +3891,7 @@ int select_nohz_load_balancer(int stop_tick)
3823 } 3891 }
3824 3892
3825 /* time for ilb owner also to sleep */ 3893 /* time for ilb owner also to sleep */
3826 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3894 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3827 if (atomic_read(&nohz.load_balancer) == cpu) 3895 if (atomic_read(&nohz.load_balancer) == cpu)
3828 atomic_set(&nohz.load_balancer, -1); 3896 atomic_set(&nohz.load_balancer, -1);
3829 return 0; 3897 return 0;
@@ -3836,10 +3904,10 @@ int select_nohz_load_balancer(int stop_tick)
3836 } else if (atomic_read(&nohz.load_balancer) == cpu) 3904 } else if (atomic_read(&nohz.load_balancer) == cpu)
3837 return 1; 3905 return 1;
3838 } else { 3906 } else {
3839 if (!cpu_isset(cpu, nohz.cpu_mask)) 3907 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3840 return 0; 3908 return 0;
3841 3909
3842 cpu_clear(cpu, nohz.cpu_mask); 3910 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3843 3911
3844 if (atomic_read(&nohz.load_balancer) == cpu) 3912 if (atomic_read(&nohz.load_balancer) == cpu)
3845 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3913 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3867,7 +3935,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3867 unsigned long next_balance = jiffies + 60*HZ; 3935 unsigned long next_balance = jiffies + 60*HZ;
3868 int update_next_balance = 0; 3936 int update_next_balance = 0;
3869 int need_serialize; 3937 int need_serialize;
3870 cpumask_t tmp; 3938 cpumask_var_t tmp;
3939
3940 /* Fails alloc? Rebalancing probably not a priority right now. */
3941 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3942 return;
3871 3943
3872 for_each_domain(cpu, sd) { 3944 for_each_domain(cpu, sd) {
3873 if (!(sd->flags & SD_LOAD_BALANCE)) 3945 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3892,7 +3964,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3892 } 3964 }
3893 3965
3894 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3966 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3895 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3967 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3896 /* 3968 /*
3897 * We've pulled tasks over so either we're no 3969 * We've pulled tasks over so either we're no
3898 * longer idle, or one of our SMT siblings is 3970 * longer idle, or one of our SMT siblings is
@@ -3926,6 +3998,8 @@ out:
3926 */ 3998 */
3927 if (likely(update_next_balance)) 3999 if (likely(update_next_balance))
3928 rq->next_balance = next_balance; 4000 rq->next_balance = next_balance;
4001
4002 free_cpumask_var(tmp);
3929} 4003}
3930 4004
3931/* 4005/*
@@ -3950,12 +4024,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3950 */ 4024 */
3951 if (this_rq->idle_at_tick && 4025 if (this_rq->idle_at_tick &&
3952 atomic_read(&nohz.load_balancer) == this_cpu) { 4026 atomic_read(&nohz.load_balancer) == this_cpu) {
3953 cpumask_t cpus = nohz.cpu_mask;
3954 struct rq *rq; 4027 struct rq *rq;
3955 int balance_cpu; 4028 int balance_cpu;
3956 4029
3957 cpu_clear(this_cpu, cpus); 4030 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3958 for_each_cpu_mask_nr(balance_cpu, cpus) { 4031 if (balance_cpu == this_cpu)
4032 continue;
4033
3959 /* 4034 /*
3960 * If this cpu gets work to do, stop the load balancing 4035 * If this cpu gets work to do, stop the load balancing
3961 * work being done for other cpus. Next load 4036 * work being done for other cpus. Next load
@@ -3993,7 +4068,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
3993 rq->in_nohz_recently = 0; 4068 rq->in_nohz_recently = 0;
3994 4069
3995 if (atomic_read(&nohz.load_balancer) == cpu) { 4070 if (atomic_read(&nohz.load_balancer) == cpu) {
3996 cpu_clear(cpu, nohz.cpu_mask); 4071 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3997 atomic_set(&nohz.load_balancer, -1); 4072 atomic_set(&nohz.load_balancer, -1);
3998 } 4073 }
3999 4074
@@ -4006,7 +4081,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4006 * TBD: Traverse the sched domains and nominate 4081 * TBD: Traverse the sched domains and nominate
4007 * the nearest cpu in the nohz.cpu_mask. 4082 * the nearest cpu in the nohz.cpu_mask.
4008 */ 4083 */
4009 int ilb = first_cpu(nohz.cpu_mask); 4084 int ilb = cpumask_first(nohz.cpu_mask);
4010 4085
4011 if (ilb < nr_cpu_ids) 4086 if (ilb < nr_cpu_ids)
4012 resched_cpu(ilb); 4087 resched_cpu(ilb);
@@ -4018,7 +4093,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4018 * cpus with ticks stopped, is it time for that to stop? 4093 * cpus with ticks stopped, is it time for that to stop?
4019 */ 4094 */
4020 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4095 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4021 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4096 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4022 resched_cpu(cpu); 4097 resched_cpu(cpu);
4023 return; 4098 return;
4024 } 4099 }
@@ -4028,7 +4103,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4028 * someone else, then no need raise the SCHED_SOFTIRQ 4103 * someone else, then no need raise the SCHED_SOFTIRQ
4029 */ 4104 */
4030 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4105 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4031 cpu_isset(cpu, nohz.cpu_mask)) 4106 cpumask_test_cpu(cpu, nohz.cpu_mask))
4032 return; 4107 return;
4033#endif 4108#endif
4034 if (time_after_eq(jiffies, rq->next_balance)) 4109 if (time_after_eq(jiffies, rq->next_balance))
@@ -4080,13 +4155,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
4080 * Account user cpu time to a process. 4155 * Account user cpu time to a process.
4081 * @p: the process that the cpu time gets accounted to 4156 * @p: the process that the cpu time gets accounted to
4082 * @cputime: the cpu time spent in user space since the last update 4157 * @cputime: the cpu time spent in user space since the last update
4158 * @cputime_scaled: cputime scaled by cpu frequency
4083 */ 4159 */
4084void account_user_time(struct task_struct *p, cputime_t cputime) 4160void account_user_time(struct task_struct *p, cputime_t cputime,
4161 cputime_t cputime_scaled)
4085{ 4162{
4086 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4163 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4087 cputime64_t tmp; 4164 cputime64_t tmp;
4088 4165
4166 /* Add user time to process. */
4089 p->utime = cputime_add(p->utime, cputime); 4167 p->utime = cputime_add(p->utime, cputime);
4168 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4090 account_group_user_time(p, cputime); 4169 account_group_user_time(p, cputime);
4091 4170
4092 /* Add user time to cpustat. */ 4171 /* Add user time to cpustat. */
@@ -4103,51 +4182,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4103 * Account guest cpu time to a process. 4182 * Account guest cpu time to a process.
4104 * @p: the process that the cpu time gets accounted to 4183 * @p: the process that the cpu time gets accounted to
4105 * @cputime: the cpu time spent in virtual machine since the last update 4184 * @cputime: the cpu time spent in virtual machine since the last update
4185 * @cputime_scaled: cputime scaled by cpu frequency
4106 */ 4186 */
4107static void account_guest_time(struct task_struct *p, cputime_t cputime) 4187static void account_guest_time(struct task_struct *p, cputime_t cputime,
4188 cputime_t cputime_scaled)
4108{ 4189{
4109 cputime64_t tmp; 4190 cputime64_t tmp;
4110 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4191 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4111 4192
4112 tmp = cputime_to_cputime64(cputime); 4193 tmp = cputime_to_cputime64(cputime);
4113 4194
4195 /* Add guest time to process. */
4114 p->utime = cputime_add(p->utime, cputime); 4196 p->utime = cputime_add(p->utime, cputime);
4197 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4115 account_group_user_time(p, cputime); 4198 account_group_user_time(p, cputime);
4116 p->gtime = cputime_add(p->gtime, cputime); 4199 p->gtime = cputime_add(p->gtime, cputime);
4117 4200
4201 /* Add guest time to cpustat. */
4118 cpustat->user = cputime64_add(cpustat->user, tmp); 4202 cpustat->user = cputime64_add(cpustat->user, tmp);
4119 cpustat->guest = cputime64_add(cpustat->guest, tmp); 4203 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4120} 4204}
4121 4205
4122/* 4206/*
4123 * Account scaled user cpu time to a process.
4124 * @p: the process that the cpu time gets accounted to
4125 * @cputime: the cpu time spent in user space since the last update
4126 */
4127void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4128{
4129 p->utimescaled = cputime_add(p->utimescaled, cputime);
4130}
4131
4132/*
4133 * Account system cpu time to a process. 4207 * Account system cpu time to a process.
4134 * @p: the process that the cpu time gets accounted to 4208 * @p: the process that the cpu time gets accounted to
4135 * @hardirq_offset: the offset to subtract from hardirq_count() 4209 * @hardirq_offset: the offset to subtract from hardirq_count()
4136 * @cputime: the cpu time spent in kernel space since the last update 4210 * @cputime: the cpu time spent in kernel space since the last update
4211 * @cputime_scaled: cputime scaled by cpu frequency
4137 */ 4212 */
4138void account_system_time(struct task_struct *p, int hardirq_offset, 4213void account_system_time(struct task_struct *p, int hardirq_offset,
4139 cputime_t cputime) 4214 cputime_t cputime, cputime_t cputime_scaled)
4140{ 4215{
4141 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4216 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4142 struct rq *rq = this_rq();
4143 cputime64_t tmp; 4217 cputime64_t tmp;
4144 4218
4145 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 4219 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4146 account_guest_time(p, cputime); 4220 account_guest_time(p, cputime, cputime_scaled);
4147 return; 4221 return;
4148 } 4222 }
4149 4223
4224 /* Add system time to process. */
4150 p->stime = cputime_add(p->stime, cputime); 4225 p->stime = cputime_add(p->stime, cputime);
4226 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4151 account_group_system_time(p, cputime); 4227 account_group_system_time(p, cputime);
4152 4228
4153 /* Add system time to cpustat. */ 4229 /* Add system time to cpustat. */
@@ -4156,48 +4232,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4156 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4232 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4157 else if (softirq_count()) 4233 else if (softirq_count())
4158 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4234 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4159 else if (p != rq->idle)
4160 cpustat->system = cputime64_add(cpustat->system, tmp);
4161 else if (atomic_read(&rq->nr_iowait) > 0)
4162 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4163 else 4235 else
4164 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4236 cpustat->system = cputime64_add(cpustat->system, tmp);
4237
4165 /* Account for system time used */ 4238 /* Account for system time used */
4166 acct_update_integrals(p); 4239 acct_update_integrals(p);
4167} 4240}
4168 4241
4169/* 4242/*
4170 * Account scaled system cpu time to a process. 4243 * Account for involuntary wait time.
 4171 * @p: the process that the cpu time gets accounted to 4244 * @cputime: the cpu time spent in involuntary wait
4172 * @hardirq_offset: the offset to subtract from hardirq_count()
4173 * @cputime: the cpu time spent in kernel space since the last update
4174 */ 4245 */
4175void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 4246void account_steal_time(cputime_t cputime)
4176{ 4247{
4177 p->stimescaled = cputime_add(p->stimescaled, cputime); 4248 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4249 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4250
4251 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4178} 4252}
4179 4253
4180/* 4254/*
4181 * Account for involuntary wait time. 4255 * Account for idle time.
4182 * @p: the process from which the cpu time has been stolen 4256 * @cputime: the cpu time spent in idle wait
4183 * @steal: the cpu time spent in involuntary wait
4184 */ 4257 */
4185void account_steal_time(struct task_struct *p, cputime_t steal) 4258void account_idle_time(cputime_t cputime)
4186{ 4259{
4187 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4260 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4188 cputime64_t tmp = cputime_to_cputime64(steal); 4261 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4189 struct rq *rq = this_rq(); 4262 struct rq *rq = this_rq();
4190 4263
4191 if (p == rq->idle) { 4264 if (atomic_read(&rq->nr_iowait) > 0)
4192 p->stime = cputime_add(p->stime, steal); 4265 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4193 if (atomic_read(&rq->nr_iowait) > 0) 4266 else
4194 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4267 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4195 else
4196 cpustat->idle = cputime64_add(cpustat->idle, tmp);
4197 } else
4198 cpustat->steal = cputime64_add(cpustat->steal, tmp);
4199} 4268}
4200 4269
4270#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4271
4272/*
4273 * Account a single tick of cpu time.
4274 * @p: the process that the cpu time gets accounted to
4275 * @user_tick: indicates if the tick is a user or a system tick
4276 */
4277void account_process_tick(struct task_struct *p, int user_tick)
4278{
4279 cputime_t one_jiffy = jiffies_to_cputime(1);
4280 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4281 struct rq *rq = this_rq();
4282
4283 if (user_tick)
4284 account_user_time(p, one_jiffy, one_jiffy_scaled);
4285 else if (p != rq->idle)
4286 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4287 one_jiffy_scaled);
4288 else
4289 account_idle_time(one_jiffy);
4290}
4291
4292/*
4293 * Account multiple ticks of steal time.
4294 * @p: the process from which the cpu time has been stolen
4295 * @ticks: number of stolen ticks
4296 */
4297void account_steal_ticks(unsigned long ticks)
4298{
4299 account_steal_time(jiffies_to_cputime(ticks));
4300}
4301
4302/*
4303 * Account multiple ticks of idle time.
 4304 * @ticks: number of idle ticks
4305 */
4306void account_idle_ticks(unsigned long ticks)
4307{
4308 account_idle_time(jiffies_to_cputime(ticks));
4309}
4310
4311#endif
4312
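The three helpers above (account_process_tick(), account_steal_ticks(), account_idle_ticks()) are the entry points the timer code is expected to call instead of charging user, system, steal and idle time by hand. A hedged sketch of a caller; the function name and the idle bookkeeping are invented for illustration and are not taken from kernel/timer.c:

#include <linux/kernel_stat.h>
#include <linux/sched.h>

/* Illustrative per-tick hook for the task current on this CPU. */
static void example_tick(struct task_struct *p, int user_tick,
                         unsigned long ticks_slept_idle)
{
        /* Charge one jiffy to user, system or idle time as appropriate. */
        account_process_tick(p, user_tick);

        /* A NO_HZ CPU leaving an idle sleep charges the skipped jiffies. */
        if (ticks_slept_idle)
                account_idle_ticks(ticks_slept_idle);
}
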
4201/* 4313/*
4202 * Use precise platform statistics if available: 4314 * Use precise platform statistics if available:
4203 */ 4315 */
@@ -5401,10 +5513,9 @@ out_unlock:
5401 return retval; 5513 return retval;
5402} 5514}
5403 5515
5404long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5516long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5405{ 5517{
5406 cpumask_t cpus_allowed; 5518 cpumask_var_t cpus_allowed, new_mask;
5407 cpumask_t new_mask = *in_mask;
5408 struct task_struct *p; 5519 struct task_struct *p;
5409 int retval; 5520 int retval;
5410 5521
@@ -5426,6 +5537,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5426 get_task_struct(p); 5537 get_task_struct(p);
5427 read_unlock(&tasklist_lock); 5538 read_unlock(&tasklist_lock);
5428 5539
5540 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5541 retval = -ENOMEM;
5542 goto out_put_task;
5543 }
5544 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5545 retval = -ENOMEM;
5546 goto out_free_cpus_allowed;
5547 }
5429 retval = -EPERM; 5548 retval = -EPERM;
5430 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5549 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5431 goto out_unlock; 5550 goto out_unlock;
@@ -5434,37 +5553,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5434 if (retval) 5553 if (retval)
5435 goto out_unlock; 5554 goto out_unlock;
5436 5555
5437 cpuset_cpus_allowed(p, &cpus_allowed); 5556 cpuset_cpus_allowed(p, cpus_allowed);
5438 cpus_and(new_mask, new_mask, cpus_allowed); 5557 cpumask_and(new_mask, in_mask, cpus_allowed);
5439 again: 5558 again:
5440 retval = set_cpus_allowed_ptr(p, &new_mask); 5559 retval = set_cpus_allowed_ptr(p, new_mask);
5441 5560
5442 if (!retval) { 5561 if (!retval) {
5443 cpuset_cpus_allowed(p, &cpus_allowed); 5562 cpuset_cpus_allowed(p, cpus_allowed);
5444 if (!cpus_subset(new_mask, cpus_allowed)) { 5563 if (!cpumask_subset(new_mask, cpus_allowed)) {
5445 /* 5564 /*
5446 * We must have raced with a concurrent cpuset 5565 * We must have raced with a concurrent cpuset
5447 * update. Just reset the cpus_allowed to the 5566 * update. Just reset the cpus_allowed to the
5448 * cpuset's cpus_allowed 5567 * cpuset's cpus_allowed
5449 */ 5568 */
5450 new_mask = cpus_allowed; 5569 cpumask_copy(new_mask, cpus_allowed);
5451 goto again; 5570 goto again;
5452 } 5571 }
5453 } 5572 }
5454out_unlock: 5573out_unlock:
5574 free_cpumask_var(new_mask);
5575out_free_cpus_allowed:
5576 free_cpumask_var(cpus_allowed);
5577out_put_task:
5455 put_task_struct(p); 5578 put_task_struct(p);
5456 put_online_cpus(); 5579 put_online_cpus();
5457 return retval; 5580 return retval;
5458} 5581}
5459 5582
5460static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5583static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5461 cpumask_t *new_mask) 5584 struct cpumask *new_mask)
5462{ 5585{
5463 if (len < sizeof(cpumask_t)) { 5586 if (len < cpumask_size())
5464 memset(new_mask, 0, sizeof(cpumask_t)); 5587 cpumask_clear(new_mask);
5465 } else if (len > sizeof(cpumask_t)) { 5588 else if (len > cpumask_size())
5466 len = sizeof(cpumask_t); 5589 len = cpumask_size();
5467 } 5590
5468 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5591 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5469} 5592}
5470 5593
@@ -5477,17 +5600,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5477asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5600asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5478 unsigned long __user *user_mask_ptr) 5601 unsigned long __user *user_mask_ptr)
5479{ 5602{
5480 cpumask_t new_mask; 5603 cpumask_var_t new_mask;
5481 int retval; 5604 int retval;
5482 5605
5483 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5606 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5484 if (retval) 5607 return -ENOMEM;
5485 return retval;
5486 5608
5487 return sched_setaffinity(pid, &new_mask); 5609 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5610 if (retval == 0)
5611 retval = sched_setaffinity(pid, new_mask);
5612 free_cpumask_var(new_mask);
5613 return retval;
5488} 5614}
5489 5615
5490long sched_getaffinity(pid_t pid, cpumask_t *mask) 5616long sched_getaffinity(pid_t pid, struct cpumask *mask)
5491{ 5617{
5492 struct task_struct *p; 5618 struct task_struct *p;
5493 int retval; 5619 int retval;
@@ -5504,7 +5630,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5504 if (retval) 5630 if (retval)
5505 goto out_unlock; 5631 goto out_unlock;
5506 5632
5507 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5633 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5508 5634
5509out_unlock: 5635out_unlock:
5510 read_unlock(&tasklist_lock); 5636 read_unlock(&tasklist_lock);
@@ -5523,19 +5649,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5523 unsigned long __user *user_mask_ptr) 5649 unsigned long __user *user_mask_ptr)
5524{ 5650{
5525 int ret; 5651 int ret;
5526 cpumask_t mask; 5652 cpumask_var_t mask;
5527 5653
5528 if (len < sizeof(cpumask_t)) 5654 if (len < cpumask_size())
5529 return -EINVAL; 5655 return -EINVAL;
5530 5656
5531 ret = sched_getaffinity(pid, &mask); 5657 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5532 if (ret < 0) 5658 return -ENOMEM;
5533 return ret;
5534 5659
5535 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5660 ret = sched_getaffinity(pid, mask);
5536 return -EFAULT; 5661 if (ret == 0) {
5662 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5663 ret = -EFAULT;
5664 else
5665 ret = cpumask_size();
5666 }
5667 free_cpumask_var(mask);
5537 5668
5538 return sizeof(cpumask_t); 5669 return ret;
5539} 5670}
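For orientation, these are the syscalls behind the usual glibc affinity wrappers; the conversion above changes only the in-kernel mask handling. A small user-space sketch of the corresponding calls (illustrative, not part of this patch):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        /* Read the calling thread's allowed CPUs. */
        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) != 0)
                return 1;
        printf("CPU 0 allowed: %s\n", CPU_ISSET(0, &set) ? "yes" : "no");

        /* Pin the calling thread to CPU 0. */
        CPU_ZERO(&set);
        CPU_SET(0, &set);
        return sched_setaffinity(0, sizeof(set), &set) ? 1 : 0;
}
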
5540 5671
5541/** 5672/**
@@ -5877,7 +6008,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5877 idle->se.exec_start = sched_clock(); 6008 idle->se.exec_start = sched_clock();
5878 6009
5879 idle->prio = idle->normal_prio = MAX_PRIO; 6010 idle->prio = idle->normal_prio = MAX_PRIO;
5880 idle->cpus_allowed = cpumask_of_cpu(cpu); 6011 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5881 __set_task_cpu(idle, cpu); 6012 __set_task_cpu(idle, cpu);
5882 6013
5883 rq->curr = rq->idle = idle; 6014 rq->curr = rq->idle = idle;
@@ -5904,9 +6035,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5904 * indicates which cpus entered this state. This is used 6035 * indicates which cpus entered this state. This is used
5905 * in the rcu update to wait only for active cpus. For system 6036 * in the rcu update to wait only for active cpus. For system
5906 * which do not switch off the HZ timer nohz_cpu_mask should 6037 * which do not switch off the HZ timer nohz_cpu_mask should
5907 * always be CPU_MASK_NONE. 6038 * always be CPU_BITS_NONE.
5908 */ 6039 */
5909cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 6040cpumask_var_t nohz_cpu_mask;
5910 6041
5911/* 6042/*
5912 * Increase the granularity value when there are more CPUs, 6043 * Increase the granularity value when there are more CPUs,
@@ -5961,7 +6092,7 @@ static inline void sched_init_granularity(void)
5961 * task must not exit() & deallocate itself prematurely. The 6092 * task must not exit() & deallocate itself prematurely. The
5962 * call is not atomic; no spinlocks may be held. 6093 * call is not atomic; no spinlocks may be held.
5963 */ 6094 */
5964int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 6095int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5965{ 6096{
5966 struct migration_req req; 6097 struct migration_req req;
5967 unsigned long flags; 6098 unsigned long flags;
@@ -5969,13 +6100,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5969 int ret = 0; 6100 int ret = 0;
5970 6101
5971 rq = task_rq_lock(p, &flags); 6102 rq = task_rq_lock(p, &flags);
5972 if (!cpus_intersects(*new_mask, cpu_online_map)) { 6103 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
5973 ret = -EINVAL; 6104 ret = -EINVAL;
5974 goto out; 6105 goto out;
5975 } 6106 }
5976 6107
5977 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6108 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5978 !cpus_equal(p->cpus_allowed, *new_mask))) { 6109 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5979 ret = -EINVAL; 6110 ret = -EINVAL;
5980 goto out; 6111 goto out;
5981 } 6112 }
@@ -5983,15 +6114,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5983 if (p->sched_class->set_cpus_allowed) 6114 if (p->sched_class->set_cpus_allowed)
5984 p->sched_class->set_cpus_allowed(p, new_mask); 6115 p->sched_class->set_cpus_allowed(p, new_mask);
5985 else { 6116 else {
5986 p->cpus_allowed = *new_mask; 6117 cpumask_copy(&p->cpus_allowed, new_mask);
5987 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 6118 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5988 } 6119 }
5989 6120
5990 /* Can the task run on the task's current CPU? If so, we're done */ 6121 /* Can the task run on the task's current CPU? If so, we're done */
5991 if (cpu_isset(task_cpu(p), *new_mask)) 6122 if (cpumask_test_cpu(task_cpu(p), new_mask))
5992 goto out; 6123 goto out;
5993 6124
5994 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6125 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
5995 /* Need help from migration thread: drop lock and wait. */ 6126 /* Need help from migration thread: drop lock and wait. */
5996 task_rq_unlock(rq, &flags); 6127 task_rq_unlock(rq, &flags);
5997 wake_up_process(rq->migration_thread); 6128 wake_up_process(rq->migration_thread);
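Because set_cpus_allowed_ptr() now takes a const struct cpumask *, in-kernel callers can pass cpumask_of(cpu) or an allocated cpumask_var_t without copying a full cpumask_t by value. A usage sketch; the helper and its fallback mask are hypothetical:

#include <linux/cpumask.h>
#include <linux/sched.h>

/* Bind a helper kthread to one CPU, falling back to a caller-supplied mask. */
static int bind_helper(struct task_struct *tsk, int cpu,
                       const struct cpumask *fallback)
{
        int ret = set_cpus_allowed_ptr(tsk, cpumask_of(cpu));

        if (ret)
                ret = set_cpus_allowed_ptr(tsk, fallback);

        return ret;
}
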
@@ -6033,7 +6164,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6033 if (task_cpu(p) != src_cpu) 6164 if (task_cpu(p) != src_cpu)
6034 goto done; 6165 goto done;
6035 /* Affinity changed (again). */ 6166 /* Affinity changed (again). */
6036 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6167 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6037 goto fail; 6168 goto fail;
6038 6169
6039 on_rq = p->se.on_rq; 6170 on_rq = p->se.on_rq;
@@ -6130,50 +6261,41 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6130 */ 6261 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6262static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6263{
6133 unsigned long flags;
6134 cpumask_t mask;
6135 struct rq *rq;
6136 int dest_cpu; 6264 int dest_cpu;
6265 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
6137 6266
6138 do { 6267again:
6139 /* On same node? */ 6268 /* Look for allowed, online CPU in same node. */
6140 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6269 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6141 cpus_and(mask, mask, p->cpus_allowed); 6270 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6142 dest_cpu = any_online_cpu(mask); 6271 goto move;
6143 6272
6144 /* On any allowed CPU? */ 6273 /* Any allowed, online CPU? */
6145 if (dest_cpu >= nr_cpu_ids) 6274 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6146 dest_cpu = any_online_cpu(p->cpus_allowed); 6275 if (dest_cpu < nr_cpu_ids)
6276 goto move;
6147 6277
6148 /* No more Mr. Nice Guy. */ 6278 /* No more Mr. Nice Guy. */
6149 if (dest_cpu >= nr_cpu_ids) { 6279 if (dest_cpu >= nr_cpu_ids) {
6150 cpumask_t cpus_allowed; 6280 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6281 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6151 6282
6152 cpuset_cpus_allowed_locked(p, &cpus_allowed); 6283 /*
6153 /* 6284 * Don't tell them about moving exiting tasks or
6154 * Try to stay on the same cpuset, where the 6285 * kernel threads (both mm NULL), since they never
6155 * current cpuset may be a subset of all cpus. 6286 * leave kernel.
6156 * The cpuset_cpus_allowed_locked() variant of 6287 */
6157 * cpuset_cpus_allowed() will not block. It must be 6288 if (p->mm && printk_ratelimit()) {
6158 * called within calls to cpuset_lock/cpuset_unlock. 6289 printk(KERN_INFO "process %d (%s) no "
6159 */ 6290 "longer affine to cpu%d\n",
6160 rq = task_rq_lock(p, &flags); 6291 task_pid_nr(p), p->comm, dead_cpu);
6161 p->cpus_allowed = cpus_allowed;
6162 dest_cpu = any_online_cpu(p->cpus_allowed);
6163 task_rq_unlock(rq, &flags);
6164
6165 /*
6166 * Don't tell them about moving exiting tasks or
6167 * kernel threads (both mm NULL), since they never
6168 * leave kernel.
6169 */
6170 if (p->mm && printk_ratelimit()) {
6171 printk(KERN_INFO "process %d (%s) no "
6172 "longer affine to cpu%d\n",
6173 task_pid_nr(p), p->comm, dead_cpu);
6174 }
6175 } 6292 }
6176 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6293 }
6294
6295move:
6296 /* It can have affinity changed while we were choosing. */
6297 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6298 goto again;
6177} 6299}
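The rewritten fallback above leans on two conventions of the new API: iterators and cpumask_any_and() report "no CPU found" as a value >= nr_cpu_ids, and cpumask_of_node() hands back the node's mask without an on-stack copy. A compact sketch of the same selection order; the helper is illustrative, not the scheduler's code:

#include <linux/cpumask.h>
#include <linux/topology.h>

/* Prefer an allowed, online CPU on @node, else any allowed online CPU. */
static int pick_fallback_cpu(int node, const struct cpumask *allowed)
{
        int cpu;

        for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
                if (cpumask_test_cpu(cpu, allowed))
                        return cpu;

        return cpumask_any_and(allowed, cpu_online_mask); /* nr_cpu_ids if none */
}
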
6178 6300
6179/* 6301/*
@@ -6185,7 +6307,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6185 */ 6307 */
6186static void migrate_nr_uninterruptible(struct rq *rq_src) 6308static void migrate_nr_uninterruptible(struct rq *rq_src)
6187{ 6309{
6188 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6310 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6189 unsigned long flags; 6311 unsigned long flags;
6190 6312
6191 local_irq_save(flags); 6313 local_irq_save(flags);
@@ -6475,7 +6597,7 @@ static void set_rq_online(struct rq *rq)
6475 if (!rq->online) { 6597 if (!rq->online) {
6476 const struct sched_class *class; 6598 const struct sched_class *class;
6477 6599
6478 cpu_set(rq->cpu, rq->rd->online); 6600 cpumask_set_cpu(rq->cpu, rq->rd->online);
6479 rq->online = 1; 6601 rq->online = 1;
6480 6602
6481 for_each_class(class) { 6603 for_each_class(class) {
@@ -6495,7 +6617,7 @@ static void set_rq_offline(struct rq *rq)
6495 class->rq_offline(rq); 6617 class->rq_offline(rq);
6496 } 6618 }
6497 6619
6498 cpu_clear(rq->cpu, rq->rd->online); 6620 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6499 rq->online = 0; 6621 rq->online = 0;
6500 } 6622 }
6501} 6623}
@@ -6536,7 +6658,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6536 rq = cpu_rq(cpu); 6658 rq = cpu_rq(cpu);
6537 spin_lock_irqsave(&rq->lock, flags); 6659 spin_lock_irqsave(&rq->lock, flags);
6538 if (rq->rd) { 6660 if (rq->rd) {
6539 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6661 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6540 6662
6541 set_rq_online(rq); 6663 set_rq_online(rq);
6542 } 6664 }
@@ -6550,7 +6672,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6550 break; 6672 break;
6551 /* Unbind it from offline cpu so it can run. Fall thru. */ 6673 /* Unbind it from offline cpu so it can run. Fall thru. */
6552 kthread_bind(cpu_rq(cpu)->migration_thread, 6674 kthread_bind(cpu_rq(cpu)->migration_thread,
6553 any_online_cpu(cpu_online_map)); 6675 cpumask_any(cpu_online_mask));
6554 kthread_stop(cpu_rq(cpu)->migration_thread); 6676 kthread_stop(cpu_rq(cpu)->migration_thread);
6555 cpu_rq(cpu)->migration_thread = NULL; 6677 cpu_rq(cpu)->migration_thread = NULL;
6556 break; 6678 break;
@@ -6600,7 +6722,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6600 rq = cpu_rq(cpu); 6722 rq = cpu_rq(cpu);
6601 spin_lock_irqsave(&rq->lock, flags); 6723 spin_lock_irqsave(&rq->lock, flags);
6602 if (rq->rd) { 6724 if (rq->rd) {
6603 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6725 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6604 set_rq_offline(rq); 6726 set_rq_offline(rq);
6605 } 6727 }
6606 spin_unlock_irqrestore(&rq->lock, flags); 6728 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6639,13 +6761,13 @@ early_initcall(migration_init);
6639#ifdef CONFIG_SCHED_DEBUG 6761#ifdef CONFIG_SCHED_DEBUG
6640 6762
6641static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6763static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6642 cpumask_t *groupmask) 6764 struct cpumask *groupmask)
6643{ 6765{
6644 struct sched_group *group = sd->groups; 6766 struct sched_group *group = sd->groups;
6645 char str[256]; 6767 char str[256];
6646 6768
6647 cpulist_scnprintf(str, sizeof(str), sd->span); 6769 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6648 cpus_clear(*groupmask); 6770 cpumask_clear(groupmask);
6649 6771
6650 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6772 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6651 6773
@@ -6659,11 +6781,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6659 6781
6660 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6782 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6661 6783
6662 if (!cpu_isset(cpu, sd->span)) { 6784 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6663 printk(KERN_ERR "ERROR: domain->span does not contain " 6785 printk(KERN_ERR "ERROR: domain->span does not contain "
6664 "CPU%d\n", cpu); 6786 "CPU%d\n", cpu);
6665 } 6787 }
6666 if (!cpu_isset(cpu, group->cpumask)) { 6788 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6667 printk(KERN_ERR "ERROR: domain->groups does not contain" 6789 printk(KERN_ERR "ERROR: domain->groups does not contain"
6668 " CPU%d\n", cpu); 6790 " CPU%d\n", cpu);
6669 } 6791 }
@@ -6683,31 +6805,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6683 break; 6805 break;
6684 } 6806 }
6685 6807
6686 if (!cpus_weight(group->cpumask)) { 6808 if (!cpumask_weight(sched_group_cpus(group))) {
6687 printk(KERN_CONT "\n"); 6809 printk(KERN_CONT "\n");
6688 printk(KERN_ERR "ERROR: empty group\n"); 6810 printk(KERN_ERR "ERROR: empty group\n");
6689 break; 6811 break;
6690 } 6812 }
6691 6813
6692 if (cpus_intersects(*groupmask, group->cpumask)) { 6814 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6693 printk(KERN_CONT "\n"); 6815 printk(KERN_CONT "\n");
6694 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6816 printk(KERN_ERR "ERROR: repeated CPUs\n");
6695 break; 6817 break;
6696 } 6818 }
6697 6819
6698 cpus_or(*groupmask, *groupmask, group->cpumask); 6820 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6699 6821
6700 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6822 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6701 printk(KERN_CONT " %s", str); 6823 printk(KERN_CONT " %s", str);
6702 6824
6703 group = group->next; 6825 group = group->next;
6704 } while (group != sd->groups); 6826 } while (group != sd->groups);
6705 printk(KERN_CONT "\n"); 6827 printk(KERN_CONT "\n");
6706 6828
6707 if (!cpus_equal(sd->span, *groupmask)) 6829 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6708 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6830 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6709 6831
6710 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6832 if (sd->parent &&
6833 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6711 printk(KERN_ERR "ERROR: parent span is not a superset " 6834 printk(KERN_ERR "ERROR: parent span is not a superset "
6712 "of domain->span\n"); 6835 "of domain->span\n");
6713 return 0; 6836 return 0;
@@ -6715,7 +6838,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6715 6838
6716static void sched_domain_debug(struct sched_domain *sd, int cpu) 6839static void sched_domain_debug(struct sched_domain *sd, int cpu)
6717{ 6840{
6718 cpumask_t *groupmask; 6841 cpumask_var_t groupmask;
6719 int level = 0; 6842 int level = 0;
6720 6843
6721 if (!sd) { 6844 if (!sd) {
@@ -6725,8 +6848,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6725 6848
6726 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6849 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6727 6850
6728 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6851 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6729 if (!groupmask) {
6730 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6852 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6731 return; 6853 return;
6732 } 6854 }
@@ -6739,7 +6861,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6739 if (!sd) 6861 if (!sd)
6740 break; 6862 break;
6741 } 6863 }
6742 kfree(groupmask); 6864 free_cpumask_var(groupmask);
6743} 6865}
6744#else /* !CONFIG_SCHED_DEBUG */ 6866#else /* !CONFIG_SCHED_DEBUG */
6745# define sched_domain_debug(sd, cpu) do { } while (0) 6867# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6747,7 +6869,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6747 6869
6748static int sd_degenerate(struct sched_domain *sd) 6870static int sd_degenerate(struct sched_domain *sd)
6749{ 6871{
6750 if (cpus_weight(sd->span) == 1) 6872 if (cpumask_weight(sched_domain_span(sd)) == 1)
6751 return 1; 6873 return 1;
6752 6874
6753 /* Following flags need at least 2 groups */ 6875 /* Following flags need at least 2 groups */
@@ -6778,7 +6900,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6778 if (sd_degenerate(parent)) 6900 if (sd_degenerate(parent))
6779 return 1; 6901 return 1;
6780 6902
6781 if (!cpus_equal(sd->span, parent->span)) 6903 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6782 return 0; 6904 return 0;
6783 6905
6784 /* Does parent contain flags not in child? */ 6906 /* Does parent contain flags not in child? */
@@ -6802,6 +6924,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6802 return 1; 6924 return 1;
6803} 6925}
6804 6926
6927static void free_rootdomain(struct root_domain *rd)
6928{
6929 cpupri_cleanup(&rd->cpupri);
6930
6931 free_cpumask_var(rd->rto_mask);
6932 free_cpumask_var(rd->online);
6933 free_cpumask_var(rd->span);
6934 kfree(rd);
6935}
6936
6805static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6937static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6806{ 6938{
6807 unsigned long flags; 6939 unsigned long flags;
@@ -6811,38 +6943,62 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6811 if (rq->rd) { 6943 if (rq->rd) {
6812 struct root_domain *old_rd = rq->rd; 6944 struct root_domain *old_rd = rq->rd;
6813 6945
6814 if (cpu_isset(rq->cpu, old_rd->online)) 6946 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6815 set_rq_offline(rq); 6947 set_rq_offline(rq);
6816 6948
6817 cpu_clear(rq->cpu, old_rd->span); 6949 cpumask_clear_cpu(rq->cpu, old_rd->span);
6818 6950
6819 if (atomic_dec_and_test(&old_rd->refcount)) 6951 if (atomic_dec_and_test(&old_rd->refcount))
6820 kfree(old_rd); 6952 free_rootdomain(old_rd);
6821 } 6953 }
6822 6954
6823 atomic_inc(&rd->refcount); 6955 atomic_inc(&rd->refcount);
6824 rq->rd = rd; 6956 rq->rd = rd;
6825 6957
6826 cpu_set(rq->cpu, rd->span); 6958 cpumask_set_cpu(rq->cpu, rd->span);
6827 if (cpu_isset(rq->cpu, cpu_online_map)) 6959 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6828 set_rq_online(rq); 6960 set_rq_online(rq);
6829 6961
6830 spin_unlock_irqrestore(&rq->lock, flags); 6962 spin_unlock_irqrestore(&rq->lock, flags);
6831} 6963}
6832 6964
6833static void init_rootdomain(struct root_domain *rd) 6965static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
6834{ 6966{
6835 memset(rd, 0, sizeof(*rd)); 6967 memset(rd, 0, sizeof(*rd));
6836 6968
6837 cpus_clear(rd->span); 6969 if (bootmem) {
6838 cpus_clear(rd->online); 6970 alloc_bootmem_cpumask_var(&def_root_domain.span);
6971 alloc_bootmem_cpumask_var(&def_root_domain.online);
6972 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
6973 cpupri_init(&rd->cpupri, true);
6974 return 0;
6975 }
6839 6976
6840 cpupri_init(&rd->cpupri); 6977 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6978 goto out;
6979 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6980 goto free_span;
6981 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6982 goto free_online;
6983
6984 if (cpupri_init(&rd->cpupri, false) != 0)
6985 goto free_rto_mask;
6986 return 0;
6987
6988free_rto_mask:
6989 free_cpumask_var(rd->rto_mask);
6990free_online:
6991 free_cpumask_var(rd->online);
6992free_span:
6993 free_cpumask_var(rd->span);
6994out:
6995 return -ENOMEM;
6841} 6996}
6842 6997
6843static void init_defrootdomain(void) 6998static void init_defrootdomain(void)
6844{ 6999{
6845 init_rootdomain(&def_root_domain); 7000 init_rootdomain(&def_root_domain, true);
7001
6846 atomic_set(&def_root_domain.refcount, 1); 7002 atomic_set(&def_root_domain.refcount, 1);
6847} 7003}
6848 7004
@@ -6854,7 +7010,10 @@ static struct root_domain *alloc_rootdomain(void)
6854 if (!rd) 7010 if (!rd)
6855 return NULL; 7011 return NULL;
6856 7012
6857 init_rootdomain(rd); 7013 if (init_rootdomain(rd, false) != 0) {
7014 kfree(rd);
7015 return NULL;
7016 }
6858 7017
6859 return rd; 7018 return rd;
6860} 7019}
@@ -6896,19 +7055,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6896} 7055}
6897 7056
6898/* cpus with isolated domains */ 7057/* cpus with isolated domains */
6899static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 7058static cpumask_var_t cpu_isolated_map;
6900 7059
6901/* Setup the mask of cpus configured for isolated domains */ 7060/* Setup the mask of cpus configured for isolated domains */
6902static int __init isolated_cpu_setup(char *str) 7061static int __init isolated_cpu_setup(char *str)
6903{ 7062{
6904 static int __initdata ints[NR_CPUS]; 7063 cpulist_parse(str, cpu_isolated_map);
6905 int i;
6906
6907 str = get_options(str, ARRAY_SIZE(ints), ints);
6908 cpus_clear(cpu_isolated_map);
6909 for (i = 1; i <= ints[0]; i++)
6910 if (ints[i] < NR_CPUS)
6911 cpu_set(ints[i], cpu_isolated_map);
6912 return 1; 7064 return 1;
6913} 7065}
6914 7066
@@ -6917,42 +7069,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6917/* 7069/*
6918 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 7070 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6919 * to a function which identifies what group(along with sched group) a CPU 7071 * to a function which identifies what group(along with sched group) a CPU
6920 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 7072 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6921 * (due to the fact that we keep track of groups covered with a cpumask_t). 7073 * (due to the fact that we keep track of groups covered with a struct cpumask).
6922 * 7074 *
6923 * init_sched_build_groups will build a circular linked list of the groups 7075 * init_sched_build_groups will build a circular linked list of the groups
6924 * covered by the given span, and will set each group's ->cpumask correctly, 7076 * covered by the given span, and will set each group's ->cpumask correctly,
6925 * and ->cpu_power to 0. 7077 * and ->cpu_power to 0.
6926 */ 7078 */
6927static void 7079static void
6928init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 7080init_sched_build_groups(const struct cpumask *span,
6929 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 7081 const struct cpumask *cpu_map,
7082 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6930 struct sched_group **sg, 7083 struct sched_group **sg,
6931 cpumask_t *tmpmask), 7084 struct cpumask *tmpmask),
6932 cpumask_t *covered, cpumask_t *tmpmask) 7085 struct cpumask *covered, struct cpumask *tmpmask)
6933{ 7086{
6934 struct sched_group *first = NULL, *last = NULL; 7087 struct sched_group *first = NULL, *last = NULL;
6935 int i; 7088 int i;
6936 7089
6937 cpus_clear(*covered); 7090 cpumask_clear(covered);
6938 7091
6939 for_each_cpu_mask_nr(i, *span) { 7092 for_each_cpu(i, span) {
6940 struct sched_group *sg; 7093 struct sched_group *sg;
6941 int group = group_fn(i, cpu_map, &sg, tmpmask); 7094 int group = group_fn(i, cpu_map, &sg, tmpmask);
6942 int j; 7095 int j;
6943 7096
6944 if (cpu_isset(i, *covered)) 7097 if (cpumask_test_cpu(i, covered))
6945 continue; 7098 continue;
6946 7099
6947 cpus_clear(sg->cpumask); 7100 cpumask_clear(sched_group_cpus(sg));
6948 sg->__cpu_power = 0; 7101 sg->__cpu_power = 0;
6949 7102
6950 for_each_cpu_mask_nr(j, *span) { 7103 for_each_cpu(j, span) {
6951 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 7104 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6952 continue; 7105 continue;
6953 7106
6954 cpu_set(j, *covered); 7107 cpumask_set_cpu(j, covered);
6955 cpu_set(j, sg->cpumask); 7108 cpumask_set_cpu(j, sched_group_cpus(sg));
6956 } 7109 }
6957 if (!first) 7110 if (!first)
6958 first = sg; 7111 first = sg;
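The hunk above is representative of the whole conversion: every operation on a cpumask_t value becomes an operation through a struct cpumask pointer, and scratch masks become cpumask_var_t. A minimal sketch of the new calls used here (the function name is made up for illustration):

#include <linux/cpumask.h>
#include <linux/gfp.h>

static int cover_span(const struct cpumask *span)
{
	cpumask_var_t covered;
	int cpu;

	/* may be heap-backed when CONFIG_CPUMASK_OFFSTACK=y */
	if (!alloc_cpumask_var(&covered, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(covered);				/* was cpus_clear(*covered) */
	for_each_cpu(cpu, span) {			/* was for_each_cpu_mask_nr(cpu, *span) */
		if (cpumask_test_cpu(cpu, covered))	/* was cpu_isset(cpu, *covered) */
			continue;
		cpumask_set_cpu(cpu, covered);		/* was cpu_set(cpu, *covered) */
	}

	free_cpumask_var(covered);
	return 0;
}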
@@ -7016,23 +7169,21 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7016 * should be one that prevents unnecessary balancing, but also spreads tasks 7169 * should be one that prevents unnecessary balancing, but also spreads tasks
7017 * out optimally. 7170 * out optimally.
7018 */ 7171 */
7019static void sched_domain_node_span(int node, cpumask_t *span) 7172static void sched_domain_node_span(int node, struct cpumask *span)
7020{ 7173{
7021 nodemask_t used_nodes; 7174 nodemask_t used_nodes;
7022 node_to_cpumask_ptr(nodemask, node);
7023 int i; 7175 int i;
7024 7176
7025 cpus_clear(*span); 7177 cpumask_clear(span);
7026 nodes_clear(used_nodes); 7178 nodes_clear(used_nodes);
7027 7179
7028 cpus_or(*span, *span, *nodemask); 7180 cpumask_or(span, span, cpumask_of_node(node));
7029 node_set(node, used_nodes); 7181 node_set(node, used_nodes);
7030 7182
7031 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7183 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7032 int next_node = find_next_best_node(node, &used_nodes); 7184 int next_node = find_next_best_node(node, &used_nodes);
7033 7185
7034 node_to_cpumask_ptr_next(nodemask, next_node); 7186 cpumask_or(span, span, cpumask_of_node(next_node));
7035 cpus_or(*span, *span, *nodemask);
7036 } 7187 }
7037} 7188}
7038#endif /* CONFIG_NUMA */ 7189#endif /* CONFIG_NUMA */
@@ -7040,18 +7191,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7040int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7191int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7041 7192
7042/* 7193/*
7194 * The cpus mask in sched_group and sched_domain hangs off the end.
7195 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7196 * for nr_cpu_ids < CONFIG_NR_CPUS.
7197 */
7198struct static_sched_group {
7199 struct sched_group sg;
7200 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7201};
7202
7203struct static_sched_domain {
7204 struct sched_domain sd;
7205 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7206};
7207
7208/*
7043 * SMT sched-domains: 7209 * SMT sched-domains:
7044 */ 7210 */
7045#ifdef CONFIG_SCHED_SMT 7211#ifdef CONFIG_SCHED_SMT
7046static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7212static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7047static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7213static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7048 7214
7049static int 7215static int
7050cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7216cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7051 cpumask_t *unused) 7217 struct sched_group **sg, struct cpumask *unused)
7052{ 7218{
7053 if (sg) 7219 if (sg)
7054 *sg = &per_cpu(sched_group_cpus, cpu); 7220 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7055 return cpu; 7221 return cpu;
7056} 7222}
7057#endif /* CONFIG_SCHED_SMT */ 7223#endif /* CONFIG_SCHED_SMT */
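The static_sched_group/static_sched_domain wrappers only make sense together with the trailing-bitmap layout of the real structures, which this diff does not show. A sketch of the idea with made-up names (the actual sched_group definition and the sched_group_cpus() helper used in later hunks are assumptions inferred from how they are called):

/* the real structure ends in a zero-length array with no storage of its own */
struct demo_group {
	struct demo_group *next;
	unsigned int __cpu_power;
	unsigned long cpumask[0];	/* GNU zero-length array */
};

/* the static wrapper supplies CONFIG_NR_CPUS bits of storage right behind it */
struct static_demo_group {
	struct demo_group sg;
	DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
};

static inline struct cpumask *demo_group_cpus(struct demo_group *sg)
{
	return to_cpumask(sg->cpumask);	/* raw bitmap -> struct cpumask * */
}

This is why later hunks take &per_cpu(..., cpu).sg: the per-cpu variable is the wrapper, while the rest of the scheduler only ever sees the embedded group.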
@@ -7060,56 +7226,53 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7060 * multi-core sched-domains: 7226 * multi-core sched-domains:
7061 */ 7227 */
7062#ifdef CONFIG_SCHED_MC 7228#ifdef CONFIG_SCHED_MC
7063static DEFINE_PER_CPU(struct sched_domain, core_domains); 7229static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7064static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7230static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7065#endif /* CONFIG_SCHED_MC */ 7231#endif /* CONFIG_SCHED_MC */
7066 7232
7067#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7233#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7068static int 7234static int
7069cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7235cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7070 cpumask_t *mask) 7236 struct sched_group **sg, struct cpumask *mask)
7071{ 7237{
7072 int group; 7238 int group;
7073 7239
7074 *mask = per_cpu(cpu_sibling_map, cpu); 7240 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7075 cpus_and(*mask, *mask, *cpu_map); 7241 group = cpumask_first(mask);
7076 group = first_cpu(*mask);
7077 if (sg) 7242 if (sg)
7078 *sg = &per_cpu(sched_group_core, group); 7243 *sg = &per_cpu(sched_group_core, group).sg;
7079 return group; 7244 return group;
7080} 7245}
7081#elif defined(CONFIG_SCHED_MC) 7246#elif defined(CONFIG_SCHED_MC)
7082static int 7247static int
7083cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7248cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7084 cpumask_t *unused) 7249 struct sched_group **sg, struct cpumask *unused)
7085{ 7250{
7086 if (sg) 7251 if (sg)
7087 *sg = &per_cpu(sched_group_core, cpu); 7252 *sg = &per_cpu(sched_group_core, cpu).sg;
7088 return cpu; 7253 return cpu;
7089} 7254}
7090#endif 7255#endif
7091 7256
7092static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7257static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7093static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7258static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7094 7259
7095static int 7260static int
7096cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7261cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7097 cpumask_t *mask) 7262 struct sched_group **sg, struct cpumask *mask)
7098{ 7263{
7099 int group; 7264 int group;
7100#ifdef CONFIG_SCHED_MC 7265#ifdef CONFIG_SCHED_MC
7101 *mask = cpu_coregroup_map(cpu); 7266 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7102 cpus_and(*mask, *mask, *cpu_map); 7267 group = cpumask_first(mask);
7103 group = first_cpu(*mask);
7104#elif defined(CONFIG_SCHED_SMT) 7268#elif defined(CONFIG_SCHED_SMT)
7105 *mask = per_cpu(cpu_sibling_map, cpu); 7269 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7106 cpus_and(*mask, *mask, *cpu_map); 7270 group = cpumask_first(mask);
7107 group = first_cpu(*mask);
7108#else 7271#else
7109 group = cpu; 7272 group = cpu;
7110#endif 7273#endif
7111 if (sg) 7274 if (sg)
7112 *sg = &per_cpu(sched_group_phys, group); 7275 *sg = &per_cpu(sched_group_phys, group).sg;
7113 return group; 7276 return group;
7114} 7277}
7115 7278
@@ -7123,19 +7286,19 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7123static struct sched_group ***sched_group_nodes_bycpu; 7286static struct sched_group ***sched_group_nodes_bycpu;
7124 7287
7125static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7288static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7126static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7289static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7127 7290
7128static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7291static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7129 struct sched_group **sg, cpumask_t *nodemask) 7292 struct sched_group **sg,
7293 struct cpumask *nodemask)
7130{ 7294{
7131 int group; 7295 int group;
7132 7296
7133 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7297 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7134 cpus_and(*nodemask, *nodemask, *cpu_map); 7298 group = cpumask_first(nodemask);
7135 group = first_cpu(*nodemask);
7136 7299
7137 if (sg) 7300 if (sg)
7138 *sg = &per_cpu(sched_group_allnodes, group); 7301 *sg = &per_cpu(sched_group_allnodes, group).sg;
7139 return group; 7302 return group;
7140} 7303}
7141 7304
@@ -7147,11 +7310,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7147 if (!sg) 7310 if (!sg)
7148 return; 7311 return;
7149 do { 7312 do {
7150 for_each_cpu_mask_nr(j, sg->cpumask) { 7313 for_each_cpu(j, sched_group_cpus(sg)) {
7151 struct sched_domain *sd; 7314 struct sched_domain *sd;
7152 7315
7153 sd = &per_cpu(phys_domains, j); 7316 sd = &per_cpu(phys_domains, j).sd;
7154 if (j != first_cpu(sd->groups->cpumask)) { 7317 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7155 /* 7318 /*
7156 * Only add "power" once for each 7319 * Only add "power" once for each
7157 * physical package. 7320 * physical package.
@@ -7168,11 +7331,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7168 7331
7169#ifdef CONFIG_NUMA 7332#ifdef CONFIG_NUMA
7170/* Free memory allocated for various sched_group structures */ 7333/* Free memory allocated for various sched_group structures */
7171static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7334static void free_sched_groups(const struct cpumask *cpu_map,
7335 struct cpumask *nodemask)
7172{ 7336{
7173 int cpu, i; 7337 int cpu, i;
7174 7338
7175 for_each_cpu_mask_nr(cpu, *cpu_map) { 7339 for_each_cpu(cpu, cpu_map) {
7176 struct sched_group **sched_group_nodes 7340 struct sched_group **sched_group_nodes
7177 = sched_group_nodes_bycpu[cpu]; 7341 = sched_group_nodes_bycpu[cpu];
7178 7342
@@ -7182,9 +7346,8 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7182 for (i = 0; i < nr_node_ids; i++) { 7346 for (i = 0; i < nr_node_ids; i++) {
7183 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7347 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7184 7348
7185 *nodemask = node_to_cpumask(i); 7349 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7186 cpus_and(*nodemask, *nodemask, *cpu_map); 7350 if (cpumask_empty(nodemask))
7187 if (cpus_empty(*nodemask))
7188 continue; 7351 continue;
7189 7352
7190 if (sg == NULL) 7353 if (sg == NULL)
@@ -7202,7 +7365,8 @@ next_sg:
7202 } 7365 }
7203} 7366}
7204#else /* !CONFIG_NUMA */ 7367#else /* !CONFIG_NUMA */
7205static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7368static void free_sched_groups(const struct cpumask *cpu_map,
7369 struct cpumask *nodemask)
7206{ 7370{
7207} 7371}
7208#endif /* CONFIG_NUMA */ 7372#endif /* CONFIG_NUMA */
@@ -7228,7 +7392,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7228 7392
7229 WARN_ON(!sd || !sd->groups); 7393 WARN_ON(!sd || !sd->groups);
7230 7394
7231 if (cpu != first_cpu(sd->groups->cpumask)) 7395 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7232 return; 7396 return;
7233 7397
7234 child = sd->child; 7398 child = sd->child;
@@ -7293,48 +7457,6 @@ SD_INIT_FUNC(CPU)
7293 SD_INIT_FUNC(MC) 7457 SD_INIT_FUNC(MC)
7294#endif 7458#endif
7295 7459
7296/*
7297 * To minimize stack usage kmalloc room for cpumasks and share the
7298 * space as the usage in build_sched_domains() dictates. Used only
7299 * if the amount of space is significant.
7300 */
7301struct allmasks {
7302 cpumask_t tmpmask; /* make this one first */
7303 union {
7304 cpumask_t nodemask;
7305 cpumask_t this_sibling_map;
7306 cpumask_t this_core_map;
7307 };
7308 cpumask_t send_covered;
7309
7310#ifdef CONFIG_NUMA
7311 cpumask_t domainspan;
7312 cpumask_t covered;
7313 cpumask_t notcovered;
7314#endif
7315};
7316
7317#if NR_CPUS > 128
7318#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7319static inline void sched_cpumask_alloc(struct allmasks **masks)
7320{
7321 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7322}
7323static inline void sched_cpumask_free(struct allmasks *masks)
7324{
7325 kfree(masks);
7326}
7327#else
7328#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7329static inline void sched_cpumask_alloc(struct allmasks **masks)
7330{ }
7331static inline void sched_cpumask_free(struct allmasks *masks)
7332{ }
7333#endif
7334
7335#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7336 ((unsigned long)(a) + offsetof(struct allmasks, v))
7337
7338static int default_relax_domain_level = -1; 7460static int default_relax_domain_level = -1;
7339 7461
7340static int __init setup_relax_domain_level(char *str) 7462static int __init setup_relax_domain_level(char *str)
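The removed allmasks bundle (one kmalloc shared by every scratch mask) is replaced in __build_sched_domains() below by one cpumask_var_t per scratch mask, each allocated and freed individually. The pattern, reduced to two masks with made-up names:

static int two_scratch_masks(void)
{
	cpumask_var_t a, b;
	int err = -ENOMEM;

	if (!alloc_cpumask_var(&a, GFP_KERNEL))
		goto out;
	if (!alloc_cpumask_var(&b, GFP_KERNEL))
		goto free_a;

	/* ... use a and b ... */
	err = 0;

	free_cpumask_var(b);
free_a:
	free_cpumask_var(a);
out:
	return err;
}

With CONFIG_CPUMASK_OFFSTACK=n the cpumask_var_t degenerates to an on-stack array, alloc_cpumask_var() always succeeds and free_cpumask_var() is a no-op, so small configurations pay nothing for the extra structure.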
@@ -7374,17 +7496,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7374 * Build sched domains for a given set of cpus and attach the sched domains 7496 * Build sched domains for a given set of cpus and attach the sched domains
7375 * to the individual cpus 7497 * to the individual cpus
7376 */ 7498 */
7377static int __build_sched_domains(const cpumask_t *cpu_map, 7499static int __build_sched_domains(const struct cpumask *cpu_map,
7378 struct sched_domain_attr *attr) 7500 struct sched_domain_attr *attr)
7379{ 7501{
7380 int i; 7502 int i, err = -ENOMEM;
7381 struct root_domain *rd; 7503 struct root_domain *rd;
7382 SCHED_CPUMASK_DECLARE(allmasks); 7504 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7383 cpumask_t *tmpmask; 7505 tmpmask;
7384#ifdef CONFIG_NUMA 7506#ifdef CONFIG_NUMA
7507 cpumask_var_t domainspan, covered, notcovered;
7385 struct sched_group **sched_group_nodes = NULL; 7508 struct sched_group **sched_group_nodes = NULL;
7386 int sd_allnodes = 0; 7509 int sd_allnodes = 0;
7387 7510
7511 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7512 goto out;
7513 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7514 goto free_domainspan;
7515 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7516 goto free_covered;
7517#endif
7518
7519 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7520 goto free_notcovered;
7521 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7522 goto free_nodemask;
7523 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7524 goto free_this_sibling_map;
7525 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7526 goto free_this_core_map;
7527 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7528 goto free_send_covered;
7529
7530#ifdef CONFIG_NUMA
7388 /* 7531 /*
7389 * Allocate the per-node list of sched groups 7532 * Allocate the per-node list of sched groups
7390 */ 7533 */
@@ -7392,54 +7535,35 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7392 GFP_KERNEL); 7535 GFP_KERNEL);
7393 if (!sched_group_nodes) { 7536 if (!sched_group_nodes) {
7394 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7537 printk(KERN_WARNING "Can not alloc sched group node list\n");
7395 return -ENOMEM; 7538 goto free_tmpmask;
7396 } 7539 }
7397#endif 7540#endif
7398 7541
7399 rd = alloc_rootdomain(); 7542 rd = alloc_rootdomain();
7400 if (!rd) { 7543 if (!rd) {
7401 printk(KERN_WARNING "Cannot alloc root domain\n"); 7544 printk(KERN_WARNING "Cannot alloc root domain\n");
7402#ifdef CONFIG_NUMA 7545 goto free_sched_groups;
7403 kfree(sched_group_nodes);
7404#endif
7405 return -ENOMEM;
7406 }
7407
7408 /* get space for all scratch cpumask variables */
7409 sched_cpumask_alloc(&allmasks);
7410 if (!allmasks) {
7411 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7412 kfree(rd);
7413#ifdef CONFIG_NUMA
7414 kfree(sched_group_nodes);
7415#endif
7416 return -ENOMEM;
7417 } 7546 }
7418 7547
7419 tmpmask = (cpumask_t *)allmasks;
7420
7421
7422#ifdef CONFIG_NUMA 7548#ifdef CONFIG_NUMA
7423 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 7549 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7424#endif 7550#endif
7425 7551
7426 /* 7552 /*
7427 * Set up domains for cpus specified by the cpu_map. 7553 * Set up domains for cpus specified by the cpu_map.
7428 */ 7554 */
7429 for_each_cpu_mask_nr(i, *cpu_map) { 7555 for_each_cpu(i, cpu_map) {
7430 struct sched_domain *sd = NULL, *p; 7556 struct sched_domain *sd = NULL, *p;
7431 SCHED_CPUMASK_VAR(nodemask, allmasks);
7432 7557
7433 *nodemask = node_to_cpumask(cpu_to_node(i)); 7558 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
7434 cpus_and(*nodemask, *nodemask, *cpu_map);
7435 7559
7436#ifdef CONFIG_NUMA 7560#ifdef CONFIG_NUMA
7437 if (cpus_weight(*cpu_map) > 7561 if (cpumask_weight(cpu_map) >
7438 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7562 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7439 sd = &per_cpu(allnodes_domains, i); 7563 sd = &per_cpu(allnodes_domains, i);
7440 SD_INIT(sd, ALLNODES); 7564 SD_INIT(sd, ALLNODES);
7441 set_domain_attribute(sd, attr); 7565 set_domain_attribute(sd, attr);
7442 sd->span = *cpu_map; 7566 cpumask_copy(sched_domain_span(sd), cpu_map);
7443 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7567 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7444 p = sd; 7568 p = sd;
7445 sd_allnodes = 1; 7569 sd_allnodes = 1;
@@ -7449,18 +7573,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7449 sd = &per_cpu(node_domains, i); 7573 sd = &per_cpu(node_domains, i);
7450 SD_INIT(sd, NODE); 7574 SD_INIT(sd, NODE);
7451 set_domain_attribute(sd, attr); 7575 set_domain_attribute(sd, attr);
7452 sched_domain_node_span(cpu_to_node(i), &sd->span); 7576 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7453 sd->parent = p; 7577 sd->parent = p;
7454 if (p) 7578 if (p)
7455 p->child = sd; 7579 p->child = sd;
7456 cpus_and(sd->span, sd->span, *cpu_map); 7580 cpumask_and(sched_domain_span(sd),
7581 sched_domain_span(sd), cpu_map);
7457#endif 7582#endif
7458 7583
7459 p = sd; 7584 p = sd;
7460 sd = &per_cpu(phys_domains, i); 7585 sd = &per_cpu(phys_domains, i).sd;
7461 SD_INIT(sd, CPU); 7586 SD_INIT(sd, CPU);
7462 set_domain_attribute(sd, attr); 7587 set_domain_attribute(sd, attr);
7463 sd->span = *nodemask; 7588 cpumask_copy(sched_domain_span(sd), nodemask);
7464 sd->parent = p; 7589 sd->parent = p;
7465 if (p) 7590 if (p)
7466 p->child = sd; 7591 p->child = sd;
@@ -7468,11 +7593,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7468 7593
7469#ifdef CONFIG_SCHED_MC 7594#ifdef CONFIG_SCHED_MC
7470 p = sd; 7595 p = sd;
7471 sd = &per_cpu(core_domains, i); 7596 sd = &per_cpu(core_domains, i).sd;
7472 SD_INIT(sd, MC); 7597 SD_INIT(sd, MC);
7473 set_domain_attribute(sd, attr); 7598 set_domain_attribute(sd, attr);
7474 sd->span = cpu_coregroup_map(i); 7599 cpumask_and(sched_domain_span(sd), cpu_map,
7475 cpus_and(sd->span, sd->span, *cpu_map); 7600 cpu_coregroup_mask(i));
7476 sd->parent = p; 7601 sd->parent = p;
7477 p->child = sd; 7602 p->child = sd;
7478 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7603 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7480,11 +7605,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7480 7605
7481#ifdef CONFIG_SCHED_SMT 7606#ifdef CONFIG_SCHED_SMT
7482 p = sd; 7607 p = sd;
7483 sd = &per_cpu(cpu_domains, i); 7608 sd = &per_cpu(cpu_domains, i).sd;
7484 SD_INIT(sd, SIBLING); 7609 SD_INIT(sd, SIBLING);
7485 set_domain_attribute(sd, attr); 7610 set_domain_attribute(sd, attr);
7486 sd->span = per_cpu(cpu_sibling_map, i); 7611 cpumask_and(sched_domain_span(sd),
7487 cpus_and(sd->span, sd->span, *cpu_map); 7612 &per_cpu(cpu_sibling_map, i), cpu_map);
7488 sd->parent = p; 7613 sd->parent = p;
7489 p->child = sd; 7614 p->child = sd;
7490 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7615 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7493,13 +7618,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7493 7618
7494#ifdef CONFIG_SCHED_SMT 7619#ifdef CONFIG_SCHED_SMT
7495 /* Set up CPU (sibling) groups */ 7620 /* Set up CPU (sibling) groups */
7496 for_each_cpu_mask_nr(i, *cpu_map) { 7621 for_each_cpu(i, cpu_map) {
7497 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7622 cpumask_and(this_sibling_map,
7498 SCHED_CPUMASK_VAR(send_covered, allmasks); 7623 &per_cpu(cpu_sibling_map, i), cpu_map);
7499 7624 if (i != cpumask_first(this_sibling_map))
7500 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7501 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7502 if (i != first_cpu(*this_sibling_map))
7503 continue; 7625 continue;
7504 7626
7505 init_sched_build_groups(this_sibling_map, cpu_map, 7627 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7510,13 +7632,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7510 7632
7511#ifdef CONFIG_SCHED_MC 7633#ifdef CONFIG_SCHED_MC
7512 /* Set up multi-core groups */ 7634 /* Set up multi-core groups */
7513 for_each_cpu_mask_nr(i, *cpu_map) { 7635 for_each_cpu(i, cpu_map) {
7514 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7636 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
7515 SCHED_CPUMASK_VAR(send_covered, allmasks); 7637 if (i != cpumask_first(this_core_map))
7516
7517 *this_core_map = cpu_coregroup_map(i);
7518 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7519 if (i != first_cpu(*this_core_map))
7520 continue; 7638 continue;
7521 7639
7522 init_sched_build_groups(this_core_map, cpu_map, 7640 init_sched_build_groups(this_core_map, cpu_map,
@@ -7527,12 +7645,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7527 7645
7528 /* Set up physical groups */ 7646 /* Set up physical groups */
7529 for (i = 0; i < nr_node_ids; i++) { 7647 for (i = 0; i < nr_node_ids; i++) {
7530 SCHED_CPUMASK_VAR(nodemask, allmasks); 7648 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7531 SCHED_CPUMASK_VAR(send_covered, allmasks); 7649 if (cpumask_empty(nodemask))
7532
7533 *nodemask = node_to_cpumask(i);
7534 cpus_and(*nodemask, *nodemask, *cpu_map);
7535 if (cpus_empty(*nodemask))
7536 continue; 7650 continue;
7537 7651
7538 init_sched_build_groups(nodemask, cpu_map, 7652 init_sched_build_groups(nodemask, cpu_map,
@@ -7543,8 +7657,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7543#ifdef CONFIG_NUMA 7657#ifdef CONFIG_NUMA
7544 /* Set up node groups */ 7658 /* Set up node groups */
7545 if (sd_allnodes) { 7659 if (sd_allnodes) {
7546 SCHED_CPUMASK_VAR(send_covered, allmasks);
7547
7548 init_sched_build_groups(cpu_map, cpu_map, 7660 init_sched_build_groups(cpu_map, cpu_map,
7549 &cpu_to_allnodes_group, 7661 &cpu_to_allnodes_group,
7550 send_covered, tmpmask); 7662 send_covered, tmpmask);
@@ -7553,58 +7665,53 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7553 for (i = 0; i < nr_node_ids; i++) { 7665 for (i = 0; i < nr_node_ids; i++) {
7554 /* Set up node groups */ 7666 /* Set up node groups */
7555 struct sched_group *sg, *prev; 7667 struct sched_group *sg, *prev;
7556 SCHED_CPUMASK_VAR(nodemask, allmasks);
7557 SCHED_CPUMASK_VAR(domainspan, allmasks);
7558 SCHED_CPUMASK_VAR(covered, allmasks);
7559 int j; 7668 int j;
7560 7669
7561 *nodemask = node_to_cpumask(i); 7670 cpumask_clear(covered);
7562 cpus_clear(*covered); 7671 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7563 7672 if (cpumask_empty(nodemask)) {
7564 cpus_and(*nodemask, *nodemask, *cpu_map);
7565 if (cpus_empty(*nodemask)) {
7566 sched_group_nodes[i] = NULL; 7673 sched_group_nodes[i] = NULL;
7567 continue; 7674 continue;
7568 } 7675 }
7569 7676
7570 sched_domain_node_span(i, domainspan); 7677 sched_domain_node_span(i, domainspan);
7571 cpus_and(*domainspan, *domainspan, *cpu_map); 7678 cpumask_and(domainspan, domainspan, cpu_map);
7572 7679
7573 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7680 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7681 GFP_KERNEL, i);
7574 if (!sg) { 7682 if (!sg) {
7575 printk(KERN_WARNING "Can not alloc domain group for " 7683 printk(KERN_WARNING "Can not alloc domain group for "
7576 "node %d\n", i); 7684 "node %d\n", i);
7577 goto error; 7685 goto error;
7578 } 7686 }
7579 sched_group_nodes[i] = sg; 7687 sched_group_nodes[i] = sg;
7580 for_each_cpu_mask_nr(j, *nodemask) { 7688 for_each_cpu(j, nodemask) {
7581 struct sched_domain *sd; 7689 struct sched_domain *sd;
7582 7690
7583 sd = &per_cpu(node_domains, j); 7691 sd = &per_cpu(node_domains, j);
7584 sd->groups = sg; 7692 sd->groups = sg;
7585 } 7693 }
7586 sg->__cpu_power = 0; 7694 sg->__cpu_power = 0;
7587 sg->cpumask = *nodemask; 7695 cpumask_copy(sched_group_cpus(sg), nodemask);
7588 sg->next = sg; 7696 sg->next = sg;
7589 cpus_or(*covered, *covered, *nodemask); 7697 cpumask_or(covered, covered, nodemask);
7590 prev = sg; 7698 prev = sg;
7591 7699
7592 for (j = 0; j < nr_node_ids; j++) { 7700 for (j = 0; j < nr_node_ids; j++) {
7593 SCHED_CPUMASK_VAR(notcovered, allmasks);
7594 int n = (i + j) % nr_node_ids; 7701 int n = (i + j) % nr_node_ids;
7595 node_to_cpumask_ptr(pnodemask, n);
7596 7702
7597 cpus_complement(*notcovered, *covered); 7703 cpumask_complement(notcovered, covered);
7598 cpus_and(*tmpmask, *notcovered, *cpu_map); 7704 cpumask_and(tmpmask, notcovered, cpu_map);
7599 cpus_and(*tmpmask, *tmpmask, *domainspan); 7705 cpumask_and(tmpmask, tmpmask, domainspan);
7600 if (cpus_empty(*tmpmask)) 7706 if (cpumask_empty(tmpmask))
7601 break; 7707 break;
7602 7708
7603 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7709 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
7604 if (cpus_empty(*tmpmask)) 7710 if (cpumask_empty(tmpmask))
7605 continue; 7711 continue;
7606 7712
7607 sg = kmalloc_node(sizeof(struct sched_group), 7713 sg = kmalloc_node(sizeof(struct sched_group) +
7714 cpumask_size(),
7608 GFP_KERNEL, i); 7715 GFP_KERNEL, i);
7609 if (!sg) { 7716 if (!sg) {
7610 printk(KERN_WARNING 7717 printk(KERN_WARNING
@@ -7612,9 +7719,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7612 goto error; 7719 goto error;
7613 } 7720 }
7614 sg->__cpu_power = 0; 7721 sg->__cpu_power = 0;
7615 sg->cpumask = *tmpmask; 7722 cpumask_copy(sched_group_cpus(sg), tmpmask);
7616 sg->next = prev->next; 7723 sg->next = prev->next;
7617 cpus_or(*covered, *covered, *tmpmask); 7724 cpumask_or(covered, covered, tmpmask);
7618 prev->next = sg; 7725 prev->next = sg;
7619 prev = sg; 7726 prev = sg;
7620 } 7727 }
@@ -7623,22 +7730,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7623 7730
7624 /* Calculate CPU power for physical packages and nodes */ 7731 /* Calculate CPU power for physical packages and nodes */
7625#ifdef CONFIG_SCHED_SMT 7732#ifdef CONFIG_SCHED_SMT
7626 for_each_cpu_mask_nr(i, *cpu_map) { 7733 for_each_cpu(i, cpu_map) {
7627 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7734 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7628 7735
7629 init_sched_groups_power(i, sd); 7736 init_sched_groups_power(i, sd);
7630 } 7737 }
7631#endif 7738#endif
7632#ifdef CONFIG_SCHED_MC 7739#ifdef CONFIG_SCHED_MC
7633 for_each_cpu_mask_nr(i, *cpu_map) { 7740 for_each_cpu(i, cpu_map) {
7634 struct sched_domain *sd = &per_cpu(core_domains, i); 7741 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7635 7742
7636 init_sched_groups_power(i, sd); 7743 init_sched_groups_power(i, sd);
7637 } 7744 }
7638#endif 7745#endif
7639 7746
7640 for_each_cpu_mask_nr(i, *cpu_map) { 7747 for_each_cpu(i, cpu_map) {
7641 struct sched_domain *sd = &per_cpu(phys_domains, i); 7748 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7642 7749
7643 init_sched_groups_power(i, sd); 7750 init_sched_groups_power(i, sd);
7644 } 7751 }
@@ -7650,53 +7757,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7650 if (sd_allnodes) { 7757 if (sd_allnodes) {
7651 struct sched_group *sg; 7758 struct sched_group *sg;
7652 7759
7653 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7760 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7654 tmpmask); 7761 tmpmask);
7655 init_numa_sched_groups_power(sg); 7762 init_numa_sched_groups_power(sg);
7656 } 7763 }
7657#endif 7764#endif
7658 7765
7659 /* Attach the domains */ 7766 /* Attach the domains */
7660 for_each_cpu_mask_nr(i, *cpu_map) { 7767 for_each_cpu(i, cpu_map) {
7661 struct sched_domain *sd; 7768 struct sched_domain *sd;
7662#ifdef CONFIG_SCHED_SMT 7769#ifdef CONFIG_SCHED_SMT
7663 sd = &per_cpu(cpu_domains, i); 7770 sd = &per_cpu(cpu_domains, i).sd;
7664#elif defined(CONFIG_SCHED_MC) 7771#elif defined(CONFIG_SCHED_MC)
7665 sd = &per_cpu(core_domains, i); 7772 sd = &per_cpu(core_domains, i).sd;
7666#else 7773#else
7667 sd = &per_cpu(phys_domains, i); 7774 sd = &per_cpu(phys_domains, i).sd;
7668#endif 7775#endif
7669 cpu_attach_domain(sd, rd, i); 7776 cpu_attach_domain(sd, rd, i);
7670 } 7777 }
7671 7778
7672 sched_cpumask_free(allmasks); 7779 err = 0;
7673 return 0; 7780
7781free_tmpmask:
7782 free_cpumask_var(tmpmask);
7783free_send_covered:
7784 free_cpumask_var(send_covered);
7785free_this_core_map:
7786 free_cpumask_var(this_core_map);
7787free_this_sibling_map:
7788 free_cpumask_var(this_sibling_map);
7789free_nodemask:
7790 free_cpumask_var(nodemask);
7791free_notcovered:
7792#ifdef CONFIG_NUMA
7793 free_cpumask_var(notcovered);
7794free_covered:
7795 free_cpumask_var(covered);
7796free_domainspan:
7797 free_cpumask_var(domainspan);
7798out:
7799#endif
7800 return err;
7801
7802free_sched_groups:
7803#ifdef CONFIG_NUMA
7804 kfree(sched_group_nodes);
7805#endif
7806 goto free_tmpmask;
7674 7807
7675#ifdef CONFIG_NUMA 7808#ifdef CONFIG_NUMA
7676error: 7809error:
7677 free_sched_groups(cpu_map, tmpmask); 7810 free_sched_groups(cpu_map, tmpmask);
7678 sched_cpumask_free(allmasks); 7811 free_rootdomain(rd);
7679 kfree(rd); 7812 goto free_tmpmask;
7680 return -ENOMEM;
7681#endif 7813#endif
7682} 7814}
7683 7815
7684static int build_sched_domains(const cpumask_t *cpu_map) 7816static int build_sched_domains(const struct cpumask *cpu_map)
7685{ 7817{
7686 return __build_sched_domains(cpu_map, NULL); 7818 return __build_sched_domains(cpu_map, NULL);
7687} 7819}
7688 7820
7689static cpumask_t *doms_cur; /* current sched domains */ 7821static struct cpumask *doms_cur; /* current sched domains */
7690static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7822static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7691static struct sched_domain_attr *dattr_cur; 7823static struct sched_domain_attr *dattr_cur;
7692 /* attributes of custom domains in 'doms_cur' */ 7824 /* attributes of custom domains in 'doms_cur' */
7693 7825
7694/* 7826/*
7695 * Special case: If a kmalloc of a doms_cur partition (array of 7827 * Special case: If a kmalloc of a doms_cur partition (array of
7696 * cpumask_t) fails, then fallback to a single sched domain, 7828 * cpumask) fails, then fallback to a single sched domain,
7697 * as determined by the single cpumask_t fallback_doms. 7829 * as determined by the single cpumask fallback_doms.
7698 */ 7830 */
7699static cpumask_t fallback_doms; 7831static cpumask_var_t fallback_doms;
7700 7832
7701/* 7833/*
7702 * arch_update_cpu_topology lets virtualized architectures update the 7834 * arch_update_cpu_topology lets virtualized architectures update the
@@ -7713,16 +7845,16 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
7713 * For now this just excludes isolated cpus, but could be used to 7845 * For now this just excludes isolated cpus, but could be used to
7714 * exclude other special cases in the future. 7846 * exclude other special cases in the future.
7715 */ 7847 */
7716static int arch_init_sched_domains(const cpumask_t *cpu_map) 7848static int arch_init_sched_domains(const struct cpumask *cpu_map)
7717{ 7849{
7718 int err; 7850 int err;
7719 7851
7720 arch_update_cpu_topology(); 7852 arch_update_cpu_topology();
7721 ndoms_cur = 1; 7853 ndoms_cur = 1;
7722 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7854 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7723 if (!doms_cur) 7855 if (!doms_cur)
7724 doms_cur = &fallback_doms; 7856 doms_cur = fallback_doms;
7725 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7857 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7726 dattr_cur = NULL; 7858 dattr_cur = NULL;
7727 err = build_sched_domains(doms_cur); 7859 err = build_sched_domains(doms_cur);
7728 register_sched_domain_sysctl(); 7860 register_sched_domain_sysctl();
@@ -7730,8 +7862,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7730 return err; 7862 return err;
7731} 7863}
7732 7864
7733static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7865static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7734 cpumask_t *tmpmask) 7866 struct cpumask *tmpmask)
7735{ 7867{
7736 free_sched_groups(cpu_map, tmpmask); 7868 free_sched_groups(cpu_map, tmpmask);
7737} 7869}
@@ -7740,15 +7872,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7740 * Detach sched domains from a group of cpus specified in cpu_map 7872 * Detach sched domains from a group of cpus specified in cpu_map
7741 * These cpus will now be attached to the NULL domain 7873 * These cpus will now be attached to the NULL domain
7742 */ 7874 */
7743static void detach_destroy_domains(const cpumask_t *cpu_map) 7875static void detach_destroy_domains(const struct cpumask *cpu_map)
7744{ 7876{
7745 cpumask_t tmpmask; 7877 /* Save because hotplug lock held. */
7878 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7746 int i; 7879 int i;
7747 7880
7748 for_each_cpu_mask_nr(i, *cpu_map) 7881 for_each_cpu(i, cpu_map)
7749 cpu_attach_domain(NULL, &def_root_domain, i); 7882 cpu_attach_domain(NULL, &def_root_domain, i);
7750 synchronize_sched(); 7883 synchronize_sched();
7751 arch_destroy_sched_domains(cpu_map, &tmpmask); 7884 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7752} 7885}
7753 7886
7754/* handle null as "default" */ 7887/* handle null as "default" */
@@ -7773,7 +7906,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7773 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7906 * doms_new[] to the current sched domain partitioning, doms_cur[].
7774 * It destroys each deleted domain and builds each new domain. 7907 * It destroys each deleted domain and builds each new domain.
7775 * 7908 *
7776 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7909 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7777 * The masks don't intersect (don't overlap). We should set up one 7910 * The masks don't intersect (don't overlap). We should set up one
7778 * sched domain for each mask. CPUs not in any of the cpumasks will 7911 * sched domain for each mask. CPUs not in any of the cpumasks will
7779 * not be load balanced. If the same cpumask appears both in the 7912 * not be load balanced. If the same cpumask appears both in the
@@ -7787,13 +7920,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7787 * the single partition 'fallback_doms', it also forces the domains 7920 * the single partition 'fallback_doms', it also forces the domains
7788 * to be rebuilt. 7921 * to be rebuilt.
7789 * 7922 *
7790 * If doms_new == NULL it will be replaced with cpu_online_map. 7923 * If doms_new == NULL it will be replaced with cpu_online_mask.
7791 * ndoms_new == 0 is a special case for destroying existing domains, 7924 * ndoms_new == 0 is a special case for destroying existing domains,
7792 * and it will not create the default domain. 7925 * and it will not create the default domain.
7793 * 7926 *
7794 * Call with hotplug lock held 7927 * Call with hotplug lock held
7795 */ 7928 */
7796void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7929/* FIXME: Change to struct cpumask *doms_new[] */
7930void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7797 struct sched_domain_attr *dattr_new) 7931 struct sched_domain_attr *dattr_new)
7798{ 7932{
7799 int i, j, n; 7933 int i, j, n;
@@ -7812,7 +7946,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7812 /* Destroy deleted domains */ 7946 /* Destroy deleted domains */
7813 for (i = 0; i < ndoms_cur; i++) { 7947 for (i = 0; i < ndoms_cur; i++) {
7814 for (j = 0; j < n && !new_topology; j++) { 7948 for (j = 0; j < n && !new_topology; j++) {
7815 if (cpus_equal(doms_cur[i], doms_new[j]) 7949 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7816 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7950 && dattrs_equal(dattr_cur, i, dattr_new, j))
7817 goto match1; 7951 goto match1;
7818 } 7952 }
@@ -7824,15 +7958,15 @@ match1:
7824 7958
7825 if (doms_new == NULL) { 7959 if (doms_new == NULL) {
7826 ndoms_cur = 0; 7960 ndoms_cur = 0;
7827 doms_new = &fallback_doms; 7961 doms_new = fallback_doms;
7828 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7962 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7829 WARN_ON_ONCE(dattr_new); 7963 WARN_ON_ONCE(dattr_new);
7830 } 7964 }
7831 7965
7832 /* Build new domains */ 7966 /* Build new domains */
7833 for (i = 0; i < ndoms_new; i++) { 7967 for (i = 0; i < ndoms_new; i++) {
7834 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7968 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7835 if (cpus_equal(doms_new[i], doms_cur[j]) 7969 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7836 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7970 && dattrs_equal(dattr_new, i, dattr_cur, j))
7837 goto match2; 7971 goto match2;
7838 } 7972 }
@@ -7844,7 +7978,7 @@ match2:
7844 } 7978 }
7845 7979
7846 /* Remember the new sched domains */ 7980 /* Remember the new sched domains */
7847 if (doms_cur != &fallback_doms) 7981 if (doms_cur != fallback_doms)
7848 kfree(doms_cur); 7982 kfree(doms_cur);
7849 kfree(dattr_cur); /* kfree(NULL) is safe */ 7983 kfree(dattr_cur); /* kfree(NULL) is safe */
7850 doms_cur = doms_new; 7984 doms_cur = doms_new;
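A hedged caller-side sketch of the interface described in the comment above (the function and the two example masks are made up; in-tree the caller is the cpuset code, and sched.c's usual includes are assumed). partition_sched_domains() remembers the doms array and kfrees the previous one itself, so the caller must not free what it passed in:

static int repartition_two_domains(void)
{
	struct cpumask *doms;

	/* still a flat array of struct cpumask, as the FIXME above notes */
	doms = kmalloc(2 * sizeof(struct cpumask), GFP_KERNEL);
	if (!doms)
		return -ENOMEM;

	cpumask_copy(&doms[0], cpumask_of_node(0));
	cpumask_andnot(&doms[1], cpu_online_mask, &doms[0]);

	get_online_cpus();		/* "Call with hotplug lock held" */
	partition_sched_domains(2, doms, NULL);
	put_online_cpus();

	return 0;			/* doms now belongs to the scheduler */
}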
@@ -7857,7 +7991,7 @@ match2:
7857} 7991}
7858 7992
7859#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7993#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7860int arch_reinit_sched_domains(void) 7994static void arch_reinit_sched_domains(void)
7861{ 7995{
7862 get_online_cpus(); 7996 get_online_cpus();
7863 7997
@@ -7866,25 +8000,33 @@ int arch_reinit_sched_domains(void)
7866 8000
7867 rebuild_sched_domains(); 8001 rebuild_sched_domains();
7868 put_online_cpus(); 8002 put_online_cpus();
7869
7870 return 0;
7871} 8003}
7872 8004
7873static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 8005static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7874{ 8006{
7875 int ret; 8007 unsigned int level = 0;
8008
8009 if (sscanf(buf, "%u", &level) != 1)
8010 return -EINVAL;
8011
8012 /*
8013 * level is always positive, so don't check for
8014 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
8015 * What happens on a 0 or 1 byte write? Do we
8016 * need to check count as well?
8017 */
7876 8018
7877 if (buf[0] != '0' && buf[0] != '1') 8019 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7878 return -EINVAL; 8020 return -EINVAL;
7879 8021
7880 if (smt) 8022 if (smt)
7881 sched_smt_power_savings = (buf[0] == '1'); 8023 sched_smt_power_savings = level;
7882 else 8024 else
7883 sched_mc_power_savings = (buf[0] == '1'); 8025 sched_mc_power_savings = level;
7884 8026
7885 ret = arch_reinit_sched_domains(); 8027 arch_reinit_sched_domains();
7886 8028
7887 return ret ? ret : count; 8029 return count;
7888} 8030}
7889 8031
7890#ifdef CONFIG_SCHED_MC 8032#ifdef CONFIG_SCHED_MC
@@ -7919,7 +8061,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7919 sched_smt_power_savings_store); 8061 sched_smt_power_savings_store);
7920#endif 8062#endif
7921 8063
7922int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 8064int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7923{ 8065{
7924 int err = 0; 8066 int err = 0;
7925 8067
@@ -7984,7 +8126,9 @@ static int update_runtime(struct notifier_block *nfb,
7984 8126
7985void __init sched_init_smp(void) 8127void __init sched_init_smp(void)
7986{ 8128{
7987 cpumask_t non_isolated_cpus; 8129 cpumask_var_t non_isolated_cpus;
8130
8131 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7988 8132
7989#if defined(CONFIG_NUMA) 8133#if defined(CONFIG_NUMA)
7990 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8134 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7993,10 +8137,10 @@ void __init sched_init_smp(void)
7993#endif 8137#endif
7994 get_online_cpus(); 8138 get_online_cpus();
7995 mutex_lock(&sched_domains_mutex); 8139 mutex_lock(&sched_domains_mutex);
7996 arch_init_sched_domains(&cpu_online_map); 8140 arch_init_sched_domains(cpu_online_mask);
7997 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8141 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7998 if (cpus_empty(non_isolated_cpus)) 8142 if (cpumask_empty(non_isolated_cpus))
7999 cpu_set(smp_processor_id(), non_isolated_cpus); 8143 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8000 mutex_unlock(&sched_domains_mutex); 8144 mutex_unlock(&sched_domains_mutex);
8001 put_online_cpus(); 8145 put_online_cpus();
8002 8146
@@ -8011,9 +8155,13 @@ void __init sched_init_smp(void)
8011 init_hrtick(); 8155 init_hrtick();
8012 8156
8013 /* Move init over to a non-isolated CPU */ 8157 /* Move init over to a non-isolated CPU */
8014 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8158 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8015 BUG(); 8159 BUG();
8016 sched_init_granularity(); 8160 sched_init_granularity();
8161 free_cpumask_var(non_isolated_cpus);
8162
8163 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8164 init_sched_rt_class();
8017} 8165}
8018#else 8166#else
8019void __init sched_init_smp(void) 8167void __init sched_init_smp(void)
@@ -8328,6 +8476,15 @@ void __init sched_init(void)
8328 */ 8476 */
8329 current->sched_class = &fair_sched_class; 8477 current->sched_class = &fair_sched_class;
8330 8478
8479 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8480 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8481#ifdef CONFIG_SMP
8482#ifdef CONFIG_NO_HZ
8483 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8484#endif
8485 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8486#endif /* SMP */
8487
8331 scheduler_running = 1; 8488 scheduler_running = 1;
8332} 8489}
8333 8490
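The sched_init() hunk above uses alloc_bootmem_cpumask_var() because sched_init() runs before the slab allocator is available; masks allocated later (sched_init_smp(), the domain-build paths) use alloc_cpumask_var(..., GFP_KERNEL) and must handle failure. A compressed sketch of the split, with illustrative names:

static cpumask_var_t early_mask;	/* lives for the whole uptime */

void __init early_setup(void)
{
	/* bootmem variant: returns nothing, cannot fail this early */
	alloc_bootmem_cpumask_var(&early_mask);
}

static int later_user(void)
{
	cpumask_var_t tmp;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))	/* can fail */
		return -ENOMEM;

	cpumask_copy(tmp, early_mask);
	/* ... */
	free_cpumask_var(tmp);
	return 0;
}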
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..a0b0852414cc 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
227 */ 227 */
228void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
229{ 229{
230 if (timekeeping_suspended)
231 return;
232
230 sched_clock_tick(); 233 sched_clock_tick();
231 touch_softlockup_watchdog(); 234 touch_softlockup_watchdog();
232} 235}
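The max_clock change is easier to read next to the wrap_max()/wrap_min() helpers defined earlier in sched_clock.c; their usual form (shown here as an assumption, not quoted from the file) is a min/max that stays correct across u64 wrap by comparing the difference as a signed value:

static inline u64 wrap_max(u64 x, u64 y)
{
	return (s64)(x - y) > 0 ? x : y;
}

static inline u64 wrap_min(u64 x, u64 y)
{
	return (s64)(x - y) < 0 ? x : y;
}

Clamping to wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC) instead of tick_gtod + TICK_NSEC alone means the upper bound can never drop below the last value handed out, so a tick delayed by more than TICK_NSEC can no longer pull scd->clock backwards.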
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..1e00bfacf9b8 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
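The rewritten loop relies on cpumask_any_and() to test the intersection without materialising it: it returns some CPU in (a & b), or a value >= nr_cpu_ids when the intersection is empty. A one-line helper showing the idiom (name made up):

static bool masks_intersect(const struct cpumask *a, const struct cpumask *b)
{
	return cpumask_any_and(a, b) < nr_cpu_ids;
}

Only when that test succeeds does cpupri_find() do the full cpumask_and() into the caller-supplied lowest_mask, which is what allowed the on-stack cpumask_t temporary to be dropped.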
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
154 * Returns: (void) 152 * Returns: -ENOMEM on allocation failure, 0 on success.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5ad4440f0fc4..e0c0b4bc3f08 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
386#endif 386#endif
387 387
388/* 388/*
389 * delta *= P[w / rw]
390 */
391static inline unsigned long
392calc_delta_weight(unsigned long delta, struct sched_entity *se)
393{
394 for_each_sched_entity(se) {
395 delta = calc_delta_mine(delta,
396 se->load.weight, &cfs_rq_of(se)->load);
397 }
398
399 return delta;
400}
401
402/*
403 * delta /= w 389 * delta /= w
404 */ 390 */
405static inline unsigned long 391static inline unsigned long
@@ -440,12 +426,20 @@ static u64 __sched_period(unsigned long nr_running)
440 */ 426 */
441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 427static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
442{ 428{
443 unsigned long nr_running = cfs_rq->nr_running; 429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
444 430
445 if (unlikely(!se->on_rq)) 431 for_each_sched_entity(se) {
446 nr_running++; 432 struct load_weight *load = &cfs_rq->load;
433
434 if (unlikely(!se->on_rq)) {
435 struct load_weight lw = cfs_rq->load;
447 436
448 return calc_delta_weight(__sched_period(nr_running), se); 437 update_load_add(&lw, se->load.weight);
438 load = &lw;
439 }
440 slice = calc_delta_mine(slice, se->load.weight, load);
441 }
442 return slice;
449} 443}
450 444
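The rewritten sched_slice() folds the removed calc_delta_weight() helper into the hierarchy walk itself; the value it computes is still

	slice(se) = __sched_period(nr_running + !on_rq) * product over each level s of w(s) / W(cfs_rq(s))

where W() temporarily includes se's own weight at any level where se is not queued yet. As a worked example with made-up numbers: three queued entities of weight 1024, 1024 and 2048 on one cfs_rq and a 20 ms period give the first entity 20 ms * 1024 / 4096 = 5 ms (20 ms is only an illustration; the real period comes from sysctl_sched_latency and is stretched when many tasks are runnable).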
451/* 445/*
@@ -1019,16 +1013,33 @@ static void yield_task_fair(struct rq *rq)
1019 * search starts with cpus closest then further out as needed, 1013 * search starts with cpus closest then further out as needed,
1020 * so we always favor a closer, idle cpu. 1014 * so we always favor a closer, idle cpu.
1021 * Domains may include CPUs that are not usable for migration, 1015 * Domains may include CPUs that are not usable for migration,
1022 * hence we need to mask them out (cpu_active_map) 1016 * hence we need to mask them out (cpu_active_mask)
1023 * 1017 *
1024 * Returns the CPU we should wake onto. 1018 * Returns the CPU we should wake onto.
1025 */ 1019 */
1026#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1020#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1027static int wake_idle(int cpu, struct task_struct *p) 1021static int wake_idle(int cpu, struct task_struct *p)
1028{ 1022{
1029 cpumask_t tmp;
1030 struct sched_domain *sd; 1023 struct sched_domain *sd;
1031 int i; 1024 int i;
1025 unsigned int chosen_wakeup_cpu;
1026 int this_cpu;
1027
1028 /*
1029 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1030 * are idle and this is not a kernel thread and this task's affinity
1031 * allows it to be moved to the preferred cpu, then just move!
1032 */
1033
1034 this_cpu = smp_processor_id();
1035 chosen_wakeup_cpu =
1036 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1037
1038 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1039 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1040 p->mm && !(p->flags & PF_KTHREAD) &&
1041 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1042 return chosen_wakeup_cpu;
1032 1043
1033 /* 1044 /*
1034 * If it is idle, then it is the best cpu to run this task. 1045 * If it is idle, then it is the best cpu to run this task.
@@ -1046,10 +1057,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1046 if ((sd->flags & SD_WAKE_IDLE) 1057 if ((sd->flags & SD_WAKE_IDLE)
1047 || ((sd->flags & SD_WAKE_IDLE_FAR) 1058 || ((sd->flags & SD_WAKE_IDLE_FAR)
1048 && !task_hot(p, task_rq(p)->clock, sd))) { 1059 && !task_hot(p, task_rq(p)->clock, sd))) {
1049 cpus_and(tmp, sd->span, p->cpus_allowed); 1060 for_each_cpu_and(i, sched_domain_span(sd),
1050 cpus_and(tmp, tmp, cpu_active_map); 1061 &p->cpus_allowed) {
1051 for_each_cpu_mask_nr(i, tmp) { 1062 if (cpu_active(i) && idle_cpu(i)) {
1052 if (idle_cpu(i)) {
1053 if (i != task_cpu(p)) { 1063 if (i != task_cpu(p)) {
1054 schedstat_inc(p, 1064 schedstat_inc(p,
1055 se.nr_wakeups_idle); 1065 se.nr_wakeups_idle);
@@ -1242,13 +1252,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1242 * this_cpu and prev_cpu are present in: 1252 * this_cpu and prev_cpu are present in:
1243 */ 1253 */
1244 for_each_domain(this_cpu, sd) { 1254 for_each_domain(this_cpu, sd) {
1245 if (cpu_isset(prev_cpu, sd->span)) { 1255 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1246 this_sd = sd; 1256 this_sd = sd;
1247 break; 1257 break;
1248 } 1258 }
1249 } 1259 }
1250 1260
1251 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1261 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1252 goto out; 1262 goto out;
1253 1263
1254 /* 1264 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 51d2af3e6191..954e1a81b796 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
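
For reference, the on-stack cpumask_t in check_preempt_equal_prio() is replaced by cpumask_var_t, which only carries real storage when CONFIG_CPUMASK_OFFSTACK=y. A minimal sketch of that allocate/use/free pattern, using a hypothetical demo function rather than the scheduler code itself:

#include <linux/cpumask.h>
#include <linux/gfp.h>

static void demo_cpumask_var_pattern(void)
{
	cpumask_var_t mask;

	/* Allocates heap storage only when CONFIG_CPUMASK_OFFSTACK=y. */
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
		return;

	cpumask_copy(mask, cpu_online_mask);	/* ... use as a struct cpumask * ... */

	free_cpumask_var(mask);			/* always pairs with the alloc */
}
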
@@ -914,7 +919,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
914static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 919static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
915{ 920{
916 if (!task_running(rq, p) && 921 if (!task_running(rq, p) &&
917 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 922 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
918 (p->rt.nr_cpus_allowed > 1)) 923 (p->rt.nr_cpus_allowed > 1))
919 return 1; 924 return 1;
920 return 0; 925 return 0;
@@ -953,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
953 return next; 958 return next;
954} 959}
955 960
956static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
957 962
958static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
959{ 964{
@@ -973,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
973static int find_lowest_rq(struct task_struct *task) 978static int find_lowest_rq(struct task_struct *task)
974{ 979{
975 struct sched_domain *sd; 980 struct sched_domain *sd;
976 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
977 int this_cpu = smp_processor_id(); 982 int this_cpu = smp_processor_id();
978 int cpu = task_cpu(task); 983 int cpu = task_cpu(task);
979 984
@@ -988,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
988 * I guess we might want to change cpupri_find() to ignore those 993 * I guess we might want to change cpupri_find() to ignore those
989 * in the first place. 994 * in the first place.
990 */ 995 */
991 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 996 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
992 997
993 /* 998 /*
994 * At this point we have built a mask of cpus representing the 999 * At this point we have built a mask of cpus representing the
@@ -998,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
998 * We prioritize the last cpu that the task executed on since 1003 * We prioritize the last cpu that the task executed on since
999 * it is most likely cache-hot in that location. 1004 * it is most likely cache-hot in that location.
1000 */ 1005 */
1001 if (cpu_isset(cpu, *lowest_mask)) 1006 if (cpumask_test_cpu(cpu, lowest_mask))
1002 return cpu; 1007 return cpu;
1003 1008
1004 /* 1009 /*
@@ -1013,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
1013 cpumask_t domain_mask; 1018 cpumask_t domain_mask;
1014 int best_cpu; 1019 int best_cpu;
1015 1020
1016 cpus_and(domain_mask, sd->span, *lowest_mask); 1021 cpumask_and(&domain_mask, sched_domain_span(sd),
1022 lowest_mask);
1017 1023
1018 best_cpu = pick_optimal_cpu(this_cpu, 1024 best_cpu = pick_optimal_cpu(this_cpu,
1019 &domain_mask); 1025 &domain_mask);
@@ -1054,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1054 * Also make sure that it wasn't scheduled on its rq. 1060 * Also make sure that it wasn't scheduled on its rq.
1055 */ 1061 */
1056 if (unlikely(task_rq(task) != rq || 1062 if (unlikely(task_rq(task) != rq ||
1057 !cpu_isset(lowest_rq->cpu, 1063 !cpumask_test_cpu(lowest_rq->cpu,
1058 task->cpus_allowed) || 1064 &task->cpus_allowed) ||
1059 task_running(rq, task) || 1065 task_running(rq, task) ||
1060 !task->se.on_rq)) { 1066 !task->se.on_rq)) {
1061 1067
@@ -1176,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
1176 1182
1177 next = pick_next_task_rt(this_rq); 1183 next = pick_next_task_rt(this_rq);
1178 1184
1179 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1185 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1180 if (this_cpu == cpu) 1186 if (this_cpu == cpu)
1181 continue; 1187 continue;
1182 1188
@@ -1305,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1305} 1311}
1306 1312
1307static void set_cpus_allowed_rt(struct task_struct *p, 1313static void set_cpus_allowed_rt(struct task_struct *p,
1308 const cpumask_t *new_mask) 1314 const struct cpumask *new_mask)
1309{ 1315{
1310 int weight = cpus_weight(*new_mask); 1316 int weight = cpumask_weight(new_mask);
1311 1317
1312 BUG_ON(!rt_task(p)); 1318 BUG_ON(!rt_task(p));
1313 1319
@@ -1328,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1328 update_rt_migration(rq); 1334 update_rt_migration(rq);
1329 } 1335 }
1330 1336
1331 p->cpus_allowed = *new_mask; 1337 cpumask_copy(&p->cpus_allowed, new_mask);
1332 p->rt.nr_cpus_allowed = weight; 1338 p->rt.nr_cpus_allowed = weight;
1333} 1339}
1334 1340
@@ -1371,6 +1377,15 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1371 if (!rq->rt.rt_nr_running) 1377 if (!rq->rt.rt_nr_running)
1372 pull_rt_task(rq); 1378 pull_rt_task(rq);
1373} 1379}
1380
1381static inline void init_sched_rt_class(void)
1382{
1383 unsigned int i;
1384
1385 for_each_possible_cpu(i)
1386 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1387 GFP_KERNEL, cpu_to_node(i));
1388}
1374#endif /* CONFIG_SMP */ 1389#endif /* CONFIG_SMP */
1375 1390
1376/* 1391/*
@@ -1541,3 +1556,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1541 rcu_read_unlock(); 1556 rcu_read_unlock();
1542} 1557}
1543#endif /* CONFIG_SCHED_DEBUG */ 1558#endif /* CONFIG_SCHED_DEBUG */
1559
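
The new init_sched_rt_class() shows the companion idiom for per-cpu masks: DEFINE_PER_CPU(cpumask_var_t, ...) only reserves the per-cpu slot, so the storage is allocated once at init time, node-local to each CPU. A hedged sketch of the same idiom with hypothetical names:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/topology.h>

static DEFINE_PER_CPU(cpumask_var_t, demo_percpu_mask);

static void __init demo_init_percpu_masks(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		/* Storage-wise a no-op unless CONFIG_CPUMASK_OFFSTACK=y. */
		alloc_cpumask_var_node(&per_cpu(demo_percpu_mask, cpu),
				       GFP_KERNEL, cpu_to_node(cpu));
}
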
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 3b01098164c8..f2773b5d1226 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 8e95855ff3cf..3152ac3b62e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
858 q->info.si_signo = sig; 858 q->info.si_signo = sig;
859 q->info.si_errno = 0; 859 q->info.si_errno = 0;
860 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
861 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_tgid_nr_ns(current,
862 task_active_pid_ns(t));
862 q->info.si_uid = current_uid(); 863 q->info.si_uid = current_uid();
863 break; 864 break;
864 case (unsigned long) SEND_SIG_PRIV: 865 case (unsigned long) SEND_SIG_PRIV:
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c55..5cfa0e5e3e88 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -24,8 +24,8 @@ struct call_function_data {
24 struct call_single_data csd; 24 struct call_single_data csd;
25 spinlock_t lock; 25 spinlock_t lock;
26 unsigned int refs; 26 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head; 27 struct rcu_head rcu_head;
28 unsigned long cpumask_bits[];
29}; 29};
30 30
31struct call_single_queue { 31struct call_single_queue {
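
The embedded cpumask_t in struct call_function_data becomes a trailing unsigned long cpumask_bits[], so the mask can be sized for nr_cpu_ids at allocation time instead of NR_CPUS at compile time. A minimal sketch of that allocation and access pattern (the struct and function names are hypothetical):

#include <linux/slab.h>
#include <linux/cpumask.h>

struct demo_data {
	unsigned int	refs;
	unsigned long	cpumask_bits[];		/* sized at runtime */
};

static struct demo_data *demo_alloc(const struct cpumask *mask)
{
	/* cpumask_size() is the runtime size of one cpumask in bytes. */
	struct demo_data *d = kmalloc(sizeof(*d) + cpumask_size(), GFP_ATOMIC);

	if (!d)
		return NULL;

	cpumask_copy(to_cpumask(d->cpumask_bits), mask);
	d->refs = cpumask_weight(to_cpumask(d->cpumask_bits));
	return d;
}
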
@@ -110,13 +110,13 @@ void generic_smp_call_function_interrupt(void)
110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) { 110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
111 int refs; 111 int refs;
112 112
113 if (!cpu_isset(cpu, data->cpumask)) 113 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
114 continue; 114 continue;
115 115
116 data->csd.func(data->csd.info); 116 data->csd.func(data->csd.info);
117 117
118 spin_lock(&data->lock); 118 spin_lock(&data->lock);
119 cpu_clear(cpu, data->cpumask); 119 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
120 WARN_ON(data->refs == 0); 120 WARN_ON(data->refs == 0);
121 data->refs--; 121 data->refs--;
122 refs = data->refs; 122 refs = data->refs;
@@ -223,7 +223,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
223 local_irq_save(flags); 223 local_irq_save(flags);
224 func(info); 224 func(info);
225 local_irq_restore(flags); 225 local_irq_restore(flags);
226 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { 226 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
227 struct call_single_data *data = NULL; 227 struct call_single_data *data = NULL;
228 228
229 if (!wait) { 229 if (!wait) {
@@ -266,51 +266,19 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
266 generic_exec_single(cpu, data); 266 generic_exec_single(cpu, data);
267} 267}
268 268
269/* Dummy function */ 269/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
270static void quiesce_dummy(void *unused) 270#ifndef arch_send_call_function_ipi_mask
271{ 271#define arch_send_call_function_ipi_mask(maskp) \
272} 272 arch_send_call_function_ipi(*(maskp))
273 273#endif
274/*
275 * Ensure stack based data used in call function mask is safe to free.
276 *
277 * This is needed by smp_call_function_mask when using on-stack data, because
278 * a single call function queue is shared by all CPUs, and any CPU may pick up
279 * the data item on the queue at any time before it is deleted. So we need to
280 * ensure that all CPUs have transitioned through a quiescent state after
281 * this call.
282 *
283 * This is a very slow function, implemented by sending synchronous IPIs to
284 * all possible CPUs. For this reason, we have to alloc data rather than use
285 * stack based data even in the case of synchronous calls. The stack based
286 * data is then just used for deadlock/oom fallback which will be very rare.
287 *
288 * If a faster scheme can be made, we could go back to preferring stack based
289 * data -- the data allocation/free is non-zero cost.
290 */
291static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
292{
293 struct call_single_data data;
294 int cpu;
295
296 data.func = quiesce_dummy;
297 data.info = NULL;
298
299 for_each_cpu_mask(cpu, mask) {
300 data.flags = CSD_FLAG_WAIT;
301 generic_exec_single(cpu, &data);
302 }
303}
304 274
305/** 275/**
306 * smp_call_function_mask(): Run a function on a set of other CPUs. 276 * smp_call_function_many(): Run a function on a set of other CPUs.
307 * @mask: The set of cpus to run on. 277 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 278 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 279 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 280 * @wait: If true, wait (atomically) until function has completed on other CPUs.
311 * 281 *
312 * Returns 0 on success, else a negative status code.
313 *
314 * If @wait is true, then returns once @func has returned. Note that @wait 282 * If @wait is true, then returns once @func has returned. Note that @wait
315 * will be implicitly turned on in case of allocation failures, since 283 * will be implicitly turned on in case of allocation failures, since
316 * we fall back to on-stack allocation. 284 * we fall back to on-stack allocation.
@@ -319,53 +287,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
319 * hardware interrupt handler or from a bottom half handler. Preemption 287 * hardware interrupt handler or from a bottom half handler. Preemption
320 * must be disabled when calling this function. 288 * must be disabled when calling this function.
321 */ 289 */
322int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, 290void smp_call_function_many(const struct cpumask *mask,
323 int wait) 291 void (*func)(void *), void *info,
292 bool wait)
324{ 293{
325 struct call_function_data d; 294 struct call_function_data *data;
326 struct call_function_data *data = NULL;
327 cpumask_t allbutself;
328 unsigned long flags; 295 unsigned long flags;
329 int cpu, num_cpus; 296 int cpu, next_cpu;
330 int slowpath = 0;
331 297
332 /* Can deadlock when called with interrupts disabled */ 298 /* Can deadlock when called with interrupts disabled */
333 WARN_ON(irqs_disabled()); 299 WARN_ON(irqs_disabled());
334 300
335 cpu = smp_processor_id(); 301 /* So, what's a CPU they want? Ignoring this one. */
336 allbutself = cpu_online_map; 302 cpu = cpumask_first_and(mask, cpu_online_mask);
337 cpu_clear(cpu, allbutself); 303 if (cpu == smp_processor_id())
338 cpus_and(mask, mask, allbutself); 304 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
339 num_cpus = cpus_weight(mask); 305 /* No online cpus? We're done. */
340 306 if (cpu >= nr_cpu_ids)
341 /* 307 return;
342 * If zero CPUs, return. If just a single CPU, turn this request 308
343 * into a targetted single call instead since it's faster. 309 /* Do we have another CPU which isn't us? */
344 */ 310 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
345 if (!num_cpus) 311 if (next_cpu == smp_processor_id())
346 return 0; 312 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
347 else if (num_cpus == 1) { 313
348 cpu = first_cpu(mask); 314 /* Fastpath: do that cpu by itself. */
349 return smp_call_function_single(cpu, func, info, wait); 315 if (next_cpu >= nr_cpu_ids) {
316 smp_call_function_single(cpu, func, info, wait);
317 return;
350 } 318 }
351 319
352 data = kmalloc(sizeof(*data), GFP_ATOMIC); 320 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
353 if (data) { 321 if (unlikely(!data)) {
354 data->csd.flags = CSD_FLAG_ALLOC; 322 /* Slow path. */
355 if (wait) 323 for_each_online_cpu(cpu) {
356 data->csd.flags |= CSD_FLAG_WAIT; 324 if (cpu == smp_processor_id())
357 } else { 325 continue;
358 data = &d; 326 if (cpumask_test_cpu(cpu, mask))
359 data->csd.flags = CSD_FLAG_WAIT; 327 smp_call_function_single(cpu, func, info, wait);
360 wait = 1; 328 }
361 slowpath = 1; 329 return;
362 } 330 }
363 331
364 spin_lock_init(&data->lock); 332 spin_lock_init(&data->lock);
333 data->csd.flags = CSD_FLAG_ALLOC;
334 if (wait)
335 data->csd.flags |= CSD_FLAG_WAIT;
365 data->csd.func = func; 336 data->csd.func = func;
366 data->csd.info = info; 337 data->csd.info = info;
367 data->refs = num_cpus; 338 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
368 data->cpumask = mask; 339 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
340 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
369 341
370 spin_lock_irqsave(&call_function_lock, flags); 342 spin_lock_irqsave(&call_function_lock, flags);
371 list_add_tail_rcu(&data->csd.list, &call_function_queue); 343 list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +349,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
377 smp_mb(); 349 smp_mb();
378 350
379 /* Send a message to all CPUs in the map */ 351 /* Send a message to all CPUs in the map */
380 arch_send_call_function_ipi(mask); 352 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
381 353
382 /* optionally wait for the CPUs to complete */ 354 /* optionally wait for the CPUs to complete */
383 if (wait) { 355 if (wait)
384 csd_flag_wait(&data->csd); 356 csd_flag_wait(&data->csd);
385 if (unlikely(slowpath))
386 smp_call_function_mask_quiesce_stack(mask);
387 }
388
389 return 0;
390} 357}
391EXPORT_SYMBOL(smp_call_function_mask); 358EXPORT_SYMBOL(smp_call_function_many);
392 359
393/** 360/**
394 * smp_call_function(): Run a function on all other CPUs. 361 * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +363,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
396 * @info: An arbitrary pointer to pass to the function. 363 * @info: An arbitrary pointer to pass to the function.
397 * @wait: If true, wait (atomically) until function has completed on other CPUs. 364 * @wait: If true, wait (atomically) until function has completed on other CPUs.
398 * 365 *
399 * Returns 0 on success, else a negative status code. 366 * Returns 0.
400 * 367 *
401 * If @wait is true, then returns once @func has returned; otherwise 368 * If @wait is true, then returns once @func has returned; otherwise
402 * it returns just before the target cpu calls @func. In case of allocation 369 * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +374,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
407 */ 374 */
408int smp_call_function(void (*func)(void *), void *info, int wait) 375int smp_call_function(void (*func)(void *), void *info, int wait)
409{ 376{
410 int ret;
411
412 preempt_disable(); 377 preempt_disable();
413 ret = smp_call_function_mask(cpu_online_map, func, info, wait); 378 smp_call_function_many(cpu_online_mask, func, info, wait);
414 preempt_enable(); 379 preempt_enable();
415 return ret; 380 return 0;
416} 381}
417EXPORT_SYMBOL(smp_call_function); 382EXPORT_SYMBOL(smp_call_function);
418 383
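
Callers of the removed smp_call_function_mask() move to smp_call_function_many(), which takes a mask pointer, returns void, and quietly skips offline CPUs as well as the calling CPU. A hedged usage sketch; demo_remote_fn() and its caller are illustrative only:

#include <linux/smp.h>
#include <linux/preempt.h>

static void demo_remote_fn(void *info)
{
	/* Runs on each selected remote CPU, in interrupt context. */
}

static void demo_call_many(const struct cpumask *mask)
{
	/* Preemption must be disabled; interrupts must stay enabled. */
	preempt_disable();
	smp_call_function_many(mask, demo_remote_fn, NULL, true);
	preempt_enable();
}
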
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 670c1eca47ec..bdbe9de9cd8d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
733 break; 733 break;
734 /* Unbind so it can run. Fall thru. */ 734 /* Unbind so it can run. Fall thru. */
735 kthread_bind(per_cpu(ksoftirqd, hotcpu), 735 kthread_bind(per_cpu(ksoftirqd, hotcpu),
736 any_online_cpu(cpu_online_map)); 736 cpumask_any(cpu_online_mask));
737 case CPU_DEAD: 737 case CPU_DEAD:
738 case CPU_DEAD_FROZEN: { 738 case CPU_DEAD_FROZEN: {
739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 1ab790c67b17..d9188c66278a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -303,17 +303,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
303 break; 303 break;
304 case CPU_ONLINE: 304 case CPU_ONLINE:
305 case CPU_ONLINE_FROZEN: 305 case CPU_ONLINE_FROZEN:
306 check_cpu = any_online_cpu(cpu_online_map); 306 check_cpu = cpumask_any(cpu_online_mask);
307 wake_up_process(per_cpu(watchdog_task, hotcpu)); 307 wake_up_process(per_cpu(watchdog_task, hotcpu));
308 break; 308 break;
309#ifdef CONFIG_HOTPLUG_CPU 309#ifdef CONFIG_HOTPLUG_CPU
310 case CPU_DOWN_PREPARE: 310 case CPU_DOWN_PREPARE:
311 case CPU_DOWN_PREPARE_FROZEN: 311 case CPU_DOWN_PREPARE_FROZEN:
312 if (hotcpu == check_cpu) { 312 if (hotcpu == check_cpu) {
313 cpumask_t temp_cpu_online_map = cpu_online_map; 313 /* Pick any other online cpu. */
314 314 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
315 cpu_clear(hotcpu, temp_cpu_online_map);
316 check_cpu = any_online_cpu(temp_cpu_online_map);
317 } 315 }
318 break; 316 break;
319 317
@@ -323,7 +321,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
323 break; 321 break;
324 /* Unbind so it can run. Fall thru. */ 322 /* Unbind so it can run. Fall thru. */
325 kthread_bind(per_cpu(watchdog_task, hotcpu), 323 kthread_bind(per_cpu(watchdog_task, hotcpu),
326 any_online_cpu(cpu_online_map)); 324 cpumask_any(cpu_online_mask));
327 case CPU_DEAD: 325 case CPU_DEAD:
328 case CPU_DEAD_FROZEN: 326 case CPU_DEAD_FROZEN:
329 p = per_cpu(watchdog_task, hotcpu); 327 p = per_cpu(watchdog_task, hotcpu);
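
cpumask_any_but() replaces the copy-and-clear dance on a temporary online map: it returns some CPU in the mask other than the one passed in, or a value >= nr_cpu_ids if no such CPU exists. A small illustrative helper (hypothetical, not part of the patch):

#include <linux/cpumask.h>

static int demo_pick_other_cpu(unsigned int self)
{
	unsigned int cpu = cpumask_any_but(cpu_online_mask, self);

	return cpu < nr_cpu_ids ? (int)cpu : -1;	/* -1: 'self' is the only online CPU */
}
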
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc388..0cd415ee62a2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -38,7 +38,10 @@ struct stop_machine_data {
38static unsigned int num_threads; 38static unsigned int num_threads;
39static atomic_t thread_ack; 39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock); 40static DEFINE_MUTEX(lock);
41 41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
42static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
43static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
44static const cpumask_t *active_cpus; 47static const cpumask_t *active_cpus;
@@ -69,10 +72,10 @@ static void stop_cpu(struct work_struct *unused)
69 int err; 72 int err;
70 73
71 if (!active_cpus) { 74 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map)) 75 if (cpu == cpumask_first(cpu_online_mask))
73 smdata = &active; 76 smdata = &active;
74 } else { 77 } else {
75 if (cpu_isset(cpu, *active_cpus)) 78 if (cpumask_test_cpu(cpu, active_cpus))
76 smdata = &active; 79 smdata = &active;
77 } 80 }
78 /* Simple state machine */ 81 /* Simple state machine */
@@ -109,7 +112,44 @@ static int chill(void *unused)
109 return 0; 112 return 0;
110} 113}
111 114
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
113{ 153{
114 struct work_struct *sm_work; 154 struct work_struct *sm_work;
115 int i, ret; 155 int i, ret;
@@ -142,23 +182,18 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
142 return ret; 182 return ret;
143} 183}
144 184
145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
146{ 186{
147 int ret; 187 int ret;
148 188
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
149 /* No CPUs can come up or down during this. */ 192 /* No CPUs can come up or down during this. */
150 get_online_cpus(); 193 get_online_cpus();
151 ret = __stop_machine(fn, data, cpus); 194 ret = __stop_machine(fn, data, cpus);
152 put_online_cpus(); 195 put_online_cpus();
153 196 stop_machine_destroy();
154 return ret; 197 return ret;
155} 198}
156EXPORT_SYMBOL_GPL(stop_machine); 199EXPORT_SYMBOL_GPL(stop_machine);
157
158static int __init stop_machine_init(void)
159{
160 stop_machine_wq = create_rt_workqueue("kstop");
161 stop_machine_work = alloc_percpu(struct work_struct);
162 return 0;
163}
164core_initcall(stop_machine_init);
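
Since the "kstop" workqueue and per-cpu work items are no longer created unconditionally at boot, code that wants __stop_machine() available later, or simply wants to pay the allocation cost up front, brackets its use with the new create/destroy pair. A hedged usage sketch (demo_fn and its caller are hypothetical):

#include <linux/stop_machine.h>
#include <linux/cpu.h>

static int demo_fn(void *data)
{
	/* Runs while every other CPU spins with interrupts disabled. */
	return 0;
}

static int demo_stop_machine_user(void)
{
	int err;

	err = stop_machine_create();		/* may sleep; allocates the workqueue */
	if (err)
		return err;

	get_online_cpus();
	err = __stop_machine(demo_fn, NULL, NULL);	/* NULL: fn runs on one CPU */
	put_online_cpus();

	stop_machine_destroy();
	return err;
}
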
diff --git a/kernel/sys.c b/kernel/sys.c
index d356d79e84ac..763c3c17ded3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h>
36 37
37#include <linux/compat.h> 38#include <linux/compat.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
@@ -927,6 +928,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
927 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 928 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
928 return -EFAULT; 929 return -EFAULT;
929 } 930 }
931 force_successful_syscall_return();
930 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 932 return (long) jiffies_64_to_clock_t(get_jiffies_64());
931} 933}
932 934
@@ -1627,6 +1629,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1627 utime = stime = cputime_zero; 1629 utime = stime = cputime_zero;
1628 1630
1629 if (who == RUSAGE_THREAD) { 1631 if (who == RUSAGE_THREAD) {
1632 utime = task_utime(current);
1633 stime = task_stime(current);
1630 accumulate_thread_rusage(p, r); 1634 accumulate_thread_rusage(p, r);
1631 goto out; 1635 goto out;
1632 } 1636 }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ff6d45c7626f..92f6e5bc3c24 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -87,10 +87,6 @@ extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
88 88
89/* Constants used for minimum and maximum */ 89/* Constants used for minimum and maximum */
90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
91static int one = 1;
92#endif
93
94#ifdef CONFIG_DETECT_SOFTLOCKUP 90#ifdef CONFIG_DETECT_SOFTLOCKUP
95static int sixty = 60; 91static int sixty = 60;
96static int neg_one = -1; 92static int neg_one = -1;
@@ -101,6 +97,7 @@ static int two = 2;
101#endif 97#endif
102 98
103static int zero; 99static int zero;
100static int one = 1;
104static int one_hundred = 100; 101static int one_hundred = 100;
105 102
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 103/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -952,12 +949,22 @@ static struct ctl_table vm_table[] = {
952 .data = &dirty_background_ratio, 949 .data = &dirty_background_ratio,
953 .maxlen = sizeof(dirty_background_ratio), 950 .maxlen = sizeof(dirty_background_ratio),
954 .mode = 0644, 951 .mode = 0644,
955 .proc_handler = &proc_dointvec_minmax, 952 .proc_handler = &dirty_background_ratio_handler,
956 .strategy = &sysctl_intvec, 953 .strategy = &sysctl_intvec,
957 .extra1 = &zero, 954 .extra1 = &zero,
958 .extra2 = &one_hundred, 955 .extra2 = &one_hundred,
959 }, 956 },
960 { 957 {
958 .ctl_name = CTL_UNNUMBERED,
959 .procname = "dirty_background_bytes",
960 .data = &dirty_background_bytes,
961 .maxlen = sizeof(dirty_background_bytes),
962 .mode = 0644,
963 .proc_handler = &dirty_background_bytes_handler,
964 .strategy = &sysctl_intvec,
965 .extra1 = &one,
966 },
967 {
961 .ctl_name = VM_DIRTY_RATIO, 968 .ctl_name = VM_DIRTY_RATIO,
962 .procname = "dirty_ratio", 969 .procname = "dirty_ratio",
963 .data = &vm_dirty_ratio, 970 .data = &vm_dirty_ratio,
@@ -969,6 +976,16 @@ static struct ctl_table vm_table[] = {
969 .extra2 = &one_hundred, 976 .extra2 = &one_hundred,
970 }, 977 },
971 { 978 {
979 .ctl_name = CTL_UNNUMBERED,
980 .procname = "dirty_bytes",
981 .data = &vm_dirty_bytes,
982 .maxlen = sizeof(vm_dirty_bytes),
983 .mode = 0644,
984 .proc_handler = &dirty_bytes_handler,
985 .strategy = &sysctl_intvec,
986 .extra1 = &one,
987 },
988 {
972 .procname = "dirty_writeback_centisecs", 989 .procname = "dirty_writeback_centisecs",
973 .data = &dirty_writeback_interval, 990 .data = &dirty_writeback_interval,
974 .maxlen = sizeof(dirty_writeback_interval), 991 .maxlen = sizeof(dirty_writeback_interval),
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bd6be76303cf..888adbcca30c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
290 return; 290 return;
291} 291}
292 292
293static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) 293static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294{ 294{
295 struct listener_list *listeners; 295 struct listener_list *listeners;
296 struct listener *s, *tmp; 296 struct listener *s, *tmp;
297 unsigned int cpu; 297 unsigned int cpu;
298 cpumask_t mask = *maskp;
299 298
300 if (!cpus_subset(mask, cpu_possible_map)) 299 if (!cpumask_subset(mask, cpu_possible_mask))
301 return -EINVAL; 300 return -EINVAL;
302 301
303 if (isadd == REGISTER) { 302 if (isadd == REGISTER) {
304 for_each_cpu_mask_nr(cpu, mask) { 303 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 304 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 305 cpu_to_node(cpu));
307 if (!s) 306 if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 319
321 /* Deregister or cleanup */ 320 /* Deregister or cleanup */
322cleanup: 321cleanup:
323 for_each_cpu_mask_nr(cpu, mask) { 322 for_each_cpu(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 323 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 324 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
335 return 0; 334 return 0;
336} 335}
337 336
338static int parse(struct nlattr *na, cpumask_t *mask) 337static int parse(struct nlattr *na, struct cpumask *mask)
339{ 338{
340 char *data; 339 char *data;
341 int len; 340 int len;
@@ -352,7 +351,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
352 if (!data) 351 if (!data)
353 return -ENOMEM; 352 return -ENOMEM;
354 nla_strlcpy(data, na, len); 353 nla_strlcpy(data, na, len);
355 ret = cpulist_parse(data, *mask); 354 ret = cpulist_parse(data, mask);
356 kfree(data); 355 kfree(data);
357 return ret; 356 return ret;
358} 357}
@@ -428,23 +427,33 @@ err:
428 427
429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 428static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
430{ 429{
431 int rc = 0; 430 int rc;
432 struct sk_buff *rep_skb; 431 struct sk_buff *rep_skb;
433 struct taskstats *stats; 432 struct taskstats *stats;
434 size_t size; 433 size_t size;
435 cpumask_t mask; 434 cpumask_var_t mask;
435
436 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
437 return -ENOMEM;
436 438
437 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 439 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
438 if (rc < 0) 440 if (rc < 0)
439 return rc; 441 goto free_return_rc;
440 if (rc == 0) 442 if (rc == 0) {
441 return add_del_listener(info->snd_pid, &mask, REGISTER); 443 rc = add_del_listener(info->snd_pid, mask, REGISTER);
444 goto free_return_rc;
445 }
442 446
443 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); 447 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
444 if (rc < 0) 448 if (rc < 0)
449 goto free_return_rc;
450 if (rc == 0) {
451 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
452free_return_rc:
453 free_cpumask_var(mask);
445 return rc; 454 return rc;
446 if (rc == 0) 455 }
447 return add_del_listener(info->snd_pid, &mask, DEREGISTER); 456 free_cpumask_var(mask);
448 457
449 /* 458 /*
450 * Size includes space for nested attributes 459 * Size includes space for nested attributes
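
The register/deregister path now parses the netlink cpulist attribute into a heap-backed mask instead of an on-stack cpumask_t. The parsing idiom in isolation, wrapped in a hypothetical helper:

#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/errno.h>

static int demo_handle_cpulist(const char *buf)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = cpulist_parse(buf, mask);		/* e.g. "0-3,8" */
	if (!ret && !cpumask_subset(mask, cpu_possible_mask))
		ret = -EINVAL;

	/* ... act on 'mask' here ... */

	free_cpumask_var(mask);
	return ret;
}
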
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 06b6395b45b2..4f104515a19b 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,21 +22,11 @@
22 22
23static u32 rand1, preh_val, posth_val, jph_val; 23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests; 24static int errors, handler_errors, num_tests;
25static u32 (*target)(u32 value);
26static u32 (*target2)(u32 value);
25 27
26static noinline u32 kprobe_target(u32 value) 28static noinline u32 kprobe_target(u32 value)
27{ 29{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor); 30 return (value / div_factor);
41} 31}
42 32
@@ -74,7 +64,7 @@ static int test_kprobe(void)
74 return ret; 64 return ret;
75 } 65 }
76 66
77 ret = kprobe_target(rand1); 67 ret = target(rand1);
78 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
79 69
80 if (preh_val == 0) { 70 if (preh_val == 0) {
@@ -92,6 +82,84 @@ static int test_kprobe(void)
92 return 0; 82 return 0;
93} 83}
94 84
85static noinline u32 kprobe_target2(u32 value)
86{
87 return (value / div_factor) + 1;
88}
89
90static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
91{
92 preh_val = (rand1 / div_factor) + 1;
93 return 0;
94}
95
96static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
97 unsigned long flags)
98{
99 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: "
102 "incorrect value in post_handler2\n");
103 }
104 posth_val = preh_val + div_factor;
105}
106
107static struct kprobe kp2 = {
108 .symbol_name = "kprobe_target2",
109 .pre_handler = kp_pre_handler2,
110 .post_handler = kp_post_handler2
111};
112
113static int test_kprobes(void)
114{
115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2};
117
118	kp.addr = 0; /* addr should be cleared for reusing the kprobe. */
119 ret = register_kprobes(kps, 2);
120 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: "
122 "register_kprobes returned %d\n", ret);
123 return ret;
124 }
125
126 preh_val = 0;
127 posth_val = 0;
128 ret = target(rand1);
129
130 if (preh_val == 0) {
131 printk(KERN_ERR "Kprobe smoke test failed: "
132 "kprobe pre_handler not called\n");
133 handler_errors++;
134 }
135
136 if (posth_val == 0) {
137 printk(KERN_ERR "Kprobe smoke test failed: "
138 "kprobe post_handler not called\n");
139 handler_errors++;
140 }
141
142 preh_val = 0;
143 posth_val = 0;
144 ret = target2(rand1);
145
146 if (preh_val == 0) {
147 printk(KERN_ERR "Kprobe smoke test failed: "
148 "kprobe pre_handler2 not called\n");
149 handler_errors++;
150 }
151
152 if (posth_val == 0) {
153 printk(KERN_ERR "Kprobe smoke test failed: "
154 "kprobe post_handler2 not called\n");
155 handler_errors++;
156 }
157
158 unregister_kprobes(kps, 2);
159 return 0;
160
161}
162
95static u32 j_kprobe_target(u32 value) 163static u32 j_kprobe_target(u32 value)
96{ 164{
97 if (value != rand1) { 165 if (value != rand1) {
@@ -121,7 +189,7 @@ static int test_jprobe(void)
121 return ret; 189 return ret;
122 } 190 }
123 191
124 ret = kprobe_target(rand1); 192 ret = target(rand1);
125 unregister_jprobe(&jp); 193 unregister_jprobe(&jp);
126 if (jph_val == 0) { 194 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: " 195 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -132,6 +200,43 @@ static int test_jprobe(void)
132 return 0; 200 return 0;
133} 201}
134 202
203static struct jprobe jp2 = {
204 .entry = j_kprobe_target,
205 .kp.symbol_name = "kprobe_target2"
206};
207
208static int test_jprobes(void)
209{
210 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2};
212
213	jp.kp.addr = 0; /* addr should be cleared for reusing the kprobe. */
214 ret = register_jprobes(jps, 2);
215 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: "
217 "register_jprobes returned %d\n", ret);
218 return ret;
219 }
220
221 jph_val = 0;
222 ret = target(rand1);
223 if (jph_val == 0) {
224 printk(KERN_ERR "Kprobe smoke test failed: "
225 "jprobe handler not called\n");
226 handler_errors++;
227 }
228
229 jph_val = 0;
230 ret = target2(rand1);
231 if (jph_val == 0) {
232 printk(KERN_ERR "Kprobe smoke test failed: "
233 "jprobe handler2 not called\n");
234 handler_errors++;
235 }
236 unregister_jprobes(jps, 2);
237
238 return 0;
239}
135#ifdef CONFIG_KRETPROBES 240#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 241static u32 krph_val;
137 242
@@ -177,7 +282,7 @@ static int test_kretprobe(void)
177 return ret; 282 return ret;
178 } 283 }
179 284
180 ret = kprobe_target(rand1); 285 ret = target(rand1);
181 unregister_kretprobe(&rp); 286 unregister_kretprobe(&rp);
182 if (krph_val != rand1) { 287 if (krph_val != rand1) {
183 printk(KERN_ERR "Kprobe smoke test failed: " 288 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -187,12 +292,72 @@ static int test_kretprobe(void)
187 292
188 return 0; 293 return 0;
189} 294}
295
296static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
297{
298 unsigned long ret = regs_return_value(regs);
299
300 if (ret != (rand1 / div_factor) + 1) {
301 handler_errors++;
302 printk(KERN_ERR "Kprobe smoke test failed: "
303 "incorrect value in kretprobe handler2\n");
304 }
305 if (krph_val == 0) {
306 handler_errors++;
307 printk(KERN_ERR "Kprobe smoke test failed: "
308 "call to kretprobe entry handler failed\n");
309 }
310
311 krph_val = rand1;
312 return 0;
313}
314
315static struct kretprobe rp2 = {
316 .handler = return_handler2,
317 .entry_handler = entry_handler,
318 .kp.symbol_name = "kprobe_target2"
319};
320
321static int test_kretprobes(void)
322{
323 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2};
325
326	rp.kp.addr = 0; /* addr should be cleared for reusing the kprobe. */
327 ret = register_kretprobes(rps, 2);
328 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: "
330 "register_kretprobe returned %d\n", ret);
331 return ret;
332 }
333
334 krph_val = 0;
335 ret = target(rand1);
336 if (krph_val != rand1) {
337 printk(KERN_ERR "Kprobe smoke test failed: "
338 "kretprobe handler not called\n");
339 handler_errors++;
340 }
341
342 krph_val = 0;
343 ret = target2(rand1);
344 if (krph_val != rand1) {
345 printk(KERN_ERR "Kprobe smoke test failed: "
346 "kretprobe handler2 not called\n");
347 handler_errors++;
348 }
349 unregister_kretprobes(rps, 2);
350 return 0;
351}
190#endif /* CONFIG_KRETPROBES */ 352#endif /* CONFIG_KRETPROBES */
191 353
192int init_test_probes(void) 354int init_test_probes(void)
193{ 355{
194 int ret; 356 int ret;
195 357
358 target = kprobe_target;
359 target2 = kprobe_target2;
360
196 do { 361 do {
197 rand1 = random32(); 362 rand1 = random32();
198 } while (rand1 <= div_factor); 363 } while (rand1 <= div_factor);
@@ -204,15 +369,30 @@ int init_test_probes(void)
204 errors++; 369 errors++;
205 370
206 num_tests++; 371 num_tests++;
372 ret = test_kprobes();
373 if (ret < 0)
374 errors++;
375
376 num_tests++;
207 ret = test_jprobe(); 377 ret = test_jprobe();
208 if (ret < 0) 378 if (ret < 0)
209 errors++; 379 errors++;
210 380
381 num_tests++;
382 ret = test_jprobes();
383 if (ret < 0)
384 errors++;
385
211#ifdef CONFIG_KRETPROBES 386#ifdef CONFIG_KRETPROBES
212 num_tests++; 387 num_tests++;
213 ret = test_kretprobe(); 388 ret = test_kretprobe();
214 if (ret < 0) 389 if (ret < 0)
215 errors++; 390 errors++;
391
392 num_tests++;
393 ret = test_kretprobes();
394 if (ret < 0)
395 errors++;
216#endif /* CONFIG_KRETPROBES */ 396#endif /* CONFIG_KRETPROBES */
217 397
218 if (errors) 398 if (errors)
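
The new test_kprobes()/test_jprobes()/test_kretprobes() cases exercise the batch registration API that sits alongside the single-probe calls. A hedged sketch of registering two probes at once from a module; the probed symbol names are purely illustrative:

#include <linux/kprobes.h>
#include <linux/kernel.h>

static struct kprobe demo_kp1 = { .symbol_name = "do_fork" };
static struct kprobe demo_kp2 = { .symbol_name = "do_exit" };

static int demo_register_batch(void)
{
	struct kprobe *kps[] = { &demo_kp1, &demo_kp2 };
	int ret;

	ret = register_kprobes(kps, ARRAY_SIZE(kps));
	if (ret < 0)
		return ret;	/* on failure, none remain registered */

	/* ... probes are live here ... */

	unregister_kprobes(kps, ARRAY_SIZE(kps));
	return 0;
}
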
diff --git a/kernel/time.c b/kernel/time.c
index d63a4336fad6..4886e3ce83a4 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -37,6 +37,7 @@
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/math64.h> 39#include <linux/math64.h>
40#include <linux/ptrace.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -65,8 +66,9 @@ asmlinkage long sys_time(time_t __user * tloc)
65 66
66 if (tloc) { 67 if (tloc) {
67 if (put_user(i,tloc)) 68 if (put_user(i,tloc))
68 i = -EFAULT; 69 return -EFAULT;
69 } 70 }
71 force_successful_syscall_return();
70 return i; 72 return i;
71} 73}
72 74
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f8d968063cea..ea2f48af83cf 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
166void clockevents_register_device(struct clock_event_device *dev) 166void clockevents_register_device(struct clock_event_device *dev)
167{ 167{
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask);
170
169 /* 171 /*
170 * A nsec2cyc multiplicator of 0 is invalid and we'd crash 172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
171 * on it, so fix it up and emit a warning: 173 * on it, so fix it up and emit a warning:
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec97526..ca89e1593f08 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,10 +145,11 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = cpumask_next(raw_smp_processor_id(),
149 cpu_online_mask);
149 150
150 if (next_cpu >= nr_cpu_ids) 151 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 152 next_cpu = cpumask_first(cpu_online_mask);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 153 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 154 add_timer_on(&watchdog_timer, next_cpu);
154 } 155 }
@@ -173,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
173 watchdog_last = watchdog->read(); 174 watchdog_last = watchdog->read();
174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 175 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
175 add_timer_on(&watchdog_timer, 176 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map)); 177 cpumask_first(cpu_online_mask));
177 } 178 }
178 } else { 179 } else {
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 180 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +196,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
195 watchdog_timer.expires = 196 watchdog_timer.expires =
196 jiffies + WATCHDOG_INTERVAL; 197 jiffies + WATCHDOG_INTERVAL;
197 add_timer_on(&watchdog_timer, 198 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map)); 199 cpumask_first(cpu_online_mask));
199 } 200 }
200 } 201 }
201 } 202 }
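
cpumask_next()/cpumask_first() replace next_cpu_nr()/first_cpu() in the watchdog's round-robin over online CPUs. The wrap-around idiom in isolation (the helper is hypothetical):

#include <linux/cpumask.h>

static int demo_next_online_cpu(int cpu)
{
	/* Next online CPU after 'cpu', wrapping back to the first one. */
	int next = cpumask_next(cpu, cpu_online_mask);

	if (next >= nr_cpu_ids)
		next = cpumask_first(cpu_online_mask);
	return next;
}
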
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1ca99557e929..06f197560f3b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -45,7 +45,7 @@
45 * 45 *
46 * The value 8 is somewhat carefully chosen, as anything 46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as 47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater then 8 overflow 32bits when 48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100. 49 * HZ=100.
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f98a1b7b16e9..118a3b3b3f9a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
32static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force; 35static int tick_broadcast_force;
34 36
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
46 return &tick_broadcast_device; 48 return &tick_broadcast_device;
47} 49}
48 50
49cpumask_t *tick_get_broadcast_mask(void) 51struct cpumask *tick_get_broadcast_mask(void)
50{ 52{
51 return &tick_broadcast_mask; 53 return to_cpumask(tick_broadcast_mask);
52} 54}
53 55
54/* 56/*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
72 74
73 clockevents_exchange_device(NULL, dev); 75 clockevents_exchange_device(NULL, dev);
74 tick_broadcast_device.evtdev = dev; 76 tick_broadcast_device.evtdev = dev;
75 if (!cpus_empty(tick_broadcast_mask)) 77 if (!cpumask_empty(tick_get_broadcast_mask()))
76 tick_broadcast_start_periodic(dev); 78 tick_broadcast_start_periodic(dev);
77 return 1; 79 return 1;
78} 80}
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
104 */ 106 */
105 if (!tick_device_is_functional(dev)) { 107 if (!tick_device_is_functional(dev)) {
106 dev->event_handler = tick_handle_periodic; 108 dev->event_handler = tick_handle_periodic;
107 cpu_set(cpu, tick_broadcast_mask); 109 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
108 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 110 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
109 ret = 1; 111 ret = 1;
110 } else { 112 } else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 118 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
117 int cpu = smp_processor_id(); 119 int cpu = smp_processor_id();
118 120
119 cpu_clear(cpu, tick_broadcast_mask); 121 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
120 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
121 } 123 }
122 } 124 }
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125} 127}
126 128
127/* 129/*
128 * Broadcast the event to the cpus, which are set in the mask 130 * Broadcast the event to the cpus, which are set in the mask (mangled).
129 */ 131 */
130static void tick_do_broadcast(cpumask_t mask) 132static void tick_do_broadcast(struct cpumask *mask)
131{ 133{
132 int cpu = smp_processor_id(); 134 int cpu = smp_processor_id();
133 struct tick_device *td; 135 struct tick_device *td;
@@ -135,21 +137,20 @@ static void tick_do_broadcast(cpumask_t mask)
135 /* 137 /*
136 * Check, if the current cpu is in the mask 138 * Check, if the current cpu is in the mask
137 */ 139 */
138 if (cpu_isset(cpu, mask)) { 140 if (cpumask_test_cpu(cpu, mask)) {
139 cpu_clear(cpu, mask); 141 cpumask_clear_cpu(cpu, mask);
140 td = &per_cpu(tick_cpu_device, cpu); 142 td = &per_cpu(tick_cpu_device, cpu);
141 td->evtdev->event_handler(td->evtdev); 143 td->evtdev->event_handler(td->evtdev);
142 } 144 }
143 145
144 if (!cpus_empty(mask)) { 146 if (!cpumask_empty(mask)) {
145 /* 147 /*
146 * It might be necessary to actually check whether the devices 148 * It might be necessary to actually check whether the devices
147 * have different broadcast functions. For now, just use the 149 * have different broadcast functions. For now, just use the
148 * one of the first device. This works as long as we have this 150 * one of the first device. This works as long as we have this
149 * misfeature only on x86 (lapic) 151 * misfeature only on x86 (lapic)
150 */ 152 */
151 cpu = first_cpu(mask); 153 td = &per_cpu(tick_cpu_device, cpumask_first(mask));
152 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 154 td->evtdev->broadcast(mask);
154 } 155 }
155} 156}
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
160 */ 161 */
161static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
162{ 163{
163 cpumask_t mask;
164
165 spin_lock(&tick_broadcast_lock); 164 spin_lock(&tick_broadcast_lock);
166 165
167 cpus_and(mask, cpu_online_map, tick_broadcast_mask); 166 cpumask_and(to_cpumask(tmpmask),
168 tick_do_broadcast(mask); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 spin_unlock(&tick_broadcast_lock);
171} 171}
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
228 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
229 goto out; 229 goto out;
230 230
231 bc_stopped = cpus_empty(tick_broadcast_mask); 231 bc_stopped = cpumask_empty(tick_get_broadcast_mask());
232 232
233 switch (*reason) { 233 switch (*reason) {
234 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
238 if (tick_broadcast_device.mode == 238 if (tick_broadcast_device.mode ==
239 TICKDEV_MODE_PERIODIC) 239 TICKDEV_MODE_PERIODIC)
240 clockevents_shutdown(dev); 240 clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
244 break; 244 break;
245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
249 if (tick_broadcast_device.mode == 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC) 250 TICKDEV_MODE_PERIODIC)
251 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
253 break; 253 break;
254 } 254 }
255 255
256 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpumask_empty(tick_get_broadcast_mask())) {
257 if (!bc_stopped) 257 if (!bc_stopped)
258 clockevents_shutdown(bc); 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) { 259 } else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
272 */ 272 */
273void tick_broadcast_on_off(unsigned long reason, int *oncpu) 273void tick_broadcast_on_off(unsigned long reason, int *oncpu)
274{ 274{
275 if (!cpu_isset(*oncpu, cpu_online_map)) 275 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
303 spin_lock_irqsave(&tick_broadcast_lock, flags); 303 spin_lock_irqsave(&tick_broadcast_lock, flags);
304 304
305 bc = tick_broadcast_device.evtdev; 305 bc = tick_broadcast_device.evtdev;
306 cpu_clear(cpu, tick_broadcast_mask); 306 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
307 307
308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
309 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpumask_empty(tick_get_broadcast_mask()))
310 clockevents_shutdown(bc); 310 clockevents_shutdown(bc);
311 } 311 }
312 312
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
342 342
343 switch (tick_broadcast_device.mode) { 343 switch (tick_broadcast_device.mode) {
344 case TICKDEV_MODE_PERIODIC: 344 case TICKDEV_MODE_PERIODIC:
345 if(!cpus_empty(tick_broadcast_mask)) 345 if (!cpumask_empty(tick_get_broadcast_mask()))
346 tick_broadcast_start_periodic(bc); 346 tick_broadcast_start_periodic(bc);
347 broadcast = cpu_isset(smp_processor_id(), 347 broadcast = cpumask_test_cpu(smp_processor_id(),
348 tick_broadcast_mask); 348 tick_get_broadcast_mask());
349 break; 349 break;
350 case TICKDEV_MODE_ONESHOT: 350 case TICKDEV_MODE_ONESHOT:
351 broadcast = tick_resume_broadcast_oneshot(bc); 351 broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
360 360
361#ifdef CONFIG_TICK_ONESHOT 361#ifdef CONFIG_TICK_ONESHOT
362 362
363static cpumask_t tick_broadcast_oneshot_mask; 363/* FIXME: use cpumask_var_t. */
364static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
364 365
365/* 366/*
366 * Debugging: see timer_list.c 367 * Exposed for debugging: see timer_list.c
367 */ 368 */
368cpumask_t *tick_get_broadcast_oneshot_mask(void) 369struct cpumask *tick_get_broadcast_oneshot_mask(void)
369{ 370{
370 return &tick_broadcast_oneshot_mask; 371 return to_cpumask(tick_broadcast_oneshot_mask);
371} 372}
372 373
373static int tick_broadcast_set_event(ktime_t expires, int force) 374static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
389 */ 390 */
390void tick_check_oneshot_broadcast(int cpu) 391void tick_check_oneshot_broadcast(int cpu)
391{ 392{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 393 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 394 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394 395
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 396 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 403static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
403{ 404{
404 struct tick_device *td; 405 struct tick_device *td;
405 cpumask_t mask;
406 ktime_t now, next_event; 406 ktime_t now, next_event;
407 int cpu; 407 int cpu;
408 408
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
410again: 410again:
411 dev->next_event.tv64 = KTIME_MAX; 411 dev->next_event.tv64 = KTIME_MAX;
412 next_event.tv64 = KTIME_MAX; 412 next_event.tv64 = KTIME_MAX;
413 mask = CPU_MASK_NONE; 413 cpumask_clear(to_cpumask(tmpmask));
414 now = ktime_get(); 414 now = ktime_get();
415 /* Find all expired events */ 415 /* Find all expired events */
416 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { 416 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
417 td = &per_cpu(tick_cpu_device, cpu); 417 td = &per_cpu(tick_cpu_device, cpu);
418 if (td->evtdev->next_event.tv64 <= now.tv64) 418 if (td->evtdev->next_event.tv64 <= now.tv64)
419 cpu_set(cpu, mask); 419 cpumask_set_cpu(cpu, to_cpumask(tmpmask));
420 else if (td->evtdev->next_event.tv64 < next_event.tv64) 420 else if (td->evtdev->next_event.tv64 < next_event.tv64)
421 next_event.tv64 = td->evtdev->next_event.tv64; 421 next_event.tv64 = td->evtdev->next_event.tv64;
422 } 422 }
@@ -424,7 +424,7 @@ again:
424 /* 424 /*
425 * Wakeup the cpus which have an expired event. 425 * Wakeup the cpus which have an expired event.
426 */ 426 */
427 tick_do_broadcast(mask); 427 tick_do_broadcast(to_cpumask(tmpmask));
428 428
429 /* 429 /*
430 * Two reasons for reprogram: 430 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
476 goto out; 476 goto out;
477 477
478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
479 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 479 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
480 cpu_set(cpu, tick_broadcast_oneshot_mask); 480 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
482 if (dev->next_event.tv64 < bc->next_event.tv64) 482 if (dev->next_event.tv64 < bc->next_event.tv64)
483 tick_broadcast_set_event(dev->next_event, 1); 483 tick_broadcast_set_event(dev->next_event, 1);
484 } 484 }
485 } else { 485 } else {
486 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 486 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
487 cpu_clear(cpu, tick_broadcast_oneshot_mask); 487 cpumask_clear_cpu(cpu,
488 tick_get_broadcast_oneshot_mask());
488 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 489 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
489 if (dev->next_event.tv64 != KTIME_MAX) 490 if (dev->next_event.tv64 != KTIME_MAX)
490 tick_program_event(dev->next_event, 1); 491 tick_program_event(dev->next_event, 1);
@@ -502,15 +503,16 @@ out:
502 */ 503 */
503static void tick_broadcast_clear_oneshot(int cpu) 504static void tick_broadcast_clear_oneshot(int cpu)
504{ 505{
505 cpu_clear(cpu, tick_broadcast_oneshot_mask); 506 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
506} 507}
507 508
508static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) 509static void tick_broadcast_init_next_event(struct cpumask *mask,
510 ktime_t expires)
509{ 511{
510 struct tick_device *td; 512 struct tick_device *td;
511 int cpu; 513 int cpu;
512 514
513 for_each_cpu_mask_nr(cpu, *mask) { 515 for_each_cpu(cpu, mask) {
514 td = &per_cpu(tick_cpu_device, cpu); 516 td = &per_cpu(tick_cpu_device, cpu);
515 if (td->evtdev) 517 if (td->evtdev)
516 td->evtdev->next_event = expires; 518 td->evtdev->next_event = expires;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 528 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id(); 530 int cpu = smp_processor_id();
529 cpumask_t mask;
530 531
531 bc->event_handler = tick_handle_oneshot_broadcast; 532 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 533 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
540 * oneshot_mask bits for those and program the 541 * oneshot_mask bits for those and program the
541 * broadcast device to fire. 542 * broadcast device to fire.
542 */ 543 */
543 mask = tick_broadcast_mask; 544 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
544 cpu_clear(cpu, mask); 545 cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
545 cpus_or(tick_broadcast_oneshot_mask, 546 cpumask_or(tick_get_broadcast_oneshot_mask(),
546 tick_broadcast_oneshot_mask, mask); 547 tick_get_broadcast_oneshot_mask(),
547 548 to_cpumask(tmpmask));
548 if (was_periodic && !cpus_empty(mask)) { 549
549 tick_broadcast_init_next_event(&mask, tick_next_period); 550 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
551 tick_broadcast_init_next_event(to_cpumask(tmpmask),
552 tick_next_period);
550 tick_broadcast_set_event(tick_next_period, 1); 553 tick_broadcast_set_event(tick_next_period, 1);
551 } else 554 } else
552 bc->next_event.tv64 = KTIME_MAX; 555 bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
585 * Clear the broadcast mask flag for the dead cpu, but do not 588 * Clear the broadcast mask flag for the dead cpu, but do not
586 * stop the broadcast device! 589 * stop the broadcast device!
587 */ 590 */
588 cpu_clear(cpu, tick_broadcast_oneshot_mask); 591 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
589 592
590 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 593 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
591} 594}
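
Note: the tick-broadcast hunks above drop the on-stack cpumask_t locals and instead operate on a pre-allocated scratch bitmap through the struct cpumask accessors (cpumask_clear(), cpumask_set_cpu(), cpumask_copy(), cpumask_or(), to_cpumask()). A minimal sketch of that pattern follows; scratch_mask, gather_cpus_to_wake() and cpu_needs_wakeup() are hypothetical names for illustration, not the actual tick code.

	#include <linux/cpumask.h>

	/* File-scope scratch storage, reused instead of an on-stack cpumask_t. */
	static DECLARE_BITMAP(scratch_mask, NR_CPUS);

	/* Hypothetical predicate standing in for the expired-event test above. */
	static bool cpu_needs_wakeup(int cpu)
	{
		return false;
	}

	static void gather_cpus_to_wake(const struct cpumask *candidates,
					struct cpumask *wake)
	{
		int cpu;

		cpumask_clear(wake);
		for_each_cpu(cpu, candidates)
			if (cpu_needs_wakeup(cpu))
				cpumask_set_cpu(cpu, wake);
	}

	/* Caller, holding whatever lock protects scratch_mask: */
	static void wake_expired_cpus(void)
	{
		gather_cpus_to_wake(cpu_online_mask, to_cpumask(scratch_mask));
		/* ... broadcast to to_cpumask(scratch_mask) ... */
	}

Because the scratch bitmap is shared, callers are assumed to serialize on the same lock that already protects the surrounding state, as tick_broadcast_lock does in the hunks above.
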
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43ca..63e05d423a09 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
136 */ 136 */
137static void tick_setup_device(struct tick_device *td, 137static void tick_setup_device(struct tick_device *td,
138 struct clock_event_device *newdev, int cpu, 138 struct clock_event_device *newdev, int cpu,
139 const cpumask_t *cpumask) 139 const struct cpumask *cpumask)
140{ 140{
141 ktime_t next_event; 141 ktime_t next_event;
142 void (*handler)(struct clock_event_device *) = NULL; 142 void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
171 * When the device is not per cpu, pin the interrupt to the 171 * When the device is not per cpu, pin the interrupt to the
172 * current cpu: 172 * current cpu:
173 */ 173 */
174 if (!cpus_equal(newdev->cpumask, *cpumask)) 174 if (!cpumask_equal(newdev->cpumask, cpumask))
175 irq_set_affinity(newdev->irq, *cpumask); 175 irq_set_affinity(newdev->irq, cpumask);
176 176
177 /* 177 /*
178 * When global broadcasting is active, check if the current 178 * When global broadcasting is active, check if the current
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
202 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
203 203
204 cpu = smp_processor_id(); 204 cpu = smp_processor_id();
205 if (!cpu_isset(cpu, newdev->cpumask)) 205 if (!cpumask_test_cpu(cpu, newdev->cpumask))
206 goto out_bc; 206 goto out_bc;
207 207
208 td = &per_cpu(tick_cpu_device, cpu); 208 td = &per_cpu(tick_cpu_device, cpu);
209 curdev = td->evtdev; 209 curdev = td->evtdev;
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { 212 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) 225 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); 257 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -299,9 +299,9 @@ static void tick_shutdown(unsigned int *cpup)
299 } 299 }
300 /* Transfer the do_timer job away from this cpu */ 300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = cpumask_first(cpu_online_mask);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : 304 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
305 TICK_DO_TIMER_NONE; 305 TICK_DO_TIMER_NONE;
306 } 306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
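
Note: tick-common.c switches from APIs that pass whole cpumask_t values (cpumask_of_cpu(), cpus_equal(), first_cpu() compared against NR_CPUS) to pointer-based ones (cpumask_of(), cpumask_equal(), cpumask_first() compared against nr_cpu_ids). A small sketch of the new idioms, with hypothetical helper names:

	#include <linux/cpumask.h>

	/* Is the device's affinity mask exactly this one CPU? */
	static bool device_is_cpu_local(const struct cpumask *dev_mask, int cpu)
	{
		return cpumask_equal(dev_mask, cpumask_of(cpu));
	}

	/* Pick the first online CPU, or -1 if none; note nr_cpu_ids, not NR_CPUS. */
	static int pick_first_online_cpu(void)
	{
		int cpu = cpumask_first(cpu_online_mask);

		return (cpu < nr_cpu_ids) ? cpu : -1;
	}

cpumask_of() returns a const struct cpumask * into constant per-CPU data, so no copy is made; that is also why the irq_set_affinity() and tick_setup_device() calls above now take the pointer directly.
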
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8f3fc2582d38..1b6c05bd0d0a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -301,7 +301,7 @@ void tick_nohz_stop_sched_tick(int inidle)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302 302
303 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
304 cpu_set(cpu, nohz_cpu_mask); 304 cpumask_set_cpu(cpu, nohz_cpu_mask);
305 305
306 /* Skip reprogram of event if its not changed */ 306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
@@ -319,7 +319,7 @@ void tick_nohz_stop_sched_tick(int inidle)
319 /* 319 /*
320 * sched tick not stopped! 320 * sched tick not stopped!
321 */ 321 */
322 cpu_clear(cpu, nohz_cpu_mask); 322 cpumask_clear_cpu(cpu, nohz_cpu_mask);
323 goto out; 323 goto out;
324 } 324 }
325 325
@@ -361,7 +361,7 @@ void tick_nohz_stop_sched_tick(int inidle)
361 * softirq. 361 * softirq.
362 */ 362 */
363 tick_do_update_jiffies64(ktime_get()); 363 tick_do_update_jiffies64(ktime_get());
364 cpu_clear(cpu, nohz_cpu_mask); 364 cpumask_clear_cpu(cpu, nohz_cpu_mask);
365 } 365 }
366 raise_softirq_irqoff(TIMER_SOFTIRQ); 366 raise_softirq_irqoff(TIMER_SOFTIRQ);
367out: 367out:
@@ -419,7 +419,9 @@ void tick_nohz_restart_sched_tick(void)
419{ 419{
420 int cpu = smp_processor_id(); 420 int cpu = smp_processor_id();
421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
422#ifndef CONFIG_VIRT_CPU_ACCOUNTING
422 unsigned long ticks; 423 unsigned long ticks;
424#endif
423 ktime_t now; 425 ktime_t now;
424 426
425 local_irq_disable(); 427 local_irq_disable();
@@ -439,8 +441,9 @@ void tick_nohz_restart_sched_tick(void)
439 select_nohz_load_balancer(0); 441 select_nohz_load_balancer(0);
440 now = ktime_get(); 442 now = ktime_get();
441 tick_do_update_jiffies64(now); 443 tick_do_update_jiffies64(now);
442 cpu_clear(cpu, nohz_cpu_mask); 444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
443 445
446#ifndef CONFIG_VIRT_CPU_ACCOUNTING
444 /* 447 /*
445 * We stopped the tick in idle. Update process times would miss the 448 * We stopped the tick in idle. Update process times would miss the
446 * time we slept as update_process_times does only a 1 tick 449 * time we slept as update_process_times does only a 1 tick
@@ -450,12 +453,9 @@ void tick_nohz_restart_sched_tick(void)
450 /* 453 /*
451 * We might be one off. Do not randomly account a huge number of ticks! 454 * We might be one off. Do not randomly account a huge number of ticks!
452 */ 455 */
453 if (ticks && ticks < LONG_MAX) { 456 if (ticks && ticks < LONG_MAX)
454 add_preempt_count(HARDIRQ_OFFSET); 457 account_idle_ticks(ticks);
455 account_system_time(current, HARDIRQ_OFFSET, 458#endif
456 jiffies_to_cputime(ticks));
457 sub_preempt_count(HARDIRQ_OFFSET);
458 }
459 459
460 touch_softlockup_watchdog(); 460 touch_softlockup_watchdog();
461 /* 461 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88aa76f..900f1b6598d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 46struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
48 48
49/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended;
51
49static struct timespec xtime_cache __attribute__ ((aligned (16))); 52static struct timespec xtime_cache __attribute__ ((aligned (16)));
50void update_xtime_cache(u64 nsec) 53void update_xtime_cache(u64 nsec)
51{ 54{
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts)
92 unsigned long seq; 95 unsigned long seq;
93 s64 nsecs; 96 s64 nsecs;
94 97
98 WARN_ON(timekeeping_suspended);
99
95 do { 100 do {
96 seq = read_seqbegin(&xtime_lock); 101 seq = read_seqbegin(&xtime_lock);
97 102
@@ -299,8 +304,6 @@ void __init timekeeping_init(void)
299 write_sequnlock_irqrestore(&xtime_lock, flags); 304 write_sequnlock_irqrestore(&xtime_lock, flags);
300} 305}
301 306
302/* flag for if timekeeping is suspended */
303static int timekeeping_suspended;
304/* time in seconds when suspend began */ 307/* time in seconds when suspend began */
305static unsigned long timekeeping_suspend_time; 308static unsigned long timekeeping_suspend_time;
306 309
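
Note: the timekeeping.c change moves the timekeeping_suspended flag above getnstimeofday(), makes it a non-static __read_mostly variable, and warns when the read path runs while timekeeping is suspended. A tiny sketch of that guard pattern; clock_suspended and read_clock_ns() are hypothetical names:

	#include <linux/kernel.h>
	#include <linux/cache.h>

	int __read_mostly clock_suspended;	/* set by suspend, cleared by resume */

	static u64 read_clock_ns(void)
	{
		/* Reads while suspended would return stale data; make that loud. */
		WARN_ON(clock_suspended);

		/* ... sample the clocksource under the usual seqlock here ... */
		return 0;
	}
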
diff --git a/kernel/timer.c b/kernel/timer.c
index 566257d1dc10..dee3f641a7a7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1018,21 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1018} 1018}
1019#endif 1019#endif
1020 1020
1021#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1022void account_process_tick(struct task_struct *p, int user_tick)
1023{
1024 cputime_t one_jiffy = jiffies_to_cputime(1);
1025
1026 if (user_tick) {
1027 account_user_time(p, one_jiffy);
1028 account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
1029 } else {
1030 account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
1031 account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
1032 }
1033}
1034#endif
1035
1036/* 1021/*
1037 * Called from the timer interrupt handler to charge one tick to the current 1022 * Called from the timer interrupt handler to charge one tick to the current
1038 * process. user_tick is 1 if the tick is user time, 0 for system. 1023 * process. user_tick is 1 if the tick is user time, 0 for system.
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d601a7c4587..a9d9760dc7b6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -195,7 +195,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
195EXPORT_SYMBOL_GPL(ring_buffer_event_data); 195EXPORT_SYMBOL_GPL(ring_buffer_event_data);
196 196
197#define for_each_buffer_cpu(buffer, cpu) \ 197#define for_each_buffer_cpu(buffer, cpu) \
198 for_each_cpu_mask(cpu, buffer->cpumask) 198 for_each_cpu(cpu, buffer->cpumask)
199 199
200#define TS_SHIFT 27 200#define TS_SHIFT 27
201#define TS_MASK ((1ULL << TS_SHIFT) - 1) 201#define TS_MASK ((1ULL << TS_SHIFT) - 1)
@@ -267,7 +267,7 @@ struct ring_buffer {
267 unsigned pages; 267 unsigned pages;
268 unsigned flags; 268 unsigned flags;
269 int cpus; 269 int cpus;
270 cpumask_t cpumask; 270 cpumask_var_t cpumask;
271 atomic_t record_disabled; 271 atomic_t record_disabled;
272 272
273 struct mutex mutex; 273 struct mutex mutex;
@@ -458,6 +458,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
458 if (!buffer) 458 if (!buffer)
459 return NULL; 459 return NULL;
460 460
461 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
462 goto fail_free_buffer;
463
461 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 464 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
462 buffer->flags = flags; 465 buffer->flags = flags;
463 466
@@ -465,14 +468,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
465 if (buffer->pages == 1) 468 if (buffer->pages == 1)
466 buffer->pages++; 469 buffer->pages++;
467 470
468 buffer->cpumask = cpu_possible_map; 471 cpumask_copy(buffer->cpumask, cpu_possible_mask);
469 buffer->cpus = nr_cpu_ids; 472 buffer->cpus = nr_cpu_ids;
470 473
471 bsize = sizeof(void *) * nr_cpu_ids; 474 bsize = sizeof(void *) * nr_cpu_ids;
472 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), 475 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
473 GFP_KERNEL); 476 GFP_KERNEL);
474 if (!buffer->buffers) 477 if (!buffer->buffers)
475 goto fail_free_buffer; 478 goto fail_free_cpumask;
476 479
477 for_each_buffer_cpu(buffer, cpu) { 480 for_each_buffer_cpu(buffer, cpu) {
478 buffer->buffers[cpu] = 481 buffer->buffers[cpu] =
@@ -492,6 +495,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
492 } 495 }
493 kfree(buffer->buffers); 496 kfree(buffer->buffers);
494 497
498 fail_free_cpumask:
499 free_cpumask_var(buffer->cpumask);
500
495 fail_free_buffer: 501 fail_free_buffer:
496 kfree(buffer); 502 kfree(buffer);
497 return NULL; 503 return NULL;
@@ -510,6 +516,8 @@ ring_buffer_free(struct ring_buffer *buffer)
510 for_each_buffer_cpu(buffer, cpu) 516 for_each_buffer_cpu(buffer, cpu)
511 rb_free_cpu_buffer(buffer->buffers[cpu]); 517 rb_free_cpu_buffer(buffer->buffers[cpu]);
512 518
519 free_cpumask_var(buffer->cpumask);
520
513 kfree(buffer); 521 kfree(buffer);
514} 522}
515EXPORT_SYMBOL_GPL(ring_buffer_free); 523EXPORT_SYMBOL_GPL(ring_buffer_free);
@@ -1283,7 +1291,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1283 1291
1284 cpu = raw_smp_processor_id(); 1292 cpu = raw_smp_processor_id();
1285 1293
1286 if (!cpu_isset(cpu, buffer->cpumask)) 1294 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1287 goto out; 1295 goto out;
1288 1296
1289 cpu_buffer = buffer->buffers[cpu]; 1297 cpu_buffer = buffer->buffers[cpu];
@@ -1396,7 +1404,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1396 1404
1397 cpu = raw_smp_processor_id(); 1405 cpu = raw_smp_processor_id();
1398 1406
1399 if (!cpu_isset(cpu, buffer->cpumask)) 1407 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1400 goto out; 1408 goto out;
1401 1409
1402 cpu_buffer = buffer->buffers[cpu]; 1410 cpu_buffer = buffer->buffers[cpu];
@@ -1478,7 +1486,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1478{ 1486{
1479 struct ring_buffer_per_cpu *cpu_buffer; 1487 struct ring_buffer_per_cpu *cpu_buffer;
1480 1488
1481 if (!cpu_isset(cpu, buffer->cpumask)) 1489 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1482 return; 1490 return;
1483 1491
1484 cpu_buffer = buffer->buffers[cpu]; 1492 cpu_buffer = buffer->buffers[cpu];
@@ -1498,7 +1506,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1498{ 1506{
1499 struct ring_buffer_per_cpu *cpu_buffer; 1507 struct ring_buffer_per_cpu *cpu_buffer;
1500 1508
1501 if (!cpu_isset(cpu, buffer->cpumask)) 1509 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1502 return; 1510 return;
1503 1511
1504 cpu_buffer = buffer->buffers[cpu]; 1512 cpu_buffer = buffer->buffers[cpu];
@@ -1515,7 +1523,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1515{ 1523{
1516 struct ring_buffer_per_cpu *cpu_buffer; 1524 struct ring_buffer_per_cpu *cpu_buffer;
1517 1525
1518 if (!cpu_isset(cpu, buffer->cpumask)) 1526 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1519 return 0; 1527 return 0;
1520 1528
1521 cpu_buffer = buffer->buffers[cpu]; 1529 cpu_buffer = buffer->buffers[cpu];
@@ -1532,7 +1540,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1532{ 1540{
1533 struct ring_buffer_per_cpu *cpu_buffer; 1541 struct ring_buffer_per_cpu *cpu_buffer;
1534 1542
1535 if (!cpu_isset(cpu, buffer->cpumask)) 1543 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1536 return 0; 1544 return 0;
1537 1545
1538 cpu_buffer = buffer->buffers[cpu]; 1546 cpu_buffer = buffer->buffers[cpu];
@@ -1850,7 +1858,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1850 struct buffer_page *reader; 1858 struct buffer_page *reader;
1851 int nr_loops = 0; 1859 int nr_loops = 0;
1852 1860
1853 if (!cpu_isset(cpu, buffer->cpumask)) 1861 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1854 return NULL; 1862 return NULL;
1855 1863
1856 cpu_buffer = buffer->buffers[cpu]; 1864 cpu_buffer = buffer->buffers[cpu];
@@ -2025,7 +2033,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2025 struct ring_buffer_event *event; 2033 struct ring_buffer_event *event;
2026 unsigned long flags; 2034 unsigned long flags;
2027 2035
2028 if (!cpu_isset(cpu, buffer->cpumask)) 2036 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2029 return NULL; 2037 return NULL;
2030 2038
2031 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2039 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2062,7 +2070,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2062 struct ring_buffer_iter *iter; 2070 struct ring_buffer_iter *iter;
2063 unsigned long flags; 2071 unsigned long flags;
2064 2072
2065 if (!cpu_isset(cpu, buffer->cpumask)) 2073 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2066 return NULL; 2074 return NULL;
2067 2075
2068 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 2076 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
@@ -2172,7 +2180,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2172 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2180 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2173 unsigned long flags; 2181 unsigned long flags;
2174 2182
2175 if (!cpu_isset(cpu, buffer->cpumask)) 2183 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2176 return; 2184 return;
2177 2185
2178 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2186 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2228,7 +2236,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2228{ 2236{
2229 struct ring_buffer_per_cpu *cpu_buffer; 2237 struct ring_buffer_per_cpu *cpu_buffer;
2230 2238
2231 if (!cpu_isset(cpu, buffer->cpumask)) 2239 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2232 return 1; 2240 return 1;
2233 2241
2234 cpu_buffer = buffer->buffers[cpu]; 2242 cpu_buffer = buffer->buffers[cpu];
@@ -2252,8 +2260,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2252 struct ring_buffer_per_cpu *cpu_buffer_a; 2260 struct ring_buffer_per_cpu *cpu_buffer_a;
2253 struct ring_buffer_per_cpu *cpu_buffer_b; 2261 struct ring_buffer_per_cpu *cpu_buffer_b;
2254 2262
2255 if (!cpu_isset(cpu, buffer_a->cpumask) || 2263 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2256 !cpu_isset(cpu, buffer_b->cpumask)) 2264 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2257 return -EINVAL; 2265 return -EINVAL;
2258 2266
2259 /* At least make sure the two buffers are somewhat the same */ 2267 /* At least make sure the two buffers are somewhat the same */
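
Note: ring_buffer.c is the template for the cpumask_var_t conversions in this series: the embedded cpumask_t field becomes a cpumask_var_t, the allocator gains an alloc_cpumask_var() step with its own unwind label, and every teardown path gains a matching free_cpumask_var(). A condensed sketch of that allocate/unwind shape, using hypothetical struct and function names:

	#include <linux/cpumask.h>
	#include <linux/slab.h>

	/* Hypothetical structure mirroring the embedded-mask -> cpumask_var_t move. */
	struct percpu_thing {
		cpumask_var_t	cpumask;
		void		**buffers;
	};

	static struct percpu_thing *percpu_thing_alloc(void)
	{
		struct percpu_thing *t = kzalloc(sizeof(*t), GFP_KERNEL);

		if (!t)
			return NULL;

		if (!alloc_cpumask_var(&t->cpumask, GFP_KERNEL))
			goto fail_free_thing;

		cpumask_copy(t->cpumask, cpu_possible_mask);

		t->buffers = kzalloc(nr_cpu_ids * sizeof(void *), GFP_KERNEL);
		if (!t->buffers)
			goto fail_free_cpumask;

		return t;

	 fail_free_cpumask:
		free_cpumask_var(t->cpumask);
	 fail_free_thing:
		kfree(t);
		return NULL;
	}

	static void percpu_thing_free(struct percpu_thing *t)
	{
		kfree(t->buffers);
		free_cpumask_var(t->cpumask);	/* mirror every successful alloc */
		kfree(t);
	}

With CONFIG_CPUMASK_OFFSTACK=y the mask really is a separate allocation, so a missing free_cpumask_var() in an error path or destructor leaks memory; with it off, the alloc/free calls compile down to almost nothing.
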
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4185d5221633..c580233add95 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void)
89 preempt_enable(); 89 preempt_enable();
90} 90}
91 91
92static cpumask_t __read_mostly tracing_buffer_mask; 92static cpumask_var_t __read_mostly tracing_buffer_mask;
93 93
94#define for_each_tracing_cpu(cpu) \ 94#define for_each_tracing_cpu(cpu) \
95 for_each_cpu_mask(cpu, tracing_buffer_mask) 95 for_each_cpu(cpu, tracing_buffer_mask)
96 96
97/* 97/*
98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) 1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1812 return; 1812 return;
1813 1813
1814 if (cpu_isset(iter->cpu, iter->started)) 1814 if (cpumask_test_cpu(iter->cpu, iter->started))
1815 return; 1815 return;
1816 1816
1817 cpu_set(iter->cpu, iter->started); 1817 cpumask_set_cpu(iter->cpu, iter->started);
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819} 1819}
1820 1820
@@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = {
2646/* 2646/*
2647 * Only trace on a CPU if the bitmask is set: 2647 * Only trace on a CPU if the bitmask is set:
2648 */ 2648 */
2649static cpumask_t tracing_cpumask = CPU_MASK_ALL; 2649static cpumask_var_t tracing_cpumask;
2650
2651/*
2652 * When tracing/tracing_cpu_mask is modified then this holds
2653 * the new bitmask we are about to install:
2654 */
2655static cpumask_t tracing_cpumask_new;
2656 2650
2657/* 2651/*
2658 * The tracer itself will not take this lock, but still we want 2652 * The tracer itself will not take this lock, but still we want
@@ -2693,6 +2687,10 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2693 size_t count, loff_t *ppos) 2687 size_t count, loff_t *ppos)
2694{ 2688{
2695 int err, cpu; 2689 int err, cpu;
2690 cpumask_var_t tracing_cpumask_new;
2691
2692 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2693 return -ENOMEM;
2696 2694
2697 mutex_lock(&tracing_cpumask_update_lock); 2695 mutex_lock(&tracing_cpumask_update_lock);
2698 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2696 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2706 * Increase/decrease the disabled counter if we are 2704 * Increase/decrease the disabled counter if we are
2707 * about to flip a bit in the cpumask: 2705 * about to flip a bit in the cpumask:
2708 */ 2706 */
2709 if (cpu_isset(cpu, tracing_cpumask) && 2707 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2710 !cpu_isset(cpu, tracing_cpumask_new)) { 2708 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2711 atomic_inc(&global_trace.data[cpu]->disabled); 2709 atomic_inc(&global_trace.data[cpu]->disabled);
2712 } 2710 }
2713 if (!cpu_isset(cpu, tracing_cpumask) && 2711 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2714 cpu_isset(cpu, tracing_cpumask_new)) { 2712 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2715 atomic_dec(&global_trace.data[cpu]->disabled); 2713 atomic_dec(&global_trace.data[cpu]->disabled);
2716 } 2714 }
2717 } 2715 }
2718 __raw_spin_unlock(&ftrace_max_lock); 2716 __raw_spin_unlock(&ftrace_max_lock);
2719 local_irq_enable(); 2717 local_irq_enable();
2720 2718
2721 tracing_cpumask = tracing_cpumask_new; 2719 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
2722 2720
2723 mutex_unlock(&tracing_cpumask_update_lock); 2721 mutex_unlock(&tracing_cpumask_update_lock);
2722 free_cpumask_var(tracing_cpumask_new);
2724 2723
2725 return count; 2724 return count;
2726 2725
2727err_unlock: 2726err_unlock:
2728 mutex_unlock(&tracing_cpumask_update_lock); 2727 mutex_unlock(&tracing_cpumask_update_lock);
 2728 free_cpumask_var(tracing_cpumask_new);
2729 2729
2730 return err; 2730 return err;
2731} 2731}
@@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3114 if (!iter) 3114 if (!iter)
3115 return -ENOMEM; 3115 return -ENOMEM;
3116 3116
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3118 kfree(iter);
3119 return -ENOMEM;
3120 }
3121
3117 mutex_lock(&trace_types_lock); 3122 mutex_lock(&trace_types_lock);
3118 3123
3119 /* trace pipe does not show start of buffer */ 3124 /* trace pipe does not show start of buffer */
3120 cpus_setall(iter->started); 3125 cpumask_setall(iter->started);
3121 3126
3122 iter->tr = &global_trace; 3127 iter->tr = &global_trace;
3123 iter->trace = current_trace; 3128 iter->trace = current_trace;
@@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3134{ 3139{
3135 struct trace_iterator *iter = file->private_data; 3140 struct trace_iterator *iter = file->private_data;
3136 3141
3142 free_cpumask_var(iter->started);
3137 kfree(iter); 3143 kfree(iter);
3138 atomic_dec(&tracing_reader); 3144 atomic_dec(&tracing_reader);
3139 3145
@@ -3752,7 +3758,6 @@ void ftrace_dump(void)
3752 static DEFINE_SPINLOCK(ftrace_dump_lock); 3758 static DEFINE_SPINLOCK(ftrace_dump_lock);
3753 /* use static because iter can be a bit big for the stack */ 3759 /* use static because iter can be a bit big for the stack */
3754 static struct trace_iterator iter; 3760 static struct trace_iterator iter;
3755 static cpumask_t mask;
3756 static int dump_ran; 3761 static int dump_ran;
3757 unsigned long flags; 3762 unsigned long flags;
3758 int cnt = 0, cpu; 3763 int cnt = 0, cpu;
@@ -3786,8 +3791,6 @@ void ftrace_dump(void)
3786 * and then release the locks again. 3791 * and then release the locks again.
3787 */ 3792 */
3788 3793
3789 cpus_clear(mask);
3790
3791 while (!trace_empty(&iter)) { 3794 while (!trace_empty(&iter)) {
3792 3795
3793 if (!cnt) 3796 if (!cnt)
@@ -3823,19 +3826,28 @@ __init static int tracer_alloc_buffers(void)
3823{ 3826{
3824 struct trace_array_cpu *data; 3827 struct trace_array_cpu *data;
3825 int i; 3828 int i;
3829 int ret = -ENOMEM;
3826 3830
3827 /* TODO: make the number of buffers hot pluggable with CPUS */ 3831 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
3828 tracing_buffer_mask = cpu_possible_map; 3832 goto out;
3833
3834 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3835 goto out_free_buffer_mask;
3836
3837 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3838 cpumask_copy(tracing_cpumask, cpu_all_mask);
3829 3839
3840 /* TODO: make the number of buffers hot pluggable with CPUS */
3830 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 3841 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3831 TRACE_BUFFER_FLAGS); 3842 TRACE_BUFFER_FLAGS);
3832 if (!global_trace.buffer) { 3843 if (!global_trace.buffer) {
3833 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 3844 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3834 WARN_ON(1); 3845 WARN_ON(1);
3835 return 0; 3846 goto out_free_cpumask;
3836 } 3847 }
3837 global_trace.entries = ring_buffer_size(global_trace.buffer); 3848 global_trace.entries = ring_buffer_size(global_trace.buffer);
3838 3849
3850
3839#ifdef CONFIG_TRACER_MAX_TRACE 3851#ifdef CONFIG_TRACER_MAX_TRACE
3840 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 3852 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3841 TRACE_BUFFER_FLAGS); 3853 TRACE_BUFFER_FLAGS);
@@ -3843,7 +3855,7 @@ __init static int tracer_alloc_buffers(void)
3843 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 3855 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3844 WARN_ON(1); 3856 WARN_ON(1);
3845 ring_buffer_free(global_trace.buffer); 3857 ring_buffer_free(global_trace.buffer);
3846 return 0; 3858 goto out_free_cpumask;
3847 } 3859 }
3848 max_tr.entries = ring_buffer_size(max_tr.buffer); 3860 max_tr.entries = ring_buffer_size(max_tr.buffer);
3849 WARN_ON(max_tr.entries != global_trace.entries); 3861 WARN_ON(max_tr.entries != global_trace.entries);
@@ -3873,8 +3885,14 @@ __init static int tracer_alloc_buffers(void)
3873 &trace_panic_notifier); 3885 &trace_panic_notifier);
3874 3886
3875 register_die_notifier(&trace_die_notifier); 3887 register_die_notifier(&trace_die_notifier);
3888 ret = 0;
3876 3889
3877 return 0; 3890out_free_cpumask:
3891 free_cpumask_var(tracing_cpumask);
3892out_free_buffer_mask:
3893 free_cpumask_var(tracing_buffer_mask);
3894out:
3895 return ret;
3878} 3896}
3879early_initcall(tracer_alloc_buffers); 3897early_initcall(tracer_alloc_buffers);
3880fs_initcall(tracer_init_debugfs); 3898fs_initcall(tracer_init_debugfs);
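
Note: trace.c applies the same conversion to tracing_buffer_mask and tracing_cpumask, and in addition turns tracing_cpumask_new from a long-lived global into a temporary allocated inside the write handler. A sketch of that handler shape, with hypothetical names (active_mask, mask_write); the error path must free the temporary mask, not the live one:

	#include <linux/fs.h>
	#include <linux/gfp.h>
	#include <linux/cpumask.h>
	#include <linux/uaccess.h>

	static cpumask_var_t active_mask;	/* hypothetical; allocated at init time */

	static ssize_t mask_write(struct file *filp, const char __user *ubuf,
				  size_t count, loff_t *ppos)
	{
		cpumask_var_t new_mask;
		int err;

		if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
			return -ENOMEM;

		err = cpumask_parse_user(ubuf, count, new_mask);
		if (err) {
			free_cpumask_var(new_mask);	/* free the temporary, not active_mask */
			return err;
		}

		/* The real handler also takes a lock and adjusts per-cpu state here. */
		cpumask_copy(active_mask, new_mask);
		free_cpumask_var(new_mask);

		return count;
	}
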
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f864036..4d3d381bfd95 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -368,7 +368,7 @@ struct trace_iterator {
368 loff_t pos; 368 loff_t pos;
369 long idx; 369 long idx;
370 370
371 cpumask_t started; 371 cpumask_var_t started;
372}; 372};
373 373
374int tracing_is_enabled(void); 374int tracing_is_enabled(void);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3ccebde28482..366c8c333e13 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr)
42 int cpu; 42 int cpu;
43 boot_trace = tr; 43 boot_trace = tr;
44 44
45 for_each_cpu_mask(cpu, cpu_possible_map) 45 for_each_cpu(cpu, cpu_possible_mask)
46 tracing_reset(tr, cpu); 46 tracing_reset(tr, cpu);
47 47
48 tracing_sched_switch_assign_trace(tr); 48 tracing_sched_switch_assign_trace(tr);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4bf39fcae97a..930c08e5b38e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu)
79 int i; 79 int i;
80 int ret; 80 int ret;
81 int log10_this = log10_cpu(cpu); 81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); 82 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
83 83
84 84
85 /* 85 /*
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index b6a3e20a49a9..649df22d435f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr)
46 46
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48 48
49 for_each_cpu_mask(cpu, cpu_possible_map) 49 for_each_cpu(cpu, cpu_possible_mask)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51} 51}
52 52
@@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr)
62{ 62{
63 int cpu; 63 int cpu;
64 64
65 for_each_cpu_mask(cpu, cpu_possible_map) 65 for_each_cpu(cpu, cpu_possible_mask)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67} 67}
68 68
@@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter)
172{ 172{
173 int cpu; 173 int cpu;
174 174
175 for_each_cpu_mask(cpu, cpu_possible_map) 175 for_each_cpu(cpu, cpu_possible_mask)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); 176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177} 177}
178 178
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a352f62..7bda248daf55 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr)
39 39
40 trace_power_enabled = 1; 40 trace_power_enabled = 1;
41 41
42 for_each_cpu_mask(cpu, cpu_possible_map) 42 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 43 tracing_reset(tr, cpu);
44 return 0; 44 return 0;
45} 45}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index a5779bd975db..eaca5ad803ff 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
196 return HRTIMER_RESTART; 196 return HRTIMER_RESTART;
197} 197}
198 198
199static void start_stack_timer(int cpu) 199static void start_stack_timer(void *unused)
200{ 200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); 201 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
@@ -208,14 +208,7 @@ static void start_stack_timer(int cpu)
208 208
209static void start_stack_timers(void) 209static void start_stack_timers(void)
210{ 210{
211 cpumask_t saved_mask = current->cpus_allowed; 211 on_each_cpu(start_stack_timer, NULL, 1);
212 int cpu;
213
214 for_each_online_cpu(cpu) {
215 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
216 start_stack_timer(cpu);
217 }
218 set_cpus_allowed_ptr(current, &saved_mask);
219} 212}
220 213
221static void stop_stack_timer(int cpu) 214static void stop_stack_timer(int cpu)
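
Note: start_stack_timers() above stops bouncing the current task across CPUs with set_cpus_allowed_ptr() and instead uses on_each_cpu(), which runs the callback on every online CPU (the local one directly, the others via IPI) and, with wait=1, returns only after all of them have finished. A minimal sketch of that substitution; setup_on_this_cpu() and setup_everywhere() are hypothetical names:

	#include <linux/smp.h>

	/* Runs once on each online CPU; must not sleep. */
	static void setup_on_this_cpu(void *unused)
	{
		/* per-CPU initialisation goes here, e.g. via __get_cpu_var() data */
	}

	static void setup_everywhere(void)
	{
		/*
		 * Run the callback everywhere and wait for completion, instead of
		 * migrating the current task from CPU to CPU.
		 */
		on_each_cpu(setup_on_this_cpu, NULL, 1);
	}
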
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 2dc06ab35716..43f891b05a4b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
92 mm = get_task_mm(p); 92 mm = get_task_mm(p);
93 if (mm) { 93 if (mm) {
94 /* adjust to KB unit */ 94 /* adjust to KB unit */
95 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; 95 stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
96 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
97 mmput(mm); 97 mmput(mm);
98 } 98 }
99 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4952322cba45..2f445833ae37 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 73static LIST_HEAD(workqueues);
74 74
75static int singlethread_cpu __read_mostly; 75static int singlethread_cpu __read_mostly;
76static cpumask_t cpu_singlethread_map __read_mostly; 76static const struct cpumask *cpu_singlethread_map __read_mostly;
77/* 77/*
78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
@@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly;
81 * use cpu_possible_map, the cpumask below is more a documentation 81 * use cpu_possible_map, the cpumask below is more a documentation
82 * than optimization. 82 * than optimization.
83 */ 83 */
84static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_var_t cpu_populated_map __read_mostly;
85 85
86/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
87static inline int is_wq_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
@@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq)
89 return wq->singlethread; 89 return wq->singlethread;
90} 90}
91 91
92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
93{ 93{
94 return is_wq_single_threaded(wq) 94 return is_wq_single_threaded(wq)
95 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? cpu_singlethread_map : cpu_populated_map;
96} 96}
97 97
98static 98static
@@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
410 */ 410 */
411void flush_workqueue(struct workqueue_struct *wq) 411void flush_workqueue(struct workqueue_struct *wq)
412{ 412{
413 const cpumask_t *cpu_map = wq_cpu_map(wq); 413 const struct cpumask *cpu_map = wq_cpu_map(wq);
414 int cpu; 414 int cpu;
415 415
416 might_sleep(); 416 might_sleep();
@@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work)
532{ 532{
533 struct cpu_workqueue_struct *cwq; 533 struct cpu_workqueue_struct *cwq;
534 struct workqueue_struct *wq; 534 struct workqueue_struct *wq;
535 const cpumask_t *cpu_map; 535 const struct cpumask *cpu_map;
536 int cpu; 536 int cpu;
537 537
538 might_sleep(); 538 might_sleep();
@@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
903 */ 903 */
904void destroy_workqueue(struct workqueue_struct *wq) 904void destroy_workqueue(struct workqueue_struct *wq)
905{ 905{
906 const cpumask_t *cpu_map = wq_cpu_map(wq); 906 const struct cpumask *cpu_map = wq_cpu_map(wq);
907 int cpu; 907 int cpu;
908 908
909 cpu_maps_update_begin(); 909 cpu_maps_update_begin();
@@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
933 933
934 switch (action) { 934 switch (action) {
935 case CPU_UP_PREPARE: 935 case CPU_UP_PREPARE:
936 cpu_set(cpu, cpu_populated_map); 936 cpumask_set_cpu(cpu, cpu_populated_map);
937 } 937 }
938undo: 938undo:
939 list_for_each_entry(wq, &workqueues, list) { 939 list_for_each_entry(wq, &workqueues, list) {
@@ -964,7 +964,7 @@ undo:
964 switch (action) { 964 switch (action) {
965 case CPU_UP_CANCELED: 965 case CPU_UP_CANCELED:
966 case CPU_POST_DEAD: 966 case CPU_POST_DEAD:
967 cpu_clear(cpu, cpu_populated_map); 967 cpumask_clear_cpu(cpu, cpu_populated_map);
968 } 968 }
969 969
970 return ret; 970 return ret;
@@ -1017,9 +1017,11 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
1017 1017
1018void __init init_workqueues(void) 1018void __init init_workqueues(void)
1019{ 1019{
1020 cpu_populated_map = cpu_online_map; 1020 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1021 singlethread_cpu = first_cpu(cpu_possible_map); 1021
1022 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); 1022 cpumask_copy(cpu_populated_map, cpu_online_mask);
1023 singlethread_cpu = cpumask_first(cpu_possible_mask);
1024 cpu_singlethread_map = cpumask_of(singlethread_cpu);
1023 hotcpu_notifier(workqueue_cpu_callback, 0); 1025 hotcpu_notifier(workqueue_cpu_callback, 0);
1024 keventd_wq = create_workqueue("events"); 1026 keventd_wq = create_workqueue("events");
1025 BUG_ON(!keventd_wq); 1027 BUG_ON(!keventd_wq);
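
Note: workqueue.c shows the two destinations for a former cpumask_t global: cpu_populated_map, which changes at CPU hotplug, becomes a cpumask_var_t that is allocated and copied at init, while cpu_singlethread_map, which only ever names one CPU, becomes a plain const struct cpumask * pointing at cpumask_of(). A sketch of that split, with hypothetical names:

	#include <linux/gfp.h>
	#include <linux/cpumask.h>

	static cpumask_var_t populated_map;		/* updated on CPU hotplug */
	static const struct cpumask *singlethread_map;	/* points at a constant mask */

	static void masks_init(void)	/* hypothetical boot-time init */
	{
		int single_cpu;

		alloc_cpumask_var(&populated_map, GFP_KERNEL);
		cpumask_copy(populated_map, cpu_online_mask);

		single_cpu = cpumask_first(cpu_possible_mask);
		singlethread_map = cpumask_of(single_cpu);	/* no copy needed for a 1-CPU mask */
	}

As in the hunk above, the boot-time alloc_cpumask_var() return value is left unchecked here to mirror the patch; a more defensive caller would handle allocation failure.
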