Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/acct.c | 2
-rw-r--r--  kernel/async.c | 350
-rw-r--r--  kernel/audit.h | 5
-rw-r--r--  kernel/audit_tree.c | 3
-rw-r--r--  kernel/auditfilter.c | 325
-rw-r--r--  kernel/auditsc.c | 739
-rw-r--r--  kernel/capability.c | 8
-rw-r--r--  kernel/cgroup.c | 324
-rw-r--r--  kernel/compat.c | 54
-rw-r--r--  kernel/cpu.c | 157
-rw-r--r--  kernel/cpuset.c | 300
-rw-r--r--  kernel/cred.c | 5
-rw-r--r--  kernel/dma-coherent.c | 47
-rw-r--r--  kernel/exec_domain.c | 3
-rw-r--r--  kernel/exit.c | 38
-rw-r--r--  kernel/fork.c | 45
-rw-r--r--  kernel/futex.c | 89
-rw-r--r--  kernel/hrtimer.c | 192
-rw-r--r--  kernel/irq/autoprobe.c | 20
-rw-r--r--  kernel/irq/chip.c | 4
-rw-r--r--  kernel/irq/handle.c | 64
-rw-r--r--  kernel/irq/manage.c | 27
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/numa_migrate.c | 14
-rw-r--r--  kernel/irq/proc.c | 57
-rw-r--r--  kernel/irq/spurious.c | 5
-rw-r--r--  kernel/itimer.c | 7
-rw-r--r--  kernel/kallsyms.c | 16
-rw-r--r--  kernel/kexec.c | 7
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 283
-rw-r--r--  kernel/ksysfs.c | 4
-rw-r--r--  kernel/module.c | 139
-rw-r--r--  kernel/ns_cgroup.c | 2
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid.c | 8
-rw-r--r--  kernel/posix-cpu-timers.c | 70
-rw-r--r--  kernel/posix-timers.c | 43
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/disk.c | 16
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/snapshot.c | 370
-rw-r--r--  kernel/power/swsusp.c | 122
-rw-r--r--  kernel/printk.c | 9
-rw-r--r--  kernel/profile.c | 39
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/rcuclassic.c | 34
-rw-r--r--  kernel/rcupdate.c | 11
-rw-r--r--  kernel/rcupreempt.c | 30
-rw-r--r--  kernel/rcutorture.c | 94
-rw-r--r--  kernel/rcutree.c | 15
-rw-r--r--  kernel/relay.c | 4
-rw-r--r--  kernel/res_counter.c | 44
-rw-r--r--  kernel/resource.c | 62
-rw-r--r--  kernel/sched.c | 1175
-rw-r--r--  kernel/sched_clock.c | 5
-rw-r--r--  kernel/sched_cpupri.c | 39
-rw-r--r--  kernel/sched_cpupri.h | 5
-rw-r--r--  kernel/sched_debug.c | 21
-rw-r--r--  kernel/sched_fair.c | 131
-rw-r--r--  kernel/sched_rt.c | 78
-rw-r--r--  kernel/sched_stats.h | 36
-rw-r--r--  kernel/signal.c | 64
-rw-r--r--  kernel/smp.c | 181
-rw-r--r--  kernel/softirq.c | 22
-rw-r--r--  kernel/softlockup.c | 19
-rw-r--r--  kernel/stop_machine.c | 63
-rw-r--r--  kernel/sys.c | 90
-rw-r--r--  kernel/sys_ni.c | 1
-rw-r--r--  kernel/sysctl.c | 56
-rw-r--r--  kernel/taskstats.c | 41
-rw-r--r--  kernel/test_kprobes.c | 210
-rw-r--r--  kernel/time.c | 18
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/clocksource.c | 9
-rw-r--r--  kernel/time/jiffies.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 113
-rw-r--r--  kernel/time/tick-common.c | 40
-rw-r--r--  kernel/time/tick-sched.c | 24
-rw-r--r--  kernel/time/timekeeping.c | 7
-rw-r--r--  kernel/timer.c | 33
-rw-r--r--  kernel/trace/ftrace.c | 32
-rw-r--r--  kernel/trace/ring_buffer.c | 65
-rw-r--r--  kernel/trace/trace.c | 73
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_boot.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_hw_branches.c | 6
-rw-r--r--  kernel/trace/trace_irqsoff.c | 1
-rw-r--r--  kernel/trace/trace_power.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 1
-rw-r--r--  kernel/trace/trace_sysprof.c | 13
-rw-r--r--  kernel/tsacct.c | 4
-rw-r--r--  kernel/uid16.c | 39
-rw-r--r--  kernel/up.c | 21
-rw-r--r--  kernel/wait.c | 59
-rw-r--r--  kernel/workqueue.c | 46
99 files changed, 4364 insertions, 2834 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e1c5bf3365c0..170a9213c1b6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o
13 14
14ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
15# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -40,6 +41,9 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
40obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o 41obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
41obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 42obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
42obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o 43obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
44ifneq ($(CONFIG_SMP),y)
45obj-y += up.o
46endif
43obj-$(CONFIG_SMP) += spinlock.o 47obj-$(CONFIG_SMP) += spinlock.o
44obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 48obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
45obj-$(CONFIG_PROVE_LOCKING) += spinlock.o 49obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index d57b7cbb98b6..7afa31564162 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -277,7 +277,7 @@ static int acct_on(char *name)
277 * should be written. If the filename is NULL, accounting will be 277 * should be written. If the filename is NULL, accounting will be
278 * shutdown. 278 * shutdown.
279 */ 279 */
280asmlinkage long sys_acct(const char __user *name) 280SYSCALL_DEFINE1(acct, const char __user *, name)
281{ 281{
282 int error; 282 int error;
283 283
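Illustration (not part of the patch): the acct.c hunk above converts the
open-coded "asmlinkage long sys_acct(...)" definition to the SYSCALL_DEFINEn
macro family, which generates the asmlinkage sys_<name>() definition from the
syscall name followed by a type/name pair per argument. A minimal sketch of
the pattern, using a made-up two-argument syscall and a hypothetical
do_example() helper:

SYSCALL_DEFINE2(example, int, fd, unsigned long, flags)
{
	return do_example(fd, flags);	/* do_example() is hypothetical */
}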
diff --git a/kernel/async.c b/kernel/async.c
new file mode 100644
index 000000000000..67a2be71f517
--- /dev/null
+++ b/kernel/async.c
@@ -0,0 +1,350 @@
1/*
2 * async.c: Asynchronous function calls for boot performance
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13
14/*
15
16Goals and Theory of Operation
17
 18The primary goal of this feature is to reduce the kernel boot time, by
 19running various independent hardware delays and discovery operations
 20decoupled rather than strictly serialized.
21
22More specifically, the asynchronous function call concept allows
23certain operations (primarily during system boot) to happen
24asynchronously, out of order, while these operations still
25have their externally visible parts happen sequentially and in-order.
26(not unlike how out-of-order CPUs retire their instructions in order)
27
28Key to the asynchronous function call implementation is the concept of
29a "sequence cookie" (which, although it has an abstracted type, can be
30thought of as a monotonically incrementing number).
31
32The async core will assign each scheduled event such a sequence cookie and
33pass this to the called functions.
34
 35Before doing a globally visible operation, such as registering device
 36numbers, the asynchronously called function should call the
 37async_synchronize_cookie() function and pass in its own cookie. The
38async_synchronize_cookie() function will make sure that all asynchronous
39operations that were scheduled prior to the operation corresponding with the
40cookie have completed.
41
 42Subsystem/driver initialization code that schedules asynchronous probe
 43functions, but which shares global resources with other drivers/subsystems
 44that do not use the asynchronous call feature, needs to do a full
 45synchronization with the async_synchronize_full() function before returning
 46from its init function. This is to maintain strict ordering between the
47asynchronous and synchronous parts of the kernel.
48
49*/
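/*
 * Editor's illustration (not part of the original patch): a minimal sketch,
 * under the API above, of how a driver's probe path might use these calls.
 * my_probe_one(), my_hw_init(), my_register_devnode() and struct my_device
 * are hypothetical; only async_schedule(), async_synchronize_cookie() and
 * async_synchronize_full() come from this file.
 *
 *	static void my_probe_one(void *data, async_cookie_t cookie)
 *	{
 *		struct my_device *dev = data;
 *
 *		my_hw_init(dev);                  // slow part, may run out of order
 *		async_synchronize_cookie(cookie); // wait for earlier scheduled work
 *		my_register_devnode(dev);         // globally visible, stays in order
 *	}
 *
 *	// In the (hypothetical) bus scan, then before leaving the initcall:
 *	//	async_schedule(my_probe_one, dev);
 *	//	async_synchronize_full();
 */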
50
51#include <linux/async.h>
52#include <linux/module.h>
53#include <linux/wait.h>
54#include <linux/sched.h>
55#include <linux/init.h>
56#include <linux/kthread.h>
57#include <asm/atomic.h>
58
59static async_cookie_t next_cookie = 1;
60
61#define MAX_THREADS 256
62#define MAX_WORK 32768
63
64static LIST_HEAD(async_pending);
65static LIST_HEAD(async_running);
66static DEFINE_SPINLOCK(async_lock);
67
68static int async_enabled = 0;
69
70struct async_entry {
71 struct list_head list;
72 async_cookie_t cookie;
73 async_func_ptr *func;
74 void *data;
75 struct list_head *running;
76};
77
78static DECLARE_WAIT_QUEUE_HEAD(async_done);
79static DECLARE_WAIT_QUEUE_HEAD(async_new);
80
81static atomic_t entry_count;
82static atomic_t thread_count;
83
84extern int initcall_debug;
85
86
87/*
88 * MUST be called with the lock held!
89 */
90static async_cookie_t __lowest_in_progress(struct list_head *running)
91{
92 struct async_entry *entry;
93 if (!list_empty(running)) {
94 entry = list_first_entry(running,
95 struct async_entry, list);
96 return entry->cookie;
97 } else if (!list_empty(&async_pending)) {
98 entry = list_first_entry(&async_pending,
99 struct async_entry, list);
100 return entry->cookie;
101 } else {
102 /* nothing in progress... next_cookie is "infinity" */
103 return next_cookie;
104 }
105
106}
107
108static async_cookie_t lowest_in_progress(struct list_head *running)
109{
110 unsigned long flags;
111 async_cookie_t ret;
112
113 spin_lock_irqsave(&async_lock, flags);
114 ret = __lowest_in_progress(running);
115 spin_unlock_irqrestore(&async_lock, flags);
116 return ret;
117}
118/*
119 * pick the first pending entry and run it
120 */
121static void run_one_entry(void)
122{
123 unsigned long flags;
124 struct async_entry *entry;
125 ktime_t calltime, delta, rettime;
126
127 /* 1) pick one task from the pending queue */
128
129 spin_lock_irqsave(&async_lock, flags);
130 if (list_empty(&async_pending))
131 goto out;
132 entry = list_first_entry(&async_pending, struct async_entry, list);
133
134 /* 2) move it to the running queue */
135 list_del(&entry->list);
136 list_add_tail(&entry->list, &async_running);
137 spin_unlock_irqrestore(&async_lock, flags);
138
139 /* 3) run it (and print duration)*/
140 if (initcall_debug && system_state == SYSTEM_BOOTING) {
141 printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
142 entry->func, task_pid_nr(current));
143 calltime = ktime_get();
144 }
145 entry->func(entry->data, entry->cookie);
146 if (initcall_debug && system_state == SYSTEM_BOOTING) {
147 rettime = ktime_get();
148 delta = ktime_sub(rettime, calltime);
149 printk("initcall %lli_%pF returned 0 after %lld usecs\n",
150 (long long)entry->cookie,
151 entry->func,
152 (long long)ktime_to_ns(delta) >> 10);
153 }
154
155 /* 4) remove it from the running queue */
156 spin_lock_irqsave(&async_lock, flags);
157 list_del(&entry->list);
158
159 /* 5) free the entry */
160 kfree(entry);
161 atomic_dec(&entry_count);
162
163 spin_unlock_irqrestore(&async_lock, flags);
164
165 /* 6) wake up any waiters. */
166 wake_up(&async_done);
167 return;
168
169out:
170 spin_unlock_irqrestore(&async_lock, flags);
171}
172
173
174static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
175{
176 struct async_entry *entry;
177 unsigned long flags;
178 async_cookie_t newcookie;
179
180
181 /* allow irq-off callers */
182 entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
183
184 /*
185 * If we're out of memory or if there's too much work
186 * pending already, we execute synchronously.
187 */
188 if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) {
189 kfree(entry);
190 spin_lock_irqsave(&async_lock, flags);
191 newcookie = next_cookie++;
192 spin_unlock_irqrestore(&async_lock, flags);
193
194 /* low on memory.. run synchronously */
195 ptr(data, newcookie);
196 return newcookie;
197 }
198 entry->func = ptr;
199 entry->data = data;
200 entry->running = running;
201
202 spin_lock_irqsave(&async_lock, flags);
203 newcookie = entry->cookie = next_cookie++;
204 list_add_tail(&entry->list, &async_pending);
205 atomic_inc(&entry_count);
206 spin_unlock_irqrestore(&async_lock, flags);
207 wake_up(&async_new);
208 return newcookie;
209}
210
211async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
212{
213 return __async_schedule(ptr, data, &async_pending);
214}
215EXPORT_SYMBOL_GPL(async_schedule);
216
217async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
218{
219 return __async_schedule(ptr, data, running);
220}
221EXPORT_SYMBOL_GPL(async_schedule_special);
222
223void async_synchronize_full(void)
224{
225 do {
226 async_synchronize_cookie(next_cookie);
227 } while (!list_empty(&async_running) || !list_empty(&async_pending));
228}
229EXPORT_SYMBOL_GPL(async_synchronize_full);
230
231void async_synchronize_full_special(struct list_head *list)
232{
233 async_synchronize_cookie_special(next_cookie, list);
234}
235EXPORT_SYMBOL_GPL(async_synchronize_full_special);
236
237void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
238{
239 ktime_t starttime, delta, endtime;
240
241 if (initcall_debug && system_state == SYSTEM_BOOTING) {
242 printk("async_waiting @ %i\n", task_pid_nr(current));
243 starttime = ktime_get();
244 }
245
246 wait_event(async_done, lowest_in_progress(running) >= cookie);
247
248 if (initcall_debug && system_state == SYSTEM_BOOTING) {
249 endtime = ktime_get();
250 delta = ktime_sub(endtime, starttime);
251
252 printk("async_continuing @ %i after %lli usec\n",
253 task_pid_nr(current),
254 (long long)ktime_to_ns(delta) >> 10);
255 }
256}
257EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
258
259void async_synchronize_cookie(async_cookie_t cookie)
260{
261 async_synchronize_cookie_special(cookie, &async_running);
262}
263EXPORT_SYMBOL_GPL(async_synchronize_cookie);
264
265
266static int async_thread(void *unused)
267{
268 DECLARE_WAITQUEUE(wq, current);
269 add_wait_queue(&async_new, &wq);
270
271 while (!kthread_should_stop()) {
272 int ret = HZ;
273 set_current_state(TASK_INTERRUPTIBLE);
274 /*
275 * check the list head without lock.. false positives
276 * are dealt with inside run_one_entry() while holding
277 * the lock.
278 */
279 rmb();
280 if (!list_empty(&async_pending))
281 run_one_entry();
282 else
283 ret = schedule_timeout(HZ);
284
285 if (ret == 0) {
286 /*
 287 * we timed out, this means we as a thread are redundant.
 288 * we sign off and die, but to avoid any races there
289 * is a last-straw check to see if work snuck in.
290 */
291 atomic_dec(&thread_count);
292 wmb(); /* manager must see our departure first */
293 if (list_empty(&async_pending))
294 break;
295 /*
 296 * whoops, work came in between us timing out and us
297 * signing off; we need to stay alive and keep working.
298 */
299 atomic_inc(&thread_count);
300 }
301 }
302 remove_wait_queue(&async_new, &wq);
303
304 return 0;
305}
306
307static int async_manager_thread(void *unused)
308{
309 DECLARE_WAITQUEUE(wq, current);
310 add_wait_queue(&async_new, &wq);
311
312 while (!kthread_should_stop()) {
313 int tc, ec;
314
315 set_current_state(TASK_INTERRUPTIBLE);
316
317 tc = atomic_read(&thread_count);
318 rmb();
319 ec = atomic_read(&entry_count);
320
321 while (tc < ec && tc < MAX_THREADS) {
322 kthread_run(async_thread, NULL, "async/%i", tc);
323 atomic_inc(&thread_count);
324 tc++;
325 }
326
327 schedule();
328 }
329 remove_wait_queue(&async_new, &wq);
330
331 return 0;
332}
333
334static int __init async_init(void)
335{
336 if (async_enabled)
337 kthread_run(async_manager_thread, NULL, "async/mgr");
338 return 0;
339}
340
341static int __init setup_async(char *str)
342{
343 async_enabled = 1;
344 return 1;
345}
346
347__setup("fastboot", setup_async);
348
349
350core_initcall(async_init);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9d6717412fec..16f18cac661b 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -159,11 +159,8 @@ static inline int audit_signal_info(int sig, struct task_struct *t)
159 return __audit_signal_info(sig, t); 159 return __audit_signal_info(sig, t);
160 return 0; 160 return 0;
161} 161}
162extern enum audit_state audit_filter_inodes(struct task_struct *, 162extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
163 struct audit_context *);
164extern void audit_set_auditable(struct audit_context *);
165#else 163#else
166#define audit_signal_info(s,t) AUDIT_DISABLED 164#define audit_signal_info(s,t) AUDIT_DISABLED
167#define audit_filter_inodes(t,c) AUDIT_DISABLED 165#define audit_filter_inodes(t,c) AUDIT_DISABLED
168#define audit_set_auditable(c)
169#endif 166#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8b509441f49a..8ad9545b8db9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -450,6 +450,7 @@ static void kill_rules(struct audit_tree *tree)
450 audit_log_end(ab); 450 audit_log_end(ab);
451 rule->tree = NULL; 451 rule->tree = NULL;
452 list_del_rcu(&entry->list); 452 list_del_rcu(&entry->list);
453 list_del(&entry->rule.list);
453 call_rcu(&entry->rcu, audit_free_rule_rcu); 454 call_rcu(&entry->rcu, audit_free_rule_rcu);
454 } 455 }
455 } 456 }
@@ -617,7 +618,7 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
617 618
618 if (pathname[0] != '/' || 619 if (pathname[0] != '/' ||
619 rule->listnr != AUDIT_FILTER_EXIT || 620 rule->listnr != AUDIT_FILTER_EXIT ||
620 op & ~AUDIT_EQUAL || 621 op != Audit_equal ||
621 rule->inode_f || rule->watch || rule->tree) 622 rule->inode_f || rule->watch || rule->tree)
622 return -EINVAL; 623 return -EINVAL;
623 rule->tree = alloc_tree(pathname); 624 rule->tree = alloc_tree(pathname);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9fd85a4640a0..fbf24d121d97 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -86,6 +86,14 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
86#error Fix audit_filter_list initialiser 86#error Fix audit_filter_list initialiser
87#endif 87#endif
88}; 88};
89static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
90 LIST_HEAD_INIT(audit_rules_list[0]),
91 LIST_HEAD_INIT(audit_rules_list[1]),
92 LIST_HEAD_INIT(audit_rules_list[2]),
93 LIST_HEAD_INIT(audit_rules_list[3]),
94 LIST_HEAD_INIT(audit_rules_list[4]),
95 LIST_HEAD_INIT(audit_rules_list[5]),
96};
89 97
90DEFINE_MUTEX(audit_filter_mutex); 98DEFINE_MUTEX(audit_filter_mutex);
91 99
@@ -244,7 +252,8 @@ static inline int audit_to_inode(struct audit_krule *krule,
244 struct audit_field *f) 252 struct audit_field *f)
245{ 253{
246 if (krule->listnr != AUDIT_FILTER_EXIT || 254 if (krule->listnr != AUDIT_FILTER_EXIT ||
247 krule->watch || krule->inode_f || krule->tree) 255 krule->watch || krule->inode_f || krule->tree ||
256 (f->op != Audit_equal && f->op != Audit_not_equal))
248 return -EINVAL; 257 return -EINVAL;
249 258
250 krule->inode_f = f; 259 krule->inode_f = f;
@@ -262,7 +271,7 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
262 271
263 if (path[0] != '/' || path[len-1] == '/' || 272 if (path[0] != '/' || path[len-1] == '/' ||
264 krule->listnr != AUDIT_FILTER_EXIT || 273 krule->listnr != AUDIT_FILTER_EXIT ||
265 op & ~AUDIT_EQUAL || 274 op != Audit_equal ||
266 krule->inode_f || krule->watch || krule->tree) 275 krule->inode_f || krule->watch || krule->tree)
267 return -EINVAL; 276 return -EINVAL;
268 277
@@ -412,12 +421,32 @@ exit_err:
412 return ERR_PTR(err); 421 return ERR_PTR(err);
413} 422}
414 423
424static u32 audit_ops[] =
425{
426 [Audit_equal] = AUDIT_EQUAL,
427 [Audit_not_equal] = AUDIT_NOT_EQUAL,
428 [Audit_bitmask] = AUDIT_BIT_MASK,
429 [Audit_bittest] = AUDIT_BIT_TEST,
430 [Audit_lt] = AUDIT_LESS_THAN,
431 [Audit_gt] = AUDIT_GREATER_THAN,
432 [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
433 [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
434};
435
436static u32 audit_to_op(u32 op)
437{
438 u32 n;
439 for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++)
440 ;
441 return n;
442}
443
444
 415/* Translate struct audit_rule to kernel's rule representation. 445/* Translate struct audit_rule to kernel's rule representation.
416 * Exists for backward compatibility with userspace. */ 446 * Exists for backward compatibility with userspace. */
417static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 447static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
418{ 448{
419 struct audit_entry *entry; 449 struct audit_entry *entry;
420 struct audit_field *ino_f;
421 int err = 0; 450 int err = 0;
422 int i; 451 int i;
423 452
@@ -427,12 +456,28 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
427 456
428 for (i = 0; i < rule->field_count; i++) { 457 for (i = 0; i < rule->field_count; i++) {
429 struct audit_field *f = &entry->rule.fields[i]; 458 struct audit_field *f = &entry->rule.fields[i];
459 u32 n;
460
461 n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
462
463 /* Support for legacy operators where
464 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
465 if (n & AUDIT_NEGATE)
466 f->op = Audit_not_equal;
467 else if (!n)
468 f->op = Audit_equal;
469 else
470 f->op = audit_to_op(n);
471
472 entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
430 473
431 f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
432 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 474 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
433 f->val = rule->values[i]; 475 f->val = rule->values[i];
434 476
435 err = -EINVAL; 477 err = -EINVAL;
478 if (f->op == Audit_bad)
479 goto exit_free;
480
436 switch(f->type) { 481 switch(f->type) {
437 default: 482 default:
438 goto exit_free; 483 goto exit_free;
@@ -454,11 +499,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
454 case AUDIT_EXIT: 499 case AUDIT_EXIT:
455 case AUDIT_SUCCESS: 500 case AUDIT_SUCCESS:
456 /* bit ops are only useful on syscall args */ 501 /* bit ops are only useful on syscall args */
457 if (f->op == AUDIT_BIT_MASK || 502 if (f->op == Audit_bitmask || f->op == Audit_bittest)
458 f->op == AUDIT_BIT_TEST) {
459 err = -EINVAL;
460 goto exit_free; 503 goto exit_free;
461 }
462 break; 504 break;
463 case AUDIT_ARG0: 505 case AUDIT_ARG0:
464 case AUDIT_ARG1: 506 case AUDIT_ARG1:
@@ -467,11 +509,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
467 break; 509 break;
468 /* arch is only allowed to be = or != */ 510 /* arch is only allowed to be = or != */
469 case AUDIT_ARCH: 511 case AUDIT_ARCH:
470 if ((f->op != AUDIT_NOT_EQUAL) && (f->op != AUDIT_EQUAL) 512 if (f->op != Audit_not_equal && f->op != Audit_equal)
471 && (f->op != AUDIT_NEGATE) && (f->op)) {
472 err = -EINVAL;
473 goto exit_free; 513 goto exit_free;
474 }
475 entry->rule.arch_f = f; 514 entry->rule.arch_f = f;
476 break; 515 break;
477 case AUDIT_PERM: 516 case AUDIT_PERM:
@@ -488,33 +527,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
488 goto exit_free; 527 goto exit_free;
489 break; 528 break;
490 } 529 }
491
492 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
493
494 /* Support for legacy operators where
495 * AUDIT_NEGATE bit signifies != and otherwise assumes == */
496 if (f->op & AUDIT_NEGATE)
497 f->op = AUDIT_NOT_EQUAL;
498 else if (!f->op)
499 f->op = AUDIT_EQUAL;
500 else if (f->op == AUDIT_OPERATORS) {
501 err = -EINVAL;
502 goto exit_free;
503 }
504 } 530 }
505 531
506 ino_f = entry->rule.inode_f; 532 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
507 if (ino_f) { 533 entry->rule.inode_f = NULL;
508 switch(ino_f->op) {
509 case AUDIT_NOT_EQUAL:
510 entry->rule.inode_f = NULL;
511 case AUDIT_EQUAL:
512 break;
513 default:
514 err = -EINVAL;
515 goto exit_free;
516 }
517 }
518 534
519exit_nofree: 535exit_nofree:
520 return entry; 536 return entry;
@@ -530,7 +546,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
530{ 546{
531 int err = 0; 547 int err = 0;
532 struct audit_entry *entry; 548 struct audit_entry *entry;
533 struct audit_field *ino_f;
534 void *bufp; 549 void *bufp;
535 size_t remain = datasz - sizeof(struct audit_rule_data); 550 size_t remain = datasz - sizeof(struct audit_rule_data);
536 int i; 551 int i;
@@ -546,11 +561,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
546 struct audit_field *f = &entry->rule.fields[i]; 561 struct audit_field *f = &entry->rule.fields[i];
547 562
548 err = -EINVAL; 563 err = -EINVAL;
549 if (!(data->fieldflags[i] & AUDIT_OPERATORS) || 564
550 data->fieldflags[i] & ~AUDIT_OPERATORS) 565 f->op = audit_to_op(data->fieldflags[i]);
566 if (f->op == Audit_bad)
551 goto exit_free; 567 goto exit_free;
552 568
553 f->op = data->fieldflags[i] & AUDIT_OPERATORS;
554 f->type = data->fields[i]; 569 f->type = data->fields[i];
555 f->val = data->values[i]; 570 f->val = data->values[i];
556 f->lsm_str = NULL; 571 f->lsm_str = NULL;
@@ -662,18 +677,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
662 } 677 }
663 } 678 }
664 679
665 ino_f = entry->rule.inode_f; 680 if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
666 if (ino_f) { 681 entry->rule.inode_f = NULL;
667 switch(ino_f->op) {
668 case AUDIT_NOT_EQUAL:
669 entry->rule.inode_f = NULL;
670 case AUDIT_EQUAL:
671 break;
672 default:
673 err = -EINVAL;
674 goto exit_free;
675 }
676 }
677 682
678exit_nofree: 683exit_nofree:
679 return entry; 684 return entry;
@@ -713,10 +718,10 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
713 rule->fields[i] = krule->fields[i].type; 718 rule->fields[i] = krule->fields[i].type;
714 719
715 if (krule->vers_ops == 1) { 720 if (krule->vers_ops == 1) {
716 if (krule->fields[i].op & AUDIT_NOT_EQUAL) 721 if (krule->fields[i].op == Audit_not_equal)
717 rule->fields[i] |= AUDIT_NEGATE; 722 rule->fields[i] |= AUDIT_NEGATE;
718 } else { 723 } else {
719 rule->fields[i] |= krule->fields[i].op; 724 rule->fields[i] |= audit_ops[krule->fields[i].op];
720 } 725 }
721 } 726 }
722 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; 727 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
@@ -744,7 +749,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
744 struct audit_field *f = &krule->fields[i]; 749 struct audit_field *f = &krule->fields[i];
745 750
746 data->fields[i] = f->type; 751 data->fields[i] = f->type;
747 data->fieldflags[i] = f->op; 752 data->fieldflags[i] = audit_ops[f->op];
748 switch(f->type) { 753 switch(f->type) {
749 case AUDIT_SUBJ_USER: 754 case AUDIT_SUBJ_USER:
750 case AUDIT_SUBJ_ROLE: 755 case AUDIT_SUBJ_ROLE:
@@ -919,6 +924,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
919 new->action = old->action; 924 new->action = old->action;
920 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 925 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
921 new->mask[i] = old->mask[i]; 926 new->mask[i] = old->mask[i];
927 new->prio = old->prio;
922 new->buflen = old->buflen; 928 new->buflen = old->buflen;
923 new->inode_f = old->inode_f; 929 new->inode_f = old->inode_f;
924 new->watch = NULL; 930 new->watch = NULL;
@@ -987,9 +993,8 @@ static void audit_update_watch(struct audit_parent *parent,
987 993
988 /* If the update involves invalidating rules, do the inode-based 994 /* If the update involves invalidating rules, do the inode-based
989 * filtering now, so we don't omit records. */ 995 * filtering now, so we don't omit records. */
990 if (invalidating && current->audit_context && 996 if (invalidating && current->audit_context)
991 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) 997 audit_filter_inodes(current, current->audit_context);
992 audit_set_auditable(current->audit_context);
993 998
994 nwatch = audit_dupe_watch(owatch); 999 nwatch = audit_dupe_watch(owatch);
995 if (IS_ERR(nwatch)) { 1000 if (IS_ERR(nwatch)) {
@@ -1007,12 +1012,15 @@ static void audit_update_watch(struct audit_parent *parent,
1007 list_del_rcu(&oentry->list); 1012 list_del_rcu(&oentry->list);
1008 1013
1009 nentry = audit_dupe_rule(&oentry->rule, nwatch); 1014 nentry = audit_dupe_rule(&oentry->rule, nwatch);
1010 if (IS_ERR(nentry)) 1015 if (IS_ERR(nentry)) {
1016 list_del(&oentry->rule.list);
1011 audit_panic("error updating watch, removing"); 1017 audit_panic("error updating watch, removing");
1012 else { 1018 } else {
1013 int h = audit_hash_ino((u32)ino); 1019 int h = audit_hash_ino((u32)ino);
1014 list_add(&nentry->rule.rlist, &nwatch->rules); 1020 list_add(&nentry->rule.rlist, &nwatch->rules);
1015 list_add_rcu(&nentry->list, &audit_inode_hash[h]); 1021 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
1022 list_replace(&oentry->rule.list,
1023 &nentry->rule.list);
1016 } 1024 }
1017 1025
1018 call_rcu(&oentry->rcu, audit_free_rule_rcu); 1026 call_rcu(&oentry->rcu, audit_free_rule_rcu);
@@ -1077,6 +1085,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1077 audit_log_end(ab); 1085 audit_log_end(ab);
1078 } 1086 }
1079 list_del(&r->rlist); 1087 list_del(&r->rlist);
1088 list_del(&r->list);
1080 list_del_rcu(&e->list); 1089 list_del_rcu(&e->list);
1081 call_rcu(&e->rcu, audit_free_rule_rcu); 1090 call_rcu(&e->rcu, audit_free_rule_rcu);
1082 } 1091 }
@@ -1102,12 +1111,16 @@ static void audit_inotify_unregister(struct list_head *in_list)
1102/* Find an existing audit rule. 1111/* Find an existing audit rule.
1103 * Caller must hold audit_filter_mutex to prevent stale rule data. */ 1112 * Caller must hold audit_filter_mutex to prevent stale rule data. */
1104static struct audit_entry *audit_find_rule(struct audit_entry *entry, 1113static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1105 struct list_head *list) 1114 struct list_head **p)
1106{ 1115{
1107 struct audit_entry *e, *found = NULL; 1116 struct audit_entry *e, *found = NULL;
1117 struct list_head *list;
1108 int h; 1118 int h;
1109 1119
1110 if (entry->rule.watch) { 1120 if (entry->rule.inode_f) {
1121 h = audit_hash_ino(entry->rule.inode_f->val);
1122 *p = list = &audit_inode_hash[h];
1123 } else if (entry->rule.watch) {
1111 /* we don't know the inode number, so must walk entire hash */ 1124 /* we don't know the inode number, so must walk entire hash */
1112 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { 1125 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
1113 list = &audit_inode_hash[h]; 1126 list = &audit_inode_hash[h];
@@ -1118,6 +1131,8 @@ static struct audit_entry *audit_find_rule(struct audit_entry *entry,
1118 } 1131 }
1119 } 1132 }
1120 goto out; 1133 goto out;
1134 } else {
1135 *p = list = &audit_filter_list[entry->rule.listnr];
1121 } 1136 }
1122 1137
1123 list_for_each_entry(e, list, list) 1138 list_for_each_entry(e, list, list)
@@ -1258,15 +1273,17 @@ static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1258 return ret; 1273 return ret;
1259} 1274}
1260 1275
1276static u64 prio_low = ~0ULL/2;
1277static u64 prio_high = ~0ULL/2 - 1;
1278
1261/* Add rule to given filterlist if not a duplicate. */ 1279/* Add rule to given filterlist if not a duplicate. */
1262static inline int audit_add_rule(struct audit_entry *entry, 1280static inline int audit_add_rule(struct audit_entry *entry)
1263 struct list_head *list)
1264{ 1281{
1265 struct audit_entry *e; 1282 struct audit_entry *e;
1266 struct audit_field *inode_f = entry->rule.inode_f;
1267 struct audit_watch *watch = entry->rule.watch; 1283 struct audit_watch *watch = entry->rule.watch;
1268 struct audit_tree *tree = entry->rule.tree; 1284 struct audit_tree *tree = entry->rule.tree;
1269 struct nameidata *ndp = NULL, *ndw = NULL; 1285 struct nameidata *ndp = NULL, *ndw = NULL;
1286 struct list_head *list;
1270 int h, err; 1287 int h, err;
1271#ifdef CONFIG_AUDITSYSCALL 1288#ifdef CONFIG_AUDITSYSCALL
1272 int dont_count = 0; 1289 int dont_count = 0;
@@ -1277,13 +1294,8 @@ static inline int audit_add_rule(struct audit_entry *entry,
1277 dont_count = 1; 1294 dont_count = 1;
1278#endif 1295#endif
1279 1296
1280 if (inode_f) {
1281 h = audit_hash_ino(inode_f->val);
1282 list = &audit_inode_hash[h];
1283 }
1284
1285 mutex_lock(&audit_filter_mutex); 1297 mutex_lock(&audit_filter_mutex);
1286 e = audit_find_rule(entry, list); 1298 e = audit_find_rule(entry, &list);
1287 mutex_unlock(&audit_filter_mutex); 1299 mutex_unlock(&audit_filter_mutex);
1288 if (e) { 1300 if (e) {
1289 err = -EEXIST; 1301 err = -EEXIST;
@@ -1319,10 +1331,22 @@ static inline int audit_add_rule(struct audit_entry *entry,
1319 } 1331 }
1320 } 1332 }
1321 1333
1334 entry->rule.prio = ~0ULL;
1335 if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
1336 if (entry->rule.flags & AUDIT_FILTER_PREPEND)
1337 entry->rule.prio = ++prio_high;
1338 else
1339 entry->rule.prio = --prio_low;
1340 }
1341
1322 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1342 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
1343 list_add(&entry->rule.list,
1344 &audit_rules_list[entry->rule.listnr]);
1323 list_add_rcu(&entry->list, list); 1345 list_add_rcu(&entry->list, list);
1324 entry->rule.flags &= ~AUDIT_FILTER_PREPEND; 1346 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
1325 } else { 1347 } else {
1348 list_add_tail(&entry->rule.list,
1349 &audit_rules_list[entry->rule.listnr]);
1326 list_add_tail_rcu(&entry->list, list); 1350 list_add_tail_rcu(&entry->list, list);
1327 } 1351 }
1328#ifdef CONFIG_AUDITSYSCALL 1352#ifdef CONFIG_AUDITSYSCALL
@@ -1345,15 +1369,14 @@ error:
1345} 1369}
1346 1370
1347/* Remove an existing rule from filterlist. */ 1371/* Remove an existing rule from filterlist. */
1348static inline int audit_del_rule(struct audit_entry *entry, 1372static inline int audit_del_rule(struct audit_entry *entry)
1349 struct list_head *list)
1350{ 1373{
1351 struct audit_entry *e; 1374 struct audit_entry *e;
1352 struct audit_field *inode_f = entry->rule.inode_f;
1353 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 1375 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1354 struct audit_tree *tree = entry->rule.tree; 1376 struct audit_tree *tree = entry->rule.tree;
1377 struct list_head *list;
1355 LIST_HEAD(inotify_list); 1378 LIST_HEAD(inotify_list);
1356 int h, ret = 0; 1379 int ret = 0;
1357#ifdef CONFIG_AUDITSYSCALL 1380#ifdef CONFIG_AUDITSYSCALL
1358 int dont_count = 0; 1381 int dont_count = 0;
1359 1382
@@ -1363,13 +1386,8 @@ static inline int audit_del_rule(struct audit_entry *entry,
1363 dont_count = 1; 1386 dont_count = 1;
1364#endif 1387#endif
1365 1388
1366 if (inode_f) {
1367 h = audit_hash_ino(inode_f->val);
1368 list = &audit_inode_hash[h];
1369 }
1370
1371 mutex_lock(&audit_filter_mutex); 1389 mutex_lock(&audit_filter_mutex);
1372 e = audit_find_rule(entry, list); 1390 e = audit_find_rule(entry, &list);
1373 if (!e) { 1391 if (!e) {
1374 mutex_unlock(&audit_filter_mutex); 1392 mutex_unlock(&audit_filter_mutex);
1375 ret = -ENOENT; 1393 ret = -ENOENT;
@@ -1404,6 +1422,7 @@ static inline int audit_del_rule(struct audit_entry *entry,
1404 audit_remove_tree_rule(&e->rule); 1422 audit_remove_tree_rule(&e->rule);
1405 1423
1406 list_del_rcu(&e->list); 1424 list_del_rcu(&e->list);
1425 list_del(&e->rule.list);
1407 call_rcu(&e->rcu, audit_free_rule_rcu); 1426 call_rcu(&e->rcu, audit_free_rule_rcu);
1408 1427
1409#ifdef CONFIG_AUDITSYSCALL 1428#ifdef CONFIG_AUDITSYSCALL
@@ -1432,30 +1451,16 @@ out:
1432static void audit_list(int pid, int seq, struct sk_buff_head *q) 1451static void audit_list(int pid, int seq, struct sk_buff_head *q)
1433{ 1452{
1434 struct sk_buff *skb; 1453 struct sk_buff *skb;
1435 struct audit_entry *entry; 1454 struct audit_krule *r;
1436 int i; 1455 int i;
1437 1456
1438 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1457 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1439 * iterator to sync with list writers. */ 1458 * iterator to sync with list writers. */
1440 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1459 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1441 list_for_each_entry(entry, &audit_filter_list[i], list) { 1460 list_for_each_entry(r, &audit_rules_list[i], list) {
1442 struct audit_rule *rule;
1443
1444 rule = audit_krule_to_rule(&entry->rule);
1445 if (unlikely(!rule))
1446 break;
1447 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1448 rule, sizeof(*rule));
1449 if (skb)
1450 skb_queue_tail(q, skb);
1451 kfree(rule);
1452 }
1453 }
1454 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
1455 list_for_each_entry(entry, &audit_inode_hash[i], list) {
1456 struct audit_rule *rule; 1461 struct audit_rule *rule;
1457 1462
1458 rule = audit_krule_to_rule(&entry->rule); 1463 rule = audit_krule_to_rule(r);
1459 if (unlikely(!rule)) 1464 if (unlikely(!rule))
1460 break; 1465 break;
1461 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, 1466 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
@@ -1474,30 +1479,16 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q)
1474static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) 1479static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1475{ 1480{
1476 struct sk_buff *skb; 1481 struct sk_buff *skb;
1477 struct audit_entry *e; 1482 struct audit_krule *r;
1478 int i; 1483 int i;
1479 1484
1480 /* This is a blocking read, so use audit_filter_mutex instead of rcu 1485 /* This is a blocking read, so use audit_filter_mutex instead of rcu
1481 * iterator to sync with list writers. */ 1486 * iterator to sync with list writers. */
1482 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1487 for (i=0; i<AUDIT_NR_FILTERS; i++) {
1483 list_for_each_entry(e, &audit_filter_list[i], list) { 1488 list_for_each_entry(r, &audit_rules_list[i], list) {
1484 struct audit_rule_data *data;
1485
1486 data = audit_krule_to_data(&e->rule);
1487 if (unlikely(!data))
1488 break;
1489 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1490 data, sizeof(*data) + data->buflen);
1491 if (skb)
1492 skb_queue_tail(q, skb);
1493 kfree(data);
1494 }
1495 }
1496 for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
1497 list_for_each_entry(e, &audit_inode_hash[i], list) {
1498 struct audit_rule_data *data; 1489 struct audit_rule_data *data;
1499 1490
1500 data = audit_krule_to_data(&e->rule); 1491 data = audit_krule_to_data(r);
1501 if (unlikely(!data)) 1492 if (unlikely(!data))
1502 break; 1493 break;
1503 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1494 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
@@ -1603,8 +1594,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1603 if (IS_ERR(entry)) 1594 if (IS_ERR(entry))
1604 return PTR_ERR(entry); 1595 return PTR_ERR(entry);
1605 1596
1606 err = audit_add_rule(entry, 1597 err = audit_add_rule(entry);
1607 &audit_filter_list[entry->rule.listnr]);
1608 audit_log_rule_change(loginuid, sessionid, sid, "add", 1598 audit_log_rule_change(loginuid, sessionid, sid, "add",
1609 &entry->rule, !err); 1599 &entry->rule, !err);
1610 1600
@@ -1620,8 +1610,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1620 if (IS_ERR(entry)) 1610 if (IS_ERR(entry))
1621 return PTR_ERR(entry); 1611 return PTR_ERR(entry);
1622 1612
1623 err = audit_del_rule(entry, 1613 err = audit_del_rule(entry);
1624 &audit_filter_list[entry->rule.listnr]);
1625 audit_log_rule_change(loginuid, sessionid, sid, "remove", 1614 audit_log_rule_change(loginuid, sessionid, sid, "remove",
1626 &entry->rule, !err); 1615 &entry->rule, !err);
1627 1616
@@ -1634,28 +1623,29 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1634 return err; 1623 return err;
1635} 1624}
1636 1625
1637int audit_comparator(const u32 left, const u32 op, const u32 right) 1626int audit_comparator(u32 left, u32 op, u32 right)
1638{ 1627{
1639 switch (op) { 1628 switch (op) {
1640 case AUDIT_EQUAL: 1629 case Audit_equal:
1641 return (left == right); 1630 return (left == right);
1642 case AUDIT_NOT_EQUAL: 1631 case Audit_not_equal:
1643 return (left != right); 1632 return (left != right);
1644 case AUDIT_LESS_THAN: 1633 case Audit_lt:
1645 return (left < right); 1634 return (left < right);
1646 case AUDIT_LESS_THAN_OR_EQUAL: 1635 case Audit_le:
1647 return (left <= right); 1636 return (left <= right);
1648 case AUDIT_GREATER_THAN: 1637 case Audit_gt:
1649 return (left > right); 1638 return (left > right);
1650 case AUDIT_GREATER_THAN_OR_EQUAL: 1639 case Audit_ge:
1651 return (left >= right); 1640 return (left >= right);
1652 case AUDIT_BIT_MASK: 1641 case Audit_bitmask:
1653 return (left & right); 1642 return (left & right);
1654 case AUDIT_BIT_TEST: 1643 case Audit_bittest:
1655 return ((left & right) == right); 1644 return ((left & right) == right);
1645 default:
1646 BUG();
1647 return 0;
1656 } 1648 }
1657 BUG();
1658 return 0;
1659} 1649}
1660 1650
1661/* Compare given dentry name with last component in given path, 1651/* Compare given dentry name with last component in given path,
@@ -1778,6 +1768,43 @@ unlock_and_return:
1778 return result; 1768 return result;
1779} 1769}
1780 1770
1771static int update_lsm_rule(struct audit_krule *r)
1772{
1773 struct audit_entry *entry = container_of(r, struct audit_entry, rule);
1774 struct audit_entry *nentry;
1775 struct audit_watch *watch;
1776 struct audit_tree *tree;
1777 int err = 0;
1778
1779 if (!security_audit_rule_known(r))
1780 return 0;
1781
1782 watch = r->watch;
1783 tree = r->tree;
1784 nentry = audit_dupe_rule(r, watch);
1785 if (IS_ERR(nentry)) {
1786 /* save the first error encountered for the
1787 * return value */
1788 err = PTR_ERR(nentry);
1789 audit_panic("error updating LSM filters");
1790 if (watch)
1791 list_del(&r->rlist);
1792 list_del_rcu(&entry->list);
1793 list_del(&r->list);
1794 } else {
1795 if (watch) {
1796 list_add(&nentry->rule.rlist, &watch->rules);
1797 list_del(&r->rlist);
1798 } else if (tree)
1799 list_replace_init(&r->rlist, &nentry->rule.rlist);
1800 list_replace_rcu(&entry->list, &nentry->list);
1801 list_replace(&r->list, &nentry->rule.list);
1802 }
1803 call_rcu(&entry->rcu, audit_free_rule_rcu);
1804
1805 return err;
1806}
1807
1781/* This function will re-initialize the lsm_rule field of all applicable rules. 1808/* This function will re-initialize the lsm_rule field of all applicable rules.
1782 * It will traverse the filter lists searching for rules that contain LSM 1809 * It will traverse the filter lists searching for rules that contain LSM
1783 * specific filter fields. When such a rule is found, it is copied, the 1810 * specific filter fields. When such a rule is found, it is copied, the
@@ -1785,45 +1812,19 @@ unlock_and_return:
1785 * updated rule. */ 1812 * updated rule. */
1786int audit_update_lsm_rules(void) 1813int audit_update_lsm_rules(void)
1787{ 1814{
1788 struct audit_entry *entry, *n, *nentry; 1815 struct audit_krule *r, *n;
1789 struct audit_watch *watch;
1790 struct audit_tree *tree;
1791 int i, err = 0; 1816 int i, err = 0;
1792 1817
1793 /* audit_filter_mutex synchronizes the writers */ 1818 /* audit_filter_mutex synchronizes the writers */
1794 mutex_lock(&audit_filter_mutex); 1819 mutex_lock(&audit_filter_mutex);
1795 1820
1796 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1821 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
1797 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1822 list_for_each_entry_safe(r, n, &audit_rules_list[i], list) {
1798 if (!security_audit_rule_known(&entry->rule)) 1823 int res = update_lsm_rule(r);
1799 continue; 1824 if (!err)
1800 1825 err = res;
1801 watch = entry->rule.watch;
1802 tree = entry->rule.tree;
1803 nentry = audit_dupe_rule(&entry->rule, watch);
1804 if (IS_ERR(nentry)) {
1805 /* save the first error encountered for the
1806 * return value */
1807 if (!err)
1808 err = PTR_ERR(nentry);
1809 audit_panic("error updating LSM filters");
1810 if (watch)
1811 list_del(&entry->rule.rlist);
1812 list_del_rcu(&entry->list);
1813 } else {
1814 if (watch) {
1815 list_add(&nentry->rule.rlist,
1816 &watch->rules);
1817 list_del(&entry->rule.rlist);
1818 } else if (tree)
1819 list_replace_init(&entry->rule.rlist,
1820 &nentry->rule.rlist);
1821 list_replace_rcu(&entry->list, &nentry->list);
1822 }
1823 call_rcu(&entry->rcu, audit_free_rule_rcu);
1824 } 1826 }
1825 } 1827 }
1826
1827 mutex_unlock(&audit_filter_mutex); 1828 mutex_unlock(&audit_filter_mutex);
1828 1829
1829 return err; 1830 return err;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4819f3711973..8cbddff6c283 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -124,43 +124,6 @@ struct audit_aux_data {
124/* Number of target pids per aux struct. */ 124/* Number of target pids per aux struct. */
125#define AUDIT_AUX_PIDS 16 125#define AUDIT_AUX_PIDS 16
126 126
127struct audit_aux_data_mq_open {
128 struct audit_aux_data d;
129 int oflag;
130 mode_t mode;
131 struct mq_attr attr;
132};
133
134struct audit_aux_data_mq_sendrecv {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 size_t msg_len;
138 unsigned int msg_prio;
139 struct timespec abs_timeout;
140};
141
142struct audit_aux_data_mq_notify {
143 struct audit_aux_data d;
144 mqd_t mqdes;
145 struct sigevent notification;
146};
147
148struct audit_aux_data_mq_getsetattr {
149 struct audit_aux_data d;
150 mqd_t mqdes;
151 struct mq_attr mqstat;
152};
153
154struct audit_aux_data_ipcctl {
155 struct audit_aux_data d;
156 struct ipc_perm p;
157 unsigned long qbytes;
158 uid_t uid;
159 gid_t gid;
160 mode_t mode;
161 u32 osid;
162};
163
164struct audit_aux_data_execve { 127struct audit_aux_data_execve {
165 struct audit_aux_data d; 128 struct audit_aux_data d;
166 int argc; 129 int argc;
@@ -168,23 +131,6 @@ struct audit_aux_data_execve {
168 struct mm_struct *mm; 131 struct mm_struct *mm;
169}; 132};
170 133
171struct audit_aux_data_socketcall {
172 struct audit_aux_data d;
173 int nargs;
174 unsigned long args[0];
175};
176
177struct audit_aux_data_sockaddr {
178 struct audit_aux_data d;
179 int len;
180 char a[0];
181};
182
183struct audit_aux_data_fd_pair {
184 struct audit_aux_data d;
185 int fd[2];
186};
187
188struct audit_aux_data_pids { 134struct audit_aux_data_pids {
189 struct audit_aux_data d; 135 struct audit_aux_data d;
190 pid_t target_pid[AUDIT_AUX_PIDS]; 136 pid_t target_pid[AUDIT_AUX_PIDS];
@@ -219,14 +165,14 @@ struct audit_tree_refs {
219struct audit_context { 165struct audit_context {
220 int dummy; /* must be the first element */ 166 int dummy; /* must be the first element */
221 int in_syscall; /* 1 if task is in a syscall */ 167 int in_syscall; /* 1 if task is in a syscall */
222 enum audit_state state; 168 enum audit_state state, current_state;
223 unsigned int serial; /* serial number for record */ 169 unsigned int serial; /* serial number for record */
224 struct timespec ctime; /* time of syscall entry */ 170 struct timespec ctime; /* time of syscall entry */
225 int major; /* syscall number */ 171 int major; /* syscall number */
226 unsigned long argv[4]; /* syscall arguments */ 172 unsigned long argv[4]; /* syscall arguments */
227 int return_valid; /* return code is valid */ 173 int return_valid; /* return code is valid */
228 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
229 int auditable; /* 1 if record should be written */ 175 u64 prio;
230 int name_count; 176 int name_count;
231 struct audit_names names[AUDIT_NAMES]; 177 struct audit_names names[AUDIT_NAMES];
232 char * filterkey; /* key for rule that triggered record */ 178 char * filterkey; /* key for rule that triggered record */
@@ -234,7 +180,8 @@ struct audit_context {
234 struct audit_context *previous; /* For nested syscalls */ 180 struct audit_context *previous; /* For nested syscalls */
235 struct audit_aux_data *aux; 181 struct audit_aux_data *aux;
236 struct audit_aux_data *aux_pids; 182 struct audit_aux_data *aux_pids;
237 183 struct sockaddr_storage *sockaddr;
184 size_t sockaddr_len;
238 /* Save things to print about task_struct */ 185 /* Save things to print about task_struct */
239 pid_t pid, ppid; 186 pid_t pid, ppid;
240 uid_t uid, euid, suid, fsuid; 187 uid_t uid, euid, suid, fsuid;
@@ -252,6 +199,49 @@ struct audit_context {
252 struct audit_tree_refs *trees, *first_trees; 199 struct audit_tree_refs *trees, *first_trees;
253 int tree_count; 200 int tree_count;
254 201
202 int type;
203 union {
204 struct {
205 int nargs;
206 long args[6];
207 } socketcall;
208 struct {
209 uid_t uid;
210 gid_t gid;
211 mode_t mode;
212 u32 osid;
213 int has_perm;
214 uid_t perm_uid;
215 gid_t perm_gid;
216 mode_t perm_mode;
217 unsigned long qbytes;
218 } ipc;
219 struct {
220 mqd_t mqdes;
221 struct mq_attr mqstat;
222 } mq_getsetattr;
223 struct {
224 mqd_t mqdes;
225 int sigev_signo;
226 } mq_notify;
227 struct {
228 mqd_t mqdes;
229 size_t msg_len;
230 unsigned int msg_prio;
231 struct timespec abs_timeout;
232 } mq_sendrecv;
233 struct {
234 int oflag;
235 mode_t mode;
236 struct mq_attr attr;
237 } mq_open;
238 struct {
239 pid_t pid;
240 struct audit_cap_data cap;
241 } capset;
242 };
243 int fds[2];
244
255#if AUDIT_DEBUG 245#if AUDIT_DEBUG
256 int put_count; 246 int put_count;
257 int ino_count; 247 int ino_count;
@@ -608,19 +598,12 @@ static int audit_filter_rules(struct task_struct *tsk,
608 } 598 }
609 } 599 }
610 /* Find ipc objects that match */ 600 /* Find ipc objects that match */
611 if (ctx) { 601 if (!ctx || ctx->type != AUDIT_IPC)
612 struct audit_aux_data *aux; 602 break;
613 for (aux = ctx->aux; aux; 603 if (security_audit_rule_match(ctx->ipc.osid,
614 aux = aux->next) { 604 f->type, f->op,
615 if (aux->type == AUDIT_IPC) { 605 f->lsm_rule, ctx))
616 struct audit_aux_data_ipcctl *axi = (void *)aux; 606 ++result;
617 if (security_audit_rule_match(axi->osid, f->type, f->op, f->lsm_rule, ctx)) {
618 ++result;
619 break;
620 }
621 }
622 }
623 }
624 } 607 }
625 break; 608 break;
626 case AUDIT_ARG0: 609 case AUDIT_ARG0:
@@ -647,8 +630,16 @@ static int audit_filter_rules(struct task_struct *tsk,
647 return 0; 630 return 0;
648 } 631 }
649 } 632 }
650 if (rule->filterkey && ctx) 633
651 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 634 if (ctx) {
635 if (rule->prio <= ctx->prio)
636 return 0;
637 if (rule->filterkey) {
638 kfree(ctx->filterkey);
639 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
640 }
641 ctx->prio = rule->prio;
642 }
652 switch (rule->action) { 643 switch (rule->action) {
653 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 644 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
654 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 645 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
@@ -661,7 +652,7 @@ static int audit_filter_rules(struct task_struct *tsk,
661 * completely disabled for this task. Since we only have the task 652 * completely disabled for this task. Since we only have the task
662 * structure at this point, we can only check uid and gid. 653 * structure at this point, we can only check uid and gid.
663 */ 654 */
664static enum audit_state audit_filter_task(struct task_struct *tsk) 655static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
665{ 656{
666 struct audit_entry *e; 657 struct audit_entry *e;
667 enum audit_state state; 658 enum audit_state state;
@@ -669,6 +660,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
669 rcu_read_lock(); 660 rcu_read_lock();
670 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 661 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
671 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { 662 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
663 if (state == AUDIT_RECORD_CONTEXT)
664 *key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
672 rcu_read_unlock(); 665 rcu_read_unlock();
673 return state; 666 return state;
674 } 667 }
@@ -702,6 +695,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
702 audit_filter_rules(tsk, &e->rule, ctx, NULL, 695 audit_filter_rules(tsk, &e->rule, ctx, NULL,
703 &state)) { 696 &state)) {
704 rcu_read_unlock(); 697 rcu_read_unlock();
698 ctx->current_state = state;
705 return state; 699 return state;
706 } 700 }
707 } 701 }
@@ -715,15 +709,14 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
715 * buckets applicable to the inode numbers in audit_names[]. 709 * buckets applicable to the inode numbers in audit_names[].
716 * Regarding audit_state, same rules apply as for audit_filter_syscall(). 710 * Regarding audit_state, same rules apply as for audit_filter_syscall().
717 */ 711 */
718enum audit_state audit_filter_inodes(struct task_struct *tsk, 712void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
719 struct audit_context *ctx)
720{ 713{
721 int i; 714 int i;
722 struct audit_entry *e; 715 struct audit_entry *e;
723 enum audit_state state; 716 enum audit_state state;
724 717
725 if (audit_pid && tsk->tgid == audit_pid) 718 if (audit_pid && tsk->tgid == audit_pid)
726 return AUDIT_DISABLED; 719 return;
727 720
728 rcu_read_lock(); 721 rcu_read_lock();
729 for (i = 0; i < ctx->name_count; i++) { 722 for (i = 0; i < ctx->name_count; i++) {
@@ -740,17 +733,20 @@ enum audit_state audit_filter_inodes(struct task_struct *tsk,
740 if ((e->rule.mask[word] & bit) == bit && 733 if ((e->rule.mask[word] & bit) == bit &&
741 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { 734 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
742 rcu_read_unlock(); 735 rcu_read_unlock();
743 return state; 736 ctx->current_state = state;
737 return;
744 } 738 }
745 } 739 }
746 } 740 }
747 rcu_read_unlock(); 741 rcu_read_unlock();
748 return AUDIT_BUILD_CONTEXT;
749} 742}
750 743
751void audit_set_auditable(struct audit_context *ctx) 744static void audit_set_auditable(struct audit_context *ctx)
752{ 745{
753 ctx->auditable = 1; 746 if (!ctx->prio) {
747 ctx->prio = 1;
748 ctx->current_state = AUDIT_RECORD_CONTEXT;
749 }
754} 750}
755 751
756static inline struct audit_context *audit_get_context(struct task_struct *tsk, 752static inline struct audit_context *audit_get_context(struct task_struct *tsk,
@@ -781,23 +777,11 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
781 else 777 else
782 context->return_code = return_code; 778 context->return_code = return_code;
783 779
784 if (context->in_syscall && !context->dummy && !context->auditable) { 780 if (context->in_syscall && !context->dummy) {
785 enum audit_state state; 781 audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
786 782 audit_filter_inodes(tsk, context);
787 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
788 if (state == AUDIT_RECORD_CONTEXT) {
789 context->auditable = 1;
790 goto get_context;
791 }
792
793 state = audit_filter_inodes(tsk, context);
794 if (state == AUDIT_RECORD_CONTEXT)
795 context->auditable = 1;
796
797 } 783 }
798 784
799get_context:
800
801 tsk->audit_context = NULL; 785 tsk->audit_context = NULL;
802 return context; 786 return context;
803} 787}
@@ -807,8 +791,7 @@ static inline void audit_free_names(struct audit_context *context)
807 int i; 791 int i;
808 792
809#if AUDIT_DEBUG == 2 793#if AUDIT_DEBUG == 2
810 if (context->auditable 794 if (context->put_count + context->ino_count != context->name_count) {
811 ||context->put_count + context->ino_count != context->name_count) {
812 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" 795 printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
813 " name_count=%d put_count=%d" 796 " name_count=%d put_count=%d"
814 " ino_count=%d [NOT freeing]\n", 797 " ino_count=%d [NOT freeing]\n",
@@ -859,6 +842,7 @@ static inline void audit_zero_context(struct audit_context *context,
859{ 842{
860 memset(context, 0, sizeof(*context)); 843 memset(context, 0, sizeof(*context));
861 context->state = state; 844 context->state = state;
845 context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
862} 846}
863 847
864static inline struct audit_context *audit_alloc_context(enum audit_state state) 848static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -884,18 +868,21 @@ int audit_alloc(struct task_struct *tsk)
884{ 868{
885 struct audit_context *context; 869 struct audit_context *context;
886 enum audit_state state; 870 enum audit_state state;
871 char *key = NULL;
887 872
888 if (likely(!audit_ever_enabled)) 873 if (likely(!audit_ever_enabled))
889 return 0; /* Return if not auditing. */ 874 return 0; /* Return if not auditing. */
890 875
891 state = audit_filter_task(tsk); 876 state = audit_filter_task(tsk, &key);
892 if (likely(state == AUDIT_DISABLED)) 877 if (likely(state == AUDIT_DISABLED))
893 return 0; 878 return 0;
894 879
895 if (!(context = audit_alloc_context(state))) { 880 if (!(context = audit_alloc_context(state))) {
881 kfree(key);
896 audit_log_lost("out of memory in audit_alloc"); 882 audit_log_lost("out of memory in audit_alloc");
897 return -ENOMEM; 883 return -ENOMEM;
898 } 884 }
885 context->filterkey = key;
899 886
900 tsk->audit_context = context; 887 tsk->audit_context = context;
901 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); 888 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
@@ -921,6 +908,7 @@ static inline void audit_free_context(struct audit_context *context)
921 free_tree_refs(context); 908 free_tree_refs(context);
922 audit_free_aux(context); 909 audit_free_aux(context);
923 kfree(context->filterkey); 910 kfree(context->filterkey);
911 kfree(context->sockaddr);
924 kfree(context); 912 kfree(context);
925 context = previous; 913 context = previous;
926 } while (context); 914 } while (context);
@@ -1230,6 +1218,97 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1230 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); 1218 audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
1231} 1219}
1232 1220
1221static void show_special(struct audit_context *context, int *call_panic)
1222{
1223 struct audit_buffer *ab;
1224 int i;
1225
1226 ab = audit_log_start(context, GFP_KERNEL, context->type);
1227 if (!ab)
1228 return;
1229
1230 switch (context->type) {
1231 case AUDIT_SOCKETCALL: {
1232 int nargs = context->socketcall.nargs;
1233 audit_log_format(ab, "nargs=%d", nargs);
1234 for (i = 0; i < nargs; i++)
1235 audit_log_format(ab, " a%d=%lx", i,
1236 context->socketcall.args[i]);
1237 break; }
1238 case AUDIT_IPC: {
1239 u32 osid = context->ipc.osid;
1240
1241 audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
1242 context->ipc.uid, context->ipc.gid, context->ipc.mode);
1243 if (osid) {
1244 char *ctx = NULL;
1245 u32 len;
1246 if (security_secid_to_secctx(osid, &ctx, &len)) {
1247 audit_log_format(ab, " osid=%u", osid);
1248 *call_panic = 1;
1249 } else {
1250 audit_log_format(ab, " obj=%s", ctx);
1251 security_release_secctx(ctx, len);
1252 }
1253 }
1254 if (context->ipc.has_perm) {
1255 audit_log_end(ab);
1256 ab = audit_log_start(context, GFP_KERNEL,
1257 AUDIT_IPC_SET_PERM);
1258 audit_log_format(ab,
1259 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1260 context->ipc.qbytes,
1261 context->ipc.perm_uid,
1262 context->ipc.perm_gid,
1263 context->ipc.perm_mode);
1264 if (!ab)
1265 return;
1266 }
1267 break; }
1268 case AUDIT_MQ_OPEN: {
1269 audit_log_format(ab,
1270 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1271 "mq_msgsize=%ld mq_curmsgs=%ld",
1272 context->mq_open.oflag, context->mq_open.mode,
1273 context->mq_open.attr.mq_flags,
1274 context->mq_open.attr.mq_maxmsg,
1275 context->mq_open.attr.mq_msgsize,
1276 context->mq_open.attr.mq_curmsgs);
1277 break; }
1278 case AUDIT_MQ_SENDRECV: {
1279 audit_log_format(ab,
1280 "mqdes=%d msg_len=%zd msg_prio=%u "
1281 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1282 context->mq_sendrecv.mqdes,
1283 context->mq_sendrecv.msg_len,
1284 context->mq_sendrecv.msg_prio,
1285 context->mq_sendrecv.abs_timeout.tv_sec,
1286 context->mq_sendrecv.abs_timeout.tv_nsec);
1287 break; }
1288 case AUDIT_MQ_NOTIFY: {
1289 audit_log_format(ab, "mqdes=%d sigev_signo=%d",
1290 context->mq_notify.mqdes,
1291 context->mq_notify.sigev_signo);
1292 break; }
1293 case AUDIT_MQ_GETSETATTR: {
1294 struct mq_attr *attr = &context->mq_getsetattr.mqstat;
1295 audit_log_format(ab,
1296 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1297 "mq_curmsgs=%ld ",
1298 context->mq_getsetattr.mqdes,
1299 attr->mq_flags, attr->mq_maxmsg,
1300 attr->mq_msgsize, attr->mq_curmsgs);
1301 break; }
1302 case AUDIT_CAPSET: {
1303 audit_log_format(ab, "pid=%d", context->capset.pid);
1304 audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
1305 audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
1306 audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
1307 break; }
1308 }
1309 audit_log_end(ab);
1310}
1311
1233static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1312static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
1234{ 1313{
1235 const struct cred *cred; 1314 const struct cred *cred;
@@ -1307,94 +1386,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1307 continue; /* audit_panic has been called */ 1386 continue; /* audit_panic has been called */
1308 1387
1309 switch (aux->type) { 1388 switch (aux->type) {
1310 case AUDIT_MQ_OPEN: {
1311 struct audit_aux_data_mq_open *axi = (void *)aux;
1312 audit_log_format(ab,
1313 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
1314 "mq_msgsize=%ld mq_curmsgs=%ld",
1315 axi->oflag, axi->mode, axi->attr.mq_flags,
1316 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
1317 axi->attr.mq_curmsgs);
1318 break; }
1319
1320 case AUDIT_MQ_SENDRECV: {
1321 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
1322 audit_log_format(ab,
1323 "mqdes=%d msg_len=%zd msg_prio=%u "
1324 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
1325 axi->mqdes, axi->msg_len, axi->msg_prio,
1326 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
1327 break; }
1328
1329 case AUDIT_MQ_NOTIFY: {
1330 struct audit_aux_data_mq_notify *axi = (void *)aux;
1331 audit_log_format(ab,
1332 "mqdes=%d sigev_signo=%d",
1333 axi->mqdes,
1334 axi->notification.sigev_signo);
1335 break; }
1336
1337 case AUDIT_MQ_GETSETATTR: {
1338 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
1339 audit_log_format(ab,
1340 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
1341 "mq_curmsgs=%ld ",
1342 axi->mqdes,
1343 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
1344 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
1345 break; }
1346
1347 case AUDIT_IPC: {
1348 struct audit_aux_data_ipcctl *axi = (void *)aux;
1349 audit_log_format(ab,
1350 "ouid=%u ogid=%u mode=%#o",
1351 axi->uid, axi->gid, axi->mode);
1352 if (axi->osid != 0) {
1353 char *ctx = NULL;
1354 u32 len;
1355 if (security_secid_to_secctx(
1356 axi->osid, &ctx, &len)) {
1357 audit_log_format(ab, " osid=%u",
1358 axi->osid);
1359 call_panic = 1;
1360 } else {
1361 audit_log_format(ab, " obj=%s", ctx);
1362 security_release_secctx(ctx, len);
1363 }
1364 }
1365 break; }
1366
1367 case AUDIT_IPC_SET_PERM: {
1368 struct audit_aux_data_ipcctl *axi = (void *)aux;
1369 audit_log_format(ab,
1370 "qbytes=%lx ouid=%u ogid=%u mode=%#o",
1371 axi->qbytes, axi->uid, axi->gid, axi->mode);
1372 break; }
1373 1389
1374 case AUDIT_EXECVE: { 1390 case AUDIT_EXECVE: {
1375 struct audit_aux_data_execve *axi = (void *)aux; 1391 struct audit_aux_data_execve *axi = (void *)aux;
1376 audit_log_execve_info(context, &ab, axi); 1392 audit_log_execve_info(context, &ab, axi);
1377 break; } 1393 break; }
1378 1394
1379 case AUDIT_SOCKETCALL: {
1380 struct audit_aux_data_socketcall *axs = (void *)aux;
1381 audit_log_format(ab, "nargs=%d", axs->nargs);
1382 for (i=0; i<axs->nargs; i++)
1383 audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
1384 break; }
1385
1386 case AUDIT_SOCKADDR: {
1387 struct audit_aux_data_sockaddr *axs = (void *)aux;
1388
1389 audit_log_format(ab, "saddr=");
1390 audit_log_n_hex(ab, axs->a, axs->len);
1391 break; }
1392
1393 case AUDIT_FD_PAIR: {
1394 struct audit_aux_data_fd_pair *axs = (void *)aux;
1395 audit_log_format(ab, "fd0=%d fd1=%d", axs->fd[0], axs->fd[1]);
1396 break; }
1397
1398 case AUDIT_BPRM_FCAPS: { 1395 case AUDIT_BPRM_FCAPS: {
1399 struct audit_aux_data_bprm_fcaps *axs = (void *)aux; 1396 struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
1400 audit_log_format(ab, "fver=%x", axs->fcap_ver); 1397 audit_log_format(ab, "fver=%x", axs->fcap_ver);
@@ -1409,18 +1406,32 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1409 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); 1406 audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
1410 break; } 1407 break; }
1411 1408
1412 case AUDIT_CAPSET: {
1413 struct audit_aux_data_capset *axs = (void *)aux;
1414 audit_log_format(ab, "pid=%d", axs->pid);
1415 audit_log_cap(ab, "cap_pi", &axs->cap.inheritable);
1416 audit_log_cap(ab, "cap_pp", &axs->cap.permitted);
1417 audit_log_cap(ab, "cap_pe", &axs->cap.effective);
1418 break; }
1419
1420 } 1409 }
1421 audit_log_end(ab); 1410 audit_log_end(ab);
1422 } 1411 }
1423 1412
1413 if (context->type)
1414 show_special(context, &call_panic);
1415
1416 if (context->fds[0] >= 0) {
1417 ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR);
1418 if (ab) {
1419 audit_log_format(ab, "fd0=%d fd1=%d",
1420 context->fds[0], context->fds[1]);
1421 audit_log_end(ab);
1422 }
1423 }
1424
1425 if (context->sockaddr_len) {
1426 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR);
1427 if (ab) {
1428 audit_log_format(ab, "saddr=");
1429 audit_log_n_hex(ab, (void *)context->sockaddr,
1430 context->sockaddr_len);
1431 audit_log_end(ab);
1432 }
1433 }
1434
1424 for (aux = context->aux_pids; aux; aux = aux->next) { 1435 for (aux = context->aux_pids; aux; aux = aux->next) {
1425 struct audit_aux_data_pids *axs = (void *)aux; 1436 struct audit_aux_data_pids *axs = (void *)aux;
1426 1437
@@ -1536,7 +1547,7 @@ void audit_free(struct task_struct *tsk)
1536 * We use GFP_ATOMIC here because we might be doing this 1547 * We use GFP_ATOMIC here because we might be doing this
1537 * in the context of the idle thread */ 1548 * in the context of the idle thread */
1538 /* that can happen only if we are called from do_exit() */ 1549 /* that can happen only if we are called from do_exit() */
1539 if (context->in_syscall && context->auditable) 1550 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1540 audit_log_exit(context, tsk); 1551 audit_log_exit(context, tsk);
1541 1552
1542 audit_free_context(context); 1553 audit_free_context(context);
@@ -1620,15 +1631,17 @@ void audit_syscall_entry(int arch, int major,
1620 1631
1621 state = context->state; 1632 state = context->state;
1622 context->dummy = !audit_n_rules; 1633 context->dummy = !audit_n_rules;
1623 if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) 1634 if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
1635 context->prio = 0;
1624 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1636 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1637 }
1625 if (likely(state == AUDIT_DISABLED)) 1638 if (likely(state == AUDIT_DISABLED))
1626 return; 1639 return;
1627 1640
1628 context->serial = 0; 1641 context->serial = 0;
1629 context->ctime = CURRENT_TIME; 1642 context->ctime = CURRENT_TIME;
1630 context->in_syscall = 1; 1643 context->in_syscall = 1;
1631 context->auditable = !!(state == AUDIT_RECORD_CONTEXT); 1644 context->current_state = state;
1632 context->ppid = 0; 1645 context->ppid = 0;
1633} 1646}
1634 1647
@@ -1636,17 +1649,20 @@ void audit_finish_fork(struct task_struct *child)
1636{ 1649{
1637 struct audit_context *ctx = current->audit_context; 1650 struct audit_context *ctx = current->audit_context;
1638 struct audit_context *p = child->audit_context; 1651 struct audit_context *p = child->audit_context;
1639 if (!p || !ctx || !ctx->auditable) 1652 if (!p || !ctx)
1653 return;
1654 if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
1640 return; 1655 return;
1641 p->arch = ctx->arch; 1656 p->arch = ctx->arch;
1642 p->major = ctx->major; 1657 p->major = ctx->major;
1643 memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); 1658 memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
1644 p->ctime = ctx->ctime; 1659 p->ctime = ctx->ctime;
1645 p->dummy = ctx->dummy; 1660 p->dummy = ctx->dummy;
1646 p->auditable = ctx->auditable;
1647 p->in_syscall = ctx->in_syscall; 1661 p->in_syscall = ctx->in_syscall;
1648 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); 1662 p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
1649 p->ppid = current->pid; 1663 p->ppid = current->pid;
1664 p->prio = ctx->prio;
1665 p->current_state = ctx->current_state;
1650} 1666}
1651 1667
1652/** 1668/**
@@ -1670,11 +1686,11 @@ void audit_syscall_exit(int valid, long return_code)
1670 if (likely(!context)) 1686 if (likely(!context))
1671 return; 1687 return;
1672 1688
1673 if (context->in_syscall && context->auditable) 1689 if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
1674 audit_log_exit(context, tsk); 1690 audit_log_exit(context, tsk);
1675 1691
1676 context->in_syscall = 0; 1692 context->in_syscall = 0;
1677 context->auditable = 0; 1693 context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
1678 1694
1679 if (context->previous) { 1695 if (context->previous) {
1680 struct audit_context *new_context = context->previous; 1696 struct audit_context *new_context = context->previous;
@@ -1689,8 +1705,13 @@ void audit_syscall_exit(int valid, long return_code)
1689 context->aux_pids = NULL; 1705 context->aux_pids = NULL;
1690 context->target_pid = 0; 1706 context->target_pid = 0;
1691 context->target_sid = 0; 1707 context->target_sid = 0;
1692 kfree(context->filterkey); 1708 context->sockaddr_len = 0;
1693 context->filterkey = NULL; 1709 context->type = 0;
1710 context->fds[0] = -1;
1711 if (context->state != AUDIT_RECORD_CONTEXT) {
1712 kfree(context->filterkey);
1713 context->filterkey = NULL;
1714 }
1694 tsk->audit_context = context; 1715 tsk->audit_context = context;
1695 } 1716 }
1696} 1717}
@@ -2081,7 +2102,10 @@ int auditsc_get_stamp(struct audit_context *ctx,
2081 t->tv_sec = ctx->ctime.tv_sec; 2102 t->tv_sec = ctx->ctime.tv_sec;
2082 t->tv_nsec = ctx->ctime.tv_nsec; 2103 t->tv_nsec = ctx->ctime.tv_nsec;
2083 *serial = ctx->serial; 2104 *serial = ctx->serial;
2084 ctx->auditable = 1; 2105 if (!ctx->prio) {
2106 ctx->prio = 1;
2107 ctx->current_state = AUDIT_RECORD_CONTEXT;
2108 }
2085 return 1; 2109 return 1;
2086} 2110}
2087 2111
@@ -2127,132 +2151,46 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2127 * @mode: mode bits 2151 * @mode: mode bits
2128 * @u_attr: queue attributes 2152 * @u_attr: queue attributes
2129 * 2153 *
2130 * Returns 0 for success or NULL context or < 0 on error.
2131 */ 2154 */
2132int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) 2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
2133{ 2156{
2134 struct audit_aux_data_mq_open *ax;
2135 struct audit_context *context = current->audit_context; 2157 struct audit_context *context = current->audit_context;
2136 2158
2137 if (!audit_enabled) 2159 if (attr)
2138 return 0; 2160 memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
2139 2161 else
2140 if (likely(!context)) 2162 memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
2141 return 0;
2142
2143 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2144 if (!ax)
2145 return -ENOMEM;
2146
2147 if (u_attr != NULL) {
2148 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
2149 kfree(ax);
2150 return -EFAULT;
2151 }
2152 } else
2153 memset(&ax->attr, 0, sizeof(ax->attr));
2154 2163
2155 ax->oflag = oflag; 2164 context->mq_open.oflag = oflag;
2156 ax->mode = mode; 2165 context->mq_open.mode = mode;
2157 2166
2158 ax->d.type = AUDIT_MQ_OPEN; 2167 context->type = AUDIT_MQ_OPEN;
2159 ax->d.next = context->aux;
2160 context->aux = (void *)ax;
2161 return 0;
2162} 2168}
2163 2169
2164/** 2170/**
2165 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send 2171 * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
2166 * @mqdes: MQ descriptor 2172 * @mqdes: MQ descriptor
2167 * @msg_len: Message length 2173 * @msg_len: Message length
2168 * @msg_prio: Message priority 2174 * @msg_prio: Message priority
2169 * @u_abs_timeout: Message timeout in absolute time 2175 * @abs_timeout: Message timeout in absolute time
2170 * 2176 *
2171 * Returns 0 for success or NULL context or < 0 on error.
2172 */ 2177 */
2173int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, 2178void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
2174 const struct timespec __user *u_abs_timeout) 2179 const struct timespec *abs_timeout)
2175{ 2180{
2176 struct audit_aux_data_mq_sendrecv *ax;
2177 struct audit_context *context = current->audit_context; 2181 struct audit_context *context = current->audit_context;
2182 struct timespec *p = &context->mq_sendrecv.abs_timeout;
2178 2183
2179 if (!audit_enabled) 2184 if (abs_timeout)
2180 return 0; 2185 memcpy(p, abs_timeout, sizeof(struct timespec));
2181 2186 else
2182 if (likely(!context)) 2187 memset(p, 0, sizeof(struct timespec));
2183 return 0;
2184
2185 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2186 if (!ax)
2187 return -ENOMEM;
2188
2189 if (u_abs_timeout != NULL) {
2190 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2191 kfree(ax);
2192 return -EFAULT;
2193 }
2194 } else
2195 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2196
2197 ax->mqdes = mqdes;
2198 ax->msg_len = msg_len;
2199 ax->msg_prio = msg_prio;
2200
2201 ax->d.type = AUDIT_MQ_SENDRECV;
2202 ax->d.next = context->aux;
2203 context->aux = (void *)ax;
2204 return 0;
2205}
2206
2207/**
2208 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
2209 * @mqdes: MQ descriptor
2210 * @msg_len: Message length
2211 * @u_msg_prio: Message priority
2212 * @u_abs_timeout: Message timeout in absolute time
2213 *
2214 * Returns 0 for success or NULL context or < 0 on error.
2215 */
2216int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2217 unsigned int __user *u_msg_prio,
2218 const struct timespec __user *u_abs_timeout)
2219{
2220 struct audit_aux_data_mq_sendrecv *ax;
2221 struct audit_context *context = current->audit_context;
2222
2223 if (!audit_enabled)
2224 return 0;
2225
2226 if (likely(!context))
2227 return 0;
2228
2229 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2230 if (!ax)
2231 return -ENOMEM;
2232
2233 if (u_msg_prio != NULL) {
2234 if (get_user(ax->msg_prio, u_msg_prio)) {
2235 kfree(ax);
2236 return -EFAULT;
2237 }
2238 } else
2239 ax->msg_prio = 0;
2240
2241 if (u_abs_timeout != NULL) {
2242 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
2243 kfree(ax);
2244 return -EFAULT;
2245 }
2246 } else
2247 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
2248 2188
2249 ax->mqdes = mqdes; 2189 context->mq_sendrecv.mqdes = mqdes;
2250 ax->msg_len = msg_len; 2190 context->mq_sendrecv.msg_len = msg_len;
2191 context->mq_sendrecv.msg_prio = msg_prio;
2251 2192
2252 ax->d.type = AUDIT_MQ_SENDRECV; 2193 context->type = AUDIT_MQ_SENDRECV;
2253 ax->d.next = context->aux;
2254 context->aux = (void *)ax;
2255 return 0;
2256} 2194}
2257 2195
2258/** 2196/**
@@ -2260,38 +2198,19 @@ int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
2260 * @mqdes: MQ descriptor 2198 * @mqdes: MQ descriptor
2261 * @u_notification: Notification event 2199 * @u_notification: Notification event
2262 * 2200 *
2263 * Returns 0 for success or NULL context or < 0 on error.
2264 */ 2201 */
2265 2202
2266int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) 2203void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
2267{ 2204{
2268 struct audit_aux_data_mq_notify *ax;
2269 struct audit_context *context = current->audit_context; 2205 struct audit_context *context = current->audit_context;
2270 2206
2271 if (!audit_enabled) 2207 if (notification)
2272 return 0; 2208 context->mq_notify.sigev_signo = notification->sigev_signo;
2273 2209 else
2274 if (likely(!context)) 2210 context->mq_notify.sigev_signo = 0;
2275 return 0;
2276
2277 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2278 if (!ax)
2279 return -ENOMEM;
2280
2281 if (u_notification != NULL) {
2282 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
2283 kfree(ax);
2284 return -EFAULT;
2285 }
2286 } else
2287 memset(&ax->notification, 0, sizeof(ax->notification));
2288
2289 ax->mqdes = mqdes;
2290 2211
2291 ax->d.type = AUDIT_MQ_NOTIFY; 2212 context->mq_notify.mqdes = mqdes;
2292 ax->d.next = context->aux; 2213 context->type = AUDIT_MQ_NOTIFY;
2293 context->aux = (void *)ax;
2294 return 0;
2295} 2214}
2296 2215
2297/** 2216/**
@@ -2299,55 +2218,29 @@ int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
2299 * @mqdes: MQ descriptor 2218 * @mqdes: MQ descriptor
2300 * @mqstat: MQ flags 2219 * @mqstat: MQ flags
2301 * 2220 *
2302 * Returns 0 for success or NULL context or < 0 on error.
2303 */ 2221 */
2304int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) 2222void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
2305{ 2223{
2306 struct audit_aux_data_mq_getsetattr *ax;
2307 struct audit_context *context = current->audit_context; 2224 struct audit_context *context = current->audit_context;
2308 2225 context->mq_getsetattr.mqdes = mqdes;
2309 if (!audit_enabled) 2226 context->mq_getsetattr.mqstat = *mqstat;
2310 return 0; 2227 context->type = AUDIT_MQ_GETSETATTR;
2311
2312 if (likely(!context))
2313 return 0;
2314
2315 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
2316 if (!ax)
2317 return -ENOMEM;
2318
2319 ax->mqdes = mqdes;
2320 ax->mqstat = *mqstat;
2321
2322 ax->d.type = AUDIT_MQ_GETSETATTR;
2323 ax->d.next = context->aux;
2324 context->aux = (void *)ax;
2325 return 0;
2326} 2228}
2327 2229
2328/** 2230/**
2329 * audit_ipc_obj - record audit data for ipc object 2231 * audit_ipc_obj - record audit data for ipc object
2330 * @ipcp: ipc permissions 2232 * @ipcp: ipc permissions
2331 * 2233 *
2332 * Returns 0 for success or NULL context or < 0 on error.
2333 */ 2234 */
2334int __audit_ipc_obj(struct kern_ipc_perm *ipcp) 2235void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2335{ 2236{
2336 struct audit_aux_data_ipcctl *ax;
2337 struct audit_context *context = current->audit_context; 2237 struct audit_context *context = current->audit_context;
2338 2238 context->ipc.uid = ipcp->uid;
2339 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2239 context->ipc.gid = ipcp->gid;
2340 if (!ax) 2240 context->ipc.mode = ipcp->mode;
2341 return -ENOMEM; 2241 context->ipc.has_perm = 0;
2342 2242 security_ipc_getsecid(ipcp, &context->ipc.osid);
2343 ax->uid = ipcp->uid; 2243 context->type = AUDIT_IPC;
2344 ax->gid = ipcp->gid;
2345 ax->mode = ipcp->mode;
2346 security_ipc_getsecid(ipcp, &ax->osid);
2347 ax->d.type = AUDIT_IPC;
2348 ax->d.next = context->aux;
2349 context->aux = (void *)ax;
2350 return 0;
2351} 2244}
2352 2245
2353/** 2246/**
@@ -2357,26 +2250,17 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
2357 * @gid: msgq group id 2250 * @gid: msgq group id
2358 * @mode: msgq mode (permissions) 2251 * @mode: msgq mode (permissions)
2359 * 2252 *
2360 * Returns 0 for success or NULL context or < 0 on error. 2253 * Called only after audit_ipc_obj().
2361 */ 2254 */
2362int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) 2255void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
2363{ 2256{
2364 struct audit_aux_data_ipcctl *ax;
2365 struct audit_context *context = current->audit_context; 2257 struct audit_context *context = current->audit_context;
2366 2258
2367 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 2259 context->ipc.qbytes = qbytes;
2368 if (!ax) 2260 context->ipc.perm_uid = uid;
2369 return -ENOMEM; 2261 context->ipc.perm_gid = gid;
2370 2262 context->ipc.perm_mode = mode;
2371 ax->qbytes = qbytes; 2263 context->ipc.has_perm = 1;
2372 ax->uid = uid;
2373 ax->gid = gid;
2374 ax->mode = mode;
2375
2376 ax->d.type = AUDIT_IPC_SET_PERM;
2377 ax->d.next = context->aux;
2378 context->aux = (void *)ax;
2379 return 0;
2380} 2264}
2381 2265
2382int audit_bprm(struct linux_binprm *bprm) 2266int audit_bprm(struct linux_binprm *bprm)
@@ -2406,27 +2290,17 @@ int audit_bprm(struct linux_binprm *bprm)
2406 * @nargs: number of args 2290 * @nargs: number of args
2407 * @args: args array 2291 * @args: args array
2408 * 2292 *
2409 * Returns 0 for success or NULL context or < 0 on error.
2410 */ 2293 */
2411int audit_socketcall(int nargs, unsigned long *args) 2294void audit_socketcall(int nargs, unsigned long *args)
2412{ 2295{
2413 struct audit_aux_data_socketcall *ax;
2414 struct audit_context *context = current->audit_context; 2296 struct audit_context *context = current->audit_context;
2415 2297
2416 if (likely(!context || context->dummy)) 2298 if (likely(!context || context->dummy))
2417 return 0; 2299 return;
2418
2419 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
2420 if (!ax)
2421 return -ENOMEM;
2422
2423 ax->nargs = nargs;
2424 memcpy(ax->args, args, nargs * sizeof(unsigned long));
2425 2300
2426 ax->d.type = AUDIT_SOCKETCALL; 2301 context->type = AUDIT_SOCKETCALL;
2427 ax->d.next = context->aux; 2302 context->socketcall.nargs = nargs;
2428 context->aux = (void *)ax; 2303 memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
2429 return 0;
2430} 2304}
2431 2305
2432/** 2306/**
@@ -2434,29 +2308,12 @@ int audit_socketcall(int nargs, unsigned long *args)
2434 * @fd1: the first file descriptor 2308 * @fd1: the first file descriptor
2435 * @fd2: the second file descriptor 2309 * @fd2: the second file descriptor
2436 * 2310 *
2437 * Returns 0 for success or NULL context or < 0 on error.
2438 */ 2311 */
2439int __audit_fd_pair(int fd1, int fd2) 2312void __audit_fd_pair(int fd1, int fd2)
2440{ 2313{
2441 struct audit_context *context = current->audit_context; 2314 struct audit_context *context = current->audit_context;
2442 struct audit_aux_data_fd_pair *ax; 2315 context->fds[0] = fd1;
2443 2316 context->fds[1] = fd2;
2444 if (likely(!context)) {
2445 return 0;
2446 }
2447
2448 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2449 if (!ax) {
2450 return -ENOMEM;
2451 }
2452
2453 ax->fd[0] = fd1;
2454 ax->fd[1] = fd2;
2455
2456 ax->d.type = AUDIT_FD_PAIR;
2457 ax->d.next = context->aux;
2458 context->aux = (void *)ax;
2459 return 0;
2460} 2317}
2461 2318
2462/** 2319/**
@@ -2468,22 +2325,20 @@ int __audit_fd_pair(int fd1, int fd2)
2468 */ 2325 */
2469int audit_sockaddr(int len, void *a) 2326int audit_sockaddr(int len, void *a)
2470{ 2327{
2471 struct audit_aux_data_sockaddr *ax;
2472 struct audit_context *context = current->audit_context; 2328 struct audit_context *context = current->audit_context;
2473 2329
2474 if (likely(!context || context->dummy)) 2330 if (likely(!context || context->dummy))
2475 return 0; 2331 return 0;
2476 2332
2477 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); 2333 if (!context->sockaddr) {
2478 if (!ax) 2334 void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
2479 return -ENOMEM; 2335 if (!p)
2480 2336 return -ENOMEM;
2481 ax->len = len; 2337 context->sockaddr = p;
2482 memcpy(ax->a, a, len); 2338 }
2483 2339
2484 ax->d.type = AUDIT_SOCKADDR; 2340 context->sockaddr_len = len;
2485 ax->d.next = context->aux; 2341 memcpy(context->sockaddr, a, len);
2486 context->aux = (void *)ax;
2487 return 0; 2342 return 0;
2488} 2343}
2489 2344
@@ -2617,29 +2472,15 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
2617 * Record the arguments userspace sent to sys_capset for later printing by the 2472 * Record the arguments userspace sent to sys_capset for later printing by the
2618 * audit system if applicable 2473 * audit system if applicable
2619 */ 2474 */
2620int __audit_log_capset(pid_t pid, 2475void __audit_log_capset(pid_t pid,
2621 const struct cred *new, const struct cred *old) 2476 const struct cred *new, const struct cred *old)
2622{ 2477{
2623 struct audit_aux_data_capset *ax;
2624 struct audit_context *context = current->audit_context; 2478 struct audit_context *context = current->audit_context;
2625 2479 context->capset.pid = pid;
2626 if (likely(!audit_enabled || !context || context->dummy)) 2480 context->capset.cap.effective = new->cap_effective;
2627 return 0; 2481 context->capset.cap.inheritable = new->cap_effective;
2628 2482 context->capset.cap.permitted = new->cap_permitted;
2629 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2483 context->type = AUDIT_CAPSET;
2630 if (!ax)
2631 return -ENOMEM;
2632
2633 ax->d.type = AUDIT_CAPSET;
2634 ax->d.next = context->aux;
2635 context->aux = (void *)ax;
2636
2637 ax->pid = pid;
2638 ax->cap.effective = new->cap_effective;
2639 ax->cap.inheritable = new->cap_effective;
2640 ax->cap.permitted = new->cap_permitted;
2641
2642 return 0;
2643} 2484}
2644 2485
2645/** 2486/**
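The auditsc.c changes above drop the kmalloc'd audit_aux_data records that were chained on context->aux and instead keep each "special" record in fixed fields of struct audit_context, selected by a single context->type discriminator that show_special() switches on (the old auditable flag is likewise replaced by the prio/current_state pair). As a minimal, self-contained sketch of that no-allocation dispatch pattern only — all names below (example_context, record_socketcall, and so on) are hypothetical and not kernel API:

#include <stdio.h>
#include <string.h>

enum record_type { REC_NONE = 0, REC_SOCKETCALL, REC_MQ_NOTIFY };

struct example_context {
	enum record_type type;              /* which member of u is valid */
	union {
		struct { int nargs; unsigned long args[6]; } socketcall;
		struct { int mqdes; int sigev_signo; } mq_notify;
	} u;
};

/* Record a socketcall in place -- no allocation, mirroring audit_socketcall(). */
static void record_socketcall(struct example_context *ctx, int nargs,
			      const unsigned long *args)
{
	ctx->type = REC_SOCKETCALL;
	ctx->u.socketcall.nargs = nargs;
	memcpy(ctx->u.socketcall.args, args, nargs * sizeof(unsigned long));
}

/* Emit the single record selected by ctx->type, mirroring show_special(). */
static void show_special(const struct example_context *ctx)
{
	int i;

	switch (ctx->type) {
	case REC_SOCKETCALL:
		printf("nargs=%d", ctx->u.socketcall.nargs);
		for (i = 0; i < ctx->u.socketcall.nargs; i++)
			printf(" a%d=%lx", i, ctx->u.socketcall.args[i]);
		printf("\n");
		break;
	case REC_MQ_NOTIFY:
		printf("mqdes=%d sigev_signo=%d\n",
		       ctx->u.mq_notify.mqdes, ctx->u.mq_notify.sigev_signo);
		break;
	default:
		break;
	}
}

int main(void)
{
	struct example_context ctx = { REC_NONE };
	unsigned long args[3] = { 1, 2, 3 };

	record_socketcall(&ctx, 3, args);
	show_special(&ctx);	/* prints: nargs=3 a0=1 a1=2 a2=3 */
	return 0;
}

Keeping one fixed-size slot per context trades the flexibility of an aux list for simpler cleanup and no allocation-failure paths in the syscall hot path, which is what lets the __audit_mq_* and __audit_ipc_* helpers above become void.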
diff --git a/kernel/capability.c b/kernel/capability.c
index df62f53f84ac..4e17041963f5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -161,7 +161,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
161 * 161 *
162 * Returns 0 on success and < 0 on error. 162 * Returns 0 on success and < 0 on error.
163 */ 163 */
164asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) 164SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
165{ 165{
166 int ret = 0; 166 int ret = 0;
167 pid_t pid; 167 pid_t pid;
@@ -235,7 +235,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
235 * 235 *
236 * Returns 0 on success and < 0 on error. 236 * Returns 0 on success and < 0 on error.
237 */ 237 */
238asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) 238SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
239{ 239{
240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 240 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
241 unsigned i, tocopy; 241 unsigned i, tocopy;
@@ -280,9 +280,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
280 if (ret < 0) 280 if (ret < 0)
281 goto error; 281 goto error;
282 282
283 ret = audit_log_capset(pid, new, current_cred()); 283 audit_log_capset(pid, new, current_cred());
284 if (ret < 0)
285 return ret;
286 284
287 return commit_creds(new); 285 return commit_creds(new);
288 286
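The capability.c hunks switch sys_capget() and sys_capset() from open-coded asmlinkage definitions to SYSCALL_DEFINE2(), and drop the return-value check around audit_log_capset() now that it is void. For illustration of the wrapper-macro shape only, here is a toy two-argument definer; TOY_SYSCALL_DEFINE2 and everything in it are made up for this sketch, and the real SYSCALL_DEFINEn macros do more (for example, argument sign-extension wrappers on some 64-bit architectures):

#include <stdio.h>

/*
 * Toy stand-in for SYSCALL_DEFINE2(name, t1, a1, t2, a2): it emits a public
 * toy_sys_<name>() entry point that forwards to a static body, so the body
 * can be written like an ordinary function definition.
 */
#define TOY_SYSCALL_DEFINE2(name, t1, a1, t2, a2)			\
	static long toy_##name##_body(t1 a1, t2 a2);			\
	long toy_sys_##name(t1 a1, t2 a2)				\
	{								\
		return toy_##name##_body(a1, a2);			\
	}								\
	static long toy_##name##_body(t1 a1, t2 a2)

/* A capget-shaped definition: header/dataptr are plain ints in this sketch. */
TOY_SYSCALL_DEFINE2(capget, int, header, int, dataptr)
{
	return (long)header + dataptr;
}

int main(void)
{
	printf("%ld\n", toy_sys_capget(2, 3));	/* prints 5 */
	return 0;
}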
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 48348dde6d81..5a54ff42874e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
84 /* Tracks how many cgroups are currently defined in hierarchy.*/ 84 /* Tracks how many cgroups are currently defined in hierarchy.*/
85 int number_of_cgroups; 85 int number_of_cgroups;
86 86
87 /* A list running through the mounted hierarchies */ 87 /* A list running through the active hierarchies */
88 struct list_head root_list; 88 struct list_head root_list;
89 89
90 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
116 * be called. 116 * be called.
117 */ 117 */
118static int need_forkexit_callback __read_mostly; 118static int need_forkexit_callback __read_mostly;
119static int need_mm_owner_callback __read_mostly;
120 119
121/* convenient tests for these bits */ 120/* convenient tests for these bits */
122inline int cgroup_is_removed(const struct cgroup *cgrp) 121inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
149#define for_each_subsys(_root, _ss) \ 148#define for_each_subsys(_root, _ss) \
150list_for_each_entry(_ss, &_root->subsys_list, sibling) 149list_for_each_entry(_ss, &_root->subsys_list, sibling)
151 150
152/* for_each_root() allows you to iterate across the active hierarchies */ 151/* for_each_active_root() allows you to iterate across the active hierarchies */
153#define for_each_root(_root) \ 152#define for_each_active_root(_root) \
154list_for_each_entry(_root, &roots, root_list) 153list_for_each_entry(_root, &roots, root_list)
155 154
156/* the list of cgroups eligible for automatic release. Protected by 155/* the list of cgroups eligible for automatic release. Protected by
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
272 271
273 rcu_read_lock(); 272 rcu_read_lock();
274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 273 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
275 struct cgroup *cgrp = cg->subsys[i]->cgroup; 274 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
276 if (atomic_dec_and_test(&cgrp->count) && 275 if (atomic_dec_and_test(&cgrp->count) &&
277 notify_on_release(cgrp)) { 276 notify_on_release(cgrp)) {
278 if (taskexit) 277 if (taskexit)
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
385 return 0; 384 return 0;
386} 385}
387 386
387/**
388 * link_css_set - a helper function to link a css_set to a cgroup
389 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
390 * @cg: the css_set to be linked
391 * @cgrp: the destination cgroup
392 */
393static void link_css_set(struct list_head *tmp_cg_links,
394 struct css_set *cg, struct cgroup *cgrp)
395{
396 struct cg_cgroup_link *link;
397
398 BUG_ON(list_empty(tmp_cg_links));
399 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
400 cgrp_link_list);
401 link->cg = cg;
402 list_move(&link->cgrp_link_list, &cgrp->css_sets);
403 list_add(&link->cg_link_list, &cg->cg_links);
404}
405
388/* 406/*
389 * find_css_set() takes an existing cgroup group and a 407 * find_css_set() takes an existing cgroup group and a
390 * cgroup object, and returns a css_set object that's 408 * cgroup object, and returns a css_set object that's
@@ -400,7 +418,6 @@ static struct css_set *find_css_set(
400 int i; 418 int i;
401 419
402 struct list_head tmp_cg_links; 420 struct list_head tmp_cg_links;
403 struct cg_cgroup_link *link;
404 421
405 struct hlist_head *hhead; 422 struct hlist_head *hhead;
406 423
@@ -445,26 +462,11 @@ static struct css_set *find_css_set(
445 * only do it for the first subsystem in each 462 * only do it for the first subsystem in each
446 * hierarchy 463 * hierarchy
447 */ 464 */
448 if (ss->root->subsys_list.next == &ss->sibling) { 465 if (ss->root->subsys_list.next == &ss->sibling)
449 BUG_ON(list_empty(&tmp_cg_links)); 466 link_css_set(&tmp_cg_links, res, cgrp);
450 link = list_entry(tmp_cg_links.next,
451 struct cg_cgroup_link,
452 cgrp_link_list);
453 list_del(&link->cgrp_link_list);
454 list_add(&link->cgrp_link_list, &cgrp->css_sets);
455 link->cg = res;
456 list_add(&link->cg_link_list, &res->cg_links);
457 }
458 }
459 if (list_empty(&rootnode.subsys_list)) {
460 link = list_entry(tmp_cg_links.next,
461 struct cg_cgroup_link,
462 cgrp_link_list);
463 list_del(&link->cgrp_link_list);
464 list_add(&link->cgrp_link_list, &dummytop->css_sets);
465 link->cg = res;
466 list_add(&link->cg_link_list, &res->cg_links);
467 } 467 }
468 if (list_empty(&rootnode.subsys_list))
469 link_css_set(&tmp_cg_links, res, dummytop);
468 470
469 BUG_ON(!list_empty(&tmp_cg_links)); 471 BUG_ON(!list_empty(&tmp_cg_links));
470 472
@@ -573,7 +575,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
573 inode->i_mode = mode; 575 inode->i_mode = mode;
574 inode->i_uid = current_fsuid(); 576 inode->i_uid = current_fsuid();
575 inode->i_gid = current_fsgid(); 577 inode->i_gid = current_fsgid();
576 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 578 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 579 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 580 }
@@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
588{ 589{
589 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
590 for_each_subsys(cgrp->root, ss) 591 for_each_subsys(cgrp->root, ss)
591 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 592 if (ss->pre_destroy)
592 ss->pre_destroy(ss, cgrp); 593 ss->pre_destroy(ss, cgrp);
593 return; 594 return;
594} 595}
595 596
597static void free_cgroup_rcu(struct rcu_head *obj)
598{
599 struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
600
601 kfree(cgrp);
602}
603
596static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
597{ 605{
598 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
612 /* 620 /*
613 * Release the subsystem state objects. 621 * Release the subsystem state objects.
614 */ 622 */
615 for_each_subsys(cgrp->root, ss) { 623 for_each_subsys(cgrp->root, ss)
616 if (cgrp->subsys[ss->subsys_id]) 624 ss->destroy(ss, cgrp);
617 ss->destroy(ss, cgrp);
618 }
619 625
620 cgrp->root->number_of_cgroups--; 626 cgrp->root->number_of_cgroups--;
621 mutex_unlock(&cgroup_mutex); 627 mutex_unlock(&cgroup_mutex);
622 628
623 /* Drop the active superblock reference that we took when we 629 /*
624 * created the cgroup */ 630 * Drop the active superblock reference that we took when we
631 * created the cgroup
632 */
625 deactivate_super(cgrp->root->sb); 633 deactivate_super(cgrp->root->sb);
626 634
627 kfree(cgrp); 635 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
628 } 636 }
629 iput(inode); 637 iput(inode);
630} 638}
@@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
714 BUG_ON(cgrp->subsys[i]); 722 BUG_ON(cgrp->subsys[i]);
715 BUG_ON(!dummytop->subsys[i]); 723 BUG_ON(!dummytop->subsys[i]);
716 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 724 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
725 mutex_lock(&ss->hierarchy_mutex);
717 cgrp->subsys[i] = dummytop->subsys[i]; 726 cgrp->subsys[i] = dummytop->subsys[i];
718 cgrp->subsys[i]->cgroup = cgrp; 727 cgrp->subsys[i]->cgroup = cgrp;
719 list_add(&ss->sibling, &root->subsys_list); 728 list_move(&ss->sibling, &root->subsys_list);
720 rcu_assign_pointer(ss->root, root); 729 ss->root = root;
721 if (ss->bind) 730 if (ss->bind)
722 ss->bind(ss, cgrp); 731 ss->bind(ss, cgrp);
723 732 mutex_unlock(&ss->hierarchy_mutex);
724 } else if (bit & removed_bits) { 733 } else if (bit & removed_bits) {
725 /* We're removing this subsystem */ 734 /* We're removing this subsystem */
726 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 735 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
727 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 736 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
737 mutex_lock(&ss->hierarchy_mutex);
728 if (ss->bind) 738 if (ss->bind)
729 ss->bind(ss, dummytop); 739 ss->bind(ss, dummytop);
730 dummytop->subsys[i]->cgroup = dummytop; 740 dummytop->subsys[i]->cgroup = dummytop;
731 cgrp->subsys[i] = NULL; 741 cgrp->subsys[i] = NULL;
732 rcu_assign_pointer(subsys[i]->root, &rootnode); 742 subsys[i]->root = &rootnode;
733 list_del(&ss->sibling); 743 list_move(&ss->sibling, &rootnode.subsys_list);
744 mutex_unlock(&ss->hierarchy_mutex);
734 } else if (bit & final_bits) { 745 } else if (bit & final_bits) {
735 /* Subsystem state should already exist */ 746 /* Subsystem state should already exist */
736 BUG_ON(!cgrp->subsys[i]); 747 BUG_ON(!cgrp->subsys[i]);
@@ -992,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
992 root = NULL; 1003 root = NULL;
993 } else { 1004 } else {
994 /* New superblock */ 1005 /* New superblock */
995 struct cgroup *cgrp = &root->top_cgroup; 1006 struct cgroup *root_cgrp = &root->top_cgroup;
996 struct inode *inode; 1007 struct inode *inode;
997 int i; 1008 int i;
998 1009
@@ -1033,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1033 list_add(&root->root_list, &roots); 1044 list_add(&root->root_list, &roots);
1034 root_count++; 1045 root_count++;
1035 1046
1036 sb->s_root->d_fsdata = &root->top_cgroup; 1047 sb->s_root->d_fsdata = root_cgrp;
1037 root->top_cgroup.dentry = sb->s_root; 1048 root->top_cgroup.dentry = sb->s_root;
1038 1049
1039 /* Link the top cgroup in this hierarchy into all 1050 /* Link the top cgroup in this hierarchy into all
@@ -1044,29 +1055,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1044 struct hlist_node *node; 1055 struct hlist_node *node;
1045 struct css_set *cg; 1056 struct css_set *cg;
1046 1057
1047 hlist_for_each_entry(cg, node, hhead, hlist) { 1058 hlist_for_each_entry(cg, node, hhead, hlist)
1048 struct cg_cgroup_link *link; 1059 link_css_set(&tmp_cg_links, cg, root_cgrp);
1049
1050 BUG_ON(list_empty(&tmp_cg_links));
1051 link = list_entry(tmp_cg_links.next,
1052 struct cg_cgroup_link,
1053 cgrp_link_list);
1054 list_del(&link->cgrp_link_list);
1055 link->cg = cg;
1056 list_add(&link->cgrp_link_list,
1057 &root->top_cgroup.css_sets);
1058 list_add(&link->cg_link_list, &cg->cg_links);
1059 }
1060 } 1060 }
1061 write_unlock(&css_set_lock); 1061 write_unlock(&css_set_lock);
1062 1062
1063 free_cg_links(&tmp_cg_links); 1063 free_cg_links(&tmp_cg_links);
1064 1064
1065 BUG_ON(!list_empty(&cgrp->sibling)); 1065 BUG_ON(!list_empty(&root_cgrp->sibling));
1066 BUG_ON(!list_empty(&cgrp->children)); 1066 BUG_ON(!list_empty(&root_cgrp->children));
1067 BUG_ON(root->number_of_cgroups != 1); 1067 BUG_ON(root->number_of_cgroups != 1);
1068 1068
1069 cgroup_populate_dir(cgrp); 1069 cgroup_populate_dir(root_cgrp);
1070 mutex_unlock(&inode->i_mutex); 1070 mutex_unlock(&inode->i_mutex);
1071 mutex_unlock(&cgroup_mutex); 1071 mutex_unlock(&cgroup_mutex);
1072 } 1072 }
@@ -1119,6 +1119,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1119 list_del(&root->root_list); 1119 list_del(&root->root_list);
1120 root_count--; 1120 root_count--;
1121 } 1121 }
1122
1122 mutex_unlock(&cgroup_mutex); 1123 mutex_unlock(&cgroup_mutex);
1123 1124
1124 kfree(root); 1125 kfree(root);
@@ -1147,14 +1148,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1147 * @buf: the buffer to write the path into 1148 * @buf: the buffer to write the path into
1148 * @buflen: the length of the buffer 1149 * @buflen: the length of the buffer
1149 * 1150 *
1150 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1151 * Called with cgroup_mutex held or else with an RCU-protected cgroup
1151 * Returns 0 on success, -errno on error. 1152 * reference. Writes path of cgroup into buf. Returns 0 on success,
1153 * -errno on error.
1152 */ 1154 */
1153int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1155int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1154{ 1156{
1155 char *start; 1157 char *start;
1158 struct dentry *dentry = rcu_dereference(cgrp->dentry);
1156 1159
1157 if (cgrp == dummytop) { 1160 if (!dentry || cgrp == dummytop) {
1158 /* 1161 /*
1159 * Inactive subsystems have no dentry for their root 1162 * Inactive subsystems have no dentry for their root
1160 * cgroup 1163 * cgroup
@@ -1167,13 +1170,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1167 1170
1168 *--start = '\0'; 1171 *--start = '\0';
1169 for (;;) { 1172 for (;;) {
1170 int len = cgrp->dentry->d_name.len; 1173 int len = dentry->d_name.len;
1171 if ((start -= len) < buf) 1174 if ((start -= len) < buf)
1172 return -ENAMETOOLONG; 1175 return -ENAMETOOLONG;
1173 memcpy(start, cgrp->dentry->d_name.name, len); 1176 memcpy(start, cgrp->dentry->d_name.name, len);
1174 cgrp = cgrp->parent; 1177 cgrp = cgrp->parent;
1175 if (!cgrp) 1178 if (!cgrp)
1176 break; 1179 break;
1180 dentry = rcu_dereference(cgrp->dentry);
1177 if (!cgrp->parent) 1181 if (!cgrp->parent)
1178 continue; 1182 continue;
1179 if (--start < buf) 1183 if (--start < buf)
@@ -1218,7 +1222,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1218 int retval = 0; 1222 int retval = 0;
1219 struct cgroup_subsys *ss; 1223 struct cgroup_subsys *ss;
1220 struct cgroup *oldcgrp; 1224 struct cgroup *oldcgrp;
1221 struct css_set *cg = tsk->cgroups; 1225 struct css_set *cg;
1222 struct css_set *newcg; 1226 struct css_set *newcg;
1223 struct cgroupfs_root *root = cgrp->root; 1227 struct cgroupfs_root *root = cgrp->root;
1224 int subsys_id; 1228 int subsys_id;
@@ -1238,11 +1242,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1238 } 1242 }
1239 } 1243 }
1240 1244
1245 task_lock(tsk);
1246 cg = tsk->cgroups;
1247 get_css_set(cg);
1248 task_unlock(tsk);
1241 /* 1249 /*
1242 * Locate or allocate a new css_set for this task, 1250 * Locate or allocate a new css_set for this task,
1243 * based on its final set of cgroups 1251 * based on its final set of cgroups
1244 */ 1252 */
1245 newcg = find_css_set(cg, cgrp); 1253 newcg = find_css_set(cg, cgrp);
1254 put_css_set(cg);
1246 if (!newcg) 1255 if (!newcg)
1247 return -ENOMEM; 1256 return -ENOMEM;
1248 1257
@@ -1447,7 +1456,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1447 struct cftype *cft = __d_cft(file->f_dentry); 1456 struct cftype *cft = __d_cft(file->f_dentry);
1448 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1457 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1449 1458
1450 if (!cft || cgroup_is_removed(cgrp)) 1459 if (cgroup_is_removed(cgrp))
1451 return -ENODEV; 1460 return -ENODEV;
1452 if (cft->write) 1461 if (cft->write)
1453 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1462 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1492,7 +1501,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1492 struct cftype *cft = __d_cft(file->f_dentry); 1501 struct cftype *cft = __d_cft(file->f_dentry);
1493 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1502 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1494 1503
1495 if (!cft || cgroup_is_removed(cgrp)) 1504 if (cgroup_is_removed(cgrp))
1496 return -ENODEV; 1505 return -ENODEV;
1497 1506
1498 if (cft->read) 1507 if (cft->read)
@@ -1556,10 +1565,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
1556 err = generic_file_open(inode, file); 1565 err = generic_file_open(inode, file);
1557 if (err) 1566 if (err)
1558 return err; 1567 return err;
1559
1560 cft = __d_cft(file->f_dentry); 1568 cft = __d_cft(file->f_dentry);
1561 if (!cft) 1569
1562 return -ENODEV;
1563 if (cft->read_map || cft->read_seq_string) { 1570 if (cft->read_map || cft->read_seq_string) {
1564 struct cgroup_seqfile_state *state = 1571 struct cgroup_seqfile_state *state =
1565 kzalloc(sizeof(*state), GFP_USER); 1572 kzalloc(sizeof(*state), GFP_USER);
@@ -1673,7 +1680,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1673 if (!error) { 1680 if (!error) {
1674 dentry->d_fsdata = cgrp; 1681 dentry->d_fsdata = cgrp;
1675 inc_nlink(parent->d_inode); 1682 inc_nlink(parent->d_inode);
1676 cgrp->dentry = dentry; 1683 rcu_assign_pointer(cgrp->dentry, dentry);
1677 dget(dentry); 1684 dget(dentry);
1678 } 1685 }
1679 dput(dentry); 1686 dput(dentry);
@@ -1814,6 +1821,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1814{ 1821{
1815 struct task_struct *res; 1822 struct task_struct *res;
1816 struct list_head *l = it->task; 1823 struct list_head *l = it->task;
1824 struct cg_cgroup_link *link;
1817 1825
1818 /* If the iterator cg is NULL, we have no tasks */ 1826 /* If the iterator cg is NULL, we have no tasks */
1819 if (!it->cg_link) 1827 if (!it->cg_link)
@@ -1821,7 +1829,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1821 res = list_entry(l, struct task_struct, cg_list); 1829 res = list_entry(l, struct task_struct, cg_list);
1822 /* Advance iterator to find next entry */ 1830 /* Advance iterator to find next entry */
1823 l = l->next; 1831 l = l->next;
1824 if (l == &res->cgroups->tasks) { 1832 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
1833 if (l == &link->cg->tasks) {
1825 /* We reached the end of this task list - move on to 1834 /* We reached the end of this task list - move on to
1826 * the next cg_cgroup_link */ 1835 * the next cg_cgroup_link */
1827 cgroup_advance_iter(cgrp, it); 1836 cgroup_advance_iter(cgrp, it);
@@ -2015,14 +2024,16 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2015 */ 2024 */
2016static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2025static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2017{ 2026{
2018 int n = 0; 2027 int n = 0, pid;
2019 struct cgroup_iter it; 2028 struct cgroup_iter it;
2020 struct task_struct *tsk; 2029 struct task_struct *tsk;
2021 cgroup_iter_start(cgrp, &it); 2030 cgroup_iter_start(cgrp, &it);
2022 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2031 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2023 if (unlikely(n == npids)) 2032 if (unlikely(n == npids))
2024 break; 2033 break;
2025 pidarray[n++] = task_pid_vnr(tsk); 2034 pid = task_pid_vnr(tsk);
2035 if (pid > 0)
2036 pidarray[n++] = pid;
2026 } 2037 }
2027 cgroup_iter_end(cgrp, &it); 2038 cgroup_iter_end(cgrp, &it);
2028 return n; 2039 return n;
@@ -2054,7 +2065,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2054 2065
2055 ret = 0; 2066 ret = 0;
2056 cgrp = dentry->d_fsdata; 2067 cgrp = dentry->d_fsdata;
2057 rcu_read_lock();
2058 2068
2059 cgroup_iter_start(cgrp, &it); 2069 cgroup_iter_start(cgrp, &it);
2060 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2070 while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2079,7 +2089,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2079 } 2089 }
2080 cgroup_iter_end(cgrp, &it); 2090 cgroup_iter_end(cgrp, &it);
2081 2091
2082 rcu_read_unlock();
2083err: 2092err:
2084 return ret; 2093 return ret;
2085} 2094}
@@ -2326,7 +2335,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2326 struct cgroup *cgrp) 2335 struct cgroup *cgrp)
2327{ 2336{
2328 css->cgroup = cgrp; 2337 css->cgroup = cgrp;
2329 atomic_set(&css->refcnt, 0); 2338 atomic_set(&css->refcnt, 1);
2330 css->flags = 0; 2339 css->flags = 0;
2331 if (cgrp == dummytop) 2340 if (cgrp == dummytop)
2332 set_bit(CSS_ROOT, &css->flags); 2341 set_bit(CSS_ROOT, &css->flags);
@@ -2334,6 +2343,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
2334 cgrp->subsys[ss->subsys_id] = css; 2343 cgrp->subsys[ss->subsys_id] = css;
2335} 2344}
2336 2345
2346static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
2347{
2348 /* We need to take each hierarchy_mutex in a consistent order */
2349 int i;
2350
2351 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2352 struct cgroup_subsys *ss = subsys[i];
2353 if (ss->root == root)
2354 mutex_lock_nested(&ss->hierarchy_mutex, i);
2355 }
2356}
2357
2358static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
2359{
2360 int i;
2361
2362 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2363 struct cgroup_subsys *ss = subsys[i];
2364 if (ss->root == root)
2365 mutex_unlock(&ss->hierarchy_mutex);
2366 }
2367}
2368
2337/* 2369/*
2338 * cgroup_create - create a cgroup 2370 * cgroup_create - create a cgroup
2339 * @parent: cgroup that will be parent of the new cgroup 2371 * @parent: cgroup that will be parent of the new cgroup
@@ -2382,7 +2414,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2382 init_cgroup_css(css, ss, cgrp); 2414 init_cgroup_css(css, ss, cgrp);
2383 } 2415 }
2384 2416
2417 cgroup_lock_hierarchy(root);
2385 list_add(&cgrp->sibling, &cgrp->parent->children); 2418 list_add(&cgrp->sibling, &cgrp->parent->children);
2419 cgroup_unlock_hierarchy(root);
2386 root->number_of_cgroups++; 2420 root->number_of_cgroups++;
2387 2421
2388 err = cgroup_create_dir(cgrp, dentry, mode); 2422 err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2402,7 +2436,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2402 2436
2403 err_remove: 2437 err_remove:
2404 2438
2439 cgroup_lock_hierarchy(root);
2405 list_del(&cgrp->sibling); 2440 list_del(&cgrp->sibling);
2441 cgroup_unlock_hierarchy(root);
2406 root->number_of_cgroups--; 2442 root->number_of_cgroups--;
2407 2443
2408 err_destroy: 2444 err_destroy:
@@ -2433,7 +2469,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2433{ 2469{
2434 /* Check the reference count on each subsystem. Since we 2470 /* Check the reference count on each subsystem. Since we
2435 * already established that there are no tasks in the 2471 * already established that there are no tasks in the
2436 * cgroup, if the css refcount is also 0, then there should 2472 * cgroup, if the css refcount is also 1, then there should
2437 * be no outstanding references, so the subsystem is safe to 2473 * be no outstanding references, so the subsystem is safe to
2438 * destroy. We scan across all subsystems rather than using 2474 * destroy. We scan across all subsystems rather than using
2439 * the per-hierarchy linked list of mounted subsystems since 2475 * the per-hierarchy linked list of mounted subsystems since
@@ -2454,19 +2490,70 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
2454 * matter, since it can only happen if the cgroup 2490 * matter, since it can only happen if the cgroup
2455 * has been deleted and hence no longer needs the 2491 * has been deleted and hence no longer needs the
2456 * release agent to be called anyway. */ 2492 * release agent to be called anyway. */
2457 if (css && atomic_read(&css->refcnt)) 2493 if (css && (atomic_read(&css->refcnt) > 1))
2458 return 1; 2494 return 1;
2459 } 2495 }
2460 return 0; 2496 return 0;
2461} 2497}
2462 2498
2499/*
2500 * Atomically mark all (or else none) of the cgroup's CSS objects as
2501 * CSS_REMOVED. Return true on success, or false if the cgroup has
2502 * busy subsystems. Call with cgroup_mutex held
2503 */
2504
2505static int cgroup_clear_css_refs(struct cgroup *cgrp)
2506{
2507 struct cgroup_subsys *ss;
2508 unsigned long flags;
2509 bool failed = false;
2510 local_irq_save(flags);
2511 for_each_subsys(cgrp->root, ss) {
2512 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2513 int refcnt;
2514 while (1) {
2515 /* We can only remove a CSS with a refcnt==1 */
2516 refcnt = atomic_read(&css->refcnt);
2517 if (refcnt > 1) {
2518 failed = true;
2519 goto done;
2520 }
2521 BUG_ON(!refcnt);
2522 /*
2523 * Drop the refcnt to 0 while we check other
2524 * subsystems. This will cause any racing
2525 * css_tryget() to spin until we set the
2526 * CSS_REMOVED bits or abort
2527 */
2528 if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
2529 break;
2530 cpu_relax();
2531 }
2532 }
2533 done:
2534 for_each_subsys(cgrp->root, ss) {
2535 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
2536 if (failed) {
2537 /*
2538 * Restore old refcnt if we previously managed
2539 * to clear it from 1 to 0
2540 */
2541 if (!atomic_read(&css->refcnt))
2542 atomic_set(&css->refcnt, 1);
2543 } else {
2544 /* Commit the fact that the CSS is removed */
2545 set_bit(CSS_REMOVED, &css->flags);
2546 }
2547 }
2548 local_irq_restore(flags);
2549 return !failed;
2550}
2551
2463static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2552static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2464{ 2553{
2465 struct cgroup *cgrp = dentry->d_fsdata; 2554 struct cgroup *cgrp = dentry->d_fsdata;
2466 struct dentry *d; 2555 struct dentry *d;
2467 struct cgroup *parent; 2556 struct cgroup *parent;
2468 struct super_block *sb;
2469 struct cgroupfs_root *root;
2470 2557
2471 /* the vfs holds both inode->i_mutex already */ 2558 /* the vfs holds both inode->i_mutex already */
2472 2559
@@ -2489,12 +2576,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2489 2576
2490 mutex_lock(&cgroup_mutex); 2577 mutex_lock(&cgroup_mutex);
2491 parent = cgrp->parent; 2578 parent = cgrp->parent;
2492 root = cgrp->root;
2493 sb = root->sb;
2494 2579
2495 if (atomic_read(&cgrp->count) 2580 if (atomic_read(&cgrp->count)
2496 || !list_empty(&cgrp->children) 2581 || !list_empty(&cgrp->children)
2497 || cgroup_has_css_refs(cgrp)) { 2582 || !cgroup_clear_css_refs(cgrp)) {
2498 mutex_unlock(&cgroup_mutex); 2583 mutex_unlock(&cgroup_mutex);
2499 return -EBUSY; 2584 return -EBUSY;
2500 } 2585 }
@@ -2504,8 +2589,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2504 if (!list_empty(&cgrp->release_list)) 2589 if (!list_empty(&cgrp->release_list))
2505 list_del(&cgrp->release_list); 2590 list_del(&cgrp->release_list);
2506 spin_unlock(&release_list_lock); 2591 spin_unlock(&release_list_lock);
2507 /* delete my sibling from parent->children */ 2592
2593 cgroup_lock_hierarchy(cgrp->root);
2594 /* delete this cgroup from parent->children */
2508 list_del(&cgrp->sibling); 2595 list_del(&cgrp->sibling);
2596 cgroup_unlock_hierarchy(cgrp->root);
2597
2509 spin_lock(&cgrp->dentry->d_lock); 2598 spin_lock(&cgrp->dentry->d_lock);
2510 d = dget(cgrp->dentry); 2599 d = dget(cgrp->dentry);
2511 spin_unlock(&d->d_lock); 2600 spin_unlock(&d->d_lock);
@@ -2527,6 +2616,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2527 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2616 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2528 2617
2529 /* Create the top cgroup state for this subsystem */ 2618 /* Create the top cgroup state for this subsystem */
2619 list_add(&ss->sibling, &rootnode.subsys_list);
2530 ss->root = &rootnode; 2620 ss->root = &rootnode;
2531 css = ss->create(ss, dummytop); 2621 css = ss->create(ss, dummytop);
2532 /* We don't handle early failures gracefully */ 2622 /* We don't handle early failures gracefully */
@@ -2540,13 +2630,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2540 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2630 init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2541 2631
2542 need_forkexit_callback |= ss->fork || ss->exit; 2632 need_forkexit_callback |= ss->fork || ss->exit;
2543 need_mm_owner_callback |= !!ss->mm_owner_changed;
2544 2633
2545 /* At system boot, before all subsystems have been 2634 /* At system boot, before all subsystems have been
2546 * registered, no tasks have been forked, so we don't 2635 * registered, no tasks have been forked, so we don't
2547 * need to invoke fork callbacks here. */ 2636 * need to invoke fork callbacks here. */
2548 BUG_ON(!list_empty(&init_task.tasks)); 2637 BUG_ON(!list_empty(&init_task.tasks));
2549 2638
2639 mutex_init(&ss->hierarchy_mutex);
2550 ss->active = 1; 2640 ss->active = 1;
2551} 2641}
2552 2642
@@ -2565,7 +2655,6 @@ int __init cgroup_init_early(void)
2565 INIT_HLIST_NODE(&init_css_set.hlist); 2655 INIT_HLIST_NODE(&init_css_set.hlist);
2566 css_set_count = 1; 2656 css_set_count = 1;
2567 init_cgroup_root(&rootnode); 2657 init_cgroup_root(&rootnode);
2568 list_add(&rootnode.root_list, &roots);
2569 root_count = 1; 2658 root_count = 1;
2570 init_task.cgroups = &init_css_set; 2659 init_task.cgroups = &init_css_set;
2571 2660
@@ -2672,15 +2761,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2672 2761
2673 mutex_lock(&cgroup_mutex); 2762 mutex_lock(&cgroup_mutex);
2674 2763
2675 for_each_root(root) { 2764 for_each_active_root(root) {
2676 struct cgroup_subsys *ss; 2765 struct cgroup_subsys *ss;
2677 struct cgroup *cgrp; 2766 struct cgroup *cgrp;
2678 int subsys_id; 2767 int subsys_id;
2679 int count = 0; 2768 int count = 0;
2680 2769
2681 /* Skip this hierarchy if it has no active subsystems */
2682 if (!root->actual_subsys_bits)
2683 continue;
2684 seq_printf(m, "%lu:", root->subsys_bits); 2770 seq_printf(m, "%lu:", root->subsys_bits);
2685 for_each_subsys(root, ss) 2771 for_each_subsys(root, ss)
2686 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2772 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2790,37 +2876,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
2790 } 2876 }
2791} 2877}
2792 2878
2793#ifdef CONFIG_MM_OWNER
2794/**
2795 * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
2796 * @p: the new owner
2797 *
2798 * Called on every change to mm->owner. mm_init_owner() does not
2799 * invoke this routine, since it assigns the mm->owner the first time
2800 * and does not change it.
2801 *
2802 * The callbacks are invoked with mmap_sem held in read mode.
2803 */
2804void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2805{
2806 struct cgroup *oldcgrp, *newcgrp = NULL;
2807
2808 if (need_mm_owner_callback) {
2809 int i;
2810 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2811 struct cgroup_subsys *ss = subsys[i];
2812 oldcgrp = task_cgroup(old, ss->subsys_id);
2813 if (new)
2814 newcgrp = task_cgroup(new, ss->subsys_id);
2815 if (oldcgrp == newcgrp)
2816 continue;
2817 if (ss->mm_owner_changed)
2818 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2819 }
2820 }
2821}
2822#endif /* CONFIG_MM_OWNER */
2823
2824/** 2879/**
2825 * cgroup_post_fork - called on a new task after adding it to the task list 2880 * cgroup_post_fork - called on a new task after adding it to the task list
2826 * @child: the task in question 2881 * @child: the task in question
@@ -2834,8 +2889,10 @@ void cgroup_post_fork(struct task_struct *child)
2834{ 2889{
2835 if (use_task_css_set_links) { 2890 if (use_task_css_set_links) {
2836 write_lock(&css_set_lock); 2891 write_lock(&css_set_lock);
2892 task_lock(child);
2837 if (list_empty(&child->cg_list)) 2893 if (list_empty(&child->cg_list))
2838 list_add(&child->cg_list, &child->cgroups->tasks); 2894 list_add(&child->cg_list, &child->cgroups->tasks);
2895 task_unlock(child);
2839 write_unlock(&css_set_lock); 2896 write_unlock(&css_set_lock);
2840 } 2897 }
2841} 2898}
@@ -2941,14 +2998,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2941 mutex_unlock(&cgroup_mutex); 2998 mutex_unlock(&cgroup_mutex);
2942 return 0; 2999 return 0;
2943 } 3000 }
2944 cg = tsk->cgroups;
2945 parent = task_cgroup(tsk, subsys->subsys_id);
2946 3001
2947 /* Pin the hierarchy */ 3002 /* Pin the hierarchy */
2948 atomic_inc(&parent->root->sb->s_active); 3003 if (!atomic_inc_not_zero(&root->sb->s_active)) {
3004 /* We race with the final deactivate_super() */
3005 mutex_unlock(&cgroup_mutex);
3006 return 0;
3007 }
2949 3008
2950 /* Keep the cgroup alive */ 3009 /* Keep the cgroup alive */
3010 task_lock(tsk);
3011 parent = task_cgroup(tsk, subsys->subsys_id);
3012 cg = tsk->cgroups;
2951 get_css_set(cg); 3013 get_css_set(cg);
3014 task_unlock(tsk);
3015
2952 mutex_unlock(&cgroup_mutex); 3016 mutex_unlock(&cgroup_mutex);
2953 3017
2954 /* Now do the VFS work to create a cgroup */ 3018 /* Now do the VFS work to create a cgroup */
@@ -2967,7 +3031,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2967 } 3031 }
2968 3032
2969 /* Create the cgroup directory, which also creates the cgroup */ 3033 /* Create the cgroup directory, which also creates the cgroup */
2970 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 3034 ret = vfs_mkdir(inode, dentry, 0755);
2971 child = __d_cgrp(dentry); 3035 child = __d_cgrp(dentry);
2972 dput(dentry); 3036 dput(dentry);
2973 if (ret) { 3037 if (ret) {
@@ -2977,13 +3041,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2977 goto out_release; 3041 goto out_release;
2978 } 3042 }
2979 3043
2980 if (!child) {
2981 printk(KERN_INFO
2982 "Couldn't find new cgroup %s\n", nodename);
2983 ret = -ENOMEM;
2984 goto out_release;
2985 }
2986
2987 /* The cgroup now exists. Retake cgroup_mutex and check 3044 /* The cgroup now exists. Retake cgroup_mutex and check
2988 * that we're still in the same state that we thought we 3045 * that we're still in the same state that we thought we
2989 * were. */ 3046 * were. */
@@ -2994,7 +3051,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2994 mutex_unlock(&inode->i_mutex); 3051 mutex_unlock(&inode->i_mutex);
2995 put_css_set(cg); 3052 put_css_set(cg);
2996 3053
2997 deactivate_super(parent->root->sb); 3054 deactivate_super(root->sb);
2998 /* The cgroup is still accessible in the VFS, but 3055 /* The cgroup is still accessible in the VFS, but
2999 * we're not going to try to rmdir() it at this 3056 * we're not going to try to rmdir() it at this
3000 * point. */ 3057 * point. */
@@ -3020,7 +3077,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
3020 mutex_lock(&cgroup_mutex); 3077 mutex_lock(&cgroup_mutex);
3021 put_css_set(cg); 3078 put_css_set(cg);
3022 mutex_unlock(&cgroup_mutex); 3079 mutex_unlock(&cgroup_mutex);
3023 deactivate_super(parent->root->sb); 3080 deactivate_super(root->sb);
3024 return ret; 3081 return ret;
3025} 3082}
3026 3083
@@ -3079,7 +3136,8 @@ void __css_put(struct cgroup_subsys_state *css)
3079{ 3136{
3080 struct cgroup *cgrp = css->cgroup; 3137 struct cgroup *cgrp = css->cgroup;
3081 rcu_read_lock(); 3138 rcu_read_lock();
3082 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 3139 if ((atomic_dec_return(&css->refcnt) == 1) &&
3140 notify_on_release(cgrp)) {
3083 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3141 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3084 check_for_release(cgrp); 3142 check_for_release(cgrp);
3085 } 3143 }
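The switch in __css_put() from atomic_dec_and_test() to testing atomic_dec_return() == 1 follows from the removal scheme above: a css that is alive but has no external users now rests at refcount 1, with 0 reserved to mean "being removed", so the last external put is the one that lands back on 1 rather than 0. A one-line C11 illustration of that convention (illustrative only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>

/* refcnt rests at 1 when unused; 0 means "being removed". */
static bool put_was_last_external_ref(atomic_int *refcnt)
{
        /* atomic_fetch_sub() returns the value before the decrement,
         * so the new value is 1 exactly when the old value was 2. */
        return atomic_fetch_sub(refcnt, 1) == 2;
}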
diff --git a/kernel/compat.c b/kernel/compat.c
index 8eafe3eb50d9..42d56544460f 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -24,6 +24,7 @@
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h>
27 28
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29 30
@@ -229,6 +230,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
229 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 230 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
230 return -EFAULT; 231 return -EFAULT;
231 } 232 }
233 force_successful_syscall_return();
232 return compat_jiffies_to_clock_t(jiffies); 234 return compat_jiffies_to_clock_t(jiffies);
233} 235}
234 236
@@ -454,16 +456,16 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
454} 456}
455 457
456static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, 458static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
457 unsigned len, cpumask_t *new_mask) 459 unsigned len, struct cpumask *new_mask)
458{ 460{
459 unsigned long *k; 461 unsigned long *k;
460 462
461 if (len < sizeof(cpumask_t)) 463 if (len < cpumask_size())
462 memset(new_mask, 0, sizeof(cpumask_t)); 464 memset(new_mask, 0, cpumask_size());
463 else if (len > sizeof(cpumask_t)) 465 else if (len > cpumask_size())
464 len = sizeof(cpumask_t); 466 len = cpumask_size();
465 467
466 k = cpus_addr(*new_mask); 468 k = cpumask_bits(new_mask);
467 return compat_get_bitmap(k, user_mask_ptr, len * 8); 469 return compat_get_bitmap(k, user_mask_ptr, len * 8);
468} 470}
469 471
@@ -471,40 +473,51 @@ asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid,
471 unsigned int len, 473 unsigned int len,
472 compat_ulong_t __user *user_mask_ptr) 474 compat_ulong_t __user *user_mask_ptr)
473{ 475{
474 cpumask_t new_mask; 476 cpumask_var_t new_mask;
475 int retval; 477 int retval;
476 478
477 retval = compat_get_user_cpu_mask(user_mask_ptr, len, &new_mask); 479 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
480 return -ENOMEM;
481
482 retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask);
478 if (retval) 483 if (retval)
479 return retval; 484 goto out;
480 485
481 return sched_setaffinity(pid, &new_mask); 486 retval = sched_setaffinity(pid, new_mask);
487out:
488 free_cpumask_var(new_mask);
489 return retval;
482} 490}
483 491
484asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, 492asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
485 compat_ulong_t __user *user_mask_ptr) 493 compat_ulong_t __user *user_mask_ptr)
486{ 494{
487 int ret; 495 int ret;
488 cpumask_t mask; 496 cpumask_var_t mask;
489 unsigned long *k; 497 unsigned long *k;
490 unsigned int min_length = sizeof(cpumask_t); 498 unsigned int min_length = cpumask_size();
491 499
492 if (NR_CPUS <= BITS_PER_COMPAT_LONG) 500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
493 min_length = sizeof(compat_ulong_t); 501 min_length = sizeof(compat_ulong_t);
494 502
495 if (len < min_length) 503 if (len < min_length)
496 return -EINVAL; 504 return -EINVAL;
497 505
498 ret = sched_getaffinity(pid, &mask); 506 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM;
508
509 ret = sched_getaffinity(pid, mask);
499 if (ret < 0) 510 if (ret < 0)
500 return ret; 511 goto out;
501 512
502 k = cpus_addr(mask); 513 k = cpumask_bits(mask);
503 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
504 if (ret) 515 if (ret == 0)
505 return ret; 516 ret = min_length;
506 517
507 return min_length; 518out:
519 free_cpumask_var(mask);
520 return ret;
508} 521}
509 522
510int get_compat_itimerspec(struct itimerspec *dst, 523int get_compat_itimerspec(struct itimerspec *dst,
@@ -883,8 +896,9 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc)
883 896
884 if (tloc) { 897 if (tloc) {
885 if (put_user(i,tloc)) 898 if (put_user(i,tloc))
886 i = -EFAULT; 899 return -EFAULT;
887 } 900 }
901 force_successful_syscall_return();
888 return i; 902 return i;
889} 903}
890 904
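Both affinity wrappers above move from a stack-allocated cpumask_t to the allocate/use/free cpumask_var_t pattern, which is what keeps stack usage bounded once NR_CPUS grows large. A kernel-style sketch of the shape, using only real cpumask APIs; the function itself is a placeholder, not a kernel symbol:

static long example_use_a_mask(void)
{
        cpumask_var_t mask;
        long ret;

        if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                return -ENOMEM;

        cpumask_copy(mask, cpu_online_mask);    /* fill it in */
        ret = cpumask_weight(mask);             /* ... use it ... */

        free_cpumask_var(mask);                 /* freed on every exit path */
        return ret;
}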
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ea32e8d68b0..79e40f00dcb8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,29 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* 18#ifdef CONFIG_SMP
19 * Represents all cpu's present in the system 19/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20 * In systems capable of hotplug, this map could dynamically grow
21 * as new cpu's are detected in the system via any platform specific
22 * method, such as ACPI for e.g.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
30 * Represents all cpu's that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
40/* Serializes the updates to cpu_online_map, cpu_present_map */
41static DEFINE_MUTEX(cpu_add_remove_lock); 20static DEFINE_MUTEX(cpu_add_remove_lock);
42 21
43static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
@@ -64,8 +43,6 @@ void __init cpu_hotplug_init(void)
64 cpu_hotplug.refcount = 0; 43 cpu_hotplug.refcount = 0;
65} 44}
66 45
67cpumask_t cpu_active_map;
68
69#ifdef CONFIG_HOTPLUG_CPU 46#ifdef CONFIG_HOTPLUG_CPU
70 47
71void get_online_cpus(void) 48void get_online_cpus(void)
@@ -96,7 +73,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus);
96 73
97/* 74/*
98 * The following two API's must be used when attempting 75 * The following two API's must be used when attempting
99 * to serialize the updates to cpu_online_map, cpu_present_map. 76 * to serialize the updates to cpu_online_mask, cpu_present_mask.
100 */ 77 */
101void cpu_maps_update_begin(void) 78void cpu_maps_update_begin(void)
102{ 79{
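The comment above is the contract for the rest of this file: every update of the global CPU masks happens inside cpu_maps_update_begin()/cpu_maps_update_done(). Roughly, using the accessors this patch adds at the bottom of cpu.c (the wrapper function is hypothetical):

static void example_mark_cpu_present(unsigned int cpu)
{
        cpu_maps_update_begin();
        set_cpu_present(cpu, true);     /* serialized against other updaters */
        cpu_maps_update_done();
}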
@@ -217,7 +194,7 @@ static int __ref take_cpu_down(void *_param)
217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 194static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
218{ 195{
219 int err, nr_calls = 0; 196 int err, nr_calls = 0;
220 cpumask_t old_allowed, tmp; 197 cpumask_var_t old_allowed;
221 void *hcpu = (void *)(long)cpu; 198 void *hcpu = (void *)(long)cpu;
222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 199 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
223 struct take_cpu_down_param tcd_param = { 200 struct take_cpu_down_param tcd_param = {
@@ -231,6 +208,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
231 if (!cpu_online(cpu)) 208 if (!cpu_online(cpu))
232 return -EINVAL; 209 return -EINVAL;
233 210
211 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
212 return -ENOMEM;
213
234 cpu_hotplug_begin(); 214 cpu_hotplug_begin();
235 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
236 hcpu, -1, &nr_calls); 216 hcpu, -1, &nr_calls);
@@ -245,13 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
245 } 225 }
246 226
247 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
248 old_allowed = current->cpus_allowed; 228 cpumask_copy(old_allowed, &current->cpus_allowed);
249 cpus_setall(tmp); 229 set_cpus_allowed_ptr(current,
250 cpu_clear(cpu, tmp); 230 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
253 231
254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp); 232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
255 if (err) { 233 if (err) {
256 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
@@ -277,7 +255,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
277 check_for_tasks(cpu); 255 check_for_tasks(cpu);
278 256
279out_allowed: 257out_allowed:
280 set_cpus_allowed_ptr(current, &old_allowed); 258 set_cpus_allowed_ptr(current, old_allowed);
281out_release: 259out_release:
282 cpu_hotplug_done(); 260 cpu_hotplug_done();
283 if (!err) { 261 if (!err) {
@@ -285,13 +263,17 @@ out_release:
285 hcpu) == NOTIFY_BAD) 263 hcpu) == NOTIFY_BAD)
286 BUG(); 264 BUG();
287 } 265 }
266 free_cpumask_var(old_allowed);
288 return err; 267 return err;
289} 268}
290 269
291int __ref cpu_down(unsigned int cpu) 270int __ref cpu_down(unsigned int cpu)
292{ 271{
293 int err = 0; 272 int err;
294 273
274 err = stop_machine_create();
275 if (err)
276 return err;
295 cpu_maps_update_begin(); 277 cpu_maps_update_begin();
296 278
297 if (cpu_hotplug_disabled) { 279 if (cpu_hotplug_disabled) {
@@ -303,7 +285,7 @@ int __ref cpu_down(unsigned int cpu)
303 285
304 /* 286 /*
305 * Make sure all cpus did the reschedule and are not 287 * Make sure all cpus did the reschedule and are not
306 * using stale version of the cpu_active_map. 288 * using stale version of the cpu_active_mask.
307 * This is not strictly necessary because stop_machine() 289 * This is not strictly necessary because stop_machine()
308 * that we run down the line already provides the required 290 * that we run down the line already provides the required
309 * synchronization. But it's really a side effect and we do not 291 * synchronization. But it's really a side effect and we do not
@@ -318,6 +300,7 @@ int __ref cpu_down(unsigned int cpu)
318 300
319out: 301out:
320 cpu_maps_update_done(); 302 cpu_maps_update_done();
303 stop_machine_destroy();
321 return err; 304 return err;
322} 305}
323EXPORT_SYMBOL(cpu_down); 306EXPORT_SYMBOL(cpu_down);
@@ -367,7 +350,7 @@ out_notify:
367int __cpuinit cpu_up(unsigned int cpu) 350int __cpuinit cpu_up(unsigned int cpu)
368{ 351{
369 int err = 0; 352 int err = 0;
370 if (!cpu_isset(cpu, cpu_possible_map)) { 353 if (!cpu_possible(cpu)) {
371 printk(KERN_ERR "can't online cpu %d because it is not " 354 printk(KERN_ERR "can't online cpu %d because it is not "
372 "configured as may-hotadd at boot time\n", cpu); 355 "configured as may-hotadd at boot time\n", cpu);
373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) 356#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
@@ -392,25 +375,28 @@ out:
392} 375}
393 376
394#ifdef CONFIG_PM_SLEEP_SMP 377#ifdef CONFIG_PM_SLEEP_SMP
395static cpumask_t frozen_cpus; 378static cpumask_var_t frozen_cpus;
396 379
397int disable_nonboot_cpus(void) 380int disable_nonboot_cpus(void)
398{ 381{
399 int cpu, first_cpu, error = 0; 382 int cpu, first_cpu, error;
400 383
384 error = stop_machine_create();
385 if (error)
386 return error;
401 cpu_maps_update_begin(); 387 cpu_maps_update_begin();
402 first_cpu = first_cpu(cpu_online_map); 388 first_cpu = cpumask_first(cpu_online_mask);
403 /* We take down all of the non-boot CPUs in one shot to avoid races 389 /* We take down all of the non-boot CPUs in one shot to avoid races
404 * with the userspace trying to use the CPU hotplug at the same time 390 * with the userspace trying to use the CPU hotplug at the same time
405 */ 391 */
406 cpus_clear(frozen_cpus); 392 cpumask_clear(frozen_cpus);
407 printk("Disabling non-boot CPUs ...\n"); 393 printk("Disabling non-boot CPUs ...\n");
408 for_each_online_cpu(cpu) { 394 for_each_online_cpu(cpu) {
409 if (cpu == first_cpu) 395 if (cpu == first_cpu)
410 continue; 396 continue;
411 error = _cpu_down(cpu, 1); 397 error = _cpu_down(cpu, 1);
412 if (!error) { 398 if (!error) {
413 cpu_set(cpu, frozen_cpus); 399 cpumask_set_cpu(cpu, frozen_cpus);
414 printk("CPU%d is down\n", cpu); 400 printk("CPU%d is down\n", cpu);
415 } else { 401 } else {
416 printk(KERN_ERR "Error taking CPU%d down: %d\n", 402 printk(KERN_ERR "Error taking CPU%d down: %d\n",
@@ -426,6 +412,7 @@ int disable_nonboot_cpus(void)
426 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 412 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
427 } 413 }
428 cpu_maps_update_done(); 414 cpu_maps_update_done();
415 stop_machine_destroy();
429 return error; 416 return error;
430} 417}
431 418
@@ -436,11 +423,11 @@ void __ref enable_nonboot_cpus(void)
436 /* Allow everyone to use the CPU hotplug again */ 423 /* Allow everyone to use the CPU hotplug again */
437 cpu_maps_update_begin(); 424 cpu_maps_update_begin();
438 cpu_hotplug_disabled = 0; 425 cpu_hotplug_disabled = 0;
439 if (cpus_empty(frozen_cpus)) 426 if (cpumask_empty(frozen_cpus))
440 goto out; 427 goto out;
441 428
442 printk("Enabling non-boot CPUs ...\n"); 429 printk("Enabling non-boot CPUs ...\n");
443 for_each_cpu_mask_nr(cpu, frozen_cpus) { 430 for_each_cpu(cpu, frozen_cpus) {
444 error = _cpu_up(cpu, 1); 431 error = _cpu_up(cpu, 1);
445 if (!error) { 432 if (!error) {
446 printk("CPU%d is up\n", cpu); 433 printk("CPU%d is up\n", cpu);
@@ -448,10 +435,18 @@ void __ref enable_nonboot_cpus(void)
448 } 435 }
449 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 436 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
450 } 437 }
451 cpus_clear(frozen_cpus); 438 cpumask_clear(frozen_cpus);
452out: 439out:
453 cpu_maps_update_done(); 440 cpu_maps_update_done();
454} 441}
442
443static int alloc_frozen_cpus(void)
444{
445 if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
446 return -ENOMEM;
447 return 0;
448}
449core_initcall(alloc_frozen_cpus);
455#endif /* CONFIG_PM_SLEEP_SMP */ 450#endif /* CONFIG_PM_SLEEP_SMP */
456 451
457/** 452/**
@@ -467,7 +462,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
467 unsigned long val = CPU_STARTING; 462 unsigned long val = CPU_STARTING;
468 463
469#ifdef CONFIG_PM_SLEEP_SMP 464#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus)) 465 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN; 466 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */ 467#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 468 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
@@ -479,7 +474,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that 474 * cpu_bit_bitmap[] is a special, "compressed" data structure that
480 * represents all NR_CPUS bits binary values of 1<<nr. 475 * represents all NR_CPUS bits binary values of 1<<nr.
481 * 476 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU 477 * It is used by cpumask_of() to get a constant address to a CPU
483 * mask value that has a single bit set only. 478 * mask value that has a single bit set only.
484 */ 479 */
485 480
@@ -502,3 +497,71 @@ EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
502 497
503const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; 498const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
504EXPORT_SYMBOL(cpu_all_bits); 499EXPORT_SYMBOL(cpu_all_bits);
500
501#ifdef CONFIG_INIT_ALL_POSSIBLE
502static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
503 = CPU_BITS_ALL;
504#else
505static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
506#endif
507const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
508EXPORT_SYMBOL(cpu_possible_mask);
509
510static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
511const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
512EXPORT_SYMBOL(cpu_online_mask);
513
514static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
515const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
516EXPORT_SYMBOL(cpu_present_mask);
517
518static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
519const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
520EXPORT_SYMBOL(cpu_active_mask);
521
522void set_cpu_possible(unsigned int cpu, bool possible)
523{
524 if (possible)
525 cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
526 else
527 cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
528}
529
530void set_cpu_present(unsigned int cpu, bool present)
531{
532 if (present)
533 cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
534 else
535 cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
536}
537
538void set_cpu_online(unsigned int cpu, bool online)
539{
540 if (online)
541 cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
542 else
543 cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
544}
545
546void set_cpu_active(unsigned int cpu, bool active)
547{
548 if (active)
549 cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
550 else
551 cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
552}
553
554void init_cpu_present(const struct cpumask *src)
555{
556 cpumask_copy(to_cpumask(cpu_present_bits), src);
557}
558
559void init_cpu_possible(const struct cpumask *src)
560{
561 cpumask_copy(to_cpumask(cpu_possible_bits), src);
562}
563
564void init_cpu_online(const struct cpumask *src)
565{
566 cpumask_copy(to_cpumask(cpu_online_bits), src);
567}
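The block above completes the conversion for cpu.c: the possible/online/present/active masks are now private bitmaps exported only as const struct cpumask pointers, and architecture code is expected to go through the set_cpu_*() and init_cpu_*() accessors instead of writing the old cpumask_t globals directly. A short usage sketch (the bring-up function is hypothetical; the accessors are the ones defined above):

static void example_bringup(unsigned int cpu)
{
        set_cpu_present(cpu, true);

        /* ... architecture-specific boot of the CPU ... */

        set_cpu_online(cpu, true);
        set_cpu_active(cpu, true);
}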
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 96c0ba13b8cd..f76db9dcaa05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62 62
63/* 63/*
64 * Workqueue for cpuset related tasks.
65 *
66 * Using kevent workqueue may cause deadlock when memory_migrate
67 * is set. So we create a separate workqueue thread for cpuset.
68 */
69static struct workqueue_struct *cpuset_wq;
70
71/*
64 * Tracks how many cpusets are currently defined in system. 72 * Tracks how many cpusets are currently defined in system.
65 * When there is only one cpuset (the root cpuset) we can 73 * When there is only one cpuset (the root cpuset) we can
66 * short circuit some hooks. 74 * short circuit some hooks.
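The dedicated workqueue introduced here keeps cpuset work off the shared keventd thread, where a flush triggered with memory_migrate set could deadlock. The wiring used later in this patch (create_singlethread_workqueue() in cpuset_init_smp(), queue_work() in async_rebuild_sched_domains()) looks roughly like this sketch with a placeholder work function:

static struct workqueue_struct *example_wq;

static void example_fn(struct work_struct *unused)
{
        /* heavyweight work that must not run on the shared keventd thread */
}
static DECLARE_WORK(example_work, example_fn);

static void kick_example_work(void)
{
        /* queue on the private workqueue instead of schedule_work() */
        queue_work(example_wq, &example_work);
}

static int __init example_init(void)
{
        example_wq = create_singlethread_workqueue("example");
        return example_wq ? 0 : -ENOMEM;
}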
@@ -84,7 +92,7 @@ struct cpuset {
84 struct cgroup_subsys_state css; 92 struct cgroup_subsys_state css;
85 93
86 unsigned long flags; /* "unsigned long" so bitops work */ 94 unsigned long flags; /* "unsigned long" so bitops work */
87 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 95 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 96 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 97
90 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
@@ -195,8 +203,6 @@ static int cpuset_mems_generation;
195 203
196static struct cpuset top_cpuset = { 204static struct cpuset top_cpuset = {
197 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 205 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
198 .cpus_allowed = CPU_MASK_ALL,
199 .mems_allowed = NODE_MASK_ALL,
200}; 206};
201 207
202/* 208/*
@@ -240,6 +246,17 @@ static struct cpuset top_cpuset = {
240static DEFINE_MUTEX(callback_mutex); 246static DEFINE_MUTEX(callback_mutex);
241 247
242/* 248/*
249 * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
250 * buffers. They are statically allocated to prevent using excess stack
251 * when calling cpuset_print_task_mems_allowed().
252 */
253#define CPUSET_NAME_LEN (128)
254#define CPUSET_NODELIST_LEN (256)
255static char cpuset_name[CPUSET_NAME_LEN];
256static char cpuset_nodelist[CPUSET_NODELIST_LEN];
257static DEFINE_SPINLOCK(cpuset_buffer_lock);
258
259/*
243 * This is ugly, but preserves the userspace API for existing cpuset 260 * This is ugly, but preserves the userspace API for existing cpuset
244 * users. If someone tries to mount the "cpuset" filesystem, we 261 * users. If someone tries to mount the "cpuset" filesystem, we
245 * silently switch it to mount "cgroup" instead 262 * silently switch it to mount "cgroup" instead
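The static cpuset_name/cpuset_nodelist buffers plus cpuset_buffer_lock trade a few hundred bytes of global data for not consuming that much stack in cpuset_print_task_mems_allowed(), at the cost of serializing callers. The shape of the pattern, as a small runnable userspace sketch with pthreads (all names illustrative):

#include <pthread.h>
#include <stdio.h>

#define NAME_LEN 128

static char name_buf[NAME_LEN];
static pthread_mutex_t buf_lock = PTHREAD_MUTEX_INITIALIZER;

/* Format into one shared static buffer instead of a per-caller stack
 * buffer; the lock keeps concurrent callers from interleaving. */
static void print_named_event(const char *name, int value)
{
        pthread_mutex_lock(&buf_lock);
        snprintf(name_buf, sizeof(name_buf), "%s=%d", name, value);
        printf("%s\n", name_buf);
        pthread_mutex_unlock(&buf_lock);
}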
@@ -267,7 +284,7 @@ static struct file_system_type cpuset_fs_type = {
267}; 284};
268 285
269/* 286/*
270 * Return in *pmask the portion of a cpuset's cpus_allowed that 287 * Return in pmask the portion of a cpuset's cpus_allowed that
271 * are online. If none are online, walk up the cpuset hierarchy 288 * are online. If none are online, walk up the cpuset hierarchy
272 * until we find one that does have some online cpus. If we get 289 * until we find one that does have some online cpus. If we get
273 * all the way to the top and still haven't found any online cpus, 290 * all the way to the top and still haven't found any online cpus,
@@ -280,15 +297,16 @@ static struct file_system_type cpuset_fs_type = {
280 * Call with callback_mutex held. 297 * Call with callback_mutex held.
281 */ 298 */
282 299
283static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 300static void guarantee_online_cpus(const struct cpuset *cs,
301 struct cpumask *pmask)
284{ 302{
285 while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map)) 303 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
286 cs = cs->parent; 304 cs = cs->parent;
287 if (cs) 305 if (cs)
288 cpus_and(*pmask, cs->cpus_allowed, cpu_online_map); 306 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
289 else 307 else
290 *pmask = cpu_online_map; 308 cpumask_copy(pmask, cpu_online_mask);
291 BUG_ON(!cpus_intersects(*pmask, cpu_online_map)); 309 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
292} 310}
293 311
294/* 312/*
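guarantee_online_cpus() as documented above is a fallback-through-ancestors loop: use this cpuset's online CPUs, else the nearest ancestor that has some, else everything online. The same shape in a generic, self-contained C sketch (toy types, not the kernel's):

#include <stddef.h>

struct node {
        struct node *parent;
        unsigned long allowed;          /* toy bitmask standing in for a cpumask */
};

/* Walk toward the root until some node intersects the online mask;
 * fall back to the full online mask if none does. */
static unsigned long effective_allowed(const struct node *n, unsigned long online)
{
        while (n && !(n->allowed & online))
                n = n->parent;
        return n ? (n->allowed & online) : online;
}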
@@ -364,14 +382,9 @@ void cpuset_update_task_memory_state(void)
364 struct task_struct *tsk = current; 382 struct task_struct *tsk = current;
365 struct cpuset *cs; 383 struct cpuset *cs;
366 384
367 if (task_cs(tsk) == &top_cpuset) { 385 rcu_read_lock();
368 /* Don't need rcu for top_cpuset. It's never freed. */ 386 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 387 rcu_read_unlock();
370 } else {
371 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock();
374 }
375 388
376 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { 389 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
377 mutex_lock(&callback_mutex); 390 mutex_lock(&callback_mutex);
@@ -403,12 +416,43 @@ void cpuset_update_task_memory_state(void)
403 416
404static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 417static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
405{ 418{
406 return cpus_subset(p->cpus_allowed, q->cpus_allowed) && 419 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
407 nodes_subset(p->mems_allowed, q->mems_allowed) && 420 nodes_subset(p->mems_allowed, q->mems_allowed) &&
408 is_cpu_exclusive(p) <= is_cpu_exclusive(q) && 421 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
409 is_mem_exclusive(p) <= is_mem_exclusive(q); 422 is_mem_exclusive(p) <= is_mem_exclusive(q);
410} 423}
411 424
425/**
426 * alloc_trial_cpuset - allocate a trial cpuset
427 * @cs: the cpuset that the trial cpuset duplicates
428 */
429static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
430{
431 struct cpuset *trial;
432
433 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
434 if (!trial)
435 return NULL;
436
437 if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
438 kfree(trial);
439 return NULL;
440 }
441 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
442
443 return trial;
444}
445
446/**
447 * free_trial_cpuset - free the trial cpuset
448 * @trial: the trial cpuset to be freed
449 */
450static void free_trial_cpuset(struct cpuset *trial)
451{
452 free_cpumask_var(trial->cpus_allowed);
453 kfree(trial);
454}
455
412/* 456/*
413 * validate_change() - Used to validate that any proposed cpuset change 457 * validate_change() - Used to validate that any proposed cpuset change
414 * follows the structural rules for cpusets. 458 * follows the structural rules for cpusets.
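alloc_trial_cpuset()/free_trial_cpuset() formalize a copy-validate-commit update: duplicate the cpuset (the embedded cpumask needs its own allocation, since kmemdup() would only copy the pointer), apply the requested change to the copy, validate it, and only then copy the result back under callback_mutex. Condensed from the update paths later in this patch, roughly:

static int example_update(struct cpuset *cs, const char *buf)
{
        struct cpuset *trialcs;
        int retval;

        trialcs = alloc_trial_cpuset(cs);
        if (!trialcs)
                return -ENOMEM;

        retval = cpulist_parse(buf, trialcs->cpus_allowed);
        if (retval < 0)
                goto out;

        retval = validate_change(cs, trialcs);
        if (retval < 0)
                goto out;

        mutex_lock(&callback_mutex);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
        mutex_unlock(&callback_mutex);
out:
        free_trial_cpuset(trialcs);
        return retval;
}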
@@ -458,7 +502,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
458 c = cgroup_cs(cont); 502 c = cgroup_cs(cont);
459 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 503 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
460 c != cur && 504 c != cur &&
461 cpus_intersects(trial->cpus_allowed, c->cpus_allowed)) 505 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
462 return -EINVAL; 506 return -EINVAL;
463 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 507 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
464 c != cur && 508 c != cur &&
@@ -468,7 +512,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
468 512
469 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ 513 /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
470 if (cgroup_task_count(cur->css.cgroup)) { 514 if (cgroup_task_count(cur->css.cgroup)) {
471 if (cpus_empty(trial->cpus_allowed) || 515 if (cpumask_empty(trial->cpus_allowed) ||
472 nodes_empty(trial->mems_allowed)) { 516 nodes_empty(trial->mems_allowed)) {
473 return -ENOSPC; 517 return -ENOSPC;
474 } 518 }
@@ -483,7 +527,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
483 */ 527 */
484static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 528static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
485{ 529{
486 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 530 return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
487} 531}
488 532
489static void 533static void
@@ -508,7 +552,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
508 cp = list_first_entry(&q, struct cpuset, stack_list); 552 cp = list_first_entry(&q, struct cpuset, stack_list);
509 list_del(q.next); 553 list_del(q.next);
510 554
511 if (cpus_empty(cp->cpus_allowed)) 555 if (cpumask_empty(cp->cpus_allowed))
512 continue; 556 continue;
513 557
514 if (is_sched_load_balance(cp)) 558 if (is_sched_load_balance(cp))
@@ -532,7 +576,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
532 * load balancing domains (sched domains) as specified by that partial 576 * load balancing domains (sched domains) as specified by that partial
533 * partition. 577 * partition.
534 * 578 *
535 * See "What is sched_load_balance" in Documentation/cpusets.txt 579 * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
536 * for a background explanation of this. 580 * for a background explanation of this.
537 * 581 *
538 * Does not return errors, on the theory that the callers of this 582 * Does not return errors, on the theory that the callers of this
@@ -575,7 +619,8 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
575 * element of the partition (one sched domain) to be passed to 619 * element of the partition (one sched domain) to be passed to
576 * partition_sched_domains(). 620 * partition_sched_domains().
577 */ 621 */
578static int generate_sched_domains(cpumask_t **domains, 622/* FIXME: see the FIXME in partition_sched_domains() */
623static int generate_sched_domains(struct cpumask **domains,
579 struct sched_domain_attr **attributes) 624 struct sched_domain_attr **attributes)
580{ 625{
581 LIST_HEAD(q); /* queue of cpusets to be scanned */ 626 LIST_HEAD(q); /* queue of cpusets to be scanned */
@@ -583,10 +628,10 @@ static int generate_sched_domains(cpumask_t **domains,
583 struct cpuset **csa; /* array of all cpuset ptrs */ 628 struct cpuset **csa; /* array of all cpuset ptrs */
584 int csn; /* how many cpuset ptrs in csa so far */ 629 int csn; /* how many cpuset ptrs in csa so far */
585 int i, j, k; /* indices for partition finding loops */ 630 int i, j, k; /* indices for partition finding loops */
586 cpumask_t *doms; /* resulting partition; i.e. sched domains */ 631 struct cpumask *doms; /* resulting partition; i.e. sched domains */
587 struct sched_domain_attr *dattr; /* attributes for custom domains */ 632 struct sched_domain_attr *dattr; /* attributes for custom domains */
588 int ndoms = 0; /* number of sched domains in result */ 633 int ndoms = 0; /* number of sched domains in result */
589 int nslot; /* next empty doms[] cpumask_t slot */ 634 int nslot; /* next empty doms[] struct cpumask slot */
590 635
591 doms = NULL; 636 doms = NULL;
592 dattr = NULL; 637 dattr = NULL;
@@ -594,7 +639,7 @@ static int generate_sched_domains(cpumask_t **domains,
594 639
595 /* Special case for the 99% of systems with one, full, sched domain */ 640 /* Special case for the 99% of systems with one, full, sched domain */
596 if (is_sched_load_balance(&top_cpuset)) { 641 if (is_sched_load_balance(&top_cpuset)) {
597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 642 doms = kmalloc(cpumask_size(), GFP_KERNEL);
598 if (!doms) 643 if (!doms)
599 goto done; 644 goto done;
600 645
@@ -603,7 +648,7 @@ static int generate_sched_domains(cpumask_t **domains,
603 *dattr = SD_ATTR_INIT; 648 *dattr = SD_ATTR_INIT;
604 update_domain_attr_tree(dattr, &top_cpuset); 649 update_domain_attr_tree(dattr, &top_cpuset);
605 } 650 }
606 *doms = top_cpuset.cpus_allowed; 651 cpumask_copy(doms, top_cpuset.cpus_allowed);
607 652
608 ndoms = 1; 653 ndoms = 1;
609 goto done; 654 goto done;
@@ -622,7 +667,7 @@ static int generate_sched_domains(cpumask_t **domains,
622 cp = list_first_entry(&q, struct cpuset, stack_list); 667 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next); 668 list_del(q.next);
624 669
625 if (cpus_empty(cp->cpus_allowed)) 670 if (cpumask_empty(cp->cpus_allowed))
626 continue; 671 continue;
627 672
628 /* 673 /*
@@ -673,7 +718,7 @@ restart:
673 * Now we know how many domains to create. 718 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. 719 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */ 720 */
676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 721 doms = kmalloc(ndoms * cpumask_size(), GFP_KERNEL);
677 if (!doms) 722 if (!doms)
678 goto done; 723 goto done;
679 724
@@ -685,7 +730,7 @@ restart:
685 730
686 for (nslot = 0, i = 0; i < csn; i++) { 731 for (nslot = 0, i = 0; i < csn; i++) {
687 struct cpuset *a = csa[i]; 732 struct cpuset *a = csa[i];
688 cpumask_t *dp; 733 struct cpumask *dp;
689 int apn = a->pn; 734 int apn = a->pn;
690 735
691 if (apn < 0) { 736 if (apn < 0) {
@@ -708,14 +753,14 @@ restart:
708 continue; 753 continue;
709 } 754 }
710 755
711 cpus_clear(*dp); 756 cpumask_clear(dp);
712 if (dattr) 757 if (dattr)
713 *(dattr + nslot) = SD_ATTR_INIT; 758 *(dattr + nslot) = SD_ATTR_INIT;
714 for (j = i; j < csn; j++) { 759 for (j = i; j < csn; j++) {
715 struct cpuset *b = csa[j]; 760 struct cpuset *b = csa[j];
716 761
717 if (apn == b->pn) { 762 if (apn == b->pn) {
718 cpus_or(*dp, *dp, b->cpus_allowed); 763 cpumask_or(dp, dp, b->cpus_allowed);
719 if (dattr) 764 if (dattr)
720 update_domain_attr_tree(dattr + nslot, b); 765 update_domain_attr_tree(dattr + nslot, b);
721 766
@@ -755,7 +800,7 @@ done:
755static void do_rebuild_sched_domains(struct work_struct *unused) 800static void do_rebuild_sched_domains(struct work_struct *unused)
756{ 801{
757 struct sched_domain_attr *attr; 802 struct sched_domain_attr *attr;
758 cpumask_t *doms; 803 struct cpumask *doms;
759 int ndoms; 804 int ndoms;
760 805
761 get_online_cpus(); 806 get_online_cpus();
@@ -794,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
794 */ 839 */
795static void async_rebuild_sched_domains(void) 840static void async_rebuild_sched_domains(void)
796{ 841{
797 schedule_work(&rebuild_sched_domains_work); 842 queue_work(cpuset_wq, &rebuild_sched_domains_work);
798} 843}
799 844
800/* 845/*
@@ -824,7 +869,7 @@ void rebuild_sched_domains(void)
824static int cpuset_test_cpumask(struct task_struct *tsk, 869static int cpuset_test_cpumask(struct task_struct *tsk,
825 struct cgroup_scanner *scan) 870 struct cgroup_scanner *scan)
826{ 871{
827 return !cpus_equal(tsk->cpus_allowed, 872 return !cpumask_equal(&tsk->cpus_allowed,
828 (cgroup_cs(scan->cg))->cpus_allowed); 873 (cgroup_cs(scan->cg))->cpus_allowed);
829} 874}
830 875
@@ -842,7 +887,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
842static void cpuset_change_cpumask(struct task_struct *tsk, 887static void cpuset_change_cpumask(struct task_struct *tsk,
843 struct cgroup_scanner *scan) 888 struct cgroup_scanner *scan)
844{ 889{
845 set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed)); 890 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
846} 891}
847 892
848/** 893/**
@@ -874,10 +919,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
874 * @cs: the cpuset to consider 919 * @cs: the cpuset to consider
875 * @buf: buffer of cpu numbers written to this cpuset 920 * @buf: buffer of cpu numbers written to this cpuset
876 */ 921 */
877static int update_cpumask(struct cpuset *cs, const char *buf) 922static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
923 const char *buf)
878{ 924{
879 struct ptr_heap heap; 925 struct ptr_heap heap;
880 struct cpuset trialcs;
881 int retval; 926 int retval;
882 int is_load_balanced; 927 int is_load_balanced;
883 928
@@ -885,8 +930,6 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
885 if (cs == &top_cpuset) 930 if (cs == &top_cpuset)
886 return -EACCES; 931 return -EACCES;
887 932
888 trialcs = *cs;
889
890 /* 933 /*
891 * An empty cpus_allowed is ok only if the cpuset has no tasks. 934 * An empty cpus_allowed is ok only if the cpuset has no tasks.
892 * Since cpulist_parse() fails on an empty mask, we special case 935 * Since cpulist_parse() fails on an empty mask, we special case
@@ -894,31 +937,31 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
894 * with tasks have cpus. 937 * with tasks have cpus.
895 */ 938 */
896 if (!*buf) { 939 if (!*buf) {
897 cpus_clear(trialcs.cpus_allowed); 940 cpumask_clear(trialcs->cpus_allowed);
898 } else { 941 } else {
899 retval = cpulist_parse(buf, trialcs.cpus_allowed); 942 retval = cpulist_parse(buf, trialcs->cpus_allowed);
900 if (retval < 0) 943 if (retval < 0)
901 return retval; 944 return retval;
902 945
903 if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map)) 946 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
904 return -EINVAL; 947 return -EINVAL;
905 } 948 }
906 retval = validate_change(cs, &trialcs); 949 retval = validate_change(cs, trialcs);
907 if (retval < 0) 950 if (retval < 0)
908 return retval; 951 return retval;
909 952
910 /* Nothing to do if the cpus didn't change */ 953 /* Nothing to do if the cpus didn't change */
911 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 954 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
912 return 0; 955 return 0;
913 956
914 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 957 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
915 if (retval) 958 if (retval)
916 return retval; 959 return retval;
917 960
918 is_load_balanced = is_sched_load_balance(&trialcs); 961 is_load_balanced = is_sched_load_balance(trialcs);
919 962
920 mutex_lock(&callback_mutex); 963 mutex_lock(&callback_mutex);
921 cs->cpus_allowed = trialcs.cpus_allowed; 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
922 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
923 966
924 /* 967 /*
@@ -1006,7 +1049,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
1006 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1049 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1007 1050
1008 fudge = 10; /* spare mmarray[] slots */ 1051 fudge = 10; /* spare mmarray[] slots */
1009 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ 1052 fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
1010 retval = -ENOMEM; 1053 retval = -ENOMEM;
1011 1054
1012 /* 1055 /*
@@ -1093,9 +1136,9 @@ done:
1093 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1136 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1094 * their mempolicies to the cpusets new mems_allowed. 1137 * their mempolicies to the cpusets new mems_allowed.
1095 */ 1138 */
1096static int update_nodemask(struct cpuset *cs, const char *buf) 1139static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1140 const char *buf)
1097{ 1141{
1098 struct cpuset trialcs;
1099 nodemask_t oldmem; 1142 nodemask_t oldmem;
1100 int retval; 1143 int retval;
1101 1144
@@ -1106,8 +1149,6 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1106 if (cs == &top_cpuset) 1149 if (cs == &top_cpuset)
1107 return -EACCES; 1150 return -EACCES;
1108 1151
1109 trialcs = *cs;
1110
1111 /* 1152 /*
1112 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1153 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1113 * Since nodelist_parse() fails on an empty mask, we special case 1154 * Since nodelist_parse() fails on an empty mask, we special case
@@ -1115,27 +1156,27 @@ static int update_nodemask(struct cpuset *cs, const char *buf)
1115 * with tasks have memory. 1156 * with tasks have memory.
1116 */ 1157 */
1117 if (!*buf) { 1158 if (!*buf) {
1118 nodes_clear(trialcs.mems_allowed); 1159 nodes_clear(trialcs->mems_allowed);
1119 } else { 1160 } else {
1120 retval = nodelist_parse(buf, trialcs.mems_allowed); 1161 retval = nodelist_parse(buf, trialcs->mems_allowed);
1121 if (retval < 0) 1162 if (retval < 0)
1122 goto done; 1163 goto done;
1123 1164
1124 if (!nodes_subset(trialcs.mems_allowed, 1165 if (!nodes_subset(trialcs->mems_allowed,
1125 node_states[N_HIGH_MEMORY])) 1166 node_states[N_HIGH_MEMORY]))
1126 return -EINVAL; 1167 return -EINVAL;
1127 } 1168 }
1128 oldmem = cs->mems_allowed; 1169 oldmem = cs->mems_allowed;
1129 if (nodes_equal(oldmem, trialcs.mems_allowed)) { 1170 if (nodes_equal(oldmem, trialcs->mems_allowed)) {
1130 retval = 0; /* Too easy - nothing to do */ 1171 retval = 0; /* Too easy - nothing to do */
1131 goto done; 1172 goto done;
1132 } 1173 }
1133 retval = validate_change(cs, &trialcs); 1174 retval = validate_change(cs, trialcs);
1134 if (retval < 0) 1175 if (retval < 0)
1135 goto done; 1176 goto done;
1136 1177
1137 mutex_lock(&callback_mutex); 1178 mutex_lock(&callback_mutex);
1138 cs->mems_allowed = trialcs.mems_allowed; 1179 cs->mems_allowed = trialcs->mems_allowed;
1139 cs->mems_generation = cpuset_mems_generation++; 1180 cs->mems_generation = cpuset_mems_generation++;
1140 mutex_unlock(&callback_mutex); 1181 mutex_unlock(&callback_mutex);
1141 1182
@@ -1156,7 +1197,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1156 1197
1157 if (val != cs->relax_domain_level) { 1198 if (val != cs->relax_domain_level) {
1158 cs->relax_domain_level = val; 1199 cs->relax_domain_level = val;
1159 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) 1200 if (!cpumask_empty(cs->cpus_allowed) &&
1201 is_sched_load_balance(cs))
1160 async_rebuild_sched_domains(); 1202 async_rebuild_sched_domains();
1161 } 1203 }
1162 1204
@@ -1175,31 +1217,36 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1175static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1217static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1176 int turning_on) 1218 int turning_on)
1177{ 1219{
1178 struct cpuset trialcs; 1220 struct cpuset *trialcs;
1179 int err; 1221 int err;
1180 int balance_flag_changed; 1222 int balance_flag_changed;
1181 1223
1182 trialcs = *cs; 1224 trialcs = alloc_trial_cpuset(cs);
1225 if (!trialcs)
1226 return -ENOMEM;
1227
1183 if (turning_on) 1228 if (turning_on)
1184 set_bit(bit, &trialcs.flags); 1229 set_bit(bit, &trialcs->flags);
1185 else 1230 else
1186 clear_bit(bit, &trialcs.flags); 1231 clear_bit(bit, &trialcs->flags);
1187 1232
1188 err = validate_change(cs, &trialcs); 1233 err = validate_change(cs, trialcs);
1189 if (err < 0) 1234 if (err < 0)
1190 return err; 1235 goto out;
1191 1236
1192 balance_flag_changed = (is_sched_load_balance(cs) != 1237 balance_flag_changed = (is_sched_load_balance(cs) !=
1193 is_sched_load_balance(&trialcs)); 1238 is_sched_load_balance(trialcs));
1194 1239
1195 mutex_lock(&callback_mutex); 1240 mutex_lock(&callback_mutex);
1196 cs->flags = trialcs.flags; 1241 cs->flags = trialcs->flags;
1197 mutex_unlock(&callback_mutex); 1242 mutex_unlock(&callback_mutex);
1198 1243
1199 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) 1244 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1200 async_rebuild_sched_domains(); 1245 async_rebuild_sched_domains();
1201 1246
1202 return 0; 1247out:
1248 free_trial_cpuset(trialcs);
1249 return err;
1203} 1250}
1204 1251
1205/* 1252/*
@@ -1300,42 +1347,47 @@ static int fmeter_getrate(struct fmeter *fmp)
1300 return val; 1347 return val;
1301} 1348}
1302 1349
1350/* Protected by cgroup_lock */
1351static cpumask_var_t cpus_attach;
1352
1303/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1353/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1304static int cpuset_can_attach(struct cgroup_subsys *ss, 1354static int cpuset_can_attach(struct cgroup_subsys *ss,
1305 struct cgroup *cont, struct task_struct *tsk) 1355 struct cgroup *cont, struct task_struct *tsk)
1306{ 1356{
1307 struct cpuset *cs = cgroup_cs(cont); 1357 struct cpuset *cs = cgroup_cs(cont);
1358 int ret = 0;
1308 1359
1309 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1360 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1310 return -ENOSPC; 1361 return -ENOSPC;
1311 if (tsk->flags & PF_THREAD_BOUND) {
1312 cpumask_t mask;
1313 1362
1363 if (tsk->flags & PF_THREAD_BOUND) {
1314 mutex_lock(&callback_mutex); 1364 mutex_lock(&callback_mutex);
1315 mask = cs->cpus_allowed; 1365 if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
1366 ret = -EINVAL;
1316 mutex_unlock(&callback_mutex); 1367 mutex_unlock(&callback_mutex);
1317 if (!cpus_equal(tsk->cpus_allowed, mask))
1318 return -EINVAL;
1319 } 1368 }
1320 1369
1321 return security_task_setscheduler(tsk, 0, NULL); 1370 return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
1322} 1371}
1323 1372
1324static void cpuset_attach(struct cgroup_subsys *ss, 1373static void cpuset_attach(struct cgroup_subsys *ss,
1325 struct cgroup *cont, struct cgroup *oldcont, 1374 struct cgroup *cont, struct cgroup *oldcont,
1326 struct task_struct *tsk) 1375 struct task_struct *tsk)
1327{ 1376{
1328 cpumask_t cpus;
1329 nodemask_t from, to; 1377 nodemask_t from, to;
1330 struct mm_struct *mm; 1378 struct mm_struct *mm;
1331 struct cpuset *cs = cgroup_cs(cont); 1379 struct cpuset *cs = cgroup_cs(cont);
1332 struct cpuset *oldcs = cgroup_cs(oldcont); 1380 struct cpuset *oldcs = cgroup_cs(oldcont);
1333 int err; 1381 int err;
1334 1382
1335 mutex_lock(&callback_mutex); 1383 if (cs == &top_cpuset) {
1336 guarantee_online_cpus(cs, &cpus); 1384 cpumask_copy(cpus_attach, cpu_possible_mask);
1337 err = set_cpus_allowed_ptr(tsk, &cpus); 1385 } else {
1338 mutex_unlock(&callback_mutex); 1386 mutex_lock(&callback_mutex);
1387 guarantee_online_cpus(cs, cpus_attach);
1388 mutex_unlock(&callback_mutex);
1389 }
1390 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1339 if (err) 1391 if (err)
1340 return; 1392 return;
1341 1393
@@ -1348,7 +1400,6 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1348 cpuset_migrate_mm(mm, &from, &to); 1400 cpuset_migrate_mm(mm, &from, &to);
1349 mmput(mm); 1401 mmput(mm);
1350 } 1402 }
1351
1352} 1403}
1353 1404
1354/* The various types of files and directories in a cpuset file system */ 1405/* The various types of files and directories in a cpuset file system */
@@ -1443,21 +1494,29 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1443 const char *buf) 1494 const char *buf)
1444{ 1495{
1445 int retval = 0; 1496 int retval = 0;
1497 struct cpuset *cs = cgroup_cs(cgrp);
1498 struct cpuset *trialcs;
1446 1499
1447 if (!cgroup_lock_live_group(cgrp)) 1500 if (!cgroup_lock_live_group(cgrp))
1448 return -ENODEV; 1501 return -ENODEV;
1449 1502
1503 trialcs = alloc_trial_cpuset(cs);
1504 if (!trialcs)
1505 return -ENOMEM;
1506
1450 switch (cft->private) { 1507 switch (cft->private) {
1451 case FILE_CPULIST: 1508 case FILE_CPULIST:
1452 retval = update_cpumask(cgroup_cs(cgrp), buf); 1509 retval = update_cpumask(cs, trialcs, buf);
1453 break; 1510 break;
1454 case FILE_MEMLIST: 1511 case FILE_MEMLIST:
1455 retval = update_nodemask(cgroup_cs(cgrp), buf); 1512 retval = update_nodemask(cs, trialcs, buf);
1456 break; 1513 break;
1457 default: 1514 default:
1458 retval = -EINVAL; 1515 retval = -EINVAL;
1459 break; 1516 break;
1460 } 1517 }
1518
1519 free_trial_cpuset(trialcs);
1461 cgroup_unlock(); 1520 cgroup_unlock();
1462 return retval; 1521 return retval;
1463} 1522}
@@ -1476,13 +1535,13 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1476 1535
1477static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) 1536static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1478{ 1537{
1479 cpumask_t mask; 1538 int ret;
1480 1539
1481 mutex_lock(&callback_mutex); 1540 mutex_lock(&callback_mutex);
1482 mask = cs->cpus_allowed; 1541 ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
1483 mutex_unlock(&callback_mutex); 1542 mutex_unlock(&callback_mutex);
1484 1543
1485 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1544 return ret;
1486} 1545}
1487 1546
1488static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1547static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
@@ -1718,7 +1777,7 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1718 parent_cs = cgroup_cs(parent); 1777 parent_cs = cgroup_cs(parent);
1719 1778
1720 cs->mems_allowed = parent_cs->mems_allowed; 1779 cs->mems_allowed = parent_cs->mems_allowed;
1721 cs->cpus_allowed = parent_cs->cpus_allowed; 1780 cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
1722 return; 1781 return;
1723} 1782}
1724 1783
@@ -1744,6 +1803,10 @@ static struct cgroup_subsys_state *cpuset_create(
1744 cs = kmalloc(sizeof(*cs), GFP_KERNEL); 1803 cs = kmalloc(sizeof(*cs), GFP_KERNEL);
1745 if (!cs) 1804 if (!cs)
1746 return ERR_PTR(-ENOMEM); 1805 return ERR_PTR(-ENOMEM);
1806 if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
1807 kfree(cs);
1808 return ERR_PTR(-ENOMEM);
1809 }
1747 1810
1748 cpuset_update_task_memory_state(); 1811 cpuset_update_task_memory_state();
1749 cs->flags = 0; 1812 cs->flags = 0;
@@ -1752,7 +1815,7 @@ static struct cgroup_subsys_state *cpuset_create(
1752 if (is_spread_slab(parent)) 1815 if (is_spread_slab(parent))
1753 set_bit(CS_SPREAD_SLAB, &cs->flags); 1816 set_bit(CS_SPREAD_SLAB, &cs->flags);
1754 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1817 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1755 cpus_clear(cs->cpus_allowed); 1818 cpumask_clear(cs->cpus_allowed);
1756 nodes_clear(cs->mems_allowed); 1819 nodes_clear(cs->mems_allowed);
1757 cs->mems_generation = cpuset_mems_generation++; 1820 cs->mems_generation = cpuset_mems_generation++;
1758 fmeter_init(&cs->fmeter); 1821 fmeter_init(&cs->fmeter);
@@ -1779,6 +1842,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1779 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1842 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1780 1843
1781 number_of_cpusets--; 1844 number_of_cpusets--;
1845 free_cpumask_var(cs->cpus_allowed);
1782 kfree(cs); 1846 kfree(cs);
1783} 1847}
1784 1848
@@ -1802,6 +1866,8 @@ struct cgroup_subsys cpuset_subsys = {
1802 1866
1803int __init cpuset_init_early(void) 1867int __init cpuset_init_early(void)
1804{ 1868{
1869 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1870
1805 top_cpuset.mems_generation = cpuset_mems_generation++; 1871 top_cpuset.mems_generation = cpuset_mems_generation++;
1806 return 0; 1872 return 0;
1807} 1873}
@@ -1817,7 +1883,7 @@ int __init cpuset_init(void)
1817{ 1883{
1818 int err = 0; 1884 int err = 0;
1819 1885
1820 cpus_setall(top_cpuset.cpus_allowed); 1886 cpumask_setall(top_cpuset.cpus_allowed);
1821 nodes_setall(top_cpuset.mems_allowed); 1887 nodes_setall(top_cpuset.mems_allowed);
1822 1888
1823 fmeter_init(&top_cpuset.fmeter); 1889 fmeter_init(&top_cpuset.fmeter);
@@ -1829,6 +1895,9 @@ int __init cpuset_init(void)
1829 if (err < 0) 1895 if (err < 0)
1830 return err; 1896 return err;
1831 1897
1898 if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
1899 BUG();
1900
1832 number_of_cpusets = 1; 1901 number_of_cpusets = 1;
1833 return 0; 1902 return 0;
1834} 1903}
@@ -1903,7 +1972,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1903 * has online cpus, so can't be empty). 1972 * has online cpus, so can't be empty).
1904 */ 1973 */
1905 parent = cs->parent; 1974 parent = cs->parent;
1906 while (cpus_empty(parent->cpus_allowed) || 1975 while (cpumask_empty(parent->cpus_allowed) ||
1907 nodes_empty(parent->mems_allowed)) 1976 nodes_empty(parent->mems_allowed))
1908 parent = parent->parent; 1977 parent = parent->parent;
1909 1978
@@ -1944,7 +2013,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1944 } 2013 }
1945 2014
1946 /* Continue past cpusets with all cpus, mems online */ 2015 /* Continue past cpusets with all cpus, mems online */
1947 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 2016 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
1948 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2017 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1949 continue; 2018 continue;
1950 2019
@@ -1952,13 +2021,14 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1952 2021
1953 /* Remove offline cpus and mems from this cpuset. */ 2022 /* Remove offline cpus and mems from this cpuset. */
1954 mutex_lock(&callback_mutex); 2023 mutex_lock(&callback_mutex);
1955 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 2024 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2025 cpu_online_mask);
1956 nodes_and(cp->mems_allowed, cp->mems_allowed, 2026 nodes_and(cp->mems_allowed, cp->mems_allowed,
1957 node_states[N_HIGH_MEMORY]); 2027 node_states[N_HIGH_MEMORY]);
1958 mutex_unlock(&callback_mutex); 2028 mutex_unlock(&callback_mutex);
1959 2029
1960 /* Move tasks from the empty cpuset to a parent */ 2030 /* Move tasks from the empty cpuset to a parent */
1961 if (cpus_empty(cp->cpus_allowed) || 2031 if (cpumask_empty(cp->cpus_allowed) ||
1962 nodes_empty(cp->mems_allowed)) 2032 nodes_empty(cp->mems_allowed))
1963 remove_tasks_in_empty_cpuset(cp); 2033 remove_tasks_in_empty_cpuset(cp);
1964 else { 2034 else {
@@ -1984,7 +2054,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1984 unsigned long phase, void *unused_cpu) 2054 unsigned long phase, void *unused_cpu)
1985{ 2055{
1986 struct sched_domain_attr *attr; 2056 struct sched_domain_attr *attr;
1987 cpumask_t *doms; 2057 struct cpumask *doms;
1988 int ndoms; 2058 int ndoms;
1989 2059
1990 switch (phase) { 2060 switch (phase) {
@@ -1999,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1999 } 2069 }
2000 2070
2001 cgroup_lock(); 2071 cgroup_lock();
2002 top_cpuset.cpus_allowed = cpu_online_map; 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2003 scan_for_empty_cpusets(&top_cpuset); 2073 scan_for_empty_cpusets(&top_cpuset);
2004 ndoms = generate_sched_domains(&doms, &attr); 2074 ndoms = generate_sched_domains(&doms, &attr);
2005 cgroup_unlock(); 2075 cgroup_unlock();
@@ -2044,25 +2114,28 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2044 2114
2045void __init cpuset_init_smp(void) 2115void __init cpuset_init_smp(void)
2046{ 2116{
2047 top_cpuset.cpus_allowed = cpu_online_map; 2117 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
2048 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2118 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2049 2119
2050 hotcpu_notifier(cpuset_track_online_cpus, 0); 2120 hotcpu_notifier(cpuset_track_online_cpus, 0);
2051 hotplug_memory_notifier(cpuset_track_online_nodes, 10); 2121 hotplug_memory_notifier(cpuset_track_online_nodes, 10);
2122
2123 cpuset_wq = create_singlethread_workqueue("cpuset");
2124 BUG_ON(!cpuset_wq);
2052} 2125}
2053 2126
2054/** 2127/**
2055 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2128 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
2056 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2129 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
2057 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2130 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
2058 * 2131 *
2059 * Description: Returns the cpumask_t cpus_allowed of the cpuset 2132 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
2060 * attached to the specified @tsk. Guaranteed to return some non-empty 2133 * attached to the specified @tsk. Guaranteed to return some non-empty
2061 * subset of cpu_online_map, even if this means going outside the 2134 * subset of cpu_online_map, even if this means going outside the
2062 * tasks cpuset. 2135 * tasks cpuset.
2063 **/ 2136 **/
2064 2137
2065void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask) 2138void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2066{ 2139{
2067 mutex_lock(&callback_mutex); 2140 mutex_lock(&callback_mutex);
2068 cpuset_cpus_allowed_locked(tsk, pmask); 2141 cpuset_cpus_allowed_locked(tsk, pmask);
@@ -2073,7 +2146,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
2073 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 2146 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2074 * Must be called with callback_mutex held. 2147 * Must be called with callback_mutex held.
2075 **/ 2148 **/
2076void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask) 2149void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2077{ 2150{
2078 task_lock(tsk); 2151 task_lock(tsk);
2079 guarantee_online_cpus(task_cs(tsk), pmask); 2152 guarantee_online_cpus(task_cs(tsk), pmask);
@@ -2356,6 +2429,29 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
2356 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); 2429 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
2357} 2430}
2358 2431
2432/**
2433 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 2434 * @tsk: pointer to the task_struct of some task.
 2435 *
 2436 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 2437 * mems_allowed to the kernel log. Must hold task_lock(tsk) to allow
 2438 * dereferencing task_cs(tsk).
2439 */
2440void cpuset_print_task_mems_allowed(struct task_struct *tsk)
2441{
2442 struct dentry *dentry;
2443
2444 dentry = task_cs(tsk)->css.cgroup->dentry;
2445 spin_lock(&cpuset_buffer_lock);
2446 snprintf(cpuset_name, CPUSET_NAME_LEN,
2447 dentry ? (const char *)dentry->d_name.name : "/");
2448 nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
2449 tsk->mems_allowed);
2450 printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
2451 tsk->comm, cpuset_name, cpuset_nodelist);
2452 spin_unlock(&cpuset_buffer_lock);
2453}
2454
2359/* 2455/*
2360 * Collection of memory_pressure is suppressed unless 2456 * Collection of memory_pressure is suppressed unless
2361 * this flag is enabled by writing "1" to the special 2457 * this flag is enabled by writing "1" to the special
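
The cpuset hunks above are one piece of the tree-wide cpumask_t to cpumask_var_t conversion: the CPU mask is no longer embedded in struct cpuset but allocated with alloc_cpumask_var() (bootmem for top_cpuset, slab for every child cpuset) and released with free_cpumask_var() in the destroy path, while the old cpus_* operations become their pointer-based cpumask_* equivalents. A minimal sketch of that allocate/copy/free pattern, using only helpers that appear in the diff; the struct and function names are illustrative, not the cpuset code:

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/slab.h>

struct example_set {
	cpumask_var_t cpus_allowed;	/* was: cpumask_t cpus_allowed */
};

static struct example_set *example_set_create(const struct example_set *parent)
{
	struct example_set *es = kmalloc(sizeof(*es), GFP_KERNEL);

	if (!es)
		return ERR_PTR(-ENOMEM);
	/* Really allocates only with CONFIG_CPUMASK_OFFSTACK; a no-op otherwise. */
	if (!alloc_cpumask_var(&es->cpus_allowed, GFP_KERNEL)) {
		kfree(es);
		return ERR_PTR(-ENOMEM);
	}
	if (parent)
		cpumask_copy(es->cpus_allowed, parent->cpus_allowed);
	else
		cpumask_clear(es->cpus_allowed);
	return es;
}

static void example_set_destroy(struct example_set *es)
{
	free_cpumask_var(es->cpus_allowed);	/* before freeing the enclosing object */
	kfree(es);
}
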
diff --git a/kernel/cred.c b/kernel/cred.c
index ff7bc071991c..3a039189d707 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,7 +372,8 @@ int commit_creds(struct cred *new)
372 old->fsuid != new->fsuid || 372 old->fsuid != new->fsuid ||
373 old->fsgid != new->fsgid || 373 old->fsgid != new->fsgid ||
374 !cap_issubset(new->cap_permitted, old->cap_permitted)) { 374 !cap_issubset(new->cap_permitted, old->cap_permitted)) {
375 set_dumpable(task->mm, suid_dumpable); 375 if (task->mm)
376 set_dumpable(task->mm, suid_dumpable);
376 task->pdeath_signal = 0; 377 task->pdeath_signal = 0;
377 smp_wmb(); 378 smp_wmb();
378 } 379 }
@@ -506,6 +507,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
506 else 507 else
507 old = get_cred(&init_cred); 508 old = get_cred(&init_cred);
508 509
510 *new = *old;
509 get_uid(new->user); 511 get_uid(new->user);
510 get_group_info(new->group_info); 512 get_group_info(new->group_info);
511 513
@@ -529,6 +531,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
529 531
530error: 532error:
531 put_cred(new); 533 put_cred(new);
534 put_cred(old);
532 return NULL; 535 return NULL;
533} 536}
534EXPORT_SYMBOL(prepare_kernel_cred); 537EXPORT_SYMBOL(prepare_kernel_cred);
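
The cred.c hunks fix two bugs in prepare_kernel_cred(): the new credential was never actually copied from the donor (the added `*new = *old;`), and the reference taken on the donor was leaked on the error path (the added put_cred(old)). The general idiom is to copy the structure wholesale, then take a reference on every object it points to, and drop everything again if a later step fails. A standalone sketch of that idiom with a made-up refcounted type, not the cred API itself:

#include <stdlib.h>

struct blob {
	int refs;
};

static struct blob *blob_get(struct blob *b)
{
	b->refs++;
	return b;
}

static void blob_put(struct blob *b)
{
	if (--b->refs == 0)
		free(b);
}

struct record {
	struct blob *user;
	struct blob *group_info;
};

/* Duplicate 'old'; the caller already holds a reference on it. */
static struct record *record_dup(struct record *old)
{
	struct record *new = malloc(sizeof(*new));

	if (!new)
		return NULL;
	*new = *old;		/* the assignment prepare_kernel_cred() was missing */
	blob_get(new->user);
	blob_get(new->group_info);
	return new;
}

In prepare_kernel_cred() the donor credential is itself reference counted, so the error path also has to drop the reference taken on `old`; that is the second half of the fix.
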
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index f013a0c2e111..962a3b574f21 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
98 * @size: size of requested memory area 98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle 99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address 100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area. 101 * to allocated area.
102 * 102 *
103 * This function should be only called from per-arch dma_alloc_coherent() 103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools. 104 * to support allocation from per-device coherent memory pools.
@@ -109,20 +109,41 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
109int dma_alloc_from_coherent(struct device *dev, ssize_t size, 109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret) 110 dma_addr_t *dma_handle, void **ret)
111{ 111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; 112 struct dma_coherent_mem *mem;
113 int order = get_order(size); 113 int order = get_order(size);
114 int pageno;
114 115
115 if (mem) { 116 if (!dev)
116 int page = bitmap_find_free_region(mem->bitmap, mem->size, 117 return 0;
117 order); 118 mem = dev->dma_mem;
118 if (page >= 0) { 119 if (!mem)
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT); 120 return 0;
120 *ret = mem->virt_base + (page << PAGE_SHIFT); 121
121 memset(*ret, 0, size); 122 *ret = NULL;
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) 123
123 *ret = NULL; 124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
124 } 125 goto err;
125 return (mem != NULL); 126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
126} 147}
127EXPORT_SYMBOL(dma_alloc_from_coherent); 148EXPORT_SYMBOL(dma_alloc_from_coherent);
128 149
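
The rewritten dma_alloc_from_coherent() keeps the same contract but states it explicitly: it returns 1 when the request was handled from the per-device pool (either *ret is a valid allocation, or *ret is NULL and the pool is DMA_MEMORY_EXCLUSIVE so no fallback is allowed), and 0 when the caller should fall back to the generic allocator. A hedged sketch of how a per-arch dma_alloc_coherent() typically consumes that return value, as the function's own comment describes; the fallback helper is a placeholder, not a real API:

#include <linux/dma-mapping.h>

void *dma_alloc_coherent(struct device *dev, size_t size,
			 dma_addr_t *dma_handle, gfp_t gfp)
{
	void *ret;

	/* 1: handled by the per-device pool (ret may still be NULL if exclusive) */
	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
		return ret;

	/* 0: fall back to generic memory; the details are arch-specific */
	return arch_generic_dma_alloc(dev, size, dma_handle, gfp);
}
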
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 0511716e9424..667c841c2952 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -209,8 +209,7 @@ static int __init proc_execdomains_init(void)
209module_init(proc_execdomains_init); 209module_init(proc_execdomains_init);
210#endif 210#endif
211 211
212asmlinkage long 212SYSCALL_DEFINE1(personality, u_long, personality)
213sys_personality(u_long personality)
214{ 213{
215 u_long old = current->personality; 214 u_long old = current->personality;
216 215
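
exec_domain.c is the first of several files in this series (exit.c, fork.c, futex.c and hrtimer.c below follow suit) converted from open-coded `asmlinkage long sys_foo(...)` definitions to the SYSCALL_DEFINEn() macros, which take the syscall name followed by alternating type and parameter-name arguments and expand to the same definition, letting architectures that need it wrap the entry point (for example to sign-extend 32-bit arguments). A sketch of the shape for a hypothetical two-argument syscall; do_example() is a placeholder for the real work:

#include <linux/syscalls.h>

static long do_example(int fd, unsigned long flags)
{
	return 0;	/* placeholder for the real work */
}

/*
 * Replaces the open-coded form:
 *	asmlinkage long sys_example(int fd, unsigned long flags)
 */
SYSCALL_DEFINE2(example, int, fd, unsigned long, flags)
{
	return do_example(fd, flags);
}
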
diff --git a/kernel/exit.c b/kernel/exit.c
index c9e5a1c14e08..f80dec3f1875 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -642,35 +642,31 @@ retry:
642 /* 642 /*
643 * We found no owner yet mm_users > 1: this implies that we are 643 * We found no owner yet mm_users > 1: this implies that we are
644 * most likely racing with swapoff (try_to_unuse()) or /proc or 644 * most likely racing with swapoff (try_to_unuse()) or /proc or
645 * ptrace or page migration (get_task_mm()). Mark owner as NULL, 645 * ptrace or page migration (get_task_mm()). Mark owner as NULL.
646 * so that subsystems can understand the callback and take action.
647 */ 646 */
648 down_write(&mm->mmap_sem);
649 cgroup_mm_owner_callbacks(mm->owner, NULL);
650 mm->owner = NULL; 647 mm->owner = NULL;
651 up_write(&mm->mmap_sem);
652 return; 648 return;
653 649
654assign_new_owner: 650assign_new_owner:
655 BUG_ON(c == p); 651 BUG_ON(c == p);
656 get_task_struct(c); 652 get_task_struct(c);
657 read_unlock(&tasklist_lock);
658 down_write(&mm->mmap_sem);
659 /* 653 /*
660 * The task_lock protects c->mm from changing. 654 * The task_lock protects c->mm from changing.
661 * We always want mm->owner->mm == mm 655 * We always want mm->owner->mm == mm
662 */ 656 */
663 task_lock(c); 657 task_lock(c);
658 /*
659 * Delay read_unlock() till we have the task_lock()
660 * to ensure that c does not slip away underneath us
661 */
662 read_unlock(&tasklist_lock);
664 if (c->mm != mm) { 663 if (c->mm != mm) {
665 task_unlock(c); 664 task_unlock(c);
666 up_write(&mm->mmap_sem);
667 put_task_struct(c); 665 put_task_struct(c);
668 goto retry; 666 goto retry;
669 } 667 }
670 cgroup_mm_owner_callbacks(mm->owner, c);
671 mm->owner = c; 668 mm->owner = c;
672 task_unlock(c); 669 task_unlock(c);
673 up_write(&mm->mmap_sem);
674 put_task_struct(c); 670 put_task_struct(c);
675} 671}
676#endif /* CONFIG_MM_OWNER */ 672#endif /* CONFIG_MM_OWNER */
@@ -1055,10 +1051,7 @@ NORET_TYPE void do_exit(long code)
1055 preempt_count()); 1051 preempt_count());
1056 1052
1057 acct_update_integrals(tsk); 1053 acct_update_integrals(tsk);
1058 if (tsk->mm) { 1054
1059 update_hiwater_rss(tsk->mm);
1060 update_hiwater_vm(tsk->mm);
1061 }
1062 group_dead = atomic_dec_and_test(&tsk->signal->live); 1055 group_dead = atomic_dec_and_test(&tsk->signal->live);
1063 if (group_dead) { 1056 if (group_dead) {
1064 hrtimer_cancel(&tsk->signal->real_timer); 1057 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1148,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
1148 1141
1149EXPORT_SYMBOL(complete_and_exit); 1142EXPORT_SYMBOL(complete_and_exit);
1150 1143
1151asmlinkage long sys_exit(int error_code) 1144SYSCALL_DEFINE1(exit, int, error_code)
1152{ 1145{
1153 do_exit((error_code&0xff)<<8); 1146 do_exit((error_code&0xff)<<8);
1154} 1147}
@@ -1189,9 +1182,11 @@ do_group_exit(int exit_code)
1189 * wait4()-ing process will get the correct exit code - even if this 1182 * wait4()-ing process will get the correct exit code - even if this
1190 * thread is not the thread group leader. 1183 * thread is not the thread group leader.
1191 */ 1184 */
1192asmlinkage void sys_exit_group(int error_code) 1185SYSCALL_DEFINE1(exit_group, int, error_code)
1193{ 1186{
1194 do_group_exit((error_code & 0xff) << 8); 1187 do_group_exit((error_code & 0xff) << 8);
1188 /* NOTREACHED */
1189 return 0;
1195} 1190}
1196 1191
1197static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1192static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
@@ -1759,9 +1754,8 @@ end:
1759 return retval; 1754 return retval;
1760} 1755}
1761 1756
1762asmlinkage long sys_waitid(int which, pid_t upid, 1757SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1763 struct siginfo __user *infop, int options, 1758 infop, int, options, struct rusage __user *, ru)
1764 struct rusage __user *ru)
1765{ 1759{
1766 struct pid *pid = NULL; 1760 struct pid *pid = NULL;
1767 enum pid_type type; 1761 enum pid_type type;
@@ -1800,8 +1794,8 @@ asmlinkage long sys_waitid(int which, pid_t upid,
1800 return ret; 1794 return ret;
1801} 1795}
1802 1796
1803asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr, 1797SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1804 int options, struct rusage __user *ru) 1798 int, options, struct rusage __user *, ru)
1805{ 1799{
1806 struct pid *pid = NULL; 1800 struct pid *pid = NULL;
1807 enum pid_type type; 1801 enum pid_type type;
@@ -1838,7 +1832,7 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
1838 * sys_waitpid() remains for compatibility. waitpid() should be 1832 * sys_waitpid() remains for compatibility. waitpid() should be
1839 * implemented by calling sys_wait4() from libc.a. 1833 * implemented by calling sys_wait4() from libc.a.
1840 */ 1834 */
1841asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options) 1835SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
1842{ 1836{
1843 return sys_wait4(pid, stat_addr, options, NULL); 1837 return sys_wait4(pid, stat_addr, options, NULL);
1844} 1838}
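
The mm-owner hunk in exit.c reorders the locking so that task_lock(c) is taken (with a task_struct reference held) before tasklist_lock is dropped, closing the window in which the candidate owner could exit. The same hand-over-hand idea can be shown with ordinary pthread mutexes: pin the object under the lock that made it visible, and only then release that lock. All names below are made up:

#include <pthread.h>

struct item {
	pthread_mutex_t lock;
	int value;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *registry_head;	/* protected by registry_lock */

static int read_first_item(void)
{
	struct item *it;
	int v;

	pthread_mutex_lock(&registry_lock);
	it = registry_head;
	if (!it) {
		pthread_mutex_unlock(&registry_lock);
		return -1;
	}
	pthread_mutex_lock(&it->lock);		/* pin it while it is still visible... */
	pthread_mutex_unlock(&registry_lock);	/* ...then drop the lookup lock */

	/* Safe against concurrent changes; real code also needs a refcount
	 * to keep 'it' alive, which is what get_task_struct() does above. */
	v = it->value;
	pthread_mutex_unlock(&it->lock);
	return v;
}
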
diff --git a/kernel/fork.c b/kernel/fork.c
index 43cbf30669e6..242a706e7721 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -400,6 +400,18 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) 400#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) 401#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
402 402
403static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
404
405static int __init coredump_filter_setup(char *s)
406{
407 default_dump_filter =
408 (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
409 MMF_DUMP_FILTER_MASK;
410 return 1;
411}
412
413__setup("coredump_filter=", coredump_filter_setup);
414
403#include <linux/init_task.h> 415#include <linux/init_task.h>
404 416
405static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 417static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@ -408,8 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
408 atomic_set(&mm->mm_count, 1); 420 atomic_set(&mm->mm_count, 1);
409 init_rwsem(&mm->mmap_sem); 421 init_rwsem(&mm->mmap_sem);
410 INIT_LIST_HEAD(&mm->mmlist); 422 INIT_LIST_HEAD(&mm->mmlist);
411 mm->flags = (current->mm) ? current->mm->flags 423 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
412 : MMF_DUMP_FILTER_DEFAULT;
413 mm->core_state = NULL; 424 mm->core_state = NULL;
414 mm->nr_ptes = 0; 425 mm->nr_ptes = 0;
415 set_mm_counter(mm, file_rss, 0); 426 set_mm_counter(mm, file_rss, 0);
@@ -758,7 +769,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
758{ 769{
759 struct sighand_struct *sig; 770 struct sighand_struct *sig;
760 771
761 if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) { 772 if (clone_flags & CLONE_SIGHAND) {
762 atomic_inc(&current->sighand->count); 773 atomic_inc(&current->sighand->count);
763 return 0; 774 return 0;
764 } 775 }
@@ -806,17 +817,17 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
806static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 817static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
807{ 818{
808 struct signal_struct *sig; 819 struct signal_struct *sig;
809 int ret;
810 820
811 if (clone_flags & CLONE_THREAD) { 821 if (clone_flags & CLONE_THREAD) {
812 ret = thread_group_cputime_clone_thread(current); 822 atomic_inc(&current->signal->count);
813 if (likely(!ret)) { 823 atomic_inc(&current->signal->live);
814 atomic_inc(&current->signal->count); 824 return 0;
815 atomic_inc(&current->signal->live);
816 }
817 return ret;
818 } 825 }
819 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 826 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
827
828 if (sig)
829 posix_cpu_timers_init_group(sig);
830
820 tsk->signal = sig; 831 tsk->signal = sig;
821 if (!sig) 832 if (!sig)
822 return -ENOMEM; 833 return -ENOMEM;
@@ -853,8 +864,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
853 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 864 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
854 task_unlock(current->group_leader); 865 task_unlock(current->group_leader);
855 866
856 posix_cpu_timers_init_group(sig);
857
858 acct_init_pacct(&sig->pacct); 867 acct_init_pacct(&sig->pacct);
859 868
860 tty_audit_fork(sig); 869 tty_audit_fork(sig);
@@ -890,7 +899,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
890 clear_freeze_flag(p); 899 clear_freeze_flag(p);
891} 900}
892 901
893asmlinkage long sys_set_tid_address(int __user *tidptr) 902SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
894{ 903{
895 current->clear_child_tid = tidptr; 904 current->clear_child_tid = tidptr;
896 905
@@ -1115,12 +1124,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1115 1124
1116 if (pid != &init_struct_pid) { 1125 if (pid != &init_struct_pid) {
1117 retval = -ENOMEM; 1126 retval = -ENOMEM;
1118 pid = alloc_pid(task_active_pid_ns(p)); 1127 pid = alloc_pid(p->nsproxy->pid_ns);
1119 if (!pid) 1128 if (!pid)
1120 goto bad_fork_cleanup_io; 1129 goto bad_fork_cleanup_io;
1121 1130
1122 if (clone_flags & CLONE_NEWPID) { 1131 if (clone_flags & CLONE_NEWPID) {
1123 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1132 retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
1124 if (retval < 0) 1133 if (retval < 0)
1125 goto bad_fork_free_pid; 1134 goto bad_fork_free_pid;
1126 } 1135 }
@@ -1470,12 +1479,10 @@ void __init proc_caches_init(void)
1470 fs_cachep = kmem_cache_create("fs_cache", 1479 fs_cachep = kmem_cache_create("fs_cache",
1471 sizeof(struct fs_struct), 0, 1480 sizeof(struct fs_struct), 0,
1472 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1481 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1473 vm_area_cachep = kmem_cache_create("vm_area_struct",
1474 sizeof(struct vm_area_struct), 0,
1475 SLAB_PANIC, NULL);
1476 mm_cachep = kmem_cache_create("mm_struct", 1482 mm_cachep = kmem_cache_create("mm_struct",
1477 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1483 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1478 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1484 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1485 mmap_init();
1479} 1486}
1480 1487
1481/* 1488/*
@@ -1594,7 +1601,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
1594 * constructed. Here we are modifying the current, active, 1601 * constructed. Here we are modifying the current, active,
1595 * task_struct. 1602 * task_struct.
1596 */ 1603 */
1597asmlinkage long sys_unshare(unsigned long unshare_flags) 1604SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1598{ 1605{
1599 int err = 0; 1606 int err = 0;
1600 struct fs_struct *fs, *new_fs = NULL; 1607 struct fs_struct *fs, *new_fs = NULL;
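
fork.c gains a `coredump_filter=` boot parameter, so the default dump filter for every new mm can be chosen at boot instead of writing /proc/<pid>/coredump_filter into each process. It uses the standard __setup() mechanism for early command-line options; a minimal sketch of that mechanism with a made-up parameter:

#include <linux/init.h>
#include <linux/kernel.h>

static unsigned long example_threshold = 16;	/* default when not given on the command line */

/* Called early in boot with the text after "example_threshold=". */
static int __init example_threshold_setup(char *s)
{
	example_threshold = simple_strtoul(s, NULL, 0);
	return 1;	/* non-zero: option consumed, do not pass it on to init */
}
__setup("example_threshold=", example_threshold_setup);

Booting with something like coredump_filter=0x7f would then seed mm->flags of every new process from that value, shifted into the MMF_DUMP_FILTER bits as the hunk shows.
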
diff --git a/kernel/futex.c b/kernel/futex.c
index 7c6cbabe52b3..f89d373a9c6d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -170,8 +170,11 @@ static void get_futex_key_refs(union futex_key *key)
170 */ 170 */
171static void drop_futex_key_refs(union futex_key *key) 171static void drop_futex_key_refs(union futex_key *key)
172{ 172{
173 if (!key->both.ptr) 173 if (!key->both.ptr) {
174 /* If we're here then we tried to put a key we failed to get */
175 WARN_ON_ONCE(1);
174 return; 176 return;
177 }
175 178
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 179 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE: 180 case FUT_OFF_INODE:
@@ -730,8 +733,8 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
730 } 733 }
731 734
732 spin_unlock(&hb->lock); 735 spin_unlock(&hb->lock);
733out:
734 put_futex_key(fshared, &key); 736 put_futex_key(fshared, &key);
737out:
735 return ret; 738 return ret;
736} 739}
737 740
@@ -755,7 +758,7 @@ retryfull:
755 goto out; 758 goto out;
756 ret = get_futex_key(uaddr2, fshared, &key2); 759 ret = get_futex_key(uaddr2, fshared, &key2);
757 if (unlikely(ret != 0)) 760 if (unlikely(ret != 0))
758 goto out; 761 goto out_put_key1;
759 762
760 hb1 = hash_futex(&key1); 763 hb1 = hash_futex(&key1);
761 hb2 = hash_futex(&key2); 764 hb2 = hash_futex(&key2);
@@ -777,12 +780,12 @@ retry:
777 * but we might get them from range checking 780 * but we might get them from range checking
778 */ 781 */
779 ret = op_ret; 782 ret = op_ret;
780 goto out; 783 goto out_put_keys;
781#endif 784#endif
782 785
783 if (unlikely(op_ret != -EFAULT)) { 786 if (unlikely(op_ret != -EFAULT)) {
784 ret = op_ret; 787 ret = op_ret;
785 goto out; 788 goto out_put_keys;
786 } 789 }
787 790
788 /* 791 /*
@@ -796,7 +799,7 @@ retry:
796 ret = futex_handle_fault((unsigned long)uaddr2, 799 ret = futex_handle_fault((unsigned long)uaddr2,
797 attempt); 800 attempt);
798 if (ret) 801 if (ret)
799 goto out; 802 goto out_put_keys;
800 goto retry; 803 goto retry;
801 } 804 }
802 805
@@ -834,10 +837,11 @@ retry:
834 spin_unlock(&hb1->lock); 837 spin_unlock(&hb1->lock);
835 if (hb1 != hb2) 838 if (hb1 != hb2)
836 spin_unlock(&hb2->lock); 839 spin_unlock(&hb2->lock);
837out: 840out_put_keys:
838 put_futex_key(fshared, &key2); 841 put_futex_key(fshared, &key2);
842out_put_key1:
839 put_futex_key(fshared, &key1); 843 put_futex_key(fshared, &key1);
840 844out:
841 return ret; 845 return ret;
842} 846}
843 847
@@ -854,13 +858,13 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
854 struct futex_q *this, *next; 858 struct futex_q *this, *next;
855 int ret, drop_count = 0; 859 int ret, drop_count = 0;
856 860
857 retry: 861retry:
858 ret = get_futex_key(uaddr1, fshared, &key1); 862 ret = get_futex_key(uaddr1, fshared, &key1);
859 if (unlikely(ret != 0)) 863 if (unlikely(ret != 0))
860 goto out; 864 goto out;
861 ret = get_futex_key(uaddr2, fshared, &key2); 865 ret = get_futex_key(uaddr2, fshared, &key2);
862 if (unlikely(ret != 0)) 866 if (unlikely(ret != 0))
863 goto out; 867 goto out_put_key1;
864 868
865 hb1 = hash_futex(&key1); 869 hb1 = hash_futex(&key1);
866 hb2 = hash_futex(&key2); 870 hb2 = hash_futex(&key2);
@@ -882,7 +886,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
882 if (!ret) 886 if (!ret)
883 goto retry; 887 goto retry;
884 888
885 return ret; 889 goto out_put_keys;
886 } 890 }
887 if (curval != *cmpval) { 891 if (curval != *cmpval) {
888 ret = -EAGAIN; 892 ret = -EAGAIN;
@@ -927,9 +931,11 @@ out_unlock:
927 while (--drop_count >= 0) 931 while (--drop_count >= 0)
928 drop_futex_key_refs(&key1); 932 drop_futex_key_refs(&key1);
929 933
930out: 934out_put_keys:
931 put_futex_key(fshared, &key2); 935 put_futex_key(fshared, &key2);
936out_put_key1:
932 put_futex_key(fshared, &key1); 937 put_futex_key(fshared, &key1);
938out:
933 return ret; 939 return ret;
934} 940}
935 941
@@ -990,7 +996,7 @@ static int unqueue_me(struct futex_q *q)
990 int ret = 0; 996 int ret = 0;
991 997
992 /* In the common case we don't take the spinlock, which is nice. */ 998 /* In the common case we don't take the spinlock, which is nice. */
993 retry: 999retry:
994 lock_ptr = q->lock_ptr; 1000 lock_ptr = q->lock_ptr;
995 barrier(); 1001 barrier();
996 if (lock_ptr != NULL) { 1002 if (lock_ptr != NULL) {
@@ -1172,11 +1178,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1172 1178
1173 q.pi_state = NULL; 1179 q.pi_state = NULL;
1174 q.bitset = bitset; 1180 q.bitset = bitset;
1175 retry: 1181retry:
1176 q.key = FUTEX_KEY_INIT; 1182 q.key = FUTEX_KEY_INIT;
1177 ret = get_futex_key(uaddr, fshared, &q.key); 1183 ret = get_futex_key(uaddr, fshared, &q.key);
1178 if (unlikely(ret != 0)) 1184 if (unlikely(ret != 0))
1179 goto out_release_sem; 1185 goto out;
1180 1186
1181 hb = queue_lock(&q); 1187 hb = queue_lock(&q);
1182 1188
@@ -1204,6 +1210,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1204 1210
1205 if (unlikely(ret)) { 1211 if (unlikely(ret)) {
1206 queue_unlock(&q, hb); 1212 queue_unlock(&q, hb);
1213 put_futex_key(fshared, &q.key);
1207 1214
1208 ret = get_user(uval, uaddr); 1215 ret = get_user(uval, uaddr);
1209 1216
@@ -1213,7 +1220,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1213 } 1220 }
1214 ret = -EWOULDBLOCK; 1221 ret = -EWOULDBLOCK;
1215 if (uval != val) 1222 if (uval != val)
1216 goto out_unlock_release_sem; 1223 goto out_unlock_put_key;
1217 1224
1218 /* Only actually queue if *uaddr contained val. */ 1225 /* Only actually queue if *uaddr contained val. */
1219 queue_me(&q, hb); 1226 queue_me(&q, hb);
@@ -1305,11 +1312,11 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1305 return -ERESTART_RESTARTBLOCK; 1312 return -ERESTART_RESTARTBLOCK;
1306 } 1313 }
1307 1314
1308 out_unlock_release_sem: 1315out_unlock_put_key:
1309 queue_unlock(&q, hb); 1316 queue_unlock(&q, hb);
1310
1311 out_release_sem:
1312 put_futex_key(fshared, &q.key); 1317 put_futex_key(fshared, &q.key);
1318
1319out:
1313 return ret; 1320 return ret;
1314} 1321}
1315 1322
@@ -1358,16 +1365,16 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1358 } 1365 }
1359 1366
1360 q.pi_state = NULL; 1367 q.pi_state = NULL;
1361 retry: 1368retry:
1362 q.key = FUTEX_KEY_INIT; 1369 q.key = FUTEX_KEY_INIT;
1363 ret = get_futex_key(uaddr, fshared, &q.key); 1370 ret = get_futex_key(uaddr, fshared, &q.key);
1364 if (unlikely(ret != 0)) 1371 if (unlikely(ret != 0))
1365 goto out_release_sem; 1372 goto out;
1366 1373
1367 retry_unlocked: 1374retry_unlocked:
1368 hb = queue_lock(&q); 1375 hb = queue_lock(&q);
1369 1376
1370 retry_locked: 1377retry_locked:
1371 ret = lock_taken = 0; 1378 ret = lock_taken = 0;
1372 1379
1373 /* 1380 /*
@@ -1388,14 +1395,14 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1388 */ 1395 */
1389 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { 1396 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1390 ret = -EDEADLK; 1397 ret = -EDEADLK;
1391 goto out_unlock_release_sem; 1398 goto out_unlock_put_key;
1392 } 1399 }
1393 1400
1394 /* 1401 /*
1395 * Surprise - we got the lock. Just return to userspace: 1402 * Surprise - we got the lock. Just return to userspace:
1396 */ 1403 */
1397 if (unlikely(!curval)) 1404 if (unlikely(!curval))
1398 goto out_unlock_release_sem; 1405 goto out_unlock_put_key;
1399 1406
1400 uval = curval; 1407 uval = curval;
1401 1408
@@ -1431,7 +1438,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1431 * We took the lock due to owner died take over. 1438 * We took the lock due to owner died take over.
1432 */ 1439 */
1433 if (unlikely(lock_taken)) 1440 if (unlikely(lock_taken))
1434 goto out_unlock_release_sem; 1441 goto out_unlock_put_key;
1435 1442
1436 /* 1443 /*
1437 * We dont have the lock. Look up the PI state (or create it if 1444 * We dont have the lock. Look up the PI state (or create it if
@@ -1470,7 +1477,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1470 goto retry_locked; 1477 goto retry_locked;
1471 } 1478 }
1472 default: 1479 default:
1473 goto out_unlock_release_sem; 1480 goto out_unlock_put_key;
1474 } 1481 }
1475 } 1482 }
1476 1483
@@ -1561,16 +1568,17 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1561 destroy_hrtimer_on_stack(&to->timer); 1568 destroy_hrtimer_on_stack(&to->timer);
1562 return ret != -EINTR ? ret : -ERESTARTNOINTR; 1569 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1563 1570
1564 out_unlock_release_sem: 1571out_unlock_put_key:
1565 queue_unlock(&q, hb); 1572 queue_unlock(&q, hb);
1566 1573
1567 out_release_sem: 1574out_put_key:
1568 put_futex_key(fshared, &q.key); 1575 put_futex_key(fshared, &q.key);
1576out:
1569 if (to) 1577 if (to)
1570 destroy_hrtimer_on_stack(&to->timer); 1578 destroy_hrtimer_on_stack(&to->timer);
1571 return ret; 1579 return ret;
1572 1580
1573 uaddr_faulted: 1581uaddr_faulted:
1574 /* 1582 /*
1575 * We have to r/w *(int __user *)uaddr, and we have to modify it 1583 * We have to r/w *(int __user *)uaddr, and we have to modify it
1576 * atomically. Therefore, if we continue to fault after get_user() 1584 * atomically. Therefore, if we continue to fault after get_user()
@@ -1583,7 +1591,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1583 if (attempt++) { 1591 if (attempt++) {
1584 ret = futex_handle_fault((unsigned long)uaddr, attempt); 1592 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1585 if (ret) 1593 if (ret)
1586 goto out_release_sem; 1594 goto out_put_key;
1587 goto retry_unlocked; 1595 goto retry_unlocked;
1588 } 1596 }
1589 1597
@@ -1675,9 +1683,9 @@ retry_unlocked:
1675 1683
1676out_unlock: 1684out_unlock:
1677 spin_unlock(&hb->lock); 1685 spin_unlock(&hb->lock);
1678out:
1679 put_futex_key(fshared, &key); 1686 put_futex_key(fshared, &key);
1680 1687
1688out:
1681 return ret; 1689 return ret;
1682 1690
1683pi_faulted: 1691pi_faulted:
@@ -1725,9 +1733,8 @@ pi_faulted:
1725 * @head: pointer to the list-head 1733 * @head: pointer to the list-head
1726 * @len: length of the list-head, as userspace expects 1734 * @len: length of the list-head, as userspace expects
1727 */ 1735 */
1728asmlinkage long 1736SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
1729sys_set_robust_list(struct robust_list_head __user *head, 1737 size_t, len)
1730 size_t len)
1731{ 1738{
1732 if (!futex_cmpxchg_enabled) 1739 if (!futex_cmpxchg_enabled)
1733 return -ENOSYS; 1740 return -ENOSYS;
@@ -1748,9 +1755,9 @@ sys_set_robust_list(struct robust_list_head __user *head,
1748 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 1755 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
1749 * @len_ptr: pointer to a length field, the kernel fills in the header size 1756 * @len_ptr: pointer to a length field, the kernel fills in the header size
1750 */ 1757 */
1751asmlinkage long 1758SYSCALL_DEFINE3(get_robust_list, int, pid,
1752sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, 1759 struct robust_list_head __user * __user *, head_ptr,
1753 size_t __user *len_ptr) 1760 size_t __user *, len_ptr)
1754{ 1761{
1755 struct robust_list_head __user *head; 1762 struct robust_list_head __user *head;
1756 unsigned long ret; 1763 unsigned long ret;
@@ -1970,9 +1977,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1970} 1977}
1971 1978
1972 1979
1973asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, 1980SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1974 struct timespec __user *utime, u32 __user *uaddr2, 1981 struct timespec __user *, utime, u32 __user *, uaddr2,
1975 u32 val3) 1982 u32, val3)
1976{ 1983{
1977 struct timespec ts; 1984 struct timespec ts;
1978 ktime_t t, *tp = NULL; 1985 ktime_t t, *tp = NULL;
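
Most of the futex.c churn renames and layers the exit labels so that each label undoes exactly what had been acquired before the failure point: out_put_keys drops both futex keys, out_put_key1 only the first, and the plain out label drops nothing. That cascading-goto unwinding idiom, shown standalone in plain userspace C:

#include <stdio.h>
#include <stdlib.h>

static int do_two_steps(void)
{
	char *first, *second;
	int ret = -1;

	first = malloc(32);
	if (!first)
		goto out;		/* nothing to undo yet */

	second = malloc(32);
	if (!second)
		goto out_free_first;	/* undo only the first step */

	/* ... the actual work would happen here ... */
	ret = 0;

	free(second);
out_free_first:
	free(first);
out:
	return ret;
}

int main(void)
{
	return do_two_steps() ? EXIT_FAILURE : EXIT_SUCCESS;
}
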
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index bda9cb924276..f394d2a42ca3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,6 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/irq.h>
36#include <linux/module.h> 35#include <linux/module.h>
37#include <linux/percpu.h> 36#include <linux/percpu.h>
38#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
@@ -502,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
502 continue; 501 continue;
503 timer = rb_entry(base->first, struct hrtimer, node); 502 timer = rb_entry(base->first, struct hrtimer, node);
504 expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 503 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
504 /*
505 * clock_was_set() has changed base->offset so the
506 * result might be negative. Fix it up to prevent a
507 * false positive in clockevents_program_event()
508 */
509 if (expires.tv64 < 0)
510 expires.tv64 = 0;
505 if (expires.tv64 < cpu_base->expires_next.tv64) 511 if (expires.tv64 < cpu_base->expires_next.tv64)
506 cpu_base->expires_next = expires; 512 cpu_base->expires_next = expires;
507 } 513 }
@@ -615,7 +621,9 @@ void clock_was_set(void)
615 */ 621 */
616void hres_timers_resume(void) 622void hres_timers_resume(void)
617{ 623{
618 /* Retrigger the CPU local events: */ 624 WARN_ONCE(!irqs_disabled(),
625 KERN_INFO "hres_timers_resume() called with IRQs enabled!");
626
619 retrigger_next_event(NULL); 627 retrigger_next_event(NULL);
620} 628}
621 629
@@ -635,7 +643,6 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
635{ 643{
636} 644}
637 645
638static void __run_hrtimer(struct hrtimer *timer);
639 646
640/* 647/*
641 * When High resolution timers are active, try to reprogram. Note, that in case 648 * When High resolution timers are active, try to reprogram. Note, that in case
@@ -647,13 +654,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
647 struct hrtimer_clock_base *base) 654 struct hrtimer_clock_base *base)
648{ 655{
649 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 656 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
650 /* 657 spin_unlock(&base->cpu_base->lock);
651 * XXX: recursion check? 658 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
652 * hrtimer_forward() should round up with timer granularity 659 spin_lock(&base->cpu_base->lock);
653 * so that we never get into inf recursion here,
654 * it doesn't do that though
655 */
656 __run_hrtimer(timer);
657 return 1; 660 return 1;
658 } 661 }
659 return 0; 662 return 0;
@@ -706,11 +709,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
706} 709}
707static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 710static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
708static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 711static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
709static inline int hrtimer_reprogram(struct hrtimer *timer,
710 struct hrtimer_clock_base *base)
711{
712 return 0;
713}
714 712
715#endif /* CONFIG_HIGH_RES_TIMERS */ 713#endif /* CONFIG_HIGH_RES_TIMERS */
716 714
@@ -781,9 +779,11 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
781 * 779 *
782 * The timer is inserted in expiry order. Insertion into the 780 * The timer is inserted in expiry order. Insertion into the
783 * red black tree is O(log(n)). Must hold the base lock. 781 * red black tree is O(log(n)). Must hold the base lock.
782 *
783 * Returns 1 when the new timer is the leftmost timer in the tree.
784 */ 784 */
785static void enqueue_hrtimer(struct hrtimer *timer, 785static int enqueue_hrtimer(struct hrtimer *timer,
786 struct hrtimer_clock_base *base, int reprogram) 786 struct hrtimer_clock_base *base)
787{ 787{
788 struct rb_node **link = &base->active.rb_node; 788 struct rb_node **link = &base->active.rb_node;
789 struct rb_node *parent = NULL; 789 struct rb_node *parent = NULL;
@@ -815,20 +815,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
815 * Insert the timer to the rbtree and check whether it 815 * Insert the timer to the rbtree and check whether it
816 * replaces the first pending timer 816 * replaces the first pending timer
817 */ 817 */
818 if (leftmost) { 818 if (leftmost)
819 /*
820 * Reprogram the clock event device. When the timer is already
821 * expired hrtimer_enqueue_reprogram has either called the
822 * callback or added it to the pending list and raised the
823 * softirq.
824 *
825 * This is a NOP for !HIGHRES
826 */
827 if (reprogram && hrtimer_enqueue_reprogram(timer, base))
828 return;
829
830 base->first = &timer->node; 819 base->first = &timer->node;
831 }
832 820
833 rb_link_node(&timer->node, parent, link); 821 rb_link_node(&timer->node, parent, link);
834 rb_insert_color(&timer->node, &base->active); 822 rb_insert_color(&timer->node, &base->active);
@@ -837,6 +825,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
837 * state of a possibly running callback. 825 * state of a possibly running callback.
838 */ 826 */
839 timer->state |= HRTIMER_STATE_ENQUEUED; 827 timer->state |= HRTIMER_STATE_ENQUEUED;
828
829 return leftmost;
840} 830}
841 831
842/* 832/*
@@ -913,7 +903,7 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
913{ 903{
914 struct hrtimer_clock_base *base, *new_base; 904 struct hrtimer_clock_base *base, *new_base;
915 unsigned long flags; 905 unsigned long flags;
916 int ret; 906 int ret, leftmost;
917 907
918 base = lock_hrtimer_base(timer, &flags); 908 base = lock_hrtimer_base(timer, &flags);
919 909
@@ -941,12 +931,16 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
941 931
942 timer_stats_hrtimer_set_start_info(timer); 932 timer_stats_hrtimer_set_start_info(timer);
943 933
934 leftmost = enqueue_hrtimer(timer, new_base);
935
944 /* 936 /*
945 * Only allow reprogramming if the new base is on this CPU. 937 * Only allow reprogramming if the new base is on this CPU.
946 * (it might still be on another CPU if the timer was pending) 938 * (it might still be on another CPU if the timer was pending)
939 *
940 * XXX send_remote_softirq() ?
947 */ 941 */
948 enqueue_hrtimer(timer, new_base, 942 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
949 new_base->cpu_base == &__get_cpu_var(hrtimer_bases)); 943 hrtimer_enqueue_reprogram(timer, new_base);
950 944
951 unlock_hrtimer_base(timer, &flags); 945 unlock_hrtimer_base(timer, &flags);
952 946
@@ -1158,19 +1152,42 @@ static void __run_hrtimer(struct hrtimer *timer)
1158 spin_lock(&cpu_base->lock); 1152 spin_lock(&cpu_base->lock);
1159 1153
1160 /* 1154 /*
1161 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid 1155 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
1162 * reprogramming of the event hardware. This happens at the end of this 1156 * we do not reprogramm the event hardware. Happens either in
1163 * function anyway. 1157 * hrtimer_start_range_ns() or in hrtimer_interrupt()
1164 */ 1158 */
1165 if (restart != HRTIMER_NORESTART) { 1159 if (restart != HRTIMER_NORESTART) {
1166 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); 1160 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1167 enqueue_hrtimer(timer, base, 0); 1161 enqueue_hrtimer(timer, base);
1168 } 1162 }
1169 timer->state &= ~HRTIMER_STATE_CALLBACK; 1163 timer->state &= ~HRTIMER_STATE_CALLBACK;
1170} 1164}
1171 1165
1172#ifdef CONFIG_HIGH_RES_TIMERS 1166#ifdef CONFIG_HIGH_RES_TIMERS
1173 1167
1168static int force_clock_reprogram;
1169
1170/*
 1171 * After 5 failed attempts we consider that hrtimer_interrupt() is
 1172 * hanging, which can happen when something (tracing, for instance)
 1173 * slows down the interrupt. From then on we force clock reprogramming
 1174 * for every future hrtimer interrupt to avoid infinite loops, and raise
 1175 * the min_delta_ns threshold accordingly.
 1176 * The next tick event is scheduled at 3 times the time currently spent
 1177 * in hrtimer_interrupt(), so a CPU spends at most 1/4 of its time
 1178 * processing hrtimer interrupts, which is enough to keep running
 1179 * without serious starvation.
1180 */
1181
1182static inline void
1183hrtimer_interrupt_hanging(struct clock_event_device *dev,
1184 ktime_t try_time)
1185{
1186 force_clock_reprogram = 1;
1187 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1188 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1189 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1190}
1174/* 1191/*
1175 * High resolution timer interrupt 1192 * High resolution timer interrupt
1176 * Called with interrupts disabled 1193 * Called with interrupts disabled
@@ -1180,6 +1197,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1180 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1197 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1181 struct hrtimer_clock_base *base; 1198 struct hrtimer_clock_base *base;
1182 ktime_t expires_next, now; 1199 ktime_t expires_next, now;
1200 int nr_retries = 0;
1183 int i; 1201 int i;
1184 1202
1185 BUG_ON(!cpu_base->hres_active); 1203 BUG_ON(!cpu_base->hres_active);
@@ -1187,6 +1205,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1187 dev->next_event.tv64 = KTIME_MAX; 1205 dev->next_event.tv64 = KTIME_MAX;
1188 1206
1189 retry: 1207 retry:
1208 /* 5 retries is enough to notice a hang */
1209 if (!(++nr_retries % 5))
1210 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1211
1190 now = ktime_get(); 1212 now = ktime_get();
1191 1213
1192 expires_next.tv64 = KTIME_MAX; 1214 expires_next.tv64 = KTIME_MAX;
@@ -1239,11 +1261,27 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1239 1261
1240 /* Reprogramming necessary ? */ 1262 /* Reprogramming necessary ? */
1241 if (expires_next.tv64 != KTIME_MAX) { 1263 if (expires_next.tv64 != KTIME_MAX) {
1242 if (tick_program_event(expires_next, 0)) 1264 if (tick_program_event(expires_next, force_clock_reprogram))
1243 goto retry; 1265 goto retry;
1244 } 1266 }
1245} 1267}
1246 1268
1269/*
1270 * local version of hrtimer_peek_ahead_timers() called with interrupts
1271 * disabled.
1272 */
1273static void __hrtimer_peek_ahead_timers(void)
1274{
1275 struct tick_device *td;
1276
1277 if (!hrtimer_hres_active())
1278 return;
1279
1280 td = &__get_cpu_var(tick_cpu_device);
1281 if (td && td->evtdev)
1282 hrtimer_interrupt(td->evtdev);
1283}
1284
1247/** 1285/**
1248 * hrtimer_peek_ahead_timers -- run soft-expired timers now 1286 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1249 * 1287 *
@@ -1255,20 +1293,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1255 */ 1293 */
1256void hrtimer_peek_ahead_timers(void) 1294void hrtimer_peek_ahead_timers(void)
1257{ 1295{
1258 struct tick_device *td;
1259 unsigned long flags; 1296 unsigned long flags;
1260 1297
1261 if (!hrtimer_hres_active())
1262 return;
1263
1264 local_irq_save(flags); 1298 local_irq_save(flags);
1265 td = &__get_cpu_var(tick_cpu_device); 1299 __hrtimer_peek_ahead_timers();
1266 if (td && td->evtdev)
1267 hrtimer_interrupt(td->evtdev);
1268 local_irq_restore(flags); 1300 local_irq_restore(flags);
1269} 1301}
1270 1302
1271#endif /* CONFIG_HIGH_RES_TIMERS */ 1303static void run_hrtimer_softirq(struct softirq_action *h)
1304{
1305 hrtimer_peek_ahead_timers();
1306}
1307
1308#else /* CONFIG_HIGH_RES_TIMERS */
1309
1310static inline void __hrtimer_peek_ahead_timers(void) { }
1311
1312#endif /* !CONFIG_HIGH_RES_TIMERS */
1272 1313
1273/* 1314/*
1274 * Called from timer softirq every jiffy, expire hrtimers: 1315 * Called from timer softirq every jiffy, expire hrtimers:
@@ -1463,8 +1504,8 @@ out:
1463 return ret; 1504 return ret;
1464} 1505}
1465 1506
1466asmlinkage long 1507SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
1467sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) 1508 struct timespec __user *, rmtp)
1468{ 1509{
1469 struct timespec tu; 1510 struct timespec tu;
1470 1511
@@ -1514,39 +1555,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1514 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); 1555 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1515 timer->base = new_base; 1556 timer->base = new_base;
1516 /* 1557 /*
1517 * Enqueue the timers on the new cpu, but do not reprogram 1558 * Enqueue the timers on the new cpu. This does not
1518 * the timer as that would enable a deadlock between 1559 * reprogram the event device in case the timer
1519 * hrtimer_enqueue_reprogramm() running the timer and us still 1560 * expires before the earliest on this CPU, but we run
1520 * holding a nested base lock. 1561 * hrtimer_interrupt after we migrated everything to
1521 * 1562 * sort out already expired timers and reprogram the
1522 * Instead we tickle the hrtimer interrupt after the migration 1563 * event device.
1523 * is done, which will run all expired timers and re-programm
1524 * the timer device.
1525 */ 1564 */
1526 enqueue_hrtimer(timer, new_base, 0); 1565 enqueue_hrtimer(timer, new_base);
1527 1566
1528 /* Clear the migration state bit */ 1567 /* Clear the migration state bit */
1529 timer->state &= ~HRTIMER_STATE_MIGRATE; 1568 timer->state &= ~HRTIMER_STATE_MIGRATE;
1530 } 1569 }
1531} 1570}
1532 1571
1533static int migrate_hrtimers(int scpu) 1572static void migrate_hrtimers(int scpu)
1534{ 1573{
1535 struct hrtimer_cpu_base *old_base, *new_base; 1574 struct hrtimer_cpu_base *old_base, *new_base;
1536 int dcpu, i; 1575 int i;
1537 1576
1538 BUG_ON(cpu_online(scpu)); 1577 BUG_ON(cpu_online(scpu));
1539 old_base = &per_cpu(hrtimer_bases, scpu);
1540 new_base = &get_cpu_var(hrtimer_bases);
1541
1542 dcpu = smp_processor_id();
1543
1544 tick_cancel_sched_timer(scpu); 1578 tick_cancel_sched_timer(scpu);
1579
1580 local_irq_disable();
1581 old_base = &per_cpu(hrtimer_bases, scpu);
1582 new_base = &__get_cpu_var(hrtimer_bases);
1545 /* 1583 /*
1546 * The caller is globally serialized and nobody else 1584 * The caller is globally serialized and nobody else
1547 * takes two locks at once, deadlock is not possible. 1585 * takes two locks at once, deadlock is not possible.
1548 */ 1586 */
1549 spin_lock_irq(&new_base->lock); 1587 spin_lock(&new_base->lock);
1550 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1588 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1551 1589
1552 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1590 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
@@ -1555,15 +1593,11 @@ static int migrate_hrtimers(int scpu)
1555 } 1593 }
1556 1594
1557 spin_unlock(&old_base->lock); 1595 spin_unlock(&old_base->lock);
1558 spin_unlock_irq(&new_base->lock); 1596 spin_unlock(&new_base->lock);
1559 put_cpu_var(hrtimer_bases);
1560 1597
1561 return dcpu; 1598 /* Check, if we got expired work to do */
1562} 1599 __hrtimer_peek_ahead_timers();
1563 1600 local_irq_enable();
1564static void tickle_timers(void *arg)
1565{
1566 hrtimer_peek_ahead_timers();
1567} 1601}
1568 1602
1569#endif /* CONFIG_HOTPLUG_CPU */ 1603#endif /* CONFIG_HOTPLUG_CPU */
@@ -1581,14 +1615,15 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1581 break; 1615 break;
1582 1616
1583#ifdef CONFIG_HOTPLUG_CPU 1617#ifdef CONFIG_HOTPLUG_CPU
1618 case CPU_DYING:
1619 case CPU_DYING_FROZEN:
1620 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
1621 break;
1584 case CPU_DEAD: 1622 case CPU_DEAD:
1585 case CPU_DEAD_FROZEN: 1623 case CPU_DEAD_FROZEN:
1586 { 1624 {
1587 int dcpu;
1588
1589 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); 1625 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
1590 dcpu = migrate_hrtimers(scpu); 1626 migrate_hrtimers(scpu);
1591 smp_call_function_single(dcpu, tickle_timers, NULL, 0);
1592 break; 1627 break;
1593 } 1628 }
1594#endif 1629#endif
@@ -1609,6 +1644,9 @@ void __init hrtimers_init(void)
1609 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, 1644 hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
1610 (void *)(long)smp_processor_id()); 1645 (void *)(long)smp_processor_id());
1611 register_cpu_notifier(&hrtimers_nb); 1646 register_cpu_notifier(&hrtimers_nb);
1647#ifdef CONFIG_HIGH_RES_TIMERS
1648 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1649#endif
1612} 1650}
1613 1651
1614/** 1652/**
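
The central change in the hrtimer rework is that enqueue_hrtimer() no longer reprograms the clock event device (or runs already-expired timers) itself; it only reports whether the new timer became the leftmost, i.e. soonest-expiring, entry, and the callers decide whether to reprogram, deferring to HRTIMER_SOFTIRQ when they cannot do it in their context. A toy illustration of that "insert, report whether there is a new minimum, let the caller react" split, in plain userspace C:

#include <stdio.h>

#define MAX_TIMERS 8

static long expiry[MAX_TIMERS];
static int nr_timers;

/* Insert in sorted order; return 1 if the new entry became the earliest one. */
static int enqueue(long expires)
{
	int i = nr_timers++;

	while (i > 0 && expiry[i - 1] > expires) {
		expiry[i] = expiry[i - 1];
		i--;
	}
	expiry[i] = expires;
	return i == 0;
}

int main(void)
{
	long samples[] = { 50, 20, 70 };
	int i;

	for (i = 0; i < 3; i++) {
		/* The caller, not enqueue(), decides whether to touch the hardware. */
		if (enqueue(samples[i]))
			printf("new earliest timer: reprogram the event device for %ld\n",
			       samples[i]);
	}
	return 0;
}
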
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 650ce4102a63..1de9700f416e 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/async.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
@@ -34,15 +35,16 @@ unsigned long probe_irq_on(void)
34 unsigned int status; 35 unsigned int status;
35 int i; 36 int i;
36 37
38 /*
39 * quiesce the kernel, or at least the asynchronous portion
40 */
41 async_synchronize_full();
37 mutex_lock(&probing_active); 42 mutex_lock(&probing_active);
38 /* 43 /*
39 * something may have generated an irq long ago and we want to 44 * something may have generated an irq long ago and we want to
40 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
41 */ 46 */
42 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
43 if (!desc)
44 continue;
45
46 spin_lock_irq(&desc->lock); 48 spin_lock_irq(&desc->lock);
47 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
48 /* 50 /*
@@ -71,9 +73,6 @@ unsigned long probe_irq_on(void)
71 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
72 */ 74 */
73 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
74 if (!desc)
75 continue;
76
77 spin_lock_irq(&desc->lock); 76 spin_lock_irq(&desc->lock);
78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -92,9 +91,6 @@ unsigned long probe_irq_on(void)
92 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
93 */ 92 */
94 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
95 if (!desc)
96 continue;
97
98 spin_lock_irq(&desc->lock); 94 spin_lock_irq(&desc->lock);
99 status = desc->status; 95 status = desc->status;
100 96
@@ -133,9 +129,6 @@ unsigned int probe_irq_mask(unsigned long val)
133 int i; 129 int i;
134 130
135 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
136 if (!desc)
137 continue;
138
139 spin_lock_irq(&desc->lock); 132 spin_lock_irq(&desc->lock);
140 status = desc->status; 133 status = desc->status;
141 134
@@ -178,9 +171,6 @@ int probe_irq_off(unsigned long val)
178 unsigned int status; 171 unsigned int status;
179 172
180 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
181 if (!desc)
182 continue;
183
184 spin_lock_irq(&desc->lock); 174 spin_lock_irq(&desc->lock);
185 status = desc->status; 175 status = desc->status;
186 176
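
probe_irq_on() now begins with async_synchronize_full(), so any driver initialization pushed onto the new asynchronous init framework (the kernel/async.c added in this same series) has finished before the IRQ lines are probed; the remaining autoprobe hunks simply drop the `if (!desc) continue;` guards that for_each_irq_desc() no longer needs. A hedged sketch of the producer side of that pairing; the driver and function names are invented, and the async calls are the ones this series introduces:

#include <linux/async.h>
#include <linux/init.h>

/* Slow hardware bring-up, run off the synchronous init path. */
static void example_async_probe(void *data, async_cookie_t cookie)
{
	/* ... time-consuming device setup ... */
}

static int __init example_driver_init(void)
{
	async_schedule(example_async_probe, NULL);
	return 0;
}

/* Anything that must observe completed async work waits, as probe_irq_on() now does: */
static void example_wait_for_async_init(void)
{
	async_synchronize_full();
}
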
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6eb3c7952b64..7de11bd64dfe 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
46 desc->irq_count = 0; 46 desc->irq_count = 0;
47 desc->irqs_unhandled = 0; 47 desc->irqs_unhandled = 0;
48#ifdef CONFIG_SMP 48#ifdef CONFIG_SMP
49 cpus_setall(desc->affinity); 49 cpumask_setall(&desc->affinity);
50#endif 50#endif
51 spin_unlock_irqrestore(&desc->lock, flags); 51 spin_unlock_irqrestore(&desc->lock, flags);
52} 52}
@@ -383,6 +383,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
383out_unlock: 383out_unlock:
384 spin_unlock(&desc->lock); 384 spin_unlock(&desc->lock);
385} 385}
386EXPORT_SYMBOL_GPL(handle_level_irq);
386 387
387/** 388/**
388 * handle_fasteoi_irq - irq handler for transparent controllers 389 * handle_fasteoi_irq - irq handler for transparent controllers
@@ -593,6 +594,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
593 } 594 }
594 spin_unlock_irqrestore(&desc->lock, flags); 595 spin_unlock_irqrestore(&desc->lock, flags);
595} 596}
597EXPORT_SYMBOL_GPL(__set_irq_handler);
596 598
597void 599void
598set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip, 600set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
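
The two new EXPORT_SYMBOL_GPL lines exist so that interrupt-controller drivers built as modules can install the generic flow handlers themselves. A typical wiring loop looks roughly like the following; the chip structure and IRQ range are placeholders:

#include <linux/irq.h>

static struct irq_chip example_irq_chip;	/* .mask/.unmask/.ack filled in elsewhere */

static void example_setup_irqs(unsigned int first_irq, unsigned int count)
{
	unsigned int i;

	for (i = 0; i < count; i++)
		set_irq_chip_and_handler(first_irq + i, &example_irq_chip,
					 handle_level_irq);
}
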
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6492400cb50d..3aba8d12f328 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -39,6 +39,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
39 ack_bad_irq(irq); 39 ack_bad_irq(irq);
40} 40}
41 41
42#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
43static void __init init_irq_default_affinity(void)
44{
45 alloc_bootmem_cpumask_var(&irq_default_affinity);
46 cpumask_setall(irq_default_affinity);
47}
48#else
49static void __init init_irq_default_affinity(void)
50{
51}
52#endif
53
42/* 54/*
43 * Linux has a controller-independent interrupt architecture. 55 * Linux has a controller-independent interrupt architecture.
44 * Every controller has a 'controller-template', that is used 56 * Every controller has a 'controller-template', that is used
@@ -56,10 +68,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
56int nr_irqs = NR_IRQS; 68int nr_irqs = NR_IRQS;
57EXPORT_SYMBOL_GPL(nr_irqs); 69EXPORT_SYMBOL_GPL(nr_irqs);
58 70
59void __init __attribute__((weak)) arch_early_irq_init(void)
60{
61}
62
63#ifdef CONFIG_SPARSE_IRQ 71#ifdef CONFIG_SPARSE_IRQ
64static struct irq_desc irq_desc_init = { 72static struct irq_desc irq_desc_init = {
65 .irq = -1, 73 .irq = -1,
@@ -90,13 +98,11 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
90 desc->kstat_irqs = (unsigned int *)ptr; 98 desc->kstat_irqs = (unsigned int *)ptr;
91} 99}
92 100
93void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
94{
95}
96
97static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 101static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
98{ 102{
99 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 103 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
104
105 spin_lock_init(&desc->lock);
100 desc->irq = irq; 106 desc->irq = irq;
101#ifdef CONFIG_SMP 107#ifdef CONFIG_SMP
102 desc->cpu = cpu; 108 desc->cpu = cpu;
@@ -134,18 +140,21 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
134/* FIXME: use bootmem alloc ...*/ 140/* FIXME: use bootmem alloc ...*/
135static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; 141static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
136 142
137void __init early_irq_init(void) 143int __init early_irq_init(void)
138{ 144{
139 struct irq_desc *desc; 145 struct irq_desc *desc;
140 int legacy_count; 146 int legacy_count;
141 int i; 147 int i;
142 148
149 init_irq_default_affinity();
150
143 desc = irq_desc_legacy; 151 desc = irq_desc_legacy;
144 legacy_count = ARRAY_SIZE(irq_desc_legacy); 152 legacy_count = ARRAY_SIZE(irq_desc_legacy);
145 153
146 for (i = 0; i < legacy_count; i++) { 154 for (i = 0; i < legacy_count; i++) {
147 desc[i].irq = i; 155 desc[i].irq = i;
148 desc[i].kstat_irqs = kstat_irqs_legacy[i]; 156 desc[i].kstat_irqs = kstat_irqs_legacy[i];
157 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
149 158
150 irq_desc_ptrs[i] = desc + i; 159 irq_desc_ptrs[i] = desc + i;
151 } 160 }
@@ -153,7 +162,7 @@ void __init early_irq_init(void)
153 for (i = legacy_count; i < NR_IRQS; i++) 162 for (i = legacy_count; i < NR_IRQS; i++)
154 irq_desc_ptrs[i] = NULL; 163 irq_desc_ptrs[i] = NULL;
155 164
156 arch_early_irq_init(); 165 return arch_early_irq_init();
157} 166}
158 167
159struct irq_desc *irq_to_desc(unsigned int irq) 168struct irq_desc *irq_to_desc(unsigned int irq)
@@ -203,7 +212,7 @@ out_unlock:
203 return desc; 212 return desc;
204} 213}
205 214
206#else 215#else /* !CONFIG_SPARSE_IRQ */
207 216
208struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 217struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
209 [0 ... NR_IRQS-1] = { 218 [0 ... NR_IRQS-1] = {
@@ -218,7 +227,33 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
218 } 227 }
219}; 228};
220 229
221#endif 230int __init early_irq_init(void)
231{
232 struct irq_desc *desc;
233 int count;
234 int i;
235
236 init_irq_default_affinity();
237
238 desc = irq_desc;
239 count = ARRAY_SIZE(irq_desc);
240
241 for (i = 0; i < count; i++)
242 desc[i].irq = i;
243
244 return arch_early_irq_init();
245}
246
247struct irq_desc *irq_to_desc(unsigned int irq)
248{
249 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
250}
251
252struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
253{
254 return irq_to_desc(irq);
255}
256#endif /* !CONFIG_SPARSE_IRQ */
222 257
223/* 258/*
224 * What should we do if we get a hw irq event on an illegal vector? 259 * What should we do if we get a hw irq event on an illegal vector?
@@ -428,9 +463,6 @@ void early_init_irq_lock_class(void)
428 int i; 463 int i;
429 464
430 for_each_irq_desc(i, desc) { 465 for_each_irq_desc(i, desc) {
431 if (!desc)
432 continue;
433
434 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 466 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
435 } 467 }
436} 468}
@@ -439,7 +471,7 @@ void early_init_irq_lock_class(void)
439unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 471unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
440{ 472{
441 struct irq_desc *desc = irq_to_desc(irq); 473 struct irq_desc *desc = irq_to_desc(irq);
442 return desc->kstat_irqs[cpu]; 474 return desc ? desc->kstat_irqs[cpu] : 0;
443} 475}
444#endif 476#endif
445EXPORT_SYMBOL(kstat_irqs_cpu); 477EXPORT_SYMBOL(kstat_irqs_cpu);
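
A small illustrative helper (not from the patch) showing the calling convention the hunks above settle on: with CONFIG_SPARSE_IRQ, irq_to_desc() can return NULL for an unused interrupt number, and kstat_irqs_cpu() now returns 0 in that case instead of dereferencing a missing descriptor.

#include <linux/cpumask.h>
#include <linux/irq.h>
#include <linux/kernel_stat.h>

/* Sum the per-CPU counts for one interrupt; 0 if it has no descriptor. */
static unsigned int total_hits_for_irq(unsigned int irq)
{
	unsigned int sum = 0;
	int cpu;

	if (!irq_to_desc(irq))	/* may be NULL with sparse irq numbering */
		return 0;

	for_each_online_cpu(cpu)
		sum += kstat_irqs_cpu(irq, cpu);

	return sum;
}
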
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 540f6c49f3fa..291f03664552 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,9 +15,8 @@
15 15
16#include "internals.h" 16#include "internals.h"
17 17
18#ifdef CONFIG_SMP 18#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
19 19cpumask_var_t irq_default_affinity;
20cpumask_t irq_default_affinity = CPU_MASK_ALL;
21 20
22/** 21/**
23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
@@ -79,7 +78,7 @@ int irq_can_set_affinity(unsigned int irq)
79 * @cpumask: cpumask 78 * @cpumask: cpumask
80 * 79 *
81 */ 80 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 81int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
83{ 82{
84 struct irq_desc *desc = irq_to_desc(irq); 83 struct irq_desc *desc = irq_to_desc(irq);
85 unsigned long flags; 84 unsigned long flags;
@@ -91,14 +90,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
91 90
92#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
93 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
94 desc->affinity = cpumask; 93 cpumask_copy(&desc->affinity, cpumask);
95 desc->chip->set_affinity(irq, cpumask); 94 desc->chip->set_affinity(irq, cpumask);
96 } else { 95 } else {
97 desc->status |= IRQ_MOVE_PENDING; 96 desc->status |= IRQ_MOVE_PENDING;
98 desc->pending_mask = cpumask; 97 cpumask_copy(&desc->pending_mask, cpumask);
99 } 98 }
100#else 99#else
101 desc->affinity = cpumask; 100 cpumask_copy(&desc->affinity, cpumask);
102 desc->chip->set_affinity(irq, cpumask); 101 desc->chip->set_affinity(irq, cpumask);
103#endif 102#endif
104 desc->status |= IRQ_AFFINITY_SET; 103 desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +111,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
112 */ 111 */
113int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) 112int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
114{ 113{
115 cpumask_t mask;
116
117 if (!irq_can_set_affinity(irq)) 114 if (!irq_can_set_affinity(irq))
118 return 0; 115 return 0;
119 116
120 cpus_and(mask, cpu_online_map, irq_default_affinity);
121
122 /* 117 /*
123 * Preserve an userspace affinity setup, but make sure that 118 * Preserve an userspace affinity setup, but make sure that
124 * one of the targets is online. 119 * one of the targets is online.
125 */ 120 */
126 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 121 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
127 if (cpus_intersects(desc->affinity, cpu_online_map)) 122 if (cpumask_any_and(&desc->affinity, cpu_online_mask)
128 mask = desc->affinity; 123 < nr_cpu_ids)
124 goto set_affinity;
129 else 125 else
130 desc->status &= ~IRQ_AFFINITY_SET; 126 desc->status &= ~IRQ_AFFINITY_SET;
131 } 127 }
132 128
133 desc->affinity = mask; 129 cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
134 desc->chip->set_affinity(irq, mask); 130set_affinity:
131 desc->chip->set_affinity(irq, &desc->affinity);
135 132
136 return 0; 133 return 0;
137} 134}
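
With irq_set_affinity() now taking a const struct cpumask * instead of a cpumask_t by value, callers pass a pointer such as cpumask_of(cpu). A hedged sketch of the updated call follows; MYDEV_IRQ is a made-up interrupt number, not something defined by this patch.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/interrupt.h>

#define MYDEV_IRQ 19	/* illustrative only */

static int pin_mydev_irq(int cpu)
{
	if (!cpu_online(cpu))
		return -EINVAL;

	/* old API passed a cpumask_t by value; new API passes a pointer */
	return irq_set_affinity(MYDEV_IRQ, cpumask_of(cpu));
}
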
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 9db681d95814..bd72329e630c 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,7 +4,6 @@
4void move_masked_irq(int irq) 4void move_masked_irq(int irq)
5{ 5{
6 struct irq_desc *desc = irq_to_desc(irq); 6 struct irq_desc *desc = irq_to_desc(irq);
7 cpumask_t tmp;
8 7
9 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 8 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
10 return; 9 return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
19 18
20 desc->status &= ~IRQ_MOVE_PENDING; 19 desc->status &= ~IRQ_MOVE_PENDING;
21 20
22 if (unlikely(cpus_empty(desc->pending_mask))) 21 if (unlikely(cpumask_empty(&desc->pending_mask)))
23 return; 22 return;
24 23
25 if (!desc->chip->set_affinity) 24 if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
27 26
28 assert_spin_locked(&desc->lock); 27 assert_spin_locked(&desc->lock);
29 28
30 cpus_and(tmp, desc->pending_mask, cpu_online_map);
31
32 /* 29 /*
33 * If there was a valid mask to work with, please 30 * If there was a valid mask to work with, please
34 * do the disable, re-program, enable sequence. 31 * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
41 * For correct operation this depends on the caller 38 * For correct operation this depends on the caller
42 * masking the irqs. 39 * masking the irqs.
43 */ 40 */
44 if (likely(!cpus_empty(tmp))) { 41 if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
45 desc->chip->set_affinity(irq,tmp); 42 < nr_cpu_ids)) {
43 cpumask_and(&desc->affinity,
44 &desc->pending_mask, cpu_online_mask);
45 desc->chip->set_affinity(irq, &desc->affinity);
46 } 46 }
47 cpus_clear(desc->pending_mask); 47 cpumask_clear(&desc->pending_mask);
48} 48}
49 49
50void move_native_irq(int irq) 50void move_native_irq(int irq)
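
The rewritten move_masked_irq() avoids a cpumask_t temporary on the stack by asking "is any CPU in both masks?" directly. A one-function sketch of that idiom, independent of this file:

#include <linux/cpumask.h>
#include <linux/types.h>

static bool mask_targets_online_cpu(const struct cpumask *mask)
{
	/* cpumask_any_and() returns nr_cpu_ids when the intersection is empty */
	return cpumask_any_and(mask, cpu_online_mask) < nr_cpu_ids;
}
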
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 089c3746358a..acd88356ac76 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,6 +42,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 struct irq_desc *desc, int cpu) 42 struct irq_desc *desc, int cpu)
43{ 43{
44 memcpy(desc, old_desc, sizeof(struct irq_desc)); 44 memcpy(desc, old_desc, sizeof(struct irq_desc));
45 spin_lock_init(&desc->lock);
45 desc->cpu = cpu; 46 desc->cpu = cpu;
46 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
47 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
@@ -70,14 +71,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
70 desc = irq_desc_ptrs[irq]; 71 desc = irq_desc_ptrs[irq];
71 72
72 if (desc && old_desc != desc) 73 if (desc && old_desc != desc)
73 goto out_unlock; 74 goto out_unlock;
74 75
75 node = cpu_to_node(cpu); 76 node = cpu_to_node(cpu);
76 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 77 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
77 printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n",
78 irq, cpu, node);
79 if (!desc) { 78 if (!desc) {
80 printk(KERN_ERR "can not get new irq_desc for moving\n"); 79 printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
81 /* still use old one */ 80 /* still use old one */
82 desc = old_desc; 81 desc = old_desc;
83 goto out_unlock; 82 goto out_unlock;
@@ -85,10 +84,15 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 init_copy_one_irq_desc(irq, old_desc, desc, cpu); 84 init_copy_one_irq_desc(irq, old_desc, desc, cpu);
86 85
87 irq_desc_ptrs[irq] = desc; 86 irq_desc_ptrs[irq] = desc;
87 spin_unlock_irqrestore(&sparse_irq_lock, flags);
88 88
89 /* free the old one */ 89 /* free the old one */
90 free_one_irq_desc(old_desc, desc); 90 free_one_irq_desc(old_desc, desc);
91 spin_unlock(&old_desc->lock);
91 kfree(old_desc); 92 kfree(old_desc);
93 spin_lock(&desc->lock);
94
95 return desc;
92 96
93out_unlock: 97out_unlock:
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 98 spin_unlock_irqrestore(&sparse_irq_lock, flags);
@@ -106,8 +110,6 @@ struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
106 return desc; 110 return desc;
107 111
108 old_cpu = desc->cpu; 112 old_cpu = desc->cpu;
109 printk(KERN_DEBUG
110 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
111 if (old_cpu != cpu) { 113 if (old_cpu != cpu) {
112 node = cpu_to_node(cpu); 114 node = cpu_to_node(cpu);
113 old_node = cpu_to_node(old_cpu); 115 old_node = cpu_to_node(old_cpu);
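
The added spin_lock_init() matters because init_copy_one_irq_desc() starts from a memcpy() of the old descriptor, which would otherwise carry over the source lock's state. A generic sketch of that pattern, using an invented struct foo rather than struct irq_desc:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct foo {
	spinlock_t lock;
	int value;
};

static struct foo *clone_foo(const struct foo *old, gfp_t gfp)
{
	struct foo *new = kmalloc(sizeof(*new), gfp);

	if (!new)
		return NULL;

	memcpy(new, old, sizeof(*new));
	spin_lock_init(&new->lock);	/* never inherit a copied lock's state */
	return new;
}
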
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index f6b3440f05bc..aae3f742bcec 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,7 +20,7 @@ static struct proc_dir_entry *root_irq_dir;
20static int irq_affinity_proc_show(struct seq_file *m, void *v) 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
21{ 21{
22 struct irq_desc *desc = irq_to_desc((long)m->private); 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 const struct cpumask *mask = &desc->affinity;
24 24
25#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
26 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
40 const char __user *buffer, size_t count, loff_t *pos) 40 const char __user *buffer, size_t count, loff_t *pos)
41{ 41{
42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
43 cpumask_t new_value; 43 cpumask_var_t new_value;
44 int err; 44 int err;
45 45
46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
47 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
48 return -EIO; 48 return -EIO;
49 49
50 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
51 return -ENOMEM;
52
50 err = cpumask_parse_user(buffer, count, new_value); 53 err = cpumask_parse_user(buffer, count, new_value);
51 if (err) 54 if (err)
52 return err; 55 goto free_cpumask;
53 56
54 if (!is_affinity_mask_valid(new_value)) 57 if (!is_affinity_mask_valid(new_value)) {
55 return -EINVAL; 58 err = -EINVAL;
59 goto free_cpumask;
60 }
56 61
57 /* 62 /*
58 * Do not allow disabling IRQs completely - it's a too easy 63 * Do not allow disabling IRQs completely - it's a too easy
59 * way to make the system unusable accidentally :-) At least 64 * way to make the system unusable accidentally :-) At least
60 * one online CPU still has to be targeted. 65 * one online CPU still has to be targeted.
61 */ 66 */
62 if (!cpus_intersects(new_value, cpu_online_map)) 67 if (!cpumask_intersects(new_value, cpu_online_mask)) {
63 /* Special case for empty set - allow the architecture 68 /* Special case for empty set - allow the architecture
64 code to set default SMP affinity. */ 69 code to set default SMP affinity. */
65 return irq_select_affinity_usr(irq) ? -EINVAL : count; 70 err = irq_select_affinity_usr(irq) ? -EINVAL : count;
66 71 } else {
67 irq_set_affinity(irq, new_value); 72 irq_set_affinity(irq, new_value);
73 err = count;
74 }
68 75
69 return count; 76free_cpumask:
77 free_cpumask_var(new_value);
78 return err;
70} 79}
71 80
72static int irq_affinity_proc_open(struct inode *inode, struct file *file) 81static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -84,7 +93,7 @@ static const struct file_operations irq_affinity_proc_fops = {
84 93
85static int default_affinity_show(struct seq_file *m, void *v) 94static int default_affinity_show(struct seq_file *m, void *v)
86{ 95{
87 seq_cpumask(m, &irq_default_affinity); 96 seq_cpumask(m, irq_default_affinity);
88 seq_putc(m, '\n'); 97 seq_putc(m, '\n');
89 return 0; 98 return 0;
90} 99}
@@ -92,27 +101,37 @@ static int default_affinity_show(struct seq_file *m, void *v)
92static ssize_t default_affinity_write(struct file *file, 101static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos) 102 const char __user *buffer, size_t count, loff_t *ppos)
94{ 103{
95 cpumask_t new_value; 104 cpumask_var_t new_value;
96 int err; 105 int err;
97 106
107 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
108 return -ENOMEM;
109
98 err = cpumask_parse_user(buffer, count, new_value); 110 err = cpumask_parse_user(buffer, count, new_value);
99 if (err) 111 if (err)
100 return err; 112 goto out;
101 113
102 if (!is_affinity_mask_valid(new_value)) 114 if (!is_affinity_mask_valid(new_value)) {
103 return -EINVAL; 115 err = -EINVAL;
116 goto out;
117 }
104 118
105 /* 119 /*
106 * Do not allow disabling IRQs completely - it's a too easy 120 * Do not allow disabling IRQs completely - it's a too easy
107 * way to make the system unusable accidentally :-) At least 121 * way to make the system unusable accidentally :-) At least
108 * one online CPU still has to be targeted. 122 * one online CPU still has to be targeted.
109 */ 123 */
110 if (!cpus_intersects(new_value, cpu_online_map)) 124 if (!cpumask_intersects(new_value, cpu_online_mask)) {
111 return -EINVAL; 125 err = -EINVAL;
126 goto out;
127 }
112 128
113 irq_default_affinity = new_value; 129 cpumask_copy(irq_default_affinity, new_value);
130 err = count;
114 131
115 return count; 132out:
133 free_cpumask_var(new_value);
134 return err;
116} 135}
117 136
118static int default_affinity_open(struct inode *inode, struct file *file) 137static int default_affinity_open(struct inode *inode, struct file *file)
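
Both write handlers above now follow the same allocate/parse/use/free shape so that CONFIG_CPUMASK_OFFSTACK kernels keep large masks off the stack. A condensed sketch of that shape; the helper name is invented.

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/uaccess.h>

static int parse_user_cpumask(const char __user *buf, size_t len,
			      struct cpumask *result)
{
	cpumask_var_t tmp;
	int err;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
		return -ENOMEM;

	err = cpumask_parse_user(buf, len, tmp);
	if (!err)
		cpumask_copy(result, tmp);

	free_cpumask_var(tmp);	/* a no-op when masks live on the stack */
	return err;
}
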
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3738107531fd..dd364c11e56e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,9 +91,6 @@ static int misrouted_irq(int irq)
91 int i, ok = 0; 91 int i, ok = 0;
92 92
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 if (!desc)
95 continue;
96
97 if (!i) 94 if (!i)
98 continue; 95 continue;
99 96
@@ -115,8 +112,6 @@ static void poll_spurious_irqs(unsigned long dummy)
115 for_each_irq_desc(i, desc) { 112 for_each_irq_desc(i, desc) {
116 unsigned int status; 113 unsigned int status;
117 114
118 if (!desc)
119 continue;
120 if (!i) 115 if (!i)
121 continue; 116 continue;
122 117
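
After the sparse-irq allocator changes earlier in this series, for_each_irq_desc() only yields allocated descriptors, which is why the two NULL checks can go. An unrelated sketch of an iteration written to the new rule:

#include <linux/irq.h>
#include <linux/irqnr.h>

static unsigned int count_irqs_with_handlers(void)
{
	struct irq_desc *desc;
	unsigned int n = 0;
	int i;

	/* no "if (!desc) continue;" needed any more */
	for_each_irq_desc(i, desc) {
		if (desc->action)
			n++;
	}
	return n;
}
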
diff --git a/kernel/itimer.c b/kernel/itimer.c
index db7c358b9a02..6a5fe93dd8bd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -100,7 +100,7 @@ int do_getitimer(int which, struct itimerval *value)
100 return 0; 100 return 0;
101} 101}
102 102
103asmlinkage long sys_getitimer(int which, struct itimerval __user *value) 103SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value)
104{ 104{
105 int error = -EFAULT; 105 int error = -EFAULT;
106 struct itimerval get_buffer; 106 struct itimerval get_buffer;
@@ -260,9 +260,8 @@ unsigned int alarm_setitimer(unsigned int seconds)
260 return it_old.it_value.tv_sec; 260 return it_old.it_value.tv_sec;
261} 261}
262 262
263asmlinkage long sys_setitimer(int which, 263SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
264 struct itimerval __user *value, 264 struct itimerval __user *, ovalue)
265 struct itimerval __user *ovalue)
266{ 265{
267 struct itimerval set_buffer, get_buffer; 266 struct itimerval set_buffer, get_buffer;
268 int error; 267 int error;
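
The getitimer/setitimer conversion is part of the tree-wide move to SYSCALL_DEFINEx(), which generates the asmlinkage definition (and, where an architecture needs it, argument-widening wrappers) from a single macro. A hypothetical example of defining a syscall this way; "mycall" is not a real syscall and is not wired into any syscall table.

#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>

SYSCALL_DEFINE2(mycall, unsigned int, flags, int __user *, result)
{
	if (flags)
		return -EINVAL;

	/* put_user() returns 0 on success or -EFAULT */
	return put_user(0, result);
}
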
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8c..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,20 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33extern const unsigned long kallsyms_addresses[]; 33/* These will be re-linked against their real values during the second link stage */
34extern const u8 kallsyms_names[]; 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak));
35 36
36/* tell the compiler that the count isn't in the small data section if the arch 37/* tell the compiler that the count isn't in the small data section if the arch
37 * has one (eg: FRV) 38 * has one (eg: FRV)
38 */ 39 */
39extern const unsigned long kallsyms_num_syms 40extern const unsigned long kallsyms_num_syms
40 __attribute__((__section__(".rodata"))); 41__attribute__((weak, section(".rodata")));
41 42
42extern const u8 kallsyms_token_table[]; 43extern const u8 kallsyms_token_table[] __attribute__((weak));
43extern const u16 kallsyms_token_index[]; 44extern const u16 kallsyms_token_index[] __attribute__((weak));
44 45
45extern const unsigned long kallsyms_markers[]; 46extern const unsigned long kallsyms_markers[] __attribute__((weak));
46 47
47static inline int is_kernel_inittext(unsigned long addr) 48static inline int is_kernel_inittext(unsigned long addr)
48{ 49{
@@ -167,6 +168,9 @@ static unsigned long get_symbol_pos(unsigned long addr,
167 unsigned long symbol_start = 0, symbol_end = 0; 168 unsigned long symbol_start = 0, symbol_end = 0;
168 unsigned long i, low, high, mid; 169 unsigned long i, low, high, mid;
169 170
 171 /* This kernel should never have been booted. */
172 BUG_ON(!kallsyms_addresses);
173
170 /* do a binary search on the sorted kallsyms_addresses array */ 174 /* do a binary search on the sorted kallsyms_addresses array */
171 low = 0; 175 low = 0;
172 high = kallsyms_num_syms; 176 high = kallsyms_num_syms;
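
Marking the kallsyms arrays weak lets the first-stage link succeed before the symbol tables exist, and the added BUG_ON() is the run-time presence check. A generic sketch of that idiom with an invented optional_table symbol:

#include <linux/kernel.h>
#include <linux/types.h>

/* a weak, undefined array resolves to NULL at link time */
extern const unsigned long optional_table[] __attribute__((weak));

static bool have_optional_table(void)
{
	return optional_table != NULL;
}
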
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ac0fde7b54d0..8a6d7b08864e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -934,9 +934,8 @@ struct kimage *kexec_crash_image;
934 934
935static DEFINE_MUTEX(kexec_mutex); 935static DEFINE_MUTEX(kexec_mutex);
936 936
937asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 937SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
938 struct kexec_segment __user *segments, 938 struct kexec_segment __user *, segments, unsigned long, flags)
939 unsigned long flags)
940{ 939{
941 struct kimage **dest_image, *image; 940 struct kimage **dest_image, *image;
942 int result; 941 int result;
@@ -1116,7 +1115,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
1116 struct elf_prstatus prstatus; 1115 struct elf_prstatus prstatus;
1117 u32 *buf; 1116 u32 *buf;
1118 1117
1119 if ((cpu < 0) || (cpu >= NR_CPUS)) 1118 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1120 return; 1119 return;
1121 1120
1122 /* Using ELF notes here is opportunistic. 1121 /* Using ELF notes here is opportunistic.
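
crash_save_cpu() now bounds the CPU index by nr_cpu_ids, the number of possible CPU ids discovered at boot, rather than the compile-time NR_CPUS ceiling. A tiny standalone sketch of that check (helper name invented):

#include <linux/cpumask.h>
#include <linux/types.h>

static bool crash_cpu_index_valid(int cpu)
{
	return cpu >= 0 && cpu < nr_cpu_ids;
}
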
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b46dbb908669..a27a5f64443d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -51,8 +51,8 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * request_module - try to load a kernel module
54 * @fmt: printf style format string for the name of the module 54 * @fmt: printf style format string for the name of the module
55 * @varargs: arguements as specified in the format string 55 * @...: arguments as specified in the format string
56 * 56 *
57 * Load a module using the user mode module loader. The function returns 57 * Load a module using the user mode module loader. The function returns
58 * zero on success or a negative errno code on failure. Note that a 58 * zero on success or a negative errno code on failure. Note that a
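
The kernel-doc fix above only renames the varargs parameter; the call itself is unchanged. For reference, a hedged usage sketch in which the format string builds the module name ("mydrv-proto-%d" is illustrative only):

#include <linux/kmod.h>

static int load_protocol_module(int proto)
{
	/* 0 on success, negative errno on failure */
	return request_module("mydrv-proto-%d", proto);
}
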
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 9f8a3f25259a..7ba8cd9845cb 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -69,7 +69,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69/* NOTE: change this value only with kprobe_mutex held */ 69/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 70static bool kprobe_enabled;
71 71
72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct { 74static struct {
75 spinlock_t lock ____cacheline_aligned_in_smp; 75 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -115,6 +115,7 @@ enum kprobe_slot_state {
115 SLOT_USED = 2, 115 SLOT_USED = 2,
116}; 116};
117 117
118static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
118static struct hlist_head kprobe_insn_pages; 119static struct hlist_head kprobe_insn_pages;
119static int kprobe_garbage_slots; 120static int kprobe_garbage_slots;
120static int collect_garbage_slots(void); 121static int collect_garbage_slots(void);
@@ -122,7 +123,7 @@ static int collect_garbage_slots(void);
122static int __kprobes check_safety(void) 123static int __kprobes check_safety(void)
123{ 124{
124 int ret = 0; 125 int ret = 0;
125#if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) 126#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
126 ret = freeze_processes(); 127 ret = freeze_processes();
127 if (ret == 0) { 128 if (ret == 0) {
128 struct task_struct *p, *q; 129 struct task_struct *p, *q;
@@ -144,10 +145,10 @@ loop_end:
144} 145}
145 146
146/** 147/**
147 * get_insn_slot() - Find a slot on an executable page for an instruction. 148 * __get_insn_slot() - Find a slot on an executable page for an instruction.
148 * We allocate an executable page if there's no room on existing ones. 149 * We allocate an executable page if there's no room on existing ones.
149 */ 150 */
150kprobe_opcode_t __kprobes *get_insn_slot(void) 151static kprobe_opcode_t __kprobes *__get_insn_slot(void)
151{ 152{
152 struct kprobe_insn_page *kip; 153 struct kprobe_insn_page *kip;
153 struct hlist_node *pos; 154 struct hlist_node *pos;
@@ -196,6 +197,15 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
196 return kip->insns; 197 return kip->insns;
197} 198}
198 199
200kprobe_opcode_t __kprobes *get_insn_slot(void)
201{
202 kprobe_opcode_t *ret;
203 mutex_lock(&kprobe_insn_mutex);
204 ret = __get_insn_slot();
205 mutex_unlock(&kprobe_insn_mutex);
206 return ret;
207}
208
199/* Return 1 if all garbages are collected, otherwise 0. */ 209/* Return 1 if all garbages are collected, otherwise 0. */
200static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) 210static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
201{ 211{
@@ -226,9 +236,13 @@ static int __kprobes collect_garbage_slots(void)
226{ 236{
227 struct kprobe_insn_page *kip; 237 struct kprobe_insn_page *kip;
228 struct hlist_node *pos, *next; 238 struct hlist_node *pos, *next;
239 int safety;
229 240
 230 /* Ensure no one is preempted on the garbage slots */ 241 /* Ensure no one is preempted on the garbage slots */
231 if (check_safety() != 0) 242 mutex_unlock(&kprobe_insn_mutex);
243 safety = check_safety();
244 mutex_lock(&kprobe_insn_mutex);
245 if (safety != 0)
232 return -EAGAIN; 246 return -EAGAIN;
233 247
234 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 248 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
@@ -251,6 +265,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
251 struct kprobe_insn_page *kip; 265 struct kprobe_insn_page *kip;
252 struct hlist_node *pos; 266 struct hlist_node *pos;
253 267
268 mutex_lock(&kprobe_insn_mutex);
254 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 269 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
255 if (kip->insns <= slot && 270 if (kip->insns <= slot &&
256 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 271 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
@@ -267,6 +282,8 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
267 282
268 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) 283 if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
269 collect_garbage_slots(); 284 collect_garbage_slots();
285
286 mutex_unlock(&kprobe_insn_mutex);
270} 287}
271#endif 288#endif
272 289
@@ -310,7 +327,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
310 struct kprobe *kp; 327 struct kprobe *kp;
311 328
312 list_for_each_entry_rcu(kp, &p->list, list) { 329 list_for_each_entry_rcu(kp, &p->list, list) {
313 if (kp->pre_handler) { 330 if (kp->pre_handler && !kprobe_gone(kp)) {
314 set_kprobe_instance(kp); 331 set_kprobe_instance(kp);
315 if (kp->pre_handler(kp, regs)) 332 if (kp->pre_handler(kp, regs))
316 return 1; 333 return 1;
@@ -326,7 +343,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
326 struct kprobe *kp; 343 struct kprobe *kp;
327 344
328 list_for_each_entry_rcu(kp, &p->list, list) { 345 list_for_each_entry_rcu(kp, &p->list, list) {
329 if (kp->post_handler) { 346 if (kp->post_handler && !kprobe_gone(kp)) {
330 set_kprobe_instance(kp); 347 set_kprobe_instance(kp);
331 kp->post_handler(kp, regs, flags); 348 kp->post_handler(kp, regs, flags);
332 reset_kprobe_instance(); 349 reset_kprobe_instance();
@@ -393,7 +410,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
393 hlist_add_head(&ri->hlist, head); 410 hlist_add_head(&ri->hlist, head);
394} 411}
395 412
396void kretprobe_hash_lock(struct task_struct *tsk, 413void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags) 414 struct hlist_head **head, unsigned long *flags)
398{ 415{
399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 416 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -404,13 +421,15 @@ void kretprobe_hash_lock(struct task_struct *tsk,
404 spin_lock_irqsave(hlist_lock, *flags); 421 spin_lock_irqsave(hlist_lock, *flags);
405} 422}
406 423
407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags) 424static void __kprobes kretprobe_table_lock(unsigned long hash,
425 unsigned long *flags)
408{ 426{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 427 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags); 428 spin_lock_irqsave(hlist_lock, *flags);
411} 429}
412 430
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) 431void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
432 unsigned long *flags)
414{ 433{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 434 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock; 435 spinlock_t *hlist_lock;
@@ -419,7 +438,7 @@ void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
419 spin_unlock_irqrestore(hlist_lock, *flags); 438 spin_unlock_irqrestore(hlist_lock, *flags);
420} 439}
421 440
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags) 441void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{ 442{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 443 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags); 444 spin_unlock_irqrestore(hlist_lock, *flags);
@@ -526,9 +545,10 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
526 ap->addr = p->addr; 545 ap->addr = p->addr;
527 ap->pre_handler = aggr_pre_handler; 546 ap->pre_handler = aggr_pre_handler;
528 ap->fault_handler = aggr_fault_handler; 547 ap->fault_handler = aggr_fault_handler;
 529 if (p->post_handler) 548 /* We don't care about a kprobe which has gone. */
549 if (p->post_handler && !kprobe_gone(p))
530 ap->post_handler = aggr_post_handler; 550 ap->post_handler = aggr_post_handler;
531 if (p->break_handler) 551 if (p->break_handler && !kprobe_gone(p))
532 ap->break_handler = aggr_break_handler; 552 ap->break_handler = aggr_break_handler;
533 553
534 INIT_LIST_HEAD(&ap->list); 554 INIT_LIST_HEAD(&ap->list);
@@ -547,17 +567,41 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
547 int ret = 0; 567 int ret = 0;
548 struct kprobe *ap; 568 struct kprobe *ap;
549 569
570 if (kprobe_gone(old_p)) {
571 /*
572 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe.
576 */
577 ret = arch_prepare_kprobe(old_p);
578 if (ret)
579 return ret;
580 }
550 if (old_p->pre_handler == aggr_pre_handler) { 581 if (old_p->pre_handler == aggr_pre_handler) {
551 copy_kprobe(old_p, p); 582 copy_kprobe(old_p, p);
552 ret = add_new_kprobe(old_p, p); 583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
553 } else { 585 } else {
554 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); 586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
555 if (!ap) 587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
556 return -ENOMEM; 590 return -ENOMEM;
591 }
557 add_aggr_kprobe(ap, old_p); 592 add_aggr_kprobe(ap, old_p);
558 copy_kprobe(ap, p); 593 copy_kprobe(ap, p);
559 ret = add_new_kprobe(ap, p); 594 ret = add_new_kprobe(ap, p);
560 } 595 }
596 if (kprobe_gone(old_p)) {
597 /*
598 * If the old_p has gone, its breakpoint has been disarmed.
599 * We have to arm it again after preparing real kprobes.
600 */
601 ap->flags &= ~KPROBE_FLAG_GONE;
602 if (kprobe_enabled)
603 arch_arm_kprobe(ap);
604 }
561 return ret; 605 return ret;
562} 606}
563 607
@@ -600,8 +644,7 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
600 return (kprobe_opcode_t *)(((char *)addr) + p->offset); 644 return (kprobe_opcode_t *)(((char *)addr) + p->offset);
601} 645}
602 646
603static int __kprobes __register_kprobe(struct kprobe *p, 647int __kprobes register_kprobe(struct kprobe *p)
604 unsigned long called_from)
605{ 648{
606 int ret = 0; 649 int ret = 0;
607 struct kprobe *old_p; 650 struct kprobe *old_p;
@@ -620,28 +663,30 @@ static int __kprobes __register_kprobe(struct kprobe *p,
620 return -EINVAL; 663 return -EINVAL;
621 } 664 }
622 665
623 p->mod_refcounted = 0; 666 p->flags = 0;
624
625 /* 667 /*
626 * Check if are we probing a module. 668 * Check if are we probing a module.
627 */ 669 */
628 probed_mod = __module_text_address((unsigned long) p->addr); 670 probed_mod = __module_text_address((unsigned long) p->addr);
629 if (probed_mod) { 671 if (probed_mod) {
630 struct module *calling_mod;
631 calling_mod = __module_text_address(called_from);
632 /* 672 /*
633 * We must allow modules to probe themself and in this case 673 * We must hold a refcount of the probed module while updating
634 * avoid incrementing the module refcount, so as to allow 674 * its code to prohibit unexpected unloading.
635 * unloading of self probing modules.
636 */ 675 */
637 if (calling_mod && calling_mod != probed_mod) { 676 if (unlikely(!try_module_get(probed_mod))) {
638 if (unlikely(!try_module_get(probed_mod))) { 677 preempt_enable();
639 preempt_enable(); 678 return -EINVAL;
640 return -EINVAL; 679 }
641 } 680 /*
642 p->mod_refcounted = 1; 681 * If the module freed .init.text, we couldn't insert
643 } else 682 * kprobes in there.
644 probed_mod = NULL; 683 */
684 if (within_module_init((unsigned long)p->addr, probed_mod) &&
685 probed_mod->state != MODULE_STATE_COMING) {
686 module_put(probed_mod);
687 preempt_enable();
688 return -EINVAL;
689 }
645 } 690 }
646 preempt_enable(); 691 preempt_enable();
647 692
@@ -668,8 +713,9 @@ static int __kprobes __register_kprobe(struct kprobe *p,
668out: 713out:
669 mutex_unlock(&kprobe_mutex); 714 mutex_unlock(&kprobe_mutex);
670 715
671 if (ret && probed_mod) 716 if (probed_mod)
672 module_put(probed_mod); 717 module_put(probed_mod);
718
673 return ret; 719 return ret;
674} 720}
675 721
@@ -697,16 +743,16 @@ valid_p:
697 list_is_singular(&old_p->list))) { 743 list_is_singular(&old_p->list))) {
698 /* 744 /*
699 * Only probe on the hash list. Disarm only if kprobes are 745 * Only probe on the hash list. Disarm only if kprobes are
700 * enabled - otherwise, the breakpoint would already have 746 * enabled and not gone - otherwise, the breakpoint would
701 * been removed. We save on flushing icache. 747 * already have been removed. We save on flushing icache.
702 */ 748 */
703 if (kprobe_enabled) 749 if (kprobe_enabled && !kprobe_gone(old_p))
704 arch_disarm_kprobe(p); 750 arch_disarm_kprobe(p);
705 hlist_del_rcu(&old_p->hlist); 751 hlist_del_rcu(&old_p->hlist);
706 } else { 752 } else {
707 if (p->break_handler) 753 if (p->break_handler && !kprobe_gone(p))
708 old_p->break_handler = NULL; 754 old_p->break_handler = NULL;
709 if (p->post_handler) { 755 if (p->post_handler && !kprobe_gone(p)) {
710 list_for_each_entry_rcu(list_p, &old_p->list, list) { 756 list_for_each_entry_rcu(list_p, &old_p->list, list) {
711 if ((list_p != p) && (list_p->post_handler)) 757 if ((list_p != p) && (list_p->post_handler))
712 goto noclean; 758 goto noclean;
@@ -721,39 +767,27 @@ noclean:
721 767
722static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) 768static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
723{ 769{
724 struct module *mod;
725 struct kprobe *old_p; 770 struct kprobe *old_p;
726 771
727 if (p->mod_refcounted) { 772 if (list_empty(&p->list))
728 /*
729 * Since we've already incremented refcount,
730 * we don't need to disable preemption.
731 */
732 mod = module_text_address((unsigned long)p->addr);
733 if (mod)
734 module_put(mod);
735 }
736
737 if (list_empty(&p->list) || list_is_singular(&p->list)) {
738 if (!list_empty(&p->list)) {
739 /* "p" is the last child of an aggr_kprobe */
740 old_p = list_entry(p->list.next, struct kprobe, list);
741 list_del(&p->list);
742 kfree(old_p);
743 }
744 arch_remove_kprobe(p); 773 arch_remove_kprobe(p);
774 else if (list_is_singular(&p->list)) {
775 /* "p" is the last child of an aggr_kprobe */
776 old_p = list_entry(p->list.next, struct kprobe, list);
777 list_del(&p->list);
778 arch_remove_kprobe(old_p);
779 kfree(old_p);
745 } 780 }
746} 781}
747 782
748static int __register_kprobes(struct kprobe **kps, int num, 783int __kprobes register_kprobes(struct kprobe **kps, int num)
749 unsigned long called_from)
750{ 784{
751 int i, ret = 0; 785 int i, ret = 0;
752 786
753 if (num <= 0) 787 if (num <= 0)
754 return -EINVAL; 788 return -EINVAL;
755 for (i = 0; i < num; i++) { 789 for (i = 0; i < num; i++) {
756 ret = __register_kprobe(kps[i], called_from); 790 ret = register_kprobe(kps[i]);
757 if (ret < 0) { 791 if (ret < 0) {
758 if (i > 0) 792 if (i > 0)
759 unregister_kprobes(kps, i); 793 unregister_kprobes(kps, i);
@@ -763,26 +797,11 @@ static int __register_kprobes(struct kprobe **kps, int num,
763 return ret; 797 return ret;
764} 798}
765 799
766/*
767 * Registration and unregistration functions for kprobe.
768 */
769int __kprobes register_kprobe(struct kprobe *p)
770{
771 return __register_kprobes(&p, 1,
772 (unsigned long)__builtin_return_address(0));
773}
774
775void __kprobes unregister_kprobe(struct kprobe *p) 800void __kprobes unregister_kprobe(struct kprobe *p)
776{ 801{
777 unregister_kprobes(&p, 1); 802 unregister_kprobes(&p, 1);
778} 803}
779 804
780int __kprobes register_kprobes(struct kprobe **kps, int num)
781{
782 return __register_kprobes(kps, num,
783 (unsigned long)__builtin_return_address(0));
784}
785
786void __kprobes unregister_kprobes(struct kprobe **kps, int num) 805void __kprobes unregister_kprobes(struct kprobe **kps, int num)
787{ 806{
788 int i; 807 int i;
@@ -811,8 +830,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
811 return (unsigned long)entry; 830 return (unsigned long)entry;
812} 831}
813 832
814static int __register_jprobes(struct jprobe **jps, int num, 833int __kprobes register_jprobes(struct jprobe **jps, int num)
815 unsigned long called_from)
816{ 834{
817 struct jprobe *jp; 835 struct jprobe *jp;
818 int ret = 0, i; 836 int ret = 0, i;
@@ -830,7 +848,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
830 /* Todo: Verify probepoint is a function entry point */ 848 /* Todo: Verify probepoint is a function entry point */
831 jp->kp.pre_handler = setjmp_pre_handler; 849 jp->kp.pre_handler = setjmp_pre_handler;
832 jp->kp.break_handler = longjmp_break_handler; 850 jp->kp.break_handler = longjmp_break_handler;
833 ret = __register_kprobe(&jp->kp, called_from); 851 ret = register_kprobe(&jp->kp);
834 } 852 }
835 if (ret < 0) { 853 if (ret < 0) {
836 if (i > 0) 854 if (i > 0)
@@ -843,8 +861,7 @@ static int __register_jprobes(struct jprobe **jps, int num,
843 861
844int __kprobes register_jprobe(struct jprobe *jp) 862int __kprobes register_jprobe(struct jprobe *jp)
845{ 863{
846 return __register_jprobes(&jp, 1, 864 return register_jprobes(&jp, 1);
847 (unsigned long)__builtin_return_address(0));
848} 865}
849 866
850void __kprobes unregister_jprobe(struct jprobe *jp) 867void __kprobes unregister_jprobe(struct jprobe *jp)
@@ -852,12 +869,6 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
852 unregister_jprobes(&jp, 1); 869 unregister_jprobes(&jp, 1);
853} 870}
854 871
855int __kprobes register_jprobes(struct jprobe **jps, int num)
856{
857 return __register_jprobes(jps, num,
858 (unsigned long)__builtin_return_address(0));
859}
860
861void __kprobes unregister_jprobes(struct jprobe **jps, int num) 872void __kprobes unregister_jprobes(struct jprobe **jps, int num)
862{ 873{
863 int i; 874 int i;
@@ -920,8 +931,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
920 return 0; 931 return 0;
921} 932}
922 933
923static int __kprobes __register_kretprobe(struct kretprobe *rp, 934int __kprobes register_kretprobe(struct kretprobe *rp)
924 unsigned long called_from)
925{ 935{
926 int ret = 0; 936 int ret = 0;
927 struct kretprobe_instance *inst; 937 struct kretprobe_instance *inst;
@@ -967,21 +977,20 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
967 977
968 rp->nmissed = 0; 978 rp->nmissed = 0;
969 /* Establish function entry probe point */ 979 /* Establish function entry probe point */
970 ret = __register_kprobe(&rp->kp, called_from); 980 ret = register_kprobe(&rp->kp);
971 if (ret != 0) 981 if (ret != 0)
972 free_rp_inst(rp); 982 free_rp_inst(rp);
973 return ret; 983 return ret;
974} 984}
975 985
976static int __register_kretprobes(struct kretprobe **rps, int num, 986int __kprobes register_kretprobes(struct kretprobe **rps, int num)
977 unsigned long called_from)
978{ 987{
979 int ret = 0, i; 988 int ret = 0, i;
980 989
981 if (num <= 0) 990 if (num <= 0)
982 return -EINVAL; 991 return -EINVAL;
983 for (i = 0; i < num; i++) { 992 for (i = 0; i < num; i++) {
984 ret = __register_kretprobe(rps[i], called_from); 993 ret = register_kretprobe(rps[i]);
985 if (ret < 0) { 994 if (ret < 0) {
986 if (i > 0) 995 if (i > 0)
987 unregister_kretprobes(rps, i); 996 unregister_kretprobes(rps, i);
@@ -991,23 +1000,11 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
991 return ret; 1000 return ret;
992} 1001}
993 1002
994int __kprobes register_kretprobe(struct kretprobe *rp)
995{
996 return __register_kretprobes(&rp, 1,
997 (unsigned long)__builtin_return_address(0));
998}
999
1000void __kprobes unregister_kretprobe(struct kretprobe *rp) 1003void __kprobes unregister_kretprobe(struct kretprobe *rp)
1001{ 1004{
1002 unregister_kretprobes(&rp, 1); 1005 unregister_kretprobes(&rp, 1);
1003} 1006}
1004 1007
1005int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1006{
1007 return __register_kretprobes(rps, num,
1008 (unsigned long)__builtin_return_address(0));
1009}
1010
1011void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1012{ 1009{
1013 int i; 1010 int i;
@@ -1055,6 +1052,72 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1055 1052
1056#endif /* CONFIG_KRETPROBES */ 1053#endif /* CONFIG_KRETPROBES */
1057 1054
1055/* Set the kprobe gone and remove its instruction buffer. */
1056static void __kprobes kill_kprobe(struct kprobe *p)
1057{
1058 struct kprobe *kp;
1059 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) {
1061 /*
1062 * If this is an aggr_kprobe, we have to list all the
1063 * chained probes and mark them GONE.
1064 */
1065 list_for_each_entry_rcu(kp, &p->list, list)
1066 kp->flags |= KPROBE_FLAG_GONE;
1067 p->post_handler = NULL;
1068 p->break_handler = NULL;
1069 }
1070 /*
1071 * Here, we can remove insn_slot safely, because no thread calls
1072 * the original probed function (which will be freed soon) any more.
1073 */
1074 arch_remove_kprobe(p);
1075}
1076
1077/* Module notifier call back, checking kprobes on the module */
1078static int __kprobes kprobes_module_callback(struct notifier_block *nb,
1079 unsigned long val, void *data)
1080{
1081 struct module *mod = data;
1082 struct hlist_head *head;
1083 struct hlist_node *node;
1084 struct kprobe *p;
1085 unsigned int i;
1086 int checkcore = (val == MODULE_STATE_GOING);
1087
1088 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
1089 return NOTIFY_DONE;
1090
1091 /*
 1092 * When MODULE_STATE_GOING is notified, both the module's .text and
 1093 * .init.text sections will be freed. When MODULE_STATE_LIVE is
 1094 * notified, only the .init.text section is freed. We need to
 1095 * disable kprobes which have been inserted in those sections.
1096 */
1097 mutex_lock(&kprobe_mutex);
1098 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1099 head = &kprobe_table[i];
1100 hlist_for_each_entry_rcu(p, node, head, hlist)
1101 if (within_module_init((unsigned long)p->addr, mod) ||
1102 (checkcore &&
1103 within_module_core((unsigned long)p->addr, mod))) {
1104 /*
 1105 * The vaddr this probe is installed at will soon
 1106 * be vfreed but not synced to disk. Hence,
1107 * disarming the breakpoint isn't needed.
1108 */
1109 kill_kprobe(p);
1110 }
1111 }
1112 mutex_unlock(&kprobe_mutex);
1113 return NOTIFY_DONE;
1114}
1115
1116static struct notifier_block kprobe_module_nb = {
1117 .notifier_call = kprobes_module_callback,
1118 .priority = 0
1119};
1120
1058static int __init init_kprobes(void) 1121static int __init init_kprobes(void)
1059{ 1122{
1060 int i, err = 0; 1123 int i, err = 0;
@@ -1111,6 +1174,9 @@ static int __init init_kprobes(void)
1111 err = arch_init_kprobes(); 1174 err = arch_init_kprobes();
1112 if (!err) 1175 if (!err)
1113 err = register_die_notifier(&kprobe_exceptions_nb); 1176 err = register_die_notifier(&kprobe_exceptions_nb);
1177 if (!err)
1178 err = register_module_notifier(&kprobe_module_nb);
1179
1114 kprobes_initialized = (err == 0); 1180 kprobes_initialized = (err == 0);
1115 1181
1116 if (!err) 1182 if (!err)
@@ -1131,10 +1197,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1131 else 1197 else
1132 kprobe_type = "k"; 1198 kprobe_type = "k";
1133 if (sym) 1199 if (sym)
1134 seq_printf(pi, "%p %s %s+0x%x %s\n", p->addr, kprobe_type, 1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
1135 sym, offset, (modname ? modname : " ")); 1201 sym, offset, (modname ? modname : " "),
1202 (kprobe_gone(p) ? "[GONE]" : ""));
1136 else 1203 else
1137 seq_printf(pi, "%p %s %p\n", p->addr, kprobe_type, p->addr); 1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
1205 (kprobe_gone(p) ? "[GONE]" : ""));
1138} 1206}
1139 1207
1140static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1215,7 +1283,8 @@ static void __kprobes enable_all_kprobes(void)
1215 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1216 head = &kprobe_table[i]; 1284 head = &kprobe_table[i];
1217 hlist_for_each_entry_rcu(p, node, head, hlist) 1285 hlist_for_each_entry_rcu(p, node, head, hlist)
1218 arch_arm_kprobe(p); 1286 if (!kprobe_gone(p))
1287 arch_arm_kprobe(p);
1219 } 1288 }
1220 1289
1221 kprobe_enabled = true; 1290 kprobe_enabled = true;
@@ -1244,7 +1313,7 @@ static void __kprobes disable_all_kprobes(void)
1244 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1245 head = &kprobe_table[i]; 1314 head = &kprobe_table[i];
1246 hlist_for_each_entry_rcu(p, node, head, hlist) { 1315 hlist_for_each_entry_rcu(p, node, head, hlist) {
1247 if (!arch_trampoline_kprobe(p)) 1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
1248 arch_disarm_kprobe(p); 1317 arch_disarm_kprobe(p);
1249 } 1318 }
1250 } 1319 }
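
None of the registration API above changes for ordinary users; what changes is that register_kprobe() now pins a probed module only while installing the probe, and probes whose target module is unloaded are marked [GONE] by the new module notifier instead of holding a reference forever. For context, a minimal, conventional kprobe user; do_fork is just a familiar symbol and the module itself is hypothetical.

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %s\n", p->symbol_name);
	return 0;
}

static struct kprobe my_kp = {
	.symbol_name	= "do_fork",
	.pre_handler	= my_pre_handler,
};

static int __init my_probe_init(void)
{
	return register_kprobe(&my_kp);
}

static void __exit my_probe_exit(void)
{
	unregister_kprobe(&my_kp);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");
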
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 08dd8ed86c77..528dd78e7e7e 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,7 +24,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
24static struct kobj_attribute _name##_attr = \ 24static struct kobj_attribute _name##_attr = \
25 __ATTR(_name, 0644, _name##_show, _name##_store) 25 __ATTR(_name, 0644, _name##_show, _name##_store)
26 26
27#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 27#if defined(CONFIG_HOTPLUG)
28/* current uevent sequence number */ 28/* current uevent sequence number */
29static ssize_t uevent_seqnum_show(struct kobject *kobj, 29static ssize_t uevent_seqnum_show(struct kobject *kobj,
30 struct kobj_attribute *attr, char *buf) 30 struct kobj_attribute *attr, char *buf)
@@ -137,7 +137,7 @@ struct kobject *kernel_kobj;
137EXPORT_SYMBOL_GPL(kernel_kobj); 137EXPORT_SYMBOL_GPL(kernel_kobj);
138 138
139static struct attribute * kernel_attrs[] = { 139static struct attribute * kernel_attrs[] = {
140#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 140#if defined(CONFIG_HOTPLUG)
141 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
142 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
143#endif 143#endif
diff --git a/kernel/module.c b/kernel/module.c
index dd2a54155b54..ba22484a987e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -43,7 +43,6 @@
43#include <linux/device.h> 43#include <linux/device.h>
44#include <linux/string.h> 44#include <linux/string.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/unwind.h>
47#include <linux/rculist.h> 46#include <linux/rculist.h>
48#include <asm/uaccess.h> 47#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 48#include <asm/cacheflush.h>
@@ -51,6 +50,7 @@
51#include <asm/sections.h> 50#include <asm/sections.h>
52#include <linux/tracepoint.h> 51#include <linux/tracepoint.h>
53#include <linux/ftrace.h> 52#include <linux/ftrace.h>
53#include <linux/async.h>
54 54
55#if 0 55#if 0
56#define DEBUGP printk 56#define DEBUGP printk
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
573/* Init the unload section of the module. */ 573/* Init the unload section of the module. */
574static void module_unload_init(struct module *mod) 574static void module_unload_init(struct module *mod)
575{ 575{
576 unsigned int i; 576 int cpu;
577 577
578 INIT_LIST_HEAD(&mod->modules_which_use_me); 578 INIT_LIST_HEAD(&mod->modules_which_use_me);
579 for (i = 0; i < NR_CPUS; i++) 579 for_each_possible_cpu(cpu)
580 local_set(&mod->ref[i].count, 0); 580 local_set(__module_ref_addr(mod, cpu), 0);
581 /* Hold reference count during initialization. */ 581 /* Hold reference count during initialization. */
582 local_set(&mod->ref[raw_smp_processor_id()].count, 1); 582 local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
583 /* Backwards compatibility macros put refcount during init. */ 583 /* Backwards compatibility macros put refcount during init. */
584 mod->waiter = current; 584 mod->waiter = current;
585} 585}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
717 717
718unsigned int module_refcount(struct module *mod) 718unsigned int module_refcount(struct module *mod)
719{ 719{
720 unsigned int i, total = 0; 720 unsigned int total = 0;
721 int cpu;
721 722
722 for (i = 0; i < NR_CPUS; i++) 723 for_each_possible_cpu(cpu)
723 total += local_read(&mod->ref[i].count); 724 total += local_read(__module_ref_addr(mod, cpu));
724 return total; 725 return total;
725} 726}
726EXPORT_SYMBOL(module_refcount); 727EXPORT_SYMBOL(module_refcount);
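
module_unload_init() and module_refcount() now walk for_each_possible_cpu() over per-cpu reference counters instead of indexing a fixed NR_CPUS array. A generic sketch of that counting pattern with an invented DEFINE_PER_CPU counter (not the module code itself, which goes through the new __module_ref_addr() helper):

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <asm/local.h>

static DEFINE_PER_CPU(local_t, my_refcnt);

static void my_ref_get(void)
{
	local_inc(&get_cpu_var(my_refcnt));
	put_cpu_var(my_refcnt);
}

static unsigned int my_ref_total(void)
{
	unsigned int total = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		total += local_read(&per_cpu(my_refcnt, cpu));

	return total;
}
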
@@ -743,8 +744,8 @@ static void wait_for_zero_refcount(struct module *mod)
743 mutex_lock(&module_mutex); 744 mutex_lock(&module_mutex);
744} 745}
745 746
746asmlinkage long 747SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
747sys_delete_module(const char __user *name_user, unsigned int flags) 748 unsigned int, flags)
748{ 749{
749 struct module *mod; 750 struct module *mod;
750 char name[MODULE_NAME_LEN]; 751 char name[MODULE_NAME_LEN];
@@ -757,8 +758,16 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
757 return -EFAULT; 758 return -EFAULT;
758 name[MODULE_NAME_LEN-1] = '\0'; 759 name[MODULE_NAME_LEN-1] = '\0';
759 760
760 if (mutex_lock_interruptible(&module_mutex) != 0) 761 /* Create stop_machine threads since free_module relies on
761 return -EINTR; 762 * a non-failing stop_machine call. */
763 ret = stop_machine_create();
764 if (ret)
765 return ret;
766
767 if (mutex_lock_interruptible(&module_mutex) != 0) {
768 ret = -EINTR;
769 goto out_stop;
770 }
762 771
763 mod = find_module(name); 772 mod = find_module(name);
764 if (!mod) { 773 if (!mod) {
@@ -809,6 +818,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
809 mod->exit(); 818 mod->exit();
810 blocking_notifier_call_chain(&module_notify_list, 819 blocking_notifier_call_chain(&module_notify_list,
811 MODULE_STATE_GOING, mod); 820 MODULE_STATE_GOING, mod);
821 async_synchronize_full();
812 mutex_lock(&module_mutex); 822 mutex_lock(&module_mutex);
813 /* Store the name of the last unloaded module for diagnostic purposes */ 823 /* Store the name of the last unloaded module for diagnostic purposes */
814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 824 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
@@ -817,10 +827,12 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
817 827
818 out: 828 out:
819 mutex_unlock(&module_mutex); 829 mutex_unlock(&module_mutex);
830out_stop:
831 stop_machine_destroy();
820 return ret; 832 return ret;
821} 833}
822 834
823static void print_unload_info(struct seq_file *m, struct module *mod) 835static inline void print_unload_info(struct seq_file *m, struct module *mod)
824{ 836{
825 struct module_use *use; 837 struct module_use *use;
826 int printed_something = 0; 838 int printed_something = 0;
@@ -883,7 +895,7 @@ void module_put(struct module *module)
883{ 895{
884 if (module) { 896 if (module) {
885 unsigned int cpu = get_cpu(); 897 unsigned int cpu = get_cpu();
886 local_dec(&module->ref[cpu].count); 898 local_dec(__module_ref_addr(module, cpu));
887 /* Maybe they're waiting for us to drop reference? */ 899 /* Maybe they're waiting for us to drop reference? */
888 if (unlikely(!module_is_live(module))) 900 if (unlikely(!module_is_live(module)))
889 wake_up_process(module->waiter); 901 wake_up_process(module->waiter);
@@ -893,7 +905,7 @@ void module_put(struct module *module)
893EXPORT_SYMBOL(module_put); 905EXPORT_SYMBOL(module_put);
894 906
895#else /* !CONFIG_MODULE_UNLOAD */ 907#else /* !CONFIG_MODULE_UNLOAD */
896static void print_unload_info(struct seq_file *m, struct module *mod) 908static inline void print_unload_info(struct seq_file *m, struct module *mod)
897{ 909{
898 /* We don't know the usage count, or what modules are using. */ 910 /* We don't know the usage count, or what modules are using. */
899 seq_printf(m, " - -"); 911 seq_printf(m, " - -");
@@ -1439,8 +1451,6 @@ static void free_module(struct module *mod)
1439 remove_sect_attrs(mod); 1451 remove_sect_attrs(mod);
1440 mod_kobject_remove(mod); 1452 mod_kobject_remove(mod);
1441 1453
1442 unwind_remove_table(mod->unwind_info, 0);
1443
1444 /* Arch-specific cleanup. */ 1454 /* Arch-specific cleanup. */
1445 module_arch_cleanup(mod); 1455 module_arch_cleanup(mod);
1446 1456
@@ -1455,7 +1465,10 @@ static void free_module(struct module *mod)
1455 kfree(mod->args); 1465 kfree(mod->args);
1456 if (mod->percpu) 1466 if (mod->percpu)
1457 percpu_modfree(mod->percpu); 1467 percpu_modfree(mod->percpu);
1458 1468#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
1469 if (mod->refptr)
1470 percpu_modfree(mod->refptr);
1471#endif
1459 /* Free lock-classes: */ 1472 /* Free lock-classes: */
1460 lockdep_free_key_range(mod->module_core, mod->core_size); 1473 lockdep_free_key_range(mod->module_core, mod->core_size);
1461 1474
@@ -1578,11 +1591,21 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1578 return ret; 1591 return ret;
1579} 1592}
1580 1593
1594/* Additional bytes needed by arch in front of individual sections */
1595unsigned int __weak arch_mod_section_prepend(struct module *mod,
1596 unsigned int section)
1597{
1598 /* default implementation just returns zero */
1599 return 0;
1600}
1601
1581/* Update size with this section: return offset. */ 1602/* Update size with this section: return offset. */
1582static long get_offset(unsigned int *size, Elf_Shdr *sechdr) 1603static long get_offset(struct module *mod, unsigned int *size,
1604 Elf_Shdr *sechdr, unsigned int section)
1583{ 1605{
1584 long ret; 1606 long ret;
1585 1607
1608 *size += arch_mod_section_prepend(mod, section);
1586 ret = ALIGN(*size, sechdr->sh_addralign ?: 1); 1609 ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
1587 *size = ret + sechdr->sh_size; 1610 *size = ret + sechdr->sh_size;
1588 return ret; 1611 return ret;
@@ -1622,7 +1645,7 @@ static void layout_sections(struct module *mod,
1622 || strncmp(secstrings + s->sh_name, 1645 || strncmp(secstrings + s->sh_name,
1623 ".init", 5) == 0) 1646 ".init", 5) == 0)
1624 continue; 1647 continue;
1625 s->sh_entsize = get_offset(&mod->core_size, s); 1648 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1626 DEBUGP("\t%s\n", secstrings + s->sh_name); 1649 DEBUGP("\t%s\n", secstrings + s->sh_name);
1627 } 1650 }
1628 if (m == 0) 1651 if (m == 0)
@@ -1640,7 +1663,7 @@ static void layout_sections(struct module *mod,
1640 || strncmp(secstrings + s->sh_name, 1663 || strncmp(secstrings + s->sh_name,
1641 ".init", 5) != 0) 1664 ".init", 5) != 0)
1642 continue; 1665 continue;
1643 s->sh_entsize = (get_offset(&mod->init_size, s) 1666 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1644 | INIT_OFFSET_MASK); 1667 | INIT_OFFSET_MASK);
1645 DEBUGP("\t%s\n", secstrings + s->sh_name); 1668 DEBUGP("\t%s\n", secstrings + s->sh_name);
1646 } 1669 }
@@ -1725,15 +1748,15 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
1725 return NULL; 1748 return NULL;
1726} 1749}
1727 1750
1728static int is_exported(const char *name, const struct module *mod) 1751static int is_exported(const char *name, unsigned long value,
1752 const struct module *mod)
1729{ 1753{
1730 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1754 const struct kernel_symbol *ks;
1731 return 1; 1755 if (!mod)
1756 ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab);
1732 else 1757 else
1733 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1758 ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms);
1734 return 1; 1759 return ks != NULL && ks->value == value;
1735 else
1736 return 0;
1737} 1760}
1738 1761
1739/* As per nm */ 1762/* As per nm */
@@ -1847,7 +1870,6 @@ static noinline struct module *load_module(void __user *umod,
1847 unsigned int symindex = 0; 1870 unsigned int symindex = 0;
1848 unsigned int strindex = 0; 1871 unsigned int strindex = 0;
1849 unsigned int modindex, versindex, infoindex, pcpuindex; 1872 unsigned int modindex, versindex, infoindex, pcpuindex;
1850 unsigned int unwindex = 0;
1851 unsigned int num_kp, num_mcount; 1873 unsigned int num_kp, num_mcount;
1852 struct kernel_param *kp; 1874 struct kernel_param *kp;
1853 struct module *mod; 1875 struct module *mod;
@@ -1865,6 +1887,13 @@ static noinline struct module *load_module(void __user *umod,
1865 /* vmalloc barfs on "unusual" numbers. Check here */ 1887 /* vmalloc barfs on "unusual" numbers. Check here */
1866 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1888 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1867 return ERR_PTR(-ENOMEM); 1889 return ERR_PTR(-ENOMEM);
1890
1891 /* Create stop_machine threads since the error path relies on
1892 * a non-failing stop_machine call. */
1893 err = stop_machine_create();
1894 if (err)
1895 goto free_hdr;
1896
1868 if (copy_from_user(hdr, umod, len) != 0) { 1897 if (copy_from_user(hdr, umod, len) != 0) {
1869 err = -EFAULT; 1898 err = -EFAULT;
1870 goto free_hdr; 1899 goto free_hdr;
@@ -1930,9 +1959,6 @@ static noinline struct module *load_module(void __user *umod,
1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1959 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1960 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1961 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1933#ifdef ARCH_UNWIND_SECTION_NAME
1934 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1935#endif
1936 1962
1937 /* Don't keep modinfo and version sections. */ 1963 /* Don't keep modinfo and version sections. */
1938 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1964 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1942,8 +1968,6 @@ static noinline struct module *load_module(void __user *umod,
1942 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1968 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1943 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1969 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1944#endif 1970#endif
1945 if (unwindex)
1946 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1947 1971
1948 /* Check module struct version now, before we try to use module. */ 1972 /* Check module struct version now, before we try to use module. */
1949 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1973 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1991,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod,
1991 if (err < 0) 2015 if (err < 0)
1992 goto free_mod; 2016 goto free_mod;
1993 2017
2018#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2019 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
2020 mod->name);
2021 if (!mod->refptr) {
2022 err = -ENOMEM;
2023 goto free_mod;
2024 }
2025#endif
1994 if (pcpuindex) { 2026 if (pcpuindex) {
1995 /* We have a special allocation for this section. */ 2027 /* We have a special allocation for this section. */
1996 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2028 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -1998,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod,
1998 mod->name); 2030 mod->name);
1999 if (!percpu) { 2031 if (!percpu) {
2000 err = -ENOMEM; 2032 err = -ENOMEM;
2001 goto free_mod; 2033 goto free_percpu;
2002 } 2034 }
2003 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2035 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2004 mod->percpu = percpu; 2036 mod->percpu = percpu;
@@ -2240,14 +2272,10 @@ static noinline struct module *load_module(void __user *umod,
2240 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2272 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2241 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2273 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2242 2274
2243 /* Size of section 0 is 0, so this works well if no unwind info. */
2244 mod->unwind_info = unwind_add_table(mod,
2245 (void *)sechdrs[unwindex].sh_addr,
2246 sechdrs[unwindex].sh_size);
2247
2248 /* Get rid of temporary copy */ 2275 /* Get rid of temporary copy */
2249 vfree(hdr); 2276 vfree(hdr);
2250 2277
2278 stop_machine_destroy();
2251 /* Done! */ 2279 /* Done! */
2252 return mod; 2280 return mod;
2253 2281
@@ -2266,10 +2294,14 @@ static noinline struct module *load_module(void __user *umod,
2266 free_percpu: 2294 free_percpu:
2267 if (percpu) 2295 if (percpu)
2268 percpu_modfree(percpu); 2296 percpu_modfree(percpu);
2297#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2298 percpu_modfree(mod->refptr);
2299#endif
2269 free_mod: 2300 free_mod:
2270 kfree(args); 2301 kfree(args);
2271 free_hdr: 2302 free_hdr:
2272 vfree(hdr); 2303 vfree(hdr);
2304 stop_machine_destroy();
2273 return ERR_PTR(err); 2305 return ERR_PTR(err);
2274 2306
2275 truncated: 2307 truncated:
@@ -2279,10 +2311,8 @@ static noinline struct module *load_module(void __user *umod,
2279} 2311}
2280 2312
2281/* This is where the real work happens */ 2313/* This is where the real work happens */
2282asmlinkage long 2314SYSCALL_DEFINE3(init_module, void __user *, umod,
2283sys_init_module(void __user *umod, 2315 unsigned long, len, const char __user *, uargs)
2284 unsigned long len,
2285 const char __user *uargs)
2286{ 2316{
2287 struct module *mod; 2317 struct module *mod;
2288 int ret = 0; 2318 int ret = 0;
@@ -2337,11 +2367,12 @@ sys_init_module(void __user *umod,
2337 /* Now it's a first class citizen! Wake up anyone waiting for it. */ 2367 /* Now it's a first class citizen! Wake up anyone waiting for it. */
2338 mod->state = MODULE_STATE_LIVE; 2368 mod->state = MODULE_STATE_LIVE;
2339 wake_up(&module_wq); 2369 wake_up(&module_wq);
2370 blocking_notifier_call_chain(&module_notify_list,
2371 MODULE_STATE_LIVE, mod);
2340 2372
2341 mutex_lock(&module_mutex); 2373 mutex_lock(&module_mutex);
2342 /* Drop initial reference. */ 2374 /* Drop initial reference. */
2343 module_put(mod); 2375 module_put(mod);
2344 unwind_remove_table(mod->unwind_info, 1);
2345 module_free(mod, mod->module_init); 2376 module_free(mod, mod->module_init);
2346 mod->module_init = NULL; 2377 mod->module_init = NULL;
2347 mod->init_size = 0; 2378 mod->init_size = 0;
@@ -2376,7 +2407,7 @@ static const char *get_ksymbol(struct module *mod,
2376 unsigned long nextval; 2407 unsigned long nextval;
2377 2408
2378 /* At worst, next value is at end of module */ 2409
2379 if (within(addr, mod->module_init, mod->init_size)) 2410 if (within_module_init(addr, mod))
2380 nextval = (unsigned long)mod->module_init+mod->init_text_size; 2411 nextval = (unsigned long)mod->module_init+mod->init_text_size;
2381 else 2412 else
2382 nextval = (unsigned long)mod->module_core+mod->core_text_size; 2413 nextval = (unsigned long)mod->module_core+mod->core_text_size;
@@ -2424,8 +2455,8 @@ const char *module_address_lookup(unsigned long addr,
2424 2455
2425 preempt_disable(); 2456 preempt_disable();
2426 list_for_each_entry_rcu(mod, &modules, list) { 2457 list_for_each_entry_rcu(mod, &modules, list) {
2427 if (within(addr, mod->module_init, mod->init_size) 2458 if (within_module_init(addr, mod) ||
2428 || within(addr, mod->module_core, mod->core_size)) { 2459 within_module_core(addr, mod)) {
2429 if (modname) 2460 if (modname)
2430 *modname = mod->name; 2461 *modname = mod->name;
2431 ret = get_ksymbol(mod, addr, size, offset); 2462 ret = get_ksymbol(mod, addr, size, offset);
@@ -2447,8 +2478,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2447 2478
2448 preempt_disable(); 2479 preempt_disable();
2449 list_for_each_entry_rcu(mod, &modules, list) { 2480 list_for_each_entry_rcu(mod, &modules, list) {
2450 if (within(addr, mod->module_init, mod->init_size) || 2481 if (within_module_init(addr, mod) ||
2451 within(addr, mod->module_core, mod->core_size)) { 2482 within_module_core(addr, mod)) {
2452 const char *sym; 2483 const char *sym;
2453 2484
2454 sym = get_ksymbol(mod, addr, NULL, NULL); 2485 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -2471,8 +2502,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2471 2502
2472 preempt_disable(); 2503 preempt_disable();
2473 list_for_each_entry_rcu(mod, &modules, list) { 2504 list_for_each_entry_rcu(mod, &modules, list) {
2474 if (within(addr, mod->module_init, mod->init_size) || 2505 if (within_module_init(addr, mod) ||
2475 within(addr, mod->module_core, mod->core_size)) { 2506 within_module_core(addr, mod)) {
2476 const char *sym; 2507 const char *sym;
2477 2508
2478 sym = get_ksymbol(mod, addr, size, offset); 2509 sym = get_ksymbol(mod, addr, size, offset);
@@ -2504,7 +2535,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2504 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, 2535 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
2505 KSYM_NAME_LEN); 2536 KSYM_NAME_LEN);
2506 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 2537 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2507 *exported = is_exported(name, mod); 2538 *exported = is_exported(name, *value, mod);
2508 preempt_enable(); 2539 preempt_enable();
2509 return 0; 2540 return 0;
2510 } 2541 }
@@ -2691,7 +2722,7 @@ int is_module_address(unsigned long addr)
2691 preempt_disable(); 2722 preempt_disable();
2692 2723
2693 list_for_each_entry_rcu(mod, &modules, list) { 2724 list_for_each_entry_rcu(mod, &modules, list) {
2694 if (within(addr, mod->module_core, mod->core_size)) { 2725 if (within_module_core(addr, mod)) {
2695 preempt_enable(); 2726 preempt_enable();
2696 return 1; 2727 return 1;
2697 } 2728 }
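
Editor's note: the kernel/module.c hunks above add a weak default for arch_mod_section_prepend() so an architecture can reserve extra bytes in front of individual sections, and get_offset() now folds that gap in before aligning the section offset. The listing below is a minimal userspace sketch of that weak-default-plus-layout pattern, an illustration only: arch_section_prepend() and the simplified get_offset() mirror the diff but are not the kernel's code, and the kernel's ALIGN() additionally treats a zero sh_addralign as 1.

#include <stdio.h>

/* Weak default: used unless another object file provides a strong override. */
__attribute__((weak)) unsigned int arch_section_prepend(unsigned int section)
{
	return 0;	/* generic code needs no extra bytes in front of sections */
}

static unsigned long get_offset(unsigned long *size, unsigned long align,
				unsigned long sh_size, unsigned int section)
{
	unsigned long ret;

	*size += arch_section_prepend(section);		/* arch-specific gap, if any */
	ret = (*size + align - 1) & ~(align - 1);	/* ALIGN(*size, align), align a power of two */
	*size = ret + sh_size;
	return ret;
}

int main(void)
{
	unsigned long size = 0;

	printf("section 0 at offset %lu\n", get_offset(&size, 8, 100, 0));
	printf("section 1 at offset %lu\n", get_offset(&size, 16, 40, 1));
	return 0;
}
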
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 43c2111cd54d..78bc3fdac0d2 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -13,7 +13,6 @@
13 13
14struct ns_cgroup { 14struct ns_cgroup {
15 struct cgroup_subsys_state css; 15 struct cgroup_subsys_state css;
16 spinlock_t lock;
17}; 16};
18 17
19struct cgroup_subsys ns_subsys; 18struct cgroup_subsys ns_subsys;
@@ -84,7 +83,6 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
84 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); 83 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
85 if (!ns_cgroup) 84 if (!ns_cgroup)
86 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
87 spin_lock_init(&ns_cgroup->lock);
88 return &ns_cgroup->css; 86 return &ns_cgroup->css;
89} 87}
90 88
diff --git a/kernel/panic.c b/kernel/panic.c
index 13f06349a786..2a2ff36ff44d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -299,6 +299,8 @@ static int init_oops_id(void)
299{ 299{
300 if (!oops_id) 300 if (!oops_id)
301 get_random_bytes(&oops_id, sizeof(oops_id)); 301 get_random_bytes(&oops_id, sizeof(oops_id));
302 else
303 oops_id++;
302 304
303 return 0; 305 return 0;
304} 306}
diff --git a/kernel/pid.c b/kernel/pid.c
index 064e76afa507..1b3586fe753a 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -474,8 +474,14 @@ pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
474} 474}
475EXPORT_SYMBOL(task_session_nr_ns); 475EXPORT_SYMBOL(task_session_nr_ns);
476 476
477struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
478{
479 return ns_of_pid(task_pid(tsk));
480}
481EXPORT_SYMBOL_GPL(task_active_pid_ns);
482
477/* 483/*
478 * Used by proc to find the first pid that is greater then or equal to nr. 484 * Used by proc to find the first pid that is greater than or equal to nr.
479 * 485 *
480 * If there is a pid at nr this function is exactly the same as find_pid_ns. 486 * If there is a pid at nr this function is exactly the same as find_pid_ns.
481 */ 487 */
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a47832..fa07da94d7be 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,76 +10,6 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11 11
12/* 12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct task_cputime *totals, *tot;
62 int i;
63
64 totals = tsk->signal->cputime.totals;
65 if (!totals) {
66 times->utime = tsk->utime;
67 times->stime = tsk->stime;
68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
69 return;
70 }
71
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 13 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */ 14 */
85void update_rlimit_cpu(unsigned long rlim_new) 15void update_rlimit_cpu(unsigned long rlim_new)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 887c63787de6..052ec4d195c7 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -477,10 +477,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
477 477
478/* Create a POSIX.1b interval timer. */ 478/* Create a POSIX.1b interval timer. */
479 479
480asmlinkage long 480SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
481sys_timer_create(const clockid_t which_clock, 481 struct sigevent __user *, timer_event_spec,
482 struct sigevent __user *timer_event_spec, 482 timer_t __user *, created_timer_id)
483 timer_t __user * created_timer_id)
484{ 483{
485 struct k_itimer *new_timer; 484 struct k_itimer *new_timer;
486 int error, new_timer_id; 485 int error, new_timer_id;
@@ -661,8 +660,8 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
661} 660}
662 661
663/* Get the time remaining on a POSIX.1b interval timer. */ 662/* Get the time remaining on a POSIX.1b interval timer. */
664asmlinkage long 663SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
665sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting) 664 struct itimerspec __user *, setting)
666{ 665{
667 struct k_itimer *timr; 666 struct k_itimer *timr;
668 struct itimerspec cur_setting; 667 struct itimerspec cur_setting;
@@ -691,8 +690,7 @@ sys_timer_gettime(timer_t timer_id, struct itimerspec __user *setting)
691 * the call back to do_schedule_next_timer(). So all we need to do is 690 * the call back to do_schedule_next_timer(). So all we need to do is
692 * to pick up the frozen overrun. 691 * to pick up the frozen overrun.
693 */ 692 */
694asmlinkage long 693SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
695sys_timer_getoverrun(timer_t timer_id)
696{ 694{
697 struct k_itimer *timr; 695 struct k_itimer *timr;
698 int overrun; 696 int overrun;
@@ -760,10 +758,9 @@ common_timer_set(struct k_itimer *timr, int flags,
760} 758}
761 759
762/* Set a POSIX.1b interval timer */ 760/* Set a POSIX.1b interval timer */
763asmlinkage long 761SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
764sys_timer_settime(timer_t timer_id, int flags, 762 const struct itimerspec __user *, new_setting,
765 const struct itimerspec __user *new_setting, 763 struct itimerspec __user *, old_setting)
766 struct itimerspec __user *old_setting)
767{ 764{
768 struct k_itimer *timr; 765 struct k_itimer *timr;
769 struct itimerspec new_spec, old_spec; 766 struct itimerspec new_spec, old_spec;
@@ -816,8 +813,7 @@ static inline int timer_delete_hook(struct k_itimer *timer)
816} 813}
817 814
818/* Delete a POSIX.1b interval timer. */ 815/* Delete a POSIX.1b interval timer. */
819asmlinkage long 816SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
820sys_timer_delete(timer_t timer_id)
821{ 817{
822 struct k_itimer *timer; 818 struct k_itimer *timer;
823 unsigned long flags; 819 unsigned long flags;
@@ -903,8 +899,8 @@ int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
903} 899}
904EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); 900EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
905 901
906asmlinkage long sys_clock_settime(const clockid_t which_clock, 902SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
907 const struct timespec __user *tp) 903 const struct timespec __user *, tp)
908{ 904{
909 struct timespec new_tp; 905 struct timespec new_tp;
910 906
@@ -916,8 +912,8 @@ asmlinkage long sys_clock_settime(const clockid_t which_clock,
916 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp)); 912 return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
917} 913}
918 914
919asmlinkage long 915SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
920sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp) 916 struct timespec __user *,tp)
921{ 917{
922 struct timespec kernel_tp; 918 struct timespec kernel_tp;
923 int error; 919 int error;
@@ -933,8 +929,8 @@ sys_clock_gettime(const clockid_t which_clock, struct timespec __user *tp)
933 929
934} 930}
935 931
936asmlinkage long 932SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
937sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp) 933 struct timespec __user *, tp)
938{ 934{
939 struct timespec rtn_tp; 935 struct timespec rtn_tp;
940 int error; 936 int error;
@@ -963,10 +959,9 @@ static int common_nsleep(const clockid_t which_clock, int flags,
963 which_clock); 959 which_clock);
964} 960}
965 961
966asmlinkage long 962SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
967sys_clock_nanosleep(const clockid_t which_clock, int flags, 963 const struct timespec __user *, rqtp,
968 const struct timespec __user *rqtp, 964 struct timespec __user *, rmtp)
969 struct timespec __user *rmtp)
970{ 965{
971 struct timespec t; 966 struct timespec t;
972 967
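
Editor's note: most of the kernel/posix-timers.c changes above (and similar ones in printk.c and ptrace.c further down) convert open-coded asmlinkage sys_* definitions to the SYSCALL_DEFINEn macros, which take alternating parameter types and names. The fragment below is a deliberately simplified userspace imitation of that idea, not the kernel's actual macro (which also emits wrapper and annotation boilerplate); MY_SYSCALL_DEFINE2 and my_sys_add are made-up names used only for illustration.

#include <stdio.h>

/* Pair up (type, name) arguments to build one function signature centrally. */
#define MY_SYSCALL_DEFINE2(name, t1, a1, t2, a2)	\
	long my_sys_##name(t1 a1, t2 a2)

MY_SYSCALL_DEFINE2(add, int, x, int, y)
{
	return (long)x + y;
}

int main(void)
{
	printf("%ld\n", my_sys_add(2, 40));	/* prints 42 */
	return 0;
}
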
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 597823b5b700..d7a10167a25b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,8 @@ EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o 6obj-y := main.o
7obj-$(CONFIG_PM_SLEEP) += process.o console.o 7obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o
8obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o
9 10
10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f77d3819ef57..432ee575c9ee 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
71 mutex_unlock(&pm_mutex); 71 mutex_unlock(&pm_mutex);
72} 72}
73 73
74static bool entering_platform_hibernation;
75
76bool system_entering_hibernation(void)
77{
78 return entering_platform_hibernation;
79}
80EXPORT_SYMBOL(system_entering_hibernation);
81
74#ifdef CONFIG_PM_DEBUG 82#ifdef CONFIG_PM_DEBUG
75static void hibernation_debug_sleep(void) 83static void hibernation_debug_sleep(void)
76{ 84{
@@ -258,12 +266,12 @@ int hibernation_snapshot(int platform_mode)
258{ 266{
259 int error; 267 int error;
260 268
261 /* Free memory before shutting down devices. */ 269 error = platform_begin(platform_mode);
262 error = swsusp_shrink_memory();
263 if (error) 270 if (error)
264 return error; 271 return error;
265 272
266 error = platform_begin(platform_mode); 273 /* Free memory before shutting down devices. */
274 error = swsusp_shrink_memory();
267 if (error) 275 if (error)
268 goto Close; 276 goto Close;
269 277
@@ -411,6 +419,7 @@ int hibernation_platform_enter(void)
411 if (error) 419 if (error)
412 goto Close; 420 goto Close;
413 421
422 entering_platform_hibernation = true;
414 suspend_console(); 423 suspend_console();
415 error = device_suspend(PMSG_HIBERNATE); 424 error = device_suspend(PMSG_HIBERNATE);
416 if (error) { 425 if (error) {
@@ -445,6 +454,7 @@ int hibernation_platform_enter(void)
445 Finish: 454 Finish:
446 hibernation_ops->finish(); 455 hibernation_ops->finish();
447 Resume_devices: 456 Resume_devices:
457 entering_platform_hibernation = false;
448 device_resume(PMSG_RESTORE); 458 device_resume(PMSG_RESTORE);
449 resume_console(); 459 resume_console();
450 Close: 460 Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 613f16941b85..239988873971 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -615,7 +615,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
615 /* this may fail if the RTC hasn't been initialized */ 615 /* this may fail if the RTC hasn't been initialized */
616 status = rtc_read_time(rtc, &alm.time); 616 status = rtc_read_time(rtc, &alm.time);
617 if (status < 0) { 617 if (status < 0) {
618 printk(err_readtime, rtc->dev.bus_id, status); 618 printk(err_readtime, dev_name(&rtc->dev), status);
619 return; 619 return;
620 } 620 }
621 rtc_tm_to_time(&alm.time, &now); 621 rtc_tm_to_time(&alm.time, &now);
@@ -626,7 +626,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
626 626
627 status = rtc_set_alarm(rtc, &alm); 627 status = rtc_set_alarm(rtc, &alm);
628 if (status < 0) { 628 if (status < 0) {
629 printk(err_wakealarm, rtc->dev.bus_id, status); 629 printk(err_wakealarm, dev_name(&rtc->dev), status);
630 return; 630 return;
631 } 631 }
632 632
@@ -660,7 +660,7 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
660 if (!device_may_wakeup(candidate->dev.parent)) 660 if (!device_may_wakeup(candidate->dev.parent))
661 return 0; 661 return 0;
662 662
663 *(char **)name_ptr = dev->bus_id; 663 *(const char **)name_ptr = dev_name(dev);
664 return 1; 664 return 1;
665} 665}
666 666
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 72016f051477..97890831e1b5 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -27,7 +27,7 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
27static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
28{ 28{
29 /* run sysrq poweroff on boot cpu */ 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work); 30 schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work);
31} 31}
32 32
33static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5d2ab836e998..f5fc2d7680f2 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -25,6 +25,7 @@
25#include <linux/syscalls.h> 25#include <linux/syscalls.h>
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30#include <asm/mmu_context.h> 31#include <asm/mmu_context.h>
@@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
192 return ret; 193 return ret;
193} 194}
194 195
195static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
196{
197 free_list_of_pages(ca->chain, clear_page_nosave);
198 memset(ca, 0, sizeof(struct chain_allocator));
199}
200
201/** 196/**
202 * Data types related to memory bitmaps. 197 * Data types related to memory bitmaps.
203 * 198 *
@@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
233#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 228#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
234 229
235struct bm_block { 230struct bm_block {
236 struct bm_block *next; /* next element of the list */ 231 struct list_head hook; /* hook into a list of bitmap blocks */
237 unsigned long start_pfn; /* pfn represented by the first bit */ 232 unsigned long start_pfn; /* pfn represented by the first bit */
238 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 233 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
239 unsigned long *data; /* bitmap representing pages */ 234 unsigned long *data; /* bitmap representing pages */
@@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb)
244 return bb->end_pfn - bb->start_pfn; 239 return bb->end_pfn - bb->start_pfn;
245} 240}
246 241
247struct zone_bitmap {
248 struct zone_bitmap *next; /* next element of the list */
249 unsigned long start_pfn; /* minimal pfn in this zone */
250 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
251 struct bm_block *bm_blocks; /* list of bitmap blocks */
252 struct bm_block *cur_block; /* recently used bitmap block */
253};
254
 255 /* struct bm_position is used for browsing memory bitmaps */ 242
256 243
257struct bm_position { 244struct bm_position {
258 struct zone_bitmap *zone_bm;
259 struct bm_block *block; 245 struct bm_block *block;
260 int bit; 246 int bit;
261}; 247};
262 248
263struct memory_bitmap { 249struct memory_bitmap {
264 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ 250 struct list_head blocks; /* list of bitmap blocks */
265 struct linked_page *p_list; /* list of pages used to store zone 251 struct linked_page *p_list; /* list of pages used to store zone
266 * bitmap objects and bitmap block 252 * bitmap objects and bitmap block
267 * objects 253 * objects
@@ -273,11 +259,7 @@ struct memory_bitmap {
273 259
274static void memory_bm_position_reset(struct memory_bitmap *bm) 260static void memory_bm_position_reset(struct memory_bitmap *bm)
275{ 261{
276 struct zone_bitmap *zone_bm; 262 bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
277
278 zone_bm = bm->zone_bm_list;
279 bm->cur.zone_bm = zone_bm;
280 bm->cur.block = zone_bm->bm_blocks;
281 bm->cur.bit = 0; 263 bm->cur.bit = 0;
282} 264}
283 265
@@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
285 267
286/** 268/**
287 * create_bm_block_list - create a list of block bitmap objects 269 * create_bm_block_list - create a list of block bitmap objects
270 * @nr_blocks - number of blocks to allocate
271 * @list - list to put the allocated blocks into
272 * @ca - chain allocator to be used for allocating memory
288 */ 273 */
289 274static int create_bm_block_list(unsigned long pages,
290static inline struct bm_block * 275 struct list_head *list,
291create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) 276 struct chain_allocator *ca)
292{ 277{
293 struct bm_block *bblist = NULL; 278 unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
294 279
295 while (nr_blocks-- > 0) { 280 while (nr_blocks-- > 0) {
296 struct bm_block *bb; 281 struct bm_block *bb;
297 282
298 bb = chain_alloc(ca, sizeof(struct bm_block)); 283 bb = chain_alloc(ca, sizeof(struct bm_block));
299 if (!bb) 284 if (!bb)
300 return NULL; 285 return -ENOMEM;
301 286 list_add(&bb->hook, list);
302 bb->next = bblist;
303 bblist = bb;
304 } 287 }
305 return bblist; 288
289 return 0;
306} 290}
307 291
292struct mem_extent {
293 struct list_head hook;
294 unsigned long start;
295 unsigned long end;
296};
297
308/** 298/**
309 * create_zone_bm_list - create a list of zone bitmap objects 299 * free_mem_extents - free a list of memory extents
300 * @list - list of extents to empty
310 */ 301 */
302static void free_mem_extents(struct list_head *list)
303{
304 struct mem_extent *ext, *aux;
311 305
312static inline struct zone_bitmap * 306 list_for_each_entry_safe(ext, aux, list, hook) {
313create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) 307 list_del(&ext->hook);
308 kfree(ext);
309 }
310}
311
312/**
313 * create_mem_extents - create a list of memory extents representing
314 * contiguous ranges of PFNs
315 * @list - list to put the extents into
316 * @gfp_mask - mask to use for memory allocations
317 */
318static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
314{ 319{
315 struct zone_bitmap *zbmlist = NULL; 320 struct zone *zone;
316 321
317 while (nr_zones-- > 0) { 322 INIT_LIST_HEAD(list);
318 struct zone_bitmap *zbm;
319 323
320 zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); 324 for_each_zone(zone) {
321 if (!zbm) 325 unsigned long zone_start, zone_end;
322 return NULL; 326 struct mem_extent *ext, *cur, *aux;
327
328 if (!populated_zone(zone))
329 continue;
323 330
324 zbm->next = zbmlist; 331 zone_start = zone->zone_start_pfn;
325 zbmlist = zbm; 332 zone_end = zone->zone_start_pfn + zone->spanned_pages;
333
334 list_for_each_entry(ext, list, hook)
335 if (zone_start <= ext->end)
336 break;
337
338 if (&ext->hook == list || zone_end < ext->start) {
339 /* New extent is necessary */
340 struct mem_extent *new_ext;
341
342 new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask);
343 if (!new_ext) {
344 free_mem_extents(list);
345 return -ENOMEM;
346 }
347 new_ext->start = zone_start;
348 new_ext->end = zone_end;
349 list_add_tail(&new_ext->hook, &ext->hook);
350 continue;
351 }
352
353 /* Merge this zone's range of PFNs with the existing one */
354 if (zone_start < ext->start)
355 ext->start = zone_start;
356 if (zone_end > ext->end)
357 ext->end = zone_end;
358
359 /* More merging may be possible */
360 cur = ext;
361 list_for_each_entry_safe_continue(cur, aux, list, hook) {
362 if (zone_end < cur->start)
363 break;
364 if (zone_end < cur->end)
365 ext->end = cur->end;
366 list_del(&cur->hook);
367 kfree(cur);
368 }
326 } 369 }
327 return zbmlist; 370
371 return 0;
328} 372}
329 373
330/** 374/**
331 * memory_bm_create - allocate memory for a memory bitmap 375 * memory_bm_create - allocate memory for a memory bitmap
332 */ 376 */
333
334static int 377static int
335memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) 378memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
336{ 379{
337 struct chain_allocator ca; 380 struct chain_allocator ca;
338 struct zone *zone; 381 struct list_head mem_extents;
339 struct zone_bitmap *zone_bm; 382 struct mem_extent *ext;
340 struct bm_block *bb; 383 int error;
341 unsigned int nr;
342 384
343 chain_init(&ca, gfp_mask, safe_needed); 385 chain_init(&ca, gfp_mask, safe_needed);
386 INIT_LIST_HEAD(&bm->blocks);
344 387
345 /* Compute the number of zones */ 388 error = create_mem_extents(&mem_extents, gfp_mask);
346 nr = 0; 389 if (error)
347 for_each_zone(zone) 390 return error;
348 if (populated_zone(zone))
349 nr++;
350
351 /* Allocate the list of zones bitmap objects */
352 zone_bm = create_zone_bm_list(nr, &ca);
353 bm->zone_bm_list = zone_bm;
354 if (!zone_bm) {
355 chain_free(&ca, PG_UNSAFE_CLEAR);
356 return -ENOMEM;
357 }
358
359 /* Initialize the zone bitmap objects */
360 for_each_zone(zone) {
361 unsigned long pfn;
362 391
363 if (!populated_zone(zone)) 392 list_for_each_entry(ext, &mem_extents, hook) {
364 continue; 393 struct bm_block *bb;
394 unsigned long pfn = ext->start;
395 unsigned long pages = ext->end - ext->start;
365 396
366 zone_bm->start_pfn = zone->zone_start_pfn; 397 bb = list_entry(bm->blocks.prev, struct bm_block, hook);
367 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
368 /* Allocate the list of bitmap block objects */
369 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
370 bb = create_bm_block_list(nr, &ca);
371 zone_bm->bm_blocks = bb;
372 zone_bm->cur_block = bb;
373 if (!bb)
374 goto Free;
375 398
376 nr = zone->spanned_pages; 399 error = create_bm_block_list(pages, bm->blocks.prev, &ca);
377 pfn = zone->zone_start_pfn; 400 if (error)
378 /* Initialize the bitmap block objects */ 401 goto Error;
379 while (bb) {
380 unsigned long *ptr;
381 402
382 ptr = get_image_page(gfp_mask, safe_needed); 403 list_for_each_entry_continue(bb, &bm->blocks, hook) {
383 bb->data = ptr; 404 bb->data = get_image_page(gfp_mask, safe_needed);
384 if (!ptr) 405 if (!bb->data) {
385 goto Free; 406 error = -ENOMEM;
407 goto Error;
408 }
386 409
387 bb->start_pfn = pfn; 410 bb->start_pfn = pfn;
388 if (nr >= BM_BITS_PER_BLOCK) { 411 if (pages >= BM_BITS_PER_BLOCK) {
389 pfn += BM_BITS_PER_BLOCK; 412 pfn += BM_BITS_PER_BLOCK;
390 nr -= BM_BITS_PER_BLOCK; 413 pages -= BM_BITS_PER_BLOCK;
391 } else { 414 } else {
392 /* This is executed only once in the loop */ 415 /* This is executed only once in the loop */
393 pfn += nr; 416 pfn += pages;
394 } 417 }
395 bb->end_pfn = pfn; 418 bb->end_pfn = pfn;
396 bb = bb->next;
397 } 419 }
398 zone_bm = zone_bm->next;
399 } 420 }
421
400 bm->p_list = ca.chain; 422 bm->p_list = ca.chain;
401 memory_bm_position_reset(bm); 423 memory_bm_position_reset(bm);
402 return 0; 424 Exit:
425 free_mem_extents(&mem_extents);
426 return error;
403 427
404 Free: 428 Error:
405 bm->p_list = ca.chain; 429 bm->p_list = ca.chain;
406 memory_bm_free(bm, PG_UNSAFE_CLEAR); 430 memory_bm_free(bm, PG_UNSAFE_CLEAR);
407 return -ENOMEM; 431 goto Exit;
408} 432}
409 433
410/** 434/**
411 * memory_bm_free - free memory occupied by the memory bitmap @bm 435 * memory_bm_free - free memory occupied by the memory bitmap @bm
412 */ 436 */
413
414static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) 437static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
415{ 438{
416 struct zone_bitmap *zone_bm; 439 struct bm_block *bb;
417 440
418 /* Free the list of bit blocks for each zone_bitmap object */ 441 list_for_each_entry(bb, &bm->blocks, hook)
419 zone_bm = bm->zone_bm_list; 442 if (bb->data)
420 while (zone_bm) { 443 free_image_page(bb->data, clear_nosave_free);
421 struct bm_block *bb;
422 444
423 bb = zone_bm->bm_blocks;
424 while (bb) {
425 if (bb->data)
426 free_image_page(bb->data, clear_nosave_free);
427 bb = bb->next;
428 }
429 zone_bm = zone_bm->next;
430 }
431 free_list_of_pages(bm->p_list, clear_nosave_free); 445 free_list_of_pages(bm->p_list, clear_nosave_free);
432 bm->zone_bm_list = NULL; 446
447 INIT_LIST_HEAD(&bm->blocks);
433} 448}
434 449
435/** 450/**
@@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
437 * to given pfn. The cur_zone_bm member of @bm and the cur_block member 452 * to given pfn. The cur_zone_bm member of @bm and the cur_block member
438 * of @bm->cur_zone_bm are updated. 453 * of @bm->cur_zone_bm are updated.
439 */ 454 */
440
441static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, 455static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
442 void **addr, unsigned int *bit_nr) 456 void **addr, unsigned int *bit_nr)
443{ 457{
444 struct zone_bitmap *zone_bm;
445 struct bm_block *bb; 458 struct bm_block *bb;
446 459
447 /* Check if the pfn is from the current zone */ 460 /*
448 zone_bm = bm->cur.zone_bm; 461 * Check if the pfn corresponds to the current bitmap block and find
449 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { 462 * the block where it fits if this is not the case.
450 zone_bm = bm->zone_bm_list; 463 */
451 /* We don't assume that the zones are sorted by pfns */ 464 bb = bm->cur.block;
452 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
453 zone_bm = zone_bm->next;
454
455 if (!zone_bm)
456 return -EFAULT;
457 }
458 bm->cur.zone_bm = zone_bm;
459 }
460 /* Check if the pfn corresponds to the current bitmap block */
461 bb = zone_bm->cur_block;
462 if (pfn < bb->start_pfn) 465 if (pfn < bb->start_pfn)
463 bb = zone_bm->bm_blocks; 466 list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
467 if (pfn >= bb->start_pfn)
468 break;
464 469
465 while (pfn >= bb->end_pfn) { 470 if (pfn >= bb->end_pfn)
466 bb = bb->next; 471 list_for_each_entry_continue(bb, &bm->blocks, hook)
472 if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
473 break;
467 474
468 BUG_ON(!bb); 475 if (&bb->hook == &bm->blocks)
469 } 476 return -EFAULT;
470 zone_bm->cur_block = bb; 477
478 /* The block has been found */
479 bm->cur.block = bb;
471 pfn -= bb->start_pfn; 480 pfn -= bb->start_pfn;
481 bm->cur.bit = pfn + 1;
472 *bit_nr = pfn; 482 *bit_nr = pfn;
473 *addr = bb->data; 483 *addr = bb->data;
474 return 0; 484 return 0;
@@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
519 return test_bit(bit, addr); 529 return test_bit(bit, addr);
520} 530}
521 531
532static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
533{
534 void *addr;
535 unsigned int bit;
536
537 return !memory_bm_find_bit(bm, pfn, &addr, &bit);
538}
539
522/** 540/**
523 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 541 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
524 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 542 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
530 548
531static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) 549static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
532{ 550{
533 struct zone_bitmap *zone_bm;
534 struct bm_block *bb; 551 struct bm_block *bb;
535 int bit; 552 int bit;
536 553
554 bb = bm->cur.block;
537 do { 555 do {
538 bb = bm->cur.block; 556 bit = bm->cur.bit;
539 do { 557 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
540 bit = bm->cur.bit; 558 if (bit < bm_block_bits(bb))
541 bit = find_next_bit(bb->data, bm_block_bits(bb), bit); 559 goto Return_pfn;
542 if (bit < bm_block_bits(bb)) 560
543 goto Return_pfn; 561 bb = list_entry(bb->hook.next, struct bm_block, hook);
544 562 bm->cur.block = bb;
545 bb = bb->next; 563 bm->cur.bit = 0;
546 bm->cur.block = bb; 564 } while (&bb->hook != &bm->blocks);
547 bm->cur.bit = 0; 565
548 } while (bb);
549 zone_bm = bm->cur.zone_bm->next;
550 if (zone_bm) {
551 bm->cur.zone_bm = zone_bm;
552 bm->cur.block = zone_bm->bm_blocks;
553 bm->cur.bit = 0;
554 }
555 } while (zone_bm);
556 memory_bm_position_reset(bm); 566 memory_bm_position_reset(bm);
557 return BM_END_OF_MAP; 567 return BM_END_OF_MAP;
558 568
@@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void)
808 * We should save the page if it isn't Nosave or NosaveFree, or Reserved, 818 * We should save the page if it isn't Nosave or NosaveFree, or Reserved,
809 * and it isn't a part of a free chunk of pages. 819 * and it isn't a part of a free chunk of pages.
810 */ 820 */
811 821static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
812static struct page *saveable_highmem_page(unsigned long pfn)
813{ 822{
814 struct page *page; 823 struct page *page;
815 824
@@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
817 return NULL; 826 return NULL;
818 827
819 page = pfn_to_page(pfn); 828 page = pfn_to_page(pfn);
829 if (page_zone(page) != zone)
830 return NULL;
820 831
821 BUG_ON(!PageHighMem(page)); 832 BUG_ON(!PageHighMem(page));
822 833
@@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void)
846 mark_free_pages(zone); 857 mark_free_pages(zone);
847 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 858 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
848 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 859 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
849 if (saveable_highmem_page(pfn)) 860 if (saveable_highmem_page(zone, pfn))
850 n++; 861 n++;
851 } 862 }
852 return n; 863 return n;
853} 864}
854#else 865#else
855static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } 866static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
867{
868 return NULL;
869}
856#endif /* CONFIG_HIGHMEM */ 870#endif /* CONFIG_HIGHMEM */
857 871
858/** 872/**
@@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
863 * of pages statically defined as 'unsaveable', and it isn't a part of 877 * of pages statically defined as 'unsaveable', and it isn't a part of
864 * a free chunk of pages. 878 * a free chunk of pages.
865 */ 879 */
866 880static struct page *saveable_page(struct zone *zone, unsigned long pfn)
867static struct page *saveable_page(unsigned long pfn)
868{ 881{
869 struct page *page; 882 struct page *page;
870 883
@@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn)
872 return NULL; 885 return NULL;
873 886
874 page = pfn_to_page(pfn); 887 page = pfn_to_page(pfn);
888 if (page_zone(page) != zone)
889 return NULL;
875 890
876 BUG_ON(PageHighMem(page)); 891 BUG_ON(PageHighMem(page));
877 892
@@ -903,7 +918,7 @@ unsigned int count_data_pages(void)
903 mark_free_pages(zone); 918 mark_free_pages(zone);
904 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 919 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
905 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 920 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
906 if(saveable_page(pfn)) 921 if (saveable_page(zone, pfn))
907 n++; 922 n++;
908 } 923 }
909 return n; 924 return n;
@@ -944,7 +959,7 @@ static inline struct page *
944page_is_saveable(struct zone *zone, unsigned long pfn) 959page_is_saveable(struct zone *zone, unsigned long pfn)
945{ 960{
946 return is_highmem(zone) ? 961 return is_highmem(zone) ?
947 saveable_highmem_page(pfn) : saveable_page(pfn); 962 saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn);
948} 963}
949 964
950static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 965static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
@@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
966 * data modified by kmap_atomic() 981 * data modified by kmap_atomic()
967 */ 982 */
968 safe_copy_page(buffer, s_page); 983 safe_copy_page(buffer, s_page);
969 dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); 984 dst = kmap_atomic(d_page, KM_USER0);
970 memcpy(dst, buffer, PAGE_SIZE); 985 memcpy(dst, buffer, PAGE_SIZE);
971 kunmap_atomic(dst, KM_USER0); 986 kunmap_atomic(dst, KM_USER0);
972 } else { 987 } else {
@@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
975 } 990 }
976} 991}
977#else 992#else
978#define page_is_saveable(zone, pfn) saveable_page(pfn) 993#define page_is_saveable(zone, pfn) saveable_page(zone, pfn)
979 994
980static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) 995static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
981{ 996{
@@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info)
1459 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set 1474 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
1460 * the corresponding bit in the memory bitmap @bm 1475 * the corresponding bit in the memory bitmap @bm
1461 */ 1476 */
1462 1477static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1463static inline void
1464unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1465{ 1478{
1466 int j; 1479 int j;
1467 1480
@@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1469 if (unlikely(buf[j] == BM_END_OF_MAP)) 1482 if (unlikely(buf[j] == BM_END_OF_MAP))
1470 break; 1483 break;
1471 1484
1472 memory_bm_set_bit(bm, buf[j]); 1485 if (memory_bm_pfn_present(bm, buf[j]))
1486 memory_bm_set_bit(bm, buf[j]);
1487 else
1488 return -EFAULT;
1473 } 1489 }
1490
1491 return 0;
1474} 1492}
1475 1493
1476/* List of "safe" pages that may be used to store data loaded from the suspend 1494/* List of "safe" pages that may be used to store data loaded from the suspend
@@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1608 pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); 1626 pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
1609 if (!pbe) { 1627 if (!pbe) {
1610 swsusp_free(); 1628 swsusp_free();
1611 return NULL; 1629 return ERR_PTR(-ENOMEM);
1612 } 1630 }
1613 pbe->orig_page = page; 1631 pbe->orig_page = page;
1614 if (safe_highmem_pages > 0) { 1632 if (safe_highmem_pages > 0) {
@@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
1677static inline void * 1695static inline void *
1678get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) 1696get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
1679{ 1697{
1680 return NULL; 1698 return ERR_PTR(-EINVAL);
1681} 1699}
1682 1700
1683static inline void copy_last_highmem_page(void) {} 1701static inline void copy_last_highmem_page(void) {}
@@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
1788static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) 1806static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1789{ 1807{
1790 struct pbe *pbe; 1808 struct pbe *pbe;
1791 struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); 1809 struct page *page;
1810 unsigned long pfn = memory_bm_next_pfn(bm);
1792 1811
1812 if (pfn == BM_END_OF_MAP)
1813 return ERR_PTR(-EFAULT);
1814
1815 page = pfn_to_page(pfn);
1793 if (PageHighMem(page)) 1816 if (PageHighMem(page))
1794 return get_highmem_page_buffer(page, ca); 1817 return get_highmem_page_buffer(page, ca);
1795 1818
@@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
1805 pbe = chain_alloc(ca, sizeof(struct pbe)); 1828 pbe = chain_alloc(ca, sizeof(struct pbe));
1806 if (!pbe) { 1829 if (!pbe) {
1807 swsusp_free(); 1830 swsusp_free();
1808 return NULL; 1831 return ERR_PTR(-ENOMEM);
1809 } 1832 }
1810 pbe->orig_address = page_address(page); 1833 pbe->orig_address = page_address(page);
1811 pbe->address = safe_pages_list; 1834 pbe->address = safe_pages_list;
@@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1868 return error; 1891 return error;
1869 1892
1870 } else if (handle->prev <= nr_meta_pages) { 1893 } else if (handle->prev <= nr_meta_pages) {
1871 unpack_orig_pfns(buffer, &copy_bm); 1894 error = unpack_orig_pfns(buffer, &copy_bm);
1895 if (error)
1896 return error;
1897
1872 if (handle->prev == nr_meta_pages) { 1898 if (handle->prev == nr_meta_pages) {
1873 error = prepare_image(&orig_bm, &copy_bm); 1899 error = prepare_image(&orig_bm, &copy_bm);
1874 if (error) 1900 if (error)
@@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
1879 restore_pblist = NULL; 1905 restore_pblist = NULL;
1880 handle->buffer = get_buffer(&orig_bm, &ca); 1906 handle->buffer = get_buffer(&orig_bm, &ca);
1881 handle->sync_read = 0; 1907 handle->sync_read = 0;
1882 if (!handle->buffer) 1908 if (IS_ERR(handle->buffer))
1883 return -ENOMEM; 1909 return PTR_ERR(handle->buffer);
1884 } 1910 }
1885 } else { 1911 } else {
1886 copy_last_highmem_page(); 1912 copy_last_highmem_page();
1887 handle->buffer = get_buffer(&orig_bm, &ca); 1913 handle->buffer = get_buffer(&orig_bm, &ca);
1914 if (IS_ERR(handle->buffer))
1915 return PTR_ERR(handle->buffer);
1888 if (handle->buffer != buffer) 1916 if (handle->buffer != buffer)
1889 handle->sync_read = 0; 1917 handle->sync_read = 0;
1890 } 1918 }
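
Editor's note: in the kernel/power/snapshot.c hunks above, memory bitmaps switch from per-zone bitmap chains to a single list of bm_block objects built from "memory extents": each populated zone contributes a [start, end) PFN range, and create_mem_extents() merges overlapping or adjacent ranges into a sorted extent list. The standalone sketch below reproduces that merge logic over a plain array instead of a list_head list; it illustrates the algorithm and is not the kernel implementation.

#include <stdio.h>
#include <string.h>

struct extent { unsigned long start, end; };	/* half-open range [start, end) */

/* Insert [start, end) into a sorted extent array, merging where possible. */
static int add_range(struct extent *ext, int n, unsigned long start,
		     unsigned long end)
{
	int i, j;

	/* Find the first extent this range could touch. */
	for (i = 0; i < n; i++)
		if (start <= ext[i].end)
			break;

	if (i == n || end < ext[i].start) {
		/* Disjoint: insert a new extent at position i. */
		memmove(&ext[i + 1], &ext[i], (n - i) * sizeof(*ext));
		ext[i].start = start;
		ext[i].end = end;
		return n + 1;
	}

	/* Overlapping or adjacent: grow extent i ... */
	if (start < ext[i].start)
		ext[i].start = start;
	if (end > ext[i].end)
		ext[i].end = end;

	/* ... and swallow any later extents the grown range now covers. */
	for (j = i + 1; j < n && ext[j].start <= ext[i].end; j++)
		if (ext[j].end > ext[i].end)
			ext[i].end = ext[j].end;
	memmove(&ext[i + 1], &ext[j], (n - j) * sizeof(*ext));
	return n - (j - i - 1);
}

int main(void)
{
	struct extent ext[8];
	int n = 0, i;

	n = add_range(ext, n, 0, 4096);
	n = add_range(ext, n, 8192, 12288);
	n = add_range(ext, n, 4096, 8192);	/* bridges the two ranges above */

	for (i = 0; i < n; i++)
		printf("[%lu, %lu)\n", ext[i].start, ext[i].end);
	return 0;	/* prints a single merged extent [0, 12288) */
}
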
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 023ff2a31d89..a92c91451559 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -262,3 +262,125 @@ int swsusp_shrink_memory(void)
262 262
263 return 0; 263 return 0;
264} 264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
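
Editor's note: hibernate_nvs_register() above splits a platform NVS region that need not be page-aligned into pieces that never cross a page boundary, so each piece can later be backed by one spare RAM page. The userspace sketch below shows only that chunking arithmetic; PAGE_SIZE, PAGE_MASK and split_region() are illustrative stand-ins, not the kernel definitions.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

static void split_region(unsigned long start, unsigned long size)
{
	while (size > 0) {
		/* Bytes left until the end of the page containing 'start'. */
		unsigned long nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
		unsigned long chunk = size < nr_bytes ? size : nr_bytes;

		printf("chunk: phys 0x%lx, %lu bytes\n", start, chunk);
		start += chunk;
		size -= chunk;
	}
}

int main(void)
{
	/* A 9000-byte region starting 100 bytes into a page: three chunks. */
	split_region(0x10000064, 9000);
	return 0;
}
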
diff --git a/kernel/printk.c b/kernel/printk.c
index e651ab05655f..69188f226a93 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -382,7 +382,7 @@ out:
382 return error; 382 return error;
383} 383}
384 384
385asmlinkage long sys_syslog(int type, char __user *buf, int len) 385SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
386{ 386{
387 return do_syslog(type, buf, len); 387 return do_syslog(type, buf, len);
388} 388}
@@ -619,7 +619,7 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
619static const char recursion_bug_msg [] = 619static const char recursion_bug_msg [] =
620 KERN_CRIT "BUG: recent printk recursion!\n"; 620 KERN_CRIT "BUG: recent printk recursion!\n";
621static int recursion_bug; 621static int recursion_bug;
622 static int new_text_line = 1; 622static int new_text_line = 1;
623static char printk_buf[1024]; 623static char printk_buf[1024];
624 624
625asmlinkage int vprintk(const char *fmt, va_list args) 625asmlinkage int vprintk(const char *fmt, va_list args)
@@ -742,11 +742,6 @@ EXPORT_SYMBOL(vprintk);
742 742
743#else 743#else
744 744
745asmlinkage long sys_syslog(int type, char __user *buf, int len)
746{
747 return -ENOSYS;
748}
749
750static void call_console_drivers(unsigned start, unsigned end) 745static void call_console_drivers(unsigned start, unsigned end)
751{ 746{
752} 747}
diff --git a/kernel/profile.c b/kernel/profile.c
index 60adefb59b5e..784933acf5b8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -45,7 +45,7 @@ static unsigned long prof_len, prof_shift;
45int prof_on __read_mostly; 45int prof_on __read_mostly;
46EXPORT_SYMBOL_GPL(prof_on); 46EXPORT_SYMBOL_GPL(prof_on);
47 47
48static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 48static cpumask_var_t prof_cpu_mask;
49#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 50static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
51static DEFINE_PER_CPU(int, cpu_profile_flip); 51static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -113,9 +113,13 @@ int __ref profile_init(void)
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) { 114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes); 115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
116 return 0; 117 return 0;
117 } 118 }
118 119
120 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
121 return -ENOMEM;
122
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 123 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer) 124 if (prof_buffer)
121 return 0; 125 return 0;
@@ -128,6 +132,7 @@ int __ref profile_init(void)
128 if (prof_buffer) 132 if (prof_buffer)
129 return 0; 133 return 0;
130 134
135 free_cpumask_var(prof_cpu_mask);
131 return -ENOMEM; 136 return -ENOMEM;
132} 137}
133 138
@@ -386,13 +391,15 @@ out_free:
386 return NOTIFY_BAD; 391 return NOTIFY_BAD;
387 case CPU_ONLINE: 392 case CPU_ONLINE:
388 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
389 cpu_set(cpu, prof_cpu_mask); 394 if (prof_cpu_mask != NULL)
395 cpumask_set_cpu(cpu, prof_cpu_mask);
390 break; 396 break;
391 case CPU_UP_CANCELED: 397 case CPU_UP_CANCELED:
392 case CPU_UP_CANCELED_FROZEN: 398 case CPU_UP_CANCELED_FROZEN:
393 case CPU_DEAD: 399 case CPU_DEAD:
394 case CPU_DEAD_FROZEN: 400 case CPU_DEAD_FROZEN:
395 cpu_clear(cpu, prof_cpu_mask); 401 if (prof_cpu_mask != NULL)
402 cpumask_clear_cpu(cpu, prof_cpu_mask);
396 if (per_cpu(cpu_profile_hits, cpu)[0]) { 403 if (per_cpu(cpu_profile_hits, cpu)[0]) {
397 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 404 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
398 per_cpu(cpu_profile_hits, cpu)[0] = NULL; 405 per_cpu(cpu_profile_hits, cpu)[0] = NULL;
@@ -430,19 +437,19 @@ void profile_tick(int type)
430 437
431 if (type == CPU_PROFILING && timer_hook) 438 if (type == CPU_PROFILING && timer_hook)
432 timer_hook(regs); 439 timer_hook(regs);
433 if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask)) 440 if (!user_mode(regs) && prof_cpu_mask != NULL &&
441 cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
434 profile_hit(type, (void *)profile_pc(regs)); 442 profile_hit(type, (void *)profile_pc(regs));
435} 443}
436 444
437#ifdef CONFIG_PROC_FS 445#ifdef CONFIG_PROC_FS
438#include <linux/proc_fs.h> 446#include <linux/proc_fs.h>
439#include <asm/uaccess.h> 447#include <asm/uaccess.h>
440#include <asm/ptrace.h>
441 448
442static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 449static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
443 int count, int *eof, void *data) 450 int count, int *eof, void *data)
444{ 451{
445 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 452 int len = cpumask_scnprintf(page, count, data);
446 if (count - len < 2) 453 if (count - len < 2)
447 return -EINVAL; 454 return -EINVAL;
448 len += sprintf(page + len, "\n"); 455 len += sprintf(page + len, "\n");
@@ -452,16 +459,20 @@ static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
452static int prof_cpu_mask_write_proc(struct file *file, 459static int prof_cpu_mask_write_proc(struct file *file,
453 const char __user *buffer, unsigned long count, void *data) 460 const char __user *buffer, unsigned long count, void *data)
454{ 461{
455 cpumask_t *mask = (cpumask_t *)data; 462 struct cpumask *mask = data;
456 unsigned long full_count = count, err; 463 unsigned long full_count = count, err;
457 cpumask_t new_value; 464 cpumask_var_t new_value;
458 465
459 err = cpumask_parse_user(buffer, count, new_value); 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
460 if (err) 467 return -ENOMEM;
461 return err;
462 468
463 *mask = new_value; 469 err = cpumask_parse_user(buffer, count, new_value);
464 return full_count; 470 if (!err) {
471 cpumask_copy(mask, new_value);
472 err = full_count;
473 }
474 free_cpumask_var(new_value);
475 return err;
465} 476}
466 477
467void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 478void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
@@ -472,7 +483,7 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
472 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 483 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
473 if (!entry) 484 if (!entry)
474 return; 485 return;
475 entry->data = (void *)&prof_cpu_mask; 486 entry->data = prof_cpu_mask;
476 entry->read_proc = prof_cpu_mask_read_proc; 487 entry->read_proc = prof_cpu_mask_read_proc;
477 entry->write_proc = prof_cpu_mask_write_proc; 488 entry->write_proc = prof_cpu_mask_write_proc;
478} 489}
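The profile.c hunks above convert prof_cpu_mask from a build-time cpumask_t into a cpumask_var_t that is allocated at init time (alloc_bootmem_cpumask_var() before the slab is up, alloc_cpumask_var() afterwards) and NULL-checked at every use. A minimal sketch of the same pattern, with my_mask and my_init()/my_exit() as illustrative names, not part of this patch:

	#include <linux/cpumask.h>
	#include <linux/slab.h>

	static cpumask_var_t my_mask;	/* array or pointer, per CONFIG_CPUMASK_OFFSTACK */

	static int __init my_init(void)
	{
		if (!alloc_cpumask_var(&my_mask, GFP_KERNEL))
			return -ENOMEM;
		cpumask_copy(my_mask, cpu_online_mask);
		return 0;
	}

	static void my_exit(void)
	{
		free_cpumask_var(my_mask);	/* compiles away when off-stack masks are disabled */
	}

With CONFIG_CPUMASK_OFFSTACK=n, cpumask_var_t is still a plain array and alloc/free cost nothing; the NULL checks added in profile_tick() and the hotplug notifier guard against the allocation having failed in the off-stack case.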
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 29dc700e198c..c9cf48b21f05 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -574,7 +574,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
574#define arch_ptrace_attach(child) do { } while (0) 574#define arch_ptrace_attach(child) do { } while (0)
575#endif 575#endif
576 576
577asmlinkage long sys_ptrace(long request, long pid, long addr, long data) 577SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
578{ 578{
579 struct task_struct *child; 579 struct task_struct *child;
580 long ret; 580 long ret;
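sys_ptrace() is switched to the SYSCALL_DEFINE4() wrapper above. Defining a syscall this way keeps the familiar asmlinkage entry point but lets architectures that need it generate a shim that sign-/zero-extends 32-bit arguments correctly on 64-bit ABIs. An illustrative two-argument definition (sys_example/do_example are made-up names, not part of this patch):

	#include <linux/syscalls.h>

	SYSCALL_DEFINE2(example, unsigned int, fd, unsigned long, arg)
	{
		/* expands (roughly) to: asmlinkage long sys_example(unsigned int fd,
		 * unsigned long arg), plus the per-arch wrapper where enabled */
		return do_example(fd, arg);
	}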
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index e503a002f330..bd5a9003497c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -63,14 +63,14 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
63 .completed = -300, 63 .completed = -300,
64 .pending = -300, 64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 69 .cur = -300,
70 .completed = -300, 70 .completed = -300,
71 .pending = -300, 71 .pending = -300,
72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
73 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_BITS_NONE,
74}; 74};
75 75
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
@@ -85,7 +85,6 @@ static void force_quiescent_state(struct rcu_data *rdp,
85 struct rcu_ctrlblk *rcp) 85 struct rcu_ctrlblk *rcp)
86{ 86{
87 int cpu; 87 int cpu;
88 cpumask_t cpumask;
89 unsigned long flags; 88 unsigned long flags;
90 89
91 set_need_resched(); 90 set_need_resched();
@@ -96,10 +95,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
96 * Don't send IPI to itself. With irqs disabled, 95 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu. 96 * rdp->cpu is the current cpu.
98 * 97 *
99 * cpu_online_map is updated by the _cpu_down() 98 * cpu_online_mask is updated by the _cpu_down()
100 * using __stop_machine(). Since we're in irqs disabled 99 * using __stop_machine(). Since we're in irqs disabled
101 * section, __stop_machine() is not executing, hence 100 * section, __stop_machine() is not executing, hence
102 * the cpu_online_map is stable. 101 * the cpu_online_mask is stable.
103 * 102 *
104 * However, a cpu might have been offlined _just_ before 103 * However, a cpu might have been offlined _just_ before
105 * we disabled irqs while entering here. 104 * we disabled irqs while entering here.
@@ -107,13 +106,14 @@ static void force_quiescent_state(struct rcu_data *rdp,
107 * notification, leading to the offlined cpu's bit 106 * notification, leading to the offlined cpu's bit
108 * being set in the rcp->cpumask. 107 * being set in the rcp->cpumask.
109 * 108 *
110 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent 109 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
111 * sending smp_reschedule() to an offlined CPU. 110 * sending smp_reschedule() to an offlined CPU.
112 */ 111 */
113 cpus_and(cpumask, rcp->cpumask, cpu_online_map); 112 for_each_cpu_and(cpu,
114 cpu_clear(rdp->cpu, cpumask); 113 to_cpumask(rcp->cpumask), cpu_online_mask) {
115 for_each_cpu_mask_nr(cpu, cpumask) 114 if (cpu != rdp->cpu)
116 smp_send_reschedule(cpu); 115 smp_send_reschedule(cpu);
116 }
117 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags); 118 spin_unlock_irqrestore(&rcp->lock, flags);
119} 119}
@@ -193,7 +193,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
193 193
194 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
198 } 198 }
199 printk(" (detected by %d, t=%ld jiffies)\n", 199 printk(" (detected by %d, t=%ld jiffies)\n",
@@ -221,7 +221,8 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp)
221 long delta; 221 long delta;
222 222
223 delta = jiffies - rcp->jiffies_stall; 223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { 224 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
225 delta >= 0) {
225 226
226 /* We haven't checked in, so go dump stack. */ 227 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp); 228 print_cpu_stall(rcp);
@@ -393,7 +394,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 394 * unnecessarily.
394 */ 395 */
395 smp_mb(); 396 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 397 cpumask_andnot(to_cpumask(rcp->cpumask),
398 cpu_online_mask, nohz_cpu_mask);
397 399
398 rcp->signaled = 0; 400 rcp->signaled = 0;
399 } 401 }
@@ -406,8 +408,8 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
406 */ 408 */
407static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) 409static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
408{ 410{
409 cpu_clear(cpu, rcp->cpumask); 411 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
410 if (cpus_empty(rcp->cpumask)) { 412 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
411 /* batch completed ! */ 413 /* batch completed ! */
412 rcp->completed = rcp->cur; 414 rcp->completed = rcp->cur;
413 rcu_start_batch(rcp); 415 rcu_start_batch(rcp);
@@ -714,7 +716,7 @@ void rcu_check_callbacks(int cpu, int user)
714 raise_rcu_softirq(); 716 raise_rcu_softirq();
715} 717}
716 718
717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 719static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
718 struct rcu_data *rdp) 720 struct rcu_data *rdp)
719{ 721{
720 unsigned long flags; 722 unsigned long flags;
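The rcuclassic.c changes drop the on-stack cpumask_t in force_quiescent_state(): the control block's mask becomes a raw bitmap (CPU_BITS_NONE initialiser) accessed through to_cpumask(), and the temporary intersection with cpu_online_map is replaced by iterating for_each_cpu_and() directly. A condensed sketch of the new shape (tracked_bits, self and kick_tracked_cpus() are illustrative names):

	#include <linux/cpumask.h>
	#include <linux/smp.h>

	static DECLARE_BITMAP(tracked_bits, NR_CPUS);

	static void kick_tracked_cpus(int self)
	{
		int cpu;

		/* old style needed: cpumask_t tmp; cpus_and(tmp, ...);
		 * for_each_cpu_mask_nr(cpu, tmp) -- NR_CPUS bits on the stack */

		/* new style walks the intersection in place, no temporary mask */
		for_each_cpu_and(cpu, to_cpumask(tracked_bits), cpu_online_mask) {
			if (cpu != self)
				smp_send_reschedule(cpu);
		}
	}

With NR_CPUS=4096 the avoided stack temporary is 512 bytes, which is the sort of saving this cpumask conversion series is after.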
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ad63af8b2521..d92a76a881aa 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,8 +77,15 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */ 80void synchronize_rcu(void)
81synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81{
82 struct rcu_synchronize rcu;
83 init_completion(&rcu.completion);
84 /* Will wake me after RCU finished. */
85 call_rcu(&rcu.head, wakeme_after_rcu);
86 /* Wait for it. */
87 wait_for_completion(&rcu.completion);
88}
82EXPORT_SYMBOL_GPL(synchronize_rcu); 89EXPORT_SYMBOL_GPL(synchronize_rcu);
83 90
84static void rcu_barrier_callback(struct rcu_head *notused) 91static void rcu_barrier_callback(struct rcu_head *notused)
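synchronize_rcu() is now written out long-hand instead of being generated by the synchronize_rcu_xxx() macro: queue a callback with call_rcu() that completes a completion, then sleep on it. From the caller's side nothing changes; the canonical update-side pattern still looks like this hedged sketch (struct foo, gp and replace_foo() are illustrative, and the updater is assumed to hold whatever lock serialises writers):

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	struct foo {
		int data;
	};

	static struct foo *gp;			/* read under rcu_read_lock() */

	static void replace_foo(struct foo *newp)
	{
		struct foo *old = gp;

		rcu_assign_pointer(gp, newp);	/* publish the new version */
		synchronize_rcu();		/* wait out pre-existing readers */
		kfree(old);			/* no reader can still see 'old' */
	}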
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 04982659875a..33cfc50781f9 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -164,7 +164,8 @@ static char *rcu_try_flip_state_names[] =
164 { "idle", "waitack", "waitzero", "waitmb" }; 164 { "idle", "waitack", "waitzero", "waitmb" };
165#endif /* #ifdef CONFIG_RCU_TRACE */ 165#endif /* #ifdef CONFIG_RCU_TRACE */
166 166
167static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; 167static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
168 = CPU_BITS_NONE;
168 169
169/* 170/*
170 * Enum and per-CPU flag to determine when each CPU has seen 171 * Enum and per-CPU flag to determine when each CPU has seen
@@ -758,7 +759,7 @@ rcu_try_flip_idle(void)
758 759
759 /* Now ask each CPU for acknowledgement of the flip. */ 760 /* Now ask each CPU for acknowledgement of the flip. */
760 761
761 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 762 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
762 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 763 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
763 dyntick_save_progress_counter(cpu); 764 dyntick_save_progress_counter(cpu);
764 } 765 }
@@ -776,7 +777,7 @@ rcu_try_flip_waitack(void)
776 int cpu; 777 int cpu;
777 778
778 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 779 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
779 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 780 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
780 if (rcu_try_flip_waitack_needed(cpu) && 781 if (rcu_try_flip_waitack_needed(cpu) &&
781 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 782 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
782 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 783 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -808,7 +809,7 @@ rcu_try_flip_waitzero(void)
808 /* Check to see if the sum of the "last" counters is zero. */ 809 /* Check to see if the sum of the "last" counters is zero. */
809 810
810 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 811 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
811 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 812 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
812 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 813 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
813 if (sum != 0) { 814 if (sum != 0) {
814 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 815 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -823,7 +824,7 @@ rcu_try_flip_waitzero(void)
823 smp_mb(); /* ^^^^^^^^^^^^ */ 824 smp_mb(); /* ^^^^^^^^^^^^ */
824 825
825 /* Call for a memory barrier from each CPU. */ 826 /* Call for a memory barrier from each CPU. */
826 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) { 827 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
827 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 828 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
828 dyntick_save_progress_counter(cpu); 829 dyntick_save_progress_counter(cpu);
829 } 830 }
@@ -843,7 +844,7 @@ rcu_try_flip_waitmb(void)
843 int cpu; 844 int cpu;
844 845
845 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 846 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
846 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) 847 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
847 if (rcu_try_flip_waitmb_needed(cpu) && 848 if (rcu_try_flip_waitmb_needed(cpu) &&
848 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 849 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
849 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 850 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -1032,7 +1033,7 @@ void rcu_offline_cpu(int cpu)
1032 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; 1033 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1033 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; 1034 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1034 1035
1035 cpu_clear(cpu, rcu_cpu_online_map); 1036 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1036 1037
1037 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1038 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1038 1039
@@ -1072,7 +1073,7 @@ void __cpuinit rcu_online_cpu(int cpu)
1072 struct rcu_data *rdp; 1073 struct rcu_data *rdp;
1073 1074
1074 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1075 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1075 cpu_set(cpu, rcu_cpu_online_map); 1076 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1076 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1077 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1077 1078
1078 /* 1079 /*
@@ -1176,7 +1177,16 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
1176 * in -rt this does -not- necessarily result in all currently executing 1177 * in -rt this does -not- necessarily result in all currently executing
1177 * interrupt -handlers- having completed. 1178 * interrupt -handlers- having completed.
1178 */ 1179 */
1179synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched) 1180void __synchronize_sched(void)
1181{
1182 struct rcu_synchronize rcu;
1183
1184 init_completion(&rcu.completion);
1185 /* Will wake me after RCU finished. */
1186 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1187 /* Wait for it. */
1188 wait_for_completion(&rcu.completion);
1189}
1180EXPORT_SYMBOL_GPL(__synchronize_sched); 1190EXPORT_SYMBOL_GPL(__synchronize_sched);
1181 1191
1182/* 1192/*
@@ -1430,7 +1440,7 @@ void __init __rcu_init(void)
1430 * We don't need protection against CPU-Hotplug here 1440 * We don't need protection against CPU-Hotplug here
1431 * since 1441 * since
1432 * a) If a CPU comes online while we are iterating over the 1442 * a) If a CPU comes online while we are iterating over the
1433 * cpu_online_map below, we would only end up making a 1443 * cpu_online_mask below, we would only end up making a
1434 * duplicate call to rcu_online_cpu() which sets the corresponding 1444 * duplicate call to rcu_online_cpu() which sets the corresponding
1435 * CPU's mask in the rcu_cpu_online_map. 1445 * CPU's mask in the rcu_cpu_online_map.
1436 * 1446 *
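rcupreempt.c gets the same treatment: rcu_cpu_online_map becomes a DECLARE_BITMAP() initialised with CPU_BITS_NONE, and every user goes through to_cpumask() with the new-style iterators. The declaration pattern, reduced to a hedged standalone example (my_cpu_bits, note_cpu() and visit_cpus() are illustrative names):

	#include <linux/cpumask.h>

	static DECLARE_BITMAP(my_cpu_bits, NR_CPUS) __read_mostly = CPU_BITS_NONE;

	static void note_cpu(int cpu)
	{
		cpumask_set_cpu(cpu, to_cpumask(my_cpu_bits));
	}

	static void visit_cpus(void)
	{
		int cpu;

		for_each_cpu(cpu, to_cpumask(my_cpu_bits))
			pr_debug("cpu %d is tracked\n", cpu);
	}

Keeping the storage as a bitmap (rather than cpumask_var_t) avoids any runtime allocation while still retiring the old value-passing cpumask_t helpers.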
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b31065522104..7c4142a79f0a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -136,31 +136,47 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */ 139/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */ 140
141static int fullstop; /* stop generating callbacks at test end. */ 141#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */ 142#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
143 /* spawning of kthreads. */ 143#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
144static int fullstop = FULLSTOP_RMMOD;
145DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */
146 /* of kthreads. */
144 147
145/* 148/*
146 * Detect and respond to a signal-based shutdown. 149 * Detect and respond to a system shutdown.
147 */ 150 */
148static int 151static int
149rcutorture_shutdown_notify(struct notifier_block *unused1, 152rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3) 153 unsigned long unused2, void *unused3)
151{ 154{
152 if (fullstop) 155 mutex_lock(&fullstop_mutex);
153 return NOTIFY_DONE; 156 if (fullstop == FULLSTOP_DONTSTOP)
154 if (signal_pending(current)) { 157 fullstop = FULLSTOP_SHUTDOWN;
155 mutex_lock(&fullstop_mutex); 158 else
156 if (!ACCESS_ONCE(fullstop)) 159 printk(KERN_WARNING /* but going down anyway, so... */
157 fullstop = FULLSTOP_SIGNALED; 160 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
158 mutex_unlock(&fullstop_mutex); 161 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE; 162 return NOTIFY_DONE;
161} 163}
162 164
163/* 165/*
166 * Absorb kthreads into a kernel function that won't return, so that
167 * they won't ever access module text or data again.
168 */
169static void rcutorture_shutdown_absorb(char *title)
170{
171 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
172 printk(KERN_NOTICE
173 "rcutorture thread %s parking due to system shutdown\n",
174 title);
175 schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
176 }
177}
178
179/*
164 * Allocate an element from the rcu_tortures pool. 180 * Allocate an element from the rcu_tortures pool.
165 */ 181 */
166static struct rcu_torture * 182static struct rcu_torture *
@@ -221,13 +237,14 @@ rcu_random(struct rcu_random_state *rrsp)
221} 237}
222 238
223static void 239static void
224rcu_stutter_wait(void) 240rcu_stutter_wait(char *title)
225{ 241{
226 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) { 242 while (stutter_pause_test || !rcutorture_runnable) {
227 if (rcutorture_runnable) 243 if (rcutorture_runnable)
228 schedule_timeout_interruptible(1); 244 schedule_timeout_interruptible(1);
229 else 245 else
230 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 246 schedule_timeout_interruptible(round_jiffies_relative(HZ));
247 rcutorture_shutdown_absorb(title);
231 } 248 }
232} 249}
233 250
@@ -289,7 +306,7 @@ rcu_torture_cb(struct rcu_head *p)
289 int i; 306 int i;
290 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); 307 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
291 308
292 if (fullstop) { 309 if (fullstop != FULLSTOP_DONTSTOP) {
293 /* Test is ending, just drop callbacks on the floor. */ 310 /* Test is ending, just drop callbacks on the floor. */
294 /* The next initialization will pick up the pieces. */ 311 /* The next initialization will pick up the pieces. */
295 return; 312 return;
@@ -621,10 +638,11 @@ rcu_torture_writer(void *arg)
621 } 638 }
622 rcu_torture_current_version++; 639 rcu_torture_current_version++;
623 oldbatch = cur_ops->completed(); 640 oldbatch = cur_ops->completed();
624 rcu_stutter_wait(); 641 rcu_stutter_wait("rcu_torture_writer");
625 } while (!kthread_should_stop() && !fullstop); 642 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 643 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 644 rcutorture_shutdown_absorb("rcu_torture_writer");
645 while (!kthread_should_stop())
628 schedule_timeout_uninterruptible(1); 646 schedule_timeout_uninterruptible(1);
629 return 0; 647 return 0;
630} 648}
@@ -645,11 +663,12 @@ rcu_torture_fakewriter(void *arg)
645 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 663 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
646 udelay(rcu_random(&rand) & 0x3ff); 664 udelay(rcu_random(&rand) & 0x3ff);
647 cur_ops->sync(); 665 cur_ops->sync();
648 rcu_stutter_wait(); 666 rcu_stutter_wait("rcu_torture_fakewriter");
649 } while (!kthread_should_stop() && !fullstop); 667 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
650 668
651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 669 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 670 rcutorture_shutdown_absorb("rcu_torture_fakewriter");
671 while (!kthread_should_stop())
653 schedule_timeout_uninterruptible(1); 672 schedule_timeout_uninterruptible(1);
654 return 0; 673 return 0;
655} 674}
@@ -754,12 +773,13 @@ rcu_torture_reader(void *arg)
754 preempt_enable(); 773 preempt_enable();
755 cur_ops->readunlock(idx); 774 cur_ops->readunlock(idx);
756 schedule(); 775 schedule();
757 rcu_stutter_wait(); 776 rcu_stutter_wait("rcu_torture_reader");
758 } while (!kthread_should_stop() && !fullstop); 777 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 778 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
779 rcutorture_shutdown_absorb("rcu_torture_reader");
760 if (irqreader && cur_ops->irqcapable) 780 if (irqreader && cur_ops->irqcapable)
761 del_timer_sync(&t); 781 del_timer_sync(&t);
762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED) 782 while (!kthread_should_stop())
763 schedule_timeout_uninterruptible(1); 783 schedule_timeout_uninterruptible(1);
764 return 0; 784 return 0;
765} 785}
@@ -856,7 +876,8 @@ rcu_torture_stats(void *arg)
856 do { 876 do {
857 schedule_timeout_interruptible(stat_interval * HZ); 877 schedule_timeout_interruptible(stat_interval * HZ);
858 rcu_torture_stats_print(); 878 rcu_torture_stats_print();
859 } while (!kthread_should_stop() && !fullstop); 879 rcutorture_shutdown_absorb("rcu_torture_stats");
880 } while (!kthread_should_stop());
860 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 881 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
861 return 0; 882 return 0;
862} 883}
@@ -924,7 +945,8 @@ rcu_torture_shuffle(void *arg)
924 do { 945 do {
925 schedule_timeout_interruptible(shuffle_interval * HZ); 946 schedule_timeout_interruptible(shuffle_interval * HZ);
926 rcu_torture_shuffle_tasks(); 947 rcu_torture_shuffle_tasks();
927 } while (!kthread_should_stop() && !fullstop); 948 rcutorture_shutdown_absorb("rcu_torture_shuffle");
949 } while (!kthread_should_stop());
928 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 950 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
929 return 0; 951 return 0;
930} 952}
@@ -939,10 +961,11 @@ rcu_torture_stutter(void *arg)
939 do { 961 do {
940 schedule_timeout_interruptible(stutter * HZ); 962 schedule_timeout_interruptible(stutter * HZ);
941 stutter_pause_test = 1; 963 stutter_pause_test = 1;
942 if (!kthread_should_stop() && !fullstop) 964 if (!kthread_should_stop())
943 schedule_timeout_interruptible(stutter * HZ); 965 schedule_timeout_interruptible(stutter * HZ);
944 stutter_pause_test = 0; 966 stutter_pause_test = 0;
945 } while (!kthread_should_stop() && !fullstop); 967 rcutorture_shutdown_absorb("rcu_torture_stutter");
968 } while (!kthread_should_stop());
946 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 969 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
947 return 0; 970 return 0;
948} 971}
@@ -969,15 +992,16 @@ rcu_torture_cleanup(void)
969 int i; 992 int i;
970 993
971 mutex_lock(&fullstop_mutex); 994 mutex_lock(&fullstop_mutex);
972 if (!fullstop) { 995 if (fullstop == FULLSTOP_SHUTDOWN) {
973 /* If being signaled, let it happen, then exit. */ 996 printk(KERN_WARNING /* but going down anyway, so... */
997 "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
974 mutex_unlock(&fullstop_mutex); 998 mutex_unlock(&fullstop_mutex);
975 schedule_timeout_interruptible(10 * HZ); 999 schedule_timeout_uninterruptible(10);
976 if (cur_ops->cb_barrier != NULL) 1000 if (cur_ops->cb_barrier != NULL)
977 cur_ops->cb_barrier(); 1001 cur_ops->cb_barrier();
978 return; 1002 return;
979 } 1003 }
980 fullstop = FULLSTOP_CLEANUP; 1004 fullstop = FULLSTOP_RMMOD;
981 mutex_unlock(&fullstop_mutex); 1005 mutex_unlock(&fullstop_mutex);
982 unregister_reboot_notifier(&rcutorture_nb); 1006 unregister_reboot_notifier(&rcutorture_nb);
983 if (stutter_task) { 1007 if (stutter_task) {
@@ -1077,7 +1101,7 @@ rcu_torture_init(void)
1077 else 1101 else
1078 nrealreaders = 2 * num_online_cpus(); 1102 nrealreaders = 2 * num_online_cpus();
1079 rcu_torture_print_module_parms("Start of test"); 1103 rcu_torture_print_module_parms("Start of test");
1080 fullstop = 0; 1104 fullstop = FULLSTOP_DONTSTOP;
1081 1105
1082 /* Set up the freelist. */ 1106 /* Set up the freelist. */
1083 1107
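The rcutorture changes replace the old signal-based FULLSTOP_SIGNALED logic with a three-state protocol (DONTSTOP / SHUTDOWN / RMMOD) so that a system shutdown while the module is loaded parks every torture kthread instead of letting it touch soon-to-be-freed module memory. The per-thread shape of that protocol, as a sketch (torture_loop() and do_one_pass() are illustrative; fullstop, rcu_stutter_wait() and rcutorture_shutdown_absorb() are the symbols added above):

	static int torture_loop(void *arg)
	{
		do {
			do_one_pass();
			rcu_stutter_wait("torture_loop");
		} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);

		/* On shutdown, absorb() sleeps forever so this thread never
		 * executes module text again; on rmmod, wait to be stopped. */
		rcutorture_shutdown_absorb("torture_loop");
		while (!kthread_should_stop())
			schedule_timeout_uninterruptible(1);
		return 0;
	}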
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a342b032112c..b2fd602a6f6f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -79,7 +79,10 @@ struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81#ifdef CONFIG_NO_HZ 81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks); 82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1,
84 .dynticks = 1,
85};
83#endif /* #ifdef CONFIG_NO_HZ */ 86#endif /* #ifdef CONFIG_NO_HZ */
84 87
85static int blimit = 10; /* Maximum callbacks per softirq. */ 88static int blimit = 10; /* Maximum callbacks per softirq. */
@@ -572,6 +575,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
572 /* Special-case the common single-level case. */ 575 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) { 576 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit; 577 rnp->qsmask = rnp->qsmaskinit;
578 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
575 spin_unlock_irqrestore(&rnp->lock, flags); 579 spin_unlock_irqrestore(&rnp->lock, flags);
576 return; 580 return;
577 } 581 }
@@ -1310,7 +1314,7 @@ int rcu_needs_cpu(int cpu)
1310 * access due to the fact that this CPU cannot possibly have any RCU 1314 * access due to the fact that this CPU cannot possibly have any RCU
1311 * callbacks in flight yet. 1315 * callbacks in flight yet.
1312 */ 1316 */
1313static void 1317static void __cpuinit
1314rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1318rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1315{ 1319{
1316 unsigned long flags; 1320 unsigned long flags;
@@ -1379,13 +1383,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1379 1383
1380static void __cpuinit rcu_online_cpu(int cpu) 1384static void __cpuinit rcu_online_cpu(int cpu)
1381{ 1385{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state); 1386 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state); 1387 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1388 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
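Two small rcutree.c fixes: the single-node rcu_start_gp() path now sets rsp->signaled so force_quiescent_state() is well-behaved (per the added comment), and the dynticks state is initialised statically in the DEFINE_PER_CPU() definition instead of in rcu_online_cpu(). Static per-CPU initialisation is a pattern worth noting on its own; a hedged sketch with an illustrative struct:

	#include <linux/percpu.h>

	struct my_state {
		int nesting;
		int seq;
	};

	/* every possible CPU's copy starts from this template at boot, so no
	 * hotplug-time assignment is needed */
	static DEFINE_PER_CPU(struct my_state, my_state) = {
		.nesting = 1,
		.seq	 = 1,
	};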
diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..9d79b7854fa6 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
663 663
664 mutex_lock(&relay_channels_mutex); 664 mutex_lock(&relay_channels_mutex);
665 /* Is chan already set up? */ 665 /* Is chan already set up? */
666 if (unlikely(chan->has_base_filename)) 666 if (unlikely(chan->has_base_filename)) {
667 mutex_unlock(&relay_channels_mutex);
667 return -EEXIST; 668 return -EEXIST;
669 }
668 chan->has_base_filename = 1; 670 chan->has_base_filename = 1;
669 chan->parent = parent; 671 chan->parent = parent;
670 curr_cpu = get_cpu(); 672 curr_cpu = get_cpu();
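The relay.c hunk fixes a mutex leak: relay_late_setup_files() could return -EEXIST while still holding relay_channels_mutex. The shape of the fix, shown here in the goto-style cleanup form often preferred for longer functions (setup_once(), err and out_unlock are illustrative; the actual patch simply unlocks before the early return):

	static int setup_once(struct rchan *chan)
	{
		int err;

		mutex_lock(&relay_channels_mutex);
		if (unlikely(chan->has_base_filename)) {
			err = -EEXIST;
			goto out_unlock;	/* never return with the mutex held */
		}
		/* ... the rest of the setup ... */
		err = 0;
	out_unlock:
		mutex_unlock(&relay_channels_mutex);
		return err;
	}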
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index f275c8eca772..bf8e7534c803 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -15,10 +15,11 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17 17
18void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = (unsigned long long)LLONG_MAX;
22 counter->parent = parent;
22} 23}
23 24
24int res_counter_charge_locked(struct res_counter *counter, unsigned long val) 25int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
@@ -34,14 +35,34 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
34 return 0; 35 return 0;
35} 36}
36 37
37int res_counter_charge(struct res_counter *counter, unsigned long val) 38int res_counter_charge(struct res_counter *counter, unsigned long val,
39 struct res_counter **limit_fail_at)
38{ 40{
39 int ret; 41 int ret;
40 unsigned long flags; 42 unsigned long flags;
41 43 struct res_counter *c, *u;
42 spin_lock_irqsave(&counter->lock, flags); 44
43 ret = res_counter_charge_locked(counter, val); 45 *limit_fail_at = NULL;
44 spin_unlock_irqrestore(&counter->lock, flags); 46 local_irq_save(flags);
47 for (c = counter; c != NULL; c = c->parent) {
48 spin_lock(&c->lock);
49 ret = res_counter_charge_locked(c, val);
50 spin_unlock(&c->lock);
51 if (ret < 0) {
52 *limit_fail_at = c;
53 goto undo;
54 }
55 }
56 ret = 0;
57 goto done;
58undo:
59 for (u = counter; u != c; u = u->parent) {
60 spin_lock(&u->lock);
61 res_counter_uncharge_locked(u, val);
62 spin_unlock(&u->lock);
63 }
64done:
65 local_irq_restore(flags);
45 return ret; 66 return ret;
46} 67}
47 68
@@ -56,10 +77,15 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
56void res_counter_uncharge(struct res_counter *counter, unsigned long val) 77void res_counter_uncharge(struct res_counter *counter, unsigned long val)
57{ 78{
58 unsigned long flags; 79 unsigned long flags;
80 struct res_counter *c;
59 81
60 spin_lock_irqsave(&counter->lock, flags); 82 local_irq_save(flags);
61 res_counter_uncharge_locked(counter, val); 83 for (c = counter; c != NULL; c = c->parent) {
62 spin_unlock_irqrestore(&counter->lock, flags); 84 spin_lock(&c->lock);
85 res_counter_uncharge_locked(c, val);
86 spin_unlock(&c->lock);
87 }
88 local_irq_restore(flags);
63} 89}
64 90
65 91
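res_counter grows a parent pointer, turning charge and uncharge into hierarchical operations: a charge walks up the parent chain, and if any ancestor is over its limit the partial charges are rolled back and *limit_fail_at reports which counter refused. A usage sketch against the new API (parent_cnt, child_cnt and the helpers are illustrative; in-tree the counters are embedded in the memory cgroup structures):

	#include <linux/res_counter.h>

	static struct res_counter parent_cnt, child_cnt;

	static void counters_init(void)
	{
		res_counter_init(&parent_cnt, NULL);		/* hierarchy root */
		res_counter_init(&child_cnt, &parent_cnt);
	}

	static int charge_one_page(void)
	{
		struct res_counter *fail;

		if (res_counter_charge(&child_cnt, PAGE_SIZE, &fail))
			return -ENOMEM;		/* 'fail' names the counter that
						 * refused; the undo loop already
						 * rolled back partial charges */
		return 0;
	}

	static void uncharge_one_page(void)
	{
		res_counter_uncharge(&child_cnt, PAGE_SIZE);	/* walks the chain too */
	}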
diff --git a/kernel/resource.c b/kernel/resource.c
index e633106b12f6..fd5d7d574bb9 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -620,10 +620,11 @@ resource_size_t resource_alignment(struct resource *res)
620 * @start: resource start address 620 * @start: resource start address
621 * @n: resource region size 621 * @n: resource region size
622 * @name: reserving caller's ID string 622 * @name: reserving caller's ID string
623 * @flags: IO resource flags
623 */ 624 */
624struct resource * __request_region(struct resource *parent, 625struct resource * __request_region(struct resource *parent,
625 resource_size_t start, resource_size_t n, 626 resource_size_t start, resource_size_t n,
626 const char *name) 627 const char *name, int flags)
627{ 628{
628 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 629 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
629 630
@@ -634,6 +635,7 @@ struct resource * __request_region(struct resource *parent,
634 res->start = start; 635 res->start = start;
635 res->end = start + n - 1; 636 res->end = start + n - 1;
636 res->flags = IORESOURCE_BUSY; 637 res->flags = IORESOURCE_BUSY;
638 res->flags |= flags;
637 639
638 write_lock(&resource_lock); 640 write_lock(&resource_lock);
639 641
@@ -679,7 +681,7 @@ int __check_region(struct resource *parent, resource_size_t start,
679{ 681{
680 struct resource * res; 682 struct resource * res;
681 683
682 res = __request_region(parent, start, n, "check-region"); 684 res = __request_region(parent, start, n, "check-region", 0);
683 if (!res) 685 if (!res)
684 return -EBUSY; 686 return -EBUSY;
685 687
@@ -776,7 +778,7 @@ struct resource * __devm_request_region(struct device *dev,
776 dr->start = start; 778 dr->start = start;
777 dr->n = n; 779 dr->n = n;
778 780
779 res = __request_region(parent, start, n, name); 781 res = __request_region(parent, start, n, name, 0);
780 if (res) 782 if (res)
781 devres_add(dev, dr); 783 devres_add(dev, dr);
782 else 784 else
@@ -876,3 +878,57 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
876 878
877 return err; 879 return err;
878} 880}
881
882#ifdef CONFIG_STRICT_DEVMEM
883static int strict_iomem_checks = 1;
884#else
885static int strict_iomem_checks;
886#endif
887
888/*
889 * check if an address is reserved in the iomem resource tree
890 * returns 1 if reserved, 0 if not reserved.
891 */
892int iomem_is_exclusive(u64 addr)
893{
894 struct resource *p = &iomem_resource;
895 int err = 0;
896 loff_t l;
897 int size = PAGE_SIZE;
898
899 if (!strict_iomem_checks)
900 return 0;
901
902 addr = addr & PAGE_MASK;
903
904 read_lock(&resource_lock);
905 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
906 /*
907 * We can probably skip the resources without
908 * IORESOURCE_IO attribute?
909 */
910 if (p->start >= addr + size)
911 break;
912 if (p->end < addr)
913 continue;
914 if (p->flags & IORESOURCE_BUSY &&
915 p->flags & IORESOURCE_EXCLUSIVE) {
916 err = 1;
917 break;
918 }
919 }
920 read_unlock(&resource_lock);
921
922 return err;
923}
924
925static int __init strict_iomem(char *str)
926{
927 if (strstr(str, "relaxed"))
928 strict_iomem_checks = 0;
929 if (strstr(str, "strict"))
930 strict_iomem_checks = 1;
931 return 1;
932}
933
934__setup("iomem=", strict_iomem);
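resource.c adds an exclusivity concept for MMIO: __request_region() takes a flags argument, IORESOURCE_EXCLUSIVE marks a busy region as off limits to /dev/mem-style access, iomem_is_exclusive() lets callers query that, and the default (strict under CONFIG_STRICT_DEVMEM) can be relaxed with iomem=relaxed on the kernel command line. A hedged sketch of a driver claiming a region exclusively (claim_mmio(), base, size and "mydrv" are illustrative; a request_mem_region_exclusive() convenience wrapper is assumed to exist alongside this change, so the raw call is shown to stay within what the hunk defines):

	#include <linux/ioport.h>

	static int claim_mmio(resource_size_t base, resource_size_t size)
	{
		struct resource *res;

		res = __request_region(&iomem_resource, base, size, "mydrv",
				       IORESOURCE_EXCLUSIVE);
		if (!res)
			return -EBUSY;
		/* ... ioremap() and use the hardware; /dev/mem access to this
		 * range is expected to be refused while strict checking is on,
		 * and __release_region() drops the claim on teardown ... */
		return 0;
	}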
diff --git a/kernel/sched.c b/kernel/sched.c
index fff1c4a20b65..8ee437a5ec1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -125,6 +125,9 @@ DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task); 125DEFINE_TRACE(sched_migrate_task);
126 126
127#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
128
129static void double_rq_lock(struct rq *rq1, struct rq *rq2);
130
128/* 131/*
129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 132 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
130 * Since cpu_power is a 'constant', we can use a reciprocal divide. 133 * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@ -498,18 +501,26 @@ struct rt_rq {
498 */ 501 */
499struct root_domain { 502struct root_domain {
500 atomic_t refcount; 503 atomic_t refcount;
501 cpumask_t span; 504 cpumask_var_t span;
502 cpumask_t online; 505 cpumask_var_t online;
503 506
504 /* 507 /*
505 * The "RT overload" flag: it gets set if a CPU has more than 508 * The "RT overload" flag: it gets set if a CPU has more than
506 * one runnable RT task. 509 * one runnable RT task.
507 */ 510 */
508 cpumask_t rto_mask; 511 cpumask_var_t rto_mask;
509 atomic_t rto_count; 512 atomic_t rto_count;
510#ifdef CONFIG_SMP 513#ifdef CONFIG_SMP
511 struct cpupri cpupri; 514 struct cpupri cpupri;
512#endif 515#endif
516#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
517 /*
518 * Preferred wake up cpu nominated by sched_mc balance that will be
519 * used when most cpus are idle in the system indicating overall very
520 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
521 */
522 unsigned int sched_mc_preferred_wakeup_cpu;
523#endif
513}; 524};
514 525
515/* 526/*
@@ -1312,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1312 * slice expiry etc. 1323 * slice expiry etc.
1313 */ 1324 */
1314 1325
1315#define WEIGHT_IDLEPRIO 2 1326#define WEIGHT_IDLEPRIO 3
1316#define WMULT_IDLEPRIO (1 << 31) 1327#define WMULT_IDLEPRIO 1431655765
1317 1328
1318/* 1329/*
1319 * Nice levels are multiplicative, with a gentle 10% change for every 1330 * Nice levels are multiplicative, with a gentle 10% change for every
@@ -1514,7 +1525,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1514 struct sched_domain *sd = data; 1525 struct sched_domain *sd = data;
1515 int i; 1526 int i;
1516 1527
1517 for_each_cpu_mask(i, sd->span) { 1528 for_each_cpu(i, sched_domain_span(sd)) {
1518 /* 1529 /*
1519 * If there are currently no tasks on the cpu pretend there 1530 * If there are currently no tasks on the cpu pretend there
1520 * is one of average load so that when a new task gets to 1531 * is one of average load so that when a new task gets to
@@ -1535,7 +1546,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1535 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1546 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1536 shares = tg->shares; 1547 shares = tg->shares;
1537 1548
1538 for_each_cpu_mask(i, sd->span) 1549 for_each_cpu(i, sched_domain_span(sd))
1539 update_group_shares_cpu(tg, i, shares, rq_weight); 1550 update_group_shares_cpu(tg, i, shares, rq_weight);
1540 1551
1541 return 0; 1552 return 0;
@@ -2101,15 +2112,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2101 int i; 2112 int i;
2102 2113
2103 /* Skip over this group if it has no CPUs allowed */ 2114 /* Skip over this group if it has no CPUs allowed */
2104 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2115 if (!cpumask_intersects(sched_group_cpus(group),
2116 &p->cpus_allowed))
2105 continue; 2117 continue;
2106 2118
2107 local_group = cpu_isset(this_cpu, group->cpumask); 2119 local_group = cpumask_test_cpu(this_cpu,
2120 sched_group_cpus(group));
2108 2121
2109 /* Tally up the load of all CPUs in the group */ 2122 /* Tally up the load of all CPUs in the group */
2110 avg_load = 0; 2123 avg_load = 0;
2111 2124
2112 for_each_cpu_mask_nr(i, group->cpumask) { 2125 for_each_cpu(i, sched_group_cpus(group)) {
2113 /* Bias balancing toward cpus of our domain */ 2126 /* Bias balancing toward cpus of our domain */
2114 if (local_group) 2127 if (local_group)
2115 load = source_load(i, load_idx); 2128 load = source_load(i, load_idx);
@@ -2141,17 +2154,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2141 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2154 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2142 */ 2155 */
2143static int 2156static int
2144find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2157find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2145 cpumask_t *tmp)
2146{ 2158{
2147 unsigned long load, min_load = ULONG_MAX; 2159 unsigned long load, min_load = ULONG_MAX;
2148 int idlest = -1; 2160 int idlest = -1;
2149 int i; 2161 int i;
2150 2162
2151 /* Traverse only the allowed CPUs */ 2163 /* Traverse only the allowed CPUs */
2152 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2164 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2153
2154 for_each_cpu_mask_nr(i, *tmp) {
2155 load = weighted_cpuload(i); 2165 load = weighted_cpuload(i);
2156 2166
2157 if (load < min_load || (load == min_load && i == this_cpu)) { 2167 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2193,7 +2203,6 @@ static int sched_balance_self(int cpu, int flag)
2193 update_shares(sd); 2203 update_shares(sd);
2194 2204
2195 while (sd) { 2205 while (sd) {
2196 cpumask_t span, tmpmask;
2197 struct sched_group *group; 2206 struct sched_group *group;
2198 int new_cpu, weight; 2207 int new_cpu, weight;
2199 2208
@@ -2202,14 +2211,13 @@ static int sched_balance_self(int cpu, int flag)
2202 continue; 2211 continue;
2203 } 2212 }
2204 2213
2205 span = sd->span;
2206 group = find_idlest_group(sd, t, cpu); 2214 group = find_idlest_group(sd, t, cpu);
2207 if (!group) { 2215 if (!group) {
2208 sd = sd->child; 2216 sd = sd->child;
2209 continue; 2217 continue;
2210 } 2218 }
2211 2219
2212 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2220 new_cpu = find_idlest_cpu(group, t, cpu);
2213 if (new_cpu == -1 || new_cpu == cpu) { 2221 if (new_cpu == -1 || new_cpu == cpu) {
2214 /* Now try balancing at a lower domain level of cpu */ 2222 /* Now try balancing at a lower domain level of cpu */
2215 sd = sd->child; 2223 sd = sd->child;
@@ -2218,10 +2226,10 @@ static int sched_balance_self(int cpu, int flag)
2218 2226
2219 /* Now try balancing at a lower domain level of new_cpu */ 2227 /* Now try balancing at a lower domain level of new_cpu */
2220 cpu = new_cpu; 2228 cpu = new_cpu;
2229 weight = cpumask_weight(sched_domain_span(sd));
2221 sd = NULL; 2230 sd = NULL;
2222 weight = cpus_weight(span);
2223 for_each_domain(cpu, tmp) { 2231 for_each_domain(cpu, tmp) {
2224 if (weight <= cpus_weight(tmp->span)) 2232 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2225 break; 2233 break;
2226 if (tmp->flags & flag) 2234 if (tmp->flags & flag)
2227 sd = tmp; 2235 sd = tmp;
@@ -2258,6 +2266,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2258 if (!sched_feat(SYNC_WAKEUPS)) 2266 if (!sched_feat(SYNC_WAKEUPS))
2259 sync = 0; 2267 sync = 0;
2260 2268
2269 if (!sync) {
2270 if (current->se.avg_overlap < sysctl_sched_migration_cost &&
2271 p->se.avg_overlap < sysctl_sched_migration_cost)
2272 sync = 1;
2273 } else {
2274 if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
2275 p->se.avg_overlap >= sysctl_sched_migration_cost)
2276 sync = 0;
2277 }
2278
2261#ifdef CONFIG_SMP 2279#ifdef CONFIG_SMP
2262 if (sched_feat(LB_WAKEUP_UPDATE)) { 2280 if (sched_feat(LB_WAKEUP_UPDATE)) {
2263 struct sched_domain *sd; 2281 struct sched_domain *sd;
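The try_to_wake_up() hunk just above overrides the caller-supplied sync hint using the tasks' avg_overlap history. Read together, the two branches collapse to a single condition, sketched here as a helper (wakeup_is_sync() is an illustrative name, not part of the patch):

	/* the wakeup ends up treated as synchronous exactly when both the waker
	 * and the wakee historically overlap for less than the migration cost */
	static int wakeup_is_sync(struct task_struct *waker, struct task_struct *wakee)
	{
		return waker->se.avg_overlap < sysctl_sched_migration_cost &&
		       wakee->se.avg_overlap < sysctl_sched_migration_cost;
	}

The apparent effect is to trust the sync hint only when the pair actually behaves like a tightly coupled producer/consumer, and to grant it even without the hint when they do.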
@@ -2266,7 +2284,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2266 cpu = task_cpu(p); 2284 cpu = task_cpu(p);
2267 2285
2268 for_each_domain(this_cpu, sd) { 2286 for_each_domain(this_cpu, sd) {
2269 if (cpu_isset(cpu, sd->span)) { 2287 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2270 update_shares(sd); 2288 update_shares(sd);
2271 break; 2289 break;
2272 } 2290 }
@@ -2315,7 +2333,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2315 else { 2333 else {
2316 struct sched_domain *sd; 2334 struct sched_domain *sd;
2317 for_each_domain(this_cpu, sd) { 2335 for_each_domain(this_cpu, sd) {
2318 if (cpu_isset(cpu, sd->span)) { 2336 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2319 schedstat_inc(sd, ttwu_wake_remote); 2337 schedstat_inc(sd, ttwu_wake_remote);
2320 break; 2338 break;
2321 } 2339 }
@@ -2846,7 +2864,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2846 struct rq *rq; 2864 struct rq *rq;
2847 2865
2848 rq = task_rq_lock(p, &flags); 2866 rq = task_rq_lock(p, &flags);
2849 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2867 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2850 || unlikely(!cpu_active(dest_cpu))) 2868 || unlikely(!cpu_active(dest_cpu)))
2851 goto out; 2869 goto out;
2852 2870
@@ -2911,7 +2929,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2911 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2929 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2912 * 3) are cache-hot on their current CPU. 2930 * 3) are cache-hot on their current CPU.
2913 */ 2931 */
2914 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2932 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2915 schedstat_inc(p, se.nr_failed_migrations_affine); 2933 schedstat_inc(p, se.nr_failed_migrations_affine);
2916 return 0; 2934 return 0;
2917 } 2935 }
@@ -3086,7 +3104,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3086static struct sched_group * 3104static struct sched_group *
3087find_busiest_group(struct sched_domain *sd, int this_cpu, 3105find_busiest_group(struct sched_domain *sd, int this_cpu,
3088 unsigned long *imbalance, enum cpu_idle_type idle, 3106 unsigned long *imbalance, enum cpu_idle_type idle,
3089 int *sd_idle, const cpumask_t *cpus, int *balance) 3107 int *sd_idle, const struct cpumask *cpus, int *balance)
3090{ 3108{
3091 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3109 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3092 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3110 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3122,10 +3140,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3122 unsigned long sum_avg_load_per_task; 3140 unsigned long sum_avg_load_per_task;
3123 unsigned long avg_load_per_task; 3141 unsigned long avg_load_per_task;
3124 3142
3125 local_group = cpu_isset(this_cpu, group->cpumask); 3143 local_group = cpumask_test_cpu(this_cpu,
3144 sched_group_cpus(group));
3126 3145
3127 if (local_group) 3146 if (local_group)
3128 balance_cpu = first_cpu(group->cpumask); 3147 balance_cpu = cpumask_first(sched_group_cpus(group));
3129 3148
3130 /* Tally up the load of all CPUs in the group */ 3149 /* Tally up the load of all CPUs in the group */
3131 sum_weighted_load = sum_nr_running = avg_load = 0; 3150 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3134,13 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3134 max_cpu_load = 0; 3153 max_cpu_load = 0;
3135 min_cpu_load = ~0UL; 3154 min_cpu_load = ~0UL;
3136 3155
3137 for_each_cpu_mask_nr(i, group->cpumask) { 3156 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3138 struct rq *rq; 3157 struct rq *rq = cpu_rq(i);
3139
3140 if (!cpu_isset(i, *cpus))
3141 continue;
3142
3143 rq = cpu_rq(i);
3144 3158
3145 if (*sd_idle && rq->nr_running) 3159 if (*sd_idle && rq->nr_running)
3146 *sd_idle = 0; 3160 *sd_idle = 0;
@@ -3251,8 +3265,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3251 */ 3265 */
3252 if ((sum_nr_running < min_nr_running) || 3266 if ((sum_nr_running < min_nr_running) ||
3253 (sum_nr_running == min_nr_running && 3267 (sum_nr_running == min_nr_running &&
3254 first_cpu(group->cpumask) < 3268 cpumask_first(sched_group_cpus(group)) >
3255 first_cpu(group_min->cpumask))) { 3269 cpumask_first(sched_group_cpus(group_min)))) {
3256 group_min = group; 3270 group_min = group;
3257 min_nr_running = sum_nr_running; 3271 min_nr_running = sum_nr_running;
3258 min_load_per_task = sum_weighted_load / 3272 min_load_per_task = sum_weighted_load /
@@ -3267,8 +3281,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3267 if (sum_nr_running <= group_capacity - 1) { 3281 if (sum_nr_running <= group_capacity - 1) {
3268 if (sum_nr_running > leader_nr_running || 3282 if (sum_nr_running > leader_nr_running ||
3269 (sum_nr_running == leader_nr_running && 3283 (sum_nr_running == leader_nr_running &&
3270 first_cpu(group->cpumask) > 3284 cpumask_first(sched_group_cpus(group)) <
3271 first_cpu(group_leader->cpumask))) { 3285 cpumask_first(sched_group_cpus(group_leader)))) {
3272 group_leader = group; 3286 group_leader = group;
3273 leader_nr_running = sum_nr_running; 3287 leader_nr_running = sum_nr_running;
3274 } 3288 }
@@ -3394,6 +3408,10 @@ out_balanced:
3394 3408
3395 if (this == group_leader && group_leader != group_min) { 3409 if (this == group_leader && group_leader != group_min) {
3396 *imbalance = min_load_per_task; 3410 *imbalance = min_load_per_task;
3411 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3412 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3413 cpumask_first(sched_group_cpus(group_leader));
3414 }
3397 return group_min; 3415 return group_min;
3398 } 3416 }
3399#endif 3417#endif
@@ -3407,16 +3425,16 @@ ret:
3407 */ 3425 */
3408static struct rq * 3426static struct rq *
3409find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3427find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3410 unsigned long imbalance, const cpumask_t *cpus) 3428 unsigned long imbalance, const struct cpumask *cpus)
3411{ 3429{
3412 struct rq *busiest = NULL, *rq; 3430 struct rq *busiest = NULL, *rq;
3413 unsigned long max_load = 0; 3431 unsigned long max_load = 0;
3414 int i; 3432 int i;
3415 3433
3416 for_each_cpu_mask_nr(i, group->cpumask) { 3434 for_each_cpu(i, sched_group_cpus(group)) {
3417 unsigned long wl; 3435 unsigned long wl;
3418 3436
3419 if (!cpu_isset(i, *cpus)) 3437 if (!cpumask_test_cpu(i, cpus))
3420 continue; 3438 continue;
3421 3439
3422 rq = cpu_rq(i); 3440 rq = cpu_rq(i);
@@ -3446,7 +3464,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3446 */ 3464 */
3447static int load_balance(int this_cpu, struct rq *this_rq, 3465static int load_balance(int this_cpu, struct rq *this_rq,
3448 struct sched_domain *sd, enum cpu_idle_type idle, 3466 struct sched_domain *sd, enum cpu_idle_type idle,
3449 int *balance, cpumask_t *cpus) 3467 int *balance, struct cpumask *cpus)
3450{ 3468{
3451 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3469 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3452 struct sched_group *group; 3470 struct sched_group *group;
@@ -3454,7 +3472,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3454 struct rq *busiest; 3472 struct rq *busiest;
3455 unsigned long flags; 3473 unsigned long flags;
3456 3474
3457 cpus_setall(*cpus); 3475 cpumask_setall(cpus);
3458 3476
3459 /* 3477 /*
3460 * When power savings policy is enabled for the parent domain, idle 3478 * When power savings policy is enabled for the parent domain, idle
@@ -3514,8 +3532,8 @@ redo:
3514 3532
3515 /* All tasks on this runqueue were pinned by CPU affinity */ 3533 /* All tasks on this runqueue were pinned by CPU affinity */
3516 if (unlikely(all_pinned)) { 3534 if (unlikely(all_pinned)) {
3517 cpu_clear(cpu_of(busiest), *cpus); 3535 cpumask_clear_cpu(cpu_of(busiest), cpus);
3518 if (!cpus_empty(*cpus)) 3536 if (!cpumask_empty(cpus))
3519 goto redo; 3537 goto redo;
3520 goto out_balanced; 3538 goto out_balanced;
3521 } 3539 }
@@ -3532,7 +3550,8 @@ redo:
3532 /* don't kick the migration_thread, if the curr 3550 /* don't kick the migration_thread, if the curr
3533 * task on busiest cpu can't be moved to this_cpu 3551 * task on busiest cpu can't be moved to this_cpu
3534 */ 3552 */
3535 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3553 if (!cpumask_test_cpu(this_cpu,
3554 &busiest->curr->cpus_allowed)) {
3536 spin_unlock_irqrestore(&busiest->lock, flags); 3555 spin_unlock_irqrestore(&busiest->lock, flags);
3537 all_pinned = 1; 3556 all_pinned = 1;
3538 goto out_one_pinned; 3557 goto out_one_pinned;
@@ -3607,7 +3626,7 @@ out:
3607 */ 3626 */
3608static int 3627static int
3609load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3628load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3610 cpumask_t *cpus) 3629 struct cpumask *cpus)
3611{ 3630{
3612 struct sched_group *group; 3631 struct sched_group *group;
3613 struct rq *busiest = NULL; 3632 struct rq *busiest = NULL;
@@ -3616,7 +3635,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3616 int sd_idle = 0; 3635 int sd_idle = 0;
3617 int all_pinned = 0; 3636 int all_pinned = 0;
3618 3637
3619 cpus_setall(*cpus); 3638 cpumask_setall(cpus);
3620 3639
3621 /* 3640 /*
3622 * When power savings policy is enabled for the parent domain, idle 3641 * When power savings policy is enabled for the parent domain, idle
@@ -3660,17 +3679,76 @@ redo:
3660 double_unlock_balance(this_rq, busiest); 3679 double_unlock_balance(this_rq, busiest);
3661 3680
3662 if (unlikely(all_pinned)) { 3681 if (unlikely(all_pinned)) {
3663 cpu_clear(cpu_of(busiest), *cpus); 3682 cpumask_clear_cpu(cpu_of(busiest), cpus);
3664 if (!cpus_empty(*cpus)) 3683 if (!cpumask_empty(cpus))
3665 goto redo; 3684 goto redo;
3666 } 3685 }
3667 } 3686 }
3668 3687
3669 if (!ld_moved) { 3688 if (!ld_moved) {
3689 int active_balance = 0;
3690
3670 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 3691 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
3671 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3692 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3672 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3693 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3673 return -1; 3694 return -1;
3695
3696 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
3697 return -1;
3698
3699 if (sd->nr_balance_failed++ < 2)
3700 return -1;
3701
3702 /*
3703 * The only task running in a non-idle cpu can be moved to this
3704 * cpu in an attempt to completely freeup the other CPU
3705 * package. The same method used to move task in load_balance()
3706 * have been extended for load_balance_newidle() to speedup
3707 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
3708 *
3709 * The package power saving logic comes from
3710 * find_busiest_group(). If there are no imbalance, then
3711 * f_b_g() will return NULL. However when sched_mc={1,2} then
3712 * f_b_g() will select a group from which a running task may be
3713 * pulled to this cpu in order to make the other package idle.
3714 * If there is no opportunity to make a package idle and if
3715 * there are no imbalance, then f_b_g() will return NULL and no
3716 * action will be taken in load_balance_newidle().
3717 *
3718 * Under normal task pull operation due to imbalance, there
3719 * will be more than one task in the source run queue and
3720 * move_tasks() will succeed. ld_moved will be true and this
3721 * active balance code will not be triggered.
3722 */
3723
3724 /* Lock busiest in correct order while this_rq is held */
3725 double_lock_balance(this_rq, busiest);
3726
3727 /*
3728 * don't kick the migration_thread, if the curr
3729 * task on busiest cpu can't be moved to this_cpu
3730 */
3731 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
3732 double_unlock_balance(this_rq, busiest);
3733 all_pinned = 1;
3734 return ld_moved;
3735 }
3736
3737 if (!busiest->active_balance) {
3738 busiest->active_balance = 1;
3739 busiest->push_cpu = this_cpu;
3740 active_balance = 1;
3741 }
3742
3743 double_unlock_balance(this_rq, busiest);
3744 /*
3745 * Should not call ttwu while holding a rq->lock
3746 */
3747 spin_unlock(&this_rq->lock);
3748 if (active_balance)
3749 wake_up_process(busiest->migration_thread);
3750 spin_lock(&this_rq->lock);
3751
3674 } else 3752 } else
3675 sd->nr_balance_failed = 0; 3753 sd->nr_balance_failed = 0;
3676 3754
@@ -3696,7 +3774,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3696 struct sched_domain *sd; 3774 struct sched_domain *sd;
3697 int pulled_task = 0; 3775 int pulled_task = 0;
3698 unsigned long next_balance = jiffies + HZ; 3776 unsigned long next_balance = jiffies + HZ;
3699 cpumask_t tmpmask; 3777 cpumask_var_t tmpmask;
3778
3779 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3780 return;
3700 3781
3701 for_each_domain(this_cpu, sd) { 3782 for_each_domain(this_cpu, sd) {
3702 unsigned long interval; 3783 unsigned long interval;
@@ -3707,7 +3788,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3707 if (sd->flags & SD_BALANCE_NEWIDLE) 3788 if (sd->flags & SD_BALANCE_NEWIDLE)
3708 /* If we've pulled tasks over stop searching: */ 3789 /* If we've pulled tasks over stop searching: */
3709 pulled_task = load_balance_newidle(this_cpu, this_rq, 3790 pulled_task = load_balance_newidle(this_cpu, this_rq,
3710 sd, &tmpmask); 3791 sd, tmpmask);
3711 3792
3712 interval = msecs_to_jiffies(sd->balance_interval); 3793 interval = msecs_to_jiffies(sd->balance_interval);
3713 if (time_after(next_balance, sd->last_balance + interval)) 3794 if (time_after(next_balance, sd->last_balance + interval))
@@ -3722,6 +3803,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3722 */ 3803 */
3723 this_rq->next_balance = next_balance; 3804 this_rq->next_balance = next_balance;
3724 } 3805 }
3806 free_cpumask_var(tmpmask);
3725} 3807}
3726 3808
3727/* 3809/*
@@ -3759,7 +3841,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3759 /* Search for an sd spanning us and the target CPU. */ 3841 /* Search for an sd spanning us and the target CPU. */
3760 for_each_domain(target_cpu, sd) { 3842 for_each_domain(target_cpu, sd) {
3761 if ((sd->flags & SD_LOAD_BALANCE) && 3843 if ((sd->flags & SD_LOAD_BALANCE) &&
3762 cpu_isset(busiest_cpu, sd->span)) 3844 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3763 break; 3845 break;
3764 } 3846 }
3765 3847
@@ -3778,10 +3860,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3778#ifdef CONFIG_NO_HZ 3860#ifdef CONFIG_NO_HZ
3779static struct { 3861static struct {
3780 atomic_t load_balancer; 3862 atomic_t load_balancer;
3781 cpumask_t cpu_mask; 3863 cpumask_var_t cpu_mask;
3782} nohz ____cacheline_aligned = { 3864} nohz ____cacheline_aligned = {
3783 .load_balancer = ATOMIC_INIT(-1), 3865 .load_balancer = ATOMIC_INIT(-1),
3784 .cpu_mask = CPU_MASK_NONE,
3785}; 3866};
3786 3867
3787/* 3868/*
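
Because nohz.cpu_mask is now a cpumask_var_t, the static CPU_MASK_NONE initializer has to go: with CONFIG_CPUMASK_OFFSTACK=y there is no storage behind the pointer until it is allocated at boot. The allocation site is outside this hunk (presumably in the scheduler init path, via alloc_cpumask_var() or a bootmem variant); the sketch below only illustrates what such an init-time allocation looks like and is not taken from the commit:

    static void __init example_nohz_init(void)
    {
            /* Boot time, nothing else running yet: failure is not expected. */
            if (!alloc_cpumask_var(&nohz.cpu_mask, GFP_KERNEL))
                    panic("nohz: cannot allocate cpu_mask");
            cpumask_clear(nohz.cpu_mask);   /* alloc_cpumask_var() does not zero */
    }
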
@@ -3809,7 +3890,7 @@ int select_nohz_load_balancer(int stop_tick)
3809 int cpu = smp_processor_id(); 3890 int cpu = smp_processor_id();
3810 3891
3811 if (stop_tick) { 3892 if (stop_tick) {
3812 cpu_set(cpu, nohz.cpu_mask); 3893 cpumask_set_cpu(cpu, nohz.cpu_mask);
3813 cpu_rq(cpu)->in_nohz_recently = 1; 3894 cpu_rq(cpu)->in_nohz_recently = 1;
3814 3895
3815 /* 3896 /*
@@ -3823,7 +3904,7 @@ int select_nohz_load_balancer(int stop_tick)
3823 } 3904 }
3824 3905
3825 /* time for ilb owner also to sleep */ 3906 /* time for ilb owner also to sleep */
3826 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3907 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3827 if (atomic_read(&nohz.load_balancer) == cpu) 3908 if (atomic_read(&nohz.load_balancer) == cpu)
3828 atomic_set(&nohz.load_balancer, -1); 3909 atomic_set(&nohz.load_balancer, -1);
3829 return 0; 3910 return 0;
@@ -3836,10 +3917,10 @@ int select_nohz_load_balancer(int stop_tick)
3836 } else if (atomic_read(&nohz.load_balancer) == cpu) 3917 } else if (atomic_read(&nohz.load_balancer) == cpu)
3837 return 1; 3918 return 1;
3838 } else { 3919 } else {
3839 if (!cpu_isset(cpu, nohz.cpu_mask)) 3920 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3840 return 0; 3921 return 0;
3841 3922
3842 cpu_clear(cpu, nohz.cpu_mask); 3923 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3843 3924
3844 if (atomic_read(&nohz.load_balancer) == cpu) 3925 if (atomic_read(&nohz.load_balancer) == cpu)
3845 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3926 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3867,7 +3948,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3867 unsigned long next_balance = jiffies + 60*HZ; 3948 unsigned long next_balance = jiffies + 60*HZ;
3868 int update_next_balance = 0; 3949 int update_next_balance = 0;
3869 int need_serialize; 3950 int need_serialize;
3870 cpumask_t tmp; 3951 cpumask_var_t tmp;
3952
3953 /* Fails alloc? Rebalancing probably not a priority right now. */
3954 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3955 return;
3871 3956
3872 for_each_domain(cpu, sd) { 3957 for_each_domain(cpu, sd) {
3873 if (!(sd->flags & SD_LOAD_BALANCE)) 3958 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3892,7 +3977,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3892 } 3977 }
3893 3978
3894 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3979 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3895 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3980 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3896 /* 3981 /*
3897 * We've pulled tasks over so either we're no 3982 * We've pulled tasks over so either we're no
3898 * longer idle, or one of our SMT siblings is 3983 * longer idle, or one of our SMT siblings is
@@ -3926,6 +4011,8 @@ out:
3926 */ 4011 */
3927 if (likely(update_next_balance)) 4012 if (likely(update_next_balance))
3928 rq->next_balance = next_balance; 4013 rq->next_balance = next_balance;
4014
4015 free_cpumask_var(tmp);
3929} 4016}
3930 4017
3931/* 4018/*
@@ -3950,12 +4037,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3950 */ 4037 */
3951 if (this_rq->idle_at_tick && 4038 if (this_rq->idle_at_tick &&
3952 atomic_read(&nohz.load_balancer) == this_cpu) { 4039 atomic_read(&nohz.load_balancer) == this_cpu) {
3953 cpumask_t cpus = nohz.cpu_mask;
3954 struct rq *rq; 4040 struct rq *rq;
3955 int balance_cpu; 4041 int balance_cpu;
3956 4042
3957 cpu_clear(this_cpu, cpus); 4043 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3958 for_each_cpu_mask_nr(balance_cpu, cpus) { 4044 if (balance_cpu == this_cpu)
4045 continue;
4046
3959 /* 4047 /*
3960 * If this cpu gets work to do, stop the load balancing 4048 * If this cpu gets work to do, stop the load balancing
3961 * work being done for other cpus. Next load 4049 * work being done for other cpus. Next load
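
The old run_rebalance_domains() copied nohz.cpu_mask onto the stack purely so it could clear this_cpu before walking it; that copy is exactly what a variable-sized mask wants to avoid. The replacement idiom — iterate the shared mask and skip yourself — shown in isolation (hypothetical wrapper, not part of the commit):

    static void example_ilb_walk(int this_cpu)
    {
            int balance_cpu;

            for_each_cpu(balance_cpu, nohz.cpu_mask) {
                    if (balance_cpu == this_cpu)
                            continue;   /* was: cpu_clear() on a stack copy */

                    /* ... run rebalance_domains() on behalf of balance_cpu ... */
            }
    }
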
@@ -3993,7 +4081,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
3993 rq->in_nohz_recently = 0; 4081 rq->in_nohz_recently = 0;
3994 4082
3995 if (atomic_read(&nohz.load_balancer) == cpu) { 4083 if (atomic_read(&nohz.load_balancer) == cpu) {
3996 cpu_clear(cpu, nohz.cpu_mask); 4084 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3997 atomic_set(&nohz.load_balancer, -1); 4085 atomic_set(&nohz.load_balancer, -1);
3998 } 4086 }
3999 4087
@@ -4006,7 +4094,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4006 * TBD: Traverse the sched domains and nominate 4094 * TBD: Traverse the sched domains and nominate
4007 * the nearest cpu in the nohz.cpu_mask. 4095 * the nearest cpu in the nohz.cpu_mask.
4008 */ 4096 */
4009 int ilb = first_cpu(nohz.cpu_mask); 4097 int ilb = cpumask_first(nohz.cpu_mask);
4010 4098
4011 if (ilb < nr_cpu_ids) 4099 if (ilb < nr_cpu_ids)
4012 resched_cpu(ilb); 4100 resched_cpu(ilb);
@@ -4018,7 +4106,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4018 * cpus with ticks stopped, is it time for that to stop? 4106 * cpus with ticks stopped, is it time for that to stop?
4019 */ 4107 */
4020 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4108 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4021 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4109 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4022 resched_cpu(cpu); 4110 resched_cpu(cpu);
4023 return; 4111 return;
4024 } 4112 }
@@ -4028,7 +4116,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4028 * someone else, then no need raise the SCHED_SOFTIRQ 4116 * someone else, then no need raise the SCHED_SOFTIRQ
4029 */ 4117 */
4030 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4118 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4031 cpu_isset(cpu, nohz.cpu_mask)) 4119 cpumask_test_cpu(cpu, nohz.cpu_mask))
4032 return; 4120 return;
4033#endif 4121#endif
4034 if (time_after_eq(jiffies, rq->next_balance)) 4122 if (time_after_eq(jiffies, rq->next_balance))
@@ -4080,13 +4168,17 @@ unsigned long long task_delta_exec(struct task_struct *p)
4080 * Account user cpu time to a process. 4168 * Account user cpu time to a process.
4081 * @p: the process that the cpu time gets accounted to 4169 * @p: the process that the cpu time gets accounted to
4082 * @cputime: the cpu time spent in user space since the last update 4170 * @cputime: the cpu time spent in user space since the last update
4171 * @cputime_scaled: cputime scaled by cpu frequency
4083 */ 4172 */
4084void account_user_time(struct task_struct *p, cputime_t cputime) 4173void account_user_time(struct task_struct *p, cputime_t cputime,
4174 cputime_t cputime_scaled)
4085{ 4175{
4086 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4176 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4087 cputime64_t tmp; 4177 cputime64_t tmp;
4088 4178
4179 /* Add user time to process. */
4089 p->utime = cputime_add(p->utime, cputime); 4180 p->utime = cputime_add(p->utime, cputime);
4181 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4090 account_group_user_time(p, cputime); 4182 account_group_user_time(p, cputime);
4091 4183
4092 /* Add user time to cpustat. */ 4184 /* Add user time to cpustat. */
@@ -4103,51 +4195,48 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4103 * Account guest cpu time to a process. 4195 * Account guest cpu time to a process.
4104 * @p: the process that the cpu time gets accounted to 4196 * @p: the process that the cpu time gets accounted to
4105 * @cputime: the cpu time spent in virtual machine since the last update 4197 * @cputime: the cpu time spent in virtual machine since the last update
4198 * @cputime_scaled: cputime scaled by cpu frequency
4106 */ 4199 */
4107static void account_guest_time(struct task_struct *p, cputime_t cputime) 4200static void account_guest_time(struct task_struct *p, cputime_t cputime,
4201 cputime_t cputime_scaled)
4108{ 4202{
4109 cputime64_t tmp; 4203 cputime64_t tmp;
4110 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4204 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4111 4205
4112 tmp = cputime_to_cputime64(cputime); 4206 tmp = cputime_to_cputime64(cputime);
4113 4207
4208 /* Add guest time to process. */
4114 p->utime = cputime_add(p->utime, cputime); 4209 p->utime = cputime_add(p->utime, cputime);
4210 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4115 account_group_user_time(p, cputime); 4211 account_group_user_time(p, cputime);
4116 p->gtime = cputime_add(p->gtime, cputime); 4212 p->gtime = cputime_add(p->gtime, cputime);
4117 4213
4214 /* Add guest time to cpustat. */
4118 cpustat->user = cputime64_add(cpustat->user, tmp); 4215 cpustat->user = cputime64_add(cpustat->user, tmp);
4119 cpustat->guest = cputime64_add(cpustat->guest, tmp); 4216 cpustat->guest = cputime64_add(cpustat->guest, tmp);
4120} 4217}
4121 4218
4122/* 4219/*
4123 * Account scaled user cpu time to a process.
4124 * @p: the process that the cpu time gets accounted to
4125 * @cputime: the cpu time spent in user space since the last update
4126 */
4127void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
4128{
4129 p->utimescaled = cputime_add(p->utimescaled, cputime);
4130}
4131
4132/*
4133 * Account system cpu time to a process. 4220 * Account system cpu time to a process.
4134 * @p: the process that the cpu time gets accounted to 4221 * @p: the process that the cpu time gets accounted to
4135 * @hardirq_offset: the offset to subtract from hardirq_count() 4222 * @hardirq_offset: the offset to subtract from hardirq_count()
4136 * @cputime: the cpu time spent in kernel space since the last update 4223 * @cputime: the cpu time spent in kernel space since the last update
4224 * @cputime_scaled: cputime scaled by cpu frequency
4137 */ 4225 */
4138void account_system_time(struct task_struct *p, int hardirq_offset, 4226void account_system_time(struct task_struct *p, int hardirq_offset,
4139 cputime_t cputime) 4227 cputime_t cputime, cputime_t cputime_scaled)
4140{ 4228{
4141 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4229 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4142 struct rq *rq = this_rq();
4143 cputime64_t tmp; 4230 cputime64_t tmp;
4144 4231
4145 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 4232 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
4146 account_guest_time(p, cputime); 4233 account_guest_time(p, cputime, cputime_scaled);
4147 return; 4234 return;
4148 } 4235 }
4149 4236
4237 /* Add system time to process. */
4150 p->stime = cputime_add(p->stime, cputime); 4238 p->stime = cputime_add(p->stime, cputime);
4239 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
4151 account_group_system_time(p, cputime); 4240 account_group_system_time(p, cputime);
4152 4241
4153 /* Add system time to cpustat. */ 4242 /* Add system time to cpustat. */
@@ -4156,49 +4245,85 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4156 cpustat->irq = cputime64_add(cpustat->irq, tmp); 4245 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4157 else if (softirq_count()) 4246 else if (softirq_count())
4158 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 4247 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
4159 else if (p != rq->idle)
4160 cpustat->system = cputime64_add(cpustat->system, tmp);
4161 else if (atomic_read(&rq->nr_iowait) > 0)
4162 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4163 else 4248 else
4164 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4249 cpustat->system = cputime64_add(cpustat->system, tmp);
4250
4165 /* Account for system time used */ 4251 /* Account for system time used */
4166 acct_update_integrals(p); 4252 acct_update_integrals(p);
4167} 4253}
4168 4254
4169/* 4255/*
4170 * Account scaled system cpu time to a process. 4256 * Account for involuntary wait time.
4171 * @p: the process that the cpu time gets accounted to 4257 * @steal: the cpu time spent in involuntary wait
4172 * @hardirq_offset: the offset to subtract from hardirq_count()
4173 * @cputime: the cpu time spent in kernel space since the last update
4174 */ 4258 */
4175void account_system_time_scaled(struct task_struct *p, cputime_t cputime) 4259void account_steal_time(cputime_t cputime)
4176{ 4260{
4177 p->stimescaled = cputime_add(p->stimescaled, cputime); 4261 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4262 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4263
4264 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
4178} 4265}
4179 4266
4180/* 4267/*
4181 * Account for involuntary wait time. 4268 * Account for idle time.
4182 * @p: the process from which the cpu time has been stolen 4269 * @cputime: the cpu time spent in idle wait
4183 * @steal: the cpu time spent in involuntary wait
4184 */ 4270 */
4185void account_steal_time(struct task_struct *p, cputime_t steal) 4271void account_idle_time(cputime_t cputime)
4186{ 4272{
4187 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4273 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4188 cputime64_t tmp = cputime_to_cputime64(steal); 4274 cputime64_t cputime64 = cputime_to_cputime64(cputime);
4189 struct rq *rq = this_rq(); 4275 struct rq *rq = this_rq();
4190 4276
4191 if (p == rq->idle) { 4277 if (atomic_read(&rq->nr_iowait) > 0)
4192 p->stime = cputime_add(p->stime, steal); 4278 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
4193 if (atomic_read(&rq->nr_iowait) > 0) 4279 else
4194 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4280 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
4195 else 4281}
4196 cpustat->idle = cputime64_add(cpustat->idle, tmp); 4282
4197 } else 4283#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4198 cpustat->steal = cputime64_add(cpustat->steal, tmp); 4284
4285/*
4286 * Account a single tick of cpu time.
4287 * @p: the process that the cpu time gets accounted to
4288 * @user_tick: indicates if the tick is a user or a system tick
4289 */
4290void account_process_tick(struct task_struct *p, int user_tick)
4291{
4292 cputime_t one_jiffy = jiffies_to_cputime(1);
4293 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
4294 struct rq *rq = this_rq();
4295
4296 if (user_tick)
4297 account_user_time(p, one_jiffy, one_jiffy_scaled);
4298 else if (p != rq->idle)
4299 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4300 one_jiffy_scaled);
4301 else
4302 account_idle_time(one_jiffy);
4199} 4303}
4200 4304
4201/* 4305/*
4306 * Account multiple ticks of steal time.
4307 * @p: the process from which the cpu time has been stolen
4308 * @ticks: number of stolen ticks
4309 */
4310void account_steal_ticks(unsigned long ticks)
4311{
4312 account_steal_time(jiffies_to_cputime(ticks));
4313}
4314
4315/*
4316 * Account multiple ticks of idle time.
4317 * @ticks: number of idle ticks
4318 */
4319void account_idle_ticks(unsigned long ticks)
4320{
4321 account_idle_time(jiffies_to_cputime(ticks));
4322}
4323
4324#endif
4325
4326/*
4202 * Use precise platform statistics if available: 4327 * Use precise platform statistics if available:
4203 */ 4328 */
4204#ifdef CONFIG_VIRT_CPU_ACCOUNTING 4329#ifdef CONFIG_VIRT_CPU_ACCOUNTING
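
Taken together, the hunks above replace the old account_user_time_scaled()/account_system_time_scaled() pair and the overloaded steal-time path with a small orthogonal set of entry points: account_user_time() and account_system_time() now take the scaled value directly, idle and iowait move into account_idle_time(), stolen time into account_steal_time(), and the per-tick and batched wrappers account_process_tick(), account_steal_ticks() and account_idle_ticks() sit on top. A sketch of how callers are expected to use them; the real call sites live in the timer and nohz code, outside this file, so treat this as illustrative only:

    /* Regular periodic tick, e.g. from the local timer path: */
    static void example_tick(int user_tick)
    {
            account_process_tick(current, user_tick);
    }

    /* A CPU that slept tickless for 'ticks' jiffies catches up in one go: */
    static void example_nohz_wakeup(unsigned long ticks)
    {
            account_idle_ticks(ticks);
    }

    /* A paravirtualized guest learns it lost 'ticks' jiffies to the host: */
    static void example_steal_update(unsigned long ticks)
    {
            account_steal_ticks(ticks);
    }
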
@@ -4325,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
4325 /* 4450 /*
4326 * Underflow? 4451 * Underflow?
4327 */ 4452 */
4328 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked()))) 4453 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4329 return; 4454 return;
4330 /* 4455 /*
4331 * Is the spinlock portion underflowing? 4456 * Is the spinlock portion underflowing?
@@ -4572,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function);
4572 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4697 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4573 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4698 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4574 */ 4699 */
4575static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4700void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4576 int nr_exclusive, int sync, void *key) 4701 int nr_exclusive, int sync, void *key)
4577{ 4702{
4578 wait_queue_t *curr, *next; 4703 wait_queue_t *curr, *next;
4579 4704
@@ -5011,7 +5136,7 @@ int can_nice(const struct task_struct *p, const int nice)
5011 * sys_setpriority is a more generic, but much slower function that 5136 * sys_setpriority is a more generic, but much slower function that
5012 * does similar things. 5137 * does similar things.
5013 */ 5138 */
5014asmlinkage long sys_nice(int increment) 5139SYSCALL_DEFINE1(nice, int, increment)
5015{ 5140{
5016 long nice, retval; 5141 long nice, retval;
5017 5142
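
From here on, every sys_sched_*() definition in the file switches from an open-coded asmlinkage prototype to the SYSCALL_DEFINEn() macros (n is the argument count, followed by alternating type and name). The expansion still produces an asmlinkage long sys_name(...), but the macro gives architectures a single hook point for wrappers such as the sign-extension of 32-bit arguments some 64-bit ABIs need. A before/after sketch with a made-up syscall, purely illustrative:

    #include <linux/errno.h>
    #include <linux/syscalls.h>

    /* Old style:
     *   asmlinkage long sys_example(int val) { ... }
     */

    /* New style: */
    SYSCALL_DEFINE1(example, int, val)
    {
            if (val < 0)
                    return -EINVAL;
            return 0;
    }
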
@@ -5318,8 +5443,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5318 * @policy: new policy. 5443 * @policy: new policy.
5319 * @param: structure containing the new RT priority. 5444 * @param: structure containing the new RT priority.
5320 */ 5445 */
5321asmlinkage long 5446SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5322sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5447 struct sched_param __user *, param)
5323{ 5448{
5324 /* negative values for policy are not valid */ 5449 /* negative values for policy are not valid */
5325 if (policy < 0) 5450 if (policy < 0)
@@ -5333,7 +5458,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5333 * @pid: the pid in question. 5458 * @pid: the pid in question.
5334 * @param: structure containing the new RT priority. 5459 * @param: structure containing the new RT priority.
5335 */ 5460 */
5336asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 5461SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5337{ 5462{
5338 return do_sched_setscheduler(pid, -1, param); 5463 return do_sched_setscheduler(pid, -1, param);
5339} 5464}
@@ -5342,7 +5467,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
5342 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 5467 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5343 * @pid: the pid in question. 5468 * @pid: the pid in question.
5344 */ 5469 */
5345asmlinkage long sys_sched_getscheduler(pid_t pid) 5470SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5346{ 5471{
5347 struct task_struct *p; 5472 struct task_struct *p;
5348 int retval; 5473 int retval;
@@ -5367,7 +5492,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
5367 * @pid: the pid in question. 5492 * @pid: the pid in question.
5368 * @param: structure containing the RT priority. 5493 * @param: structure containing the RT priority.
5369 */ 5494 */
5370asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 5495SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5371{ 5496{
5372 struct sched_param lp; 5497 struct sched_param lp;
5373 struct task_struct *p; 5498 struct task_struct *p;
@@ -5401,10 +5526,9 @@ out_unlock:
5401 return retval; 5526 return retval;
5402} 5527}
5403 5528
5404long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5529long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5405{ 5530{
5406 cpumask_t cpus_allowed; 5531 cpumask_var_t cpus_allowed, new_mask;
5407 cpumask_t new_mask = *in_mask;
5408 struct task_struct *p; 5532 struct task_struct *p;
5409 int retval; 5533 int retval;
5410 5534
@@ -5426,6 +5550,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5426 get_task_struct(p); 5550 get_task_struct(p);
5427 read_unlock(&tasklist_lock); 5551 read_unlock(&tasklist_lock);
5428 5552
5553 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5554 retval = -ENOMEM;
5555 goto out_put_task;
5556 }
5557 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5558 retval = -ENOMEM;
5559 goto out_free_cpus_allowed;
5560 }
5429 retval = -EPERM; 5561 retval = -EPERM;
5430 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5562 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
5431 goto out_unlock; 5563 goto out_unlock;
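
sched_setaffinity() now needs two temporary masks, allocated up front with GFP_KERNEL and released through cascading labels in reverse order — the standard kernel unwinding shape the new out_free_cpus_allowed/out_put_task labels implement. Stripped to its skeleton (hypothetical function; the permission checks and the retry-on-cpuset-race loop are elided):

    static int example_setaffinity(struct task_struct *p,
                                   const struct cpumask *in_mask)
    {
            cpumask_var_t cpus_allowed, new_mask;
            int retval = -ENOMEM;

            if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
                    goto out;
            if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
                    goto out_free_cpus_allowed;

            cpuset_cpus_allowed(p, cpus_allowed);           /* cpuset limit    */
            cpumask_and(new_mask, in_mask, cpus_allowed);   /* request & limit */
            retval = set_cpus_allowed_ptr(p, new_mask);

            free_cpumask_var(new_mask);
    out_free_cpus_allowed:
            free_cpumask_var(cpus_allowed);
    out:
            return retval;
    }
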
@@ -5434,37 +5566,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5434 if (retval) 5566 if (retval)
5435 goto out_unlock; 5567 goto out_unlock;
5436 5568
5437 cpuset_cpus_allowed(p, &cpus_allowed); 5569 cpuset_cpus_allowed(p, cpus_allowed);
5438 cpus_and(new_mask, new_mask, cpus_allowed); 5570 cpumask_and(new_mask, in_mask, cpus_allowed);
5439 again: 5571 again:
5440 retval = set_cpus_allowed_ptr(p, &new_mask); 5572 retval = set_cpus_allowed_ptr(p, new_mask);
5441 5573
5442 if (!retval) { 5574 if (!retval) {
5443 cpuset_cpus_allowed(p, &cpus_allowed); 5575 cpuset_cpus_allowed(p, cpus_allowed);
5444 if (!cpus_subset(new_mask, cpus_allowed)) { 5576 if (!cpumask_subset(new_mask, cpus_allowed)) {
5445 /* 5577 /*
5446 * We must have raced with a concurrent cpuset 5578 * We must have raced with a concurrent cpuset
5447 * update. Just reset the cpus_allowed to the 5579 * update. Just reset the cpus_allowed to the
5448 * cpuset's cpus_allowed 5580 * cpuset's cpus_allowed
5449 */ 5581 */
5450 new_mask = cpus_allowed; 5582 cpumask_copy(new_mask, cpus_allowed);
5451 goto again; 5583 goto again;
5452 } 5584 }
5453 } 5585 }
5454out_unlock: 5586out_unlock:
5587 free_cpumask_var(new_mask);
5588out_free_cpus_allowed:
5589 free_cpumask_var(cpus_allowed);
5590out_put_task:
5455 put_task_struct(p); 5591 put_task_struct(p);
5456 put_online_cpus(); 5592 put_online_cpus();
5457 return retval; 5593 return retval;
5458} 5594}
5459 5595
5460static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5596static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5461 cpumask_t *new_mask) 5597 struct cpumask *new_mask)
5462{ 5598{
5463 if (len < sizeof(cpumask_t)) { 5599 if (len < cpumask_size())
5464 memset(new_mask, 0, sizeof(cpumask_t)); 5600 cpumask_clear(new_mask);
5465 } else if (len > sizeof(cpumask_t)) { 5601 else if (len > cpumask_size())
5466 len = sizeof(cpumask_t); 5602 len = cpumask_size();
5467 } 5603
5468 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5604 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5469} 5605}
5470 5606
@@ -5474,20 +5610,23 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5474 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5610 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5475 * @user_mask_ptr: user-space pointer to the new cpu mask 5611 * @user_mask_ptr: user-space pointer to the new cpu mask
5476 */ 5612 */
5477asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5613SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5478 unsigned long __user *user_mask_ptr) 5614 unsigned long __user *, user_mask_ptr)
5479{ 5615{
5480 cpumask_t new_mask; 5616 cpumask_var_t new_mask;
5481 int retval; 5617 int retval;
5482 5618
5483 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5619 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5484 if (retval) 5620 return -ENOMEM;
5485 return retval;
5486 5621
5487 return sched_setaffinity(pid, &new_mask); 5622 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5623 if (retval == 0)
5624 retval = sched_setaffinity(pid, new_mask);
5625 free_cpumask_var(new_mask);
5626 return retval;
5488} 5627}
5489 5628
5490long sched_getaffinity(pid_t pid, cpumask_t *mask) 5629long sched_getaffinity(pid_t pid, struct cpumask *mask)
5491{ 5630{
5492 struct task_struct *p; 5631 struct task_struct *p;
5493 int retval; 5632 int retval;
@@ -5504,7 +5643,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5504 if (retval) 5643 if (retval)
5505 goto out_unlock; 5644 goto out_unlock;
5506 5645
5507 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5646 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5508 5647
5509out_unlock: 5648out_unlock:
5510 read_unlock(&tasklist_lock); 5649 read_unlock(&tasklist_lock);
@@ -5519,23 +5658,28 @@ out_unlock:
5519 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 5658 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5520 * @user_mask_ptr: user-space pointer to hold the current cpu mask 5659 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5521 */ 5660 */
5522asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 5661SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5523 unsigned long __user *user_mask_ptr) 5662 unsigned long __user *, user_mask_ptr)
5524{ 5663{
5525 int ret; 5664 int ret;
5526 cpumask_t mask; 5665 cpumask_var_t mask;
5527 5666
5528 if (len < sizeof(cpumask_t)) 5667 if (len < cpumask_size())
5529 return -EINVAL; 5668 return -EINVAL;
5530 5669
5531 ret = sched_getaffinity(pid, &mask); 5670 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5532 if (ret < 0) 5671 return -ENOMEM;
5533 return ret;
5534 5672
5535 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5673 ret = sched_getaffinity(pid, mask);
5536 return -EFAULT; 5674 if (ret == 0) {
5675 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5676 ret = -EFAULT;
5677 else
5678 ret = cpumask_size();
5679 }
5680 free_cpumask_var(mask);
5537 5681
5538 return sizeof(cpumask_t); 5682 return ret;
5539} 5683}
5540 5684
5541/** 5685/**
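
For reference, user space normally reaches these two syscalls through the glibc wrappers with a fixed-size cpu_set_t; the cpumask_size() handling above is the kernel side of that contract. A small self-contained user-space example (plain C, not kernel code, not part of the commit):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);                       /* pin ourselves to CPU 0 */
            if (sched_setaffinity(0, sizeof(set), &set) != 0)
                    perror("sched_setaffinity");

            CPU_ZERO(&set);
            if (sched_getaffinity(0, sizeof(set), &set) == 0)
                    printf("allowed cpus: %d\n", CPU_COUNT(&set));

            return 0;
    }
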
@@ -5544,7 +5688,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5544 * This function yields the current CPU to other tasks. If there are no 5688 * This function yields the current CPU to other tasks. If there are no
5545 * other threads running on this CPU then this function will return. 5689 * other threads running on this CPU then this function will return.
5546 */ 5690 */
5547asmlinkage long sys_sched_yield(void) 5691SYSCALL_DEFINE0(sched_yield)
5548{ 5692{
5549 struct rq *rq = this_rq_lock(); 5693 struct rq *rq = this_rq_lock();
5550 5694
@@ -5685,7 +5829,7 @@ long __sched io_schedule_timeout(long timeout)
5685 * this syscall returns the maximum rt_priority that can be used 5829 * this syscall returns the maximum rt_priority that can be used
5686 * by a given scheduling class. 5830 * by a given scheduling class.
5687 */ 5831 */
5688asmlinkage long sys_sched_get_priority_max(int policy) 5832SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5689{ 5833{
5690 int ret = -EINVAL; 5834 int ret = -EINVAL;
5691 5835
@@ -5710,7 +5854,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
5710 * this syscall returns the minimum rt_priority that can be used 5854 * this syscall returns the minimum rt_priority that can be used
5711 * by a given scheduling class. 5855 * by a given scheduling class.
5712 */ 5856 */
5713asmlinkage long sys_sched_get_priority_min(int policy) 5857SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5714{ 5858{
5715 int ret = -EINVAL; 5859 int ret = -EINVAL;
5716 5860
@@ -5735,8 +5879,8 @@ asmlinkage long sys_sched_get_priority_min(int policy)
5735 * this syscall writes the default timeslice value of a given process 5879 * this syscall writes the default timeslice value of a given process
5736 * into the user-space timespec buffer. A value of '0' means infinity. 5880 * into the user-space timespec buffer. A value of '0' means infinity.
5737 */ 5881 */
5738asmlinkage 5882SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5739long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 5883 struct timespec __user *, interval)
5740{ 5884{
5741 struct task_struct *p; 5885 struct task_struct *p;
5742 unsigned int time_slice; 5886 unsigned int time_slice;
@@ -5877,7 +6021,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5877 idle->se.exec_start = sched_clock(); 6021 idle->se.exec_start = sched_clock();
5878 6022
5879 idle->prio = idle->normal_prio = MAX_PRIO; 6023 idle->prio = idle->normal_prio = MAX_PRIO;
5880 idle->cpus_allowed = cpumask_of_cpu(cpu); 6024 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5881 __set_task_cpu(idle, cpu); 6025 __set_task_cpu(idle, cpu);
5882 6026
5883 rq->curr = rq->idle = idle; 6027 rq->curr = rq->idle = idle;
@@ -5904,9 +6048,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5904 * indicates which cpus entered this state. This is used 6048 * indicates which cpus entered this state. This is used
5905 * in the rcu update to wait only for active cpus. For system 6049 * in the rcu update to wait only for active cpus. For system
5906 * which do not switch off the HZ timer nohz_cpu_mask should 6050 * which do not switch off the HZ timer nohz_cpu_mask should
5907 * always be CPU_MASK_NONE. 6051 * always be CPU_BITS_NONE.
5908 */ 6052 */
5909cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 6053cpumask_var_t nohz_cpu_mask;
5910 6054
5911/* 6055/*
5912 * Increase the granularity value when there are more CPUs, 6056 * Increase the granularity value when there are more CPUs,
@@ -5961,7 +6105,7 @@ static inline void sched_init_granularity(void)
5961 * task must not exit() & deallocate itself prematurely. The 6105 * task must not exit() & deallocate itself prematurely. The
5962 * call is not atomic; no spinlocks may be held. 6106 * call is not atomic; no spinlocks may be held.
5963 */ 6107 */
5964int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 6108int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5965{ 6109{
5966 struct migration_req req; 6110 struct migration_req req;
5967 unsigned long flags; 6111 unsigned long flags;
@@ -5969,13 +6113,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5969 int ret = 0; 6113 int ret = 0;
5970 6114
5971 rq = task_rq_lock(p, &flags); 6115 rq = task_rq_lock(p, &flags);
5972 if (!cpus_intersects(*new_mask, cpu_online_map)) { 6116 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
5973 ret = -EINVAL; 6117 ret = -EINVAL;
5974 goto out; 6118 goto out;
5975 } 6119 }
5976 6120
5977 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6121 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5978 !cpus_equal(p->cpus_allowed, *new_mask))) { 6122 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5979 ret = -EINVAL; 6123 ret = -EINVAL;
5980 goto out; 6124 goto out;
5981 } 6125 }
@@ -5983,15 +6127,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5983 if (p->sched_class->set_cpus_allowed) 6127 if (p->sched_class->set_cpus_allowed)
5984 p->sched_class->set_cpus_allowed(p, new_mask); 6128 p->sched_class->set_cpus_allowed(p, new_mask);
5985 else { 6129 else {
5986 p->cpus_allowed = *new_mask; 6130 cpumask_copy(&p->cpus_allowed, new_mask);
5987 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 6131 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5988 } 6132 }
5989 6133
5990 /* Can the task run on the task's current CPU? If so, we're done */ 6134 /* Can the task run on the task's current CPU? If so, we're done */
5991 if (cpu_isset(task_cpu(p), *new_mask)) 6135 if (cpumask_test_cpu(task_cpu(p), new_mask))
5992 goto out; 6136 goto out;
5993 6137
5994 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6138 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
5995 /* Need help from migration thread: drop lock and wait. */ 6139 /* Need help from migration thread: drop lock and wait. */
5996 task_rq_unlock(rq, &flags); 6140 task_rq_unlock(rq, &flags);
5997 wake_up_process(rq->migration_thread); 6141 wake_up_process(rq->migration_thread);
@@ -6033,7 +6177,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6033 if (task_cpu(p) != src_cpu) 6177 if (task_cpu(p) != src_cpu)
6034 goto done; 6178 goto done;
6035 /* Affinity changed (again). */ 6179 /* Affinity changed (again). */
6036 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6180 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6037 goto fail; 6181 goto fail;
6038 6182
6039 on_rq = p->se.on_rq; 6183 on_rq = p->se.on_rq;
@@ -6130,50 +6274,41 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6130 */ 6274 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6275static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6276{
6133 unsigned long flags;
6134 cpumask_t mask;
6135 struct rq *rq;
6136 int dest_cpu; 6277 int dest_cpu;
6278 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
6137 6279
6138 do { 6280again:
6139 /* On same node? */ 6281 /* Look for allowed, online CPU in same node. */
6140 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6282 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6141 cpus_and(mask, mask, p->cpus_allowed); 6283 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6142 dest_cpu = any_online_cpu(mask); 6284 goto move;
6143 6285
6144 /* On any allowed CPU? */ 6286 /* Any allowed, online CPU? */
6145 if (dest_cpu >= nr_cpu_ids) 6287 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6146 dest_cpu = any_online_cpu(p->cpus_allowed); 6288 if (dest_cpu < nr_cpu_ids)
6289 goto move;
6147 6290
6148 /* No more Mr. Nice Guy. */ 6291 /* No more Mr. Nice Guy. */
6149 if (dest_cpu >= nr_cpu_ids) { 6292 if (dest_cpu >= nr_cpu_ids) {
6150 cpumask_t cpus_allowed; 6293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6151 6294 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6152 cpuset_cpus_allowed_locked(p, &cpus_allowed);
6153 /*
6154 * Try to stay on the same cpuset, where the
6155 * current cpuset may be a subset of all cpus.
6156 * The cpuset_cpus_allowed_locked() variant of
6157 * cpuset_cpus_allowed() will not block. It must be
6158 * called within calls to cpuset_lock/cpuset_unlock.
6159 */
6160 rq = task_rq_lock(p, &flags);
6161 p->cpus_allowed = cpus_allowed;
6162 dest_cpu = any_online_cpu(p->cpus_allowed);
6163 task_rq_unlock(rq, &flags);
6164 6295
6165 /* 6296 /*
6166 * Don't tell them about moving exiting tasks or 6297 * Don't tell them about moving exiting tasks or
6167 * kernel threads (both mm NULL), since they never 6298 * kernel threads (both mm NULL), since they never
6168 * leave kernel. 6299 * leave kernel.
6169 */ 6300 */
6170 if (p->mm && printk_ratelimit()) { 6301 if (p->mm && printk_ratelimit()) {
6171 printk(KERN_INFO "process %d (%s) no " 6302 printk(KERN_INFO "process %d (%s) no "
6172 "longer affine to cpu%d\n", 6303 "longer affine to cpu%d\n",
6173 task_pid_nr(p), p->comm, dead_cpu); 6304 task_pid_nr(p), p->comm, dead_cpu);
6174 }
6175 } 6305 }
6176 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6306 }
6307
6308move:
6309 /* It can have affinity changed while we were choosing. */
6310 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6311 goto again;
6177} 6312}
6178 6313
6179/* 6314/*
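
The rewritten move_task_off_dead_cpu() replaces the old do/while with an explicit escalation: first an allowed, online CPU on the dead CPU's node, then any allowed online CPU, and only then the "no more Mr. Nice Guy" step of widening the task's affinity from its cpuset — retrying from the top if the affinity races with the choice. The selection part in isolation (hypothetical helper; the ratelimited printk and the retry goto are left out):

    static int example_pick_dest(struct task_struct *p, int dead_cpu)
    {
            const struct cpumask *nodemask =
                            cpumask_of_node(cpu_to_node(dead_cpu));
            int dest_cpu;

            /* 1) allowed + online CPU on the same node */
            for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
                    if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                            return dest_cpu;

            /* 2) any allowed, online CPU */
            dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
            if (dest_cpu < nr_cpu_ids)
                    return dest_cpu;

            /* 3) widen the affinity from the cpuset and take what we get */
            cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
            return cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
    }
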
@@ -6185,7 +6320,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6185 */ 6320 */
6186static void migrate_nr_uninterruptible(struct rq *rq_src) 6321static void migrate_nr_uninterruptible(struct rq *rq_src)
6187{ 6322{
6188 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6323 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6189 unsigned long flags; 6324 unsigned long flags;
6190 6325
6191 local_irq_save(flags); 6326 local_irq_save(flags);
@@ -6475,7 +6610,7 @@ static void set_rq_online(struct rq *rq)
6475 if (!rq->online) { 6610 if (!rq->online) {
6476 const struct sched_class *class; 6611 const struct sched_class *class;
6477 6612
6478 cpu_set(rq->cpu, rq->rd->online); 6613 cpumask_set_cpu(rq->cpu, rq->rd->online);
6479 rq->online = 1; 6614 rq->online = 1;
6480 6615
6481 for_each_class(class) { 6616 for_each_class(class) {
@@ -6495,7 +6630,7 @@ static void set_rq_offline(struct rq *rq)
6495 class->rq_offline(rq); 6630 class->rq_offline(rq);
6496 } 6631 }
6497 6632
6498 cpu_clear(rq->cpu, rq->rd->online); 6633 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6499 rq->online = 0; 6634 rq->online = 0;
6500 } 6635 }
6501} 6636}
@@ -6536,7 +6671,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6536 rq = cpu_rq(cpu); 6671 rq = cpu_rq(cpu);
6537 spin_lock_irqsave(&rq->lock, flags); 6672 spin_lock_irqsave(&rq->lock, flags);
6538 if (rq->rd) { 6673 if (rq->rd) {
6539 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6674 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6540 6675
6541 set_rq_online(rq); 6676 set_rq_online(rq);
6542 } 6677 }
@@ -6550,7 +6685,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6550 break; 6685 break;
6551 /* Unbind it from offline cpu so it can run. Fall thru. */ 6686 /* Unbind it from offline cpu so it can run. Fall thru. */
6552 kthread_bind(cpu_rq(cpu)->migration_thread, 6687 kthread_bind(cpu_rq(cpu)->migration_thread,
6553 any_online_cpu(cpu_online_map)); 6688 cpumask_any(cpu_online_mask));
6554 kthread_stop(cpu_rq(cpu)->migration_thread); 6689 kthread_stop(cpu_rq(cpu)->migration_thread);
6555 cpu_rq(cpu)->migration_thread = NULL; 6690 cpu_rq(cpu)->migration_thread = NULL;
6556 break; 6691 break;
@@ -6600,7 +6735,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6600 rq = cpu_rq(cpu); 6735 rq = cpu_rq(cpu);
6601 spin_lock_irqsave(&rq->lock, flags); 6736 spin_lock_irqsave(&rq->lock, flags);
6602 if (rq->rd) { 6737 if (rq->rd) {
6603 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6738 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6604 set_rq_offline(rq); 6739 set_rq_offline(rq);
6605 } 6740 }
6606 spin_unlock_irqrestore(&rq->lock, flags); 6741 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6639,13 +6774,13 @@ early_initcall(migration_init);
6639#ifdef CONFIG_SCHED_DEBUG 6774#ifdef CONFIG_SCHED_DEBUG
6640 6775
6641static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6776static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6642 cpumask_t *groupmask) 6777 struct cpumask *groupmask)
6643{ 6778{
6644 struct sched_group *group = sd->groups; 6779 struct sched_group *group = sd->groups;
6645 char str[256]; 6780 char str[256];
6646 6781
6647 cpulist_scnprintf(str, sizeof(str), sd->span); 6782 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6648 cpus_clear(*groupmask); 6783 cpumask_clear(groupmask);
6649 6784
6650 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6785 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6651 6786
@@ -6659,11 +6794,11 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6659 6794
6660 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6795 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6661 6796
6662 if (!cpu_isset(cpu, sd->span)) { 6797 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6663 printk(KERN_ERR "ERROR: domain->span does not contain " 6798 printk(KERN_ERR "ERROR: domain->span does not contain "
6664 "CPU%d\n", cpu); 6799 "CPU%d\n", cpu);
6665 } 6800 }
6666 if (!cpu_isset(cpu, group->cpumask)) { 6801 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6667 printk(KERN_ERR "ERROR: domain->groups does not contain" 6802 printk(KERN_ERR "ERROR: domain->groups does not contain"
6668 " CPU%d\n", cpu); 6803 " CPU%d\n", cpu);
6669 } 6804 }
@@ -6683,31 +6818,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6683 break; 6818 break;
6684 } 6819 }
6685 6820
6686 if (!cpus_weight(group->cpumask)) { 6821 if (!cpumask_weight(sched_group_cpus(group))) {
6687 printk(KERN_CONT "\n"); 6822 printk(KERN_CONT "\n");
6688 printk(KERN_ERR "ERROR: empty group\n"); 6823 printk(KERN_ERR "ERROR: empty group\n");
6689 break; 6824 break;
6690 } 6825 }
6691 6826
6692 if (cpus_intersects(*groupmask, group->cpumask)) { 6827 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6693 printk(KERN_CONT "\n"); 6828 printk(KERN_CONT "\n");
6694 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6829 printk(KERN_ERR "ERROR: repeated CPUs\n");
6695 break; 6830 break;
6696 } 6831 }
6697 6832
6698 cpus_or(*groupmask, *groupmask, group->cpumask); 6833 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6699 6834
6700 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6835 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6701 printk(KERN_CONT " %s", str); 6836 printk(KERN_CONT " %s", str);
6702 6837
6703 group = group->next; 6838 group = group->next;
6704 } while (group != sd->groups); 6839 } while (group != sd->groups);
6705 printk(KERN_CONT "\n"); 6840 printk(KERN_CONT "\n");
6706 6841
6707 if (!cpus_equal(sd->span, *groupmask)) 6842 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6708 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6843 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6709 6844
6710 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6845 if (sd->parent &&
6846 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6711 printk(KERN_ERR "ERROR: parent span is not a superset " 6847 printk(KERN_ERR "ERROR: parent span is not a superset "
6712 "of domain->span\n"); 6848 "of domain->span\n");
6713 return 0; 6849 return 0;
@@ -6715,7 +6851,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6715 6851
6716static void sched_domain_debug(struct sched_domain *sd, int cpu) 6852static void sched_domain_debug(struct sched_domain *sd, int cpu)
6717{ 6853{
6718 cpumask_t *groupmask; 6854 cpumask_var_t groupmask;
6719 int level = 0; 6855 int level = 0;
6720 6856
6721 if (!sd) { 6857 if (!sd) {
@@ -6725,8 +6861,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6725 6861
6726 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6862 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6727 6863
6728 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6864 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6729 if (!groupmask) {
6730 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6865 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6731 return; 6866 return;
6732 } 6867 }
@@ -6739,7 +6874,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6739 if (!sd) 6874 if (!sd)
6740 break; 6875 break;
6741 } 6876 }
6742 kfree(groupmask); 6877 free_cpumask_var(groupmask);
6743} 6878}
6744#else /* !CONFIG_SCHED_DEBUG */ 6879#else /* !CONFIG_SCHED_DEBUG */
6745# define sched_domain_debug(sd, cpu) do { } while (0) 6880# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6747,7 +6882,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6747 6882
6748static int sd_degenerate(struct sched_domain *sd) 6883static int sd_degenerate(struct sched_domain *sd)
6749{ 6884{
6750 if (cpus_weight(sd->span) == 1) 6885 if (cpumask_weight(sched_domain_span(sd)) == 1)
6751 return 1; 6886 return 1;
6752 6887
6753 /* Following flags need at least 2 groups */ 6888 /* Following flags need at least 2 groups */
@@ -6778,7 +6913,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6778 if (sd_degenerate(parent)) 6913 if (sd_degenerate(parent))
6779 return 1; 6914 return 1;
6780 6915
6781 if (!cpus_equal(sd->span, parent->span)) 6916 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6782 return 0; 6917 return 0;
6783 6918
6784 /* Does parent contain flags not in child? */ 6919 /* Does parent contain flags not in child? */
@@ -6802,6 +6937,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6802 return 1; 6937 return 1;
6803} 6938}
6804 6939
6940static void free_rootdomain(struct root_domain *rd)
6941{
6942 cpupri_cleanup(&rd->cpupri);
6943
6944 free_cpumask_var(rd->rto_mask);
6945 free_cpumask_var(rd->online);
6946 free_cpumask_var(rd->span);
6947 kfree(rd);
6948}
6949
6805static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6950static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6806{ 6951{
6807 unsigned long flags; 6952 unsigned long flags;
@@ -6811,38 +6956,62 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6811 if (rq->rd) { 6956 if (rq->rd) {
6812 struct root_domain *old_rd = rq->rd; 6957 struct root_domain *old_rd = rq->rd;
6813 6958
6814 if (cpu_isset(rq->cpu, old_rd->online)) 6959 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6815 set_rq_offline(rq); 6960 set_rq_offline(rq);
6816 6961
6817 cpu_clear(rq->cpu, old_rd->span); 6962 cpumask_clear_cpu(rq->cpu, old_rd->span);
6818 6963
6819 if (atomic_dec_and_test(&old_rd->refcount)) 6964 if (atomic_dec_and_test(&old_rd->refcount))
6820 kfree(old_rd); 6965 free_rootdomain(old_rd);
6821 } 6966 }
6822 6967
6823 atomic_inc(&rd->refcount); 6968 atomic_inc(&rd->refcount);
6824 rq->rd = rd; 6969 rq->rd = rd;
6825 6970
6826 cpu_set(rq->cpu, rd->span); 6971 cpumask_set_cpu(rq->cpu, rd->span);
6827 if (cpu_isset(rq->cpu, cpu_online_map)) 6972 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6828 set_rq_online(rq); 6973 set_rq_online(rq);
6829 6974
6830 spin_unlock_irqrestore(&rq->lock, flags); 6975 spin_unlock_irqrestore(&rq->lock, flags);
6831} 6976}
6832 6977
6833static void init_rootdomain(struct root_domain *rd) 6978static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
6834{ 6979{
6835 memset(rd, 0, sizeof(*rd)); 6980 memset(rd, 0, sizeof(*rd));
6836 6981
6837 cpus_clear(rd->span); 6982 if (bootmem) {
6838 cpus_clear(rd->online); 6983 alloc_bootmem_cpumask_var(&def_root_domain.span);
6984 alloc_bootmem_cpumask_var(&def_root_domain.online);
6985 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
6986 cpupri_init(&rd->cpupri, true);
6987 return 0;
6988 }
6839 6989
6840 cpupri_init(&rd->cpupri); 6990 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6991 goto out;
6992 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6993 goto free_span;
6994 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6995 goto free_online;
6996
6997 if (cpupri_init(&rd->cpupri, false) != 0)
6998 goto free_rto_mask;
6999 return 0;
7000
7001free_rto_mask:
7002 free_cpumask_var(rd->rto_mask);
7003free_online:
7004 free_cpumask_var(rd->online);
7005free_span:
7006 free_cpumask_var(rd->span);
7007out:
7008 return -ENOMEM;
6841} 7009}
6842 7010
6843static void init_defrootdomain(void) 7011static void init_defrootdomain(void)
6844{ 7012{
6845 init_rootdomain(&def_root_domain); 7013 init_rootdomain(&def_root_domain, true);
7014
6846 atomic_set(&def_root_domain.refcount, 1); 7015 atomic_set(&def_root_domain.refcount, 1);
6847} 7016}
6848 7017
@@ -6854,7 +7023,10 @@ static struct root_domain *alloc_rootdomain(void)
6854 if (!rd) 7023 if (!rd)
6855 return NULL; 7024 return NULL;
6856 7025
6857 init_rootdomain(rd); 7026 if (init_rootdomain(rd, false) != 0) {
7027 kfree(rd);
7028 return NULL;
7029 }
6858 7030
6859 return rd; 7031 return rd;
6860} 7032}
@@ -6896,19 +7068,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6896} 7068}
6897 7069
6898/* cpus with isolated domains */ 7070/* cpus with isolated domains */
6899static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 7071static cpumask_var_t cpu_isolated_map;
6900 7072
6901/* Setup the mask of cpus configured for isolated domains */ 7073/* Setup the mask of cpus configured for isolated domains */
6902static int __init isolated_cpu_setup(char *str) 7074static int __init isolated_cpu_setup(char *str)
6903{ 7075{
6904 static int __initdata ints[NR_CPUS]; 7076 cpulist_parse(str, cpu_isolated_map);
6905 int i;
6906
6907 str = get_options(str, ARRAY_SIZE(ints), ints);
6908 cpus_clear(cpu_isolated_map);
6909 for (i = 1; i <= ints[0]; i++)
6910 if (ints[i] < NR_CPUS)
6911 cpu_set(ints[i], cpu_isolated_map);
6912 return 1; 7077 return 1;
6913} 7078}
6914 7079
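
isolated_cpu_setup() drops the hand-rolled get_options() loop: cpulist_parse() already understands the isolcpus= value as a CPU list, i.e. comma-separated CPU numbers and ranges such as 1,3-5. A usage sketch of the same helper on a scratch mask (hypothetical function, minimal error handling):

    static int example_parse_cpulist(const char *str)
    {
            cpumask_var_t mask;

            if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                    return -ENOMEM;

            /* "1,3-5" -> CPUs 1, 3, 4 and 5 */
            if (cpulist_parse(str, mask) == 0)
                    printk(KERN_INFO "parsed %d cpus\n", cpumask_weight(mask));

            free_cpumask_var(mask);
            return 0;
    }
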
@@ -6917,42 +7082,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6917/* 7082/*
6918 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 7083 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6919 * to a function which identifies what group(along with sched group) a CPU 7084 * to a function which identifies what group(along with sched group) a CPU
6920 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 7085 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6921 * (due to the fact that we keep track of groups covered with a cpumask_t). 7086 * (due to the fact that we keep track of groups covered with a struct cpumask).
6922 * 7087 *
6923 * init_sched_build_groups will build a circular linked list of the groups 7088 * init_sched_build_groups will build a circular linked list of the groups
6924 * covered by the given span, and will set each group's ->cpumask correctly, 7089 * covered by the given span, and will set each group's ->cpumask correctly,
6925 * and ->cpu_power to 0. 7090 * and ->cpu_power to 0.
6926 */ 7091 */
6927static void 7092static void
6928init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 7093init_sched_build_groups(const struct cpumask *span,
6929 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 7094 const struct cpumask *cpu_map,
7095 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6930 struct sched_group **sg, 7096 struct sched_group **sg,
6931 cpumask_t *tmpmask), 7097 struct cpumask *tmpmask),
6932 cpumask_t *covered, cpumask_t *tmpmask) 7098 struct cpumask *covered, struct cpumask *tmpmask)
6933{ 7099{
6934 struct sched_group *first = NULL, *last = NULL; 7100 struct sched_group *first = NULL, *last = NULL;
6935 int i; 7101 int i;
6936 7102
6937 cpus_clear(*covered); 7103 cpumask_clear(covered);
6938 7104
6939 for_each_cpu_mask_nr(i, *span) { 7105 for_each_cpu(i, span) {
6940 struct sched_group *sg; 7106 struct sched_group *sg;
6941 int group = group_fn(i, cpu_map, &sg, tmpmask); 7107 int group = group_fn(i, cpu_map, &sg, tmpmask);
6942 int j; 7108 int j;
6943 7109
6944 if (cpu_isset(i, *covered)) 7110 if (cpumask_test_cpu(i, covered))
6945 continue; 7111 continue;
6946 7112
6947 cpus_clear(sg->cpumask); 7113 cpumask_clear(sched_group_cpus(sg));
6948 sg->__cpu_power = 0; 7114 sg->__cpu_power = 0;
6949 7115
6950 for_each_cpu_mask_nr(j, *span) { 7116 for_each_cpu(j, span) {
6951 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 7117 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6952 continue; 7118 continue;
6953 7119
6954 cpu_set(j, *covered); 7120 cpumask_set_cpu(j, covered);
6955 cpu_set(j, sg->cpumask); 7121 cpumask_set_cpu(j, sched_group_cpus(sg));
6956 } 7122 }
6957 if (!first) 7123 if (!first)
6958 first = sg; 7124 first = sg;
@@ -7016,23 +7182,21 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7016 * should be one that prevents unnecessary balancing, but also spreads tasks 7182 * should be one that prevents unnecessary balancing, but also spreads tasks
7017 * out optimally. 7183 * out optimally.
7018 */ 7184 */
7019static void sched_domain_node_span(int node, cpumask_t *span) 7185static void sched_domain_node_span(int node, struct cpumask *span)
7020{ 7186{
7021 nodemask_t used_nodes; 7187 nodemask_t used_nodes;
7022 node_to_cpumask_ptr(nodemask, node);
7023 int i; 7188 int i;
7024 7189
7025 cpus_clear(*span); 7190 cpumask_clear(span);
7026 nodes_clear(used_nodes); 7191 nodes_clear(used_nodes);
7027 7192
7028 cpus_or(*span, *span, *nodemask); 7193 cpumask_or(span, span, cpumask_of_node(node));
7029 node_set(node, used_nodes); 7194 node_set(node, used_nodes);
7030 7195
7031 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 7196 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
7032 int next_node = find_next_best_node(node, &used_nodes); 7197 int next_node = find_next_best_node(node, &used_nodes);
7033 7198
7034 node_to_cpumask_ptr_next(nodemask, next_node); 7199 cpumask_or(span, span, cpumask_of_node(next_node));
7035 cpus_or(*span, *span, *nodemask);
7036 } 7200 }
7037} 7201}
7038#endif /* CONFIG_NUMA */ 7202#endif /* CONFIG_NUMA */
@@ -7040,18 +7204,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7040int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7204int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7041 7205
7042/* 7206/*
7207 * The cpus mask in sched_group and sched_domain hangs off the end.
7208 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7209 * for nr_cpu_ids < CONFIG_NR_CPUS.
7210 */
7211struct static_sched_group {
7212 struct sched_group sg;
7213 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7214};
7215
7216struct static_sched_domain {
7217 struct sched_domain sd;
7218 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7219};
7220
7221/*
7043 * SMT sched-domains: 7222 * SMT sched-domains:
7044 */ 7223 */
7045#ifdef CONFIG_SCHED_SMT 7224#ifdef CONFIG_SCHED_SMT
7046static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7225static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7047static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7226static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7048 7227
7049static int 7228static int
7050cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7229cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7051 cpumask_t *unused) 7230 struct sched_group **sg, struct cpumask *unused)
7052{ 7231{
7053 if (sg) 7232 if (sg)
7054 *sg = &per_cpu(sched_group_cpus, cpu); 7233 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7055 return cpu; 7234 return cpu;
7056} 7235}
7057#endif /* CONFIG_SCHED_SMT */ 7236#endif /* CONFIG_SCHED_SMT */
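
The FIXME block above documents the interim trick used for the statically-defined per-CPU domain and group objects: per the comment, the cpus mask in sched_group and sched_domain now hangs off the end of the structure, so a static instance needs a wrapper that places CONFIG_NR_CPUS bits of backing storage immediately behind it, and the sched_group_cpus()/sched_domain_span() accessors hand that storage back as a struct cpumask *. The same pattern reduced to a toy (struct and function names hypothetical; to_cpumask() is the kernel helper that casts a bitmap to struct cpumask *):

    struct example {                        /* ends in a zero-length array  */
            int id;
            unsigned long cpus[0];          /* real bits live past the end  */
    };

    struct static_example {                 /* static instances supply them */
            struct example e;
            DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
    };

    static inline struct cpumask *example_cpus(struct example *e)
    {
            return to_cpumask(e->cpus);
    }

    static struct static_example one_instance;
    /* usage: cpumask_set_cpu(3, example_cpus(&one_instance.e)); */
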
@@ -7060,56 +7239,53 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7060 * multi-core sched-domains: 7239 * multi-core sched-domains:
7061 */ 7240 */
7062#ifdef CONFIG_SCHED_MC 7241#ifdef CONFIG_SCHED_MC
7063static DEFINE_PER_CPU(struct sched_domain, core_domains); 7242static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7064static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7243static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7065#endif /* CONFIG_SCHED_MC */ 7244#endif /* CONFIG_SCHED_MC */
7066 7245
7067#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7246#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7068static int 7247static int
7069cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7248cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7070 cpumask_t *mask) 7249 struct sched_group **sg, struct cpumask *mask)
7071{ 7250{
7072 int group; 7251 int group;
7073 7252
7074 *mask = per_cpu(cpu_sibling_map, cpu); 7253 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7075 cpus_and(*mask, *mask, *cpu_map); 7254 group = cpumask_first(mask);
7076 group = first_cpu(*mask);
7077 if (sg) 7255 if (sg)
7078 *sg = &per_cpu(sched_group_core, group); 7256 *sg = &per_cpu(sched_group_core, group).sg;
7079 return group; 7257 return group;
7080} 7258}
7081#elif defined(CONFIG_SCHED_MC) 7259#elif defined(CONFIG_SCHED_MC)
7082static int 7260static int
7083cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7261cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7084 cpumask_t *unused) 7262 struct sched_group **sg, struct cpumask *unused)
7085{ 7263{
7086 if (sg) 7264 if (sg)
7087 *sg = &per_cpu(sched_group_core, cpu); 7265 *sg = &per_cpu(sched_group_core, cpu).sg;
7088 return cpu; 7266 return cpu;
7089} 7267}
7090#endif 7268#endif
7091 7269
7092static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7270static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7093static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7271static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7094 7272
7095static int 7273static int
7096cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7274cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7097 cpumask_t *mask) 7275 struct sched_group **sg, struct cpumask *mask)
7098{ 7276{
7099 int group; 7277 int group;
7100#ifdef CONFIG_SCHED_MC 7278#ifdef CONFIG_SCHED_MC
7101 *mask = cpu_coregroup_map(cpu); 7279 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7102 cpus_and(*mask, *mask, *cpu_map); 7280 group = cpumask_first(mask);
7103 group = first_cpu(*mask);
7104#elif defined(CONFIG_SCHED_SMT) 7281#elif defined(CONFIG_SCHED_SMT)
7105 *mask = per_cpu(cpu_sibling_map, cpu); 7282 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7106 cpus_and(*mask, *mask, *cpu_map); 7283 group = cpumask_first(mask);
7107 group = first_cpu(*mask);
7108#else 7284#else
7109 group = cpu; 7285 group = cpu;
7110#endif 7286#endif
7111 if (sg) 7287 if (sg)
7112 *sg = &per_cpu(sched_group_phys, group); 7288 *sg = &per_cpu(sched_group_phys, group).sg;
7113 return group; 7289 return group;
7114} 7290}
7115 7291
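Each cpu_to_*_group() helper above now computes the group representative in two steps: AND the topology mask with cpu_map, then take the first CPU of the result (cpumask_and followed by cpumask_first, replacing cpus_and plus first_cpu). A small sketch of that and-then-first-bit idiom with plain 64-bit masks; this is only an analogue, not the kernel cpumask API:

#include <stdint.h>
#include <stdio.h>

/* Index of the lowest set bit, or 64 if the mask is empty
 * (__builtin_ctzll is a GCC/Clang builtin). */
static int first_cpu(uint64_t mask)
{
        return mask ? __builtin_ctzll(mask) : 64;
}

/* Group representative: first CPU in (topology_mask & cpu_map). */
static int to_group(uint64_t topology_mask, uint64_t cpu_map)
{
        return first_cpu(topology_mask & cpu_map);
}

int main(void)
{
        uint64_t siblings = 0x0c;   /* CPUs 2 and 3 share a core */
        uint64_t cpu_map  = 0x0a;   /* only CPUs 1 and 3 are being built */

        printf("representative: %d\n", to_group(siblings, cpu_map)); /* 3 */
        return 0;
}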
@@ -7119,23 +7295,23 @@ cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7119 * groups, so roll our own. Now each node has its own list of groups which 7295 * groups, so roll our own. Now each node has its own list of groups which
7120 * gets dynamically allocated. 7296 * gets dynamically allocated.
7121 */ 7297 */
7122static DEFINE_PER_CPU(struct sched_domain, node_domains); 7298static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
7123static struct sched_group ***sched_group_nodes_bycpu; 7299static struct sched_group ***sched_group_nodes_bycpu;
7124 7300
7125static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7301static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7126static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7302static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7127 7303
7128static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7304static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7129 struct sched_group **sg, cpumask_t *nodemask) 7305 struct sched_group **sg,
7306 struct cpumask *nodemask)
7130{ 7307{
7131 int group; 7308 int group;
7132 7309
7133 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7310 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7134 cpus_and(*nodemask, *nodemask, *cpu_map); 7311 group = cpumask_first(nodemask);
7135 group = first_cpu(*nodemask);
7136 7312
7137 if (sg) 7313 if (sg)
7138 *sg = &per_cpu(sched_group_allnodes, group); 7314 *sg = &per_cpu(sched_group_allnodes, group).sg;
7139 return group; 7315 return group;
7140} 7316}
7141 7317
@@ -7147,11 +7323,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7147 if (!sg) 7323 if (!sg)
7148 return; 7324 return;
7149 do { 7325 do {
7150 for_each_cpu_mask_nr(j, sg->cpumask) { 7326 for_each_cpu(j, sched_group_cpus(sg)) {
7151 struct sched_domain *sd; 7327 struct sched_domain *sd;
7152 7328
7153 sd = &per_cpu(phys_domains, j); 7329 sd = &per_cpu(phys_domains, j).sd;
7154 if (j != first_cpu(sd->groups->cpumask)) { 7330 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7155 /* 7331 /*
7156 * Only add "power" once for each 7332 * Only add "power" once for each
7157 * physical package. 7333 * physical package.
@@ -7168,11 +7344,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7168 7344
7169#ifdef CONFIG_NUMA 7345#ifdef CONFIG_NUMA
7170/* Free memory allocated for various sched_group structures */ 7346/* Free memory allocated for various sched_group structures */
7171static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7347static void free_sched_groups(const struct cpumask *cpu_map,
7348 struct cpumask *nodemask)
7172{ 7349{
7173 int cpu, i; 7350 int cpu, i;
7174 7351
7175 for_each_cpu_mask_nr(cpu, *cpu_map) { 7352 for_each_cpu(cpu, cpu_map) {
7176 struct sched_group **sched_group_nodes 7353 struct sched_group **sched_group_nodes
7177 = sched_group_nodes_bycpu[cpu]; 7354 = sched_group_nodes_bycpu[cpu];
7178 7355
@@ -7182,9 +7359,8 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7182 for (i = 0; i < nr_node_ids; i++) { 7359 for (i = 0; i < nr_node_ids; i++) {
7183 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7360 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7184 7361
7185 *nodemask = node_to_cpumask(i); 7362 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7186 cpus_and(*nodemask, *nodemask, *cpu_map); 7363 if (cpumask_empty(nodemask))
7187 if (cpus_empty(*nodemask))
7188 continue; 7364 continue;
7189 7365
7190 if (sg == NULL) 7366 if (sg == NULL)
@@ -7202,7 +7378,8 @@ next_sg:
7202 } 7378 }
7203} 7379}
7204#else /* !CONFIG_NUMA */ 7380#else /* !CONFIG_NUMA */
7205static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7381static void free_sched_groups(const struct cpumask *cpu_map,
7382 struct cpumask *nodemask)
7206{ 7383{
7207} 7384}
7208#endif /* CONFIG_NUMA */ 7385#endif /* CONFIG_NUMA */
@@ -7228,7 +7405,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7228 7405
7229 WARN_ON(!sd || !sd->groups); 7406 WARN_ON(!sd || !sd->groups);
7230 7407
7231 if (cpu != first_cpu(sd->groups->cpumask)) 7408 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7232 return; 7409 return;
7233 7410
7234 child = sd->child; 7411 child = sd->child;
@@ -7293,48 +7470,6 @@ SD_INIT_FUNC(CPU)
7293 SD_INIT_FUNC(MC) 7470 SD_INIT_FUNC(MC)
7294#endif 7471#endif
7295 7472
7296/*
7297 * To minimize stack usage kmalloc room for cpumasks and share the
7298 * space as the usage in build_sched_domains() dictates. Used only
7299 * if the amount of space is significant.
7300 */
7301struct allmasks {
7302 cpumask_t tmpmask; /* make this one first */
7303 union {
7304 cpumask_t nodemask;
7305 cpumask_t this_sibling_map;
7306 cpumask_t this_core_map;
7307 };
7308 cpumask_t send_covered;
7309
7310#ifdef CONFIG_NUMA
7311 cpumask_t domainspan;
7312 cpumask_t covered;
7313 cpumask_t notcovered;
7314#endif
7315};
7316
7317#if NR_CPUS > 128
7318#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7319static inline void sched_cpumask_alloc(struct allmasks **masks)
7320{
7321 *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
7322}
7323static inline void sched_cpumask_free(struct allmasks *masks)
7324{
7325 kfree(masks);
7326}
7327#else
7328#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7329static inline void sched_cpumask_alloc(struct allmasks **masks)
7330{ }
7331static inline void sched_cpumask_free(struct allmasks *masks)
7332{ }
7333#endif
7334
7335#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7336 ((unsigned long)(a) + offsetof(struct allmasks, v))
7337
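The removed allmasks/SCHED_CPUMASK_* machinery existed only to keep large cpumask scratch variables off the stack when NR_CPUS is big. The cpumask_var_t type that replaces it bakes the same decision into the type itself: for small configured CPU counts it behaves like a plain on-stack array and allocation is a no-op, for large counts it is a pointer that must be allocated and freed. A rough userspace sketch of that dual representation; BIG_MASKS, mask_var_t and NBITS are invented names:

#include <stdlib.h>
#include <string.h>

#define NBITS 1024
#define MASK_WORDS (NBITS / (8 * sizeof(unsigned long)))

#ifdef BIG_MASKS
/* Off-stack flavour: the variable is a pointer, alloc/free are real. */
typedef unsigned long *mask_var_t;

static int alloc_mask_var(mask_var_t *m)
{
        *m = calloc(MASK_WORDS, sizeof(unsigned long));
        return *m != NULL;
}
static void free_mask_var(mask_var_t m) { free(m); }
#else
/* On-stack flavour: a fixed array, alloc just clears it, free is a no-op. */
typedef unsigned long mask_var_t[MASK_WORDS];

static int alloc_mask_var(unsigned long (*m)[MASK_WORDS])
{
        memset(*m, 0, sizeof(*m));
        return 1;
}
static void free_mask_var(unsigned long *m) { (void)m; }
#endif

int main(void)
{
        mask_var_t scratch;

        if (!alloc_mask_var(&scratch))
                return 1;
        /* ... use scratch as an NBITS-wide bitmap ... */
        free_mask_var(scratch);
        return 0;
}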
7338static int default_relax_domain_level = -1; 7473static int default_relax_domain_level = -1;
7339 7474
7340static int __init setup_relax_domain_level(char *str) 7475static int __init setup_relax_domain_level(char *str)
@@ -7374,17 +7509,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7374 * Build sched domains for a given set of cpus and attach the sched domains 7509 * Build sched domains for a given set of cpus and attach the sched domains
7375 * to the individual cpus 7510 * to the individual cpus
7376 */ 7511 */
7377static int __build_sched_domains(const cpumask_t *cpu_map, 7512static int __build_sched_domains(const struct cpumask *cpu_map,
7378 struct sched_domain_attr *attr) 7513 struct sched_domain_attr *attr)
7379{ 7514{
7380 int i; 7515 int i, err = -ENOMEM;
7381 struct root_domain *rd; 7516 struct root_domain *rd;
7382 SCHED_CPUMASK_DECLARE(allmasks); 7517 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7383 cpumask_t *tmpmask; 7518 tmpmask;
7384#ifdef CONFIG_NUMA 7519#ifdef CONFIG_NUMA
7520 cpumask_var_t domainspan, covered, notcovered;
7385 struct sched_group **sched_group_nodes = NULL; 7521 struct sched_group **sched_group_nodes = NULL;
7386 int sd_allnodes = 0; 7522 int sd_allnodes = 0;
7387 7523
7524 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7525 goto out;
7526 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7527 goto free_domainspan;
7528 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7529 goto free_covered;
7530#endif
7531
7532 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7533 goto free_notcovered;
7534 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7535 goto free_nodemask;
7536 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7537 goto free_this_sibling_map;
7538 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7539 goto free_this_core_map;
7540 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7541 goto free_send_covered;
7542
7543#ifdef CONFIG_NUMA
7388 /* 7544 /*
7389 * Allocate the per-node list of sched groups 7545 * Allocate the per-node list of sched groups
7390 */ 7546 */
@@ -7392,75 +7548,57 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7392 GFP_KERNEL); 7548 GFP_KERNEL);
7393 if (!sched_group_nodes) { 7549 if (!sched_group_nodes) {
7394 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7550 printk(KERN_WARNING "Can not alloc sched group node list\n");
7395 return -ENOMEM; 7551 goto free_tmpmask;
7396 } 7552 }
7397#endif 7553#endif
7398 7554
7399 rd = alloc_rootdomain(); 7555 rd = alloc_rootdomain();
7400 if (!rd) { 7556 if (!rd) {
7401 printk(KERN_WARNING "Cannot alloc root domain\n"); 7557 printk(KERN_WARNING "Cannot alloc root domain\n");
7402#ifdef CONFIG_NUMA 7558 goto free_sched_groups;
7403 kfree(sched_group_nodes);
7404#endif
7405 return -ENOMEM;
7406 } 7559 }
7407 7560
7408 /* get space for all scratch cpumask variables */
7409 sched_cpumask_alloc(&allmasks);
7410 if (!allmasks) {
7411 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7412 kfree(rd);
7413#ifdef CONFIG_NUMA 7561#ifdef CONFIG_NUMA
7414 kfree(sched_group_nodes); 7562 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7415#endif
7416 return -ENOMEM;
7417 }
7418
7419 tmpmask = (cpumask_t *)allmasks;
7420
7421
7422#ifdef CONFIG_NUMA
7423 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7424#endif 7563#endif
7425 7564
7426 /* 7565 /*
7427 * Set up domains for cpus specified by the cpu_map. 7566 * Set up domains for cpus specified by the cpu_map.
7428 */ 7567 */
7429 for_each_cpu_mask_nr(i, *cpu_map) { 7568 for_each_cpu(i, cpu_map) {
7430 struct sched_domain *sd = NULL, *p; 7569 struct sched_domain *sd = NULL, *p;
7431 SCHED_CPUMASK_VAR(nodemask, allmasks);
7432 7570
7433 *nodemask = node_to_cpumask(cpu_to_node(i)); 7571 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
7434 cpus_and(*nodemask, *nodemask, *cpu_map);
7435 7572
7436#ifdef CONFIG_NUMA 7573#ifdef CONFIG_NUMA
7437 if (cpus_weight(*cpu_map) > 7574 if (cpumask_weight(cpu_map) >
7438 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7575 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7439 sd = &per_cpu(allnodes_domains, i); 7576 sd = &per_cpu(allnodes_domains, i).sd;
7440 SD_INIT(sd, ALLNODES); 7577 SD_INIT(sd, ALLNODES);
7441 set_domain_attribute(sd, attr); 7578 set_domain_attribute(sd, attr);
7442 sd->span = *cpu_map; 7579 cpumask_copy(sched_domain_span(sd), cpu_map);
7443 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7580 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7444 p = sd; 7581 p = sd;
7445 sd_allnodes = 1; 7582 sd_allnodes = 1;
7446 } else 7583 } else
7447 p = NULL; 7584 p = NULL;
7448 7585
7449 sd = &per_cpu(node_domains, i); 7586 sd = &per_cpu(node_domains, i).sd;
7450 SD_INIT(sd, NODE); 7587 SD_INIT(sd, NODE);
7451 set_domain_attribute(sd, attr); 7588 set_domain_attribute(sd, attr);
7452 sched_domain_node_span(cpu_to_node(i), &sd->span); 7589 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7453 sd->parent = p; 7590 sd->parent = p;
7454 if (p) 7591 if (p)
7455 p->child = sd; 7592 p->child = sd;
7456 cpus_and(sd->span, sd->span, *cpu_map); 7593 cpumask_and(sched_domain_span(sd),
7594 sched_domain_span(sd), cpu_map);
7457#endif 7595#endif
7458 7596
7459 p = sd; 7597 p = sd;
7460 sd = &per_cpu(phys_domains, i); 7598 sd = &per_cpu(phys_domains, i).sd;
7461 SD_INIT(sd, CPU); 7599 SD_INIT(sd, CPU);
7462 set_domain_attribute(sd, attr); 7600 set_domain_attribute(sd, attr);
7463 sd->span = *nodemask; 7601 cpumask_copy(sched_domain_span(sd), nodemask);
7464 sd->parent = p; 7602 sd->parent = p;
7465 if (p) 7603 if (p)
7466 p->child = sd; 7604 p->child = sd;
@@ -7468,11 +7606,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7468 7606
7469#ifdef CONFIG_SCHED_MC 7607#ifdef CONFIG_SCHED_MC
7470 p = sd; 7608 p = sd;
7471 sd = &per_cpu(core_domains, i); 7609 sd = &per_cpu(core_domains, i).sd;
7472 SD_INIT(sd, MC); 7610 SD_INIT(sd, MC);
7473 set_domain_attribute(sd, attr); 7611 set_domain_attribute(sd, attr);
7474 sd->span = cpu_coregroup_map(i); 7612 cpumask_and(sched_domain_span(sd), cpu_map,
7475 cpus_and(sd->span, sd->span, *cpu_map); 7613 cpu_coregroup_mask(i));
7476 sd->parent = p; 7614 sd->parent = p;
7477 p->child = sd; 7615 p->child = sd;
7478 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7616 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7480,11 +7618,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7480 7618
7481#ifdef CONFIG_SCHED_SMT 7619#ifdef CONFIG_SCHED_SMT
7482 p = sd; 7620 p = sd;
7483 sd = &per_cpu(cpu_domains, i); 7621 sd = &per_cpu(cpu_domains, i).sd;
7484 SD_INIT(sd, SIBLING); 7622 SD_INIT(sd, SIBLING);
7485 set_domain_attribute(sd, attr); 7623 set_domain_attribute(sd, attr);
7486 sd->span = per_cpu(cpu_sibling_map, i); 7624 cpumask_and(sched_domain_span(sd),
7487 cpus_and(sd->span, sd->span, *cpu_map); 7625 &per_cpu(cpu_sibling_map, i), cpu_map);
7488 sd->parent = p; 7626 sd->parent = p;
7489 p->child = sd; 7627 p->child = sd;
7490 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7628 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7493,13 +7631,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7493 7631
7494#ifdef CONFIG_SCHED_SMT 7632#ifdef CONFIG_SCHED_SMT
7495 /* Set up CPU (sibling) groups */ 7633 /* Set up CPU (sibling) groups */
7496 for_each_cpu_mask_nr(i, *cpu_map) { 7634 for_each_cpu(i, cpu_map) {
7497 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7635 cpumask_and(this_sibling_map,
7498 SCHED_CPUMASK_VAR(send_covered, allmasks); 7636 &per_cpu(cpu_sibling_map, i), cpu_map);
7499 7637 if (i != cpumask_first(this_sibling_map))
7500 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7501 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7502 if (i != first_cpu(*this_sibling_map))
7503 continue; 7638 continue;
7504 7639
7505 init_sched_build_groups(this_sibling_map, cpu_map, 7640 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7510,13 +7645,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7510 7645
7511#ifdef CONFIG_SCHED_MC 7646#ifdef CONFIG_SCHED_MC
7512 /* Set up multi-core groups */ 7647 /* Set up multi-core groups */
7513 for_each_cpu_mask_nr(i, *cpu_map) { 7648 for_each_cpu(i, cpu_map) {
7514 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7649 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
7515 SCHED_CPUMASK_VAR(send_covered, allmasks); 7650 if (i != cpumask_first(this_core_map))
7516
7517 *this_core_map = cpu_coregroup_map(i);
7518 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7519 if (i != first_cpu(*this_core_map))
7520 continue; 7651 continue;
7521 7652
7522 init_sched_build_groups(this_core_map, cpu_map, 7653 init_sched_build_groups(this_core_map, cpu_map,
@@ -7527,12 +7658,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7527 7658
7528 /* Set up physical groups */ 7659 /* Set up physical groups */
7529 for (i = 0; i < nr_node_ids; i++) { 7660 for (i = 0; i < nr_node_ids; i++) {
7530 SCHED_CPUMASK_VAR(nodemask, allmasks); 7661 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7531 SCHED_CPUMASK_VAR(send_covered, allmasks); 7662 if (cpumask_empty(nodemask))
7532
7533 *nodemask = node_to_cpumask(i);
7534 cpus_and(*nodemask, *nodemask, *cpu_map);
7535 if (cpus_empty(*nodemask))
7536 continue; 7663 continue;
7537 7664
7538 init_sched_build_groups(nodemask, cpu_map, 7665 init_sched_build_groups(nodemask, cpu_map,
@@ -7543,8 +7670,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7543#ifdef CONFIG_NUMA 7670#ifdef CONFIG_NUMA
7544 /* Set up node groups */ 7671 /* Set up node groups */
7545 if (sd_allnodes) { 7672 if (sd_allnodes) {
7546 SCHED_CPUMASK_VAR(send_covered, allmasks);
7547
7548 init_sched_build_groups(cpu_map, cpu_map, 7673 init_sched_build_groups(cpu_map, cpu_map,
7549 &cpu_to_allnodes_group, 7674 &cpu_to_allnodes_group,
7550 send_covered, tmpmask); 7675 send_covered, tmpmask);
@@ -7553,58 +7678,53 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7553 for (i = 0; i < nr_node_ids; i++) { 7678 for (i = 0; i < nr_node_ids; i++) {
7554 /* Set up node groups */ 7679 /* Set up node groups */
7555 struct sched_group *sg, *prev; 7680 struct sched_group *sg, *prev;
7556 SCHED_CPUMASK_VAR(nodemask, allmasks);
7557 SCHED_CPUMASK_VAR(domainspan, allmasks);
7558 SCHED_CPUMASK_VAR(covered, allmasks);
7559 int j; 7681 int j;
7560 7682
7561 *nodemask = node_to_cpumask(i); 7683 cpumask_clear(covered);
7562 cpus_clear(*covered); 7684 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
7563 7685 if (cpumask_empty(nodemask)) {
7564 cpus_and(*nodemask, *nodemask, *cpu_map);
7565 if (cpus_empty(*nodemask)) {
7566 sched_group_nodes[i] = NULL; 7686 sched_group_nodes[i] = NULL;
7567 continue; 7687 continue;
7568 } 7688 }
7569 7689
7570 sched_domain_node_span(i, domainspan); 7690 sched_domain_node_span(i, domainspan);
7571 cpus_and(*domainspan, *domainspan, *cpu_map); 7691 cpumask_and(domainspan, domainspan, cpu_map);
7572 7692
7573 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7693 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7694 GFP_KERNEL, i);
7574 if (!sg) { 7695 if (!sg) {
7575 printk(KERN_WARNING "Can not alloc domain group for " 7696 printk(KERN_WARNING "Can not alloc domain group for "
7576 "node %d\n", i); 7697 "node %d\n", i);
7577 goto error; 7698 goto error;
7578 } 7699 }
7579 sched_group_nodes[i] = sg; 7700 sched_group_nodes[i] = sg;
7580 for_each_cpu_mask_nr(j, *nodemask) { 7701 for_each_cpu(j, nodemask) {
7581 struct sched_domain *sd; 7702 struct sched_domain *sd;
7582 7703
7583 sd = &per_cpu(node_domains, j); 7704 sd = &per_cpu(node_domains, j).sd;
7584 sd->groups = sg; 7705 sd->groups = sg;
7585 } 7706 }
7586 sg->__cpu_power = 0; 7707 sg->__cpu_power = 0;
7587 sg->cpumask = *nodemask; 7708 cpumask_copy(sched_group_cpus(sg), nodemask);
7588 sg->next = sg; 7709 sg->next = sg;
7589 cpus_or(*covered, *covered, *nodemask); 7710 cpumask_or(covered, covered, nodemask);
7590 prev = sg; 7711 prev = sg;
7591 7712
7592 for (j = 0; j < nr_node_ids; j++) { 7713 for (j = 0; j < nr_node_ids; j++) {
7593 SCHED_CPUMASK_VAR(notcovered, allmasks);
7594 int n = (i + j) % nr_node_ids; 7714 int n = (i + j) % nr_node_ids;
7595 node_to_cpumask_ptr(pnodemask, n);
7596 7715
7597 cpus_complement(*notcovered, *covered); 7716 cpumask_complement(notcovered, covered);
7598 cpus_and(*tmpmask, *notcovered, *cpu_map); 7717 cpumask_and(tmpmask, notcovered, cpu_map);
7599 cpus_and(*tmpmask, *tmpmask, *domainspan); 7718 cpumask_and(tmpmask, tmpmask, domainspan);
7600 if (cpus_empty(*tmpmask)) 7719 if (cpumask_empty(tmpmask))
7601 break; 7720 break;
7602 7721
7603 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7722 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
7604 if (cpus_empty(*tmpmask)) 7723 if (cpumask_empty(tmpmask))
7605 continue; 7724 continue;
7606 7725
7607 sg = kmalloc_node(sizeof(struct sched_group), 7726 sg = kmalloc_node(sizeof(struct sched_group) +
7727 cpumask_size(),
7608 GFP_KERNEL, i); 7728 GFP_KERNEL, i);
7609 if (!sg) { 7729 if (!sg) {
7610 printk(KERN_WARNING 7730 printk(KERN_WARNING
@@ -7612,9 +7732,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7612 goto error; 7732 goto error;
7613 } 7733 }
7614 sg->__cpu_power = 0; 7734 sg->__cpu_power = 0;
7615 sg->cpumask = *tmpmask; 7735 cpumask_copy(sched_group_cpus(sg), tmpmask);
7616 sg->next = prev->next; 7736 sg->next = prev->next;
7617 cpus_or(*covered, *covered, *tmpmask); 7737 cpumask_or(covered, covered, tmpmask);
7618 prev->next = sg; 7738 prev->next = sg;
7619 prev = sg; 7739 prev = sg;
7620 } 7740 }
@@ -7623,22 +7743,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7623 7743
7624 /* Calculate CPU power for physical packages and nodes */ 7744 /* Calculate CPU power for physical packages and nodes */
7625#ifdef CONFIG_SCHED_SMT 7745#ifdef CONFIG_SCHED_SMT
7626 for_each_cpu_mask_nr(i, *cpu_map) { 7746 for_each_cpu(i, cpu_map) {
7627 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7747 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7628 7748
7629 init_sched_groups_power(i, sd); 7749 init_sched_groups_power(i, sd);
7630 } 7750 }
7631#endif 7751#endif
7632#ifdef CONFIG_SCHED_MC 7752#ifdef CONFIG_SCHED_MC
7633 for_each_cpu_mask_nr(i, *cpu_map) { 7753 for_each_cpu(i, cpu_map) {
7634 struct sched_domain *sd = &per_cpu(core_domains, i); 7754 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7635 7755
7636 init_sched_groups_power(i, sd); 7756 init_sched_groups_power(i, sd);
7637 } 7757 }
7638#endif 7758#endif
7639 7759
7640 for_each_cpu_mask_nr(i, *cpu_map) { 7760 for_each_cpu(i, cpu_map) {
7641 struct sched_domain *sd = &per_cpu(phys_domains, i); 7761 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7642 7762
7643 init_sched_groups_power(i, sd); 7763 init_sched_groups_power(i, sd);
7644 } 7764 }
@@ -7650,53 +7770,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7650 if (sd_allnodes) { 7770 if (sd_allnodes) {
7651 struct sched_group *sg; 7771 struct sched_group *sg;
7652 7772
7653 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7773 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7654 tmpmask); 7774 tmpmask);
7655 init_numa_sched_groups_power(sg); 7775 init_numa_sched_groups_power(sg);
7656 } 7776 }
7657#endif 7777#endif
7658 7778
7659 /* Attach the domains */ 7779 /* Attach the domains */
7660 for_each_cpu_mask_nr(i, *cpu_map) { 7780 for_each_cpu(i, cpu_map) {
7661 struct sched_domain *sd; 7781 struct sched_domain *sd;
7662#ifdef CONFIG_SCHED_SMT 7782#ifdef CONFIG_SCHED_SMT
7663 sd = &per_cpu(cpu_domains, i); 7783 sd = &per_cpu(cpu_domains, i).sd;
7664#elif defined(CONFIG_SCHED_MC) 7784#elif defined(CONFIG_SCHED_MC)
7665 sd = &per_cpu(core_domains, i); 7785 sd = &per_cpu(core_domains, i).sd;
7666#else 7786#else
7667 sd = &per_cpu(phys_domains, i); 7787 sd = &per_cpu(phys_domains, i).sd;
7668#endif 7788#endif
7669 cpu_attach_domain(sd, rd, i); 7789 cpu_attach_domain(sd, rd, i);
7670 } 7790 }
7671 7791
7672 sched_cpumask_free(allmasks); 7792 err = 0;
7673 return 0; 7793
7794free_tmpmask:
7795 free_cpumask_var(tmpmask);
7796free_send_covered:
7797 free_cpumask_var(send_covered);
7798free_this_core_map:
7799 free_cpumask_var(this_core_map);
7800free_this_sibling_map:
7801 free_cpumask_var(this_sibling_map);
7802free_nodemask:
7803 free_cpumask_var(nodemask);
7804free_notcovered:
7805#ifdef CONFIG_NUMA
7806 free_cpumask_var(notcovered);
7807free_covered:
7808 free_cpumask_var(covered);
7809free_domainspan:
7810 free_cpumask_var(domainspan);
7811out:
7812#endif
7813 return err;
7814
7815free_sched_groups:
7816#ifdef CONFIG_NUMA
7817 kfree(sched_group_nodes);
7818#endif
7819 goto free_tmpmask;
7674 7820
7675#ifdef CONFIG_NUMA 7821#ifdef CONFIG_NUMA
7676error: 7822error:
7677 free_sched_groups(cpu_map, tmpmask); 7823 free_sched_groups(cpu_map, tmpmask);
7678 sched_cpumask_free(allmasks); 7824 free_rootdomain(rd);
7679 kfree(rd); 7825 goto free_tmpmask;
7680 return -ENOMEM;
7681#endif 7826#endif
7682} 7827}
7683 7828
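__build_sched_domains() now acquires each scratch mask with its own alloc_cpumask_var() and unwinds through a chain of free_* labels in reverse order, so a failure at any point releases exactly what was already allocated, and the success path falls through the same labels. A condensed sketch of that ladder with three buffers instead of the function's seven; names are illustrative:

#include <stdio.h>
#include <stdlib.h>

static int build(size_t n)
{
        int err = -1;   /* pessimistic default, like err = -ENOMEM above */
        void *a, *b, *c;

        a = malloc(n);
        if (!a)
                goto out;
        b = malloc(n);
        if (!b)
                goto free_a;
        c = malloc(n);
        if (!c)
                goto free_b;

        /* ... real work using a, b and c ... */
        err = 0;

        free(c);        /* success path: release c, then share the unwind */
free_b:
        free(b);
free_a:
        free(a);
out:
        return err;
}

int main(void)
{
        printf("build() returned %d\n", build(128));
        return 0;
}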
7684static int build_sched_domains(const cpumask_t *cpu_map) 7829static int build_sched_domains(const struct cpumask *cpu_map)
7685{ 7830{
7686 return __build_sched_domains(cpu_map, NULL); 7831 return __build_sched_domains(cpu_map, NULL);
7687} 7832}
7688 7833
7689static cpumask_t *doms_cur; /* current sched domains */ 7834static struct cpumask *doms_cur; /* current sched domains */
7690static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7835static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7691static struct sched_domain_attr *dattr_cur; 7836static struct sched_domain_attr *dattr_cur;
7692 /* attribues of custom domains in 'doms_cur' */ 7837 /* attribues of custom domains in 'doms_cur' */
7693 7838
7694/* 7839/*
7695 * Special case: If a kmalloc of a doms_cur partition (array of 7840 * Special case: If a kmalloc of a doms_cur partition (array of
7696 * cpumask_t) fails, then fallback to a single sched domain, 7841 * cpumask) fails, then fallback to a single sched domain,
7697 * as determined by the single cpumask_t fallback_doms. 7842 * as determined by the single cpumask fallback_doms.
7698 */ 7843 */
7699static cpumask_t fallback_doms; 7844static cpumask_var_t fallback_doms;
7700 7845
7701/* 7846/*
7702 * arch_update_cpu_topology lets virtualized architectures update the 7847 * arch_update_cpu_topology lets virtualized architectures update the
@@ -7713,16 +7858,16 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
7713 * For now this just excludes isolated cpus, but could be used to 7858 * For now this just excludes isolated cpus, but could be used to
7714 * exclude other special cases in the future. 7859 * exclude other special cases in the future.
7715 */ 7860 */
7716static int arch_init_sched_domains(const cpumask_t *cpu_map) 7861static int arch_init_sched_domains(const struct cpumask *cpu_map)
7717{ 7862{
7718 int err; 7863 int err;
7719 7864
7720 arch_update_cpu_topology(); 7865 arch_update_cpu_topology();
7721 ndoms_cur = 1; 7866 ndoms_cur = 1;
7722 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7867 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7723 if (!doms_cur) 7868 if (!doms_cur)
7724 doms_cur = &fallback_doms; 7869 doms_cur = fallback_doms;
7725 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7870 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7726 dattr_cur = NULL; 7871 dattr_cur = NULL;
7727 err = build_sched_domains(doms_cur); 7872 err = build_sched_domains(doms_cur);
7728 register_sched_domain_sysctl(); 7873 register_sched_domain_sysctl();
@@ -7730,8 +7875,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7730 return err; 7875 return err;
7731} 7876}
7732 7877
7733static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7878static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7734 cpumask_t *tmpmask) 7879 struct cpumask *tmpmask)
7735{ 7880{
7736 free_sched_groups(cpu_map, tmpmask); 7881 free_sched_groups(cpu_map, tmpmask);
7737} 7882}
@@ -7740,15 +7885,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7740 * Detach sched domains from a group of cpus specified in cpu_map 7885 * Detach sched domains from a group of cpus specified in cpu_map
7741 * These cpus will now be attached to the NULL domain 7886 * These cpus will now be attached to the NULL domain
7742 */ 7887 */
7743static void detach_destroy_domains(const cpumask_t *cpu_map) 7888static void detach_destroy_domains(const struct cpumask *cpu_map)
7744{ 7889{
7745 cpumask_t tmpmask; 7890 /* Save because hotplug lock held. */
7891 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7746 int i; 7892 int i;
7747 7893
7748 for_each_cpu_mask_nr(i, *cpu_map) 7894 for_each_cpu(i, cpu_map)
7749 cpu_attach_domain(NULL, &def_root_domain, i); 7895 cpu_attach_domain(NULL, &def_root_domain, i);
7750 synchronize_sched(); 7896 synchronize_sched();
7751 arch_destroy_sched_domains(cpu_map, &tmpmask); 7897 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7752} 7898}
7753 7899
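detach_destroy_domains() keeps its scratch mask in a function-scope static bitmap (DECLARE_BITMAP plus to_cpumask) rather than on the stack; as the added comment says, this is safe only because callers hold the hotplug lock, so the path cannot run concurrently. A sketch of a fixed-size bitmap declared that way; NBITS and the helpers are illustrative, and the single-caller guarantee is stated as a comment exactly because the code itself cannot enforce it:

#include <stdio.h>

#define NBITS     256
#define WORD_BITS (8 * sizeof(unsigned long))
#define WORDS     ((NBITS + WORD_BITS - 1) / WORD_BITS)

static void set_bit_in(unsigned long *map, int n)
{
        map[n / WORD_BITS] |= 1UL << (n % WORD_BITS);
}

static int test_bit_in(const unsigned long *map, int n)
{
        return (map[n / WORD_BITS] >> (n % WORD_BITS)) & 1;
}

static void detach_all(void)
{
        /* Static so a large mask does not blow the stack; callers must be
         * serialized (the kernel relies on the hotplug lock for this). */
        static unsigned long tmpmask[WORDS];

        set_bit_in(tmpmask, 200);
        printf("bit 200 set: %d\n", test_bit_in(tmpmask, 200));
}

int main(void)
{
        detach_all();
        return 0;
}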
7754/* handle null as "default" */ 7900/* handle null as "default" */
@@ -7773,7 +7919,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7773 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7919 * doms_new[] to the current sched domain partitioning, doms_cur[].
7774 * It destroys each deleted domain and builds each new domain. 7920 * It destroys each deleted domain and builds each new domain.
7775 * 7921 *
7776 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7922 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7777 * The masks don't intersect (don't overlap.) We should setup one 7923 * The masks don't intersect (don't overlap.) We should setup one
7778 * sched domain for each mask. CPUs not in any of the cpumasks will 7924 * sched domain for each mask. CPUs not in any of the cpumasks will
7779 * not be load balanced. If the same cpumask appears both in the 7925 * not be load balanced. If the same cpumask appears both in the
@@ -7787,13 +7933,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7787 * the single partition 'fallback_doms', it also forces the domains 7933 * the single partition 'fallback_doms', it also forces the domains
7788 * to be rebuilt. 7934 * to be rebuilt.
7789 * 7935 *
7790 * If doms_new == NULL it will be replaced with cpu_online_map. 7936 * If doms_new == NULL it will be replaced with cpu_online_mask.
7791 * ndoms_new == 0 is a special case for destroying existing domains, 7937 * ndoms_new == 0 is a special case for destroying existing domains,
7792 * and it will not create the default domain. 7938 * and it will not create the default domain.
7793 * 7939 *
7794 * Call with hotplug lock held 7940 * Call with hotplug lock held
7795 */ 7941 */
7796void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7942/* FIXME: Change to struct cpumask *doms_new[] */
7943void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7797 struct sched_domain_attr *dattr_new) 7944 struct sched_domain_attr *dattr_new)
7798{ 7945{
7799 int i, j, n; 7946 int i, j, n;
@@ -7812,7 +7959,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7812 /* Destroy deleted domains */ 7959 /* Destroy deleted domains */
7813 for (i = 0; i < ndoms_cur; i++) { 7960 for (i = 0; i < ndoms_cur; i++) {
7814 for (j = 0; j < n && !new_topology; j++) { 7961 for (j = 0; j < n && !new_topology; j++) {
7815 if (cpus_equal(doms_cur[i], doms_new[j]) 7962 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7816 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7963 && dattrs_equal(dattr_cur, i, dattr_new, j))
7817 goto match1; 7964 goto match1;
7818 } 7965 }
@@ -7824,15 +7971,15 @@ match1:
7824 7971
7825 if (doms_new == NULL) { 7972 if (doms_new == NULL) {
7826 ndoms_cur = 0; 7973 ndoms_cur = 0;
7827 doms_new = &fallback_doms; 7974 doms_new = fallback_doms;
7828 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7975 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7829 WARN_ON_ONCE(dattr_new); 7976 WARN_ON_ONCE(dattr_new);
7830 } 7977 }
7831 7978
7832 /* Build new domains */ 7979 /* Build new domains */
7833 for (i = 0; i < ndoms_new; i++) { 7980 for (i = 0; i < ndoms_new; i++) {
7834 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7981 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7835 if (cpus_equal(doms_new[i], doms_cur[j]) 7982 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7836 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7983 && dattrs_equal(dattr_new, i, dattr_cur, j))
7837 goto match2; 7984 goto match2;
7838 } 7985 }
@@ -7844,7 +7991,7 @@ match2:
7844 } 7991 }
7845 7992
7846 /* Remember the new sched domains */ 7993 /* Remember the new sched domains */
7847 if (doms_cur != &fallback_doms) 7994 if (doms_cur != fallback_doms)
7848 kfree(doms_cur); 7995 kfree(doms_cur);
7849 kfree(dattr_cur); /* kfree(NULL) is safe */ 7996 kfree(dattr_cur); /* kfree(NULL) is safe */
7850 doms_cur = doms_new; 7997 doms_cur = doms_new;
@@ -7857,7 +8004,7 @@ match2:
7857} 8004}
7858 8005
7859#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 8006#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7860int arch_reinit_sched_domains(void) 8007static void arch_reinit_sched_domains(void)
7861{ 8008{
7862 get_online_cpus(); 8009 get_online_cpus();
7863 8010
@@ -7866,25 +8013,33 @@ int arch_reinit_sched_domains(void)
7866 8013
7867 rebuild_sched_domains(); 8014 rebuild_sched_domains();
7868 put_online_cpus(); 8015 put_online_cpus();
7869
7870 return 0;
7871} 8016}
7872 8017
7873static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 8018static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7874{ 8019{
7875 int ret; 8020 unsigned int level = 0;
8021
8022 if (sscanf(buf, "%u", &level) != 1)
8023 return -EINVAL;
7876 8024
7877 if (buf[0] != '0' && buf[0] != '1') 8025 /*
 8026 * level is always positive, so there is no need to check for
 8027 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
 8028 * What happens on a 0 or 1 byte write?
 8029 * Do we need to check count as well?
8030 */
8031
8032 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7878 return -EINVAL; 8033 return -EINVAL;
7879 8034
7880 if (smt) 8035 if (smt)
7881 sched_smt_power_savings = (buf[0] == '1'); 8036 sched_smt_power_savings = level;
7882 else 8037 else
7883 sched_mc_power_savings = (buf[0] == '1'); 8038 sched_mc_power_savings = level;
7884 8039
7885 ret = arch_reinit_sched_domains(); 8040 arch_reinit_sched_domains();
7886 8041
7887 return ret ? ret : count; 8042 return count;
7888} 8043}
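sched_power_savings_store() now parses the whole value with sscanf() and bounds it against MAX_POWERSAVINGS_BALANCE_LEVELS, instead of only accepting the single characters '0' and '1'. A tiny sketch of that parse-and-bound pattern; MAX_LEVELS is an invented stand-in:

#include <stdio.h>

#define MAX_LEVELS 3

/* Returns the parsed level, or -1 on malformed or out-of-range input. */
static int parse_level(const char *buf)
{
        unsigned int level;

        if (sscanf(buf, "%u", &level) != 1)
                return -1;
        if (level >= MAX_LEVELS)
                return -1;
        return (int)level;
}

int main(void)
{
        printf("%d %d %d\n",
               parse_level("2\n"), parse_level("9"), parse_level("x"));
        return 0;
}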
7889 8044
7890#ifdef CONFIG_SCHED_MC 8045#ifdef CONFIG_SCHED_MC
@@ -7919,7 +8074,7 @@ static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7919 sched_smt_power_savings_store); 8074 sched_smt_power_savings_store);
7920#endif 8075#endif
7921 8076
7922int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 8077int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7923{ 8078{
7924 int err = 0; 8079 int err = 0;
7925 8080
@@ -7984,7 +8139,9 @@ static int update_runtime(struct notifier_block *nfb,
7984 8139
7985void __init sched_init_smp(void) 8140void __init sched_init_smp(void)
7986{ 8141{
7987 cpumask_t non_isolated_cpus; 8142 cpumask_var_t non_isolated_cpus;
8143
8144 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7988 8145
7989#if defined(CONFIG_NUMA) 8146#if defined(CONFIG_NUMA)
7990 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8147 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7993,10 +8150,10 @@ void __init sched_init_smp(void)
7993#endif 8150#endif
7994 get_online_cpus(); 8151 get_online_cpus();
7995 mutex_lock(&sched_domains_mutex); 8152 mutex_lock(&sched_domains_mutex);
7996 arch_init_sched_domains(&cpu_online_map); 8153 arch_init_sched_domains(cpu_online_mask);
7997 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8154 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7998 if (cpus_empty(non_isolated_cpus)) 8155 if (cpumask_empty(non_isolated_cpus))
7999 cpu_set(smp_processor_id(), non_isolated_cpus); 8156 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8000 mutex_unlock(&sched_domains_mutex); 8157 mutex_unlock(&sched_domains_mutex);
8001 put_online_cpus(); 8158 put_online_cpus();
8002 8159
@@ -8011,9 +8168,13 @@ void __init sched_init_smp(void)
8011 init_hrtick(); 8168 init_hrtick();
8012 8169
8013 /* Move init over to a non-isolated CPU */ 8170 /* Move init over to a non-isolated CPU */
8014 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8171 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8015 BUG(); 8172 BUG();
8016 sched_init_granularity(); 8173 sched_init_granularity();
8174 free_cpumask_var(non_isolated_cpus);
8175
8176 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8177 init_sched_rt_class();
8017} 8178}
8018#else 8179#else
8019void __init sched_init_smp(void) 8180void __init sched_init_smp(void)
@@ -8328,6 +8489,15 @@ void __init sched_init(void)
8328 */ 8489 */
8329 current->sched_class = &fair_sched_class; 8490 current->sched_class = &fair_sched_class;
8330 8491
8492 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8493 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8494#ifdef CONFIG_SMP
8495#ifdef CONFIG_NO_HZ
8496 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8497#endif
8498 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8499#endif /* SMP */
8500
8331 scheduler_running = 1; 8501 scheduler_running = 1;
8332} 8502}
8333 8503
@@ -8890,6 +9060,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
8890 runtime = d->rt_runtime; 9060 runtime = d->rt_runtime;
8891 } 9061 }
8892 9062
9063#ifdef CONFIG_USER_SCHED
9064 if (tg == &root_task_group) {
9065 period = global_rt_period();
9066 runtime = global_rt_runtime();
9067 }
9068#endif
9069
8893 /* 9070 /*
8894 * Cannot have more runtime than the period. 9071 * Cannot have more runtime than the period.
8895 */ 9072 */
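The added CONFIG_USER_SCHED branch makes the root task group validate against the global RT period and runtime; the trailing context above is the invariant being enforced, namely that a group may not be granted more RT runtime than its period. A sketch of that check in isolation; RUNTIME_INF and the nanosecond values here are illustrative:

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF ((uint64_t)~0ULL)   /* "unlimited" sentinel */

/* A group's bandwidth is valid if its runtime fits inside its period;
 * an unlimited runtime is exempt from the comparison. */
static int bandwidth_ok(uint64_t period_ns, uint64_t runtime_ns)
{
        if (runtime_ns == RUNTIME_INF)
                return 1;
        return runtime_ns <= period_ns;
}

int main(void)
{
        printf("%d\n", bandwidth_ok(1000000000ULL,  950000000ULL)); /* 1 */
        printf("%d\n", bandwidth_ok(1000000000ULL, 2000000000ULL)); /* 0 */
        return 0;
}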
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e8ab096ddfe3..a0b0852414cc 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -124,7 +124,7 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 124
125 clock = scd->tick_gtod + delta; 125 clock = scd->tick_gtod + delta;
126 min_clock = wrap_max(scd->tick_gtod, scd->clock); 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = scd->tick_gtod + TICK_NSEC; 127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
128 128
129 clock = wrap_max(clock, min_clock); 129 clock = wrap_max(clock, min_clock);
130 clock = wrap_min(clock, max_clock); 130 clock = wrap_min(clock, max_clock);
@@ -227,6 +227,9 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
227 */ 227 */
228void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
229{ 229{
230 if (timekeeping_suspended)
231 return;
232
230 sched_clock_tick(); 233 sched_clock_tick();
231 touch_softlockup_watchdog(); 234 touch_softlockup_watchdog();
232} 235}
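The sched_clock.c hunk lets max_clock be at least the last value handed out, so the per-cpu clock is clamped to [max(tick_gtod, clock), max(clock, tick_gtod + TICK_NSEC)] and keeps moving forward even when it has already run past the end of the current tick window. A sketch of that clamp with wrap-safe comparisons; treat the exact wrap_max/wrap_min definitions as an assumption, the clamping logic is the point:

#include <stdint.h>
#include <stdio.h>

/* Max/min that stay correct across u64 wrap-around, via signed deltas. */
static uint64_t wrap_max(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) > 0 ? x : y;
}

static uint64_t wrap_min(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) < 0 ? x : y;
}

#define TICK_NSEC 1000000ULL    /* illustrative tick length: 1 ms */

/* Clamp an extrapolated clock between the last tick and one tick ahead,
 * but never below a value that was already returned. */
static uint64_t clamp_clock(uint64_t clock, uint64_t tick_gtod, uint64_t last)
{
        uint64_t min_clock = wrap_max(tick_gtod, last);
        uint64_t max_clock = wrap_max(last, tick_gtod + TICK_NSEC);

        clock = wrap_max(clock, min_clock);
        return wrap_min(clock, max_clock);
}

int main(void)
{
        /* last value already beyond tick_gtod + TICK_NSEC: stays monotonic */
        printf("%llu\n", (unsigned long long)clamp_clock(900, 0, 1500000));
        return 0;
}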
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..1e00bfacf9b8 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
 154 * 152 * Returns: -ENOMEM if memory allocation fails, 0 on success.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
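cpupri_init() now allocates one cpumask per priority vector and, on failure, walks back over the vectors that were already set up (the `for (i--; i >= 0; i--)` loop), with cpupri_cleanup() doing the full walk at teardown. A sketch of that roll-back-what-you-did pattern using plain heap buffers instead of cpumask_var_t; NR_VECS is illustrative:

#include <stdlib.h>

#define NR_VECS 102     /* illustrative count, like CPUPRI_NR_PRIORITIES */

static void *vec_mask[NR_VECS];

/* Returns 0 on success, -1 after undoing any partial initialization. */
static int vecs_init(size_t mask_bytes)
{
        int i;

        for (i = 0; i < NR_VECS; i++) {
                vec_mask[i] = calloc(1, mask_bytes);
                if (!vec_mask[i])
                        goto cleanup;
        }
        return 0;

cleanup:
        for (i--; i >= 0; i--)  /* free only what was actually allocated */
                free(vec_mask[i]);
        return -1;
}

static void vecs_cleanup(void)
{
        int i;

        for (i = 0; i < NR_VECS; i++)
                free(vec_mask[i]);
}

int main(void)
{
        if (vecs_init(64) == 0)
                vecs_cleanup();
        return 0;
}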
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4293cfa9681d..16eeba4e4169 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -145,6 +145,19 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
145 read_unlock_irqrestore(&tasklist_lock, flags); 145 read_unlock_irqrestore(&tasklist_lock, flags);
146} 146}
147 147
148#if defined(CONFIG_CGROUP_SCHED) && \
149 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
150static void task_group_path(struct task_group *tg, char *buf, int buflen)
151{
152 /* may be NULL if the underlying cgroup isn't fully-created yet */
153 if (!tg->css.cgroup) {
154 buf[0] = '\0';
155 return;
156 }
157 cgroup_path(tg->css.cgroup, buf, buflen);
158}
159#endif
160
148void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 161void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
149{ 162{
150 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -154,10 +167,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
154 unsigned long flags; 167 unsigned long flags;
155 168
156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 169#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
157 char path[128] = ""; 170 char path[128];
158 struct task_group *tg = cfs_rq->tg; 171 struct task_group *tg = cfs_rq->tg;
159 172
160 cgroup_path(tg->css.cgroup, path, sizeof(path)); 173 task_group_path(tg, path, sizeof(path));
161 174
162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
163#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
@@ -208,10 +221,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
208void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
209{ 222{
210#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
211 char path[128] = ""; 224 char path[128];
212 struct task_group *tg = rt_rq->tg; 225 struct task_group *tg = rt_rq->tg;
213 226
214 cgroup_path(tg->css.cgroup, path, sizeof(path)); 227 task_group_path(tg, path, sizeof(path));
215 228
216 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 229 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
217#else 230#else
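task_group_path() centralizes the guard for task groups whose cgroup has not been wired up yet, so the debug printout gets an empty string instead of dereferencing a NULL cgroup pointer. A tiny sketch of that format-or-fall-back-to-empty helper; the structure names are stand-ins, not the kernel types:

#include <stddef.h>
#include <stdio.h>

struct cgroup { const char *name; };
struct task_group { struct cgroup *cgroup; };

/* Fill buf with the group's path, or with "" if it is not attached yet. */
static void group_path(const struct task_group *tg, char *buf, size_t buflen)
{
        if (!tg->cgroup) {
                buf[0] = '\0';
                return;
        }
        snprintf(buf, buflen, "/%s", tg->cgroup->name);
}

int main(void)
{
        struct cgroup cg = { "batch" };
        struct task_group attached = { &cg }, detached = { NULL };
        char path[64];

        group_path(&attached, path, sizeof(path));
        printf("attached: '%s'\n", path);
        group_path(&detached, path, sizeof(path));
        printf("detached: '%s'\n", path);
        return 0;
}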
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5ad4440f0fc4..a7e50ba185ac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -283,7 +283,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
283 struct sched_entity, 283 struct sched_entity,
284 run_node); 284 run_node);
285 285
286 if (vruntime == cfs_rq->min_vruntime) 286 if (!cfs_rq->curr)
287 vruntime = se->vruntime; 287 vruntime = se->vruntime;
288 else 288 else
289 vruntime = min_vruntime(vruntime, se->vruntime); 289 vruntime = min_vruntime(vruntime, se->vruntime);
@@ -386,20 +386,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
386#endif 386#endif
387 387
388/* 388/*
389 * delta *= P[w / rw]
390 */
391static inline unsigned long
392calc_delta_weight(unsigned long delta, struct sched_entity *se)
393{
394 for_each_sched_entity(se) {
395 delta = calc_delta_mine(delta,
396 se->load.weight, &cfs_rq_of(se)->load);
397 }
398
399 return delta;
400}
401
402/*
403 * delta /= w 389 * delta /= w
404 */ 390 */
405static inline unsigned long 391static inline unsigned long
@@ -440,12 +426,23 @@ static u64 __sched_period(unsigned long nr_running)
440 */ 426 */
441static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 427static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
442{ 428{
443 unsigned long nr_running = cfs_rq->nr_running; 429 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
430
431 for_each_sched_entity(se) {
432 struct load_weight *load;
433
434 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load;
444 436
445 if (unlikely(!se->on_rq)) 437 if (unlikely(!se->on_rq)) {
446 nr_running++; 438 struct load_weight lw = cfs_rq->load;
447 439
448 return calc_delta_weight(__sched_period(nr_running), se); 440 update_load_add(&lw, se->load.weight);
441 load = &lw;
442 }
443 slice = calc_delta_mine(slice, se->load.weight, load);
444 }
445 return slice;
449} 446}
450 447
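The rewritten sched_slice() starts from the period for nr_running tasks (plus one if the entity is not yet queued) and then scales it by roughly weight / queue-weight at every level of the group hierarchy, replacing the removed calc_delta_weight() helper. A sketch of that repeated proportional scaling with plain integers; the two-level hierarchy and the weights are made up:

#include <stdint.h>
#include <stdio.h>

/* One level of the hierarchy: this entity's weight vs. its queue's total. */
struct level {
        uint64_t weight;
        uint64_t queue_weight;
};

/* slice = period * prod_i(weight_i / queue_weight_i), computed stepwise. */
static uint64_t slice_of(uint64_t period_ns, const struct level *lv, int depth)
{
        uint64_t slice = period_ns;
        int i;

        for (i = 0; i < depth; i++)
                slice = slice * lv[i].weight / lv[i].queue_weight;
        return slice;
}

int main(void)
{
        /* task owns 1/3 of its group's weight, group owns 1/2 of the cpu's */
        struct level lv[] = { { 1024, 3072 }, { 2048, 4096 } };

        printf("slice: %llu ns\n",
               (unsigned long long)slice_of(12000000ULL, lv, 2)); /* 2 ms */
        return 0;
}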
451/* 448/*
@@ -683,9 +680,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
683 unsigned long thresh = sysctl_sched_latency; 680 unsigned long thresh = sysctl_sched_latency;
684 681
685 /* 682 /*
686 * convert the sleeper threshold into virtual time 683 * Convert the sleeper threshold into virtual time.
684 * SCHED_IDLE is a special sub-class. We care about
685 * fairness only relative to other SCHED_IDLE tasks,
686 * all of which have the same weight.
687 */ 687 */
688 if (sched_feat(NORMALIZED_SLEEPER)) 688 if (sched_feat(NORMALIZED_SLEEPER) &&
689 task_of(se)->policy != SCHED_IDLE)
689 thresh = calc_delta_fair(thresh, se); 690 thresh = calc_delta_fair(thresh, se);
690 691
691 vruntime -= thresh; 692 vruntime -= thresh;
@@ -718,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
718 __enqueue_entity(cfs_rq, se); 719 __enqueue_entity(cfs_rq, se);
719} 720}
720 721
721static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 722static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
722{ 723{
723 if (cfs_rq->last == se) 724 if (cfs_rq->last == se)
724 cfs_rq->last = NULL; 725 cfs_rq->last = NULL;
@@ -727,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
727 cfs_rq->next = NULL; 728 cfs_rq->next = NULL;
728} 729}
729 730
731static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
732{
733 for_each_sched_entity(se)
734 __clear_buddies(cfs_rq_of(se), se);
735}
736
730static void 737static void
731dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 738dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
732{ 739{
@@ -767,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
767 774
768 ideal_runtime = sched_slice(cfs_rq, curr); 775 ideal_runtime = sched_slice(cfs_rq, curr);
769 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 776 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
770 if (delta_exec > ideal_runtime) 777 if (delta_exec > ideal_runtime) {
771 resched_task(rq_of(cfs_rq)->curr); 778 resched_task(rq_of(cfs_rq)->curr);
779 /*
780 * The current task ran long enough, ensure it doesn't get
781 * re-elected due to buddy favours.
782 */
783 clear_buddies(cfs_rq, curr);
784 }
772} 785}
773 786
774static void 787static void
@@ -1019,16 +1032,33 @@ static void yield_task_fair(struct rq *rq)
1019 * search starts with cpus closest then further out as needed, 1032 * search starts with cpus closest then further out as needed,
1020 * so we always favor a closer, idle cpu. 1033 * so we always favor a closer, idle cpu.
1021 * Domains may include CPUs that are not usable for migration, 1034 * Domains may include CPUs that are not usable for migration,
1022 * hence we need to mask them out (cpu_active_map) 1035 * hence we need to mask them out (cpu_active_mask)
1023 * 1036 *
1024 * Returns the CPU we should wake onto. 1037 * Returns the CPU we should wake onto.
1025 */ 1038 */
1026#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1039#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1027static int wake_idle(int cpu, struct task_struct *p) 1040static int wake_idle(int cpu, struct task_struct *p)
1028{ 1041{
1029 cpumask_t tmp;
1030 struct sched_domain *sd; 1042 struct sched_domain *sd;
1031 int i; 1043 int i;
1044 unsigned int chosen_wakeup_cpu;
1045 int this_cpu;
1046
1047 /*
1048 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1049 * are idle and this is not a kernel thread and this task's affinity
1050 * allows it to be moved to preferred cpu, then just move!
1051 */
1052
1053 this_cpu = smp_processor_id();
1054 chosen_wakeup_cpu =
1055 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1056
1057 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1058 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1059 p->mm && !(p->flags & PF_KTHREAD) &&
1060 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1061 return chosen_wakeup_cpu;
1032 1062
1033 /* 1063 /*
1034 * If it is idle, then it is the best cpu to run this task. 1064 * If it is idle, then it is the best cpu to run this task.
@@ -1046,10 +1076,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1046 if ((sd->flags & SD_WAKE_IDLE) 1076 if ((sd->flags & SD_WAKE_IDLE)
1047 || ((sd->flags & SD_WAKE_IDLE_FAR) 1077 || ((sd->flags & SD_WAKE_IDLE_FAR)
1048 && !task_hot(p, task_rq(p)->clock, sd))) { 1078 && !task_hot(p, task_rq(p)->clock, sd))) {
1049 cpus_and(tmp, sd->span, p->cpus_allowed); 1079 for_each_cpu_and(i, sched_domain_span(sd),
1050 cpus_and(tmp, tmp, cpu_active_map); 1080 &p->cpus_allowed) {
1051 for_each_cpu_mask_nr(i, tmp) { 1081 if (cpu_active(i) && idle_cpu(i)) {
1052 if (idle_cpu(i)) {
1053 if (i != task_cpu(p)) { 1082 if (i != task_cpu(p)) {
1054 schedstat_inc(p, 1083 schedstat_inc(p,
1055 se.nr_wakeups_idle); 1084 se.nr_wakeups_idle);
@@ -1162,20 +1191,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1162 int idx, unsigned long load, unsigned long this_load, 1191 int idx, unsigned long load, unsigned long this_load,
1163 unsigned int imbalance) 1192 unsigned int imbalance)
1164{ 1193{
1165 struct task_struct *curr = this_rq->curr;
1166 struct task_group *tg;
1167 unsigned long tl = this_load; 1194 unsigned long tl = this_load;
1168 unsigned long tl_per_task; 1195 unsigned long tl_per_task;
1196 struct task_group *tg;
1169 unsigned long weight; 1197 unsigned long weight;
1170 int balanced; 1198 int balanced;
1171 1199
1172 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1200 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1173 return 0; 1201 return 0;
1174 1202
1175 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1177 sync = 0;
1178
1179 /* 1203 /*
1180 * If sync wakeup then subtract the (maximum possible) 1204 * If sync wakeup then subtract the (maximum possible)
1181 * effect of the currently running task from the load 1205 * effect of the currently running task from the load
@@ -1242,13 +1266,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1242 * this_cpu and prev_cpu are present in: 1266 * this_cpu and prev_cpu are present in:
1243 */ 1267 */
1244 for_each_domain(this_cpu, sd) { 1268 for_each_domain(this_cpu, sd) {
1245 if (cpu_isset(prev_cpu, sd->span)) { 1269 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1246 this_sd = sd; 1270 this_sd = sd;
1247 break; 1271 break;
1248 } 1272 }
1249 } 1273 }
1250 1274
1251 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1275 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1252 goto out; 1276 goto out;
1253 1277
1254 /* 1278 /*
@@ -1330,14 +1354,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1330 1354
1331static void set_last_buddy(struct sched_entity *se) 1355static void set_last_buddy(struct sched_entity *se)
1332{ 1356{
1333 for_each_sched_entity(se) 1357 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1334 cfs_rq_of(se)->last = se; 1358 for_each_sched_entity(se)
1359 cfs_rq_of(se)->last = se;
1360 }
1335} 1361}
1336 1362
1337static void set_next_buddy(struct sched_entity *se) 1363static void set_next_buddy(struct sched_entity *se)
1338{ 1364{
1339 for_each_sched_entity(se) 1365 if (likely(task_of(se)->policy != SCHED_IDLE)) {
1340 cfs_rq_of(se)->next = se; 1366 for_each_sched_entity(se)
1367 cfs_rq_of(se)->next = se;
1368 }
1341} 1369}
1342 1370
1343/* 1371/*
@@ -1383,18 +1411,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1383 return; 1411 return;
1384 1412
1385 /* 1413 /*
1386 * Batch tasks do not preempt (their preemption is driven by 1414 * Batch and idle tasks do not preempt (their preemption is driven by
1387 * the tick): 1415 * the tick):
1388 */ 1416 */
1389 if (unlikely(p->policy == SCHED_BATCH)) 1417 if (unlikely(p->policy != SCHED_NORMAL))
1390 return; 1418 return;
1391 1419
1420 /* Idle tasks are by definition preempted by everybody. */
1421 if (unlikely(curr->policy == SCHED_IDLE)) {
1422 resched_task(curr);
1423 return;
1424 }
1425
1392 if (!sched_feat(WAKEUP_PREEMPT)) 1426 if (!sched_feat(WAKEUP_PREEMPT))
1393 return; 1427 return;
1394 1428
1395 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1429 if (sched_feat(WAKEUP_OVERLAP) && sync) {
1396 (se->avg_overlap < sysctl_sched_migration_cost &&
1397 pse->avg_overlap < sysctl_sched_migration_cost))) {
1398 resched_task(curr); 1430 resched_task(curr);
1399 return; 1431 return;
1400 } 1432 }
@@ -1425,6 +1457,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1425 1457
1426 do { 1458 do {
1427 se = pick_next_entity(cfs_rq); 1459 se = pick_next_entity(cfs_rq);
1460 /*
1461 * If se was a buddy, clear it so that it will have to earn
1462 * the favour again.
1463 */
1464 __clear_buddies(cfs_rq, se);
1428 set_next_entity(cfs_rq, se); 1465 set_next_entity(cfs_rq, se);
1429 cfs_rq = group_cfs_rq(se); 1466 cfs_rq = group_cfs_rq(se);
1430 } while (cfs_rq); 1467 } while (cfs_rq);
@@ -1607,8 +1644,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1607 } 1644 }
1608} 1645}
1609 1646
1610#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1611
1612/* 1647/*
1613 * Share the fairness runtime between parent and child, thus the 1648 * Share the fairness runtime between parent and child, thus the
1614 * total amount of pressure for CPU stays equal - new tasks 1649 * total amount of pressure for CPU stays equal - new tasks
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 51d2af3e6191..bac1061cea2f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
@@ -914,7 +919,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
914static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 919static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
915{ 920{
916 if (!task_running(rq, p) && 921 if (!task_running(rq, p) &&
917 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 922 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
918 (p->rt.nr_cpus_allowed > 1)) 923 (p->rt.nr_cpus_allowed > 1))
919 return 1; 924 return 1;
920 return 0; 925 return 0;
@@ -953,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
953 return next; 958 return next;
954} 959}
955 960
956static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
957 962
958static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
959{ 964{
@@ -963,8 +968,8 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
963 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) 968 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
964 return this_cpu; 969 return this_cpu;
965 970
966 first = first_cpu(*mask); 971 first = cpumask_first(mask);
967 if (first != NR_CPUS) 972 if (first < nr_cpu_ids)
968 return first; 973 return first;
969 974
970 return -1; 975 return -1;
@@ -973,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
973static int find_lowest_rq(struct task_struct *task) 978static int find_lowest_rq(struct task_struct *task)
974{ 979{
975 struct sched_domain *sd; 980 struct sched_domain *sd;
976 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
977 int this_cpu = smp_processor_id(); 982 int this_cpu = smp_processor_id();
978 int cpu = task_cpu(task); 983 int cpu = task_cpu(task);
979 984
@@ -988,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
988 * I guess we might want to change cpupri_find() to ignore those 993 * I guess we might want to change cpupri_find() to ignore those
989 * in the first place. 994 * in the first place.
990 */ 995 */
991 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 996 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
992 997
993 /* 998 /*
994 * At this point we have built a mask of cpus representing the 999 * At this point we have built a mask of cpus representing the
@@ -998,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
998 * We prioritize the last cpu that the task executed on since 1003 * We prioritize the last cpu that the task executed on since
999 * it is most likely cache-hot in that location. 1004 * it is most likely cache-hot in that location.
1000 */ 1005 */
1001 if (cpu_isset(cpu, *lowest_mask)) 1006 if (cpumask_test_cpu(cpu, lowest_mask))
1002 return cpu; 1007 return cpu;
1003 1008
1004 /* 1009 /*
@@ -1013,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
1013 cpumask_t domain_mask; 1018 cpumask_t domain_mask;
1014 int best_cpu; 1019 int best_cpu;
1015 1020
1016 cpus_and(domain_mask, sd->span, *lowest_mask); 1021 cpumask_and(&domain_mask, sched_domain_span(sd),
1022 lowest_mask);
1017 1023
1018 best_cpu = pick_optimal_cpu(this_cpu, 1024 best_cpu = pick_optimal_cpu(this_cpu,
1019 &domain_mask); 1025 &domain_mask);
@@ -1054,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1054 * Also make sure that it wasn't scheduled on its rq. 1060 * Also make sure that it wasn't scheduled on its rq.
1055 */ 1061 */
1056 if (unlikely(task_rq(task) != rq || 1062 if (unlikely(task_rq(task) != rq ||
1057 !cpu_isset(lowest_rq->cpu, 1063 !cpumask_test_cpu(lowest_rq->cpu,
1058 task->cpus_allowed) || 1064 &task->cpus_allowed) ||
1059 task_running(rq, task) || 1065 task_running(rq, task) ||
1060 !task->se.on_rq)) { 1066 !task->se.on_rq)) {
1061 1067
@@ -1176,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
1176 1182
1177 next = pick_next_task_rt(this_rq); 1183 next = pick_next_task_rt(this_rq);
1178 1184
1179 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1185 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1180 if (this_cpu == cpu) 1186 if (this_cpu == cpu)
1181 continue; 1187 continue;
1182 1188
@@ -1305,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1305} 1311}
1306 1312
1307static void set_cpus_allowed_rt(struct task_struct *p, 1313static void set_cpus_allowed_rt(struct task_struct *p,
1308 const cpumask_t *new_mask) 1314 const struct cpumask *new_mask)
1309{ 1315{
1310 int weight = cpus_weight(*new_mask); 1316 int weight = cpumask_weight(new_mask);
1311 1317
1312 BUG_ON(!rt_task(p)); 1318 BUG_ON(!rt_task(p));
1313 1319
@@ -1328,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1328 update_rt_migration(rq); 1334 update_rt_migration(rq);
1329 } 1335 }
1330 1336
1331 p->cpus_allowed = *new_mask; 1337 cpumask_copy(&p->cpus_allowed, new_mask);
1332 p->rt.nr_cpus_allowed = weight; 1338 p->rt.nr_cpus_allowed = weight;
1333} 1339}
1334 1340
@@ -1371,6 +1377,15 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1371 if (!rq->rt.rt_nr_running) 1377 if (!rq->rt.rt_nr_running)
1372 pull_rt_task(rq); 1378 pull_rt_task(rq);
1373} 1379}
1380
1381static inline void init_sched_rt_class(void)
1382{
1383 unsigned int i;
1384
1385 for_each_possible_cpu(i)
1386 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1387 GFP_KERNEL, cpu_to_node(i));
1388}
1374#endif /* CONFIG_SMP */ 1389#endif /* CONFIG_SMP */
1375 1390
1376/* 1391/*
@@ -1541,3 +1556,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1541 rcu_read_unlock(); 1556 rcu_read_unlock();
1542} 1557}
1543#endif /* CONFIG_SCHED_DEBUG */ 1558#endif /* CONFIG_SCHED_DEBUG */
1559
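
The bulk of the sched_rt.c changes convert fixed-size cpumask_t values (on-stack or per-CPU) to struct cpumask pointers and cpumask_var_t, so the stack footprint no longer grows with NR_CPUS once CONFIG_CPUMASK_OFFSTACK makes the masks heap-allocated. A minimal sketch of the two allocation patterns the patch uses: a transient mask allocated with GFP_ATOMIC and freed on every exit path (as in check_preempt_equal_prio()), and per-CPU masks allocated once at init time (as in init_sched_rt_class()). The example_* names are illustrative only:

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/topology.h>
#include <linux/gfp.h>
#include <linux/init.h>

static DEFINE_PER_CPU(cpumask_var_t, example_percpu_mask);

static void example_transient_mask(void)
{
	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
		return;				/* allocation can fail: bail out */

	cpumask_and(mask, cpu_online_mask, cpu_active_mask);
	/* ... use the mask ... */

	free_cpumask_var(mask);			/* every exit path must free it */
}

static void __init example_init_percpu_masks(void)
{
	unsigned int cpu;

	/* Allocate once per CPU, on that CPU's node, like init_sched_rt_class(). */
	for_each_possible_cpu(cpu)
		alloc_cpumask_var_node(&per_cpu(example_percpu_mask, cpu),
				       GFP_KERNEL, cpu_to_node(cpu));
}
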
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 3b01098164c8..8ab0cef8ecab 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
@@ -295,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 296static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 297 cputime_t cputime)
297{ 298{
299 struct task_cputime *times;
298 struct signal_struct *sig; 300 struct signal_struct *sig;
299 301
300 /* tsk == current, ensure it is safe to use ->signal */ 302 /* tsk == current, ensure it is safe to use ->signal */
@@ -302,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
302 return; 304 return;
303 305
304 sig = tsk->signal; 306 sig = tsk->signal;
305 if (sig->cputime.totals) { 307 times = &sig->cputime.totals;
306 struct task_cputime *times;
307 308
308 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 309 spin_lock(&times->lock);
309 times->utime = cputime_add(times->utime, cputime); 310 times->utime = cputime_add(times->utime, cputime);
310 put_cpu_no_resched(); 311 spin_unlock(&times->lock);
311 }
312} 312}
313 313
314/** 314/**
@@ -324,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
324static inline void account_group_system_time(struct task_struct *tsk, 324static inline void account_group_system_time(struct task_struct *tsk,
325 cputime_t cputime) 325 cputime_t cputime)
326{ 326{
327 struct task_cputime *times;
327 struct signal_struct *sig; 328 struct signal_struct *sig;
328 329
329 /* tsk == current, ensure it is safe to use ->signal */ 330 /* tsk == current, ensure it is safe to use ->signal */
@@ -331,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
331 return; 332 return;
332 333
333 sig = tsk->signal; 334 sig = tsk->signal;
334 if (sig->cputime.totals) { 335 times = &sig->cputime.totals;
335 struct task_cputime *times;
336 336
337 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 337 spin_lock(&times->lock);
338 times->stime = cputime_add(times->stime, cputime); 338 times->stime = cputime_add(times->stime, cputime);
339 put_cpu_no_resched(); 339 spin_unlock(&times->lock);
340 }
341} 340}
342 341
343/** 342/**
@@ -353,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
353static inline void account_group_exec_runtime(struct task_struct *tsk, 352static inline void account_group_exec_runtime(struct task_struct *tsk,
354 unsigned long long ns) 353 unsigned long long ns)
355{ 354{
355 struct task_cputime *times;
356 struct signal_struct *sig; 356 struct signal_struct *sig;
357 357
358 sig = tsk->signal; 358 sig = tsk->signal;
@@ -361,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
361 if (unlikely(!sig)) 361 if (unlikely(!sig))
362 return; 362 return;
363 363
364 if (sig->cputime.totals) { 364 times = &sig->cputime.totals;
365 struct task_cputime *times;
366 365
367 times = per_cpu_ptr(sig->cputime.totals, get_cpu()); 366 spin_lock(&times->lock);
368 times->sum_exec_runtime += ns; 367 times->sum_exec_runtime += ns;
369 put_cpu_no_resched(); 368 spin_unlock(&times->lock);
370 }
371} 369}
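
The sched_stats.h hunks drop the dynamically allocated per-CPU totals (sig->cputime.totals used to be a percpu pointer that could be NULL) in favour of a single embedded struct task_cputime protected by its own spinlock; the three accounting helpers now take that lock instead of going through get_cpu()/per_cpu_ptr(). A condensed sketch of the resulting update pattern, assuming, as the patch does, that struct task_cputime now carries a lock field; the helper name is illustrative:

#include <linux/sched.h>
#include <linux/spinlock.h>

static void example_account_group_utime(struct task_struct *tsk, cputime_t cputime)
{
	struct task_cputime *times;

	/* tsk == current; bail out if the signal struct is already gone. */
	if (unlikely(!tsk->signal))
		return;

	times = &tsk->signal->cputime.totals;

	spin_lock(&times->lock);
	times->utime = cputime_add(times->utime, cputime);
	spin_unlock(&times->lock);
}
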
diff --git a/kernel/signal.c b/kernel/signal.c
index 8e95855ff3cf..b6b36768b758 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -858,7 +858,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
858 q->info.si_signo = sig; 858 q->info.si_signo = sig;
859 q->info.si_errno = 0; 859 q->info.si_errno = 0;
860 q->info.si_code = SI_USER; 860 q->info.si_code = SI_USER;
861 q->info.si_pid = task_pid_vnr(current); 861 q->info.si_pid = task_tgid_nr_ns(current,
862 task_active_pid_ns(t));
862 q->info.si_uid = current_uid(); 863 q->info.si_uid = current_uid();
863 break; 864 break;
864 case (unsigned long) SEND_SIG_PRIV: 865 case (unsigned long) SEND_SIG_PRIV:
@@ -908,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
908 } 909 }
909#endif 910#endif
910 printk("\n"); 911 printk("\n");
912 preempt_disable();
911 show_regs(regs); 913 show_regs(regs);
914 preempt_enable();
912} 915}
913 916
914static int __init setup_print_fatal_signals(char *str) 917static int __init setup_print_fatal_signals(char *str)
@@ -1960,7 +1963,7 @@ EXPORT_SYMBOL(unblock_all_signals);
1960 * System call entry points. 1963 * System call entry points.
1961 */ 1964 */
1962 1965
1963asmlinkage long sys_restart_syscall(void) 1966SYSCALL_DEFINE0(restart_syscall)
1964{ 1967{
1965 struct restart_block *restart = &current_thread_info()->restart_block; 1968 struct restart_block *restart = &current_thread_info()->restart_block;
1966 return restart->fn(restart); 1969 return restart->fn(restart);
@@ -2013,8 +2016,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
2013 return error; 2016 return error;
2014} 2017}
2015 2018
2016asmlinkage long 2019SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
2017sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize) 2020 sigset_t __user *, oset, size_t, sigsetsize)
2018{ 2021{
2019 int error = -EINVAL; 2022 int error = -EINVAL;
2020 sigset_t old_set, new_set; 2023 sigset_t old_set, new_set;
@@ -2073,8 +2076,7 @@ out:
2073 return error; 2076 return error;
2074} 2077}
2075 2078
2076asmlinkage long 2079SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
2077sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
2078{ 2080{
2079 return do_sigpending(set, sigsetsize); 2081 return do_sigpending(set, sigsetsize);
2080} 2082}
@@ -2145,11 +2147,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
2145 2147
2146#endif 2148#endif
2147 2149
2148asmlinkage long 2150SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
2149sys_rt_sigtimedwait(const sigset_t __user *uthese, 2151 siginfo_t __user *, uinfo, const struct timespec __user *, uts,
2150 siginfo_t __user *uinfo, 2152 size_t, sigsetsize)
2151 const struct timespec __user *uts,
2152 size_t sigsetsize)
2153{ 2153{
2154 int ret, sig; 2154 int ret, sig;
2155 sigset_t these; 2155 sigset_t these;
@@ -2222,8 +2222,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2222 return ret; 2222 return ret;
2223} 2223}
2224 2224
2225asmlinkage long 2225SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2226sys_kill(pid_t pid, int sig)
2227{ 2226{
2228 struct siginfo info; 2227 struct siginfo info;
2229 2228
@@ -2282,7 +2281,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2282 * exists but it's not belonging to the target process anymore. This 2281 * exists but it's not belonging to the target process anymore. This
2283 * method solves the problem of threads exiting and PIDs getting reused. 2282 * method solves the problem of threads exiting and PIDs getting reused.
2284 */ 2283 */
2285asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig) 2284SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
2286{ 2285{
2287 /* This is only valid for single tasks */ 2286 /* This is only valid for single tasks */
2288 if (pid <= 0 || tgid <= 0) 2287 if (pid <= 0 || tgid <= 0)
@@ -2294,8 +2293,7 @@ asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
2294/* 2293/*
2295 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2294 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2296 */ 2295 */
2297asmlinkage long 2296SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
2298sys_tkill(pid_t pid, int sig)
2299{ 2297{
2300 /* This is only valid for single tasks */ 2298 /* This is only valid for single tasks */
2301 if (pid <= 0) 2299 if (pid <= 0)
@@ -2304,8 +2302,8 @@ sys_tkill(pid_t pid, int sig)
2304 return do_tkill(0, pid, sig); 2302 return do_tkill(0, pid, sig);
2305} 2303}
2306 2304
2307asmlinkage long 2305SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2308sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo) 2306 siginfo_t __user *, uinfo)
2309{ 2307{
2310 siginfo_t info; 2308 siginfo_t info;
2311 2309
@@ -2433,8 +2431,7 @@ out:
2433 2431
2434#ifdef __ARCH_WANT_SYS_SIGPENDING 2432#ifdef __ARCH_WANT_SYS_SIGPENDING
2435 2433
2436asmlinkage long 2434SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
2437sys_sigpending(old_sigset_t __user *set)
2438{ 2435{
2439 return do_sigpending(set, sizeof(*set)); 2436 return do_sigpending(set, sizeof(*set));
2440} 2437}
@@ -2445,8 +2442,8 @@ sys_sigpending(old_sigset_t __user *set)
2445/* Some platforms have their own version with special arguments others 2442/* Some platforms have their own version with special arguments others
2446 support only sys_rt_sigprocmask. */ 2443 support only sys_rt_sigprocmask. */
2447 2444
2448asmlinkage long 2445SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
2449sys_sigprocmask(int how, old_sigset_t __user *set, old_sigset_t __user *oset) 2446 old_sigset_t __user *, oset)
2450{ 2447{
2451 int error; 2448 int error;
2452 old_sigset_t old_set, new_set; 2449 old_sigset_t old_set, new_set;
@@ -2496,11 +2493,10 @@ out:
2496#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ 2493#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
2497 2494
2498#ifdef __ARCH_WANT_SYS_RT_SIGACTION 2495#ifdef __ARCH_WANT_SYS_RT_SIGACTION
2499asmlinkage long 2496SYSCALL_DEFINE4(rt_sigaction, int, sig,
2500sys_rt_sigaction(int sig, 2497 const struct sigaction __user *, act,
2501 const struct sigaction __user *act, 2498 struct sigaction __user *, oact,
2502 struct sigaction __user *oact, 2499 size_t, sigsetsize)
2503 size_t sigsetsize)
2504{ 2500{
2505 struct k_sigaction new_sa, old_sa; 2501 struct k_sigaction new_sa, old_sa;
2506 int ret = -EINVAL; 2502 int ret = -EINVAL;
@@ -2530,15 +2526,13 @@ out:
2530/* 2526/*
2531 * For backwards compatibility. Functionality superseded by sigprocmask. 2527 * For backwards compatibility. Functionality superseded by sigprocmask.
2532 */ 2528 */
2533asmlinkage long 2529SYSCALL_DEFINE0(sgetmask)
2534sys_sgetmask(void)
2535{ 2530{
2536 /* SMP safe */ 2531 /* SMP safe */
2537 return current->blocked.sig[0]; 2532 return current->blocked.sig[0];
2538} 2533}
2539 2534
2540asmlinkage long 2535SYSCALL_DEFINE1(ssetmask, int, newmask)
2541sys_ssetmask(int newmask)
2542{ 2536{
2543 int old; 2537 int old;
2544 2538
@@ -2558,8 +2552,7 @@ sys_ssetmask(int newmask)
2558/* 2552/*
2559 * For backwards compatibility. Functionality superseded by sigaction. 2553 * For backwards compatibility. Functionality superseded by sigaction.
2560 */ 2554 */
2561asmlinkage unsigned long 2555SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
2562sys_signal(int sig, __sighandler_t handler)
2563{ 2556{
2564 struct k_sigaction new_sa, old_sa; 2557 struct k_sigaction new_sa, old_sa;
2565 int ret; 2558 int ret;
@@ -2576,8 +2569,7 @@ sys_signal(int sig, __sighandler_t handler)
2576 2569
2577#ifdef __ARCH_WANT_SYS_PAUSE 2570#ifdef __ARCH_WANT_SYS_PAUSE
2578 2571
2579asmlinkage long 2572SYSCALL_DEFINE0(pause)
2580sys_pause(void)
2581{ 2573{
2582 current->state = TASK_INTERRUPTIBLE; 2574 current->state = TASK_INTERRUPTIBLE;
2583 schedule(); 2575 schedule();
@@ -2587,7 +2579,7 @@ sys_pause(void)
2587#endif 2579#endif
2588 2580
2589#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND 2581#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
2590asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) 2582SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
2591{ 2583{
2592 sigset_t newset; 2584 sigset_t newset;
2593 2585
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c8dde58c55..bbedbb7efe32 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,14 +18,15 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
18enum { 18enum {
19 CSD_FLAG_WAIT = 0x01, 19 CSD_FLAG_WAIT = 0x01,
20 CSD_FLAG_ALLOC = 0x02, 20 CSD_FLAG_ALLOC = 0x02,
21 CSD_FLAG_LOCK = 0x04,
21}; 22};
22 23
23struct call_function_data { 24struct call_function_data {
24 struct call_single_data csd; 25 struct call_single_data csd;
25 spinlock_t lock; 26 spinlock_t lock;
26 unsigned int refs; 27 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head; 28 struct rcu_head rcu_head;
29 unsigned long cpumask_bits[];
29}; 30};
30 31
31struct call_single_queue { 32struct call_single_queue {
@@ -110,13 +111,13 @@ void generic_smp_call_function_interrupt(void)
110 list_for_each_entry_rcu(data, &call_function_queue, csd.list) { 111 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
111 int refs; 112 int refs;
112 113
113 if (!cpu_isset(cpu, data->cpumask)) 114 if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
114 continue; 115 continue;
115 116
116 data->csd.func(data->csd.info); 117 data->csd.func(data->csd.info);
117 118
118 spin_lock(&data->lock); 119 spin_lock(&data->lock);
119 cpu_clear(cpu, data->cpumask); 120 cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
120 WARN_ON(data->refs == 0); 121 WARN_ON(data->refs == 0);
121 data->refs--; 122 data->refs--;
122 refs = data->refs; 123 refs = data->refs;
@@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void)
186 if (data_flags & CSD_FLAG_WAIT) { 187 if (data_flags & CSD_FLAG_WAIT) {
187 smp_wmb(); 188 smp_wmb();
188 data->flags &= ~CSD_FLAG_WAIT; 189 data->flags &= ~CSD_FLAG_WAIT;
190 } else if (data_flags & CSD_FLAG_LOCK) {
191 smp_wmb();
192 data->flags &= ~CSD_FLAG_LOCK;
189 } else if (data_flags & CSD_FLAG_ALLOC) 193 } else if (data_flags & CSD_FLAG_ALLOC)
190 kfree(data); 194 kfree(data);
191 } 195 }
@@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void)
196 } 200 }
197} 201}
198 202
203static DEFINE_PER_CPU(struct call_single_data, csd_data);
204
199/* 205/*
200 * smp_call_function_single - Run a function on a specific CPU 206 * smp_call_function_single - Run a function on a specific CPU
201 * @func: The function to run. This must be fast and non-blocking. 207 * @func: The function to run. This must be fast and non-blocking.
@@ -223,15 +229,39 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
223 local_irq_save(flags); 229 local_irq_save(flags);
224 func(info); 230 func(info);
225 local_irq_restore(flags); 231 local_irq_restore(flags);
226 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) { 232 } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
227 struct call_single_data *data = NULL; 233 struct call_single_data *data;
228 234
229 if (!wait) { 235 if (!wait) {
236 /*
237 * We are calling a function on a single CPU
238 * and we are not going to wait for it to finish.
239 * We first try to allocate the data, but if we
 241 * fail, we fall back to using per-CPU data to pass
241 * the information to that CPU. Since all callers
242 * of this code will use the same data, we must
243 * synchronize the callers to prevent a new caller
244 * from corrupting the data before the callee
245 * can access it.
246 *
247 * The CSD_FLAG_LOCK is used to let us know when
248 * the IPI handler is done with the data.
249 * The first caller will set it, and the callee
250 * will clear it. The next caller must wait for
251 * it to clear before we set it again. This
252 * will make sure the callee is done with the
253 * data before a new caller will use it.
254 */
230 data = kmalloc(sizeof(*data), GFP_ATOMIC); 255 data = kmalloc(sizeof(*data), GFP_ATOMIC);
231 if (data) 256 if (data)
232 data->flags = CSD_FLAG_ALLOC; 257 data->flags = CSD_FLAG_ALLOC;
233 } 258 else {
234 if (!data) { 259 data = &per_cpu(csd_data, me);
260 while (data->flags & CSD_FLAG_LOCK)
261 cpu_relax();
262 data->flags = CSD_FLAG_LOCK;
263 }
264 } else {
235 data = &d; 265 data = &d;
236 data->flags = CSD_FLAG_WAIT; 266 data->flags = CSD_FLAG_WAIT;
237 } 267 }
@@ -266,51 +296,19 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
266 generic_exec_single(cpu, data); 296 generic_exec_single(cpu, data);
267} 297}
268 298
269/* Dummy function */ 299/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
270static void quiesce_dummy(void *unused) 300#ifndef arch_send_call_function_ipi_mask
271{ 301#define arch_send_call_function_ipi_mask(maskp) \
272} 302 arch_send_call_function_ipi(*(maskp))
273 303#endif
274/*
275 * Ensure stack based data used in call function mask is safe to free.
276 *
277 * This is needed by smp_call_function_mask when using on-stack data, because
278 * a single call function queue is shared by all CPUs, and any CPU may pick up
279 * the data item on the queue at any time before it is deleted. So we need to
280 * ensure that all CPUs have transitioned through a quiescent state after
281 * this call.
282 *
283 * This is a very slow function, implemented by sending synchronous IPIs to
284 * all possible CPUs. For this reason, we have to alloc data rather than use
285 * stack based data even in the case of synchronous calls. The stack based
286 * data is then just used for deadlock/oom fallback which will be very rare.
287 *
288 * If a faster scheme can be made, we could go back to preferring stack based
289 * data -- the data allocation/free is non-zero cost.
290 */
291static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
292{
293 struct call_single_data data;
294 int cpu;
295
296 data.func = quiesce_dummy;
297 data.info = NULL;
298
299 for_each_cpu_mask(cpu, mask) {
300 data.flags = CSD_FLAG_WAIT;
301 generic_exec_single(cpu, &data);
302 }
303}
304 304
305/** 305/**
306 * smp_call_function_mask(): Run a function on a set of other CPUs. 306 * smp_call_function_many(): Run a function on a set of other CPUs.
307 * @mask: The set of cpus to run on. 307 * @mask: The set of cpus to run on (only runs on online subset).
308 * @func: The function to run. This must be fast and non-blocking. 308 * @func: The function to run. This must be fast and non-blocking.
309 * @info: An arbitrary pointer to pass to the function. 309 * @info: An arbitrary pointer to pass to the function.
310 * @wait: If true, wait (atomically) until function has completed on other CPUs. 310 * @wait: If true, wait (atomically) until function has completed on other CPUs.
311 * 311 *
312 * Returns 0 on success, else a negative status code.
313 *
314 * If @wait is true, then returns once @func has returned. Note that @wait 312 * If @wait is true, then returns once @func has returned. Note that @wait
315 * will be implicitly turned on in case of allocation failures, since 313 * will be implicitly turned on in case of allocation failures, since
316 * we fall back to on-stack allocation. 314 * we fall back to on-stack allocation.
@@ -319,53 +317,57 @@ static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
319 * hardware interrupt handler or from a bottom half handler. Preemption 317 * hardware interrupt handler or from a bottom half handler. Preemption
320 * must be disabled when calling this function. 318 * must be disabled when calling this function.
321 */ 319 */
322int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, 320void smp_call_function_many(const struct cpumask *mask,
323 int wait) 321 void (*func)(void *), void *info,
322 bool wait)
324{ 323{
325 struct call_function_data d; 324 struct call_function_data *data;
326 struct call_function_data *data = NULL;
327 cpumask_t allbutself;
328 unsigned long flags; 325 unsigned long flags;
329 int cpu, num_cpus; 326 int cpu, next_cpu;
330 int slowpath = 0;
331 327
332 /* Can deadlock when called with interrupts disabled */ 328 /* Can deadlock when called with interrupts disabled */
333 WARN_ON(irqs_disabled()); 329 WARN_ON(irqs_disabled());
334 330
335 cpu = smp_processor_id(); 331 /* So, what's a CPU they want? Ignoring this one. */
336 allbutself = cpu_online_map; 332 cpu = cpumask_first_and(mask, cpu_online_mask);
337 cpu_clear(cpu, allbutself); 333 if (cpu == smp_processor_id())
338 cpus_and(mask, mask, allbutself); 334 cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
339 num_cpus = cpus_weight(mask); 335 /* No online cpus? We're done. */
340 336 if (cpu >= nr_cpu_ids)
341 /* 337 return;
342 * If zero CPUs, return. If just a single CPU, turn this request 338
343 * into a targetted single call instead since it's faster. 339 /* Do we have another CPU which isn't us? */
344 */ 340 next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
345 if (!num_cpus) 341 if (next_cpu == smp_processor_id())
346 return 0; 342 next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
347 else if (num_cpus == 1) { 343
348 cpu = first_cpu(mask); 344 /* Fastpath: do that cpu by itself. */
349 return smp_call_function_single(cpu, func, info, wait); 345 if (next_cpu >= nr_cpu_ids) {
346 smp_call_function_single(cpu, func, info, wait);
347 return;
350 } 348 }
351 349
352 data = kmalloc(sizeof(*data), GFP_ATOMIC); 350 data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
353 if (data) { 351 if (unlikely(!data)) {
354 data->csd.flags = CSD_FLAG_ALLOC; 352 /* Slow path. */
355 if (wait) 353 for_each_online_cpu(cpu) {
356 data->csd.flags |= CSD_FLAG_WAIT; 354 if (cpu == smp_processor_id())
357 } else { 355 continue;
358 data = &d; 356 if (cpumask_test_cpu(cpu, mask))
359 data->csd.flags = CSD_FLAG_WAIT; 357 smp_call_function_single(cpu, func, info, wait);
360 wait = 1; 358 }
361 slowpath = 1; 359 return;
362 } 360 }
363 361
364 spin_lock_init(&data->lock); 362 spin_lock_init(&data->lock);
363 data->csd.flags = CSD_FLAG_ALLOC;
364 if (wait)
365 data->csd.flags |= CSD_FLAG_WAIT;
365 data->csd.func = func; 366 data->csd.func = func;
366 data->csd.info = info; 367 data->csd.info = info;
367 data->refs = num_cpus; 368 cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
368 data->cpumask = mask; 369 cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
370 data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
369 371
370 spin_lock_irqsave(&call_function_lock, flags); 372 spin_lock_irqsave(&call_function_lock, flags);
371 list_add_tail_rcu(&data->csd.list, &call_function_queue); 373 list_add_tail_rcu(&data->csd.list, &call_function_queue);
@@ -377,18 +379,13 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
377 smp_mb(); 379 smp_mb();
378 380
379 /* Send a message to all CPUs in the map */ 381 /* Send a message to all CPUs in the map */
380 arch_send_call_function_ipi(mask); 382 arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
381 383
382 /* optionally wait for the CPUs to complete */ 384 /* optionally wait for the CPUs to complete */
383 if (wait) { 385 if (wait)
384 csd_flag_wait(&data->csd); 386 csd_flag_wait(&data->csd);
385 if (unlikely(slowpath))
386 smp_call_function_mask_quiesce_stack(mask);
387 }
388
389 return 0;
390} 387}
391EXPORT_SYMBOL(smp_call_function_mask); 388EXPORT_SYMBOL(smp_call_function_many);
392 389
393/** 390/**
394 * smp_call_function(): Run a function on all other CPUs. 391 * smp_call_function(): Run a function on all other CPUs.
@@ -396,7 +393,7 @@ EXPORT_SYMBOL(smp_call_function_mask);
396 * @info: An arbitrary pointer to pass to the function. 393 * @info: An arbitrary pointer to pass to the function.
397 * @wait: If true, wait (atomically) until function has completed on other CPUs. 394 * @wait: If true, wait (atomically) until function has completed on other CPUs.
398 * 395 *
399 * Returns 0 on success, else a negative status code. 396 * Returns 0.
400 * 397 *
401 * If @wait is true, then returns once @func has returned; otherwise 398 * If @wait is true, then returns once @func has returned; otherwise
402 * it returns just before the target cpu calls @func. In case of allocation 399 * it returns just before the target cpu calls @func. In case of allocation
@@ -407,12 +404,10 @@ EXPORT_SYMBOL(smp_call_function_mask);
407 */ 404 */
408int smp_call_function(void (*func)(void *), void *info, int wait) 405int smp_call_function(void (*func)(void *), void *info, int wait)
409{ 406{
410 int ret;
411
412 preempt_disable(); 407 preempt_disable();
413 ret = smp_call_function_mask(cpu_online_map, func, info, wait); 408 smp_call_function_many(cpu_online_mask, func, info, wait);
414 preempt_enable(); 409 preempt_enable();
415 return ret; 410 return 0;
416} 411}
417EXPORT_SYMBOL(smp_call_function); 412EXPORT_SYMBOL(smp_call_function);
418 413
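
The smp.c rewrite replaces smp_call_function_mask() with smp_call_function_many(), which takes a const struct cpumask *, returns void, and silently restricts itself to the online subset of the mask; it also adds a CSD_FLAG_LOCK-guarded per-CPU call_single_data so non-waiting single-CPU calls survive a kmalloc() failure without queueing stack memory the caller may reuse. A minimal caller-side sketch of the new API, with preemption disabled as the function's comment requires; the example_* names are illustrative:

#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/preempt.h>

static void example_ipi_func(void *info)
{
	/* Runs on each targeted CPU from IPI context: fast and non-blocking. */
}

static void example_run_on_other_cpus(void)
{
	preempt_disable();	/* required around smp_call_function_many() */
	/* wait == true: return only after every targeted CPU ran the function. */
	smp_call_function_many(cpu_online_mask, example_ipi_func, NULL, true);
	preempt_enable();
}
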
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 466e75ce271a..bdbe9de9cd8d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -733,7 +733,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
733 break; 733 break;
734 /* Unbind so it can run. Fall thru. */ 734 /* Unbind so it can run. Fall thru. */
735 kthread_bind(per_cpu(ksoftirqd, hotcpu), 735 kthread_bind(per_cpu(ksoftirqd, hotcpu),
736 any_online_cpu(cpu_online_map)); 736 cpumask_any(cpu_online_mask));
737 case CPU_DEAD: 737 case CPU_DEAD:
738 case CPU_DEAD_FROZEN: { 738 case CPU_DEAD_FROZEN: {
739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 739 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -784,3 +784,23 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
784} 784}
785EXPORT_SYMBOL(on_each_cpu); 785EXPORT_SYMBOL(on_each_cpu);
786#endif 786#endif
787
788/*
789 * [ These __weak aliases are kept in a separate compilation unit, so that
790 * GCC does not inline them incorrectly. ]
791 */
792
793int __init __weak early_irq_init(void)
794{
795 return 0;
796}
797
798int __init __weak arch_early_irq_init(void)
799{
800 return 0;
801}
802
803int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
804{
805 return 0;
806}
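
The three stubs added to softirq.c are weak symbols: they provide default no-op implementations of early_irq_init(), arch_early_irq_init() and arch_init_chip_data() for configurations that do not supply their own, and the comment explains why they live in a separate compilation unit (a weak default must not be inlined into its callers). A tiny sketch of the weak-default / strong-override idiom with made-up names; the override is shown in a comment because it would live in a different source file:

#include <linux/compiler.h>

/* default.c: the fallback used when nothing else defines the symbol. */
int __weak example_arch_hook(void)
{
	return 0;
}

/*
 * arch.c: an architecture overrides it simply by providing a strong definition:
 *
 *	int example_arch_hook(void)
 *	{
 *		... arch-specific setup ...
 *		return 1;
 *	}
 */
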
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 1ab790c67b17..85d5a2455103 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
16#include <linux/lockdep.h> 16#include <linux/lockdep.h>
17#include <linux/notifier.h> 17#include <linux/notifier.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/sysctl.h>
19 20
20#include <asm/irq_regs.h> 21#include <asm/irq_regs.h>
21 22
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
88} 89}
89EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
90 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer,
94 size_t *lenp, loff_t *ppos)
95{
96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
98}
99
91/* 100/*
92 * This callback runs from the timer interrupt, and checks 101 * This callback runs from the timer interrupt, and checks
93 * whether the watchdog thread has hung or not: 102 * whether the watchdog thread has hung or not:
@@ -303,17 +312,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
303 break; 312 break;
304 case CPU_ONLINE: 313 case CPU_ONLINE:
305 case CPU_ONLINE_FROZEN: 314 case CPU_ONLINE_FROZEN:
306 check_cpu = any_online_cpu(cpu_online_map); 315 check_cpu = cpumask_any(cpu_online_mask);
307 wake_up_process(per_cpu(watchdog_task, hotcpu)); 316 wake_up_process(per_cpu(watchdog_task, hotcpu));
308 break; 317 break;
309#ifdef CONFIG_HOTPLUG_CPU 318#ifdef CONFIG_HOTPLUG_CPU
310 case CPU_DOWN_PREPARE: 319 case CPU_DOWN_PREPARE:
311 case CPU_DOWN_PREPARE_FROZEN: 320 case CPU_DOWN_PREPARE_FROZEN:
312 if (hotcpu == check_cpu) { 321 if (hotcpu == check_cpu) {
313 cpumask_t temp_cpu_online_map = cpu_online_map; 322 /* Pick any other online cpu. */
314 323 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
315 cpu_clear(hotcpu, temp_cpu_online_map);
316 check_cpu = any_online_cpu(temp_cpu_online_map);
317 } 324 }
318 break; 325 break;
319 326
@@ -323,7 +330,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
323 break; 330 break;
324 /* Unbind so it can run. Fall thru. */ 331 /* Unbind so it can run. Fall thru. */
325 kthread_bind(per_cpu(watchdog_task, hotcpu), 332 kthread_bind(per_cpu(watchdog_task, hotcpu),
326 any_online_cpu(cpu_online_map)); 333 cpumask_any(cpu_online_mask));
327 case CPU_DEAD: 334 case CPU_DEAD:
328 case CPU_DEAD_FROZEN: 335 case CPU_DEAD_FROZEN:
329 p = per_cpu(watchdog_task, hotcpu); 336 p = per_cpu(watchdog_task, hotcpu);
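
proc_dosoftlockup_thresh() simply touches every watchdog before handing the write to proc_dointvec_minmax(), so lowering softlockup_thresh cannot immediately trip on a stale timestamp. For context, a sketch of how such a handler is typically wired into a sysctl table of this kernel generation; the table below is an illustration, not the actual kernel/sysctl.c entry, and the exact field set (.procname, .data, .proc_handler, .extra1/.extra2 bounds) is an assumption about that era's struct ctl_table:

#include <linux/sysctl.h>
#include <linux/fs.h>

extern int softlockup_thresh;
extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
				    struct file *filp, void __user *buffer,
				    size_t *lenp, loff_t *ppos);

static int example_neg_one = -1;	/* -1 disables the detector */
static int example_sixty = 60;

static struct ctl_table example_softlockup_table[] = {
	{
		.procname	= "softlockup_thresh",
		.data		= &softlockup_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dosoftlockup_thresh,
		.extra1		= &example_neg_one,
		.extra2		= &example_sixty,
	},
	{}
};
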
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 24e8ceacc388..0cd415ee62a2 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -38,7 +38,10 @@ struct stop_machine_data {
38static unsigned int num_threads; 38static unsigned int num_threads;
39static atomic_t thread_ack; 39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock); 40static DEFINE_MUTEX(lock);
41 41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
42static struct workqueue_struct *stop_machine_wq; 45static struct workqueue_struct *stop_machine_wq;
43static struct stop_machine_data active, idle; 46static struct stop_machine_data active, idle;
44static const cpumask_t *active_cpus; 47static const cpumask_t *active_cpus;
@@ -69,10 +72,10 @@ static void stop_cpu(struct work_struct *unused)
69 int err; 72 int err;
70 73
71 if (!active_cpus) { 74 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map)) 75 if (cpu == cpumask_first(cpu_online_mask))
73 smdata = &active; 76 smdata = &active;
74 } else { 77 } else {
75 if (cpu_isset(cpu, *active_cpus)) 78 if (cpumask_test_cpu(cpu, active_cpus))
76 smdata = &active; 79 smdata = &active;
77 } 80 }
78 /* Simple state machine */ 81 /* Simple state machine */
@@ -109,7 +112,44 @@ static int chill(void *unused)
109 return 0; 112 return 0;
110} 113}
111 114
112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
113{ 153{
114 struct work_struct *sm_work; 154 struct work_struct *sm_work;
115 int i, ret; 155 int i, ret;
@@ -142,23 +182,18 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
142 return ret; 182 return ret;
143} 183}
144 184
145int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) 185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
146{ 186{
147 int ret; 187 int ret;
148 188
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
149 /* No CPUs can come up or down during this. */ 192 /* No CPUs can come up or down during this. */
150 get_online_cpus(); 193 get_online_cpus();
151 ret = __stop_machine(fn, data, cpus); 194 ret = __stop_machine(fn, data, cpus);
152 put_online_cpus(); 195 put_online_cpus();
153 196 stop_machine_destroy();
154 return ret; 197 return ret;
155} 198}
156EXPORT_SYMBOL_GPL(stop_machine); 199EXPORT_SYMBOL_GPL(stop_machine);
157
158static int __init stop_machine_init(void)
159{
160 stop_machine_wq = create_rt_workqueue("kstop");
161 stop_machine_work = alloc_percpu(struct work_struct);
162 return 0;
163}
164core_initcall(stop_machine_init);
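
stop_machine's RT workqueue and per-CPU work items are no longer created unconditionally by an initcall; they are refcounted through stop_machine_create()/stop_machine_destroy(), and stop_machine() itself now brackets __stop_machine() with that pair. A caller that cannot afford an allocation failure at the point of the actual call pre-creates the infrastructure earlier; a minimal sketch with illustrative names:

#include <linux/stop_machine.h>
#include <linux/errno.h>

static int example_stopped_fn(void *data)
{
	/* Runs while the remaining CPUs are held with interrupts disabled. */
	return 0;
}

static int example_user(void)
{
	int err;

	err = stop_machine_create();		/* may return -ENOMEM */
	if (err)
		return err;

	/* Workqueue already exists, so this call cannot fail on allocation. */
	err = stop_machine(example_stopped_fn, NULL, NULL);	/* NULL: run on one CPU */

	stop_machine_destroy();
	return err;
}
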
diff --git a/kernel/sys.c b/kernel/sys.c
index d356d79e84ac..f145c415bc16 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -33,6 +33,7 @@
33#include <linux/task_io_accounting_ops.h> 33#include <linux/task_io_accounting_ops.h>
34#include <linux/seccomp.h> 34#include <linux/seccomp.h>
35#include <linux/cpu.h> 35#include <linux/cpu.h>
36#include <linux/ptrace.h>
36 37
37#include <linux/compat.h> 38#include <linux/compat.h>
38#include <linux/syscalls.h> 39#include <linux/syscalls.h>
@@ -142,7 +143,7 @@ out:
142 return error; 143 return error;
143} 144}
144 145
145asmlinkage long sys_setpriority(int which, int who, int niceval) 146SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
146{ 147{
147 struct task_struct *g, *p; 148 struct task_struct *g, *p;
148 struct user_struct *user; 149 struct user_struct *user;
@@ -207,7 +208,7 @@ out:
207 * has been offset by 20 (ie it returns 40..1 instead of -20..19) 208 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
208 * to stay compatible. 209 * to stay compatible.
209 */ 210 */
210asmlinkage long sys_getpriority(int which, int who) 211SYSCALL_DEFINE2(getpriority, int, which, int, who)
211{ 212{
212 struct task_struct *g, *p; 213 struct task_struct *g, *p;
213 struct user_struct *user; 214 struct user_struct *user;
@@ -354,7 +355,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
354 * 355 *
355 * reboot doesn't sync: do that yourself before calling this. 356 * reboot doesn't sync: do that yourself before calling this.
356 */ 357 */
357asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg) 358SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
359 void __user *, arg)
358{ 360{
359 char buffer[256]; 361 char buffer[256];
360 362
@@ -477,7 +479,7 @@ void ctrl_alt_del(void)
477 * SMP: There are not races, the GIDs are checked only by filesystem 479 * SMP: There are not races, the GIDs are checked only by filesystem
478 * operations (as far as semantic preservation is concerned). 480 * operations (as far as semantic preservation is concerned).
479 */ 481 */
480asmlinkage long sys_setregid(gid_t rgid, gid_t egid) 482SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
481{ 483{
482 const struct cred *old; 484 const struct cred *old;
483 struct cred *new; 485 struct cred *new;
@@ -528,7 +530,7 @@ error:
528 * 530 *
529 * SMP: Same implicit races as above. 531 * SMP: Same implicit races as above.
530 */ 532 */
531asmlinkage long sys_setgid(gid_t gid) 533SYSCALL_DEFINE1(setgid, gid_t, gid)
532{ 534{
533 const struct cred *old; 535 const struct cred *old;
534 struct cred *new; 536 struct cred *new;
@@ -596,7 +598,7 @@ static int set_user(struct cred *new)
596 * 100% compatible with BSD. A program which uses just setuid() will be 598 * 100% compatible with BSD. A program which uses just setuid() will be
597 * 100% compatible with POSIX with saved IDs. 599 * 100% compatible with POSIX with saved IDs.
598 */ 600 */
599asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) 601SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
600{ 602{
601 const struct cred *old; 603 const struct cred *old;
602 struct cred *new; 604 struct cred *new;
@@ -660,7 +662,7 @@ error:
660 * will allow a root program to temporarily drop privileges and be able to 662 * will allow a root program to temporarily drop privileges and be able to
661 * regain them by swapping the real and effective uid. 663 * regain them by swapping the real and effective uid.
662 */ 664 */
663asmlinkage long sys_setuid(uid_t uid) 665SYSCALL_DEFINE1(setuid, uid_t, uid)
664{ 666{
665 const struct cred *old; 667 const struct cred *old;
666 struct cred *new; 668 struct cred *new;
@@ -704,7 +706,7 @@ error:
704 * This function implements a generic ability to update ruid, euid, 706 * This function implements a generic ability to update ruid, euid,
705 * and suid. This allows you to implement the 4.4 compatible seteuid(). 707 * and suid. This allows you to implement the 4.4 compatible seteuid().
706 */ 708 */
707asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) 709SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
708{ 710{
709 const struct cred *old; 711 const struct cred *old;
710 struct cred *new; 712 struct cred *new;
@@ -755,7 +757,7 @@ error:
755 return retval; 757 return retval;
756} 758}
757 759
758asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid) 760SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid)
759{ 761{
760 const struct cred *cred = current_cred(); 762 const struct cred *cred = current_cred();
761 int retval; 763 int retval;
@@ -770,7 +772,7 @@ asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __us
770/* 772/*
771 * Same as above, but for rgid, egid, sgid. 773 * Same as above, but for rgid, egid, sgid.
772 */ 774 */
773asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) 775SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
774{ 776{
775 const struct cred *old; 777 const struct cred *old;
776 struct cred *new; 778 struct cred *new;
@@ -813,7 +815,7 @@ error:
813 return retval; 815 return retval;
814} 816}
815 817
816asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid) 818SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid)
817{ 819{
818 const struct cred *cred = current_cred(); 820 const struct cred *cred = current_cred();
819 int retval; 821 int retval;
@@ -832,7 +834,7 @@ asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __us
832 * whatever uid it wants to). It normally shadows "euid", except when 834 * whatever uid it wants to). It normally shadows "euid", except when
833 * explicitly set by setfsuid() or for access.. 835 * explicitly set by setfsuid() or for access..
834 */ 836 */
835asmlinkage long sys_setfsuid(uid_t uid) 837SYSCALL_DEFINE1(setfsuid, uid_t, uid)
836{ 838{
837 const struct cred *old; 839 const struct cred *old;
838 struct cred *new; 840 struct cred *new;
@@ -869,7 +871,7 @@ change_okay:
869/* 871/*
870 * Samma på svenska.. 872 * Samma på svenska..
871 */ 873 */
872asmlinkage long sys_setfsgid(gid_t gid) 874SYSCALL_DEFINE1(setfsgid, gid_t, gid)
873{ 875{
874 const struct cred *old; 876 const struct cred *old;
875 struct cred *new; 877 struct cred *new;
@@ -918,7 +920,7 @@ void do_sys_times(struct tms *tms)
918 tms->tms_cstime = cputime_to_clock_t(cstime); 920 tms->tms_cstime = cputime_to_clock_t(cstime);
919} 921}
920 922
921asmlinkage long sys_times(struct tms __user * tbuf) 923SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
922{ 924{
923 if (tbuf) { 925 if (tbuf) {
924 struct tms tmp; 926 struct tms tmp;
@@ -927,6 +929,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
927 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 929 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
928 return -EFAULT; 930 return -EFAULT;
929 } 931 }
932 force_successful_syscall_return();
930 return (long) jiffies_64_to_clock_t(get_jiffies_64()); 933 return (long) jiffies_64_to_clock_t(get_jiffies_64());
931} 934}
932 935
@@ -942,7 +945,7 @@ asmlinkage long sys_times(struct tms __user * tbuf)
942 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. 945 * Auch. Had to add the 'did_exec' flag to conform completely to POSIX.
943 * LBT 04.03.94 946 * LBT 04.03.94
944 */ 947 */
945asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) 948SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
946{ 949{
947 struct task_struct *p; 950 struct task_struct *p;
948 struct task_struct *group_leader = current->group_leader; 951 struct task_struct *group_leader = current->group_leader;
@@ -1013,7 +1016,7 @@ out:
1013 return err; 1016 return err;
1014} 1017}
1015 1018
1016asmlinkage long sys_getpgid(pid_t pid) 1019SYSCALL_DEFINE1(getpgid, pid_t, pid)
1017{ 1020{
1018 struct task_struct *p; 1021 struct task_struct *p;
1019 struct pid *grp; 1022 struct pid *grp;
@@ -1043,14 +1046,14 @@ out:
1043 1046
1044#ifdef __ARCH_WANT_SYS_GETPGRP 1047#ifdef __ARCH_WANT_SYS_GETPGRP
1045 1048
1046asmlinkage long sys_getpgrp(void) 1049SYSCALL_DEFINE0(getpgrp)
1047{ 1050{
1048 return sys_getpgid(0); 1051 return sys_getpgid(0);
1049} 1052}
1050 1053
1051#endif 1054#endif
1052 1055
1053asmlinkage long sys_getsid(pid_t pid) 1056SYSCALL_DEFINE1(getsid, pid_t, pid)
1054{ 1057{
1055 struct task_struct *p; 1058 struct task_struct *p;
1056 struct pid *sid; 1059 struct pid *sid;
@@ -1078,7 +1081,7 @@ out:
1078 return retval; 1081 return retval;
1079} 1082}
1080 1083
1081asmlinkage long sys_setsid(void) 1084SYSCALL_DEFINE0(setsid)
1082{ 1085{
1083 struct task_struct *group_leader = current->group_leader; 1086 struct task_struct *group_leader = current->group_leader;
1084 struct pid *sid = task_pid(group_leader); 1087 struct pid *sid = task_pid(group_leader);
@@ -1309,7 +1312,7 @@ int set_current_groups(struct group_info *group_info)
1309 1312
1310EXPORT_SYMBOL(set_current_groups); 1313EXPORT_SYMBOL(set_current_groups);
1311 1314
1312asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist) 1315SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1313{ 1316{
1314 const struct cred *cred = current_cred(); 1317 const struct cred *cred = current_cred();
1315 int i; 1318 int i;
@@ -1338,7 +1341,7 @@ out:
1338 * without another task interfering. 1341 * without another task interfering.
1339 */ 1342 */
1340 1343
1341asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist) 1344SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1342{ 1345{
1343 struct group_info *group_info; 1346 struct group_info *group_info;
1344 int retval; 1347 int retval;
@@ -1392,7 +1395,7 @@ EXPORT_SYMBOL(in_egroup_p);
1392 1395
1393DECLARE_RWSEM(uts_sem); 1396DECLARE_RWSEM(uts_sem);
1394 1397
1395asmlinkage long sys_newuname(struct new_utsname __user * name) 1398SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1396{ 1399{
1397 int errno = 0; 1400 int errno = 0;
1398 1401
@@ -1403,7 +1406,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name)
1403 return errno; 1406 return errno;
1404} 1407}
1405 1408
1406asmlinkage long sys_sethostname(char __user *name, int len) 1409SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1407{ 1410{
1408 int errno; 1411 int errno;
1409 char tmp[__NEW_UTS_LEN]; 1412 char tmp[__NEW_UTS_LEN];
@@ -1427,7 +1430,7 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1427 1430
1428#ifdef __ARCH_WANT_SYS_GETHOSTNAME 1431#ifdef __ARCH_WANT_SYS_GETHOSTNAME
1429 1432
1430asmlinkage long sys_gethostname(char __user *name, int len) 1433SYSCALL_DEFINE2(gethostname, char __user *, name, int, len)
1431{ 1434{
1432 int i, errno; 1435 int i, errno;
1433 struct new_utsname *u; 1436 struct new_utsname *u;
@@ -1452,7 +1455,7 @@ asmlinkage long sys_gethostname(char __user *name, int len)
1452 * Only setdomainname; getdomainname can be implemented by calling 1455 * Only setdomainname; getdomainname can be implemented by calling
1453 * uname() 1456 * uname()
1454 */ 1457 */
1455asmlinkage long sys_setdomainname(char __user *name, int len) 1458SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1456{ 1459{
1457 int errno; 1460 int errno;
1458 char tmp[__NEW_UTS_LEN]; 1461 char tmp[__NEW_UTS_LEN];
@@ -1475,7 +1478,7 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1475 return errno; 1478 return errno;
1476} 1479}
1477 1480
1478asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim) 1481SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1479{ 1482{
1480 if (resource >= RLIM_NLIMITS) 1483 if (resource >= RLIM_NLIMITS)
1481 return -EINVAL; 1484 return -EINVAL;
@@ -1494,7 +1497,8 @@ asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim)
1494 * Back compatibility for getrlimit. Needed for some apps. 1497 * Back compatibility for getrlimit. Needed for some apps.
1495 */ 1498 */
1496 1499
1497asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim) 1500SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1501 struct rlimit __user *, rlim)
1498{ 1502{
1499 struct rlimit x; 1503 struct rlimit x;
1500 if (resource >= RLIM_NLIMITS) 1504 if (resource >= RLIM_NLIMITS)
@@ -1512,7 +1516,7 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1512 1516
1513#endif 1517#endif
1514 1518
1515asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1519SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1516{ 1520{
1517 struct rlimit new_rlim, *old_rlim; 1521 struct rlimit new_rlim, *old_rlim;
1518 int retval; 1522 int retval;
@@ -1521,22 +1525,14 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1521 return -EINVAL; 1525 return -EINVAL;
1522 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1526 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1523 return -EFAULT; 1527 return -EFAULT;
1528 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1529 return -EINVAL;
1524 old_rlim = current->signal->rlim + resource; 1530 old_rlim = current->signal->rlim + resource;
1525 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1531 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1526 !capable(CAP_SYS_RESOURCE)) 1532 !capable(CAP_SYS_RESOURCE))
1527 return -EPERM; 1533 return -EPERM;
1528 1534 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
1529 if (resource == RLIMIT_NOFILE) { 1535 return -EPERM;
1530 if (new_rlim.rlim_max == RLIM_INFINITY)
1531 new_rlim.rlim_max = sysctl_nr_open;
1532 if (new_rlim.rlim_cur == RLIM_INFINITY)
1533 new_rlim.rlim_cur = sysctl_nr_open;
1534 if (new_rlim.rlim_max > sysctl_nr_open)
1535 return -EPERM;
1536 }
1537
1538 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1539 return -EINVAL;
1540 1536
1541 retval = security_task_setrlimit(resource, &new_rlim); 1537 retval = security_task_setrlimit(resource, &new_rlim);
1542 if (retval) 1538 if (retval)
@@ -1627,6 +1623,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1627 utime = stime = cputime_zero; 1623 utime = stime = cputime_zero;
1628 1624
1629 if (who == RUSAGE_THREAD) { 1625 if (who == RUSAGE_THREAD) {
1626 utime = task_utime(current);
1627 stime = task_stime(current);
1630 accumulate_thread_rusage(p, r); 1628 accumulate_thread_rusage(p, r);
1631 goto out; 1629 goto out;
1632 } 1630 }
@@ -1683,7 +1681,7 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1683 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1681 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1684} 1682}
1685 1683
1686asmlinkage long sys_getrusage(int who, struct rusage __user *ru) 1684SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
1687{ 1685{
1688 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && 1686 if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
1689 who != RUSAGE_THREAD) 1687 who != RUSAGE_THREAD)
@@ -1691,14 +1689,14 @@ asmlinkage long sys_getrusage(int who, struct rusage __user *ru)
1691 return getrusage(current, who, ru); 1689 return getrusage(current, who, ru);
1692} 1690}
1693 1691
1694asmlinkage long sys_umask(int mask) 1692SYSCALL_DEFINE1(umask, int, mask)
1695{ 1693{
1696 mask = xchg(&current->fs->umask, mask & S_IRWXUGO); 1694 mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
1697 return mask; 1695 return mask;
1698} 1696}
1699 1697
1700asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1698SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1701 unsigned long arg4, unsigned long arg5) 1699 unsigned long, arg4, unsigned long, arg5)
1702{ 1700{
1703 struct task_struct *me = current; 1701 struct task_struct *me = current;
1704 unsigned char comm[sizeof(me->comm)]; 1702 unsigned char comm[sizeof(me->comm)];
@@ -1811,8 +1809,8 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1811 return error; 1809 return error;
1812} 1810}
1813 1811
1814asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, 1812SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1815 struct getcpu_cache __user *unused) 1813 struct getcpu_cache __user *, unused)
1816{ 1814{
1817 int err = 0; 1815 int err = 0;
1818 int cpu = raw_smp_processor_id(); 1816 int cpu = raw_smp_processor_id();
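
The kernel/sys.c hunks above replace open-coded `asmlinkage long sys_*()` definitions with SYSCALL_DEFINEn() wrappers. Below is a minimal, userspace-compilable sketch of the underlying idea only: every argument crosses the syscall boundary as a long and is cast back to its declared type inside a generated wrapper. This is not the kernel's actual macro, which also emits metadata and per-architecture aliases; MY_SYSCALL_DEFINE2 and add_numbers are made up for illustration.

    /* Simplified sketch of what a SYSCALL_DEFINE2-style wrapper boils down to. */
    #include <stdio.h>

    #define MY_SYSCALL_DEFINE2(name, t1, a1, t2, a2)                \
            static long do_##name(t1 a1, t2 a2);                    \
            long sys_##name(long a1, long a2)                       \
            {                                                       \
                    return do_##name((t1)a1, (t2)a2);               \
            }                                                       \
            static long do_##name(t1 a1, t2 a2)

    /* Hypothetical syscall body used only for this illustration. */
    MY_SYSCALL_DEFINE2(add_numbers, int, x, int, y)
    {
            return x + y;
    }

    int main(void)
    {
            /* Call through the generated long-typed entry point. */
            printf("%ld\n", sys_add_numbers(2, 40));   /* prints 42 */
            return 0;
    }
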
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a23281707..27dad2967387 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -131,6 +131,7 @@ cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit); 131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel); 132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents); 133cond_syscall(sys_io_getevents);
134cond_syscall(sys_syslog);
134 135
135/* arch-specific weak syscall entries */ 136/* arch-specific weak syscall entries */
136cond_syscall(sys_pciconfig_read); 137cond_syscall(sys_pciconfig_read);
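
The new cond_syscall(sys_syslog) line handles the case where the syslog syscall may be built conditionally: cond_syscall() emits the symbol as a weak alias of sys_ni_syscall(), so the syscall table still links and an unimplemented call returns -ENOSYS. A rough userspace sketch of the same weak-alias trick, using GCC/Clang attributes rather than the kernel's asm-based macro; my_syscall and not_implemented are illustrative names only.

    #include <stdio.h>
    #include <errno.h>

    /* Fallback used when no strong definition of my_syscall() is linked in. */
    long not_implemented(void)
    {
            return -ENOSYS;
    }

    /* Weak alias: a strong my_syscall() elsewhere silently overrides this. */
    long my_syscall(void) __attribute__((weak, alias("not_implemented")));

    int main(void)
    {
            printf("my_syscall() -> %ld\n", my_syscall());  /* -ENOSYS unless overridden */
            return 0;
    }
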
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ff6d45c7626f..790f9d785663 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,15 +82,14 @@ extern int percpu_pagelist_fraction;
82extern int compat_log; 82extern int compat_log;
83extern int latencytop_enabled; 83extern int latencytop_enabled;
84extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
85#ifndef CONFIG_MMU
86extern int sysctl_nr_trim_pages;
87#endif
85#ifdef CONFIG_RCU_TORTURE_TEST 88#ifdef CONFIG_RCU_TORTURE_TEST
86extern int rcutorture_runnable; 89extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 90#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
88 91
89/* Constants used for minimum and maximum */ 92/* Constants used for minimum and maximum */
90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
91static int one = 1;
92#endif
93
94#ifdef CONFIG_DETECT_SOFTLOCKUP 93#ifdef CONFIG_DETECT_SOFTLOCKUP
95static int sixty = 60; 94static int sixty = 60;
96static int neg_one = -1; 95static int neg_one = -1;
@@ -101,6 +100,7 @@ static int two = 2;
101#endif 100#endif
102 101
103static int zero; 102static int zero;
103static int one = 1;
104static int one_hundred = 100; 104static int one_hundred = 100;
105 105
106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -144,6 +144,7 @@ extern int acct_parm[];
144 144
145#ifdef CONFIG_IA64 145#ifdef CONFIG_IA64
146extern int no_unaligned_warning; 146extern int no_unaligned_warning;
147extern int unaligned_dump_stack;
147#endif 148#endif
148 149
149#ifdef CONFIG_RT_MUTEXES 150#ifdef CONFIG_RT_MUTEXES
@@ -781,6 +782,14 @@ static struct ctl_table kern_table[] = {
781 .mode = 0644, 782 .mode = 0644,
782 .proc_handler = &proc_dointvec, 783 .proc_handler = &proc_dointvec,
783 }, 784 },
785 {
786 .ctl_name = CTL_UNNUMBERED,
787 .procname = "unaligned-dump-stack",
788 .data = &unaligned_dump_stack,
789 .maxlen = sizeof (int),
790 .mode = 0644,
791 .proc_handler = &proc_dointvec,
792 },
784#endif 793#endif
785#ifdef CONFIG_DETECT_SOFTLOCKUP 794#ifdef CONFIG_DETECT_SOFTLOCKUP
786 { 795 {
@@ -800,7 +809,7 @@ static struct ctl_table kern_table[] = {
800 .data = &softlockup_thresh, 809 .data = &softlockup_thresh,
801 .maxlen = sizeof(int), 810 .maxlen = sizeof(int),
802 .mode = 0644, 811 .mode = 0644,
803 .proc_handler = &proc_dointvec_minmax, 812 .proc_handler = &proc_dosoftlockup_thresh,
804 .strategy = &sysctl_intvec, 813 .strategy = &sysctl_intvec,
805 .extra1 = &neg_one, 814 .extra1 = &neg_one,
806 .extra2 = &sixty, 815 .extra2 = &sixty,
@@ -952,12 +961,22 @@ static struct ctl_table vm_table[] = {
952 .data = &dirty_background_ratio, 961 .data = &dirty_background_ratio,
953 .maxlen = sizeof(dirty_background_ratio), 962 .maxlen = sizeof(dirty_background_ratio),
954 .mode = 0644, 963 .mode = 0644,
955 .proc_handler = &proc_dointvec_minmax, 964 .proc_handler = &dirty_background_ratio_handler,
956 .strategy = &sysctl_intvec, 965 .strategy = &sysctl_intvec,
957 .extra1 = &zero, 966 .extra1 = &zero,
958 .extra2 = &one_hundred, 967 .extra2 = &one_hundred,
959 }, 968 },
960 { 969 {
970 .ctl_name = CTL_UNNUMBERED,
971 .procname = "dirty_background_bytes",
972 .data = &dirty_background_bytes,
973 .maxlen = sizeof(dirty_background_bytes),
974 .mode = 0644,
975 .proc_handler = &dirty_background_bytes_handler,
976 .strategy = &sysctl_intvec,
977 .extra1 = &one,
978 },
979 {
961 .ctl_name = VM_DIRTY_RATIO, 980 .ctl_name = VM_DIRTY_RATIO,
962 .procname = "dirty_ratio", 981 .procname = "dirty_ratio",
963 .data = &vm_dirty_ratio, 982 .data = &vm_dirty_ratio,
@@ -969,6 +988,16 @@ static struct ctl_table vm_table[] = {
969 .extra2 = &one_hundred, 988 .extra2 = &one_hundred,
970 }, 989 },
971 { 990 {
991 .ctl_name = CTL_UNNUMBERED,
992 .procname = "dirty_bytes",
993 .data = &vm_dirty_bytes,
994 .maxlen = sizeof(vm_dirty_bytes),
995 .mode = 0644,
996 .proc_handler = &dirty_bytes_handler,
997 .strategy = &sysctl_intvec,
998 .extra1 = &one,
999 },
1000 {
972 .procname = "dirty_writeback_centisecs", 1001 .procname = "dirty_writeback_centisecs",
973 .data = &dirty_writeback_interval, 1002 .data = &dirty_writeback_interval,
974 .maxlen = sizeof(dirty_writeback_interval), 1003 .maxlen = sizeof(dirty_writeback_interval),
@@ -1085,6 +1114,17 @@ static struct ctl_table vm_table[] = {
1085 .mode = 0644, 1114 .mode = 0644,
1086 .proc_handler = &proc_dointvec 1115 .proc_handler = &proc_dointvec
1087 }, 1116 },
1117#else
1118 {
1119 .ctl_name = CTL_UNNUMBERED,
1120 .procname = "nr_trim_pages",
1121 .data = &sysctl_nr_trim_pages,
1122 .maxlen = sizeof(sysctl_nr_trim_pages),
1123 .mode = 0644,
1124 .proc_handler = &proc_dointvec_minmax,
1125 .strategy = &sysctl_intvec,
1126 .extra1 = &zero,
1127 },
1088#endif 1128#endif
1089 { 1129 {
1090 .ctl_name = VM_LAPTOP_MODE, 1130 .ctl_name = VM_LAPTOP_MODE,
@@ -1657,7 +1697,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1657 return error; 1697 return error;
1658} 1698}
1659 1699
1660asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 1700SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
1661{ 1701{
1662 struct __sysctl_args tmp; 1702 struct __sysctl_args tmp;
1663 int error; 1703 int error;
@@ -2958,7 +2998,7 @@ int sysctl_ms_jiffies(struct ctl_table *table,
2958#else /* CONFIG_SYSCTL_SYSCALL */ 2998#else /* CONFIG_SYSCTL_SYSCALL */
2959 2999
2960 3000
2961asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 3001SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
2962{ 3002{
2963 struct __sysctl_args tmp; 3003 struct __sysctl_args tmp;
2964 int error; 3004 int error;
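
The sysctl table additions above all follow the same shape: a CTL_UNNUMBERED entry (no stable binary sysctl number, /proc/sys name only) whose handler clamps the value through extra1/extra2. The fragment below is a hedged sketch of how such an integer knob is typically wired up from a module against the sysctl API of this era; all example_* names are made up for illustration.

    #include <linux/sysctl.h>
    #include <linux/module.h>
    #include <linux/errno.h>

    static int example_knob = 1;        /* the value exposed under /proc/sys */
    static int example_min;             /* bounds enforced via extra1/extra2 */
    static int example_max = 100;

    static struct ctl_table example_table[] = {
            {
                    .ctl_name       = CTL_UNNUMBERED,  /* no binary sysctl number */
                    .procname       = "example_knob",
                    .data           = &example_knob,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = &proc_dointvec_minmax,
                    .strategy       = &sysctl_intvec,
                    .extra1         = &example_min,
                    .extra2         = &example_max,
            },
            { .ctl_name = 0 }
    };

    static struct ctl_table example_root[] = {
            {
                    .ctl_name       = CTL_UNNUMBERED,
                    .procname       = "example",       /* /proc/sys/example/ */
                    .mode           = 0555,
                    .child          = example_table,
            },
            { .ctl_name = 0 }
    };

    static struct ctl_table_header *example_header;

    static int __init example_init(void)
    {
            example_header = register_sysctl_table(example_root);
            return example_header ? 0 : -ENOMEM;
    }

    static void __exit example_exit(void)
    {
            unregister_sysctl_table(example_header);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");
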
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index bd6be76303cf..888adbcca30c 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -290,18 +290,17 @@ ret:
290 return; 290 return;
291} 291}
292 292
293static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) 293static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
294{ 294{
295 struct listener_list *listeners; 295 struct listener_list *listeners;
296 struct listener *s, *tmp; 296 struct listener *s, *tmp;
297 unsigned int cpu; 297 unsigned int cpu;
298 cpumask_t mask = *maskp;
299 298
300 if (!cpus_subset(mask, cpu_possible_map)) 299 if (!cpumask_subset(mask, cpu_possible_mask))
301 return -EINVAL; 300 return -EINVAL;
302 301
303 if (isadd == REGISTER) { 302 if (isadd == REGISTER) {
304 for_each_cpu_mask_nr(cpu, mask) { 303 for_each_cpu(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 304 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 305 cpu_to_node(cpu));
307 if (!s) 306 if (!s)
@@ -320,7 +319,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 319
321 /* Deregister or cleanup */ 320 /* Deregister or cleanup */
322cleanup: 321cleanup:
323 for_each_cpu_mask_nr(cpu, mask) { 322 for_each_cpu(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 323 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 324 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 325 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
@@ -335,7 +334,7 @@ cleanup:
335 return 0; 334 return 0;
336} 335}
337 336
338static int parse(struct nlattr *na, cpumask_t *mask) 337static int parse(struct nlattr *na, struct cpumask *mask)
339{ 338{
340 char *data; 339 char *data;
341 int len; 340 int len;
@@ -352,7 +351,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
352 if (!data) 351 if (!data)
353 return -ENOMEM; 352 return -ENOMEM;
354 nla_strlcpy(data, na, len); 353 nla_strlcpy(data, na, len);
355 ret = cpulist_parse(data, *mask); 354 ret = cpulist_parse(data, mask);
356 kfree(data); 355 kfree(data);
357 return ret; 356 return ret;
358} 357}
@@ -428,23 +427,33 @@ err:
428 427
429static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) 428static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
430{ 429{
431 int rc = 0; 430 int rc;
432 struct sk_buff *rep_skb; 431 struct sk_buff *rep_skb;
433 struct taskstats *stats; 432 struct taskstats *stats;
434 size_t size; 433 size_t size;
435 cpumask_t mask; 434 cpumask_var_t mask;
435
436 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
437 return -ENOMEM;
436 438
437 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); 439 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
438 if (rc < 0) 440 if (rc < 0)
439 return rc; 441 goto free_return_rc;
440 if (rc == 0) 442 if (rc == 0) {
441 return add_del_listener(info->snd_pid, &mask, REGISTER); 443 rc = add_del_listener(info->snd_pid, mask, REGISTER);
444 goto free_return_rc;
445 }
442 446
443 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); 447 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
444 if (rc < 0) 448 if (rc < 0)
449 goto free_return_rc;
450 if (rc == 0) {
451 rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
452free_return_rc:
453 free_cpumask_var(mask);
445 return rc; 454 return rc;
446 if (rc == 0) 455 }
447 return add_del_listener(info->snd_pid, &mask, DEREGISTER); 456 free_cpumask_var(mask);
448 457
449 /* 458 /*
450 * Size includes space for nested attributes 459 * Size includes space for nested attributes
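
The taskstats change swaps a stack-allocated cpumask_t for cpumask_var_t, which is either an on-stack mask or a heap allocation depending on CONFIG_CPUMASK_OFFSTACK, and frees it on every exit path. A minimal sketch of that allocate/parse/free pattern in isolation; the function name is made up.

    #include <linux/cpumask.h>
    #include <linux/slab.h>
    #include <linux/errno.h>

    static int count_possible_in(const char *list)
    {
            cpumask_var_t mask;
            int ret;

            if (!alloc_cpumask_var(&mask, GFP_KERNEL))
                    return -ENOMEM;

            ret = cpulist_parse(list, mask);        /* e.g. "0-3,8" */
            if (ret < 0)
                    goto out;

            if (!cpumask_subset(mask, cpu_possible_mask)) {
                    ret = -EINVAL;
                    goto out;
            }

            ret = cpumask_weight(mask);             /* number of requested CPUs */
    out:
            free_cpumask_var(mask);
            return ret;
    }
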
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 06b6395b45b2..4f104515a19b 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,21 +22,11 @@
22 22
23static u32 rand1, preh_val, posth_val, jph_val; 23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests; 24static int errors, handler_errors, num_tests;
25static u32 (*target)(u32 value);
26static u32 (*target2)(u32 value);
25 27
26static noinline u32 kprobe_target(u32 value) 28static noinline u32 kprobe_target(u32 value)
27{ 29{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor); 30 return (value / div_factor);
41} 31}
42 32
@@ -74,7 +64,7 @@ static int test_kprobe(void)
74 return ret; 64 return ret;
75 } 65 }
76 66
77 ret = kprobe_target(rand1); 67 ret = target(rand1);
78 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
79 69
80 if (preh_val == 0) { 70 if (preh_val == 0) {
@@ -92,6 +82,84 @@ static int test_kprobe(void)
92 return 0; 82 return 0;
93} 83}
94 84
85static noinline u32 kprobe_target2(u32 value)
86{
87 return (value / div_factor) + 1;
88}
89
90static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs)
91{
92 preh_val = (rand1 / div_factor) + 1;
93 return 0;
94}
95
96static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
97 unsigned long flags)
98{
99 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: "
102 "incorrect value in post_handler2\n");
103 }
104 posth_val = preh_val + div_factor;
105}
106
107static struct kprobe kp2 = {
108 .symbol_name = "kprobe_target2",
109 .pre_handler = kp_pre_handler2,
110 .post_handler = kp_post_handler2
111};
112
113static int test_kprobes(void)
114{
115 int ret;
116 struct kprobe *kps[2] = {&kp, &kp2};
117
118 kp.addr = 0; /* addr should be cleared for reusing the kprobe. */
119 ret = register_kprobes(kps, 2);
120 if (ret < 0) {
121 printk(KERN_ERR "Kprobe smoke test failed: "
122 "register_kprobes returned %d\n", ret);
123 return ret;
124 }
125
126 preh_val = 0;
127 posth_val = 0;
128 ret = target(rand1);
129
130 if (preh_val == 0) {
131 printk(KERN_ERR "Kprobe smoke test failed: "
132 "kprobe pre_handler not called\n");
133 handler_errors++;
134 }
135
136 if (posth_val == 0) {
137 printk(KERN_ERR "Kprobe smoke test failed: "
138 "kprobe post_handler not called\n");
139 handler_errors++;
140 }
141
142 preh_val = 0;
143 posth_val = 0;
144 ret = target2(rand1);
145
146 if (preh_val == 0) {
147 printk(KERN_ERR "Kprobe smoke test failed: "
148 "kprobe pre_handler2 not called\n");
149 handler_errors++;
150 }
151
152 if (posth_val == 0) {
153 printk(KERN_ERR "Kprobe smoke test failed: "
154 "kprobe post_handler2 not called\n");
155 handler_errors++;
156 }
157
158 unregister_kprobes(kps, 2);
159 return 0;
160
161}
162
95static u32 j_kprobe_target(u32 value) 163static u32 j_kprobe_target(u32 value)
96{ 164{
97 if (value != rand1) { 165 if (value != rand1) {
@@ -121,7 +189,7 @@ static int test_jprobe(void)
121 return ret; 189 return ret;
122 } 190 }
123 191
124 ret = kprobe_target(rand1); 192 ret = target(rand1);
125 unregister_jprobe(&jp); 193 unregister_jprobe(&jp);
126 if (jph_val == 0) { 194 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: " 195 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -132,6 +200,43 @@ static int test_jprobe(void)
132 return 0; 200 return 0;
133} 201}
134 202
203static struct jprobe jp2 = {
204 .entry = j_kprobe_target,
205 .kp.symbol_name = "kprobe_target2"
206};
207
208static int test_jprobes(void)
209{
210 int ret;
211 struct jprobe *jps[2] = {&jp, &jp2};
212
213 jp.kp.addr = 0; /* addr should be cleared for reusing the kprobe. */
214 ret = register_jprobes(jps, 2);
215 if (ret < 0) {
216 printk(KERN_ERR "Kprobe smoke test failed: "
217 "register_jprobes returned %d\n", ret);
218 return ret;
219 }
220
221 jph_val = 0;
222 ret = target(rand1);
223 if (jph_val == 0) {
224 printk(KERN_ERR "Kprobe smoke test failed: "
225 "jprobe handler not called\n");
226 handler_errors++;
227 }
228
229 jph_val = 0;
230 ret = target2(rand1);
231 if (jph_val == 0) {
232 printk(KERN_ERR "Kprobe smoke test failed: "
233 "jprobe handler2 not called\n");
234 handler_errors++;
235 }
236 unregister_jprobes(jps, 2);
237
238 return 0;
239}
135#ifdef CONFIG_KRETPROBES 240#ifdef CONFIG_KRETPROBES
136static u32 krph_val; 241static u32 krph_val;
137 242
@@ -177,7 +282,7 @@ static int test_kretprobe(void)
177 return ret; 282 return ret;
178 } 283 }
179 284
180 ret = kprobe_target(rand1); 285 ret = target(rand1);
181 unregister_kretprobe(&rp); 286 unregister_kretprobe(&rp);
182 if (krph_val != rand1) { 287 if (krph_val != rand1) {
183 printk(KERN_ERR "Kprobe smoke test failed: " 288 printk(KERN_ERR "Kprobe smoke test failed: "
@@ -187,12 +292,72 @@ static int test_kretprobe(void)
187 292
188 return 0; 293 return 0;
189} 294}
295
296static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
297{
298 unsigned long ret = regs_return_value(regs);
299
300 if (ret != (rand1 / div_factor) + 1) {
301 handler_errors++;
302 printk(KERN_ERR "Kprobe smoke test failed: "
303 "incorrect value in kretprobe handler2\n");
304 }
305 if (krph_val == 0) {
306 handler_errors++;
307 printk(KERN_ERR "Kprobe smoke test failed: "
308 "call to kretprobe entry handler failed\n");
309 }
310
311 krph_val = rand1;
312 return 0;
313}
314
315static struct kretprobe rp2 = {
316 .handler = return_handler2,
317 .entry_handler = entry_handler,
318 .kp.symbol_name = "kprobe_target2"
319};
320
321static int test_kretprobes(void)
322{
323 int ret;
324 struct kretprobe *rps[2] = {&rp, &rp2};
325
326 rp.kp.addr = 0; /* addr should be cleared for reusing the kprobe. */
327 ret = register_kretprobes(rps, 2);
328 if (ret < 0) {
329 printk(KERN_ERR "Kprobe smoke test failed: "
330 "register_kretprobes returned %d\n", ret);
331 return ret;
332 }
333
334 krph_val = 0;
335 ret = target(rand1);
336 if (krph_val != rand1) {
337 printk(KERN_ERR "Kprobe smoke test failed: "
338 "kretprobe handler not called\n");
339 handler_errors++;
340 }
341
342 krph_val = 0;
343 ret = target2(rand1);
344 if (krph_val != rand1) {
345 printk(KERN_ERR "Kprobe smoke test failed: "
346 "kretprobe handler2 not called\n");
347 handler_errors++;
348 }
349 unregister_kretprobes(rps, 2);
350 return 0;
351}
190#endif /* CONFIG_KRETPROBES */ 352#endif /* CONFIG_KRETPROBES */
191 353
192int init_test_probes(void) 354int init_test_probes(void)
193{ 355{
194 int ret; 356 int ret;
195 357
358 target = kprobe_target;
359 target2 = kprobe_target2;
360
196 do { 361 do {
197 rand1 = random32(); 362 rand1 = random32();
198 } while (rand1 <= div_factor); 363 } while (rand1 <= div_factor);
@@ -204,15 +369,30 @@ int init_test_probes(void)
204 errors++; 369 errors++;
205 370
206 num_tests++; 371 num_tests++;
372 ret = test_kprobes();
373 if (ret < 0)
374 errors++;
375
376 num_tests++;
207 ret = test_jprobe(); 377 ret = test_jprobe();
208 if (ret < 0) 378 if (ret < 0)
209 errors++; 379 errors++;
210 380
381 num_tests++;
382 ret = test_jprobes();
383 if (ret < 0)
384 errors++;
385
211#ifdef CONFIG_KRETPROBES 386#ifdef CONFIG_KRETPROBES
212 num_tests++; 387 num_tests++;
213 ret = test_kretprobe(); 388 ret = test_kretprobe();
214 if (ret < 0) 389 if (ret < 0)
215 errors++; 390 errors++;
391
392 num_tests++;
393 ret = test_kretprobes();
394 if (ret < 0)
395 errors++;
216#endif /* CONFIG_KRETPROBES */ 396#endif /* CONFIG_KRETPROBES */
217 397
218 if (errors) 398 if (errors)
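
The new test_kprobes()/test_jprobes()/test_kretprobes() cases exercise the batch-registration API, where one call registers (or rolls back) a whole array of probes. A hedged sketch of how a module would use register_kprobes() in the same way; the probed symbols do_fork/do_exit are examples only, not part of the patch.

    #include <linux/module.h>
    #include <linux/kprobes.h>

    static int pre_handler(struct kprobe *p, struct pt_regs *regs)
    {
            printk(KERN_INFO "hit %s\n", p->symbol_name);
            return 0;
    }

    static struct kprobe kp_a = {
            .symbol_name = "do_fork",               /* example target */
            .pre_handler = pre_handler,
    };

    static struct kprobe kp_b = {
            .symbol_name = "do_exit",               /* example target */
            .pre_handler = pre_handler,
    };

    static struct kprobe *probes[] = { &kp_a, &kp_b };

    static int __init batch_init(void)
    {
            /* Registers both probes, or none if either registration fails. */
            return register_kprobes(probes, ARRAY_SIZE(probes));
    }

    static void __exit batch_exit(void)
    {
            unregister_kprobes(probes, ARRAY_SIZE(probes));
    }

    module_init(batch_init);
    module_exit(batch_exit);
    MODULE_LICENSE("GPL");
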
diff --git a/kernel/time.c b/kernel/time.c
index d63a4336fad6..29511943871a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -37,6 +37,7 @@
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/math64.h> 39#include <linux/math64.h>
40#include <linux/ptrace.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -59,14 +60,15 @@ EXPORT_SYMBOL(sys_tz);
59 * why not move it into the appropriate arch directory (for those 60 * why not move it into the appropriate arch directory (for those
60 * architectures that need it). 61 * architectures that need it).
61 */ 62 */
62asmlinkage long sys_time(time_t __user * tloc) 63SYSCALL_DEFINE1(time, time_t __user *, tloc)
63{ 64{
64 time_t i = get_seconds(); 65 time_t i = get_seconds();
65 66
66 if (tloc) { 67 if (tloc) {
67 if (put_user(i,tloc)) 68 if (put_user(i,tloc))
68 i = -EFAULT; 69 return -EFAULT;
69 } 70 }
71 force_successful_syscall_return();
70 return i; 72 return i;
71} 73}
72 74
@@ -77,7 +79,7 @@ asmlinkage long sys_time(time_t __user * tloc)
77 * architectures that need it). 79 * architectures that need it).
78 */ 80 */
79 81
80asmlinkage long sys_stime(time_t __user *tptr) 82SYSCALL_DEFINE1(stime, time_t __user *, tptr)
81{ 83{
82 struct timespec tv; 84 struct timespec tv;
83 int err; 85 int err;
@@ -97,8 +99,8 @@ asmlinkage long sys_stime(time_t __user *tptr)
97 99
98#endif /* __ARCH_WANT_SYS_TIME */ 100#endif /* __ARCH_WANT_SYS_TIME */
99 101
100asmlinkage long sys_gettimeofday(struct timeval __user *tv, 102SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
101 struct timezone __user *tz) 103 struct timezone __user *, tz)
102{ 104{
103 if (likely(tv != NULL)) { 105 if (likely(tv != NULL)) {
104 struct timeval ktv; 106 struct timeval ktv;
@@ -182,8 +184,8 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
182 return 0; 184 return 0;
183} 185}
184 186
185asmlinkage long sys_settimeofday(struct timeval __user *tv, 187SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
186 struct timezone __user *tz) 188 struct timezone __user *, tz)
187{ 189{
188 struct timeval user_tv; 190 struct timeval user_tv;
189 struct timespec new_ts; 191 struct timespec new_ts;
@@ -203,7 +205,7 @@ asmlinkage long sys_settimeofday(struct timeval __user *tv,
203 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); 205 return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
204} 206}
205 207
206asmlinkage long sys_adjtimex(struct timex __user *txc_p) 208SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
207{ 209{
208 struct timex txc; /* Local copy of parameter */ 210 struct timex txc; /* Local copy of parameter */
209 int ret; 211 int ret;
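
The sys_time() change above returns -EFAULT immediately when put_user() fails, instead of smuggling the error through the variable that normally carries the time value, and adds force_successful_syscall_return() so architectures that flag syscall errors out of band do not misinterpret a legitimate return value. A minimal sketch of that copy-out pattern; example_get_seconds is an illustrative name.

    #include <linux/uaccess.h>
    #include <linux/errno.h>
    #include <linux/time.h>

    static long example_get_seconds(time_t __user *tloc)
    {
            time_t now = get_seconds();

            /* Bail out as soon as the user pointer turns out to be bad. */
            if (tloc && put_user(now, tloc))
                    return -EFAULT;

            return now;
    }
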
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f8d968063cea..ea2f48af83cf 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
166void clockevents_register_device(struct clock_event_device *dev) 166void clockevents_register_device(struct clock_event_device *dev)
167{ 167{
168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 168 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
169 BUG_ON(!dev->cpumask);
170
169 /* 171 /*
170 * A nsec2cyc multiplicator of 0 is invalid and we'd crash 172 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
171 * on it, so fix it up and emit a warning: 173 * on it, so fix it up and emit a warning:
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9ed2eec97526..ca89e1593f08 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,10 +145,11 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = cpumask_next(raw_smp_processor_id(),
149 cpu_online_mask);
149 150
150 if (next_cpu >= nr_cpu_ids) 151 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 152 next_cpu = cpumask_first(cpu_online_mask);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 153 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 154 add_timer_on(&watchdog_timer, next_cpu);
154 } 155 }
@@ -173,7 +174,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
173 watchdog_last = watchdog->read(); 174 watchdog_last = watchdog->read();
174 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 175 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
175 add_timer_on(&watchdog_timer, 176 add_timer_on(&watchdog_timer,
176 first_cpu(cpu_online_map)); 177 cpumask_first(cpu_online_mask));
177 } 178 }
178 } else { 179 } else {
179 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 180 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -195,7 +196,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
195 watchdog_timer.expires = 196 watchdog_timer.expires =
196 jiffies + WATCHDOG_INTERVAL; 197 jiffies + WATCHDOG_INTERVAL;
197 add_timer_on(&watchdog_timer, 198 add_timer_on(&watchdog_timer,
198 first_cpu(cpu_online_map)); 199 cpumask_first(cpu_online_mask));
199 } 200 }
200 } 201 }
201 } 202 }
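
The clocksource watchdog now walks CPUs with cpumask_next()/cpumask_first() instead of the old next_cpu_nr()/first_cpu() helpers. A small sketch of that round-robin idiom in isolation; the helper name is made up.

    #include <linux/cpumask.h>

    static int next_watchdog_cpu(int cur)
    {
            /* Advance to the next online CPU... */
            int next = cpumask_next(cur, cpu_online_mask);

            /* ...and wrap around when cpumask_next() runs off the end. */
            if (next >= nr_cpu_ids)
                    next = cpumask_first(cpu_online_mask);

            return next;
    }
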
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 1ca99557e929..06f197560f3b 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -45,7 +45,7 @@
45 * 45 *
46 * The value 8 is somewhat carefully chosen, as anything 46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as 47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater then 8 overflow 32bits when 48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100. 49 * HZ=100.
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f98a1b7b16e9..118a3b3b3f9a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,7 +28,9 @@
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS);
32static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force; 35static int tick_broadcast_force;
34 36
@@ -46,9 +48,9 @@ struct tick_device *tick_get_broadcast_device(void)
46 return &tick_broadcast_device; 48 return &tick_broadcast_device;
47} 49}
48 50
49cpumask_t *tick_get_broadcast_mask(void) 51struct cpumask *tick_get_broadcast_mask(void)
50{ 52{
51 return &tick_broadcast_mask; 53 return to_cpumask(tick_broadcast_mask);
52} 54}
53 55
54/* 56/*
@@ -72,7 +74,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
72 74
73 clockevents_exchange_device(NULL, dev); 75 clockevents_exchange_device(NULL, dev);
74 tick_broadcast_device.evtdev = dev; 76 tick_broadcast_device.evtdev = dev;
75 if (!cpus_empty(tick_broadcast_mask)) 77 if (!cpumask_empty(tick_get_broadcast_mask()))
76 tick_broadcast_start_periodic(dev); 78 tick_broadcast_start_periodic(dev);
77 return 1; 79 return 1;
78} 80}
@@ -104,7 +106,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
104 */ 106 */
105 if (!tick_device_is_functional(dev)) { 107 if (!tick_device_is_functional(dev)) {
106 dev->event_handler = tick_handle_periodic; 108 dev->event_handler = tick_handle_periodic;
107 cpu_set(cpu, tick_broadcast_mask); 109 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
108 tick_broadcast_start_periodic(tick_broadcast_device.evtdev); 110 tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
109 ret = 1; 111 ret = 1;
110 } else { 112 } else {
@@ -116,7 +118,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
116 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) { 118 if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
117 int cpu = smp_processor_id(); 119 int cpu = smp_processor_id();
118 120
119 cpu_clear(cpu, tick_broadcast_mask); 121 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
120 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
121 } 123 }
122 } 124 }
@@ -125,9 +127,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
125} 127}
126 128
127/* 129/*
128 * Broadcast the event to the cpus, which are set in the mask 130 * Broadcast the event to the cpus, which are set in the mask (mangled).
129 */ 131 */
130static void tick_do_broadcast(cpumask_t mask) 132static void tick_do_broadcast(struct cpumask *mask)
131{ 133{
132 int cpu = smp_processor_id(); 134 int cpu = smp_processor_id();
133 struct tick_device *td; 135 struct tick_device *td;
@@ -135,21 +137,20 @@ static void tick_do_broadcast(cpumask_t mask)
135 /* 137 /*
136 * Check, if the current cpu is in the mask 138 * Check, if the current cpu is in the mask
137 */ 139 */
138 if (cpu_isset(cpu, mask)) { 140 if (cpumask_test_cpu(cpu, mask)) {
139 cpu_clear(cpu, mask); 141 cpumask_clear_cpu(cpu, mask);
140 td = &per_cpu(tick_cpu_device, cpu); 142 td = &per_cpu(tick_cpu_device, cpu);
141 td->evtdev->event_handler(td->evtdev); 143 td->evtdev->event_handler(td->evtdev);
142 } 144 }
143 145
144 if (!cpus_empty(mask)) { 146 if (!cpumask_empty(mask)) {
145 /* 147 /*
146 * It might be necessary to actually check whether the devices 148 * It might be necessary to actually check whether the devices
147 * have different broadcast functions. For now, just use the 149 * have different broadcast functions. For now, just use the
148 * one of the first device. This works as long as we have this 150 * one of the first device. This works as long as we have this
149 * misfeature only on x86 (lapic) 151 * misfeature only on x86 (lapic)
150 */ 152 */
151 cpu = first_cpu(mask); 153 td = &per_cpu(tick_cpu_device, cpumask_first(mask));
152 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 154 td->evtdev->broadcast(mask);
154 } 155 }
155} 156}
@@ -160,12 +161,11 @@ static void tick_do_broadcast(cpumask_t mask)
160 */ 161 */
161static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
162{ 163{
163 cpumask_t mask;
164
165 spin_lock(&tick_broadcast_lock); 164 spin_lock(&tick_broadcast_lock);
166 165
167 cpus_and(mask, cpu_online_map, tick_broadcast_mask); 166 cpumask_and(to_cpumask(tmpmask),
168 tick_do_broadcast(mask); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 spin_unlock(&tick_broadcast_lock);
171} 171}
@@ -228,13 +228,13 @@ static void tick_do_broadcast_on_off(void *why)
228 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
229 goto out; 229 goto out;
230 230
231 bc_stopped = cpus_empty(tick_broadcast_mask); 231 bc_stopped = cpumask_empty(tick_get_broadcast_mask());
232 232
233 switch (*reason) { 233 switch (*reason) {
234 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
236 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
237 cpu_set(cpu, tick_broadcast_mask); 237 cpumask_set_cpu(cpu, tick_get_broadcast_mask());
238 if (tick_broadcast_device.mode == 238 if (tick_broadcast_device.mode ==
239 TICKDEV_MODE_PERIODIC) 239 TICKDEV_MODE_PERIODIC)
240 clockevents_shutdown(dev); 240 clockevents_shutdown(dev);
@@ -244,8 +244,8 @@ static void tick_do_broadcast_on_off(void *why)
244 break; 244 break;
245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
246 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
248 cpu_clear(cpu, tick_broadcast_mask); 248 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
249 if (tick_broadcast_device.mode == 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC) 250 TICKDEV_MODE_PERIODIC)
251 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
@@ -253,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
253 break; 253 break;
254 } 254 }
255 255
256 if (cpus_empty(tick_broadcast_mask)) { 256 if (cpumask_empty(tick_get_broadcast_mask())) {
257 if (!bc_stopped) 257 if (!bc_stopped)
258 clockevents_shutdown(bc); 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) { 259 } else if (bc_stopped) {
@@ -272,7 +272,7 @@ out:
272 */ 272 */
273void tick_broadcast_on_off(unsigned long reason, int *oncpu) 273void tick_broadcast_on_off(unsigned long reason, int *oncpu)
274{ 274{
275 if (!cpu_isset(*oncpu, cpu_online_map)) 275 if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
@@ -303,10 +303,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
303 spin_lock_irqsave(&tick_broadcast_lock, flags); 303 spin_lock_irqsave(&tick_broadcast_lock, flags);
304 304
305 bc = tick_broadcast_device.evtdev; 305 bc = tick_broadcast_device.evtdev;
306 cpu_clear(cpu, tick_broadcast_mask); 306 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
307 307
308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
309 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpumask_empty(tick_get_broadcast_mask()))
310 clockevents_shutdown(bc); 310 clockevents_shutdown(bc);
311 } 311 }
312 312
@@ -342,10 +342,10 @@ int tick_resume_broadcast(void)
342 342
343 switch (tick_broadcast_device.mode) { 343 switch (tick_broadcast_device.mode) {
344 case TICKDEV_MODE_PERIODIC: 344 case TICKDEV_MODE_PERIODIC:
345 if(!cpus_empty(tick_broadcast_mask)) 345 if (!cpumask_empty(tick_get_broadcast_mask()))
346 tick_broadcast_start_periodic(bc); 346 tick_broadcast_start_periodic(bc);
347 broadcast = cpu_isset(smp_processor_id(), 347 broadcast = cpumask_test_cpu(smp_processor_id(),
348 tick_broadcast_mask); 348 tick_get_broadcast_mask());
349 break; 349 break;
350 case TICKDEV_MODE_ONESHOT: 350 case TICKDEV_MODE_ONESHOT:
351 broadcast = tick_resume_broadcast_oneshot(bc); 351 broadcast = tick_resume_broadcast_oneshot(bc);
@@ -360,14 +360,15 @@ int tick_resume_broadcast(void)
360 360
361#ifdef CONFIG_TICK_ONESHOT 361#ifdef CONFIG_TICK_ONESHOT
362 362
363static cpumask_t tick_broadcast_oneshot_mask; 363/* FIXME: use cpumask_var_t. */
364static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
364 365
365/* 366/*
366 * Debugging: see timer_list.c 367 * Exposed for debugging: see timer_list.c
367 */ 368 */
368cpumask_t *tick_get_broadcast_oneshot_mask(void) 369struct cpumask *tick_get_broadcast_oneshot_mask(void)
369{ 370{
370 return &tick_broadcast_oneshot_mask; 371 return to_cpumask(tick_broadcast_oneshot_mask);
371} 372}
372 373
373static int tick_broadcast_set_event(ktime_t expires, int force) 374static int tick_broadcast_set_event(ktime_t expires, int force)
@@ -389,7 +390,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
389 */ 390 */
390void tick_check_oneshot_broadcast(int cpu) 391void tick_check_oneshot_broadcast(int cpu)
391{ 392{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 393 if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu); 394 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394 395
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); 396 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -402,7 +403,6 @@ void tick_check_oneshot_broadcast(int cpu)
402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 403static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
403{ 404{
404 struct tick_device *td; 405 struct tick_device *td;
405 cpumask_t mask;
406 ktime_t now, next_event; 406 ktime_t now, next_event;
407 int cpu; 407 int cpu;
408 408
@@ -410,13 +410,13 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
410again: 410again:
411 dev->next_event.tv64 = KTIME_MAX; 411 dev->next_event.tv64 = KTIME_MAX;
412 next_event.tv64 = KTIME_MAX; 412 next_event.tv64 = KTIME_MAX;
413 mask = CPU_MASK_NONE; 413 cpumask_clear(to_cpumask(tmpmask));
414 now = ktime_get(); 414 now = ktime_get();
415 /* Find all expired events */ 415 /* Find all expired events */
416 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) { 416 for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
417 td = &per_cpu(tick_cpu_device, cpu); 417 td = &per_cpu(tick_cpu_device, cpu);
418 if (td->evtdev->next_event.tv64 <= now.tv64) 418 if (td->evtdev->next_event.tv64 <= now.tv64)
419 cpu_set(cpu, mask); 419 cpumask_set_cpu(cpu, to_cpumask(tmpmask));
420 else if (td->evtdev->next_event.tv64 < next_event.tv64) 420 else if (td->evtdev->next_event.tv64 < next_event.tv64)
421 next_event.tv64 = td->evtdev->next_event.tv64; 421 next_event.tv64 = td->evtdev->next_event.tv64;
422 } 422 }
@@ -424,7 +424,7 @@ again:
424 /* 424 /*
425 * Wakeup the cpus which have an expired event. 425 * Wakeup the cpus which have an expired event.
426 */ 426 */
427 tick_do_broadcast(mask); 427 tick_do_broadcast(to_cpumask(tmpmask));
428 428
429 /* 429 /*
430 * Two reasons for reprogram: 430 * Two reasons for reprogram:
@@ -476,15 +476,16 @@ void tick_broadcast_oneshot_control(unsigned long reason)
476 goto out; 476 goto out;
477 477
478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { 478 if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
479 if (!cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 479 if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
480 cpu_set(cpu, tick_broadcast_oneshot_mask); 480 cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); 481 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
482 if (dev->next_event.tv64 < bc->next_event.tv64) 482 if (dev->next_event.tv64 < bc->next_event.tv64)
483 tick_broadcast_set_event(dev->next_event, 1); 483 tick_broadcast_set_event(dev->next_event, 1);
484 } 484 }
485 } else { 485 } else {
486 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { 486 if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
487 cpu_clear(cpu, tick_broadcast_oneshot_mask); 487 cpumask_clear_cpu(cpu,
488 tick_get_broadcast_oneshot_mask());
488 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); 489 clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
489 if (dev->next_event.tv64 != KTIME_MAX) 490 if (dev->next_event.tv64 != KTIME_MAX)
490 tick_program_event(dev->next_event, 1); 491 tick_program_event(dev->next_event, 1);
@@ -502,15 +503,16 @@ out:
502 */ 503 */
503static void tick_broadcast_clear_oneshot(int cpu) 504static void tick_broadcast_clear_oneshot(int cpu)
504{ 505{
505 cpu_clear(cpu, tick_broadcast_oneshot_mask); 506 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
506} 507}
507 508
508static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) 509static void tick_broadcast_init_next_event(struct cpumask *mask,
510 ktime_t expires)
509{ 511{
510 struct tick_device *td; 512 struct tick_device *td;
511 int cpu; 513 int cpu;
512 514
513 for_each_cpu_mask_nr(cpu, *mask) { 515 for_each_cpu(cpu, mask) {
514 td = &per_cpu(tick_cpu_device, cpu); 516 td = &per_cpu(tick_cpu_device, cpu);
515 if (td->evtdev) 517 if (td->evtdev)
516 td->evtdev->next_event = expires; 518 td->evtdev->next_event = expires;
@@ -526,7 +528,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
526 if (bc->event_handler != tick_handle_oneshot_broadcast) { 528 if (bc->event_handler != tick_handle_oneshot_broadcast) {
527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; 529 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id(); 530 int cpu = smp_processor_id();
529 cpumask_t mask;
530 531
531 bc->event_handler = tick_handle_oneshot_broadcast; 532 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 533 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -540,13 +541,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
540 * oneshot_mask bits for those and program the 541 * oneshot_mask bits for those and program the
541 * broadcast device to fire. 542 * broadcast device to fire.
542 */ 543 */
543 mask = tick_broadcast_mask; 544 cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
544 cpu_clear(cpu, mask); 545 cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
545 cpus_or(tick_broadcast_oneshot_mask, 546 cpumask_or(tick_get_broadcast_oneshot_mask(),
546 tick_broadcast_oneshot_mask, mask); 547 tick_get_broadcast_oneshot_mask(),
547 548 to_cpumask(tmpmask));
548 if (was_periodic && !cpus_empty(mask)) { 549
549 tick_broadcast_init_next_event(&mask, tick_next_period); 550 if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
551 tick_broadcast_init_next_event(to_cpumask(tmpmask),
552 tick_next_period);
550 tick_broadcast_set_event(tick_next_period, 1); 553 tick_broadcast_set_event(tick_next_period, 1);
551 } else 554 } else
552 bc->next_event.tv64 = KTIME_MAX; 555 bc->next_event.tv64 = KTIME_MAX;
@@ -585,7 +588,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
585 * Clear the broadcast mask flag for the dead cpu, but do not 588 * Clear the broadcast mask flag for the dead cpu, but do not
586 * stop the broadcast device! 589 * stop the broadcast device!
587 */ 590 */
588 cpu_clear(cpu, tick_broadcast_oneshot_mask); 591 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
589 592
590 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 593 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
591} 594}
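
As the FIXME notes, the broadcast masks become static NR_CPUS-sized bitmaps wrapped with to_cpumask() as an interim step toward cpumask_var_t, so all callers already go through the new cpumask_* API. A small sketch of that storage/accessor pattern; the example_* names are illustrative.

    #include <linux/cpumask.h>

    static DECLARE_BITMAP(example_mask_bits, NR_CPUS);

    static struct cpumask *example_mask(void)
    {
            return to_cpumask(example_mask_bits);
    }

    static void example_mark_cpu(int cpu)
    {
            cpumask_set_cpu(cpu, example_mask());
    }

    static int example_cpu_marked(int cpu)
    {
            return cpumask_test_cpu(cpu, example_mask());
    }
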
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43ca..21a5ca849514 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
136 */ 136 */
137static void tick_setup_device(struct tick_device *td, 137static void tick_setup_device(struct tick_device *td,
138 struct clock_event_device *newdev, int cpu, 138 struct clock_event_device *newdev, int cpu,
139 const cpumask_t *cpumask) 139 const struct cpumask *cpumask)
140{ 140{
141 ktime_t next_event; 141 ktime_t next_event;
142 void (*handler)(struct clock_event_device *) = NULL; 142 void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
171 * When the device is not per cpu, pin the interrupt to the 171 * When the device is not per cpu, pin the interrupt to the
172 * current cpu: 172 * current cpu:
173 */ 173 */
174 if (!cpus_equal(newdev->cpumask, *cpumask)) 174 if (!cpumask_equal(newdev->cpumask, cpumask))
175 irq_set_affinity(newdev->irq, *cpumask); 175 irq_set_affinity(newdev->irq, cpumask);
176 176
177 /* 177 /*
178 * When global broadcasting is active, check if the current 178 * When global broadcasting is active, check if the current
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
202 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
203 203
204 cpu = smp_processor_id(); 204 cpu = smp_processor_id();
205 if (!cpu_isset(cpu, newdev->cpumask)) 205 if (!cpumask_test_cpu(cpu, newdev->cpumask))
206 goto out_bc; 206 goto out_bc;
207 207
208 td = &per_cpu(tick_cpu_device, cpu); 208 td = &per_cpu(tick_cpu_device, cpu);
209 curdev = td->evtdev; 209 curdev = td->evtdev;
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) { 212 if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu))) 225 if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -254,7 +254,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu)); 257 tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -274,6 +274,21 @@ out_bc:
274} 274}
275 275
276/* 276/*
277 * Transfer the do_timer job away from a dying cpu.
278 *
279 * Called with interrupts disabled.
280 */
281static void tick_handover_do_timer(int *cpup)
282{
283 if (*cpup == tick_do_timer_cpu) {
284 int cpu = cpumask_first(cpu_online_mask);
285
286 tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
287 TICK_DO_TIMER_NONE;
288 }
289}
290
291/*
277 * Shutdown an event device on a given cpu: 292 * Shutdown an event device on a given cpu:
278 * 293 *
279 * This is called on a life CPU, when a CPU is dead. So we cannot 294 * This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
297 clockevents_exchange_device(dev, NULL); 312 clockevents_exchange_device(dev, NULL);
298 td->evtdev = NULL; 313 td->evtdev = NULL;
299 } 314 }
300 /* Transfer the do_timer job away from this cpu */
301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map);
303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
306 }
307 spin_unlock_irqrestore(&tick_device_lock, flags); 315 spin_unlock_irqrestore(&tick_device_lock, flags);
308} 316}
309 317
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
357 tick_broadcast_oneshot_control(reason); 365 tick_broadcast_oneshot_control(reason);
358 break; 366 break;
359 367
368 case CLOCK_EVT_NOTIFY_CPU_DYING:
369 tick_handover_do_timer(dev);
370 break;
371
360 case CLOCK_EVT_NOTIFY_CPU_DEAD: 372 case CLOCK_EVT_NOTIFY_CPU_DEAD:
361 tick_shutdown_broadcast_oneshot(dev); 373 tick_shutdown_broadcast_oneshot(dev);
362 tick_shutdown_broadcast(dev); 374 tick_shutdown_broadcast(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 8f3fc2582d38..d3f1ef4d5cbe 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(void)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -301,7 +301,7 @@ void tick_nohz_stop_sched_tick(int inidle)
301 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 301 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
302 302
303 if (delta_jiffies > 1) 303 if (delta_jiffies > 1)
304 cpu_set(cpu, nohz_cpu_mask); 304 cpumask_set_cpu(cpu, nohz_cpu_mask);
305 305
306 /* Skip reprogram of event if its not changed */ 306 /* Skip reprogram of event if its not changed */
307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 307 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
@@ -319,7 +319,7 @@ void tick_nohz_stop_sched_tick(int inidle)
319 /* 319 /*
320 * sched tick not stopped! 320 * sched tick not stopped!
321 */ 321 */
322 cpu_clear(cpu, nohz_cpu_mask); 322 cpumask_clear_cpu(cpu, nohz_cpu_mask);
323 goto out; 323 goto out;
324 } 324 }
325 325
@@ -361,7 +361,7 @@ void tick_nohz_stop_sched_tick(int inidle)
361 * softirq. 361 * softirq.
362 */ 362 */
363 tick_do_update_jiffies64(ktime_get()); 363 tick_do_update_jiffies64(ktime_get());
364 cpu_clear(cpu, nohz_cpu_mask); 364 cpumask_clear_cpu(cpu, nohz_cpu_mask);
365 } 365 }
366 raise_softirq_irqoff(TIMER_SOFTIRQ); 366 raise_softirq_irqoff(TIMER_SOFTIRQ);
367out: 367out:
@@ -419,7 +419,9 @@ void tick_nohz_restart_sched_tick(void)
419{ 419{
420 int cpu = smp_processor_id(); 420 int cpu = smp_processor_id();
421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 421 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
422#ifndef CONFIG_VIRT_CPU_ACCOUNTING
422 unsigned long ticks; 423 unsigned long ticks;
424#endif
423 ktime_t now; 425 ktime_t now;
424 426
425 local_irq_disable(); 427 local_irq_disable();
@@ -439,8 +441,9 @@ void tick_nohz_restart_sched_tick(void)
439 select_nohz_load_balancer(0); 441 select_nohz_load_balancer(0);
440 now = ktime_get(); 442 now = ktime_get();
441 tick_do_update_jiffies64(now); 443 tick_do_update_jiffies64(now);
442 cpu_clear(cpu, nohz_cpu_mask); 444 cpumask_clear_cpu(cpu, nohz_cpu_mask);
443 445
446#ifndef CONFIG_VIRT_CPU_ACCOUNTING
444 /* 447 /*
445 * We stopped the tick in idle. Update process times would miss the 448 * We stopped the tick in idle. Update process times would miss the
446 * time we slept as update_process_times does only a 1 tick 449 * time we slept as update_process_times does only a 1 tick
@@ -450,12 +453,9 @@ void tick_nohz_restart_sched_tick(void)
450 /* 453 /*
451 * We might be one off. Do not randomly account a huge number of ticks! 454 * We might be one off. Do not randomly account a huge number of ticks!
452 */ 455 */
453 if (ticks && ticks < LONG_MAX) { 456 if (ticks && ticks < LONG_MAX)
454 add_preempt_count(HARDIRQ_OFFSET); 457 account_idle_ticks(ticks);
455 account_system_time(current, HARDIRQ_OFFSET, 458#endif
456 jiffies_to_cputime(ticks));
457 sub_preempt_count(HARDIRQ_OFFSET);
458 }
459 459
460 touch_softlockup_watchdog(); 460 touch_softlockup_watchdog();
461 /* 461 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fa05e88aa76f..900f1b6598d1 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -46,6 +46,9 @@ struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 46struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
48 48
49/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended;
51
49static struct timespec xtime_cache __attribute__ ((aligned (16))); 52static struct timespec xtime_cache __attribute__ ((aligned (16)));
50void update_xtime_cache(u64 nsec) 53void update_xtime_cache(u64 nsec)
51{ 54{
@@ -92,6 +95,8 @@ void getnstimeofday(struct timespec *ts)
92 unsigned long seq; 95 unsigned long seq;
93 s64 nsecs; 96 s64 nsecs;
94 97
98 WARN_ON(timekeeping_suspended);
99
95 do { 100 do {
96 seq = read_seqbegin(&xtime_lock); 101 seq = read_seqbegin(&xtime_lock);
97 102
@@ -299,8 +304,6 @@ void __init timekeeping_init(void)
299 write_sequnlock_irqrestore(&xtime_lock, flags); 304 write_sequnlock_irqrestore(&xtime_lock, flags);
300} 305}
301 306
302/* flag for if timekeeping is suspended */
303static int timekeeping_suspended;
304/* time in seconds when suspend began */ 307/* time in seconds when suspend began */
305static unsigned long timekeeping_suspend_time; 308static unsigned long timekeeping_suspend_time;
306 309
diff --git a/kernel/timer.c b/kernel/timer.c
index 566257d1dc10..13dd64fe143d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1018,21 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1018} 1018}
1019#endif 1019#endif
1020 1020
1021#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1022void account_process_tick(struct task_struct *p, int user_tick)
1023{
1024 cputime_t one_jiffy = jiffies_to_cputime(1);
1025
1026 if (user_tick) {
1027 account_user_time(p, one_jiffy);
1028 account_user_time_scaled(p, cputime_to_scaled(one_jiffy));
1029 } else {
1030 account_system_time(p, HARDIRQ_OFFSET, one_jiffy);
1031 account_system_time_scaled(p, cputime_to_scaled(one_jiffy));
1032 }
1033}
1034#endif
1035
1036/* 1021/*
1037 * Called from the timer interrupt handler to charge one tick to the current 1022 * Called from the timer interrupt handler to charge one tick to the current
1038 * process. user_tick is 1 if the tick is user time, 0 for system. 1023 * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1144,7 +1129,7 @@ void do_timer(unsigned long ticks)
1144 * For backwards compatibility? This can be done in libc so Alpha 1129 * For backwards compatibility? This can be done in libc so Alpha
1145 * and all newer ports shouldn't need it. 1130 * and all newer ports shouldn't need it.
1146 */ 1131 */
1147asmlinkage unsigned long sys_alarm(unsigned int seconds) 1132SYSCALL_DEFINE1(alarm, unsigned int, seconds)
1148{ 1133{
1149 return alarm_setitimer(seconds); 1134 return alarm_setitimer(seconds);
1150} 1135}
@@ -1167,7 +1152,7 @@ asmlinkage unsigned long sys_alarm(unsigned int seconds)
1167 * 1152 *
1168 * This is SMP safe as current->tgid does not change. 1153 * This is SMP safe as current->tgid does not change.
1169 */ 1154 */
1170asmlinkage long sys_getpid(void) 1155SYSCALL_DEFINE0(getpid)
1171{ 1156{
1172 return task_tgid_vnr(current); 1157 return task_tgid_vnr(current);
1173} 1158}
@@ -1178,7 +1163,7 @@ asmlinkage long sys_getpid(void)
1178 * value of ->real_parent under rcu_read_lock(), see 1163 * value of ->real_parent under rcu_read_lock(), see
1179 * release_task()->call_rcu(delayed_put_task_struct). 1164 * release_task()->call_rcu(delayed_put_task_struct).
1180 */ 1165 */
1181asmlinkage long sys_getppid(void) 1166SYSCALL_DEFINE0(getppid)
1182{ 1167{
1183 int pid; 1168 int pid;
1184 1169
@@ -1189,25 +1174,25 @@ asmlinkage long sys_getppid(void)
1189 return pid; 1174 return pid;
1190} 1175}
1191 1176
1192asmlinkage long sys_getuid(void) 1177SYSCALL_DEFINE0(getuid)
1193{ 1178{
1194 /* Only we change this so SMP safe */ 1179 /* Only we change this so SMP safe */
1195 return current_uid(); 1180 return current_uid();
1196} 1181}
1197 1182
1198asmlinkage long sys_geteuid(void) 1183SYSCALL_DEFINE0(geteuid)
1199{ 1184{
1200 /* Only we change this so SMP safe */ 1185 /* Only we change this so SMP safe */
1201 return current_euid(); 1186 return current_euid();
1202} 1187}
1203 1188
1204asmlinkage long sys_getgid(void) 1189SYSCALL_DEFINE0(getgid)
1205{ 1190{
1206 /* Only we change this so SMP safe */ 1191 /* Only we change this so SMP safe */
1207 return current_gid(); 1192 return current_gid();
1208} 1193}
1209 1194
1210asmlinkage long sys_getegid(void) 1195SYSCALL_DEFINE0(getegid)
1211{ 1196{
1212 /* Only we change this so SMP safe */ 1197 /* Only we change this so SMP safe */
1213 return current_egid(); 1198 return current_egid();
@@ -1323,7 +1308,7 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1323EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1308EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1324 1309
1325/* Thread ID - the internal kernel "pid" */ 1310/* Thread ID - the internal kernel "pid" */
1326asmlinkage long sys_gettid(void) 1311SYSCALL_DEFINE0(gettid)
1327{ 1312{
1328 return task_pid_vnr(current); 1313 return task_pid_vnr(current);
1329} 1314}
@@ -1415,7 +1400,7 @@ out:
1415 return 0; 1400 return 0;
1416} 1401}
1417 1402
1418asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1403SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
1419{ 1404{
1420 struct sysinfo val; 1405 struct sysinfo val;
1421 1406
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..9a236ffe2aa4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
17#include <linux/clocksource.h> 17#include <linux/clocksource.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/suspend.h>
20#include <linux/debugfs.h> 21#include <linux/debugfs.h>
21#include <linux/hardirq.h> 22#include <linux/hardirq.h>
22#include <linux/kthread.h> 23#include <linux/kthread.h>
@@ -1736,9 +1737,12 @@ static void clear_ftrace_pid(struct pid *pid)
1736{ 1737{
1737 struct task_struct *p; 1738 struct task_struct *p;
1738 1739
1740 rcu_read_lock();
1739 do_each_pid_task(pid, PIDTYPE_PID, p) { 1741 do_each_pid_task(pid, PIDTYPE_PID, p) {
1740 clear_tsk_trace_trace(p); 1742 clear_tsk_trace_trace(p);
1741 } while_each_pid_task(pid, PIDTYPE_PID, p); 1743 } while_each_pid_task(pid, PIDTYPE_PID, p);
1744 rcu_read_unlock();
1745
1742 put_pid(pid); 1746 put_pid(pid);
1743} 1747}
1744 1748
@@ -1746,9 +1750,11 @@ static void set_ftrace_pid(struct pid *pid)
1746{ 1750{
1747 struct task_struct *p; 1751 struct task_struct *p;
1748 1752
1753 rcu_read_lock();
1749 do_each_pid_task(pid, PIDTYPE_PID, p) { 1754 do_each_pid_task(pid, PIDTYPE_PID, p) {
1750 set_tsk_trace_trace(p); 1755 set_tsk_trace_trace(p);
1751 } while_each_pid_task(pid, PIDTYPE_PID, p); 1756 } while_each_pid_task(pid, PIDTYPE_PID, p);
1757 rcu_read_unlock();
1752} 1758}
1753 1759
1754static void clear_ftrace_pid_task(struct pid **pid) 1760static void clear_ftrace_pid_task(struct pid **pid)
@@ -1965,6 +1971,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1965#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1971#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1966 1972
1967static atomic_t ftrace_graph_active; 1973static atomic_t ftrace_graph_active;
1974static struct notifier_block ftrace_suspend_notifier;
1968 1975
1969int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 1976int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1970{ 1977{
@@ -2043,6 +2050,27 @@ static int start_graph_tracing(void)
2043 return ret; 2050 return ret;
2044} 2051}
2045 2052
2053/*
2054 * Hibernation protection.
2055 * The state of the current task is too much unstable during
2056 * suspend/restore to disk. We want to protect against that.
2057 */
2058static int
2059ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
2060 void *unused)
2061{
2062 switch (state) {
2063 case PM_HIBERNATION_PREPARE:
2064 pause_graph_tracing();
2065 break;
2066
2067 case PM_POST_HIBERNATION:
2068 unpause_graph_tracing();
2069 break;
2070 }
2071 return NOTIFY_DONE;
2072}
2073
2046int register_ftrace_graph(trace_func_graph_ret_t retfunc, 2074int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2047 trace_func_graph_ent_t entryfunc) 2075 trace_func_graph_ent_t entryfunc)
2048{ 2076{
@@ -2050,6 +2078,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2050 2078
2051 mutex_lock(&ftrace_sysctl_lock); 2079 mutex_lock(&ftrace_sysctl_lock);
2052 2080
2081 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2082 register_pm_notifier(&ftrace_suspend_notifier);
2083
2053 atomic_inc(&ftrace_graph_active); 2084 atomic_inc(&ftrace_graph_active);
2054 ret = start_graph_tracing(); 2085 ret = start_graph_tracing();
2055 if (ret) { 2086 if (ret) {
@@ -2075,6 +2106,7 @@ void unregister_ftrace_graph(void)
2075 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 2106 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2076 ftrace_graph_entry = ftrace_graph_entry_stub; 2107 ftrace_graph_entry = ftrace_graph_entry_stub;
2077 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 2108 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2109 unregister_pm_notifier(&ftrace_suspend_notifier);
2078 2110
2079 mutex_unlock(&ftrace_sysctl_lock); 2111 mutex_unlock(&ftrace_sysctl_lock);
2080} 2112}
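
The ftrace.c hunks do two things: the set/clear_ftrace_pid loops gain rcu_read_lock() protection around the pid-to-task iteration, and graph tracing registers a PM notifier so it is paused over PM_HIBERNATION_PREPARE and resumed on PM_POST_HIBERNATION. The following runnable userspace sketch models only the notifier pattern (register a callback on a chain, dispatch it with an event code); every name and event value in it is invented for the demo, the kernel's real interface being register_pm_notifier()/unregister_pm_notifier().

/*
 * Userspace sketch of the notifier pattern used above: callbacks are
 * registered on a list and invoked with an event code.  All names and
 * event values here are invented for the demo.
 */
#include <stdio.h>

enum demo_pm_event { DEMO_HIBERNATION_PREPARE, DEMO_POST_HIBERNATION };

struct demo_notifier {
	int (*call)(struct demo_notifier *nb, enum demo_pm_event event);
	struct demo_notifier *next;
};

static struct demo_notifier *chain;

static void demo_register_notifier(struct demo_notifier *nb)
{
	nb->next = chain;
	chain = nb;
}

static void demo_notify(enum demo_pm_event event)
{
	for (struct demo_notifier *nb = chain; nb; nb = nb->next)
		nb->call(nb, event);
}

static int trace_suspend_call(struct demo_notifier *nb, enum demo_pm_event event)
{
	(void)nb;
	switch (event) {
	case DEMO_HIBERNATION_PREPARE:
		printf("pause graph tracing\n");
		break;
	case DEMO_POST_HIBERNATION:
		printf("unpause graph tracing\n");
		break;
	}
	return 0;
}

int main(void)
{
	struct demo_notifier nb = { .call = trace_suspend_call };

	demo_register_notifier(&nb);
	demo_notify(DEMO_HIBERNATION_PREPARE);   /* image is being written */
	demo_notify(DEMO_POST_HIBERNATION);      /* resume path */
	return 0;
}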
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d601a7c4587..bd38c5cfd8ad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event)
168 */ 168 */
169unsigned ring_buffer_event_length(struct ring_buffer_event *event) 169unsigned ring_buffer_event_length(struct ring_buffer_event *event)
170{ 170{
171 return rb_event_length(event); 171 unsigned length = rb_event_length(event);
172 if (event->type != RINGBUF_TYPE_DATA)
173 return length;
174 length -= RB_EVNT_HDR_SIZE;
175 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
176 length -= sizeof(event->array[0]);
177 return length;
172} 178}
173EXPORT_SYMBOL_GPL(ring_buffer_event_length); 179EXPORT_SYMBOL_GPL(ring_buffer_event_length);
174 180
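
The first ring_buffer.c hunk changes ring_buffer_event_length() so that, for data events, it reports only the payload: the event header is stripped, and events large enough to carry an extended length word in array[0] have that word stripped as well; non-data events keep the raw length. Below is a small, runnable sketch of the same arithmetic. The constants are placeholders picked for the demo, not the kernel's real RB_EVNT_HDR_SIZE / RB_MAX_SMALL_DATA values.

/*
 * Sketch of the payload-length arithmetic from the hunk above.  The
 * constants are placeholders; the kernel derives them from its event
 * layout.
 */
#include <stdio.h>

#define DEMO_HDR_SIZE    4u   /* bytes of event header              */
#define DEMO_MAX_SMALL  28u   /* payload that fits without array[0] */
#define DEMO_LEN_WORD    4u   /* extra length word for big events   */

static unsigned payload_length(unsigned raw_event_length)
{
	unsigned length = raw_event_length - DEMO_HDR_SIZE;

	/* big events carry an extra length word that is not user data */
	if (length > DEMO_MAX_SMALL + DEMO_LEN_WORD)
		length -= DEMO_LEN_WORD;
	return length;
}

int main(void)
{
	printf("raw 16 -> payload %u\n", payload_length(16));
	printf("raw 64 -> payload %u\n", payload_length(64));
	return 0;
}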
@@ -195,7 +201,7 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
195EXPORT_SYMBOL_GPL(ring_buffer_event_data); 201EXPORT_SYMBOL_GPL(ring_buffer_event_data);
196 202
197#define for_each_buffer_cpu(buffer, cpu) \ 203#define for_each_buffer_cpu(buffer, cpu) \
198 for_each_cpu_mask(cpu, buffer->cpumask) 204 for_each_cpu(cpu, buffer->cpumask)
199 205
200#define TS_SHIFT 27 206#define TS_SHIFT 27
201#define TS_MASK ((1ULL << TS_SHIFT) - 1) 207#define TS_MASK ((1ULL << TS_SHIFT) - 1)
@@ -240,7 +246,7 @@ static inline int test_time_stamp(u64 delta)
240 return 0; 246 return 0;
241} 247}
242 248
243#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page)) 249#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
244 250
245/* 251/*
246 * head_page == tail_page && head == tail then buffer is empty. 252 * head_page == tail_page && head == tail then buffer is empty.
@@ -267,7 +273,7 @@ struct ring_buffer {
267 unsigned pages; 273 unsigned pages;
268 unsigned flags; 274 unsigned flags;
269 int cpus; 275 int cpus;
270 cpumask_t cpumask; 276 cpumask_var_t cpumask;
271 atomic_t record_disabled; 277 atomic_t record_disabled;
272 278
273 struct mutex mutex; 279 struct mutex mutex;
@@ -458,6 +464,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
458 if (!buffer) 464 if (!buffer)
459 return NULL; 465 return NULL;
460 466
467 if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
468 goto fail_free_buffer;
469
461 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 470 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
462 buffer->flags = flags; 471 buffer->flags = flags;
463 472
@@ -465,14 +474,14 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
465 if (buffer->pages == 1) 474 if (buffer->pages == 1)
466 buffer->pages++; 475 buffer->pages++;
467 476
468 buffer->cpumask = cpu_possible_map; 477 cpumask_copy(buffer->cpumask, cpu_possible_mask);
469 buffer->cpus = nr_cpu_ids; 478 buffer->cpus = nr_cpu_ids;
470 479
471 bsize = sizeof(void *) * nr_cpu_ids; 480 bsize = sizeof(void *) * nr_cpu_ids;
472 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), 481 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
473 GFP_KERNEL); 482 GFP_KERNEL);
474 if (!buffer->buffers) 483 if (!buffer->buffers)
475 goto fail_free_buffer; 484 goto fail_free_cpumask;
476 485
477 for_each_buffer_cpu(buffer, cpu) { 486 for_each_buffer_cpu(buffer, cpu) {
478 buffer->buffers[cpu] = 487 buffer->buffers[cpu] =
@@ -492,6 +501,9 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
492 } 501 }
493 kfree(buffer->buffers); 502 kfree(buffer->buffers);
494 503
504 fail_free_cpumask:
505 free_cpumask_var(buffer->cpumask);
506
495 fail_free_buffer: 507 fail_free_buffer:
496 kfree(buffer); 508 kfree(buffer);
497 return NULL; 509 return NULL;
@@ -510,6 +522,8 @@ ring_buffer_free(struct ring_buffer *buffer)
510 for_each_buffer_cpu(buffer, cpu) 522 for_each_buffer_cpu(buffer, cpu)
511 rb_free_cpu_buffer(buffer->buffers[cpu]); 523 rb_free_cpu_buffer(buffer->buffers[cpu]);
512 524
525 free_cpumask_var(buffer->cpumask);
526
513 kfree(buffer); 527 kfree(buffer);
514} 528}
515EXPORT_SYMBOL_GPL(ring_buffer_free); 529EXPORT_SYMBOL_GPL(ring_buffer_free);
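
Converting struct ring_buffer's embedded cpumask_t into a cpumask_var_t means the mask is no longer storage inside the structure: ring_buffer_alloc() must alloc_cpumask_var() it, ring_buffer_free() must free_cpumask_var() it, and the allocation failure path gains the new fail_free_cpumask label. The runnable userspace sketch below mirrors that alloc/copy/free discipline and the goto unwinding with an ordinary calloc'ed byte mask; the struct and helper names are invented for the demo.

/*
 * Userspace sketch of the alloc/copy/free discipline that cpumask_var_t
 * imposes in the hunks above, including the extra goto label in the
 * error unwind.  Types and names are invented for the demo.
 */
#include <stdlib.h>
#include <string.h>

#define DEMO_NR_CPUS 64

struct demo_buffer {
	unsigned char *cpumask;     /* was an embedded fixed-size mask */
	void **buffers;
};

static struct demo_buffer *demo_buffer_alloc(void)
{
	struct demo_buffer *b = calloc(1, sizeof(*b));

	if (!b)
		return NULL;

	b->cpumask = calloc(DEMO_NR_CPUS, 1);        /* alloc_cpumask_var() */
	if (!b->cpumask)
		goto fail_free_buffer;

	memset(b->cpumask, 1, DEMO_NR_CPUS);         /* cpumask_copy(.., possible) */

	b->buffers = calloc(DEMO_NR_CPUS, sizeof(void *));
	if (!b->buffers)
		goto fail_free_cpumask;              /* the new unwind step */

	return b;

fail_free_cpumask:
	free(b->cpumask);
fail_free_buffer:
	free(b);
	return NULL;
}

static void demo_buffer_free(struct demo_buffer *b)
{
	free(b->buffers);
	free(b->cpumask);                            /* free_cpumask_var() */
	free(b);
}

int main(void)
{
	struct demo_buffer *b = demo_buffer_alloc();

	if (b)
		demo_buffer_free(b);
	return 0;
}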
@@ -1011,12 +1025,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1011 } 1025 }
1012 1026
1013 if (next_page == head_page) { 1027 if (next_page == head_page) {
1014 if (!(buffer->flags & RB_FL_OVERWRITE)) { 1028 if (!(buffer->flags & RB_FL_OVERWRITE))
1015 /* reset write */
1016 if (tail <= BUF_PAGE_SIZE)
1017 local_set(&tail_page->write, tail);
1018 goto out_unlock; 1029 goto out_unlock;
1019 }
1020 1030
1021 /* tail_page has not moved yet? */ 1031 /* tail_page has not moved yet? */
1022 if (tail_page == cpu_buffer->tail_page) { 1032 if (tail_page == cpu_buffer->tail_page) {
@@ -1091,6 +1101,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1091 return event; 1101 return event;
1092 1102
1093 out_unlock: 1103 out_unlock:
1104 /* reset write */
1105 if (tail <= BUF_PAGE_SIZE)
1106 local_set(&tail_page->write, tail);
1107
1094 __raw_spin_unlock(&cpu_buffer->lock); 1108 __raw_spin_unlock(&cpu_buffer->lock);
1095 local_irq_restore(flags); 1109 local_irq_restore(flags);
1096 return NULL; 1110 return NULL;
@@ -1283,7 +1297,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1283 1297
1284 cpu = raw_smp_processor_id(); 1298 cpu = raw_smp_processor_id();
1285 1299
1286 if (!cpu_isset(cpu, buffer->cpumask)) 1300 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1287 goto out; 1301 goto out;
1288 1302
1289 cpu_buffer = buffer->buffers[cpu]; 1303 cpu_buffer = buffer->buffers[cpu];
@@ -1396,7 +1410,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1396 1410
1397 cpu = raw_smp_processor_id(); 1411 cpu = raw_smp_processor_id();
1398 1412
1399 if (!cpu_isset(cpu, buffer->cpumask)) 1413 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1400 goto out; 1414 goto out;
1401 1415
1402 cpu_buffer = buffer->buffers[cpu]; 1416 cpu_buffer = buffer->buffers[cpu];
@@ -1478,7 +1492,7 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1478{ 1492{
1479 struct ring_buffer_per_cpu *cpu_buffer; 1493 struct ring_buffer_per_cpu *cpu_buffer;
1480 1494
1481 if (!cpu_isset(cpu, buffer->cpumask)) 1495 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1482 return; 1496 return;
1483 1497
1484 cpu_buffer = buffer->buffers[cpu]; 1498 cpu_buffer = buffer->buffers[cpu];
@@ -1498,7 +1512,7 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1498{ 1512{
1499 struct ring_buffer_per_cpu *cpu_buffer; 1513 struct ring_buffer_per_cpu *cpu_buffer;
1500 1514
1501 if (!cpu_isset(cpu, buffer->cpumask)) 1515 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1502 return; 1516 return;
1503 1517
1504 cpu_buffer = buffer->buffers[cpu]; 1518 cpu_buffer = buffer->buffers[cpu];
@@ -1515,7 +1529,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1515{ 1529{
1516 struct ring_buffer_per_cpu *cpu_buffer; 1530 struct ring_buffer_per_cpu *cpu_buffer;
1517 1531
1518 if (!cpu_isset(cpu, buffer->cpumask)) 1532 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1519 return 0; 1533 return 0;
1520 1534
1521 cpu_buffer = buffer->buffers[cpu]; 1535 cpu_buffer = buffer->buffers[cpu];
@@ -1532,7 +1546,7 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1532{ 1546{
1533 struct ring_buffer_per_cpu *cpu_buffer; 1547 struct ring_buffer_per_cpu *cpu_buffer;
1534 1548
1535 if (!cpu_isset(cpu, buffer->cpumask)) 1549 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1536 return 0; 1550 return 0;
1537 1551
1538 cpu_buffer = buffer->buffers[cpu]; 1552 cpu_buffer = buffer->buffers[cpu];
@@ -1850,7 +1864,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1850 struct buffer_page *reader; 1864 struct buffer_page *reader;
1851 int nr_loops = 0; 1865 int nr_loops = 0;
1852 1866
1853 if (!cpu_isset(cpu, buffer->cpumask)) 1867 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1854 return NULL; 1868 return NULL;
1855 1869
1856 cpu_buffer = buffer->buffers[cpu]; 1870 cpu_buffer = buffer->buffers[cpu];
@@ -2025,7 +2039,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2025 struct ring_buffer_event *event; 2039 struct ring_buffer_event *event;
2026 unsigned long flags; 2040 unsigned long flags;
2027 2041
2028 if (!cpu_isset(cpu, buffer->cpumask)) 2042 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2029 return NULL; 2043 return NULL;
2030 2044
2031 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2045 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2062,7 +2076,7 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
2062 struct ring_buffer_iter *iter; 2076 struct ring_buffer_iter *iter;
2063 unsigned long flags; 2077 unsigned long flags;
2064 2078
2065 if (!cpu_isset(cpu, buffer->cpumask)) 2079 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2066 return NULL; 2080 return NULL;
2067 2081
2068 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 2082 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
@@ -2160,6 +2174,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2160 2174
2161 cpu_buffer->overrun = 0; 2175 cpu_buffer->overrun = 0;
2162 cpu_buffer->entries = 0; 2176 cpu_buffer->entries = 0;
2177
2178 cpu_buffer->write_stamp = 0;
2179 cpu_buffer->read_stamp = 0;
2163} 2180}
2164 2181
2165/** 2182/**
@@ -2172,7 +2189,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2172 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2189 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2173 unsigned long flags; 2190 unsigned long flags;
2174 2191
2175 if (!cpu_isset(cpu, buffer->cpumask)) 2192 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2176 return; 2193 return;
2177 2194
2178 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2195 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2228,7 +2245,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2228{ 2245{
2229 struct ring_buffer_per_cpu *cpu_buffer; 2246 struct ring_buffer_per_cpu *cpu_buffer;
2230 2247
2231 if (!cpu_isset(cpu, buffer->cpumask)) 2248 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2232 return 1; 2249 return 1;
2233 2250
2234 cpu_buffer = buffer->buffers[cpu]; 2251 cpu_buffer = buffer->buffers[cpu];
@@ -2252,8 +2269,8 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2252 struct ring_buffer_per_cpu *cpu_buffer_a; 2269 struct ring_buffer_per_cpu *cpu_buffer_a;
2253 struct ring_buffer_per_cpu *cpu_buffer_b; 2270 struct ring_buffer_per_cpu *cpu_buffer_b;
2254 2271
2255 if (!cpu_isset(cpu, buffer_a->cpumask) || 2272 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2256 !cpu_isset(cpu, buffer_b->cpumask)) 2273 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2257 return -EINVAL; 2274 return -EINVAL;
2258 2275
2259 /* At least make sure the two buffers are somewhat the same */ 2276 /* At least make sure the two buffers are somewhat the same */
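
The remaining ring_buffer.c changes are mechanical: every cpu_isset(cpu, buffer->cpumask) becomes cpumask_test_cpu(cpu, buffer->cpumask), since the mask is now reached through a pointer instead of being carried by value, and rb_reset_cpu() additionally clears the read/write timestamps. The "test membership before touching per-CPU state" guard has a direct userspace counterpart in glibc's cpu_set_t, which the runnable sketch below uses; it assumes only a Linux system with sched_getaffinity().

/*
 * Runnable userspace analogue of the cpumask_test_cpu() guard pattern:
 * read this process's affinity mask and skip CPUs that are not in it.
 * Requires Linux/glibc (sched_getaffinity, CPU_* macros).
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	cpu_set_t mask;
	long nr_cpus = sysconf(_SC_NPROCESSORS_CONF);

	CPU_ZERO(&mask);
	if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_getaffinity");
		return 1;
	}

	for (long cpu = 0; cpu < nr_cpus; cpu++) {
		if (!CPU_ISSET(cpu, &mask))     /* cpumask_test_cpu() analogue */
			continue;
		printf("would touch per-CPU state for cpu %ld\n", cpu);
	}
	return 0;
}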
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4185d5221633..17bb88d86ac2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -40,7 +40,7 @@
40 40
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42 42
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 43unsigned long __read_mostly tracing_max_latency;
44unsigned long __read_mostly tracing_thresh; 44unsigned long __read_mostly tracing_thresh;
45 45
46/* 46/*
@@ -89,10 +89,10 @@ static inline void ftrace_enable_cpu(void)
89 preempt_enable(); 89 preempt_enable();
90} 90}
91 91
92static cpumask_t __read_mostly tracing_buffer_mask; 92static cpumask_var_t __read_mostly tracing_buffer_mask;
93 93
94#define for_each_tracing_cpu(cpu) \ 94#define for_each_tracing_cpu(cpu) \
95 for_each_cpu_mask(cpu, tracing_buffer_mask) 95 for_each_cpu(cpu, tracing_buffer_mask)
96 96
97/* 97/*
98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops 98 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
@@ -1811,10 +1811,10 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) 1811 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1812 return; 1812 return;
1813 1813
1814 if (cpu_isset(iter->cpu, iter->started)) 1814 if (cpumask_test_cpu(iter->cpu, iter->started))
1815 return; 1815 return;
1816 1816
1817 cpu_set(iter->cpu, iter->started); 1817 cpumask_set_cpu(iter->cpu, iter->started);
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819} 1819}
1820 1820
@@ -2646,13 +2646,7 @@ static struct file_operations show_traces_fops = {
2646/* 2646/*
2647 * Only trace on a CPU if the bitmask is set: 2647 * Only trace on a CPU if the bitmask is set:
2648 */ 2648 */
2649static cpumask_t tracing_cpumask = CPU_MASK_ALL; 2649static cpumask_var_t tracing_cpumask;
2650
2651/*
2652 * When tracing/tracing_cpu_mask is modified then this holds
2653 * the new bitmask we are about to install:
2654 */
2655static cpumask_t tracing_cpumask_new;
2656 2650
2657/* 2651/*
2658 * The tracer itself will not take this lock, but still we want 2652 * The tracer itself will not take this lock, but still we want
@@ -2693,6 +2687,10 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2693 size_t count, loff_t *ppos) 2687 size_t count, loff_t *ppos)
2694{ 2688{
2695 int err, cpu; 2689 int err, cpu;
2690 cpumask_var_t tracing_cpumask_new;
2691
2692 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2693 return -ENOMEM;
2696 2694
2697 mutex_lock(&tracing_cpumask_update_lock); 2695 mutex_lock(&tracing_cpumask_update_lock);
2698 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2696 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
@@ -2706,26 +2704,28 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2706 * Increase/decrease the disabled counter if we are 2704 * Increase/decrease the disabled counter if we are
2707 * about to flip a bit in the cpumask: 2705 * about to flip a bit in the cpumask:
2708 */ 2706 */
2709 if (cpu_isset(cpu, tracing_cpumask) && 2707 if (cpumask_test_cpu(cpu, tracing_cpumask) &&
2710 !cpu_isset(cpu, tracing_cpumask_new)) { 2708 !cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2711 atomic_inc(&global_trace.data[cpu]->disabled); 2709 atomic_inc(&global_trace.data[cpu]->disabled);
2712 } 2710 }
2713 if (!cpu_isset(cpu, tracing_cpumask) && 2711 if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
2714 cpu_isset(cpu, tracing_cpumask_new)) { 2712 cpumask_test_cpu(cpu, tracing_cpumask_new)) {
2715 atomic_dec(&global_trace.data[cpu]->disabled); 2713 atomic_dec(&global_trace.data[cpu]->disabled);
2716 } 2714 }
2717 } 2715 }
2718 __raw_spin_unlock(&ftrace_max_lock); 2716 __raw_spin_unlock(&ftrace_max_lock);
2719 local_irq_enable(); 2717 local_irq_enable();
2720 2718
2721 tracing_cpumask = tracing_cpumask_new; 2719 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
2722 2720
2723 mutex_unlock(&tracing_cpumask_update_lock); 2721 mutex_unlock(&tracing_cpumask_update_lock);
2722 free_cpumask_var(tracing_cpumask_new);
2724 2723
2725 return count; 2724 return count;
2726 2725
2727err_unlock: 2726err_unlock:
2728 mutex_unlock(&tracing_cpumask_update_lock); 2727 mutex_unlock(&tracing_cpumask_update_lock);
2728 free_cpumask_var(tracing_cpumask);
2729 2729
2730 return err; 2730 return err;
2731} 2731}
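
The tracing_cpumask_write() hunk keeps the per-CPU "disabled" counters consistent while swapping in a user-supplied mask: a CPU leaving the mask has its counter incremented (tracing off), a CPU entering the mask has it decremented, and only then is the new mask copied over the old one, which now lives in a heap-allocated cpumask_var_t instead of a static cpumask_t. The toy below walks the same old-versus-new comparison with plain 64-bit masks; the names, mask width, and counter array are invented for the demo.

/*
 * Toy version of the bit-flip accounting in tracing_cpumask_write():
 * compare old and new masks bit by bit and adjust a per-CPU counter
 * before installing the new mask.  Names and types are invented.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_NR_CPUS 8

static int disabled[DEMO_NR_CPUS];

static void install_mask(uint64_t *cur, uint64_t new_mask)
{
	for (int cpu = 0; cpu < DEMO_NR_CPUS; cpu++) {
		int was = !!(*cur & (1ull << cpu));
		int now = !!(new_mask & (1ull << cpu));

		if (was && !now)
			disabled[cpu]++;   /* CPU left the mask: stop tracing it */
		if (!was && now)
			disabled[cpu]--;   /* CPU joined the mask: resume */
	}
	*cur = new_mask;                   /* the cpumask_copy() step */
}

int main(void)
{
	uint64_t cur = 0xffull;            /* all eight demo CPUs enabled */

	install_mask(&cur, 0x0full);       /* drop CPUs 4-7 */
	for (int cpu = 0; cpu < DEMO_NR_CPUS; cpu++)
		printf("cpu %d disabled=%d\n", cpu, disabled[cpu]);
	return 0;
}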
@@ -3114,10 +3114,15 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3114 if (!iter) 3114 if (!iter)
3115 return -ENOMEM; 3115 return -ENOMEM;
3116 3116
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3118 kfree(iter);
3119 return -ENOMEM;
3120 }
3121
3117 mutex_lock(&trace_types_lock); 3122 mutex_lock(&trace_types_lock);
3118 3123
3119 /* trace pipe does not show start of buffer */ 3124 /* trace pipe does not show start of buffer */
3120 cpus_setall(iter->started); 3125 cpumask_setall(iter->started);
3121 3126
3122 iter->tr = &global_trace; 3127 iter->tr = &global_trace;
3123 iter->trace = current_trace; 3128 iter->trace = current_trace;
@@ -3134,6 +3139,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
3134{ 3139{
3135 struct trace_iterator *iter = file->private_data; 3140 struct trace_iterator *iter = file->private_data;
3136 3141
3142 free_cpumask_var(iter->started);
3137 kfree(iter); 3143 kfree(iter);
3138 atomic_dec(&tracing_reader); 3144 atomic_dec(&tracing_reader);
3139 3145
@@ -3730,7 +3736,7 @@ static struct notifier_block trace_die_notifier = {
3730 * it if we decide to change what log level the ftrace dump 3736 * it if we decide to change what log level the ftrace dump
3731 * should be at. 3737 * should be at.
3732 */ 3738 */
3733#define KERN_TRACE KERN_INFO 3739#define KERN_TRACE KERN_EMERG
3734 3740
3735static void 3741static void
3736trace_printk_seq(struct trace_seq *s) 3742trace_printk_seq(struct trace_seq *s)
@@ -3752,7 +3758,6 @@ void ftrace_dump(void)
3752 static DEFINE_SPINLOCK(ftrace_dump_lock); 3758 static DEFINE_SPINLOCK(ftrace_dump_lock);
3753 /* use static because iter can be a bit big for the stack */ 3759 /* use static because iter can be a bit big for the stack */
3754 static struct trace_iterator iter; 3760 static struct trace_iterator iter;
3755 static cpumask_t mask;
3756 static int dump_ran; 3761 static int dump_ran;
3757 unsigned long flags; 3762 unsigned long flags;
3758 int cnt = 0, cpu; 3763 int cnt = 0, cpu;
@@ -3765,6 +3770,7 @@ void ftrace_dump(void)
3765 dump_ran = 1; 3770 dump_ran = 1;
3766 3771
3767 /* No turning back! */ 3772 /* No turning back! */
3773 tracing_off();
3768 ftrace_kill(); 3774 ftrace_kill();
3769 3775
3770 for_each_tracing_cpu(cpu) { 3776 for_each_tracing_cpu(cpu) {
@@ -3786,8 +3792,6 @@ void ftrace_dump(void)
3786 * and then release the locks again. 3792 * and then release the locks again.
3787 */ 3793 */
3788 3794
3789 cpus_clear(mask);
3790
3791 while (!trace_empty(&iter)) { 3795 while (!trace_empty(&iter)) {
3792 3796
3793 if (!cnt) 3797 if (!cnt)
@@ -3823,19 +3827,28 @@ __init static int tracer_alloc_buffers(void)
3823{ 3827{
3824 struct trace_array_cpu *data; 3828 struct trace_array_cpu *data;
3825 int i; 3829 int i;
3830 int ret = -ENOMEM;
3826 3831
3827 /* TODO: make the number of buffers hot pluggable with CPUS */ 3832 if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
3828 tracing_buffer_mask = cpu_possible_map; 3833 goto out;
3834
3835 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3836 goto out_free_buffer_mask;
3837
3838 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3839 cpumask_copy(tracing_cpumask, cpu_all_mask);
3829 3840
3841 /* TODO: make the number of buffers hot pluggable with CPUS */
3830 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 3842 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3831 TRACE_BUFFER_FLAGS); 3843 TRACE_BUFFER_FLAGS);
3832 if (!global_trace.buffer) { 3844 if (!global_trace.buffer) {
3833 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 3845 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3834 WARN_ON(1); 3846 WARN_ON(1);
3835 return 0; 3847 goto out_free_cpumask;
3836 } 3848 }
3837 global_trace.entries = ring_buffer_size(global_trace.buffer); 3849 global_trace.entries = ring_buffer_size(global_trace.buffer);
3838 3850
3851
3839#ifdef CONFIG_TRACER_MAX_TRACE 3852#ifdef CONFIG_TRACER_MAX_TRACE
3840 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 3853 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3841 TRACE_BUFFER_FLAGS); 3854 TRACE_BUFFER_FLAGS);
@@ -3843,7 +3856,7 @@ __init static int tracer_alloc_buffers(void)
3843 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 3856 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3844 WARN_ON(1); 3857 WARN_ON(1);
3845 ring_buffer_free(global_trace.buffer); 3858 ring_buffer_free(global_trace.buffer);
3846 return 0; 3859 goto out_free_cpumask;
3847 } 3860 }
3848 max_tr.entries = ring_buffer_size(max_tr.buffer); 3861 max_tr.entries = ring_buffer_size(max_tr.buffer);
3849 WARN_ON(max_tr.entries != global_trace.entries); 3862 WARN_ON(max_tr.entries != global_trace.entries);
@@ -3873,8 +3886,14 @@ __init static int tracer_alloc_buffers(void)
3873 &trace_panic_notifier); 3886 &trace_panic_notifier);
3874 3887
3875 register_die_notifier(&trace_die_notifier); 3888 register_die_notifier(&trace_die_notifier);
3889 ret = 0;
3876 3890
3877 return 0; 3891out_free_cpumask:
3892 free_cpumask_var(tracing_cpumask);
3893out_free_buffer_mask:
3894 free_cpumask_var(tracing_buffer_mask);
3895out:
3896 return ret;
3878} 3897}
3879early_initcall(tracer_alloc_buffers); 3898early_initcall(tracer_alloc_buffers);
3880fs_initcall(tracer_init_debugfs); 3899fs_initcall(tracer_init_debugfs);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f864036..4d3d381bfd95 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -368,7 +368,7 @@ struct trace_iterator {
368 loff_t pos; 368 loff_t pos;
369 long idx; 369 long idx;
370 370
371 cpumask_t started; 371 cpumask_var_t started;
372}; 372};
373 373
374int tracing_is_enabled(void); 374int tracing_is_enabled(void);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3ccebde28482..366c8c333e13 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -42,7 +42,7 @@ static int boot_trace_init(struct trace_array *tr)
42 int cpu; 42 int cpu;
43 boot_trace = tr; 43 boot_trace = tr;
44 44
45 for_each_cpu_mask(cpu, cpu_possible_map) 45 for_each_cpu(cpu, cpu_possible_mask)
46 tracing_reset(tr, cpu); 46 tracing_reset(tr, cpu);
47 47
48 tracing_sched_switch_assign_trace(tr); 48 tracing_sched_switch_assign_trace(tr);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4bf39fcae97a..930c08e5b38e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -79,7 +79,7 @@ print_graph_cpu(struct trace_seq *s, int cpu)
79 int i; 79 int i;
80 int ret; 80 int ret;
81 int log10_this = log10_cpu(cpu); 81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map)); 82 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
83 83
84 84
85 /* 85 /*
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index b6a3e20a49a9..649df22d435f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -46,7 +46,7 @@ static void bts_trace_start(struct trace_array *tr)
46 46
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48 48
49 for_each_cpu_mask(cpu, cpu_possible_map) 49 for_each_cpu(cpu, cpu_possible_mask)
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51} 51}
52 52
@@ -62,7 +62,7 @@ static void bts_trace_stop(struct trace_array *tr)
62{ 62{
63 int cpu; 63 int cpu;
64 64
65 for_each_cpu_mask(cpu, cpu_possible_map) 65 for_each_cpu(cpu, cpu_possible_mask)
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
67} 67}
68 68
@@ -172,7 +172,7 @@ static void trace_bts_prepare(struct trace_iterator *iter)
172{ 172{
173 int cpu; 173 int cpu;
174 174
175 for_each_cpu_mask(cpu, cpu_possible_map) 175 for_each_cpu(cpu, cpu_possible_mask)
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1); 176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177} 177}
178 178
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..62a78d943534 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
380 380
381static void __irqsoff_tracer_init(struct trace_array *tr) 381static void __irqsoff_tracer_init(struct trace_array *tr)
382{ 382{
383 tracing_max_latency = 0;
383 irqsoff_trace = tr; 384 irqsoff_trace = tr;
384 /* make sure that the tracer is visible */ 385 /* make sure that the tracer is visible */
385 smp_wmb(); 386 smp_wmb();
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a352f62..7bda248daf55 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -39,7 +39,7 @@ static int power_trace_init(struct trace_array *tr)
39 39
40 trace_power_enabled = 1; 40 trace_power_enabled = 1;
41 41
42 for_each_cpu_mask(cpu, cpu_possible_map) 42 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 43 tracing_reset(tr, cpu);
44 return 0; 44 return 0;
45} 45}
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..42ae1e77b6b3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
333 333
334static int wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
335{ 335{
336 tracing_max_latency = 0;
336 wakeup_trace = tr; 337 wakeup_trace = tr;
337 start_wakeup_tracer(tr); 338 start_wakeup_tracer(tr);
338 return 0; 339 return 0;
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index a5779bd975db..eaca5ad803ff 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -196,9 +196,9 @@ static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
196 return HRTIMER_RESTART; 196 return HRTIMER_RESTART;
197} 197}
198 198
199static void start_stack_timer(int cpu) 199static void start_stack_timer(void *unused)
200{ 200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); 201 struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer);
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
@@ -208,14 +208,7 @@ static void start_stack_timer(int cpu)
208 208
209static void start_stack_timers(void) 209static void start_stack_timers(void)
210{ 210{
211 cpumask_t saved_mask = current->cpus_allowed; 211 on_each_cpu(start_stack_timer, NULL, 1);
212 int cpu;
213
214 for_each_online_cpu(cpu) {
215 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
216 start_stack_timer(cpu);
217 }
218 set_cpus_allowed_ptr(current, &saved_mask);
219} 212}
220 213
221static void stop_stack_timer(int cpu) 214static void stop_stack_timer(int cpu)
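
The trace_sysprof.c hunk drops the old scheme of migrating the current task to each online CPU with set_cpus_allowed_ptr() and instead calls on_each_cpu(), which runs the (now per-CPU, argument-free) callback on every online CPU. There is no exact userspace equivalent of an IPI broadcast, but pinning one worker thread per CPU gives the same "run this function once per CPU, then wait" effect. The runnable sketch below assumes Linux/glibc (pthread_setaffinity_np, sched_getcpu) and keeps error handling minimal.

/*
 * Userspace approximation of on_each_cpu(): spawn a thread per online
 * CPU, pin it, and run the callback there.  Joining the threads plays
 * the role of on_each_cpu()'s wait argument.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void per_cpu_work(void *unused)
{
	(void)unused;
	printf("callback running on cpu %d\n", sched_getcpu());
}

struct worker { pthread_t tid; int cpu; };

static void *worker_fn(void *arg)
{
	struct worker *w = arg;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(w->cpu, &set);
	pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
	per_cpu_work(NULL);
	return NULL;
}

int main(void)
{
	long n = sysconf(_SC_NPROCESSORS_ONLN);
	struct worker *w = calloc(n, sizeof(*w));

	for (long i = 0; i < n; i++) {
		w[i].cpu = (int)i;
		pthread_create(&w[i].tid, NULL, worker_fn, &w[i]);
	}
	for (long i = 0; i < n; i++)
		pthread_join(w[i].tid, NULL);   /* wait == 1 semantics */
	free(w);
	return 0;
}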
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 2dc06ab35716..43f891b05a4b 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -92,8 +92,8 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
92 mm = get_task_mm(p); 92 mm = get_task_mm(p);
93 if (mm) { 93 if (mm) {
94 /* adjust to KB unit */ 94 /* adjust to KB unit */
95 stats->hiwater_rss = mm->hiwater_rss * PAGE_SIZE / KB; 95 stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB;
96 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
97 mmput(mm); 97 mmput(mm);
98 } 98 }
99 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 2460c3199b5a..0314501688b9 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/uaccess.h> 18#include <asm/uaccess.h>
19 19
20asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gid_t group) 20SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
21{ 21{
22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); 22 long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
23 /* avoid REGPARM breakage on x86: */ 23 /* avoid REGPARM breakage on x86: */
@@ -25,7 +25,7 @@ asmlinkage long sys_chown16(const char __user * filename, old_uid_t user, old_gi
25 return ret; 25 return ret;
26} 26}
27 27
28asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_gid_t group) 28SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
29{ 29{
30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); 30 long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
31 /* avoid REGPARM breakage on x86: */ 31 /* avoid REGPARM breakage on x86: */
@@ -33,7 +33,7 @@ asmlinkage long sys_lchown16(const char __user * filename, old_uid_t user, old_g
33 return ret; 33 return ret;
34} 34}
35 35
36asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group) 36SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
37{ 37{
38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); 38 long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
39 /* avoid REGPARM breakage on x86: */ 39 /* avoid REGPARM breakage on x86: */
@@ -41,7 +41,7 @@ asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group)
41 return ret; 41 return ret;
42} 42}
43 43
44asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid) 44SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
45{ 45{
46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); 46 long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
47 /* avoid REGPARM breakage on x86: */ 47 /* avoid REGPARM breakage on x86: */
@@ -49,7 +49,7 @@ asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid)
49 return ret; 49 return ret;
50} 50}
51 51
52asmlinkage long sys_setgid16(old_gid_t gid) 52SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
53{ 53{
54 long ret = sys_setgid(low2highgid(gid)); 54 long ret = sys_setgid(low2highgid(gid));
55 /* avoid REGPARM breakage on x86: */ 55 /* avoid REGPARM breakage on x86: */
@@ -57,7 +57,7 @@ asmlinkage long sys_setgid16(old_gid_t gid)
57 return ret; 57 return ret;
58} 58}
59 59
60asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid) 60SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
61{ 61{
62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); 62 long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
63 /* avoid REGPARM breakage on x86: */ 63 /* avoid REGPARM breakage on x86: */
@@ -65,7 +65,7 @@ asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid)
65 return ret; 65 return ret;
66} 66}
67 67
68asmlinkage long sys_setuid16(old_uid_t uid) 68SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
69{ 69{
70 long ret = sys_setuid(low2highuid(uid)); 70 long ret = sys_setuid(low2highuid(uid));
71 /* avoid REGPARM breakage on x86: */ 71 /* avoid REGPARM breakage on x86: */
@@ -73,7 +73,7 @@ asmlinkage long sys_setuid16(old_uid_t uid)
73 return ret; 73 return ret;
74} 74}
75 75
76asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid) 76SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
77{ 77{
78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), 78 long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
79 low2highuid(suid)); 79 low2highuid(suid));
@@ -82,7 +82,7 @@ asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid)
82 return ret; 82 return ret;
83} 83}
84 84
85asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid, old_uid_t __user *suid) 85SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid)
86{ 86{
87 const struct cred *cred = current_cred(); 87 const struct cred *cred = current_cred();
88 int retval; 88 int retval;
@@ -94,7 +94,7 @@ asmlinkage long sys_getresuid16(old_uid_t __user *ruid, old_uid_t __user *euid,
94 return retval; 94 return retval;
95} 95}
96 96
97asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid) 97SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
98{ 98{
99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), 99 long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
100 low2highgid(sgid)); 100 low2highgid(sgid));
@@ -103,7 +103,8 @@ asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid)
103 return ret; 103 return ret;
104} 104}
105 105
106asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid, old_gid_t __user *sgid) 106
107SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid)
107{ 108{
108 const struct cred *cred = current_cred(); 109 const struct cred *cred = current_cred();
109 int retval; 110 int retval;
@@ -115,7 +116,7 @@ asmlinkage long sys_getresgid16(old_gid_t __user *rgid, old_gid_t __user *egid,
115 return retval; 116 return retval;
116} 117}
117 118
118asmlinkage long sys_setfsuid16(old_uid_t uid) 119SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
119{ 120{
120 long ret = sys_setfsuid(low2highuid(uid)); 121 long ret = sys_setfsuid(low2highuid(uid));
121 /* avoid REGPARM breakage on x86: */ 122 /* avoid REGPARM breakage on x86: */
@@ -123,7 +124,7 @@ asmlinkage long sys_setfsuid16(old_uid_t uid)
123 return ret; 124 return ret;
124} 125}
125 126
126asmlinkage long sys_setfsgid16(old_gid_t gid) 127SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
127{ 128{
128 long ret = sys_setfsgid(low2highgid(gid)); 129 long ret = sys_setfsgid(low2highgid(gid));
129 /* avoid REGPARM breakage on x86: */ 130 /* avoid REGPARM breakage on x86: */
@@ -161,7 +162,7 @@ static int groups16_from_user(struct group_info *group_info,
161 return 0; 162 return 0;
162} 163}
163 164
164asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist) 165SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
165{ 166{
166 const struct cred *cred = current_cred(); 167 const struct cred *cred = current_cred();
167 int i; 168 int i;
@@ -184,7 +185,7 @@ out:
184 return i; 185 return i;
185} 186}
186 187
187asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist) 188SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
188{ 189{
189 struct group_info *group_info; 190 struct group_info *group_info;
190 int retval; 191 int retval;
@@ -209,22 +210,22 @@ asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist)
209 return retval; 210 return retval;
210} 211}
211 212
212asmlinkage long sys_getuid16(void) 213SYSCALL_DEFINE0(getuid16)
213{ 214{
214 return high2lowuid(current_uid()); 215 return high2lowuid(current_uid());
215} 216}
216 217
217asmlinkage long sys_geteuid16(void) 218SYSCALL_DEFINE0(geteuid16)
218{ 219{
219 return high2lowuid(current_euid()); 220 return high2lowuid(current_euid());
220} 221}
221 222
222asmlinkage long sys_getgid16(void) 223SYSCALL_DEFINE0(getgid16)
223{ 224{
224 return high2lowgid(current_gid()); 225 return high2lowgid(current_gid());
225} 226}
226 227
227asmlinkage long sys_getegid16(void) 228SYSCALL_DEFINE0(getegid16)
228{ 229{
229 return high2lowgid(current_egid()); 230 return high2lowgid(current_egid());
230} 231}
diff --git a/kernel/up.c b/kernel/up.c
new file mode 100644
index 000000000000..1ff27a28bb7d
--- /dev/null
+++ b/kernel/up.c
@@ -0,0 +1,21 @@
1/*
2 * Uniprocessor-only support functions. The counterpart to kernel/smp.c
3 */
4
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/module.h>
8#include <linux/smp.h>
9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
11 int wait)
12{
13 WARN_ON(cpu != 0);
14
15 local_irq_disable();
16 (func)(info);
17 local_irq_enable();
18
19 return 0;
20}
21EXPORT_SYMBOL(smp_call_function_single);
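
kernel/up.c is new: on uniprocessor builds there is no other CPU to send an IPI to, so smp_call_function_single() collapses to "warn unless the target is CPU 0, mask local interrupts, call the function directly". The tiny userspace model below captures only that contract; the IRQ masking has no userspace counterpart, and the demo_ names are invented.

/*
 * Userspace model of the UP smp_call_function_single() stub above:
 * with a single CPU the call is just a direct, synchronous invocation.
 * (The real stub also disables local interrupts; there is no userspace
 * equivalent of that.)
 */
#include <assert.h>
#include <stdio.h>

static int demo_call_function_single(int cpu, void (*func)(void *), void *info)
{
	assert(cpu == 0);     /* WARN_ON(cpu != 0) in the kernel version */
	func(info);           /* waiting is implicit: we return afterwards */
	return 0;
}

static void say_hello(void *info)
{
	printf("running %s on the only cpu\n", (const char *)info);
}

int main(void)
{
	return demo_call_function_single(0, say_hello, "say_hello");
}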
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc2..42a2dbc181c8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91} 91}
92EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
93 93
94/*
95 * finish_wait - clean up after waiting in a queue
96 * @q: waitqueue waited on
97 * @wait: wait descriptor
98 *
99 * Sets current thread back to running state and removes
100 * the wait descriptor from the given waitqueue if still
101 * queued.
102 */
94void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 103void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
95{ 104{
96 unsigned long flags; 105 unsigned long flags;
@@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
117} 126}
118EXPORT_SYMBOL(finish_wait); 127EXPORT_SYMBOL(finish_wait);
119 128
129/*
130 * abort_exclusive_wait - abort exclusive waiting in a queue
131 * @q: waitqueue waited on
132 * @wait: wait descriptor
133 * @state: runstate of the waiter to be woken
134 * @key: key to identify a wait bit queue or %NULL
135 *
136 * Sets current thread back to running state and removes
137 * the wait descriptor from the given waitqueue if still
138 * queued.
139 *
140 * Wakes up the next waiter if the caller is concurrently
141 * woken up through the queue.
142 *
143 * This prevents waiter starvation where an exclusive waiter
144 * aborts and is woken up concurrently and noone wakes up
145 * the next waiter.
146 */
147void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
148 unsigned int mode, void *key)
149{
150 unsigned long flags;
151
152 __set_current_state(TASK_RUNNING);
153 spin_lock_irqsave(&q->lock, flags);
154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key);
158 spin_unlock_irqrestore(&q->lock, flags);
159}
160EXPORT_SYMBOL(abort_exclusive_wait);
161
120int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 162int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
121{ 163{
122 int ret = default_wake_function(wait, mode, sync, key); 164 int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +219,20 @@ int __sched
177__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 219__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
178 int (*action)(void *), unsigned mode) 220 int (*action)(void *), unsigned mode)
179{ 221{
180 int ret = 0;
181
182 do { 222 do {
223 int ret;
224
183 prepare_to_wait_exclusive(wq, &q->wait, mode); 225 prepare_to_wait_exclusive(wq, &q->wait, mode);
184 if (test_bit(q->key.bit_nr, q->key.flags)) { 226 if (!test_bit(q->key.bit_nr, q->key.flags))
185 if ((ret = (*action)(q->key.flags))) 227 continue;
186 break; 228 ret = action(q->key.flags);
187 } 229 if (!ret)
230 continue;
231 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
232 return ret;
188 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 233 } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
189 finish_wait(wq, &q->wait); 234 finish_wait(wq, &q->wait);
190 return ret; 235 return 0;
191} 236}
192EXPORT_SYMBOL(__wait_on_bit_lock); 237EXPORT_SYMBOL(__wait_on_bit_lock);
193 238
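
The wait.c hunks close a wakeup-loss window for exclusive waiters: an exclusive waiter is "woken" by being removed from the queue, so if it then bails out (for example because the action callback returned an error) without passing the wakeup on, no remaining waiter ever runs. abort_exclusive_wait() therefore either dequeues the aborting waiter, or, if it was already dequeued by a concurrent wakeup, forwards the wakeup to the next waiter; __wait_on_bit_lock() now calls it on the error path instead of finish_wait(). The single-threaded simulation below mirrors only that dequeue-or-forward decision; the queue, indices, and names are invented, and real kernel wait queues are of course concurrent and lock-protected.

/*
 * Single-threaded simulation of the abort_exclusive_wait() decision:
 * if the aborting waiter is still queued it simply removes itself;
 * otherwise it was already "woken" (dequeued) and must hand the wakeup
 * to the next exclusive waiter.  Everything here is a toy model.
 */
#include <stdio.h>

#define QLEN 4

static int queued[QLEN] = { 1, 1, 1, 1 };   /* waiters 0..3 on the queue */

static void wake_one(void)
{
	for (int i = 0; i < QLEN; i++) {
		if (queued[i]) {
			queued[i] = 0;              /* exclusive wakeup: dequeue one */
			printf("waiter %d woken\n", i);
			return;
		}
	}
	printf("wakeup lost: nobody queued\n");
}

static void abort_exclusive(int me)
{
	if (queued[me]) {
		queued[me] = 0;                 /* still queued: just leave */
		printf("waiter %d dequeued itself\n", me);
	} else {
		printf("waiter %d already woken, forwarding wakeup\n", me);
		wake_one();                     /* do not swallow the wakeup */
	}
}

int main(void)
{
	wake_one();             /* waiter 0 is woken ... */
	abort_exclusive(0);     /* ... but aborts, so waiter 1 gets the wakeup */
	abort_exclusive(2);     /* a still-queued waiter just removes itself */
	return 0;
}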
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4952322cba45..1f0c509b40d3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 73static LIST_HEAD(workqueues);
74 74
75static int singlethread_cpu __read_mostly; 75static int singlethread_cpu __read_mostly;
76static cpumask_t cpu_singlethread_map __read_mostly; 76static const struct cpumask *cpu_singlethread_map __read_mostly;
77/* 77/*
78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD 78 * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD
79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work 79 * flushes cwq->worklist. This means that flush_workqueue/wait_on_work
@@ -81,7 +81,7 @@ static cpumask_t cpu_singlethread_map __read_mostly;
81 * use cpu_possible_map, the cpumask below is more a documentation 81 * use cpu_possible_map, the cpumask below is more a documentation
82 * than optimization. 82 * than optimization.
83 */ 83 */
84static cpumask_t cpu_populated_map __read_mostly; 84static cpumask_var_t cpu_populated_map __read_mostly;
85 85
86/* If it's single threaded, it isn't in the list of workqueues. */ 86/* If it's single threaded, it isn't in the list of workqueues. */
87static inline int is_wq_single_threaded(struct workqueue_struct *wq) 87static inline int is_wq_single_threaded(struct workqueue_struct *wq)
@@ -89,10 +89,10 @@ static inline int is_wq_single_threaded(struct workqueue_struct *wq)
89 return wq->singlethread; 89 return wq->singlethread;
90} 90}
91 91
92static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq) 92static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq)
93{ 93{
94 return is_wq_single_threaded(wq) 94 return is_wq_single_threaded(wq)
95 ? &cpu_singlethread_map : &cpu_populated_map; 95 ? cpu_singlethread_map : cpu_populated_map;
96} 96}
97 97
98static 98static
@@ -410,7 +410,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
410 */ 410 */
411void flush_workqueue(struct workqueue_struct *wq) 411void flush_workqueue(struct workqueue_struct *wq)
412{ 412{
413 const cpumask_t *cpu_map = wq_cpu_map(wq); 413 const struct cpumask *cpu_map = wq_cpu_map(wq);
414 int cpu; 414 int cpu;
415 415
416 might_sleep(); 416 might_sleep();
@@ -532,7 +532,7 @@ static void wait_on_work(struct work_struct *work)
532{ 532{
533 struct cpu_workqueue_struct *cwq; 533 struct cpu_workqueue_struct *cwq;
534 struct workqueue_struct *wq; 534 struct workqueue_struct *wq;
535 const cpumask_t *cpu_map; 535 const struct cpumask *cpu_map;
536 int cpu; 536 int cpu;
537 537
538 might_sleep(); 538 might_sleep();
@@ -903,7 +903,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
903 */ 903 */
904void destroy_workqueue(struct workqueue_struct *wq) 904void destroy_workqueue(struct workqueue_struct *wq)
905{ 905{
906 const cpumask_t *cpu_map = wq_cpu_map(wq); 906 const struct cpumask *cpu_map = wq_cpu_map(wq);
907 int cpu; 907 int cpu;
908 908
909 cpu_maps_update_begin(); 909 cpu_maps_update_begin();
@@ -933,7 +933,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
933 933
934 switch (action) { 934 switch (action) {
935 case CPU_UP_PREPARE: 935 case CPU_UP_PREPARE:
936 cpu_set(cpu, cpu_populated_map); 936 cpumask_set_cpu(cpu, cpu_populated_map);
937 } 937 }
938undo: 938undo:
939 list_for_each_entry(wq, &workqueues, list) { 939 list_for_each_entry(wq, &workqueues, list) {
@@ -964,13 +964,15 @@ undo:
964 switch (action) { 964 switch (action) {
965 case CPU_UP_CANCELED: 965 case CPU_UP_CANCELED:
966 case CPU_POST_DEAD: 966 case CPU_POST_DEAD:
967 cpu_clear(cpu, cpu_populated_map); 967 cpumask_clear_cpu(cpu, cpu_populated_map);
968 } 968 }
969 969
970 return ret; 970 return ret;
971} 971}
972 972
973#ifdef CONFIG_SMP 973#ifdef CONFIG_SMP
974static struct workqueue_struct *work_on_cpu_wq __read_mostly;
975
974struct work_for_cpu { 976struct work_for_cpu {
975 struct work_struct work; 977 struct work_struct work;
976 long (*fn)(void *); 978 long (*fn)(void *);
@@ -991,8 +993,8 @@ static void do_work_for_cpu(struct work_struct *w)
991 * @fn: the function to run 993 * @fn: the function to run
992 * @arg: the function arg 994 * @arg: the function arg
993 * 995 *
994 * This will return -EINVAL in the cpu is not online, or the return value 996 * This will return the value @fn returns.
995 * of @fn otherwise. 997 * It is up to the caller to ensure that the cpu doesn't go offline.
996 */ 998 */
997long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 999long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
998{ 1000{
@@ -1001,14 +1003,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
1001 INIT_WORK(&wfc.work, do_work_for_cpu); 1003 INIT_WORK(&wfc.work, do_work_for_cpu);
1002 wfc.fn = fn; 1004 wfc.fn = fn;
1003 wfc.arg = arg; 1005 wfc.arg = arg;
1004 get_online_cpus(); 1006 queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
1005 if (unlikely(!cpu_online(cpu))) 1007 flush_work(&wfc.work);
1006 wfc.ret = -EINVAL;
1007 else {
1008 schedule_work_on(cpu, &wfc.work);
1009 flush_work(&wfc.work);
1010 }
1011 put_online_cpus();
1012 1008
1013 return wfc.ret; 1009 return wfc.ret;
1014} 1010}
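
After this hunk, work_on_cpu() queues onto a dedicated "work_on_cpu" workqueue instead of keventd and no longer checks cpu_online() itself; keeping the target CPU online is now the caller's job, as the updated comment says. Below is a hedged, kernel-style sketch of what a caller is expected to look like; it only builds inside a kernel tree, the probe_one_cpu() callback is a made-up placeholder, and the only interfaces used (get_online_cpus(), put_online_cpus(), cpu_online(), work_on_cpu()) are the ones visible in the surrounding diff.

/*
 * Hedged, kernel-style caller sketch (not compilable on its own): the
 * caller, not work_on_cpu(), now pins the hotplug state around the call.
 * probe_one_cpu() is a placeholder for whatever must run on that CPU.
 */
static long probe_one_cpu(void *arg)
{
	/* runs in process context on the requested CPU */
	return 0;
}

static long probe_cpu(unsigned int cpu)
{
	long ret = -ENODEV;

	get_online_cpus();                 /* keep the CPU from going away */
	if (cpu_online(cpu))
		ret = work_on_cpu(cpu, probe_one_cpu, NULL);
	put_online_cpus();

	return ret;
}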
@@ -1017,10 +1013,16 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
1017 1013
1018void __init init_workqueues(void) 1014void __init init_workqueues(void)
1019{ 1015{
1020 cpu_populated_map = cpu_online_map; 1016 alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL);
1021 singlethread_cpu = first_cpu(cpu_possible_map); 1017
1022 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu); 1018 cpumask_copy(cpu_populated_map, cpu_online_mask);
1019 singlethread_cpu = cpumask_first(cpu_possible_mask);
1020 cpu_singlethread_map = cpumask_of(singlethread_cpu);
1023 hotcpu_notifier(workqueue_cpu_callback, 0); 1021 hotcpu_notifier(workqueue_cpu_callback, 0);
1024 keventd_wq = create_workqueue("events"); 1022 keventd_wq = create_workqueue("events");
1025 BUG_ON(!keventd_wq); 1023 BUG_ON(!keventd_wq);
1024#ifdef CONFIG_SMP
1025 work_on_cpu_wq = create_workqueue("work_on_cpu");
1026 BUG_ON(!work_on_cpu_wq);
1027#endif
1026} 1028}